Diffstat (limited to 'net/sched')
59 files changed, 12077 insertions, 3490 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a36270a994d..a1a8e29e5fc 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -24,7 +24,7 @@ menuconfig NET_SCHED  	  To administer these schedulers, you'll need the user-level utilities  	  from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.  	  That package also contains some documentation; for more, check out -	  <http://linux-net.osdl.org/index.php/Iproute2>. +	  <http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>.  	  This Quality of Service (QoS) support will enable you to use  	  Differentiated Services (diffserv) and Resource Reservation Protocol @@ -126,6 +126,17 @@ config NET_SCH_RED  	  To compile this code as a module, choose M here: the  	  module will be called sch_red. +config NET_SCH_SFB +	tristate "Stochastic Fair Blue (SFB)" +	---help--- +	  Say Y here if you want to use the Stochastic Fair Blue (SFB) +	  packet scheduling algorithm. + +	  See the top of <file:net/sched/sch_sfb.c> for more details. + +	  To compile this code as a module, choose M here: the +	  module will be called sch_sfb. +  config NET_SCH_SFQ  	tristate "Stochastic Fairness Queueing (SFQ)"  	---help--- @@ -205,6 +216,98 @@ config NET_SCH_DRR  	  If unsure, say N. +config NET_SCH_MQPRIO +	tristate "Multi-queue priority scheduler (MQPRIO)" +	help +	  Say Y here if you want to use the Multi-queue Priority scheduler. +	  This scheduler allows QOS to be offloaded on NICs that have support +	  for offloading QOS schedulers. + +	  To compile this driver as a module, choose M here: the module will +	  be called sch_mqprio. + +	  If unsure, say N. + +config NET_SCH_CHOKE +	tristate "CHOose and Keep responsive flow scheduler (CHOKE)" +	help +	  Say Y here if you want to use the CHOKe packet scheduler (CHOose +	  and Keep for responsive flows, CHOose and Kill for unresponsive +	  flows). This is a variation of RED which trys to penalize flows +	  that monopolize the queue. + +	  To compile this code as a module, choose M here: the +	  module will be called sch_choke. + +config NET_SCH_QFQ +	tristate "Quick Fair Queueing scheduler (QFQ)" +	help +	  Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ) +	  packet scheduling algorithm. + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_qfq. + +	  If unsure, say N. + +config NET_SCH_CODEL +	tristate "Controlled Delay AQM (CODEL)" +	help +	  Say Y here if you want to use the Controlled Delay (CODEL) +	  packet scheduling algorithm. + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_codel. + +	  If unsure, say N. + +config NET_SCH_FQ_CODEL +	tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)" +	help +	  Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL) +	  packet scheduling algorithm. + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_fq_codel. + +	  If unsure, say N. + +config NET_SCH_FQ +	tristate "Fair Queue" +	help +	  Say Y here if you want to use the FQ packet scheduling algorithm. + +	  FQ does flow separation, and is able to respect pacing requirements +	  set by TCP stack into sk->sk_pacing_rate (for localy generated +	  traffic) + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_fq. + +	  If unsure, say N. 
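Each of the new CONFIG_NET_SCH_* entries above (SFB, MQPRIO, CHOKE, QFQ, CODEL, FQ_CODEL, FQ) builds a standalone scheduler module, sch_<name>.o. For orientation only, the sketch below shows the general shape such a module has under the 3.x-era API this tree uses: it fills in a struct Qdisc_ops and registers it with the scheduler core. Nothing here is taken from the patch; the "example" name is hypothetical and the pfifo-like enqueue/dequeue stand in for each scheduler's real queueing logic.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>

/* pfifo-like placeholder: append to / pull from the qdisc's built-in queue */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	return qdisc_enqueue_tail(skb, sch);
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",	/* name by which tc selects this qdisc */
	.priv_size	= 0,		/* per-instance private state, none here */
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");

Once such a module is loaded, instances are attached by the .id string via the tc utility from the iproute2 package referenced at the top of this Kconfig file.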
+ +config NET_SCH_HHF +	tristate "Heavy-Hitter Filter (HHF)" +	help +	  Say Y here if you want to use the Heavy-Hitter Filter (HHF) +	  packet scheduling algorithm. + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_hhf. + +config NET_SCH_PIE +	tristate "Proportional Integral controller Enhanced (PIE) scheduler" +	help +	  Say Y here if you want to use the Proportional Integral controller +	  Enhanced scheduler packet scheduling algorithm. +	  For more information, please see +	  http://tools.ietf.org/html/draft-pan-tsvwg-pie-00 + +	  To compile this driver as a module, choose M here: the module +	  will be called sch_pie. + +	  If unsure, say N. +  config NET_SCH_INGRESS  	tristate "Ingress Qdisc"  	depends on NET_CLS_ACT @@ -215,6 +318,32 @@ config NET_SCH_INGRESS  	  To compile this code as a module, choose M here: the  	  module will be called sch_ingress. +config NET_SCH_PLUG +	tristate "Plug network traffic until release (PLUG)" +	---help--- + +	  This queuing discipline allows userspace to plug/unplug a network +	  output queue, using the netlink interface.  When it receives an +	  enqueue command it inserts a plug into the outbound queue that +	  causes following packets to enqueue until a dequeue command arrives +	  over netlink, causing the plug to be removed and resuming the normal +	  packet flow. + +	  This module also provides a generic "network output buffering" +	  functionality (aka output commit), wherein upon arrival of a dequeue +	  command, only packets up to the first plug are released for delivery. +	  The Remus HA project uses this module to enable speculative execution +	  of virtual machines by allowing the generated network output to be rolled +	  back if needed. + +	  For more information, please refer to http://wiki.xensource.com/xenwiki/Remus + +	  Say Y here if you are using this kernel for Xen dom0 and +	  want to protect Xen guests with Remus. + +	  To compile this code as a module, choose M here: the +	  module will be called sch_plug. +  comment "Classification"  config NET_CLS @@ -243,7 +372,8 @@ config NET_CLS_TCINDEX  config NET_CLS_ROUTE4  	tristate "Routing decision (ROUTE)" -	select NET_CLS_ROUTE +	depends on INET +	select IP_ROUTE_CLASSID  	select NET_CLS  	---help---  	  If you say Y here, you will be able to classify packets @@ -252,9 +382,6 @@ config NET_CLS_ROUTE4  	  To compile this code as a module, choose M here: the  	  module will be called cls_route. -config NET_CLS_ROUTE -	bool -  config NET_CLS_FW  	tristate "Netfilter mark (FW)"  	select NET_CLS @@ -330,6 +457,7 @@ config NET_CLS_FLOW  config NET_CLS_CGROUP  	tristate "Control Group Classifier"  	select NET_CLS +	select CGROUP_NET_CLASSID  	depends on CGROUPS  	---help---  	  Say Y here if you want to classify packets based on the control @@ -338,6 +466,16 @@ config NET_CLS_CGROUP  	  To compile this code as a module, choose M here: the  	  module will be called cls_cgroup. +config NET_CLS_BPF +	tristate "BPF-based classifier" +	select NET_CLS +	---help--- +	  If you say Y here, you will be able to classify packets based on +	  programmable BPF (JIT'ed) filters as an alternative to ematches. + +	  To compile this code as a module, choose M here: the module will +	  be called cls_bpf. +  config NET_EMATCH  	bool "Extended Matches"  	select NET_CLS @@ -416,6 +554,26 @@ config NET_EMATCH_TEXT  	  To compile this code as a module, choose M here: the  	  module will be called em_text. 
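The NET_CLS_BPF entry above adds a classifier driven by (classic) BPF programs rather than chains of ematches. As a rough, hypothetical illustration of such a program — not from the patch, and with byte offsets that assume the program sees the packet from the Ethernet header onward, which in practice depends on where the classifier is attached — the array below uses the sock_filter macros from <linux/filter.h> to accept IPv4 TCP frames and reject everything else.

#include <linux/filter.h>

/*
 * Classic BPF, illustrative only: return non-zero for IPv4 TCP frames,
 * zero for everything else.
 */
static struct sock_filter match_ipv4_tcp[] = {
	BPF_STMT(BPF_LD  | BPF_H | BPF_ABS, 12),		/* EtherType    */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0800, 0, 3),	/* IPv4?        */
	BPF_STMT(BPF_LD  | BPF_B | BPF_ABS, 23),		/* IP protocol  */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 6, 0, 1),		/* TCP?         */
	BPF_STMT(BPF_RET | BPF_K, 0xffff),			/* match        */
	BPF_STMT(BPF_RET | BPF_K, 0),				/* no match     */
};

cls_bpf treats a zero return as "no match"; the bytecode itself is handed to the kernel over netlink when the filter is installed.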
+config NET_EMATCH_CANID +	tristate "CAN Identifier" +	depends on NET_EMATCH && (CAN=y || CAN=m) +	---help--- +	  Say Y here if you want to be able to classify CAN frames based +	  on CAN Identifier. + +	  To compile this code as a module, choose M here: the +	  module will be called em_canid. + +config NET_EMATCH_IPSET +	tristate "IPset" +	depends on NET_EMATCH && IP_SET +	---help--- +	  Say Y here if you want to be able to classify packets based on +	  ipset membership. + +	  To compile this code as a module, choose M here: the +	  module will be called em_ipset. +  config NET_CLS_ACT  	bool "Actions"  	---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 960f5dba630..0a869a11f3e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o  obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o  obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o   obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o +obj-$(CONFIG_NET_SCH_SFB)	+= sch_sfb.o  obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o  obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o  obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o @@ -32,6 +33,16 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o  obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o  obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o  obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o +obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o +obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o +obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o +obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o +obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o +obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o +obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o +obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o +  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o  obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o  obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o @@ -41,9 +52,12 @@ obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o  obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o  obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o  obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o +obj-$(CONFIG_NET_CLS_BPF)	+= cls_bpf.o  obj-$(CONFIG_NET_EMATCH)	+= ematch.o  obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o  obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o  obj-$(CONFIG_NET_EMATCH_U32)	+= em_u32.o  obj-$(CONFIG_NET_EMATCH_META)	+= em_meta.o  obj-$(CONFIG_NET_EMATCH_TEXT)	+= em_text.o +obj-$(CONFIG_NET_EMATCH_CANID)	+= em_canid.o +obj-$(CONFIG_NET_EMATCH_IPSET)	+= em_ipset.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 23b25f89e7e..648778aef1a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -20,53 +20,47 @@  #include <linux/init.h>  #include <linux/kmod.h>  #include <linux/err.h> +#include <linux/module.h>  #include <net/net_namespace.h>  #include <net/sock.h>  #include <net/sch_generic.h>  #include <net/act_api.h>  #include <net/netlink.h> -static void tcf_common_free_rcu(struct rcu_head *head) +void tcf_hash_destroy(struct tc_action *a)  { -	kfree(container_of(head, struct tcf_common, tcfc_rcu)); -} +	struct tcf_common *p = a->priv; +	struct tcf_hashinfo *hinfo = a->ops->hinfo; -void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) -{ -	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); -	struct tcf_common **p1p; - -	for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) { -		if (*p1p == p) { -			write_lock_bh(hinfo->lock); -			*p1p = p->tcfc_next; -			write_unlock_bh(hinfo->lock); -			gen_kill_estimator(&p->tcfc_bstats, -					   &p->tcfc_rate_est); -			/* -			 * gen_estimator est_timer() might access p->tcfc_lock -			 * or bstats, wait a RCU grace 
period before freeing p -			 */ -			call_rcu(&p->tcfc_rcu, tcf_common_free_rcu); -			return; -		} -	} -	WARN_ON(1); +	spin_lock_bh(&hinfo->lock); +	hlist_del(&p->tcfc_head); +	spin_unlock_bh(&hinfo->lock); +	gen_kill_estimator(&p->tcfc_bstats, +			   &p->tcfc_rate_est); +	/* +	 * gen_estimator est_timer() might access p->tcfc_lock +	 * or bstats, wait a RCU grace period before freeing p +	 */ +	kfree_rcu(p, tcfc_rcu);  }  EXPORT_SYMBOL(tcf_hash_destroy); -int tcf_hash_release(struct tcf_common *p, int bind, -		     struct tcf_hashinfo *hinfo) +int tcf_hash_release(struct tc_action *a, int bind)  { +	struct tcf_common *p = a->priv;  	int ret = 0;  	if (p) {  		if (bind)  			p->tcfc_bindcnt--; +		else if (p->tcfc_bindcnt > 0) +			return -EPERM;  		p->tcfc_refcnt--;  		if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) { -			tcf_hash_destroy(p, hinfo); +			if (a->ops->cleanup) +				a->ops->cleanup(a, bind); +			tcf_hash_destroy(a);  			ret = 1;  		}  	} @@ -75,20 +69,22 @@ int tcf_hash_release(struct tcf_common *p, int bind,  EXPORT_SYMBOL(tcf_hash_release);  static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, -			   struct tc_action *a, struct tcf_hashinfo *hinfo) +			   struct tc_action *a)  { +	struct tcf_hashinfo *hinfo = a->ops->hinfo; +	struct hlist_head *head;  	struct tcf_common *p; -	int err = 0, index = -1,i = 0, s_i = 0, n_i = 0; +	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;  	struct nlattr *nest; -	read_lock_bh(hinfo->lock); +	spin_lock_bh(&hinfo->lock);  	s_i = cb->args[0];  	for (i = 0; i < (hinfo->hmask + 1); i++) { -		p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; +		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; -		for (; p; p = p->tcfc_next) { +		hlist_for_each_entry_rcu(p, head, tcfc_head) {  			index++;  			if (index < s_i)  				continue; @@ -111,7 +107,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,  		}  	}  done: -	read_unlock_bh(hinfo->lock); +	spin_unlock_bh(&hinfo->lock);  	if (n_i)  		cb->args[0] += n_i;  	return n_i; @@ -121,79 +117,82 @@ nla_put_failure:  	goto done;  } -static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, -			  struct tcf_hashinfo *hinfo) +static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)  { -	struct tcf_common *p, *s_p; +	struct tcf_hashinfo *hinfo = a->ops->hinfo; +	struct hlist_head *head; +	struct hlist_node *n; +	struct tcf_common *p;  	struct nlattr *nest; -	int i= 0, n_i = 0; +	int i = 0, n_i = 0; +	int ret = -EINVAL;  	nest = nla_nest_start(skb, a->order);  	if (nest == NULL)  		goto nla_put_failure; -	NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); +	if (nla_put_string(skb, TCA_KIND, a->ops->kind)) +		goto nla_put_failure;  	for (i = 0; i < (hinfo->hmask + 1); i++) { -		p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; - -		while (p != NULL) { -			s_p = p->tcfc_next; -			if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) -				 module_put(a->ops->owner); -			n_i++; -			p = s_p; +		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; +		hlist_for_each_entry_safe(p, n, head, tcfc_head) { +			a->priv = p; +			ret = tcf_hash_release(a, 0); +			if (ret == ACT_P_DELETED) { +				module_put(a->ops->owner); +				n_i++; +			} else if (ret < 0) +				goto nla_put_failure;  		}  	} -	NLA_PUT_U32(skb, TCA_FCNT, n_i); +	if (nla_put_u32(skb, TCA_FCNT, n_i)) +		goto nla_put_failure;  	nla_nest_end(skb, nest);  	return n_i;  nla_put_failure:  	nla_nest_cancel(skb, nest); -	return -EINVAL; +	return ret;  } -int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback 
*cb, -		       int type, struct tc_action *a) +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, +			      int type, struct tc_action *a)  { -	struct tcf_hashinfo *hinfo = a->ops->hinfo; -  	if (type == RTM_DELACTION) { -		return tcf_del_walker(skb, a, hinfo); +		return tcf_del_walker(skb, a);  	} else if (type == RTM_GETACTION) { -		return tcf_dump_walker(skb, cb, a, hinfo); +		return tcf_dump_walker(skb, cb, a);  	} else {  		WARN(1, "tcf_generic_walker: unknown action %d\n", type);  		return -EINVAL;  	}  } -EXPORT_SYMBOL(tcf_generic_walker); -struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)  { -	struct tcf_common *p; +	struct tcf_common *p = NULL; +	struct hlist_head *head; -	read_lock_bh(hinfo->lock); -	for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p; -	     p = p->tcfc_next) { +	spin_lock_bh(&hinfo->lock); +	head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; +	hlist_for_each_entry_rcu(p, head, tcfc_head)  		if (p->tcfc_index == index)  			break; -	} -	read_unlock_bh(hinfo->lock); +	spin_unlock_bh(&hinfo->lock);  	return p;  } -EXPORT_SYMBOL(tcf_hash_lookup); -u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo) +u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)  { -	u32 val = *idx_gen; +	u32 val = hinfo->index;  	do {  		if (++val == 0)  			val = 1;  	} while (tcf_hash_lookup(val, hinfo)); -	return (*idx_gen = val); +	hinfo->index = val; +	return val;  }  EXPORT_SYMBOL(tcf_hash_new_index); @@ -210,34 +209,46 @@ int tcf_hash_search(struct tc_action *a, u32 index)  }  EXPORT_SYMBOL(tcf_hash_search); -struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind, -				  struct tcf_hashinfo *hinfo) +int tcf_hash_check(u32 index, struct tc_action *a, int bind)  { +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	struct tcf_common *p = NULL;  	if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {  		if (bind)  			p->tcfc_bindcnt++;  		p->tcfc_refcnt++;  		a->priv = p; +		return 1;  	} -	return p; +	return 0;  }  EXPORT_SYMBOL(tcf_hash_check); -struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est, -				   struct tc_action *a, int size, int bind, -				   u32 *idx_gen, struct tcf_hashinfo *hinfo) +void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) +{ +	struct tcf_common *pc = a->priv; +	if (est) +		gen_kill_estimator(&pc->tcfc_bstats, +				   &pc->tcfc_rate_est); +	kfree_rcu(pc, tcfc_rcu); +} +EXPORT_SYMBOL(tcf_hash_cleanup); + +int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, +		    int size, int bind)  { +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	struct tcf_common *p = kzalloc(size, GFP_KERNEL);  	if (unlikely(!p)) -		return ERR_PTR(-ENOMEM); +		return -ENOMEM;  	p->tcfc_refcnt = 1;  	if (bind)  		p->tcfc_bindcnt = 1;  	spin_lock_init(&p->tcfc_lock); -	p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo); +	INIT_HLIST_NODE(&p->tcfc_head); +	p->tcfc_index = index ? 
index : tcf_hash_new_index(hinfo);  	p->tcfc_tm.install = jiffies;  	p->tcfc_tm.lastuse = jiffies;  	if (est) { @@ -245,42 +256,64 @@ struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est,  					    &p->tcfc_lock, est);  		if (err) {  			kfree(p); -			return ERR_PTR(err); +			return err;  		}  	}  	a->priv = (void *) p; -	return p; +	return 0;  }  EXPORT_SYMBOL(tcf_hash_create); -void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo) +void tcf_hash_insert(struct tc_action *a)  { +	struct tcf_common *p = a->priv; +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); -	write_lock_bh(hinfo->lock); -	p->tcfc_next = hinfo->htab[h]; -	hinfo->htab[h] = p; -	write_unlock_bh(hinfo->lock); +	spin_lock_bh(&hinfo->lock); +	hlist_add_head(&p->tcfc_head, &hinfo->htab[h]); +	spin_unlock_bh(&hinfo->lock);  }  EXPORT_SYMBOL(tcf_hash_insert); -static struct tc_action_ops *act_base = NULL; +static LIST_HEAD(act_base);  static DEFINE_RWLOCK(act_mod_lock); -int tcf_register_action(struct tc_action_ops *act) +int tcf_register_action(struct tc_action_ops *act, unsigned int mask)  { -	struct tc_action_ops *a, **ap; +	struct tc_action_ops *a; +	int err; + +	/* Must supply act, dump and init */ +	if (!act->act || !act->dump || !act->init) +		return -EINVAL; + +	/* Supply defaults */ +	if (!act->lookup) +		act->lookup = tcf_hash_search; +	if (!act->walk) +		act->walk = tcf_generic_walker; + +	act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL); +	if (!act->hinfo) +		return -ENOMEM; +	err = tcf_hashinfo_init(act->hinfo, mask); +	if (err) { +		kfree(act->hinfo); +		return err; +	}  	write_lock(&act_mod_lock); -	for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { +	list_for_each_entry(a, &act_base, head) {  		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {  			write_unlock(&act_mod_lock); +			tcf_hashinfo_destroy(act->hinfo); +			kfree(act->hinfo);  			return -EEXIST;  		}  	} -	act->next = NULL; -	*ap = act; +	list_add_tail(&act->head, &act_base);  	write_unlock(&act_mod_lock);  	return 0;  } @@ -288,17 +321,18 @@ EXPORT_SYMBOL(tcf_register_action);  int tcf_unregister_action(struct tc_action_ops *act)  { -	struct tc_action_ops *a, **ap; +	struct tc_action_ops *a;  	int err = -ENOENT;  	write_lock(&act_mod_lock); -	for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) -		if (a == act) +	list_for_each_entry(a, &act_base, head) { +		if (a == act) { +			list_del(&act->head); +			tcf_hashinfo_destroy(act->hinfo); +			kfree(act->hinfo); +			err = 0;  			break; -	if (a) { -		*ap = a->next; -		a->next = NULL; -		err = 0; +		}  	}  	write_unlock(&act_mod_lock);  	return err; @@ -308,72 +342,45 @@ EXPORT_SYMBOL(tcf_unregister_action);  /* lookup by name */  static struct tc_action_ops *tc_lookup_action_n(char *kind)  { -	struct tc_action_ops *a = NULL; +	struct tc_action_ops *a, *res = NULL;  	if (kind) {  		read_lock(&act_mod_lock); -		for (a = act_base; a; a = a->next) { +		list_for_each_entry(a, &act_base, head) {  			if (strcmp(kind, a->kind) == 0) { -				if (!try_module_get(a->owner)) { -					read_unlock(&act_mod_lock); -					return NULL; -				} +				if (try_module_get(a->owner)) +					res = a;  				break;  			}  		}  		read_unlock(&act_mod_lock);  	} -	return a; +	return res;  }  /* lookup by nlattr */  static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)  { -	struct tc_action_ops *a = NULL; +	struct tc_action_ops *a, *res = NULL;  	if (kind) {  		read_lock(&act_mod_lock); -		for (a = act_base; a; a 
= a->next) { +		list_for_each_entry(a, &act_base, head) {  			if (nla_strcmp(kind, a->kind) == 0) { -				if (!try_module_get(a->owner)) { -					read_unlock(&act_mod_lock); -					return NULL; -				} +				if (try_module_get(a->owner)) +					res = a;  				break;  			}  		}  		read_unlock(&act_mod_lock);  	} -	return a; +	return res;  } -#if 0 -/* lookup by id */ -static struct tc_action_ops *tc_lookup_action_id(u32 type) -{ -	struct tc_action_ops *a = NULL; - -	if (type) { -		read_lock(&act_mod_lock); -		for (a = act_base; a; a = a->next) { -			if (a->type == type) { -				if (!try_module_get(a->owner)) { -					read_unlock(&act_mod_lock); -					return NULL; -				} -				break; -			} -		} -		read_unlock(&act_mod_lock); -	} -	return a; -} -#endif - -int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, +int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,  		    struct tcf_result *res)  { -	struct tc_action *a; +	const struct tc_action *a;  	int ret = -1;  	if (skb->tc_verd & TC_NCLS) { @@ -381,53 +388,44 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,  		ret = TC_ACT_OK;  		goto exec_done;  	} -	while ((a = act) != NULL) { +	list_for_each_entry(a, actions, list) {  repeat: -		if (a->ops && a->ops->act) { -			ret = a->ops->act(skb, a, res); -			if (TC_MUNGED & skb->tc_verd) { -				/* copied already, allow trampling */ -				skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); -				skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); -			} -			if (ret == TC_ACT_REPEAT) -				goto repeat;	/* we need a ttl - JHS */ -			if (ret != TC_ACT_PIPE) -				goto exec_done; +		ret = a->ops->act(skb, a, res); +		if (TC_MUNGED & skb->tc_verd) { +			/* copied already, allow trampling */ +			skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); +			skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);  		} -		act = a->next; +		if (ret == TC_ACT_REPEAT) +			goto repeat;	/* we need a ttl - JHS */ +		if (ret != TC_ACT_PIPE) +			goto exec_done;  	}  exec_done:  	return ret;  }  EXPORT_SYMBOL(tcf_action_exec); -void tcf_action_destroy(struct tc_action *act, int bind) +int tcf_action_destroy(struct list_head *actions, int bind)  { -	struct tc_action *a; +	struct tc_action *a, *tmp; +	int ret = 0; -	for (a = act; a; a = act) { -		if (a->ops && a->ops->cleanup) { -			if (a->ops->cleanup(a, bind) == ACT_P_DELETED) -				module_put(a->ops->owner); -			act = act->next; -			kfree(a); -		} else { -			/*FIXME: Remove later - catch insertion bugs*/ -			WARN(1, "tcf_action_destroy: BUG? 
destroying NULL ops\n"); -			act = act->next; -			kfree(a); -		} +	list_for_each_entry_safe(a, tmp, actions, list) { +		ret = tcf_hash_release(a, bind); +		if (ret == ACT_P_DELETED) +			module_put(a->ops->owner); +		else if (ret < 0) +			return ret; +		list_del(&a->list); +		kfree(a);  	} +	return ret;  }  int  tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)  { -	int err = -EINVAL; - -	if (a->ops == NULL || a->ops->dump == NULL) -		return err;  	return a->ops->dump(skb, a, bind, ref);  } @@ -438,16 +436,15 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; -	if (a->ops == NULL || a->ops->dump == NULL) -		return err; - -	NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); +	if (nla_put_string(skb, TCA_KIND, a->ops->kind)) +		goto nla_put_failure;  	if (tcf_action_copy_stats(skb, a, 0))  		goto nla_put_failure;  	nest = nla_nest_start(skb, TCA_OPTIONS);  	if (nest == NULL)  		goto nla_put_failure; -	if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { +	err = tcf_action_dump_old(skb, a, bind, ref); +	if (err > 0) {  		nla_nest_end(skb, nest);  		return err;  	} @@ -459,14 +456,13 @@ nla_put_failure:  EXPORT_SYMBOL(tcf_action_dump_1);  int -tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref)  {  	struct tc_action *a;  	int err = -EINVAL;  	struct nlattr *nest; -	while ((a = act) != NULL) { -		act = a->next; +	list_for_each_entry(a, actions, list) {  		nest = nla_nest_start(skb, a->order);  		if (nest == NULL)  			goto nla_put_failure; @@ -485,13 +481,14 @@ errout:  	return err;  } -struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, -				    char *name, int ovr, int bind) +struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, +				    struct nlattr *est, char *name, int ovr, +				    int bind)  {  	struct tc_action *a;  	struct tc_action_ops *a_o;  	char act_name[IFNAMSIZ]; -	struct nlattr *tb[TCA_ACT_MAX+1]; +	struct nlattr *tb[TCA_ACT_MAX + 1];  	struct nlattr *kind;  	int err; @@ -540,21 +537,22 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,  	if (a == NULL)  		goto err_mod; +	a->ops = a_o; +	INIT_LIST_HEAD(&a->list);  	/* backward compatibility for policer */  	if (name == NULL) -		err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind); +		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind);  	else -		err = a_o->init(nla, est, a, ovr, bind); +		err = a_o->init(net, nla, est, a, ovr, bind);  	if (err < 0)  		goto err_free;  	/* module count goes up only when brand new policy is created -	   if it exists and is only bound to in a_o->init() then -	   ACT_P_CREATED is not returned (a zero is). -	*/ +	 * if it exists and is only bound to in a_o->init() then +	 * ACT_P_CREATED is not returned (a zero is). 
+	 */  	if (err != ACT_P_CREATED)  		module_put(a_o->owner); -	a->ops = a_o;  	return a; @@ -566,36 +564,33 @@ err_out:  	return ERR_PTR(err);  } -struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, -				  char *name, int ovr, int bind) +int tcf_action_init(struct net *net, struct nlattr *nla, +				  struct nlattr *est, char *name, int ovr, +				  int bind, struct list_head *actions)  { -	struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; -	struct tc_action *head = NULL, *act, *act_prev = NULL; +	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; +	struct tc_action *act;  	int err;  	int i;  	err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);  	if (err < 0) -		return ERR_PTR(err); +		return err;  	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { -		act = tcf_action_init_1(tb[i], est, name, ovr, bind); -		if (IS_ERR(act)) +		act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); +		if (IS_ERR(act)) { +			err = PTR_ERR(act);  			goto err; +		}  		act->order = i; - -		if (head == NULL) -			head = act; -		else -			act_prev->next = act; -		act_prev = act; +		list_add_tail(&act->list, actions);  	} -	return head; +	return 0;  err: -	if (head != NULL) -		tcf_action_destroy(head, bind); -	return act; +	tcf_action_destroy(actions, bind); +	return err;  }  int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, @@ -603,9 +598,9 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,  {  	int err = 0;  	struct gnet_dump d; -	struct tcf_act_hdr *h = a->priv; +	struct tcf_common *p = a->priv; -	if (h == NULL) +	if (p == NULL)  		goto errout;  	/* compat_mode being true specifies a call that is supposed @@ -614,24 +609,20 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,  	if (compat_mode) {  		if (a->type == TCA_OLD_COMPAT)  			err = gnet_stats_start_copy_compat(skb, 0, -				TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d); +				TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d);  		else  			return 0;  	} else  		err = gnet_stats_start_copy(skb, TCA_ACT_STATS, -					    &h->tcf_lock, &d); +					    &p->tcfc_lock, &d);  	if (err < 0)  		goto errout; -	if (a->ops != NULL && a->ops->get_stats != NULL) -		if (a->ops->get_stats(skb, a) < 0) -			goto errout; - -	if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 || -	    gnet_stats_copy_rate_est(&d, &h->tcf_bstats, -				     &h->tcf_rate_est) < 0 || -	    gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0) +	if (gnet_stats_copy_basic(&d, &p->tcfc_bstats) < 0 || +	    gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, +				     &p->tcfc_rate_est) < 0 || +	    gnet_stats_copy_queue(&d, &p->tcfc_qstats) < 0)  		goto errout;  	if (gnet_stats_finish_copy(&d) < 0) @@ -644,7 +635,7 @@ errout:  }  static int -tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, +tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq,  	     u16 flags, int event, int bind, int ref)  {  	struct tcamsg *t; @@ -652,52 +643,66 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; -	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); - -	t = NLMSG_DATA(nlh); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); +	if (!nlh) +		goto out_nlmsg_trim; +	t = nlmsg_data(nlh);  	t->tca_family = AF_UNSPEC;  	t->tca__pad1 = 0;  	t->tca__pad2 = 0;  	nest = nla_nest_start(skb, TCA_ACT_TAB);  	if (nest == NULL) -		goto nla_put_failure; +		goto out_nlmsg_trim; -	if (tcf_action_dump(skb, a, bind, ref) < 0) -		goto 
nla_put_failure; +	if (tcf_action_dump(skb, actions, bind, ref) < 0) +		goto out_nlmsg_trim;  	nla_nest_end(skb, nest);  	nlh->nlmsg_len = skb_tail_pointer(skb) - b;  	return skb->len; -nla_put_failure: -nlmsg_failure: +out_nlmsg_trim:  	nlmsg_trim(skb, b);  	return -1;  }  static int -act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n, -	       struct tc_action *a, int event) +act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, +	       struct list_head *actions, int event)  {  	struct sk_buff *skb;  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb)  		return -ENOBUFS; -	if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { +	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {  		kfree_skb(skb);  		return -EINVAL;  	} -	return rtnl_unicast(skb, net, pid); +	return rtnl_unicast(skb, net, portid); +} + +static struct tc_action *create_a(int i) +{ +	struct tc_action *act; + +	act = kzalloc(sizeof(*act), GFP_KERNEL); +	if (act == NULL) { +		pr_debug("create_a: failed to alloc!\n"); +		return NULL; +	} +	act->order = i; +	INIT_LIST_HEAD(&act->list); +	return act;  }  static struct tc_action * -tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) +tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)  { -	struct nlattr *tb[TCA_ACT_MAX+1]; +	struct nlattr *tb[TCA_ACT_MAX + 1];  	struct tc_action *a;  	int index;  	int err; @@ -713,16 +718,14 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)  	index = nla_get_u32(tb[TCA_ACT_INDEX]);  	err = -ENOMEM; -	a = kzalloc(sizeof(struct tc_action), GFP_KERNEL); +	a = create_a(0);  	if (a == NULL)  		goto err_out;  	err = -EINVAL;  	a->ops = tc_lookup_action(tb[TCA_ACT_KIND]); -	if (a->ops == NULL) +	if (a->ops == NULL) /* could happen in batch of actions */  		goto err_free; -	if (a->ops->lookup == NULL) -		goto err_mod;  	err = -ENOENT;  	if (a->ops->lookup(a, index) == 0)  		goto err_mod; @@ -738,31 +741,18 @@ err_out:  	return ERR_PTR(err);  } -static void cleanup_a(struct tc_action *act) +static void cleanup_a(struct list_head *actions)  { -	struct tc_action *a; +	struct tc_action *a, *tmp; -	for (a = act; a; a = act) { -		act = a->next; +	list_for_each_entry_safe(a, tmp, actions, list) { +		list_del(&a->list);  		kfree(a);  	}  } -static struct tc_action *create_a(int i) -{ -	struct tc_action *act; - -	act = kzalloc(sizeof(*act), GFP_KERNEL); -	if (act == NULL) { -		pr_debug("create_a: failed to alloc!\n"); -		return NULL; -	} -	act->order = i; -	return act; -} -  static int tca_action_flush(struct net *net, struct nlattr *nla, -			    struct nlmsghdr *n, u32 pid) +			    struct nlmsghdr *n, u32 portid)  {  	struct sk_buff *skb;  	unsigned char *b; @@ -770,20 +760,14 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	struct tcamsg *t;  	struct netlink_callback dcb;  	struct nlattr *nest; -	struct nlattr *tb[TCA_ACT_MAX+1]; +	struct nlattr *tb[TCA_ACT_MAX + 1];  	struct nlattr *kind; -	struct tc_action *a = create_a(0); +	struct tc_action a;  	int err = -ENOMEM; -	if (a == NULL) { -		pr_debug("tca_action_flush: couldnt create tc_action\n"); -		return err; -	} -  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb) {  		pr_debug("tca_action_flush: failed skb alloc\n"); -		kfree(a);  		return err;  	} @@ -795,23 +779,27 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	err = -EINVAL;  	kind = tb[TCA_ACT_KIND]; -	a->ops = tc_lookup_action(kind); -	if (a->ops == NULL) +	memset(&a, 0, sizeof(struct 
tc_action)); +	INIT_LIST_HEAD(&a.list); +	a.ops = tc_lookup_action(kind); +	if (a.ops == NULL) /*some idjot trying to flush unknown action */  		goto err_out; -	nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); -	t = NLMSG_DATA(nlh); +	nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); +	if (!nlh) +		goto out_module_put; +	t = nlmsg_data(nlh);  	t->tca_family = AF_UNSPEC;  	t->tca__pad1 = 0;  	t->tca__pad2 = 0;  	nest = nla_nest_start(skb, TCA_ACT_TAB);  	if (nest == NULL) -		goto nla_put_failure; +		goto out_module_put; -	err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); +	err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);  	if (err < 0) -		goto nla_put_failure; +		goto out_module_put;  	if (err == 0)  		goto noflush_out; @@ -819,171 +807,150 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,  	nlh->nlmsg_len = skb_tail_pointer(skb) - b;  	nlh->nlmsg_flags |= NLM_F_ROOT; -	module_put(a->ops->owner); -	kfree(a); -	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +	module_put(a.ops->owner); +	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, +			     n->nlmsg_flags & NLM_F_ECHO);  	if (err > 0)  		return 0;  	return err; -nla_put_failure: -nlmsg_failure: -	module_put(a->ops->owner); +out_module_put: +	module_put(a.ops->owner);  err_out:  noflush_out:  	kfree_skb(skb); -	kfree(a);  	return err;  }  static int +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +	       u32 portid) +{ +	int ret; +	struct sk_buff *skb; + +	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +	if (!skb) +		return -ENOBUFS; + +	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, +			 0, 1) <= 0) { +		kfree_skb(skb); +		return -EINVAL; +	} + +	/* now do the delete */ +	ret = tcf_action_destroy(actions, 0); +	if (ret < 0) { +		kfree_skb(skb); +		return ret; +	} + +	ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, +			     n->nlmsg_flags & NLM_F_ECHO); +	if (ret > 0) +		return 0; +	return ret; +} + +static int  tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, -	      u32 pid, int event) +	      u32 portid, int event)  {  	int i, ret; -	struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; -	struct tc_action *head = NULL, *act, *act_prev = NULL; +	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; +	struct tc_action *act; +	LIST_HEAD(actions);  	ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);  	if (ret < 0)  		return ret; -	if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { +	if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {  		if (tb[1] != NULL) -			return tca_action_flush(net, tb[1], n, pid); +			return tca_action_flush(net, tb[1], n, portid);  		else  			return -EINVAL;  	}  	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { -		act = tcf_action_get_1(tb[i], n, pid); +		act = tcf_action_get_1(tb[i], n, portid);  		if (IS_ERR(act)) {  			ret = PTR_ERR(act);  			goto err;  		}  		act->order = i; - -		if (head == NULL) -			head = act; -		else -			act_prev->next = act; -		act_prev = act; +		list_add_tail(&act->list, &actions);  	}  	if (event == RTM_GETACTION) -		ret = act_get_notify(net, pid, n, head, event); +		ret = act_get_notify(net, portid, n, &actions, event);  	else { /* delete */ -		struct sk_buff *skb; - -		skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); -		if (!skb) { -			ret = -ENOBUFS; -			goto err; -		} - -		if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, -				 0, 1) <= 0) { -			kfree_skb(skb); -			ret = -EINVAL; +		ret = tcf_del_notify(net, n, 
&actions, portid); +		if (ret)  			goto err; -		} - -		/* now do the delete */ -		tcf_action_destroy(head, 0); -		ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC, -				     n->nlmsg_flags&NLM_F_ECHO); -		if (ret > 0) -			return 0;  		return ret;  	}  err: -	cleanup_a(head); +	cleanup_a(&actions);  	return ret;  } -static int tcf_add_notify(struct net *net, struct tc_action *a, -			  u32 pid, u32 seq, int event, u16 flags) +static int +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +	       u32 portid)  { -	struct tcamsg *t; -	struct nlmsghdr *nlh;  	struct sk_buff *skb; -	struct nlattr *nest; -	unsigned char *b;  	int err = 0;  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb)  		return -ENOBUFS; -	b = skb_tail_pointer(skb); - -	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); -	t = NLMSG_DATA(nlh); -	t->tca_family = AF_UNSPEC; -	t->tca__pad1 = 0; -	t->tca__pad2 = 0; - -	nest = nla_nest_start(skb, TCA_ACT_TAB); -	if (nest == NULL) -		goto nla_put_failure; - -	if (tcf_action_dump(skb, a, 0, 0) < 0) -		goto nla_put_failure; - -	nla_nest_end(skb, nest); - -	nlh->nlmsg_len = skb_tail_pointer(skb) - b; -	NETLINK_CB(skb).dst_group = RTNLGRP_TC; +	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, +			 RTM_NEWACTION, 0, 0) <= 0) { +		kfree_skb(skb); +		return -EINVAL; +	} -	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO); +	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, +			     n->nlmsg_flags & NLM_F_ECHO);  	if (err > 0)  		err = 0;  	return err; - -nla_put_failure: -nlmsg_failure: -	kfree_skb(skb); -	return -1;  } -  static int  tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, -	       u32 pid, int ovr) +	       u32 portid, int ovr)  {  	int ret = 0; -	struct tc_action *act; -	struct tc_action *a; -	u32 seq = n->nlmsg_seq; +	LIST_HEAD(actions); -	act = tcf_action_init(nla, NULL, NULL, ovr, 0); -	if (act == NULL) -		goto done; -	if (IS_ERR(act)) { -		ret = PTR_ERR(act); +	ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions); +	if (ret)  		goto done; -	}  	/* dump then free all the actions after update; inserted policy  	 * stays intact -	 * */ -	ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); -	for (a = act; a; a = act) { -		act = a->next; -		kfree(a); -	} +	 */ +	ret = tcf_add_notify(net, n, &actions, portid); +	cleanup_a(&actions);  done:  	return ret;  } -static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)  {  	struct net *net = sock_net(skb->sk);  	struct nlattr *tca[TCA_ACT_MAX + 1]; -	u32 pid = skb ? NETLINK_CB(skb).pid : 0; +	u32 portid = skb ? 
NETLINK_CB(skb).portid : 0;  	int ret = 0, ovr = 0; +	if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) +		return -EPERM; +  	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);  	if (ret < 0)  		return ret; @@ -993,30 +960,29 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  		return -EINVAL;  	} -	/* n->nlmsg_flags&NLM_F_CREATE -	 * */ +	/* n->nlmsg_flags & NLM_F_CREATE */  	switch (n->nlmsg_type) {  	case RTM_NEWACTION:  		/* we are going to assume all other flags -		 * imply create only if it doesnt exist +		 * imply create only if it doesn't exist  		 * Note that CREATE | EXCL implies that  		 * but since we want avoid ambiguity (eg when flags  		 * is zero) then just set this  		 */ -		if (n->nlmsg_flags&NLM_F_REPLACE) +		if (n->nlmsg_flags & NLM_F_REPLACE)  			ovr = 1;  replay: -		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr); +		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr);  		if (ret == -EAGAIN)  			goto replay;  		break;  	case RTM_DELACTION:  		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, -				    pid, RTM_DELACTION); +				    portid, RTM_DELACTION);  		break;  	case RTM_GETACTION:  		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, -				    pid, RTM_GETACTION); +				    portid, RTM_GETACTION);  		break;  	default:  		BUG(); @@ -1028,7 +994,7 @@ replay:  static struct nlattr *  find_dump_kind(const struct nlmsghdr *n)  { -	struct nlattr *tb1, *tb2[TCA_ACT_MAX+1]; +	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];  	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];  	struct nlattr *nla[TCAA_MAX + 1];  	struct nlattr *kind; @@ -1062,7 +1028,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)  	struct tc_action_ops *a_o;  	struct tc_action a;  	int ret = 0; -	struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); +	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);  	struct nlattr *kind = find_dump_kind(cb->nlh);  	if (kind == NULL) { @@ -1071,33 +1037,28 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)  	}  	a_o = tc_lookup_action(kind); -	if (a_o == NULL) { +	if (a_o == NULL)  		return 0; -	}  	memset(&a, 0, sizeof(struct tc_action));  	a.ops = a_o; -	if (a_o->walk == NULL) { -		WARN(1, "tc_dump_action: %s !capable of dumping table\n", -		     a_o->kind); -		goto nla_put_failure; -	} - -	nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, -			cb->nlh->nlmsg_type, sizeof(*t)); -	t = NLMSG_DATA(nlh); +	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, +			cb->nlh->nlmsg_type, sizeof(*t), 0); +	if (!nlh) +		goto out_module_put; +	t = nlmsg_data(nlh);  	t->tca_family = AF_UNSPEC;  	t->tca__pad1 = 0;  	t->tca__pad2 = 0;  	nest = nla_nest_start(skb, TCA_ACT_TAB);  	if (nest == NULL) -		goto nla_put_failure; +		goto out_module_put;  	ret = a_o->walk(skb, cb, RTM_GETACTION, &a);  	if (ret < 0) -		goto nla_put_failure; +		goto out_module_put;  	if (ret > 0) {  		nla_nest_end(skb, nest); @@ -1106,13 +1067,12 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)  		nla_nest_cancel(skb, nest);  	nlh->nlmsg_len = skb_tail_pointer(skb) - b; -	if (NETLINK_CB(cb->skb).pid && ret) +	if (NETLINK_CB(cb->skb).portid && ret)  		nlh->nlmsg_flags |= NLM_F_MULTI;  	module_put(a_o->owner);  	return skb->len; -nla_put_failure: -nlmsg_failure: +out_module_put:  	module_put(a_o->owner);  	nlmsg_trim(skb, b);  	return skb->len; @@ -1120,9 +1080,10 @@ nlmsg_failure:  static int __init tc_action_init(void)  { -	
rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL); -	rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL); -	rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action); +	rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, +		      NULL);  	return 0;  } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 67dc7ce9b63..edbf40dac70 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -37,33 +37,23 @@  #include <net/tc_act/tc_csum.h>  #define CSUM_TAB_MASK 15 -static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; -static u32 csum_idx_gen; -static DEFINE_RWLOCK(csum_lock); - -static struct tcf_hashinfo csum_hash_info = { -	.htab	= tcf_csum_ht, -	.hmask	= CSUM_TAB_MASK, -	.lock	= &csum_lock, -};  static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {  	[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },  }; -static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, +static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,  			 struct tc_action *a, int ovr, int bind)  {  	struct nlattr *tb[TCA_CSUM_MAX + 1];  	struct tc_csum *parm; -	struct tcf_common *pc;  	struct tcf_csum *p;  	int ret = 0, err;  	if (nla == NULL)  		return -EINVAL; -	err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy); +	err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);  	if (err < 0)  		return err; @@ -71,39 +61,31 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,  		return -EINVAL;  	parm = nla_data(tb[TCA_CSUM_PARMS]); -	pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, -				     &csum_idx_gen, &csum_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); -		p = to_tcf_csum(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		p = to_tcf_csum(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &csum_hash_info); +		if (bind)/* dont override defaults */ +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	} +	p = to_tcf_csum(a);  	spin_lock_bh(&p->tcf_lock);  	p->tcf_action = parm->action;  	p->update_flags = parm->update_flags;  	spin_unlock_bh(&p->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &csum_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_csum_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_csum *p = a->priv; -	return tcf_hash_release(&p->common, bind, &csum_hash_info); -} -  /**   * tcf_csum_skb_nextlayer - Get next layer pointer   * @skb: sk_buff to use @@ -166,15 +148,17 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb,  	return 1;  } -static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_icmp(struct sk_buff *skb,  			      unsigned int ihl, unsigned int ipl)  {  	struct icmp6hdr *icmp6h; +	const struct ipv6hdr *ip6h;  	icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));  	if (icmp6h == NULL)  		return 0; +	ip6h = ipv6_hdr(skb);  	icmp6h->icmp6_cksum = 0;  	skb->csum = csum_partial(icmp6h, ipl - ihl, 0);  	icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -186,15 +170,17 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,  	return 1;  } 
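Taking stock of the act_api.c hunks above: the per-module hash tables and index generators (tcf_csum_ht/csum_idx_gen and their gact/ipt counterparts) disappear, the hashinfo is now allocated by tcf_register_action() itself, which gains a table-mask argument, and actions are chained on a list_head instead of a hand-rolled ->next pointer. The condensed sketch below, with a hypothetical "example" action and all netlink parameter parsing elided, shows how an action module looks against this reworked interface; it mirrors the converted act_csum.c/act_gact.c code but is not part of the patch.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/act_api.h>

#define EXAMPLE_TAB_MASK 15		/* hash mask, as CSUM_TAB_MASK etc. */

static int tcf_example(struct sk_buff *skb, const struct tc_action *a,
		       struct tcf_result *res)
{
	struct tcf_common *p = a->priv;
	int action;

	spin_lock(&p->tcfc_lock);
	p->tcfc_tm.lastuse = jiffies;
	bstats_update(&p->tcfc_bstats, skb);	/* replaces open-coded counters */
	action = p->tcfc_action;
	spin_unlock(&p->tcfc_lock);
	return action;
}

static int tcf_example_init(struct net *net, struct nlattr *nla,
			    struct nlattr *est, struct tc_action *a,
			    int ovr, int bind)
{
	u32 index = 0;	/* normally parsed from nla; 0 = allocate a new one */
	int ret = 0;

	if (!tcf_hash_check(index, a, bind)) {
		ret = tcf_hash_create(index, est, a,
				      sizeof(struct tcf_common), bind);
		if (ret)
			return ret;
		ret = ACT_P_CREATED;
	} else {
		if (bind)		/* don't override defaults */
			return 0;
		tcf_hash_release(a, bind);
		if (!ovr)
			return -EEXIST;
	}

	/* ... install parsed parameters under a->priv's tcfc_lock here ... */

	if (ret == ACT_P_CREATED)
		tcf_hash_insert(a);
	return ret;
}

static int tcf_example_dump(struct sk_buff *skb, struct tc_action *a,
			    int bind, int ref)
{
	return skb->len;	/* a real action emits its TCA_*_PARMS here */
}

static struct tc_action_ops act_example_ops = {
	.kind	= "example",
	.type	= 123,		/* hypothetical; real actions use a TCA_ACT_* id */
	.owner	= THIS_MODULE,
	.act	= tcf_example,
	.dump	= tcf_example_dump,
	.init	= tcf_example_init,
	/* .lookup and .walk may be omitted; act_api now supplies defaults */
};

static int __init example_init_module(void)
{
	return tcf_register_action(&act_example_ops, EXAMPLE_TAB_MASK);
}

static void __exit example_cleanup_module(void)
{
	tcf_unregister_action(&act_example_ops);
}

module_init(example_init_module);
module_exit(example_cleanup_module);
MODULE_LICENSE("GPL");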
-static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, +static int tcf_csum_ipv4_tcp(struct sk_buff *skb,  			     unsigned int ihl, unsigned int ipl)  {  	struct tcphdr *tcph; +	const struct iphdr *iph;  	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));  	if (tcph == NULL)  		return 0; +	iph = ip_hdr(skb);  	tcph->check = 0;  	skb->csum = csum_partial(tcph, ipl - ihl, 0);  	tcph->check = tcp_v4_check(ipl - ihl, @@ -205,15 +191,17 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,  	return 1;  } -static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_tcp(struct sk_buff *skb,  			     unsigned int ihl, unsigned int ipl)  {  	struct tcphdr *tcph; +	const struct ipv6hdr *ip6h;  	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));  	if (tcph == NULL)  		return 0; +	ip6h = ipv6_hdr(skb);  	tcph->check = 0;  	skb->csum = csum_partial(tcph, ipl - ihl, 0);  	tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, @@ -225,10 +213,11 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,  	return 1;  } -static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, +static int tcf_csum_ipv4_udp(struct sk_buff *skb,  			     unsigned int ihl, unsigned int ipl, int udplite)  {  	struct udphdr *udph; +	const struct iphdr *iph;  	u16 ul;  	/* @@ -242,6 +231,7 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,  	if (udph == NULL)  		return 0; +	iph = ip_hdr(skb);  	ul = ntohs(udph->len);  	if (udplite || udph->check) { @@ -276,10 +266,11 @@ ignore_obscure_skb:  	return 1;  } -static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, +static int tcf_csum_ipv6_udp(struct sk_buff *skb,  			     unsigned int ihl, unsigned int ipl, int udplite)  {  	struct udphdr *udph; +	const struct ipv6hdr *ip6h;  	u16 ul;  	/* @@ -293,6 +284,7 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,  	if (udph == NULL)  		return 0; +	ip6h = ipv6_hdr(skb);  	ul = ntohs(udph->len);  	udph->check = 0; @@ -328,7 +320,7 @@ ignore_obscure_skb:  static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)  { -	struct iphdr *iph; +	const struct iphdr *iph;  	int ntkoff;  	ntkoff = skb_network_offset(skb); @@ -353,19 +345,19 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)  		break;  	case IPPROTO_TCP:  		if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) -			if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4, +			if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4,  					       ntohs(iph->tot_len)))  				goto fail;  		break;  	case IPPROTO_UDP:  		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) -			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, +			if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,  					       ntohs(iph->tot_len), 0))  				goto fail;  		break;  	case IPPROTO_UDPLITE:  		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) -			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, +			if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,  					       ntohs(iph->tot_len), 1))  				goto fail;  		break; @@ -377,7 +369,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)  		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))  			goto fail; -		ip_send_check(iph); +		ip_send_check(ip_hdr(skb));  	}  	return 1; @@ -397,7 +389,7 @@ static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,  	while (len > 1) {  		switch (xh[off]) { -		case IPV6_TLV_PAD0: +		case IPV6_TLV_PAD1:  			optlen = 1;  			break;  		case IPV6_TLV_JUMBO: @@ -456,6 +448,7 @@ static int 
tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)  			ixhl = ipv6_optlen(ip6xh);  			if (!pskb_may_pull(skb, hl + ixhl + ntkoff))  				goto fail; +			ip6xh = (void *)(skb_network_header(skb) + hl);  			if ((nexthdr == NEXTHDR_HOP) &&  			    !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))  				goto fail; @@ -464,25 +457,25 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)  			break;  		case IPPROTO_ICMPV6:  			if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) -				if (!tcf_csum_ipv6_icmp(skb, ip6h, +				if (!tcf_csum_ipv6_icmp(skb,  							hl, pl + sizeof(*ip6h)))  					goto fail;  			goto done;  		case IPPROTO_TCP:  			if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) -				if (!tcf_csum_ipv6_tcp(skb, ip6h, +				if (!tcf_csum_ipv6_tcp(skb,  						       hl, pl + sizeof(*ip6h)))  					goto fail;  			goto done;  		case IPPROTO_UDP:  			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) -				if (!tcf_csum_ipv6_udp(skb, ip6h, hl, +				if (!tcf_csum_ipv6_udp(skb, hl,  						       pl + sizeof(*ip6h), 0))  					goto fail;  			goto done;  		case IPPROTO_UDPLITE:  			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) -				if (!tcf_csum_ipv6_udp(skb, ip6h, hl, +				if (!tcf_csum_ipv6_udp(skb, hl,  						       pl + sizeof(*ip6h), 1))  					goto fail;  			goto done; @@ -500,7 +493,7 @@ fail:  }  static int tcf_csum(struct sk_buff *skb, -		    struct tc_action *a, struct tcf_result *res) +		    const struct tc_action *a, struct tcf_result *res)  {  	struct tcf_csum *p = a->priv;  	int action; @@ -508,8 +501,7 @@ static int tcf_csum(struct sk_buff *skb,  	spin_lock(&p->tcf_lock);  	p->tcf_tm.lastuse = jiffies; -	p->tcf_bstats.bytes += qdisc_pkt_len(skb); -	p->tcf_bstats.packets++; +	bstats_update(&p->tcf_bstats, skb);  	action = p->tcf_action;  	update_flags = p->update_flags;  	spin_unlock(&p->tcf_lock); @@ -551,11 +543,13 @@ static int tcf_csum_dump(struct sk_buff *skb,  	};  	struct tcf_t t; -	NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure;  	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);  	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);  	t.expires = jiffies_to_clock_t(p->tcf_tm.expires); -	NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t); +	if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t)) +		goto nla_put_failure;  	return skb->len; @@ -566,16 +560,11 @@ nla_put_failure:  static struct tc_action_ops act_csum_ops = {  	.kind		= "csum", -	.hinfo		= &csum_hash_info,  	.type		= TCA_ACT_CSUM, -	.capab		= TCA_CAP_NONE,  	.owner		= THIS_MODULE,  	.act		= tcf_csum,  	.dump		= tcf_csum_dump, -	.cleanup	= tcf_csum_cleanup, -	.lookup		= tcf_hash_search,  	.init		= tcf_csum_init, -	.walk		= tcf_generic_walker  };  MODULE_DESCRIPTION("Checksum updating actions"); @@ -583,7 +572,7 @@ MODULE_LICENSE("GPL");  static int __init csum_init_module(void)  { -	return tcf_register_action(&act_csum_ops); +	return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);  }  static void __exit csum_cleanup_module(void) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index c2ed90a4c0b..d6bcbd9f779 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -24,20 +24,11 @@  #include <net/tc_act/tc_gact.h>  #define GACT_TAB_MASK	15 -static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1]; -static u32 gact_idx_gen; -static DEFINE_RWLOCK(gact_lock); - -static struct tcf_hashinfo gact_hash_info = { -	.htab	=	tcf_gact_ht, -	.hmask	=	GACT_TAB_MASK, -	.lock	=	&gact_lock, -};  #ifdef CONFIG_GACT_PROB  static int 
gact_net_rand(struct tcf_gact *gact)  { -	if (!gact->tcfg_pval || net_random() % gact->tcfg_pval) +	if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)  		return gact->tcf_action;  	return gact->tcfg_paction;  } @@ -50,7 +41,7 @@ static int gact_determ(struct tcf_gact *gact)  }  typedef int (*g_rand)(struct tcf_gact *gact); -static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; +static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };  #endif /* CONFIG_GACT_PROB */  static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { @@ -58,15 +49,18 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {  	[TCA_GACT_PROB]		= { .len = sizeof(struct tc_gact_p) },  }; -static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, -			 struct tc_action *a, int ovr, int bind) +static int tcf_gact_init(struct net *net, struct nlattr *nla, +			 struct nlattr *est, struct tc_action *a, +			 int ovr, int bind)  {  	struct nlattr *tb[TCA_GACT_MAX + 1];  	struct tc_gact *parm;  	struct tcf_gact *gact; -	struct tcf_common *pc;  	int ret = 0;  	int err; +#ifdef CONFIG_GACT_PROB +	struct tc_gact_p *p_parm = NULL; +#endif  	if (nla == NULL)  		return -EINVAL; @@ -82,29 +76,33 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,  #ifndef CONFIG_GACT_PROB  	if (tb[TCA_GACT_PROB] != NULL)  		return -EOPNOTSUPP; +#else +	if (tb[TCA_GACT_PROB]) { +		p_parm = nla_data(tb[TCA_GACT_PROB]); +		if (p_parm->ptype >= MAX_RAND) +			return -EINVAL; +	}  #endif -	pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*gact), -				     bind, &gact_idx_gen, &gact_hash_info); -		if (IS_ERR(pc)) -		    return PTR_ERR(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		if (!ovr) { -			tcf_hash_release(pc, bind, &gact_hash_info); +		if (bind)/* dont override defaults */ +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	} -	gact = to_gact(pc); +	gact = to_gact(a);  	spin_lock_bh(&gact->tcf_lock);  	gact->tcf_action = parm->action;  #ifdef CONFIG_GACT_PROB -	if (tb[TCA_GACT_PROB] != NULL) { -		struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]); +	if (p_parm) {  		gact->tcfg_paction = p_parm->paction;  		gact->tcfg_pval    = p_parm->pval;  		gact->tcfg_ptype   = p_parm->ptype; @@ -112,27 +110,19 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,  #endif  	spin_unlock_bh(&gact->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &gact_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_gact_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_gact *gact = a->priv; - -	if (gact) -		return tcf_hash_release(&gact->common, bind, &gact_hash_info); -	return 0; -} - -static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, +		    struct tcf_result *res)  {  	struct tcf_gact *gact = a->priv;  	int action = TC_ACT_SHOT;  	spin_lock(&gact->tcf_lock);  #ifdef CONFIG_GACT_PROB -	if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) +	if (gact->tcfg_ptype)  		action = gact_rand[gact->tcfg_ptype](gact);  	else  		action = gact->tcf_action; @@ -161,7 +151,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int  	};  	struct tcf_t t; -	NLA_PUT(skb, TCA_GACT_PARMS, 
sizeof(opt), &opt); +	if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure;  #ifdef CONFIG_GACT_PROB  	if (gact->tcfg_ptype) {  		struct tc_gact_p p_opt = { @@ -170,13 +161,15 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int  			.ptype   = gact->tcfg_ptype,  		}; -		NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); +		if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt)) +			goto nla_put_failure;  	}  #endif  	t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);  	t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);  	t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); -	NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); +	if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -186,16 +179,11 @@ nla_put_failure:  static struct tc_action_ops act_gact_ops = {  	.kind		=	"gact", -	.hinfo		=	&gact_hash_info,  	.type		=	TCA_ACT_GACT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_gact,  	.dump		=	tcf_gact_dump, -	.cleanup	=	tcf_gact_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_gact_init, -	.walk		=	tcf_generic_walker  };  MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -205,11 +193,11 @@ MODULE_LICENSE("GPL");  static int __init gact_init_module(void)  {  #ifdef CONFIG_GACT_PROB -	printk(KERN_INFO "GACT probability on\n"); +	pr_info("GACT probability on\n");  #else -	printk(KERN_INFO "GACT probability NOT on\n"); +	pr_info("GACT probability NOT on\n");  #endif -	return tcf_register_action(&act_gact_ops); +	return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);  }  static void __exit gact_cleanup_module(void) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8daef963225..8a64a0734ae 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -1,5 +1,5 @@  /* - * net/sched/ipt.c	iptables target interface + * net/sched/ipt.c     iptables target interface   *   *TODO: Add other tables. For now we only support the ipv4 table targets   * @@ -8,7 +8,7 @@   *		as published by the Free Software Foundation; either version   *		2 of the License, or (at your option) any later version.   
* - * Copyright:	Jamal Hadi Salim (2002-4) + * Copyright:	Jamal Hadi Salim (2002-13)   */  #include <linux/types.h> @@ -29,15 +29,6 @@  #define IPT_TAB_MASK     15 -static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1]; -static u32 ipt_idx_gen; -static DEFINE_RWLOCK(ipt_lock); - -static struct tcf_hashinfo ipt_hash_info = { -	.htab	=	tcf_ipt_ht, -	.hmask	=	IPT_TAB_MASK, -	.lock	=	&ipt_lock, -};  static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)  { @@ -77,22 +68,12 @@ static void ipt_destroy_target(struct xt_entry_target *t)  	module_put(par.target->me);  } -static int tcf_ipt_release(struct tcf_ipt *ipt, int bind) +static void tcf_ipt_release(struct tc_action *a, int bind)  { -	int ret = 0; -	if (ipt) { -		if (bind) -			ipt->tcf_bindcnt--; -		ipt->tcf_refcnt--; -		if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) { -			ipt_destroy_target(ipt->tcfi_t); -			kfree(ipt->tcfi_tname); -			kfree(ipt->tcfi_t); -			tcf_hash_destroy(&ipt->common, &ipt_hash_info); -			ret = ACT_P_DELETED; -		} -	} -	return ret; +	struct tcf_ipt *ipt = to_ipt(a); +	ipt_destroy_target(ipt->tcfi_t); +	kfree(ipt->tcfi_tname); +	kfree(ipt->tcfi_t);  }  static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { @@ -102,12 +83,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {  	[TCA_IPT_TARG]	= { .len = sizeof(struct xt_entry_target) },  }; -static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, +static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,  			struct tc_action *a, int ovr, int bind)  {  	struct nlattr *tb[TCA_IPT_MAX + 1];  	struct tcf_ipt *ipt; -	struct tcf_common *pc;  	struct xt_entry_target *td, *t;  	char *tname;  	int ret = 0, err; @@ -133,20 +113,20 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,  	if (tb[TCA_IPT_INDEX] != NULL)  		index = nla_get_u32(tb[TCA_IPT_INDEX]); -	pc = tcf_hash_check(index, a, bind, &ipt_hash_info); -	if (!pc) { -		pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind, -				     &ipt_idx_gen, &ipt_hash_info); -		if (IS_ERR(pc)) -		    return PTR_ERR(pc); +	if (!tcf_hash_check(index, a, bind) ) { +		ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		if (!ovr) { -			tcf_ipt_release(to_ipt(pc), bind); +		if (bind)/* dont override defaults */ +			return 0; +		tcf_hash_release(a, bind); + +		if (!ovr)  			return -EEXIST; -		}  	} -	ipt = to_ipt(pc); +	ipt = to_ipt(a);  	hook = nla_get_u32(tb[TCA_IPT_HOOK]); @@ -162,7 +142,8 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,  	if (unlikely(!t))  		goto err2; -	if ((err = ipt_init_target(t, tname, hook)) < 0) +	err = ipt_init_target(t, tname, hook); +	if (err < 0)  		goto err3;  	spin_lock_bh(&ipt->tcf_lock); @@ -176,7 +157,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,  	ipt->tcfi_hook  = hook;  	spin_unlock_bh(&ipt->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &ipt_hash_info); +		tcf_hash_insert(a);  	return ret;  err3: @@ -184,37 +165,30 @@ err3:  err2:  	kfree(tname);  err1: -	kfree(pc); +	if (ret == ACT_P_CREATED) +		tcf_hash_cleanup(a, est);  	return err;  } -static int tcf_ipt_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_ipt *ipt = a->priv; -	return tcf_ipt_release(ipt, bind); -} - -static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, +static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,  		   struct tcf_result *res)  {  	int ret = 0, result = 0;  	struct 
tcf_ipt *ipt = a->priv;  	struct xt_action_param par; -	if (skb_cloned(skb)) { -		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) -			return TC_ACT_UNSPEC; -	} +	if (skb_unclone(skb, GFP_ATOMIC)) +		return TC_ACT_UNSPEC;  	spin_lock(&ipt->tcf_lock);  	ipt->tcf_tm.lastuse = jiffies; -	ipt->tcf_bstats.bytes += qdisc_pkt_len(skb); -	ipt->tcf_bstats.packets++; +	bstats_update(&ipt->tcf_bstats, skb);  	/* yes, we have to worry about both in and out dev -	 worry later - danger - this API seems to have changed -	 from earlier kernels */ +	 * worry later - danger - this API seems to have changed +	 * from earlier kernels +	 */  	par.in       = skb->dev;  	par.out      = NULL;  	par.hooknum  = ipt->tcfi_hook; @@ -234,9 +208,8 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,  		result = TC_ACT_PIPE;  		break;  	default: -		if (net_ratelimit()) -			pr_notice("tc filter: Bogus netfilter code" -				  " %d assume ACCEPT\n", ret); +		net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", +				       ret);  		result = TC_POLICE_OK;  		break;  	} @@ -254,9 +227,9 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int  	struct tc_cnt c;  	/* for simple targets kernel size == user size -	** user name = target name -	** for foolproof you need to not assume this -	*/ +	 * user name = target name +	 * for foolproof you need to not assume this +	 */  	t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);  	if (unlikely(!t)) @@ -266,15 +239,17 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int  	c.refcnt = ipt->tcf_refcnt - ref;  	strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); -	NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t); -	NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index); -	NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook); -	NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); -	NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname); +	if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || +	    nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || +	    nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || +	    nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || +	    nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) +		goto nla_put_failure;  	tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);  	tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);  	tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); -	NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); +	if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm)) +		goto nla_put_failure;  	kfree(t);  	return skb->len; @@ -286,29 +261,49 @@ nla_put_failure:  static struct tc_action_ops act_ipt_ops = {  	.kind		=	"ipt", -	.hinfo		=	&ipt_hash_info,  	.type		=	TCA_ACT_IPT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_ipt,  	.dump		=	tcf_ipt_dump, -	.cleanup	=	tcf_ipt_cleanup, -	.lookup		=	tcf_hash_search, +	.cleanup	=	tcf_ipt_release, +	.init		=	tcf_ipt_init, +}; + +static struct tc_action_ops act_xt_ops = { +	.kind		=	"xt", +	.type		=	TCA_ACT_XT, +	.owner		=	THIS_MODULE, +	.act		=	tcf_ipt, +	.dump		=	tcf_ipt_dump, +	.cleanup	=	tcf_ipt_release,  	.init		=	tcf_ipt_init, -	.walk		=	tcf_generic_walker  }; -MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");  MODULE_DESCRIPTION("Iptables target actions");  MODULE_LICENSE("GPL"); +MODULE_ALIAS("act_xt");  static int __init ipt_init_module(void)  { -	return tcf_register_action(&act_ipt_ops); +	int ret1, 
ret2; + +	ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK); +	if (ret1 < 0) +		printk("Failed to load xt action\n"); +	ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK); +	if (ret2 < 0) +		printk("Failed to load ipt action\n"); + +	if (ret1 < 0 && ret2 < 0) { +		return ret1; +	} else +		return 0;  }  static void __exit ipt_cleanup_module(void)  { +	tcf_unregister_action(&act_xt_ops);  	tcf_unregister_action(&act_ipt_ops);  } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 0c311be9282..4f912c0e225 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,45 +30,27 @@  #include <linux/if_arp.h>  #define MIRRED_TAB_MASK     7 -static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1]; -static u32 mirred_idx_gen; -static DEFINE_RWLOCK(mirred_lock);  static LIST_HEAD(mirred_list); -static struct tcf_hashinfo mirred_hash_info = { -	.htab	=	tcf_mirred_ht, -	.hmask	=	MIRRED_TAB_MASK, -	.lock	=	&mirred_lock, -}; - -static inline int tcf_mirred_release(struct tcf_mirred *m, int bind) +static void tcf_mirred_release(struct tc_action *a, int bind)  { -	if (m) { -		if (bind) -			m->tcf_bindcnt--; -		m->tcf_refcnt--; -		if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) { -			list_del(&m->tcfm_list); -			if (m->tcfm_dev) -				dev_put(m->tcfm_dev); -			tcf_hash_destroy(&m->common, &mirred_hash_info); -			return 1; -		} -	} -	return 0; +	struct tcf_mirred *m = to_mirred(a); +	list_del(&m->tcfm_list); +	if (m->tcfm_dev) +		dev_put(m->tcfm_dev);  }  static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {  	[TCA_MIRRED_PARMS]	= { .len = sizeof(struct tc_mirred) },  }; -static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, -			   struct tc_action *a, int ovr, int bind) +static int tcf_mirred_init(struct net *net, struct nlattr *nla, +			   struct nlattr *est, struct tc_action *a, int ovr, +			   int bind)  {  	struct nlattr *tb[TCA_MIRRED_MAX + 1];  	struct tc_mirred *parm;  	struct tcf_mirred *m; -	struct tcf_common *pc;  	struct net_device *dev;  	int ret, ok_push = 0; @@ -88,7 +70,7 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,  		return -EINVAL;  	}  	if (parm->ifindex) { -		dev = __dev_get_by_index(&init_net, parm->ifindex); +		dev = __dev_get_by_index(net, parm->ifindex);  		if (dev == NULL)  			return -ENODEV;  		switch (dev->type) { @@ -108,22 +90,20 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,  		dev = NULL;  	} -	pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info); -	if (!pc) { +	if (!tcf_hash_check(parm->index, a, bind)) {  		if (dev == NULL)  			return -EINVAL; -		pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind, -				     &mirred_idx_gen, &mirred_hash_info); -		if (IS_ERR(pc)) -			return PTR_ERR(pc); +		ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else {  		if (!ovr) { -			tcf_mirred_release(to_mirred(pc), bind); +			tcf_hash_release(a, bind);  			return -EEXIST;  		}  	} -	m = to_mirred(pc); +	m = to_mirred(a);  	spin_lock_bh(&m->tcf_lock);  	m->tcf_action = parm->action; @@ -139,22 +119,13 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,  	spin_unlock_bh(&m->tcf_lock);  	if (ret == ACT_P_CREATED) {  		list_add(&m->tcfm_list, &mirred_list); -		tcf_hash_insert(pc, &mirred_hash_info); +		tcf_hash_insert(a);  	}  	return ret;  } -static int tcf_mirred_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_mirred *m = a->priv; - -	if (m) -		return tcf_mirred_release(m, bind); -	
return 0; -} - -static int tcf_mirred(struct sk_buff *skb, struct tc_action *a, +static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,  		      struct tcf_result *res)  {  	struct tcf_mirred *m = a->priv; @@ -165,8 +136,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,  	spin_lock(&m->tcf_lock);  	m->tcf_tm.lastuse = jiffies; -	m->tcf_bstats.bytes += qdisc_pkt_len(skb); -	m->tcf_bstats.packets++; +	bstats_update(&m->tcf_bstats, skb);  	dev = m->tcfm_dev;  	if (!dev) { @@ -175,9 +145,8 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,  	}  	if (!(dev->flags & IFF_UP)) { -		if (net_ratelimit()) -			pr_notice("tc mirred to Houston: device %s is down\n", -				  dev->name); +		net_notice_ratelimited("tc mirred to Houston: device %s is down\n", +				       dev->name);  		goto out;  	} @@ -197,19 +166,17 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,  	skb2->skb_iif = skb->dev->ifindex;  	skb2->dev = dev; -	dev_queue_xmit(skb2); -	err = 0; +	err = dev_queue_xmit(skb2);  out:  	if (err) {  		m->tcf_qstats.overlimits++; -		/* should we be asking for packet to be dropped? -		 * may make sense for redirect case only -		 */ -		retval = TC_ACT_SHOT; -	} else { +		if (m->tcfm_eaction != TCA_EGRESS_MIRROR) +			retval = TC_ACT_SHOT; +		else +			retval = m->tcf_action; +	} else  		retval = m->tcf_action; -	}  	spin_unlock(&m->tcf_lock);  	return retval; @@ -229,11 +196,13 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i  	};  	struct tcf_t t; -	NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure;  	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);  	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);  	t.expires = jiffies_to_clock_t(m->tcf_tm.expires); -	NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); +	if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -244,7 +213,7 @@ nla_put_failure:  static int mirred_device_event(struct notifier_block *unused,  			       unsigned long event, void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct tcf_mirred *m;  	if (event == NETDEV_UNREGISTER) @@ -262,19 +231,14 @@ static struct notifier_block mirred_device_notifier = {  	.notifier_call = mirred_device_event,  }; -  static struct tc_action_ops act_mirred_ops = {  	.kind		=	"mirred", -	.hinfo		=	&mirred_hash_info,  	.type		=	TCA_ACT_MIRRED, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_mirred,  	.dump		=	tcf_mirred_dump, -	.cleanup	=	tcf_mirred_cleanup, -	.lookup		=	tcf_hash_search, +	.cleanup	=	tcf_mirred_release,  	.init		=	tcf_mirred_init, -	.walk		=	tcf_generic_walker  };  MODULE_AUTHOR("Jamal Hadi Salim(2002)"); @@ -288,13 +252,13 @@ static int __init mirred_init_module(void)  		return err;  	pr_info("Mirror/redirect action on\n"); -	return tcf_register_action(&act_mirred_ops); +	return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);  }  static void __exit mirred_cleanup_module(void)  { -	unregister_netdevice_notifier(&mirred_device_notifier);  	tcf_unregister_action(&act_mirred_ops); +	unregister_netdevice_notifier(&mirred_device_notifier);  }  module_init(mirred_init_module); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 186eb837e60..270a030d5fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -30,28 +30,18 @@  #define NAT_TAB_MASK	15 
-static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1]; -static u32 nat_idx_gen; -static DEFINE_RWLOCK(nat_lock); - -static struct tcf_hashinfo nat_hash_info = { -	.htab	=	tcf_nat_ht, -	.hmask	=	NAT_TAB_MASK, -	.lock	=	&nat_lock, -};  static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {  	[TCA_NAT_PARMS]	= { .len = sizeof(struct tc_nat) },  }; -static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, +static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,  			struct tc_action *a, int ovr, int bind)  {  	struct nlattr *tb[TCA_NAT_MAX + 1];  	struct tc_nat *parm;  	int ret = 0, err;  	struct tcf_nat *p; -	struct tcf_common *pc;  	if (nla == NULL)  		return -EINVAL; @@ -64,21 +54,19 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,  		return -EINVAL;  	parm = nla_data(tb[TCA_NAT_PARMS]); -	pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, -				     &nat_idx_gen, &nat_hash_info); -		if (IS_ERR(pc)) -		    return PTR_ERR(pc); -		p = to_tcf_nat(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); +		if (ret) +			return ret;  		ret = ACT_P_CREATED;  	} else { -		p = to_tcf_nat(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &nat_hash_info); +		if (bind) +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	} +	p = to_tcf_nat(a);  	spin_lock_bh(&p->tcf_lock);  	p->old_addr = parm->old_addr; @@ -90,19 +78,12 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,  	spin_unlock_bh(&p->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &nat_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_nat_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_nat *p = a->priv; - -	return tcf_hash_release(&p->common, bind, &nat_hash_info); -} - -static int tcf_nat(struct sk_buff *skb, struct tc_action *a, +static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,  		   struct tcf_result *res)  {  	struct tcf_nat *p = a->priv; @@ -125,8 +106,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,  	egress = p->flags & TCA_NAT_FLAG_EGRESS;  	action = p->tcf_action; -	p->tcf_bstats.bytes += qdisc_pkt_len(skb); -	p->tcf_bstats.packets++; +	bstats_update(&p->tcf_bstats, skb);  	spin_unlock(&p->tcf_lock); @@ -285,11 +265,13 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,  	};  	struct tcf_t t; -	NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure;  	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);  	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);  	t.expires = jiffies_to_clock_t(p->tcf_tm.expires); -	NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t); +	if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t)) +		goto nla_put_failure;  	return skb->len; @@ -300,16 +282,11 @@ nla_put_failure:  static struct tc_action_ops act_nat_ops = {  	.kind		=	"nat", -	.hinfo		=	&nat_hash_info,  	.type		=	TCA_ACT_NAT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_nat,  	.dump		=	tcf_nat_dump, -	.cleanup	=	tcf_nat_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_nat_init, -	.walk		=	tcf_generic_walker  };  MODULE_DESCRIPTION("Stateless NAT actions"); @@ -317,7 +294,7 @@ MODULE_LICENSE("GPL");  static int __init nat_init_module(void)  { -	return tcf_register_action(&act_nat_ops); +	return 
tcf_register_action(&act_nat_ops, NAT_TAB_MASK);  }  static void __exit nat_cleanup_module(void) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index a0593c9640d..5f9bcb2e080 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -24,28 +24,19 @@  #include <net/tc_act/tc_pedit.h>  #define PEDIT_TAB_MASK	15 -static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1]; -static u32 pedit_idx_gen; -static DEFINE_RWLOCK(pedit_lock); - -static struct tcf_hashinfo pedit_hash_info = { -	.htab	=	tcf_pedit_ht, -	.hmask	=	PEDIT_TAB_MASK, -	.lock	=	&pedit_lock, -};  static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {  	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },  }; -static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, -			  struct tc_action *a, int ovr, int bind) +static int tcf_pedit_init(struct net *net, struct nlattr *nla, +			  struct nlattr *est, struct tc_action *a, +			  int ovr, int bind)  {  	struct nlattr *tb[TCA_PEDIT_MAX + 1];  	struct tc_pedit *parm;  	int ret = 0, err;  	struct tcf_pedit *p; -	struct tcf_common *pc;  	struct tc_pedit_key *keys = NULL;  	int ksize; @@ -63,27 +54,27 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,  	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)  		return -EINVAL; -	pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info); -	if (!pc) { +	if (!tcf_hash_check(parm->index, a, bind)) {  		if (!parm->nkeys)  			return -EINVAL; -		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, -				     &pedit_idx_gen, &pedit_hash_info); -		if (IS_ERR(pc)) -		    return PTR_ERR(pc); -		p = to_pedit(pc); +		ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); +		if (ret) +			return ret; +		p = to_pedit(a);  		keys = kmalloc(ksize, GFP_KERNEL);  		if (keys == NULL) { -			kfree(pc); +			tcf_hash_cleanup(a, est);  			return -ENOMEM;  		}  		ret = ACT_P_CREATED;  	} else { -		p = to_pedit(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &pedit_hash_info); +		p = to_pedit(a); +		tcf_hash_release(a, bind); +		if (bind) +			return 0; +		if (!ovr)  			return -EEXIST; -		} +  		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {  			keys = kmalloc(ksize, GFP_KERNEL);  			if (keys == NULL) @@ -102,36 +93,26 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,  	memcpy(p->tcfp_keys, parm->keys, ksize);  	spin_unlock_bh(&p->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &pedit_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static int tcf_pedit_cleanup(struct tc_action *a, int bind) +static void tcf_pedit_cleanup(struct tc_action *a, int bind)  {  	struct tcf_pedit *p = a->priv; - -	if (p) { -		struct tc_pedit_key *keys = p->tcfp_keys; -		if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) { -			kfree(keys); -			return 1; -		} -	} -	return 0; +	struct tc_pedit_key *keys = p->tcfp_keys; +	kfree(keys);  } -static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, +static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,  		     struct tcf_result *res)  {  	struct tcf_pedit *p = a->priv;  	int i, munged = 0;  	unsigned int off; -	if (skb_cloned(skb)) { -		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { -			return p->tcf_action; -		} -	} +	if (skb_unclone(skb, GFP_ATOMIC)) +		return p->tcf_action;  	off = skb_network_offset(skb); @@ -163,7 +144,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,  			}  			if (offset > 0 && offset > skb->len) {  				pr_info("tc filter pedit" -					" offset %d cant exceed 
pkt length %d\n", +					" offset %d can't exceed pkt length %d\n",  				       offset, skb->len);  				goto bad;  			} @@ -187,8 +168,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,  bad:  	p->tcf_qstats.overlimits++;  done: -	p->tcf_bstats.bytes += qdisc_pkt_len(skb); -	p->tcf_bstats.packets++; +	bstats_update(&p->tcf_bstats, skb);  	spin_unlock(&p->tcf_lock);  	return p->tcf_action;  } @@ -218,11 +198,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,  	opt->refcnt = p->tcf_refcnt - ref;  	opt->bindcnt = p->tcf_bindcnt - bind; -	NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt); +	if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) +		goto nla_put_failure;  	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);  	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);  	t.expires = jiffies_to_clock_t(p->tcf_tm.expires); -	NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); +	if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t)) +		goto nla_put_failure;  	kfree(opt);  	return skb->len; @@ -234,16 +216,12 @@ nla_put_failure:  static struct tc_action_ops act_pedit_ops = {  	.kind		=	"pedit", -	.hinfo		=	&pedit_hash_info,  	.type		=	TCA_ACT_PEDIT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_pedit,  	.dump		=	tcf_pedit_dump,  	.cleanup	=	tcf_pedit_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_pedit_init, -	.walk		=	tcf_generic_walker  };  MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); @@ -252,7 +230,7 @@ MODULE_LICENSE("GPL");  static int __init pedit_init_module(void)  { -	return tcf_register_action(&act_pedit_ops); +	return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);  }  static void __exit pedit_cleanup_module(void) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 7ebf7439b47..0566e4606a4 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,23 +22,28 @@  #include <net/act_api.h>  #include <net/netlink.h> -#define L2T(p,L)   qdisc_l2t((p)->tcfp_R_tab, L) -#define L2T_P(p,L) qdisc_l2t((p)->tcfp_P_tab, L) +struct tcf_police { +	struct tcf_common	common; +	int			tcfp_result; +	u32			tcfp_ewma_rate; +	s64			tcfp_burst; +	u32			tcfp_mtu; +	s64			tcfp_toks; +	s64			tcfp_ptoks; +	s64			tcfp_mtu_ptoks; +	s64			tcfp_t_c; +	struct psched_ratecfg	rate; +	bool			rate_present; +	struct psched_ratecfg	peak; +	bool			peak_present; +}; +#define to_police(pc)	\ +	container_of(pc, struct tcf_police, common)  #define POL_TAB_MASK     15 -static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; -static u32 police_idx_gen; -static DEFINE_RWLOCK(police_lock); - -static struct tcf_hashinfo police_hash_info = { -	.htab	=	tcf_police_ht, -	.hmask	=	POL_TAB_MASK, -	.lock	=	&police_lock, -};  /* old policer structure from before tc actions */ -struct tc_police_compat -{ +struct tc_police_compat {  	u32			index;  	int			action;  	u32			limit; @@ -53,18 +58,20 @@ struct tc_police_compat  static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,  			      int type, struct tc_action *a)  { +	struct tcf_hashinfo *hinfo = a->ops->hinfo; +	struct hlist_head *head;  	struct tcf_common *p;  	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;  	struct nlattr *nest; -	read_lock_bh(&police_lock); +	spin_lock_bh(&hinfo->lock);  	s_i = cb->args[0];  	for (i = 0; i < (POL_TAB_MASK + 1); i++) { -		p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)]; +		head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)]; -		for (; p; p = p->tcfc_next) { +		hlist_for_each_entry_rcu(p, head, tcfc_head) {  			index++;  			if (index < s_i)  				continue; 
@@ -87,7 +94,7 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c  		}  	}  done: -	read_unlock_bh(&police_lock); +	spin_unlock_bh(&hinfo->lock);  	if (n_i)  		cb->args[0] += n_i;  	return n_i; @@ -97,38 +104,6 @@ nla_put_failure:  	goto done;  } -static void tcf_police_free_rcu(struct rcu_head *head) -{ -	kfree(container_of(head, struct tcf_police, tcf_rcu)); -} - -static void tcf_police_destroy(struct tcf_police *p) -{ -	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); -	struct tcf_common **p1p; - -	for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) { -		if (*p1p == &p->common) { -			write_lock_bh(&police_lock); -			*p1p = p->tcf_next; -			write_unlock_bh(&police_lock); -			gen_kill_estimator(&p->tcf_bstats, -					   &p->tcf_rate_est); -			if (p->tcfp_R_tab) -				qdisc_put_rtab(p->tcfp_R_tab); -			if (p->tcfp_P_tab) -				qdisc_put_rtab(p->tcfp_P_tab); -			/* -			 * gen_estimator est_timer() might access p->tcf_lock -			 * or bstats, wait a RCU grace period before freeing p -			 */ -			call_rcu(&p->tcf_rcu, tcf_police_free_rcu); -			return; -		} -	} -	WARN_ON(1); -} -  static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {  	[TCA_POLICE_RATE]	= { .len = TC_RTAB_SIZE },  	[TCA_POLICE_PEAKRATE]	= { .len = TC_RTAB_SIZE }, @@ -136,15 +111,17 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {  	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },  }; -static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, -				 struct tc_action *a, int ovr, int bind) +static int tcf_act_police_locate(struct net *net, struct nlattr *nla, +				 struct nlattr *est, struct tc_action *a, +				 int ovr, int bind)  { -	unsigned h; +	unsigned int h;  	int ret = 0, err;  	struct nlattr *tb[TCA_POLICE_MAX + 1];  	struct tc_police *parm;  	struct tcf_police *police;  	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; +	struct tcf_hashinfo *hinfo = a->ops->hinfo;  	int size;  	if (nla == NULL) @@ -162,19 +139,17 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,  	parm = nla_data(tb[TCA_POLICE_TBF]);  	if (parm->index) { -		struct tcf_common *pc; - -		pc = tcf_hash_lookup(parm->index, &police_hash_info); -		if (pc != NULL) { -			a->priv = pc; -			police = to_police(pc); +		if (tcf_hash_search(a, parm->index)) { +			police = to_police(a->priv);  			if (bind) {  				police->tcf_bindcnt += 1;  				police->tcf_refcnt += 1; +				return 0;  			}  			if (ovr)  				goto override; -			return ret; +			/* not replacing */ +			return -EEXIST;  		}  	} @@ -217,26 +192,36 @@ override:  	}  	/* No failure allowed after this point */ -	if (R_tab != NULL) { -		qdisc_put_rtab(police->tcfp_R_tab); -		police->tcfp_R_tab = R_tab; +	police->tcfp_mtu = parm->mtu; +	if (police->tcfp_mtu == 0) { +		police->tcfp_mtu = ~0; +		if (R_tab) +			police->tcfp_mtu = 255 << R_tab->rate.cell_log; +	} +	if (R_tab) { +		police->rate_present = true; +		psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); +		qdisc_put_rtab(R_tab); +	} else { +		police->rate_present = false;  	} -	if (P_tab != NULL) { -		qdisc_put_rtab(police->tcfp_P_tab); -		police->tcfp_P_tab = P_tab; +	if (P_tab) { +		police->peak_present = true; +		psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); +		qdisc_put_rtab(P_tab); +	} else { +		police->peak_present = false;  	}  	if (tb[TCA_POLICE_RESULT])  		police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); -	police->tcfp_toks = police->tcfp_burst = parm->burst; -	police->tcfp_mtu = parm->mtu; -	if 
(police->tcfp_mtu == 0) { -		police->tcfp_mtu = ~0; -		if (police->tcfp_R_tab) -			police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log; +	police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); +	police->tcfp_toks = police->tcfp_burst; +	if (police->peak_present) { +		police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak, +							     police->tcfp_mtu); +		police->tcfp_ptoks = police->tcfp_mtu_ptoks;  	} -	if (police->tcfp_P_tab) -		police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu);  	police->tcf_action = parm->action;  	if (tb[TCA_POLICE_AVRATE]) @@ -246,14 +231,13 @@ override:  	if (ret != ACT_P_CREATED)  		return ret; -	police->tcfp_t_c = psched_get_time(); +	police->tcfp_t_c = ktime_to_ns(ktime_get());  	police->tcf_index = parm->index ? parm->index : -		tcf_hash_new_index(&police_idx_gen, &police_hash_info); +		tcf_hash_new_index(hinfo);  	h = tcf_hash(police->tcf_index, POL_TAB_MASK); -	write_lock_bh(&police_lock); -	police->tcf_next = tcf_police_ht[h]; -	tcf_police_ht[h] = &police->common; -	write_unlock_bh(&police_lock); +	spin_lock_bh(&hinfo->lock); +	hlist_add_head(&police->tcf_head, &hinfo->htab[h]); +	spin_unlock_bh(&hinfo->lock);  	a->priv = police;  	return ret; @@ -261,45 +245,24 @@ override:  failure_unlock:  	spin_unlock_bh(&police->tcf_lock);  failure: -	if (P_tab) -		qdisc_put_rtab(P_tab); -	if (R_tab) -		qdisc_put_rtab(R_tab); +	qdisc_put_rtab(P_tab); +	qdisc_put_rtab(R_tab);  	if (ret == ACT_P_CREATED)  		kfree(police);  	return err;  } -static int tcf_act_police_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_police *p = a->priv; -	int ret = 0; - -	if (p != NULL) { -		if (bind) -			p->tcf_bindcnt--; - -		p->tcf_refcnt--; -		if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) { -			tcf_police_destroy(p); -			ret = 1; -		} -	} -	return ret; -} - -static int tcf_act_police(struct sk_buff *skb, struct tc_action *a, +static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,  			  struct tcf_result *res)  {  	struct tcf_police *police = a->priv; -	psched_time_t now; -	long toks; -	long ptoks = 0; +	s64 now; +	s64 toks; +	s64 ptoks = 0;  	spin_lock(&police->tcf_lock); -	police->tcf_bstats.bytes += qdisc_pkt_len(skb); -	police->tcf_bstats.packets++; +	bstats_update(&police->tcf_bstats, skb);  	if (police->tcfp_ewma_rate &&  	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) { @@ -311,24 +274,25 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,  	}  	if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { -		if (police->tcfp_R_tab == NULL) { +		if (!police->rate_present) {  			spin_unlock(&police->tcf_lock);  			return police->tcfp_result;  		} -		now = psched_get_time(); -		toks = psched_tdiff_bounded(now, police->tcfp_t_c, -					    police->tcfp_burst); -		if (police->tcfp_P_tab) { +		now = ktime_to_ns(ktime_get()); +		toks = min_t(s64, now - police->tcfp_t_c, +			     police->tcfp_burst); +		if (police->peak_present) {  			ptoks = toks + police->tcfp_ptoks; -			if (ptoks > (long)L2T_P(police, police->tcfp_mtu)) -				ptoks = (long)L2T_P(police, police->tcfp_mtu); -			ptoks -= L2T_P(police, qdisc_pkt_len(skb)); +			if (ptoks > police->tcfp_mtu_ptoks) +				ptoks = police->tcfp_mtu_ptoks; +			ptoks -= (s64) psched_l2t_ns(&police->peak, +						     qdisc_pkt_len(skb));  		}  		toks += police->tcfp_toks; -		if (toks > (long)police->tcfp_burst) +		if (toks > police->tcfp_burst)  			toks = police->tcfp_burst; -		toks -= L2T(police, qdisc_pkt_len(skb)); +		toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));  		if ((toks|ptoks) >= 
0) {  			police->tcfp_t_c = now;  			police->tcfp_toks = toks; @@ -354,20 +318,23 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)  		.index = police->tcf_index,  		.action = police->tcf_action,  		.mtu = police->tcfp_mtu, -		.burst = police->tcfp_burst, +		.burst = PSCHED_NS2TICKS(police->tcfp_burst),  		.refcnt = police->tcf_refcnt - ref,  		.bindcnt = police->tcf_bindcnt - bind,  	}; -	if (police->tcfp_R_tab) -		opt.rate = police->tcfp_R_tab->rate; -	if (police->tcfp_P_tab) -		opt.peakrate = police->tcfp_P_tab->rate; -	NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); -	if (police->tcfp_result) -		NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); -	if (police->tcfp_ewma_rate) -		NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate); +	if (police->rate_present) +		psched_ratecfg_getrate(&opt.rate, &police->rate); +	if (police->peak_present) +		psched_ratecfg_getrate(&opt.peakrate, &police->peak); +	if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) +		goto nla_put_failure; +	if (police->tcfp_result && +	    nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) +		goto nla_put_failure; +	if (police->tcfp_ewma_rate && +	    nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -381,14 +348,10 @@ MODULE_LICENSE("GPL");  static struct tc_action_ops act_police_ops = {  	.kind		=	"police", -	.hinfo		=	&police_hash_info,  	.type		=	TCA_ID_POLICE, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_act_police,  	.dump		=	tcf_act_police_dump, -	.cleanup	=	tcf_act_police_cleanup, -	.lookup		=	tcf_hash_search,  	.init		=	tcf_act_police_locate,  	.walk		=	tcf_act_police_walker  }; @@ -396,14 +359,13 @@ static struct tc_action_ops act_police_ops = {  static int __init  police_init_module(void)  { -	return tcf_register_action(&act_police_ops); +	return tcf_register_action(&act_police_ops, POL_TAB_MASK);  }  static void __exit  police_cleanup_module(void)  {  	tcf_unregister_action(&act_police_ops); -	rcu_barrier(); /* Wait for completion of call_rcu()'s (tcf_police_free_rcu) */  }  module_init(police_init_module); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 97e84f3ee77..992c2317ce8 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -25,50 +25,31 @@  #include <net/tc_act/tc_defact.h>  #define SIMP_TAB_MASK     7 -static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1]; -static u32 simp_idx_gen; -static DEFINE_RWLOCK(simp_lock); - -static struct tcf_hashinfo simp_hash_info = { -	.htab	=	tcf_simp_ht, -	.hmask	=	SIMP_TAB_MASK, -	.lock	=	&simp_lock, -};  #define SIMP_MAX_DATA	32 -static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res) +static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, +		    struct tcf_result *res)  {  	struct tcf_defact *d = a->priv;  	spin_lock(&d->tcf_lock);  	d->tcf_tm.lastuse = jiffies; -	d->tcf_bstats.bytes += qdisc_pkt_len(skb); -	d->tcf_bstats.packets++; +	bstats_update(&d->tcf_bstats, skb);  	/* print policy string followed by _ then packet count  	 * Example if this was the 3rd packet and the string was "hello"  	 * then it would look like "hello_3" (without quotes) -	 **/ +	 */  	pr_info("simple: %s_%d\n",  	       (char *)d->tcfd_defdata, d->tcf_bstats.packets);  	spin_unlock(&d->tcf_lock);  	return d->tcf_action;  } -static int tcf_simp_release(struct tcf_defact *d, int bind) +static void tcf_simp_release(struct tc_action *a, int bind)  { -	int ret = 0; 
-	if (d) { -		if (bind) -			d->tcf_bindcnt--; -		d->tcf_refcnt--; -		if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) { -			kfree(d->tcfd_defdata); -			tcf_hash_destroy(&d->common, &simp_hash_info); -			ret = 1; -		} -	} -	return ret; +	struct tcf_defact *d = to_defact(a); +	kfree(d->tcfd_defdata);  }  static int alloc_defdata(struct tcf_defact *d, char *defdata) @@ -95,13 +76,13 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {  	[TCA_DEF_DATA]	= { .type = NLA_STRING, .len = SIMP_MAX_DATA },  }; -static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, -			 struct tc_action *a, int ovr, int bind) +static int tcf_simp_init(struct net *net, struct nlattr *nla, +			 struct nlattr *est, struct tc_action *a, +			 int ovr, int bind)  {  	struct nlattr *tb[TCA_DEF_MAX + 1];  	struct tc_defact *parm;  	struct tcf_defact *d; -	struct tcf_common *pc;  	char *defdata;  	int ret = 0, err; @@ -121,46 +102,38 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,  	parm = nla_data(tb[TCA_DEF_PARMS]);  	defdata = nla_data(tb[TCA_DEF_DATA]); -	pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, -				     &simp_idx_gen, &simp_hash_info); -		if (IS_ERR(pc)) -		    return PTR_ERR(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); +		if (ret) +			return ret; -		d = to_defact(pc); +		d = to_defact(a);  		ret = alloc_defdata(d, defdata);  		if (ret < 0) { -			kfree(pc); +			tcf_hash_cleanup(a, est);  			return ret;  		}  		d->tcf_action = parm->action;  		ret = ACT_P_CREATED;  	} else { -		d = to_defact(pc); -		if (!ovr) { -			tcf_simp_release(d, bind); +		d = to_defact(a); + +		if (bind) +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		} +  		reset_policy(d, defdata, parm);  	}  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &simp_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static inline int tcf_simp_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_defact *d = a->priv; - -	if (d) -		return tcf_simp_release(d, bind); -	return 0; -} - -static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, -				int bind, int ref) +static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, +			 int bind, int ref)  {  	unsigned char *b = skb_tail_pointer(skb);  	struct tcf_defact *d = a->priv; @@ -172,12 +145,14 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,  	};  	struct tcf_t t; -	NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); -	NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata); +	if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || +	    nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) +		goto nla_put_failure;  	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);  	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);  	t.expires = jiffies_to_clock_t(d->tcf_tm.expires); -	NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t); +	if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -187,15 +162,12 @@ nla_put_failure:  static struct tc_action_ops act_simp_ops = {  	.kind		=	"simple", -	.hinfo		=	&simp_hash_info,  	.type		=	TCA_ACT_SIMP, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_simp,  	.dump		=	tcf_simp_dump, -	.cleanup	=	tcf_simp_cleanup, +	.cleanup	=	tcf_simp_release,  	.init		=	tcf_simp_init, -	.walk		=	tcf_generic_walker,  };  MODULE_AUTHOR("Jamal 
Hadi Salim(2005)"); @@ -204,7 +176,8 @@ MODULE_LICENSE("GPL");  static int __init simp_init_module(void)  { -	int ret = tcf_register_action(&act_simp_ops); +	int ret; +	ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);  	if (!ret)  		pr_info("Simple TC action Loaded\n");  	return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 66cbf4eb885..fcfeeaf838b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -11,8 +11,7 @@   * more details.   *   * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. + * this program; if not, see <http://www.gnu.org/licenses/>.   *   * Author: Alexander Duyck <alexander.h.duyck@intel.com>   */ @@ -29,25 +28,15 @@  #include <net/tc_act/tc_skbedit.h>  #define SKBEDIT_TAB_MASK     15 -static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; -static u32 skbedit_idx_gen; -static DEFINE_RWLOCK(skbedit_lock); - -static struct tcf_hashinfo skbedit_hash_info = { -	.htab	=	tcf_skbedit_ht, -	.hmask	=	SKBEDIT_TAB_MASK, -	.lock	=	&skbedit_lock, -}; -static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a, +static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,  		       struct tcf_result *res)  {  	struct tcf_skbedit *d = a->priv;  	spin_lock(&d->tcf_lock);  	d->tcf_tm.lastuse = jiffies; -	d->tcf_bstats.bytes += qdisc_pkt_len(skb); -	d->tcf_bstats.packets++; +	bstats_update(&d->tcf_bstats, skb);  	if (d->flags & SKBEDIT_F_PRIORITY)  		skb->priority = d->priority; @@ -68,13 +57,13 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {  	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },  }; -static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, -			 struct tc_action *a, int ovr, int bind) +static int tcf_skbedit_init(struct net *net, struct nlattr *nla, +			    struct nlattr *est, struct tc_action *a, +			    int ovr, int bind)  {  	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];  	struct tc_skbedit *parm;  	struct tcf_skbedit *d; -	struct tcf_common *pc;  	u32 flags = 0, *priority = NULL, *mark = NULL;  	u16 *queue_mapping = NULL;  	int ret = 0, err; @@ -109,21 +98,20 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,  	parm = nla_data(tb[TCA_SKBEDIT_PARMS]); -	pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info); -	if (!pc) { -		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, -				     &skbedit_idx_gen, &skbedit_hash_info); -		if (IS_ERR(pc)) -		    return PTR_ERR(pc); +	if (!tcf_hash_check(parm->index, a, bind)) { +		ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); +		if (ret) +			return ret; -		d = to_skbedit(pc); +		d = to_skbedit(a);  		ret = ACT_P_CREATED;  	} else { -		d = to_skbedit(pc); -		if (!ovr) { -			tcf_hash_release(pc, bind, &skbedit_hash_info); +		d = to_skbedit(a); +		if (bind) +			return 0; +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	}  	spin_lock_bh(&d->tcf_lock); @@ -141,21 +129,12 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,  	spin_unlock_bh(&d->tcf_lock);  	if (ret == ACT_P_CREATED) -		tcf_hash_insert(pc, &skbedit_hash_info); +		tcf_hash_insert(a);  	return ret;  } -static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind) -{ -	struct tcf_skbedit *d = a->priv; - -	if (d) -		return tcf_hash_release(&d->common, bind, &skbedit_hash_info); -	return 0; -} - -static inline int 
tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, -				int bind, int ref) +static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, +			    int bind, int ref)  {  	unsigned char *b = skb_tail_pointer(skb);  	struct tcf_skbedit *d = a->priv; @@ -167,20 +146,25 @@ static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,  	};  	struct tcf_t t; -	NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); -	if (d->flags & SKBEDIT_F_PRIORITY) -		NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), -			&d->priority); -	if (d->flags & SKBEDIT_F_QUEUE_MAPPING) -		NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING, -			sizeof(d->queue_mapping), &d->queue_mapping); -	if (d->flags & SKBEDIT_F_MARK) -		NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), -			&d->mark); +	if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure; +	if ((d->flags & SKBEDIT_F_PRIORITY) && +	    nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), +		    &d->priority)) +		goto nla_put_failure; +	if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && +	    nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING, +		    sizeof(d->queue_mapping), &d->queue_mapping)) +		goto nla_put_failure; +	if ((d->flags & SKBEDIT_F_MARK) && +	    nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), +		    &d->mark)) +		goto nla_put_failure;  	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);  	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);  	t.expires = jiffies_to_clock_t(d->tcf_tm.expires); -	NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t); +	if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -190,15 +174,11 @@ nla_put_failure:  static struct tc_action_ops act_skbedit_ops = {  	.kind		=	"skbedit", -	.hinfo		=	&skbedit_hash_info,  	.type		=	TCA_ACT_SKBEDIT, -	.capab		=	TCA_CAP_NONE,  	.owner		=	THIS_MODULE,  	.act		=	tcf_skbedit,  	.dump		=	tcf_skbedit_dump, -	.cleanup	=	tcf_skbedit_cleanup,  	.init		=	tcf_skbedit_init, -	.walk		=	tcf_generic_walker,  };  MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>"); @@ -207,7 +187,7 @@ MODULE_LICENSE("GPL");  static int __init skbedit_init_module(void)  { -	return tcf_register_action(&act_skbedit_ops); +	return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);  }  static void __exit skbedit_cleanup_module(void) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5fd0c28ef79..45527e6b52d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -22,7 +22,6 @@  #include <linux/skbuff.h>  #include <linux/init.h>  #include <linux/kmod.h> -#include <linux/netlink.h>  #include <linux/err.h>  #include <linux/slab.h>  #include <net/net_namespace.h> @@ -32,46 +31,44 @@  #include <net/pkt_cls.h>  /* The list of all installed classifier types */ - -static struct tcf_proto_ops *tcf_proto_base __read_mostly; +static LIST_HEAD(tcf_proto_base);  /* Protects list of registered TC modules. It is pure SMP lock. 
*/  static DEFINE_RWLOCK(cls_mod_lock);  /* Find classifier type by string name */ -static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind) +static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)  { -	struct tcf_proto_ops *t = NULL; +	const struct tcf_proto_ops *t, *res = NULL;  	if (kind) {  		read_lock(&cls_mod_lock); -		for (t = tcf_proto_base; t; t = t->next) { +		list_for_each_entry(t, &tcf_proto_base, head) {  			if (nla_strcmp(kind, t->kind) == 0) { -				if (!try_module_get(t->owner)) -					t = NULL; +				if (try_module_get(t->owner)) +					res = t;  				break;  			}  		}  		read_unlock(&cls_mod_lock);  	} -	return t; +	return res;  }  /* Register(unregister) new classifier type */  int register_tcf_proto_ops(struct tcf_proto_ops *ops)  { -	struct tcf_proto_ops *t, **tp; +	struct tcf_proto_ops *t;  	int rc = -EEXIST;  	write_lock(&cls_mod_lock); -	for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) +	list_for_each_entry(t, &tcf_proto_base, head)  		if (!strcmp(ops->kind, t->kind))  			goto out; -	ops->next = NULL; -	*tp = ops; +	list_add_tail(&ops->head, &tcf_proto_base);  	rc = 0;  out:  	write_unlock(&cls_mod_lock); @@ -81,19 +78,17 @@ EXPORT_SYMBOL(register_tcf_proto_ops);  int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)  { -	struct tcf_proto_ops *t, **tp; +	struct tcf_proto_ops *t;  	int rc = -ENOENT;  	write_lock(&cls_mod_lock); -	for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) -		if (t == ops) +	list_for_each_entry(t, &tcf_proto_base, head) { +		if (t == ops) { +			list_del(&t->head); +			rc = 0;  			break; - -	if (!t) -		goto out; -	*tp = t->next; -	rc = 0; -out: +		} +	}  	write_unlock(&cls_mod_lock);  	return rc;  } @@ -111,14 +106,14 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)  	u32 first = TC_H_MAKE(0xC0000000U, 0U);  	if (tp) -		first = tp->prio-1; +		first = tp->prio - 1;  	return first;  }  /* Add/change/delete/get a filter node */ -static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)  {  	struct net *net = sock_net(skb->sk);  	struct nlattr *tca[TCA_MAX + 1]; @@ -132,15 +127,23 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  	struct Qdisc  *q;  	struct tcf_proto **back, **chain;  	struct tcf_proto *tp; -	struct tcf_proto_ops *tp_ops; +	const struct tcf_proto_ops *tp_ops;  	const struct Qdisc_class_ops *cops;  	unsigned long cl;  	unsigned long fh;  	int err;  	int tp_created = 0; +	if ((n->nlmsg_type != RTM_GETTFILTER) && +	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) +		return -EPERM; +  replay: -	t = NLMSG_DATA(n); +	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); +	if (err < 0) +		return err; + +	t = nlmsg_data(n);  	protocol = TC_H_MIN(t->tcm_info);  	prio = TC_H_MAJ(t->tcm_info);  	nprio = prio; @@ -149,7 +152,8 @@ replay:  	if (prio == 0) {  		/* If no priority is given, user wants we allocated it. */ -		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) +		if (n->nlmsg_type != RTM_NEWTFILTER || +		    !(n->nlmsg_flags & NLM_F_CREATE))  			return -ENOENT;  		prio = TC_H_MAKE(0x80000000U, 0U);  	} @@ -161,10 +165,6 @@ replay:  	if (dev == NULL)  		return -ENODEV; -	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); -	if (err < 0) -		return err; -  	/* Find qdisc */  	if (!parent) {  		q = dev->qdisc; @@ -176,7 +176,8 @@ replay:  	}  	/* Is it classful? 
*/ -	if ((cops = q->ops->cl_ops) == NULL) +	cops = q->ops->cl_ops; +	if (!cops)  		return -EINVAL;  	if (cops->tcf_chain == NULL) @@ -196,10 +197,11 @@ replay:  		goto errout;  	/* Check the chain for existence of proto-tcf with this priority */ -	for (back = chain; (tp=*back) != NULL; back = &tp->next) { +	for (back = chain; (tp = *back) != NULL; back = &tp->next) {  		if (tp->prio >= prio) {  			if (tp->prio == prio) { -				if (!nprio || (tp->protocol != protocol && protocol)) +				if (!nprio || +				    (tp->protocol != protocol && protocol))  					goto errout;  			} else  				tp = NULL; @@ -216,7 +218,8 @@ replay:  			goto errout;  		err = -ENOENT; -		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) +		if (n->nlmsg_type != RTM_NEWTFILTER || +		    !(n->nlmsg_flags & NLM_F_CREATE))  			goto errout; @@ -315,7 +318,8 @@ replay:  		}  	} -	err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); +	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, +			      n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);  	if (err == 0) {  		if (tp_created) {  			spin_lock_bh(root_lock); @@ -338,32 +342,35 @@ errout:  	return err;  } -static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, -			 unsigned long fh, u32 pid, u32 seq, u16 flags, int event) +static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, +			 unsigned long fh, u32 portid, u32 seq, u16 flags, int event)  {  	struct tcmsg *tcm;  	struct nlmsghdr  *nlh;  	unsigned char *b = skb_tail_pointer(skb); -	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); -	tcm = NLMSG_DATA(nlh); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); +	if (!nlh) +		goto out_nlmsg_trim; +	tcm = nlmsg_data(nlh);  	tcm->tcm_family = AF_UNSPEC;  	tcm->tcm__pad1 = 0;  	tcm->tcm__pad2 = 0;  	tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;  	tcm->tcm_parent = tp->classid;  	tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); -	NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind); +	if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) +		goto nla_put_failure;  	tcm->tcm_handle = fh;  	if (RTM_DELTFILTER != event) {  		tcm->tcm_handle = 0; -		if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) +		if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)  			goto nla_put_failure;  	}  	nlh->nlmsg_len = skb_tail_pointer(skb) - b;  	return skb->len; -nlmsg_failure: +out_nlmsg_trim:  nla_put_failure:  	nlmsg_trim(skb, b);  	return -1; @@ -374,18 +381,18 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,  			  unsigned long fh, int event)  {  	struct sk_buff *skb; -	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; +	u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0;  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb)  		return -ENOBUFS; -	if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { +	if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {  		kfree_skb(skb);  		return -EINVAL;  	} -	return rtnetlink_send(skb, net, pid, RTNLGRP_TC, +	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,  			      n->nlmsg_flags & NLM_F_ECHO);  } @@ -399,8 +406,9 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,  			 struct tcf_walker *arg)  {  	struct tcf_dump_args *a = (void *)arg; +	struct net *net = sock_net(a->skb->sk); -	return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, +	return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,  			     a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);  } @@ -413,14 +421,15 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  	struct net_device *dev;  	struct Qdisc *q;  	struct tcf_proto *tp, **chain; -	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh); +	struct tcmsg *tcm = nlmsg_data(cb->nlh);  	unsigned long cl = 0;  	const struct Qdisc_class_ops *cops;  	struct tcf_dump_args arg; -	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) +	if (nlmsg_len(cb->nlh) < sizeof(*tcm))  		return skb->len; -	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) +	dev = __dev_get_by_index(net, tcm->tcm_ifindex); +	if (!dev)  		return skb->len;  	if (!tcm->tcm_parent) @@ -429,7 +438,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  		q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));  	if (!q)  		goto out; -	if ((cops = q->ops->cl_ops) == NULL) +	cops = q->ops->cl_ops; +	if (!cops)  		goto errout;  	if (cops->tcf_chain == NULL)  		goto errout; @@ -444,8 +454,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  	s_t = cb->args[0]; -	for (tp=*chain, t=0; tp; tp = tp->next, t++) { -		if (t < s_t) continue; +	for (tp = *chain, t = 0; tp; tp = tp->next, t++) { +		if (t < s_t) +			continue;  		if (TC_H_MAJ(tcm->tcm_info) &&  		    TC_H_MAJ(tcm->tcm_info) != tp->prio)  			continue; @@ -455,7 +466,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  		if (t > s_t)  			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));  		if (cb->args[1] == 0) { -			if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, +			if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,  					  cb->nlh->nlmsg_seq, NLM_F_MULTI,  					  RTM_NEWTFILTER) <= 0)  				break; @@ -468,10 +479,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)  		arg.skb = skb;  		arg.cb = cb;  		arg.w.stop = 0; -		arg.w.skip = cb->args[1]-1; +		arg.w.skip = cb->args[1] - 1;  		arg.w.count = 0;  		tp->ops->walk(tp, &arg.w); -		cb->args[1] = arg.w.count+1; +		cb->args[1] = arg.w.count + 1;  		if (arg.w.stop)  			break;  	} @@ -488,45 +499,41 @@ out:  void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)  {  #ifdef CONFIG_NET_CLS_ACT -	if (exts->action) { -		tcf_action_destroy(exts->action, TCA_ACT_UNBIND); -		exts->action = NULL; -	} +	tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND); +	INIT_LIST_HEAD(&exts->actions);  #endif  }  EXPORT_SYMBOL(tcf_exts_destroy); -int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, -		  struct nlattr *rate_tlv, struct tcf_exts *exts, -		  const struct tcf_ext_map *map) +int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct 
nlattr **tb, +		  struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)  { -	memset(exts, 0, sizeof(*exts)); -  #ifdef CONFIG_NET_CLS_ACT  	{  		struct tc_action *act; -		if (map->police && tb[map->police]) { -			act = tcf_action_init_1(tb[map->police], rate_tlv, -						"police", TCA_ACT_NOREPLACE, +		INIT_LIST_HEAD(&exts->actions); +		if (exts->police && tb[exts->police]) { +			act = tcf_action_init_1(net, tb[exts->police], rate_tlv, +						"police", ovr,  						TCA_ACT_BIND);  			if (IS_ERR(act))  				return PTR_ERR(act); -			act->type = TCA_OLD_COMPAT; -			exts->action = act; -		} else if (map->action && tb[map->action]) { -			act = tcf_action_init(tb[map->action], rate_tlv, NULL, -					      TCA_ACT_NOREPLACE, TCA_ACT_BIND); -			if (IS_ERR(act)) -				return PTR_ERR(act); - -			exts->action = act; +			act->type = exts->type = TCA_OLD_COMPAT; +			list_add(&act->list, &exts->actions); +		} else if (exts->action && tb[exts->action]) { +			int err; +			err = tcf_action_init(net, tb[exts->action], rate_tlv, +					      NULL, ovr, +					      TCA_ACT_BIND, &exts->actions); +			if (err) +				return err;  		}  	}  #else -	if ((map->action && tb[map->action]) || -	    (map->police && tb[map->police])) +	if ((exts->action && tb[exts->action]) || +	    (exts->police && tb[exts->police]))  		return -EOPNOTSUPP;  #endif @@ -538,43 +545,42 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,  		     struct tcf_exts *src)  {  #ifdef CONFIG_NET_CLS_ACT -	if (src->action) { -		struct tc_action *act; -		tcf_tree_lock(tp); -		act = dst->action; -		dst->action = src->action; -		tcf_tree_unlock(tp); -		if (act) -			tcf_action_destroy(act, TCA_ACT_UNBIND); -	} +	LIST_HEAD(tmp); +	tcf_tree_lock(tp); +	list_splice_init(&dst->actions, &tmp); +	list_splice(&src->actions, &dst->actions); +	tcf_tree_unlock(tp); +	tcf_action_destroy(&tmp, TCA_ACT_UNBIND);  #endif  }  EXPORT_SYMBOL(tcf_exts_change); -int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, -		  const struct tcf_ext_map *map) +#define tcf_exts_first_act(ext) \ +		list_first_entry(&(exts)->actions, struct tc_action, list) + +int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)  {  #ifdef CONFIG_NET_CLS_ACT -	if (map->action && exts->action) { +	if (exts->action && !list_empty(&exts->actions)) {  		/*  		 * again for backward compatible mode - we want  		 * to work with both old and new modes of entering  		 * tc data even if iproute2  was newer - jhs  		 */  		struct nlattr *nest; - -		if (exts->action->type != TCA_OLD_COMPAT) { -			nest = nla_nest_start(skb, map->action); +		if (exts->type != TCA_OLD_COMPAT) { +			nest = nla_nest_start(skb, exts->action);  			if (nest == NULL)  				goto nla_put_failure; -			if (tcf_action_dump(skb, exts->action, 0, 0) < 0) +			if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)  				goto nla_put_failure;  			nla_nest_end(skb, nest); -		} else if (map->police) { -			nest = nla_nest_start(skb, map->police); -			if (nest == NULL) +		} else if (exts->police) { +			struct tc_action *act = tcf_exts_first_act(exts); +			nest = nla_nest_start(skb, exts->police); +			if (nest == NULL || !act)  				goto nla_put_failure; -			if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) +			if (tcf_action_dump_old(skb, act, 0, 0) < 0)  				goto nla_put_failure;  			nla_nest_end(skb, nest);  		} @@ -587,26 +593,23 @@ nla_put_failure: __attribute__ ((unused))  EXPORT_SYMBOL(tcf_exts_dump); -int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, -			const struct tcf_ext_map *map) +int 
tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)  {  #ifdef CONFIG_NET_CLS_ACT -	if (exts->action) -		if (tcf_action_copy_stats(skb, exts->action, 1) < 0) -			goto nla_put_failure; +	struct tc_action *a = tcf_exts_first_act(exts); +	if (tcf_action_copy_stats(skb, a, 1) < 0) +		return -1;  #endif  	return 0; -nla_put_failure: __attribute__ ((unused)) -	return -1;  }  EXPORT_SYMBOL(tcf_exts_dump_stats);  static int __init tc_filter_init(void)  { -	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL); -	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL); +	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);  	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, -						 tc_dump_tfilter); +		      tc_dump_tfilter, NULL);  	return 0;  } diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index f23d9155b1e..0ae1813e3e9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -21,14 +21,12 @@  #include <net/act_api.h>  #include <net/pkt_cls.h> -struct basic_head -{ +struct basic_head {  	u32			hgenerator;  	struct list_head	flist;  }; -struct basic_filter -{ +struct basic_filter {  	u32			handle;  	struct tcf_exts		exts;  	struct tcf_ematch_tree	ematches; @@ -36,16 +34,11 @@ struct basic_filter  	struct list_head	link;  }; -static const struct tcf_ext_map basic_ext_map = { -	.action = TCA_BASIC_ACT, -	.police = TCA_BASIC_POLICE -}; - -static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			  struct tcf_result *res)  {  	int r; -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *f;  	list_for_each_entry(f, &head->flist, link) { @@ -63,7 +56,7 @@ static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,  static unsigned long basic_get(struct tcf_proto *tp, u32 handle)  {  	unsigned long l = 0UL; -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *f;  	if (head == NULL) @@ -92,8 +85,7 @@ static int basic_init(struct tcf_proto *tp)  	return 0;  } -static inline void basic_delete_filter(struct tcf_proto *tp, -				       struct basic_filter *f) +static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)  {  	tcf_unbind_filter(tp, &f->res);  	tcf_exts_destroy(tp, &f->exts); @@ -115,7 +107,7 @@ static void basic_destroy(struct tcf_proto *tp)  static int basic_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *t, *f = (struct basic_filter *) arg;  	list_for_each_entry(t, &head->flist, link) @@ -135,15 +127,17 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {  	[TCA_BASIC_EMATCHES]	= { .type = NLA_NESTED },  }; -static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, -				  unsigned long base, struct nlattr **tb, -				  struct nlattr *est) +static int basic_set_parms(struct net *net, struct tcf_proto *tp, +			   struct basic_filter *f, unsigned long base, +			   struct nlattr **tb, +			   struct nlattr *est, bool ovr)  { -	int err = -EINVAL; +	int err;  	struct tcf_exts e;  	struct tcf_ematch_tree t; -	err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); +	tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); +	err = tcf_exts_validate(net, 
tp, tb, est, &e, ovr);  	if (err < 0)  		return err; @@ -165,11 +159,12 @@ errout:  	return err;  } -static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, -			struct nlattr **tca, unsigned long *arg) +static int basic_change(struct net *net, struct sk_buff *in_skb, +			struct tcf_proto *tp, unsigned long base, u32 handle, +			struct nlattr **tca, unsigned long *arg, bool ovr)  {  	int err; -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct nlattr *tb[TCA_BASIC_MAX + 1];  	struct basic_filter *f = (struct basic_filter *) *arg; @@ -184,7 +179,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,  	if (f != NULL) {  		if (handle && f->handle != handle)  			return -EINVAL; -		return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); +		return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);  	}  	err = -ENOBUFS; @@ -192,6 +187,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,  	if (f == NULL)  		goto errout; +	tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);  	err = -EINVAL;  	if (handle)  		f->handle = handle; @@ -203,14 +199,14 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,  		} while (--i > 0 && basic_get(tp, head->hgenerator));  		if (i <= 0) { -			printk(KERN_ERR "Insufficient number of handles\n"); +			pr_err("Insufficient number of handles\n");  			goto errout;  		}  		f->handle = head->hgenerator;  	} -	err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); +	err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr);  	if (err < 0)  		goto errout; @@ -229,7 +225,7 @@ errout:  static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)  { -	struct basic_head *head = (struct basic_head *) tp->root; +	struct basic_head *head = tp->root;  	struct basic_filter *f;  	list_for_each_entry(f, &head->flist, link) { @@ -245,7 +241,7 @@ skip:  	}  } -static int basic_dump(struct tcf_proto *tp, unsigned long fh, +static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		      struct sk_buff *skb, struct tcmsg *t)  {  	struct basic_filter *f = (struct basic_filter *) fh; @@ -260,16 +256,17 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh,  	if (nest == NULL)  		goto nla_put_failure; -	if (f->res.classid) -		NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid); +	if (f->res.classid && +	    nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) +		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || +	if (tcf_exts_dump(skb, &f->exts) < 0 ||  	    tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c new file mode 100644 index 00000000000..13f64df2c71 --- /dev/null +++ b/net/sched/cls_bpf.c @@ -0,0 +1,382 @@ +/* + * Berkeley Packet Filter based traffic classifier + * + * Might be used to classify traffic through flexible, user-defined and + * possibly JIT-ed BPF filters for traffic control as an alternative to + * ematches. + * + * (C) 2013 Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
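The extension-handling rework above recurs in every classifier conversion below, so here is a condensed sketch of the new pattern under hypothetical names (my_filter, my_set_parms, TCA_EXAMPLE_*): the static struct tcf_ext_map is gone, and each classifier seeds its tcf_exts with its own action/police attribute IDs via tcf_exts_init() before validating and splicing them in. This is an illustration of the converted API, not code from the patch.

#include <net/netlink.h>
#include <net/pkt_cls.h>

/* hypothetical attribute IDs and filter type, for illustration only */
enum {
	TCA_EXAMPLE_UNSPEC,
	TCA_EXAMPLE_CLASSID,
	TCA_EXAMPLE_ACT,
	TCA_EXAMPLE_POLICE,
	__TCA_EXAMPLE_MAX
};

struct my_filter {
	struct tcf_result	res;
	struct tcf_exts		exts;
};

static int my_set_parms(struct net *net, struct tcf_proto *tp,
			struct my_filter *f, unsigned long base,
			struct nlattr **tb, struct nlattr *est, bool ovr)
{
	struct tcf_exts e;
	int err;

	/* was: tcf_exts_validate(tp, tb, est, &e, &my_ext_map) */
	tcf_exts_init(&e, TCA_EXAMPLE_ACT, TCA_EXAMPLE_POLICE);
	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
	if (err < 0)
		return err;

	if (tb[TCA_EXAMPLE_CLASSID]) {
		f->res.classid = nla_get_u32(tb[TCA_EXAMPLE_CLASSID]);
		tcf_bind_filter(tp, &f->res, base);
	}

	/* splice the validated action list into the live filter */
	tcf_exts_change(tp, &f->exts, &e);
	return 0;
}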
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_DESCRIPTION("TC BPF based classifier"); + +struct cls_bpf_head { +	struct list_head plist; +	u32 hgen; +}; + +struct cls_bpf_prog { +	struct sk_filter *filter; +	struct sock_filter *bpf_ops; +	struct tcf_exts exts; +	struct tcf_result res; +	struct list_head link; +	u32 handle; +	u16 bpf_len; +}; + +static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { +	[TCA_BPF_CLASSID]	= { .type = NLA_U32 }, +	[TCA_BPF_OPS_LEN]	= { .type = NLA_U16 }, +	[TCA_BPF_OPS]		= { .type = NLA_BINARY, +				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, +}; + +static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, +			    struct tcf_result *res) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog; +	int ret; + +	list_for_each_entry(prog, &head->plist, link) { +		int filter_res = SK_RUN_FILTER(prog->filter, skb); + +		if (filter_res == 0) +			continue; + +		*res = prog->res; +		if (filter_res != -1) +			res->classid = filter_res; + +		ret = tcf_exts_exec(skb, &prog->exts, res); +		if (ret < 0) +			continue; + +		return ret; +	} + +	return -1; +} + +static int cls_bpf_init(struct tcf_proto *tp) +{ +	struct cls_bpf_head *head; + +	head = kzalloc(sizeof(*head), GFP_KERNEL); +	if (head == NULL) +		return -ENOBUFS; + +	INIT_LIST_HEAD(&head->plist); +	tp->root = head; + +	return 0; +} + +static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) +{ +	tcf_unbind_filter(tp, &prog->res); +	tcf_exts_destroy(tp, &prog->exts); + +	sk_unattached_filter_destroy(prog->filter); + +	kfree(prog->bpf_ops); +	kfree(prog); +} + +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg; + +	list_for_each_entry(prog, &head->plist, link) { +		if (prog == todel) { +			tcf_tree_lock(tp); +			list_del(&prog->link); +			tcf_tree_unlock(tp); + +			cls_bpf_delete_prog(tp, prog); +			return 0; +		} +	} + +	return -ENOENT; +} + +static void cls_bpf_destroy(struct tcf_proto *tp) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog, *tmp; + +	list_for_each_entry_safe(prog, tmp, &head->plist, link) { +		list_del(&prog->link); +		cls_bpf_delete_prog(tp, prog); +	} + +	kfree(head); +} + +static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog; +	unsigned long ret = 0UL; + +	if (head == NULL) +		return 0UL; + +	list_for_each_entry(prog, &head->plist, link) { +		if (prog->handle == handle) { +			ret = (unsigned long) prog; +			break; +		} +	} + +	return ret; +} + +static void cls_bpf_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, +				   struct cls_bpf_prog *prog, +				   unsigned long base, struct nlattr **tb, +				   struct nlattr *est, bool ovr) +{ +	struct sock_filter *bpf_ops, *bpf_old; +	struct tcf_exts exts; +	struct sock_fprog_kern tmp; +	struct sk_filter *fp, *fp_old; +	u16 bpf_size, bpf_len; +	u32 classid; +	int ret; + +	if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) +		return -EINVAL; + +	tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); +	ret = tcf_exts_validate(net, tp, 
tb, est, &exts, ovr); +	if (ret < 0) +		return ret; + +	classid = nla_get_u32(tb[TCA_BPF_CLASSID]); +	bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]); +	if (bpf_len > BPF_MAXINSNS || bpf_len == 0) { +		ret = -EINVAL; +		goto errout; +	} + +	bpf_size = bpf_len * sizeof(*bpf_ops); +	bpf_ops = kzalloc(bpf_size, GFP_KERNEL); +	if (bpf_ops == NULL) { +		ret = -ENOMEM; +		goto errout; +	} + +	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); + +	tmp.len = bpf_len; +	tmp.filter = bpf_ops; + +	ret = sk_unattached_filter_create(&fp, &tmp); +	if (ret) +		goto errout_free; + +	tcf_tree_lock(tp); +	fp_old = prog->filter; +	bpf_old = prog->bpf_ops; + +	prog->bpf_len = bpf_len; +	prog->bpf_ops = bpf_ops; +	prog->filter = fp; +	prog->res.classid = classid; +	tcf_tree_unlock(tp); + +	tcf_bind_filter(tp, &prog->res, base); +	tcf_exts_change(tp, &prog->exts, &exts); + +	if (fp_old) +		sk_unattached_filter_destroy(fp_old); +	if (bpf_old) +		kfree(bpf_old); + +	return 0; + +errout_free: +	kfree(bpf_ops); +errout: +	tcf_exts_destroy(tp, &exts); +	return ret; +} + +static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, +				   struct cls_bpf_head *head) +{ +	unsigned int i = 0x80000000; + +	do { +		if (++head->hgen == 0x7FFFFFFF) +			head->hgen = 1; +	} while (--i > 0 && cls_bpf_get(tp, head->hgen)); +	if (i == 0) +		pr_err("Insufficient number of handles\n"); + +	return i; +} + +static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, +			  struct tcf_proto *tp, unsigned long base, +			  u32 handle, struct nlattr **tca, +			  unsigned long *arg, bool ovr) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg; +	struct nlattr *tb[TCA_BPF_MAX + 1]; +	int ret; + +	if (tca[TCA_OPTIONS] == NULL) +		return -EINVAL; + +	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy); +	if (ret < 0) +		return ret; + +	if (prog != NULL) { +		if (handle && prog->handle != handle) +			return -EINVAL; +		return cls_bpf_modify_existing(net, tp, prog, base, tb, +					       tca[TCA_RATE], ovr); +	} + +	prog = kzalloc(sizeof(*prog), GFP_KERNEL); +	if (prog == NULL) +		return -ENOBUFS; + +	tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); +	if (handle == 0) +		prog->handle = cls_bpf_grab_new_handle(tp, head); +	else +		prog->handle = handle; +	if (prog->handle == 0) { +		ret = -EINVAL; +		goto errout; +	} + +	ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); +	if (ret < 0) +		goto errout; + +	tcf_tree_lock(tp); +	list_add(&prog->link, &head->plist); +	tcf_tree_unlock(tp); + +	*arg = (unsigned long) prog; + +	return 0; +errout: +	if (*arg == 0UL && prog) +		kfree(prog); + +	return ret; +} + +static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +			struct sk_buff *skb, struct tcmsg *tm) +{ +	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; +	struct nlattr *nest, *nla; + +	if (prog == NULL) +		return skb->len; + +	tm->tcm_handle = prog->handle; + +	nest = nla_nest_start(skb, TCA_OPTIONS); +	if (nest == NULL) +		goto nla_put_failure; + +	if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) +		goto nla_put_failure; +	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len)) +		goto nla_put_failure; + +	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len * +			  sizeof(struct sock_filter)); +	if (nla == NULL) +		goto nla_put_failure; + +	memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + +	if (tcf_exts_dump(skb, &prog->exts) < 0) +		goto nla_put_failure; + +	nla_nest_end(skb, nest); + +	if 
(tcf_exts_dump_stats(skb, &prog->exts) < 0) +		goto nla_put_failure; + +	return skb->len; + +nla_put_failure: +	nla_nest_cancel(skb, nest); +	return -1; +} + +static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ +	struct cls_bpf_head *head = tp->root; +	struct cls_bpf_prog *prog; + +	list_for_each_entry(prog, &head->plist, link) { +		if (arg->count < arg->skip) +			goto skip; +		if (arg->fn(tp, (unsigned long) prog, arg) < 0) { +			arg->stop = 1; +			break; +		} +skip: +		arg->count++; +	} +} + +static struct tcf_proto_ops cls_bpf_ops __read_mostly = { +	.kind		=	"bpf", +	.owner		=	THIS_MODULE, +	.classify	=	cls_bpf_classify, +	.init		=	cls_bpf_init, +	.destroy	=	cls_bpf_destroy, +	.get		=	cls_bpf_get, +	.put		=	cls_bpf_put, +	.change		=	cls_bpf_change, +	.delete		=	cls_bpf_delete, +	.walk		=	cls_bpf_walk, +	.dump		=	cls_bpf_dump, +}; + +static int __init cls_bpf_init_mod(void) +{ +	return register_tcf_proto_ops(&cls_bpf_ops); +} + +static void __exit cls_bpf_exit_mod(void) +{ +	unregister_tcf_proto_ops(&cls_bpf_ops); +} + +module_init(cls_bpf_init_mod); +module_exit(cls_bpf_exit_mod); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index d49c40fb7e0..cacf01bd04f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -11,97 +11,20 @@  #include <linux/module.h>  #include <linux/slab.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/errno.h>  #include <linux/skbuff.h> -#include <linux/cgroup.h>  #include <linux/rcupdate.h>  #include <net/rtnetlink.h>  #include <net/pkt_cls.h>  #include <net/sock.h>  #include <net/cls_cgroup.h> -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, -					       struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); -static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); - -struct cgroup_subsys net_cls_subsys = { -	.name		= "net_cls", -	.create		= cgrp_create, -	.destroy	= cgrp_destroy, -	.populate	= cgrp_populate, -#ifdef CONFIG_NET_CLS_CGROUP -	.subsys_id	= net_cls_subsys_id, -#endif -	.module		= THIS_MODULE, -}; - - -static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) -{ -	return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), -			    struct cgroup_cls_state, css); -} - -static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) -{ -	return container_of(task_subsys_state(p, net_cls_subsys_id), -			    struct cgroup_cls_state, css); -} - -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, -						 struct cgroup *cgrp) -{ -	struct cgroup_cls_state *cs; - -	if (!(cs = kzalloc(sizeof(*cs), GFP_KERNEL))) -		return ERR_PTR(-ENOMEM); - -	if (cgrp->parent) -		cs->classid = cgrp_cls_state(cgrp->parent)->classid; - -	return &cs->css; -} - -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ -	kfree(cgrp_cls_state(cgrp)); -} - -static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) -{ -	return cgrp_cls_state(cgrp)->classid; -} - -static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) -{ -	cgrp_cls_state(cgrp)->classid = (u32) value; -	return 0; -} - -static struct cftype ss_files[] = { -	{ -		.name = "classid", -		.read_u64 = read_classid, -		.write_u64 = write_classid, -	}, -}; - -static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ -	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); -} - -struct cls_cgroup_head -{ +struct cls_cgroup_head {  	u32			handle;  	
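The new cls_bpf classifier above hands each packet to a classic BPF program and interprets its return value: 0 means no match (try the next program on the list), (u32)-1 means match and use the classid configured through TCA_BPF_CLASSID, and any other value is taken as the classid itself; the opcodes arrive via TCA_BPF_OPS/TCA_BPF_OPS_LEN. One way such a program could look, shown purely as an illustration (not part of the patch):

#include <linux/filter.h>
#include <linux/if_ether.h>

/*
 * Match IPv4 packets and defer to the classid set via TCA_BPF_CLASSID;
 * let everything else fall through to the next program on the list.
 */
static const struct sock_filter example_bpf_prog[] = {
	/* A = ntohs(skb->protocol), via the ancillary protocol load */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_PROTOCOL),
	/* if (A != ETH_P_IP) return 0 (no match) */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, ~0U),		/* match: use configured classid */
	BPF_STMT(BPF_RET | BPF_K, 0),		/* no match: try next program */
};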
struct tcf_exts		exts;  	struct tcf_ematch_tree	ematches;  }; -static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			       struct tcf_result *res)  {  	struct cls_cgroup_head *head = tp->root; @@ -153,20 +76,16 @@ static int cls_cgroup_init(struct tcf_proto *tp)  	return 0;  } -static const struct tcf_ext_map cgroup_ext_map = { -	.action = TCA_CGROUP_ACT, -	.police = TCA_CGROUP_POLICE, -}; -  static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {  	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },  }; -static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, +static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, +			     struct tcf_proto *tp, unsigned long base,  			     u32 handle, struct nlattr **tca, -			     unsigned long *arg) +			     unsigned long *arg, bool ovr)  { -	struct nlattr *tb[TCA_CGROUP_MAX+1]; +	struct nlattr *tb[TCA_CGROUP_MAX + 1];  	struct cls_cgroup_head *head = tp->root;  	struct tcf_ematch_tree t;  	struct tcf_exts e; @@ -183,6 +102,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,  		if (head == NULL)  			return -ENOBUFS; +		tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);  		head->handle = handle;  		tcf_tree_lock(tp); @@ -198,7 +118,8 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,  	if (err < 0)  		return err; -	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map); +	tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; @@ -243,7 +164,7 @@ skip:  	arg->count++;  } -static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh, +static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  			   struct sk_buff *skb, struct tcmsg *t)  {  	struct cls_cgroup_head *head = tp->root; @@ -256,13 +177,13 @@ static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,  	if (nest == NULL)  		goto nla_put_failure; -	if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 || +	if (tcf_exts_dump(skb, &head->exts) < 0 ||  	    tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &head->exts) < 0)  		goto nla_put_failure;  	return skb->len; @@ -288,36 +209,12 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {  static int __init init_cgroup_cls(void)  { -	int ret; - -	ret = cgroup_load_subsys(&net_cls_subsys); -	if (ret) -		goto out; - -#ifndef CONFIG_NET_CLS_CGROUP -	/* We can't use rcu_assign_pointer because this is an int. 
*/ -	smp_wmb(); -	net_cls_subsys_id = net_cls_subsys.subsys_id; -#endif - -	ret = register_tcf_proto_ops(&cls_cgroup_ops); -	if (ret) -		cgroup_unload_subsys(&net_cls_subsys); - -out: -	return ret; +	return register_tcf_proto_ops(&cls_cgroup_ops);  }  static void __exit exit_cgroup_cls(void)  {  	unregister_tcf_proto_ops(&cls_cgroup_ops); - -#ifndef CONFIG_NET_CLS_CGROUP -	net_cls_subsys_id = -1; -	synchronize_rcu(); -#endif - -	cgroup_unload_subsys(&net_cls_subsys);  }  module_init(init_cgroup_cls); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5b271a18bc3..35be16f7c19 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -21,10 +21,13 @@  #include <linux/ipv6.h>  #include <linux/if_vlan.h>  #include <linux/slab.h> +#include <linux/module.h>  #include <net/pkt_cls.h>  #include <net/ip.h>  #include <net/route.h> +#include <net/flow_keys.h> +  #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)  #include <net/netfilter/nf_conntrack.h>  #endif @@ -53,11 +56,6 @@ struct flow_filter {  	u32			hashrnd;  }; -static const struct tcf_ext_map flow_ext_map = { -	.action	= TCA_FLOW_ACT, -	.police	= TCA_FLOW_POLICE, -}; -  static inline u32 addr_fold(void *addr)  {  	unsigned long a = (unsigned long)addr; @@ -65,132 +63,37 @@ static inline u32 addr_fold(void *addr)  	return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);  } -static u32 flow_get_src(struct sk_buff *skb) +static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)  { -	switch (skb->protocol) { -	case htons(ETH_P_IP): -		if (pskb_network_may_pull(skb, sizeof(struct iphdr))) -			return ntohl(ip_hdr(skb)->saddr); -		break; -	case htons(ETH_P_IPV6): -		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) -			return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]); -		break; -	} - +	if (flow->src) +		return ntohl(flow->src);  	return addr_fold(skb->sk);  } -static u32 flow_get_dst(struct sk_buff *skb) +static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)  { -	switch (skb->protocol) { -	case htons(ETH_P_IP): -		if (pskb_network_may_pull(skb, sizeof(struct iphdr))) -			return ntohl(ip_hdr(skb)->daddr); -		break; -	case htons(ETH_P_IPV6): -		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) -			return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]); -		break; -	} - +	if (flow->dst) +		return ntohl(flow->dst);  	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;  } -static u32 flow_get_proto(struct sk_buff *skb) +static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)  { -	switch (skb->protocol) { -	case htons(ETH_P_IP): -		return pskb_network_may_pull(skb, sizeof(struct iphdr)) ? -		       ip_hdr(skb)->protocol : 0; -	case htons(ETH_P_IPV6): -		return pskb_network_may_pull(skb, sizeof(struct ipv6hdr)) ? 
-		       ipv6_hdr(skb)->nexthdr : 0; -	default: -		return 0; -	} +	return flow->ip_proto;  } -static u32 flow_get_proto_src(struct sk_buff *skb) +static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)  { -	switch (skb->protocol) { -	case htons(ETH_P_IP): { -		struct iphdr *iph; -		int poff; - -		if (!pskb_network_may_pull(skb, sizeof(*iph))) -			break; -		iph = ip_hdr(skb); -		if (iph->frag_off & htons(IP_MF|IP_OFFSET)) -			break; -		poff = proto_ports_offset(iph->protocol); -		if (poff >= 0 && -		    pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) { -			iph = ip_hdr(skb); -			return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + -						 poff)); -		} -		break; -	} -	case htons(ETH_P_IPV6): { -		struct ipv6hdr *iph; -		int poff; - -		if (!pskb_network_may_pull(skb, sizeof(*iph))) -			break; -		iph = ipv6_hdr(skb); -		poff = proto_ports_offset(iph->nexthdr); -		if (poff >= 0 && -		    pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) { -			iph = ipv6_hdr(skb); -			return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + -						 poff)); -		} -		break; -	} -	} +	if (flow->ports) +		return ntohs(flow->port16[0]);  	return addr_fold(skb->sk);  } -static u32 flow_get_proto_dst(struct sk_buff *skb) +static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)  { -	switch (skb->protocol) { -	case htons(ETH_P_IP): { -		struct iphdr *iph; -		int poff; - -		if (!pskb_network_may_pull(skb, sizeof(*iph))) -			break; -		iph = ip_hdr(skb); -		if (iph->frag_off & htons(IP_MF|IP_OFFSET)) -			break; -		poff = proto_ports_offset(iph->protocol); -		if (poff >= 0 && -		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { -			iph = ip_hdr(skb); -			return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + -						 2 + poff)); -		} -		break; -	} -	case htons(ETH_P_IPV6): { -		struct ipv6hdr *iph; -		int poff; - -		if (!pskb_network_may_pull(skb, sizeof(*iph))) -			break; -		iph = ipv6_hdr(skb); -		poff = proto_ports_offset(iph->nexthdr); -		if (poff >= 0 && -		    pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) { -			iph = ipv6_hdr(skb); -			return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + -						 poff + 2)); -		} -		break; -	} -	} +	if (flow->ports) +		return ntohs(flow->port16[1]);  	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;  } @@ -223,7 +126,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)  #define CTTUPLE(skb, member)						\  ({									\  	enum ip_conntrack_info ctinfo;					\ -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);			\ +	const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);		\  	if (ct == NULL)							\  		goto fallback;						\  	ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member;			\ @@ -236,7 +139,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)  })  #endif -static u32 flow_get_nfct_src(struct sk_buff *skb) +static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)  {  	switch (skb->protocol) {  	case htons(ETH_P_IP): @@ -245,10 +148,10 @@ static u32 flow_get_nfct_src(struct sk_buff *skb)  		return ntohl(CTTUPLE(skb, src.u3.ip6[3]));  	}  fallback: -	return flow_get_src(skb); +	return flow_get_src(skb, flow);  } -static u32 flow_get_nfct_dst(struct sk_buff *skb) +static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)  {  	switch (skb->protocol) {  	case htons(ETH_P_IP): @@ -257,26 +160,26 @@ static u32 flow_get_nfct_dst(struct sk_buff *skb)  		return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));  	}  fallback: -	return flow_get_dst(skb); +	return 
flow_get_dst(skb, flow);  } -static u32 flow_get_nfct_proto_src(struct sk_buff *skb) +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)  {  	return ntohs(CTTUPLE(skb, src.u.all));  fallback: -	return flow_get_proto_src(skb); +	return flow_get_proto_src(skb, flow);  } -static u32 flow_get_nfct_proto_dst(struct sk_buff *skb) +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)  {  	return ntohs(CTTUPLE(skb, dst.u.all));  fallback: -	return flow_get_proto_dst(skb); +	return flow_get_proto_dst(skb, flow);  }  static u32 flow_get_rtclassid(const struct sk_buff *skb)  { -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	if (skb_dst(skb))  		return skb_dst(skb)->tclassid;  #endif @@ -285,15 +188,19 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb)  static u32 flow_get_skuid(const struct sk_buff *skb)  { -	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) -		return skb->sk->sk_socket->file->f_cred->fsuid; +	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { +		kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; +		return from_kuid(&init_user_ns, skuid); +	}  	return 0;  }  static u32 flow_get_skgid(const struct sk_buff *skb)  { -	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) -		return skb->sk->sk_socket->file->f_cred->fsgid; +	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { +		kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; +		return from_kgid(&init_user_ns, skgid); +	}  	return 0;  } @@ -308,22 +215,22 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)  static u32 flow_get_rxhash(struct sk_buff *skb)  { -	return skb_get_rxhash(skb); +	return skb_get_hash(skb);  } -static u32 flow_key_get(struct sk_buff *skb, int key) +static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)  {  	switch (key) {  	case FLOW_KEY_SRC: -		return flow_get_src(skb); +		return flow_get_src(skb, flow);  	case FLOW_KEY_DST: -		return flow_get_dst(skb); +		return flow_get_dst(skb, flow);  	case FLOW_KEY_PROTO: -		return flow_get_proto(skb); +		return flow_get_proto(skb, flow);  	case FLOW_KEY_PROTO_SRC: -		return flow_get_proto_src(skb); +		return flow_get_proto_src(skb, flow);  	case FLOW_KEY_PROTO_DST: -		return flow_get_proto_dst(skb); +		return flow_get_proto_dst(skb, flow);  	case FLOW_KEY_IIF:  		return flow_get_iif(skb);  	case FLOW_KEY_PRIORITY: @@ -333,13 +240,13 @@ static u32 flow_key_get(struct sk_buff *skb, int key)  	case FLOW_KEY_NFCT:  		return flow_get_nfct(skb);  	case FLOW_KEY_NFCT_SRC: -		return flow_get_nfct_src(skb); +		return flow_get_nfct_src(skb, flow);  	case FLOW_KEY_NFCT_DST: -		return flow_get_nfct_dst(skb); +		return flow_get_nfct_dst(skb, flow);  	case FLOW_KEY_NFCT_PROTO_SRC: -		return flow_get_nfct_proto_src(skb); +		return flow_get_nfct_proto_src(skb, flow);  	case FLOW_KEY_NFCT_PROTO_DST: -		return flow_get_nfct_proto_dst(skb); +		return flow_get_nfct_proto_dst(skb, flow);  	case FLOW_KEY_RTCLASSID:  		return flow_get_rtclassid(skb);  	case FLOW_KEY_SKUID: @@ -356,7 +263,17 @@ static u32 flow_key_get(struct sk_buff *skb, int key)  	}  } -static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp, +#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | 		\ +			  (1 << FLOW_KEY_DST) |			\ +			  (1 << FLOW_KEY_PROTO) |		\ +			  (1 << FLOW_KEY_PROTO_SRC) |		\ +			  (1 << FLOW_KEY_PROTO_DST) | 		\ +			  (1 << FLOW_KEY_NFCT_SRC) |		\ +			  (1 << FLOW_KEY_NFCT_DST) |		\ +			  (1 << 
FLOW_KEY_NFCT_PROTO_SRC) |	\ +			  (1 << FLOW_KEY_NFCT_PROTO_DST)) + +static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			 struct tcf_result *res)  {  	struct flow_head *head = tp->root; @@ -367,17 +284,20 @@ static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,  	int r;  	list_for_each_entry(f, &head->filters, list) { -		u32 keys[f->nkeys]; +		u32 keys[FLOW_KEY_MAX + 1]; +		struct flow_keys flow_keys;  		if (!tcf_em_tree_match(skb, &f->ematches, NULL))  			continue;  		keymask = f->keymask; +		if (keymask & FLOW_KEYS_NEEDED) +			skb_flow_dissect(skb, &flow_keys);  		for (n = 0; n < f->nkeys; n++) {  			key = ffs(keymask) - 1;  			keymask &= ~(1 << key); -			keys[n] = flow_key_get(skb, key); +			keys[n] = flow_key_get(skb, key, &flow_keys);  		}  		if (f->mode == FLOW_MODE_HASH) @@ -426,9 +346,10 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {  	[TCA_FLOW_PERTURB]	= { .type = NLA_U32 },  }; -static int flow_change(struct tcf_proto *tp, unsigned long base, +static int flow_change(struct net *net, struct sk_buff *in_skb, +		       struct tcf_proto *tp, unsigned long base,  		       u32 handle, struct nlattr **tca, -		       unsigned long *arg) +		       unsigned long *arg, bool ovr)  {  	struct flow_head *head = tp->root;  	struct flow_filter *f; @@ -465,9 +386,14 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,  		if (fls(keymask) - 1 > FLOW_KEY_MAX)  			return -EOPNOTSUPP; + +		if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && +		    sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) +			return -EOPNOTSUPP;  	} -	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); +	tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; @@ -525,6 +451,7 @@ static int flow_change(struct tcf_proto *tp, unsigned long base,  		f->handle = handle;  		f->mask	  = ~0U; +		tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);  		get_random_bytes(&f->hashrnd, 4);  		f->perturb_timer.function = flow_perturbation; @@ -636,7 +563,7 @@ static void flow_put(struct tcf_proto *tp, unsigned long f)  {  } -static int flow_dump(struct tcf_proto *tp, unsigned long fh, +static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		     struct sk_buff *skb, struct tcmsg *t)  {  	struct flow_filter *f = (struct flow_filter *)fh; @@ -651,27 +578,34 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,  	if (nest == NULL)  		goto nla_put_failure; -	NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask); -	NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode); +	if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || +	    nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) +		goto nla_put_failure;  	if (f->mask != ~0 || f->xor != 0) { -		NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask); -		NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor); +		if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || +		    nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) +			goto nla_put_failure;  	} -	if (f->rshift) -		NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift); -	if (f->addend) -		NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend); +	if (f->rshift && +	    nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) +		goto nla_put_failure; +	if (f->addend && +	    nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) +		goto nla_put_failure; -	if (f->divisor) -		NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor); -	if (f->baseclass) -		NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass); +	if (f->divisor && +	    nla_put_u32(skb, 
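cls_flow now leans on the shared flow dissector: when a filter needs any of the FLOW_KEYS_NEEDED keys, flow_classify() dissects the packet once and the flow_get_*() helpers read the pre-extracted struct flow_keys instead of re-parsing IPv4/IPv6 headers per key. A minimal sketch of the same pattern outside cls_flow (the function name and the way the keys are combined are illustrative):

#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/flow_keys.h>

/* dissect once, then derive several keys from the result */
static u32 example_flow_fold(struct sk_buff *skb)
{
	struct flow_keys keys;

	if (!skb_flow_dissect(skb, &keys))
		return 0;	/* not a packet the dissector understands */

	/* src/dst are kept in network byte order, ports as a 16-bit pair */
	return ntohl(keys.src) ^ ntohl(keys.dst) ^
	       ntohs(keys.port16[0]) ^ ntohs(keys.port16[1]) ^ keys.ip_proto;
}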
TCA_FLOW_DIVISOR, f->divisor)) +		goto nla_put_failure; +	if (f->baseclass && +	    nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) +		goto nla_put_failure; -	if (f->perturb_period) -		NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ); +	if (f->perturb_period && +	    nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) +		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  #ifdef CONFIG_NET_EMATCH  	if (f->ematches.hdr.nmatches && @@ -680,7 +614,7 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh,  #endif  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 93b0a7b6f9b..861b03ccfed 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -29,71 +29,45 @@  #include <net/act_api.h>  #include <net/pkt_cls.h> -#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) +#define HTSIZE 256 -struct fw_head -{ -	struct fw_filter *ht[HTSIZE]; -	u32 mask; +struct fw_head { +	u32			mask; +	struct fw_filter	*ht[HTSIZE];  }; -struct fw_filter -{ +struct fw_filter {  	struct fw_filter	*next;  	u32			id;  	struct tcf_result	res;  #ifdef CONFIG_NET_CLS_IND -	char			indev[IFNAMSIZ]; +	int			ifindex;  #endif /* CONFIG_NET_CLS_IND */  	struct tcf_exts		exts;  }; -static const struct tcf_ext_map fw_ext_map = { -	.action = TCA_FW_ACT, -	.police = TCA_FW_POLICE -}; - -static __inline__ int fw_hash(u32 handle) +static u32 fw_hash(u32 handle)  { -	if (HTSIZE == 4096) -		return ((handle >> 24) & 0xFFF) ^ -		       ((handle >> 12) & 0xFFF) ^ -		       (handle & 0xFFF); -	else if (HTSIZE == 2048) -		return ((handle >> 22) & 0x7FF) ^ -		       ((handle >> 11) & 0x7FF) ^ -		       (handle & 0x7FF); -	else if (HTSIZE == 1024) -		return ((handle >> 20) & 0x3FF) ^ -		       ((handle >> 10) & 0x3FF) ^ -		       (handle & 0x3FF); -	else if (HTSIZE == 512) -		return (handle >> 27) ^ -		       ((handle >> 18) & 0x1FF) ^ -		       ((handle >> 9) & 0x1FF) ^ -		       (handle & 0x1FF); -	else if (HTSIZE == 256) { -		u8 *t = (u8 *) &handle; -		return t[0] ^ t[1] ^ t[2] ^ t[3]; -	} else -		return handle & (HTSIZE - 1); +	handle ^= (handle >> 16); +	handle ^= (handle >> 8); +	return handle % HTSIZE;  } -static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			  struct tcf_result *res)  { -	struct fw_head *head = (struct fw_head*)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f;  	int r;  	u32 id = skb->mark;  	if (head != NULL) {  		id &= head->mask; -		for (f=head->ht[fw_hash(id)]; f; f=f->next) { +		for (f = head->ht[fw_hash(id)]; f; f = f->next) {  			if (f->id == id) {  				*res = f->res;  #ifdef CONFIG_NET_CLS_IND -				if (!tcf_match_indev(skb, f->indev)) +				if (!tcf_match_indev(skb, f->ifindex))  					continue;  #endif /* CONFIG_NET_CLS_IND */  				r = tcf_exts_exec(skb, &f->exts, res); @@ -105,7 +79,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,  		}  	} else {  		/* old method */ -		if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { +		if (id && (TC_H_MAJ(id) == 0 || +			   !(TC_H_MAJ(id ^ tp->q->handle)))) {  			res->classid = id;  			res->class = 0;  			return 0; @@ -117,13 +92,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,  static unsigned long 
fw_get(struct tcf_proto *tp, u32 handle)  { -	struct fw_head *head = (struct fw_head*)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f;  	if (head == NULL)  		return 0; -	for (f=head->ht[fw_hash(handle)]; f; f=f->next) { +	for (f = head->ht[fw_hash(handle)]; f; f = f->next) {  		if (f->id == handle)  			return (unsigned long)f;  	} @@ -139,8 +114,7 @@ static int fw_init(struct tcf_proto *tp)  	return 0;  } -static inline void -fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)  {  	tcf_unbind_filter(tp, &f->res);  	tcf_exts_destroy(tp, &f->exts); @@ -156,8 +130,8 @@ static void fw_destroy(struct tcf_proto *tp)  	if (head == NULL)  		return; -	for (h=0; h<HTSIZE; h++) { -		while ((f=head->ht[h]) != NULL) { +	for (h = 0; h < HTSIZE; h++) { +		while ((f = head->ht[h]) != NULL) {  			head->ht[h] = f->next;  			fw_delete_filter(tp, f);  		} @@ -167,14 +141,14 @@ static void fw_destroy(struct tcf_proto *tp)  static int fw_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct fw_head *head = (struct fw_head*)tp->root; -	struct fw_filter *f = (struct fw_filter*)arg; +	struct fw_head *head = tp->root; +	struct fw_filter *f = (struct fw_filter *)arg;  	struct fw_filter **fp;  	if (head == NULL || f == NULL)  		goto out; -	for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { +	for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {  		if (*fp == f) {  			tcf_tree_lock(tp);  			*fp = f->next; @@ -194,19 +168,19 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {  };  static int -fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, -	struct nlattr **tb, struct nlattr **tca, unsigned long base) +fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, +	struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)  { -	struct fw_head *head = (struct fw_head *)tp->root; +	struct fw_head *head = tp->root;  	struct tcf_exts e;  	u32 mask;  	int err; -	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map); +	tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; -	err = -EINVAL;  	if (tb[TCA_FW_CLASSID]) {  		f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);  		tcf_bind_filter(tp, &f->res, base); @@ -214,12 +188,17 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,  #ifdef CONFIG_NET_CLS_IND  	if (tb[TCA_FW_INDEV]) { -		err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]); -		if (err < 0) +		int ret; +		ret = tcf_change_indev(net, tb[TCA_FW_INDEV]); +		if (ret < 0) { +			err = ret;  			goto errout; +		} +		f->ifindex = ret;  	}  #endif /* CONFIG_NET_CLS_IND */ +	err = -EINVAL;  	if (tb[TCA_FW_MASK]) {  		mask = nla_get_u32(tb[TCA_FW_MASK]);  		if (mask != head->mask) @@ -235,12 +214,13 @@ errout:  	return err;  } -static int fw_change(struct tcf_proto *tp, unsigned long base, +static int fw_change(struct net *net, struct sk_buff *in_skb, +		     struct tcf_proto *tp, unsigned long base,  		     u32 handle,  		     struct nlattr **tca, -		     unsigned long *arg) +		     unsigned long *arg, bool ovr)  { -	struct fw_head *head = (struct fw_head*)tp->root; +	struct fw_head *head = tp->root;  	struct fw_filter *f = (struct fw_filter *) *arg;  	struct nlattr *opt = tca[TCA_OPTIONS];  	struct nlattr *tb[TCA_FW_MAX + 1]; @@ -256,7 +236,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,  	if (f != NULL) {  		if (f->id != handle 
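cls_fw above stops sizing its hash table from PAGE_SIZE and fixes HTSIZE at 256; the new fw_hash() folds the 32-bit firewall mark down to one byte, which works out to xor-ing its four bytes (the old HTSIZE == 256 branch). Shown again with a worked, arbitrary value:

static u32 fw_hash(u32 handle)	/* as introduced above, HTSIZE == 256 */
{
	handle ^= (handle >> 16);
	handle ^= (handle >> 8);
	return handle % 256;
}

/*
 * e.g. handle = 0x12345678:
 *   handle ^= handle >> 16   -> 0x12345678 ^ 0x00001234 = 0x1234444c
 *   handle ^= handle >> 8    -> 0x1234444c ^ 0x00123444 = 0x12267008
 *   handle % 256             -> 0x08  (== 0x12 ^ 0x34 ^ 0x56 ^ 0x78)
 */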
&& handle)  			return -EINVAL; -		return fw_change_attrs(tp, f, tb, tca, base); +		return fw_change_attrs(net, tp, f, tb, tca, base, ovr);  	}  	if (!handle) @@ -281,9 +261,10 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,  	if (f == NULL)  		return -ENOBUFS; +	tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);  	f->id = handle; -	err = fw_change_attrs(tp, f, tb, tca, base); +	err = fw_change_attrs(net, tp, f, tb, tca, base, ovr);  	if (err < 0)  		goto errout; @@ -302,7 +283,7 @@ errout:  static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)  { -	struct fw_head *head = (struct fw_head*)tp->root; +	struct fw_head *head = tp->root;  	int h;  	if (head == NULL) @@ -328,11 +309,11 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int fw_dump(struct tcf_proto *tp, unsigned long fh, +static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		   struct sk_buff *skb, struct tcmsg *t)  { -	struct fw_head *head = (struct fw_head *)tp->root; -	struct fw_filter *f = (struct fw_filter*)fh; +	struct fw_head *head = tp->root; +	struct fw_filter *f = (struct fw_filter *)fh;  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; @@ -348,21 +329,27 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,  	if (nest == NULL)  		goto nla_put_failure; -	if (f->res.classid) -		NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid); +	if (f->res.classid && +	    nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) +		goto nla_put_failure;  #ifdef CONFIG_NET_CLS_IND -	if (strlen(f->indev)) -		NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev); +	if (f->ifindex) { +		struct net_device *dev; +		dev = __dev_get_by_index(net, f->ifindex); +		if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) +			goto nla_put_failure; +	}  #endif /* CONFIG_NET_CLS_IND */ -	if (head->mask != 0xFFFFFFFF) -		NLA_PUT_U32(skb, TCA_FW_MASK, head->mask); +	if (head->mask != 0xFFFFFFFF && +	    nla_put_u32(skb, TCA_FW_MASK, head->mask)) +		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 694dcd85dec..dd9fc2523c7 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -23,34 +23,30 @@  #include <net/pkt_cls.h>  /* -   1. For now we assume that route tags < 256. -      It allows to use direct table lookups, instead of hash tables. -   2. For now we assume that "from TAG" and "fromdev DEV" statements -      are mutually  exclusive. -   3. "to TAG from ANY" has higher priority, than "to ANY from XXX" + * 1. For now we assume that route tags < 256. + *    It allows to use direct table lookups, instead of hash tables. + * 2. For now we assume that "from TAG" and "fromdev DEV" statements + *    are mutually  exclusive. + * 3. 
"to TAG from ANY" has higher priority, than "to ANY from XXX"   */ -struct route4_fastmap -{ +struct route4_fastmap {  	struct route4_filter	*filter;  	u32			id;  	int			iif;  }; -struct route4_head -{ +struct route4_head {  	struct route4_fastmap	fastmap[16]; -	struct route4_bucket	*table[256+1]; +	struct route4_bucket	*table[256 + 1];  }; -struct route4_bucket -{ +struct route4_bucket {  	/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ -	struct route4_filter	*ht[16+16+1]; +	struct route4_filter	*ht[16 + 16 + 1];  }; -struct route4_filter -{ +struct route4_filter {  	struct route4_filter	*next;  	u32			id;  	int			iif; @@ -61,20 +57,15 @@ struct route4_filter  	struct route4_bucket	*bkt;  }; -#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) +#define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) -static const struct tcf_ext_map route_ext_map = { -	.police = TCA_ROUTE4_POLICE, -	.action = TCA_ROUTE4_ACT -}; - -static __inline__ int route4_fastmap_hash(u32 id, int iif) +static inline int route4_fastmap_hash(u32 id, int iif)  { -	return id&0xF; +	return id & 0xF;  } -static inline -void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) +static void +route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)  {  	spinlock_t *root_lock = qdisc_root_sleeping_lock(q); @@ -83,32 +74,33 @@ void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)  	spin_unlock_bh(root_lock);  } -static inline void +static void  route4_set_fastmap(struct route4_head *head, u32 id, int iif,  		   struct route4_filter *f)  {  	int h = route4_fastmap_hash(id, iif); +  	head->fastmap[h].id = id;  	head->fastmap[h].iif = iif;  	head->fastmap[h].filter = f;  } -static __inline__ int route4_hash_to(u32 id) +static inline int route4_hash_to(u32 id)  { -	return id&0xFF; +	return id & 0xFF;  } -static __inline__ int route4_hash_from(u32 id) +static inline int route4_hash_from(u32 id)  { -	return (id>>16)&0xF; +	return (id >> 16) & 0xF;  } -static __inline__ int route4_hash_iif(int iif) +static inline int route4_hash_iif(int iif)  { -	return 16 + ((iif>>16)&0xF); +	return 16 + ((iif >> 16) & 0xF);  } -static __inline__ int route4_hash_wild(void) +static inline int route4_hash_wild(void)  {  	return 32;  } @@ -128,24 +120,25 @@ static __inline__ int route4_hash_wild(void)  	return 0;						\  } -static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			   struct tcf_result *res)  { -	struct route4_head *head = (struct route4_head*)tp->root; +	struct route4_head *head = tp->root;  	struct dst_entry *dst;  	struct route4_bucket *b;  	struct route4_filter *f;  	u32 id, h;  	int iif, dont_cache = 0; -	if ((dst = skb_dst(skb)) == NULL) +	dst = skb_dst(skb); +	if (!dst)  		goto failure;  	id = dst->tclassid;  	if (head == NULL)  		goto old_method; -	iif = ((struct rtable*)dst)->fl.iif; +	iif = inet_iif(skb);  	h = route4_fastmap_hash(id, iif);  	if (id == head->fastmap[h].id && @@ -161,7 +154,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,  	h = route4_hash_to(id);  restart: -	if ((b = head->table[h]) != NULL) { +	b = head->table[h]; +	if (b) {  		for (f = b->ht[route4_hash_from(id)]; f; f = f->next)  			if (f->id == id)  				ROUTE4_APPLY_RESULT(); @@ -197,8 +191,9 @@ old_method:  static inline u32 to_hash(u32 id)  { -	u32 h = id&0xFF; -	if (id&0x8000) +	u32 h = id & 0xFF; + +	if (id & 0x8000)  		h += 256;  	return h;  } @@ -211,17 +206,17 @@ 
static inline u32 from_hash(u32 id)  	if (!(id & 0x8000)) {  		if (id > 255)  			return 256; -		return id&0xF; +		return id & 0xF;  	} -	return 16 + (id&0xF); +	return 16 + (id & 0xF);  }  static unsigned long route4_get(struct tcf_proto *tp, u32 handle)  { -	struct route4_head *head = (struct route4_head*)tp->root; +	struct route4_head *head = tp->root;  	struct route4_bucket *b;  	struct route4_filter *f; -	unsigned h1, h2; +	unsigned int h1, h2;  	if (!head)  		return 0; @@ -230,11 +225,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle)  	if (h1 > 256)  		return 0; -	h2 = from_hash(handle>>16); +	h2 = from_hash(handle >> 16);  	if (h2 > 32)  		return 0; -	if ((b = head->table[h1]) != NULL) { +	b = head->table[h1]; +	if (b) {  		for (f = b->ht[h2]; f; f = f->next)  			if (f->handle == handle)  				return (unsigned long)f; @@ -251,7 +247,7 @@ static int route4_init(struct tcf_proto *tp)  	return 0;  } -static inline void +static void  route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)  {  	tcf_unbind_filter(tp, &f->res); @@ -267,11 +263,12 @@ static void route4_destroy(struct tcf_proto *tp)  	if (head == NULL)  		return; -	for (h1=0; h1<=256; h1++) { +	for (h1 = 0; h1 <= 256; h1++) {  		struct route4_bucket *b; -		if ((b = head->table[h1]) != NULL) { -			for (h2=0; h2<=32; h2++) { +		b = head->table[h1]; +		if (b) { +			for (h2 = 0; h2 <= 32; h2++) {  				struct route4_filter *f;  				while ((f = b->ht[h2]) != NULL) { @@ -287,9 +284,9 @@ static void route4_destroy(struct tcf_proto *tp)  static int route4_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct route4_head *head = (struct route4_head*)tp->root; -	struct route4_filter **fp, *f = (struct route4_filter*)arg; -	unsigned h = 0; +	struct route4_head *head = tp->root; +	struct route4_filter **fp, *f = (struct route4_filter *)arg; +	unsigned int h = 0;  	struct route4_bucket *b;  	int i; @@ -299,7 +296,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)  	h = f->handle;  	b = f->bkt; -	for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { +	for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {  		if (*fp == f) {  			tcf_tree_lock(tp);  			*fp = f->next; @@ -310,7 +307,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)  			/* Strip tree */ -			for (i=0; i<=32; i++) +			for (i = 0; i <= 32; i++)  				if (b->ht[i])  					return 0; @@ -333,9 +330,11 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {  	[TCA_ROUTE4_IIF]	= { .type = NLA_U32 },  }; -static int route4_set_parms(struct tcf_proto *tp, unsigned long base, -	struct route4_filter *f, u32 handle, struct route4_head *head, -	struct nlattr **tb, struct nlattr *est, int new) +static int route4_set_parms(struct net *net, struct tcf_proto *tp, +			    unsigned long base, struct route4_filter *f, +			    u32 handle, struct route4_head *head, +			    struct nlattr **tb, struct nlattr *est, int new, +			    bool ovr)  {  	int err;  	u32 id = 0, to = 0, nhandle = 0x8000; @@ -344,7 +343,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,  	struct route4_bucket *b;  	struct tcf_exts e; -	err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); +	tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); +	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);  	if (err < 0)  		return err; @@ -380,7 +380,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,  	}  	h1 = to_hash(nhandle); -	if ((b = head->table[h1]) == NULL) { +	b = 
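For cls_route, the handle packs both halves of the match: the low 16 bits carry the "to" selector and the high 16 bits the "from"/"fromdev" selector, and to_hash()/from_hash() map them onto the 256+1 bucket table and the 16 FROM + 16 IIF + 1 wildcard slots per bucket. The sketch below reassembles the two helpers from the hunk and its unchanged context (the 0xFFFF wildcard check sits outside the lines shown); the decoded handle is illustrative.

static inline u32 to_hash(u32 id)
{
	u32 h = id & 0xFF;

	if (id & 0x8000)		/* "to ANY" bucket */
		h += 256;
	return h;
}

static inline u32 from_hash(u32 id)
{
	id &= 0xFFFF;
	if (id == 0xFFFF)		/* "from ANY" wildcard slot */
		return 32;
	if (!(id & 0x8000)) {		/* "from TAG": slots 0..15 */
		if (id > 255)
			return 256;
		return id & 0xF;
	}
	return 16 + (id & 0xF);		/* "fromdev DEV": slots 16..31 */
}

/*
 * e.g. "to 0x12 from 3" yields handle 0x00030012:
 *   to_hash(0x00030012)        = 0x12 -> head->table[18]
 *   from_hash(0x00030012 >> 16) = 3   -> b->ht[3]
 */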
head->table[h1]; +	if (!b) {  		err = -ENOBUFS;  		b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);  		if (b == NULL) @@ -391,6 +392,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,  		tcf_tree_unlock(tp);  	} else {  		unsigned int h2 = from_hash(nhandle >> 16); +  		err = -EEXIST;  		for (fp = b->ht[h2]; fp; fp = fp->next)  			if (fp->handle == f->handle) @@ -423,10 +425,11 @@ errout:  	return err;  } -static int route4_change(struct tcf_proto *tp, unsigned long base, +static int route4_change(struct net *net, struct sk_buff *in_skb, +		       struct tcf_proto *tp, unsigned long base,  		       u32 handle,  		       struct nlattr **tca, -		       unsigned long *arg) +		       unsigned long *arg, bool ovr)  {  	struct route4_head *head = tp->root;  	struct route4_filter *f, *f1, **fp; @@ -444,15 +447,16 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,  	if (err < 0)  		return err; -	if ((f = (struct route4_filter*)*arg) != NULL) { +	f = (struct route4_filter *)*arg; +	if (f) {  		if (f->handle != handle && handle)  			return -EINVAL;  		if (f->bkt)  			old_handle = f->handle; -		err = route4_set_parms(tp, base, f, handle, head, tb, -			tca[TCA_RATE], 0); +		err = route4_set_parms(net, tp, base, f, handle, head, tb, +			tca[TCA_RATE], 0, ovr);  		if (err < 0)  			return err; @@ -474,14 +478,15 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,  	if (f == NULL)  		goto errout; -	err = route4_set_parms(tp, base, f, handle, head, tb, -		tca[TCA_RATE], 1); +	tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); +	err = route4_set_parms(net, tp, base, f, handle, head, tb, +		tca[TCA_RATE], 1, ovr);  	if (err < 0)  		goto errout;  reinsert:  	h = from_hash(f->handle >> 16); -	for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) +	for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)  		if (f->handle < f1->handle)  			break; @@ -492,7 +497,8 @@ reinsert:  	if (old_handle && f->handle != old_handle) {  		th = to_hash(old_handle);  		h = from_hash(old_handle >> 16); -		if ((b = head->table[th]) != NULL) { +		b = head->table[th]; +		if (b) {  			for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {  				if (*fp == f) {  					*fp = f->next; @@ -515,7 +521,7 @@ errout:  static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)  {  	struct route4_head *head = tp->root; -	unsigned h, h1; +	unsigned int h, h1;  	if (head == NULL)  		arg->stop = 1; @@ -546,10 +552,10 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int route4_dump(struct tcf_proto *tp, unsigned long fh, +static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		       struct sk_buff *skb, struct tcmsg *t)  { -	struct route4_filter *f = (struct route4_filter*)fh; +	struct route4_filter *f = (struct route4_filter *)fh;  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest;  	u32 id; @@ -563,26 +569,30 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh,  	if (nest == NULL)  		goto nla_put_failure; -	if (!(f->handle&0x8000)) { -		id = f->id&0xFF; -		NLA_PUT_U32(skb, TCA_ROUTE4_TO, id); +	if (!(f->handle & 0x8000)) { +		id = f->id & 0xFF; +		if (nla_put_u32(skb, TCA_ROUTE4_TO, id)) +			goto nla_put_failure;  	} -	if (f->handle&0x80000000) { -		if ((f->handle>>16) != 0xFFFF) -			NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif); +	if (f->handle & 0x80000000) { +		if ((f->handle >> 16) != 0xFFFF && +		    nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif)) +			goto 
nla_put_failure;  	} else { -		id = f->id>>16; -		NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id); +		id = f->id >> 16; +		if (nla_put_u32(skb, TCA_ROUTE4_FROM, id)) +			goto nla_put_failure;  	} -	if (f->res.classid) -		NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid); +	if (f->res.classid && +	    nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) +		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 425a1790b04..1020e233a5d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -66,28 +66,25 @@     powerful classification engine.  */ -struct rsvp_head -{ +struct rsvp_head {  	u32			tmap[256/32];  	u32			hgenerator;  	u8			tgenerator;  	struct rsvp_session	*ht[256];  }; -struct rsvp_session -{ +struct rsvp_session {  	struct rsvp_session	*next;  	__be32			dst[RSVP_DST_LEN];  	struct tc_rsvp_gpi 	dpi;  	u8			protocol;  	u8			tunnelid;  	/* 16 (src,sport) hash slots, and one wildcard source slot */ -	struct rsvp_filter	*ht[16+1]; +	struct rsvp_filter	*ht[16 + 1];  }; -struct rsvp_filter -{ +struct rsvp_filter {  	struct rsvp_filter	*next;  	__be32			src[RSVP_DST_LEN];  	struct tc_rsvp_gpi	spi; @@ -100,28 +97,25 @@ struct rsvp_filter  	struct rsvp_session	*sess;  }; -static __inline__ unsigned hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) +static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)  { -	unsigned h = (__force __u32)dst[RSVP_DST_LEN-1]; +	unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; +  	h ^= h>>16;  	h ^= h>>8;  	return (h ^ protocol ^ tunnelid) & 0xFF;  } -static __inline__ unsigned hash_src(__be32 *src) +static inline unsigned int hash_src(__be32 *src)  { -	unsigned h = (__force __u32)src[RSVP_DST_LEN-1]; +	unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; +  	h ^= h>>16;  	h ^= h>>8;  	h ^= h>>4;  	return h & 0xF;  } -static struct tcf_ext_map rsvp_ext_map = { -	.police = TCA_RSVP_POLICE, -	.action = TCA_RSVP_ACT -}; -  #define RSVP_APPLY_RESULT()				\  {							\  	int r = tcf_exts_exec(skb, &f->exts, res);	\ @@ -131,13 +125,13 @@ static struct tcf_ext_map rsvp_ext_map = {  		return r;				\  } -static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			 struct tcf_result *res)  { -	struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; +	struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;  	struct rsvp_session *s;  	struct rsvp_filter *f; -	unsigned h1, h2; +	unsigned int h1, h2;  	__be32 *dst, *src;  	u8 protocol;  	u8 tunnelid = 0; @@ -162,13 +156,13 @@ restart:  	src = &nhptr->saddr.s6_addr32[0];  	dst = &nhptr->daddr.s6_addr32[0];  	protocol = nhptr->nexthdr; -	xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); +	xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);  #else  	src = &nhptr->saddr;  	dst = &nhptr->daddr;  	protocol = nhptr->protocol; -	xprt = ((u8*)nhptr) + (nhptr->ihl<<2); -	if (nhptr->frag_off & htons(IP_MF|IP_OFFSET)) +	xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); +	if (ip_is_fragment(nhptr))  		return -1;  #endif @@ -176,10 +170,10 @@ restart:  	h2 = hash_src(src);  	for (s = sht[h1]; s; s = s->next) { -		if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && +		if (dst[RSVP_DST_LEN-1] == 
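/*
 * Annotation (editor's illustrative sketch, not taken from the kernel sources
 * above): the route4_dump() hunk just shown, and the same change repeated in
 * every classifier dump function later in this diff, replaces the NLA_PUT*()
 * macros with explicit "if (nla_put*(...)) goto nla_put_failure;" tests.  The
 * old macros performed that jump internally, which is why the removed call
 * sites show no visible error handling.  The stand-alone example below uses an
 * invented emit_attr() helper purely to contrast the two styles.
 */
#include <stdio.h>

/* Stand-in for nla_put(): 0 on success, -1 when the buffer is exhausted. */
static int emit_attr(int *room, int len)
{
	if (*room < len)
		return -1;
	*room -= len;
	return 0;
}

/* Old style: the jump to the cleanup label is hidden inside the macro. */
#define EMIT_ATTR(room, len)			\
	do {					\
		if (emit_attr(room, len) < 0)	\
			goto emit_failure;	\
	} while (0)

static int dump_old(int room)
{
	EMIT_ATTR(&room, 4);	/* no visible error check at the call site */
	EMIT_ATTR(&room, 8);
	return room;
emit_failure:
	return -1;
}

/* New style, as in this diff: the check and the goto are spelled out. */
static int dump_new(int room)
{
	if (emit_attr(&room, 4) ||
	    emit_attr(&room, 8))
		goto emit_failure;
	return room;
emit_failure:
	return -1;
}

int main(void)
{
	printf("%d %d\n", dump_old(16), dump_new(4));
	return 0;
}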
s->dst[RSVP_DST_LEN - 1] &&  		    protocol == s->protocol &&  		    !(s->dpi.mask & -		      (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) && +		      (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&  #if RSVP_DST_LEN == 4  		    dst[0] == s->dst[0] &&  		    dst[1] == s->dst[1] && @@ -188,8 +182,8 @@ restart:  		    tunnelid == s->tunnelid) {  			for (f = s->ht[h2]; f; f = f->next) { -				if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && -				    !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) +				if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && +				    !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))  #if RSVP_DST_LEN == 4  				    &&  				    src[0] == f->src[0] && @@ -205,7 +199,7 @@ matched:  						return 0;  					tunnelid = f->res.classid; -					nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); +					nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));  					goto restart;  				}  			} @@ -224,11 +218,11 @@ matched:  static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)  { -	struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; +	struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;  	struct rsvp_session *s;  	struct rsvp_filter *f; -	unsigned h1 = handle&0xFF; -	unsigned h2 = (handle>>8)&0xFF; +	unsigned int h1 = handle & 0xFF; +	unsigned int h2 = (handle >> 8) & 0xFF;  	if (h2 > 16)  		return 0; @@ -258,7 +252,7 @@ static int rsvp_init(struct tcf_proto *tp)  	return -ENOBUFS;  } -static inline void +static void  rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)  {  	tcf_unbind_filter(tp, &f->res); @@ -277,13 +271,13 @@ static void rsvp_destroy(struct tcf_proto *tp)  	sht = data->ht; -	for (h1=0; h1<256; h1++) { +	for (h1 = 0; h1 < 256; h1++) {  		struct rsvp_session *s;  		while ((s = sht[h1]) != NULL) {  			sht[h1] = s->next; -			for (h2=0; h2<=16; h2++) { +			for (h2 = 0; h2 <= 16; h2++) {  				struct rsvp_filter *f;  				while ((f = s->ht[h2]) != NULL) { @@ -299,13 +293,13 @@ static void rsvp_destroy(struct tcf_proto *tp)  static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; -	unsigned h = f->handle; +	struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg; +	unsigned int h = f->handle;  	struct rsvp_session **sp;  	struct rsvp_session *s = f->sess;  	int i; -	for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { +	for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) {  		if (*fp == f) {  			tcf_tree_lock(tp);  			*fp = f->next; @@ -314,12 +308,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)  			/* Strip tree */ -			for (i=0; i<=16; i++) +			for (i = 0; i <= 16; i++)  				if (s->ht[i])  					return 0;  			/* OK, session has no flows */ -			for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; +			for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF];  			     *sp; sp = &(*sp)->next) {  				if (*sp == s) {  					tcf_tree_lock(tp); @@ -337,13 +331,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)  	return 0;  } -static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)  {  	struct rsvp_head *data = tp->root;  	int i = 0xFFFF;  	while (i-- > 0) {  		u32 h; +  		if ((data->hgenerator += 0x10000) == 0)  			data->hgenerator = 0x10000;  		h = data->hgenerator|salt; @@ -355,10 +350,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)  static int tunnel_bts(struct rsvp_head *data)  { -	int n 
= data->tgenerator>>5; -	u32 b = 1<<(data->tgenerator&0x1F); +	int n = data->tgenerator >> 5; +	u32 b = 1 << (data->tgenerator & 0x1F); -	if (data->tmap[n]&b) +	if (data->tmap[n] & b)  		return 0;  	data->tmap[n] |= b;  	return 1; @@ -372,10 +367,10 @@ static void tunnel_recycle(struct rsvp_head *data)  	memset(tmap, 0, sizeof(tmap)); -	for (h1=0; h1<256; h1++) { +	for (h1 = 0; h1 < 256; h1++) {  		struct rsvp_session *s;  		for (s = sht[h1]; s; s = s->next) { -			for (h2=0; h2<=16; h2++) { +			for (h2 = 0; h2 <= 16; h2++) {  				struct rsvp_filter *f;  				for (f = s->ht[h2]; f; f = f->next) { @@ -395,8 +390,8 @@ static u32 gen_tunnel(struct rsvp_head *data)  {  	int i, k; -	for (k=0; k<2; k++) { -		for (i=255; i>0; i--) { +	for (k = 0; k < 2; k++) { +		for (i = 255; i > 0; i--) {  			if (++data->tgenerator == 0)  				data->tgenerator = 1;  			if (tunnel_bts(data)) @@ -416,19 +411,20 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {  	[TCA_RSVP_PINFO]	= { .len = sizeof(struct tc_rsvp_pinfo) },  }; -static int rsvp_change(struct tcf_proto *tp, unsigned long base, +static int rsvp_change(struct net *net, struct sk_buff *in_skb, +		       struct tcf_proto *tp, unsigned long base,  		       u32 handle,  		       struct nlattr **tca, -		       unsigned long *arg) +		       unsigned long *arg, bool ovr)  {  	struct rsvp_head *data = tp->root;  	struct rsvp_filter *f, **fp;  	struct rsvp_session *s, **sp;  	struct tc_rsvp_pinfo *pinfo = NULL; -	struct nlattr *opt = tca[TCA_OPTIONS-1]; +	struct nlattr *opt = tca[TCA_OPTIONS];  	struct nlattr *tb[TCA_RSVP_MAX + 1];  	struct tcf_exts e; -	unsigned h1, h2; +	unsigned int h1, h2;  	__be32 *dst;  	int err; @@ -439,17 +435,19 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,  	if (err < 0)  		return err; -	err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); +	tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); +	err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);  	if (err < 0)  		return err; -	if ((f = (struct rsvp_filter*)*arg) != NULL) { +	f = (struct rsvp_filter *)*arg; +	if (f) {  		/* Node exists: adjust only classid */  		if (f->handle != handle && handle)  			goto errout2; -		if (tb[TCA_RSVP_CLASSID-1]) { -			f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]); +		if (tb[TCA_RSVP_CLASSID]) { +			f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);  			tcf_bind_filter(tp, &f->res, base);  		} @@ -461,7 +459,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,  	err = -EINVAL;  	if (handle)  		goto errout2; -	if (tb[TCA_RSVP_DST-1] == NULL) +	if (tb[TCA_RSVP_DST] == NULL)  		goto errout2;  	err = -ENOBUFS; @@ -469,20 +467,21 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,  	if (f == NULL)  		goto errout2; +	tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);  	h2 = 16; -	if (tb[TCA_RSVP_SRC-1]) { -		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); +	if (tb[TCA_RSVP_SRC]) { +		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));  		h2 = hash_src(f->src);  	} -	if (tb[TCA_RSVP_PINFO-1]) { -		pinfo = nla_data(tb[TCA_RSVP_PINFO-1]); +	if (tb[TCA_RSVP_PINFO]) { +		pinfo = nla_data(tb[TCA_RSVP_PINFO]);  		f->spi = pinfo->spi;  		f->tunnelhdr = pinfo->tunnelhdr;  	} -	if (tb[TCA_RSVP_CLASSID-1]) -		f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID-1]); +	if (tb[TCA_RSVP_CLASSID]) +		f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); -	dst = nla_data(tb[TCA_RSVP_DST-1]); +	dst = nla_data(tb[TCA_RSVP_DST]);  	h1 = hash_dst(dst, pinfo ? 
pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);  	err = -ENOMEM; @@ -500,7 +499,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,  			goto errout;  	} -	for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { +	for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) {  		if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&  		    pinfo && pinfo->protocol == s->protocol &&  		    memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && @@ -523,7 +522,7 @@ insert:  			tcf_exts_change(tp, &f->exts, &e);  			for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) -				if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) +				if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask)  					break;  			f->next = *fp;  			wmb(); @@ -567,7 +566,7 @@ errout2:  static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)  {  	struct rsvp_head *head = tp->root; -	unsigned h, h1; +	unsigned int h, h1;  	if (arg->stop)  		return; @@ -595,10 +594,10 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, +static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		     struct sk_buff *skb, struct tcmsg *t)  { -	struct rsvp_filter *f = (struct rsvp_filter*)fh; +	struct rsvp_filter *f = (struct rsvp_filter *)fh;  	struct rsvp_session *s;  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; @@ -614,25 +613,29 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,  	if (nest == NULL)  		goto nla_put_failure; -	NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); +	if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) +		goto nla_put_failure;  	pinfo.dpi = s->dpi;  	pinfo.spi = f->spi;  	pinfo.protocol = s->protocol;  	pinfo.tunnelid = s->tunnelid;  	pinfo.tunnelhdr = f->tunnelhdr;  	pinfo.pad = 0; -	NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); -	if (f->res.classid) -		NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid); -	if (((f->handle>>8)&0xFF) != 16) -		NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); +	if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) +		goto nla_put_failure; +	if (f->res.classid && +	    nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) +		goto nla_put_failure; +	if (((f->handle >> 8) & 0xFF) != 16 && +	    nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) +		goto nla_put_failure; -	if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) +	if (tcf_exts_dump(skb, &f->exts) < 0)  		goto nla_put_failure;  	nla_nest_end(skb, nest); -	if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) +	if (tcf_exts_dump_stats(skb, &f->exts) < 0)  		goto nla_put_failure;  	return skb->len; @@ -641,8 +644,7 @@ nla_put_failure:  	return -1;  } -static struct tcf_proto_ops RSVP_OPS = { -	.next		=	NULL, +static struct tcf_proto_ops RSVP_OPS __read_mostly = {  	.kind		=	RSVP_ID,  	.classify	=	rsvp_classify,  	.init		=	rsvp_init, diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 20ef330bb91..c721cd4a469 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -24,9 +24,6 @@  #define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */ -#define	PRIV(tp)	((struct tcindex_data *) (tp)->root) - -  struct tcindex_filter_result {  	struct tcf_exts		exts;  	struct tcf_result	res; @@ -50,11 +47,6 @@ struct tcindex_data {  	int fall_through;	/* 0: only classify if explicit match */  }; -static const struct tcf_ext_map tcindex_ext_map = { -	.police = TCA_TCINDEX_POLICE, -	.action = TCA_TCINDEX_ACT -}; -  static inline int  
tcindex_filter_is_set(struct tcindex_filter_result *r)  { @@ -79,10 +71,10 @@ tcindex_lookup(struct tcindex_data *p, u16 key)  } -static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, +static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			    struct tcf_result *res)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *f;  	int key = (skb->tc_index & p->mask) >> p->shift; @@ -107,7 +99,7 @@ static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,  static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *r;  	pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); @@ -145,7 +137,7 @@ static int tcindex_init(struct tcf_proto *tp)  static int  __tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;  	struct tcindex_filter *f = NULL; @@ -196,10 +188,17 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {  	[TCA_TCINDEX_CLASSID]		= { .type = NLA_U32 },  }; +static void tcindex_filter_result_init(struct tcindex_filter_result *r) +{ +	memset(r, 0, sizeof(*r)); +	tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +} +  static int -tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, -		  struct tcindex_data *p, struct tcindex_filter_result *r, -		  struct nlattr **tb, struct nlattr *est) +tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, +		  u32 handle, struct tcindex_data *p, +		  struct tcindex_filter_result *r, struct nlattr **tb, +		  struct nlattr *est, bool ovr)  {  	int err, balloc = 0;  	struct tcindex_filter_result new_filter_result, *old_r = r; @@ -208,17 +207,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,  	struct tcindex_filter *f = NULL; /* make gcc behave */  	struct tcf_exts e; -	err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); +	tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); +	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);  	if (err < 0)  		return err;  	memcpy(&cp, p, sizeof(cp)); -	memset(&new_filter_result, 0, sizeof(new_filter_result)); +	tcindex_filter_result_init(&new_filter_result); +	tcindex_filter_result_init(&cr);  	if (old_r) -		memcpy(&cr, r, sizeof(cr)); -	else -		memset(&cr, 0, sizeof(cr)); +		cr.res = r->res;  	if (tb[TCA_TCINDEX_HASH])  		cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -249,7 +248,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,  		 * of the hashing index is below the threshold.  		 
*/  		if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) -			cp.hash = (cp.mask >> cp.shift)+1; +			cp.hash = (cp.mask >> cp.shift) + 1;  		else  			cp.hash = DEFAULT_HASH_SIZE;  	} @@ -270,9 +269,14 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,  	err = -ENOMEM;  	if (!cp.perfect && !cp.h) {  		if (valid_perfect_hash(&cp)) { +			int i; +  			cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);  			if (!cp.perfect)  				goto errout; +			for (i = 0; i < cp.hash; i++) +				tcf_exts_init(&cp.perfect[i].exts, TCA_TCINDEX_ACT, +					      TCA_TCINDEX_POLICE);  			balloc = 1;  		} else {  			cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL); @@ -298,14 +302,17 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,  		tcf_bind_filter(tp, &cr.res, base);  	} -	tcf_exts_change(tp, &cr.exts, &e); +	if (old_r) +		tcf_exts_change(tp, &r->exts, &e); +	else +		tcf_exts_change(tp, &cr.exts, &e);  	tcf_tree_lock(tp);  	if (old_r && old_r != r) -		memset(old_r, 0, sizeof(*old_r)); +		tcindex_filter_result_init(old_r);  	memcpy(p, &cp, sizeof(cp)); -	memcpy(r, &cr, sizeof(cr)); +	r->res = cr.res;  	if (r == &new_filter_result) {  		struct tcindex_filter **fp; @@ -332,12 +339,13 @@ errout:  }  static int -tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, -	       struct nlattr **tca, unsigned long *arg) +tcindex_change(struct net *net, struct sk_buff *in_skb, +	       struct tcf_proto *tp, unsigned long base, u32 handle, +	       struct nlattr **tca, unsigned long *arg, bool ovr)  {  	struct nlattr *opt = tca[TCA_OPTIONS];  	struct nlattr *tb[TCA_TCINDEX_MAX + 1]; -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;  	int err; @@ -352,13 +360,14 @@ tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,  	if (err < 0)  		return err; -	return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]); +	return tcindex_set_parms(net, tp, base, handle, p, r, tb, +				 tca[TCA_RATE], ovr);  }  static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter *f, *next;  	int i; @@ -405,7 +414,7 @@ static int tcindex_destroy_element(struct tcf_proto *tp,  static void tcindex_destroy(struct tcf_proto *tp)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcf_walker walker;  	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); @@ -420,10 +429,10 @@ static void tcindex_destroy(struct tcf_proto *tp)  } -static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, +static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,      struct sk_buff *skb, struct tcmsg *t)  { -	struct tcindex_data *p = PRIV(tp); +	struct tcindex_data *p = tp->root;  	struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;  	unsigned char *b = skb_tail_pointer(skb);  	struct nlattr *nest; @@ -438,10 +447,11 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,  	if (!fh) {  		t->tcm_handle = ~0; /* whatever ... 
*/ -		NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash); -		NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask); -		NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift); -		NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through); +		if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || +		    nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || +		    nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || +		    nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) +			goto nla_put_failure;  		nla_nest_end(skb, nest);  	} else {  		if (p->perfect) { @@ -460,14 +470,15 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,  			}  		}  		pr_debug("handle = %d\n", t->tcm_handle); -		if (r->res.class) -			NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid); +		if (r->res.class && +		    nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) +			goto nla_put_failure; -		if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) +		if (tcf_exts_dump(skb, &r->exts) < 0)  			goto nla_put_failure;  		nla_nest_end(skb, nest); -		if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) +		if (tcf_exts_dump_stats(skb, &r->exts) < 0)  			goto nla_put_failure;  	} diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b0c2a82178a..70c0be8d012 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -38,18 +38,18 @@  #include <linux/errno.h>  #include <linux/rtnetlink.h>  #include <linux/skbuff.h> +#include <linux/bitmap.h>  #include <net/netlink.h>  #include <net/act_api.h>  #include <net/pkt_cls.h> -struct tc_u_knode -{ +struct tc_u_knode {  	struct tc_u_knode	*next;  	u32			handle;  	struct tc_u_hnode	*ht_up;  	struct tcf_exts		exts;  #ifdef CONFIG_NET_CLS_IND -	char                     indev[IFNAMSIZ]; +	int			ifindex;  #endif  	u8			fshift;  	struct tcf_result	res; @@ -63,45 +63,40 @@ struct tc_u_knode  	struct tc_u32_sel	sel;  }; -struct tc_u_hnode -{ +struct tc_u_hnode {  	struct tc_u_hnode	*next;  	u32			handle;  	u32			prio;  	struct tc_u_common	*tp_c;  	int			refcnt; -	unsigned		divisor; +	unsigned int		divisor;  	struct tc_u_knode	*ht[1];  }; -struct tc_u_common -{ +struct tc_u_common {  	struct tc_u_hnode	*hlist;  	struct Qdisc		*q;  	int			refcnt;  	u32			hgenerator;  }; -static const struct tcf_ext_map u32_ext_map = { -	.action = TCA_U32_ACT, -	.police = TCA_U32_POLICE -}; - -static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift) +static inline unsigned int u32_hash_fold(__be32 key, +					 const struct tc_u32_sel *sel, +					 u8 fshift)  { -	unsigned h = ntohl(key & sel->hmask)>>fshift; +	unsigned int h = ntohl(key & sel->hmask) >> fshift;  	return h;  } -static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)  {  	struct {  		struct tc_u_knode *knode;  		unsigned int	  off;  	} stack[TC_U32_MAXDEPTH]; -	struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; +	struct tc_u_hnode *ht = tp->root;  	unsigned int off = skb_network_offset(skb);  	struct tc_u_knode *n;  	int sdepth = 0; @@ -120,7 +115,7 @@ next_knode:  		struct tc_u32_key *key = n->sel.keys;  #ifdef CONFIG_CLS_U32_PERF -		n->pf->rcnt +=1; +		n->pf->rcnt += 1;  		j = 0;  #endif @@ -133,14 +128,14 @@ next_knode:  		}  #endif -		for (i = n->sel.nkeys; i>0; i--, key++) { +		for (i = n->sel.nkeys; i > 0; i--, key++) {  			int toff = off + key->off + (off2 & key->offmask); -			__be32 *data, _data; +			__be32 *data, hdata;  			if (skb_headroom(skb) + toff > INT_MAX)  				
goto out; -			data = skb_header_pointer(skb, toff, 4, &_data); +			data = skb_header_pointer(skb, toff, 4, &hdata);  			if (!data)  				goto out;  			if ((*data ^ key->val) & key->mask) { @@ -148,23 +143,23 @@ next_knode:  				goto next_knode;  			}  #ifdef CONFIG_CLS_U32_PERF -			n->pf->kcnts[j] +=1; +			n->pf->kcnts[j] += 1;  			j++;  #endif  		}  		if (n->ht_down == NULL) {  check_terminal: -			if (n->sel.flags&TC_U32_TERMINAL) { +			if (n->sel.flags & TC_U32_TERMINAL) {  				*res = n->res;  #ifdef CONFIG_NET_CLS_IND -				if (!tcf_match_indev(skb, n->indev)) { +				if (!tcf_match_indev(skb, n->ifindex)) {  					n = n->next;  					goto next_knode;  				}  #endif  #ifdef CONFIG_CLS_U32_PERF -				n->pf->rhit +=1; +				n->pf->rhit += 1;  #endif  				r = tcf_exts_exec(skb, &n->exts, res);  				if (r < 0) { @@ -188,26 +183,26 @@ check_terminal:  		ht = n->ht_down;  		sel = 0;  		if (ht->divisor) { -			__be32 *data, _data; +			__be32 *data, hdata;  			data = skb_header_pointer(skb, off + n->sel.hoff, 4, -						  &_data); +						  &hdata);  			if (!data)  				goto out;  			sel = ht->divisor & u32_hash_fold(*data, &n->sel,  							  n->fshift);  		} -		if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) +		if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))  			goto next_ht; -		if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { +		if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {  			off2 = n->sel.off + 3;  			if (n->sel.flags & TC_U32_VAROFFSET) { -				__be16 *data, _data; +				__be16 *data, hdata;  				data = skb_header_pointer(skb,  							  off + n->sel.offoff, -							  2, &_data); +							  2, &hdata);  				if (!data)  					goto out;  				off2 += ntohs(n->sel.offmask & *data) >> @@ -215,7 +210,7 @@ check_terminal:  			}  			off2 &= ~3;  		} -		if (n->sel.flags&TC_U32_EAT) { +		if (n->sel.flags & TC_U32_EAT) {  			off += off2;  			off2 = 0;  		} @@ -235,12 +230,11 @@ out:  	return -1;  deadloop: -	if (net_ratelimit()) -		printk(KERN_WARNING "cls_u32: dead loop\n"); +	net_warn_ratelimited("cls_u32: dead loop\n");  	return -1;  } -static __inline__ struct tc_u_hnode * +static struct tc_u_hnode *  u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)  {  	struct tc_u_hnode *ht; @@ -252,10 +246,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)  	return ht;  } -static __inline__ struct tc_u_knode * +static struct tc_u_knode *  u32_lookup_key(struct tc_u_hnode *ht, u32 handle)  { -	unsigned sel; +	unsigned int sel;  	struct tc_u_knode *n = NULL;  	sel = TC_U32_HASH(handle); @@ -300,7 +294,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c)  	do {  		if (++tp_c->hgenerator == 0x7FF)  			tp_c->hgenerator = 1; -	} while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); +	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));  	return i > 0 ? 
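/*
 * Annotation (editor's sketch, invented names, not kernel code): the
 * u32_classify() hunk above walks an array of keys, loads 32 bits of packet
 * data at each key's offset and rejects the node as soon as
 * (word ^ key->val) & key->mask is non-zero; a matching node with a hash
 * table below it then folds one packet word into a bucket index.  The
 * simplified userspace version below ignores the variable-offset
 * (offmask/off2) handling shown in the diff.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* One simplified match key: compare 32 bits at a byte offset. */
struct u32_key_sketch {
	int      off;	/* byte offset into the packet */
	uint32_t mask;	/* which bits matter (network byte order) */
	uint32_t val;	/* expected value under the mask (network byte order) */
};

/* Return 1 if every key matches the packet, 0 otherwise. */
static int u32_keys_match(const uint8_t *pkt, int len,
			  const struct u32_key_sketch *keys, int nkeys)
{
	uint32_t word;
	int i;

	for (i = 0; i < nkeys; i++) {
		if (keys[i].off + 4 > len)
			return 0;
		memcpy(&word, pkt + keys[i].off, 4);
		if ((word ^ keys[i].val) & keys[i].mask)
			return 0;
	}
	return 1;
}

/*
 * Bucket selection in the style of u32_hash_fold(): mask the word, convert
 * to host order, shift.  The classifier then ANDs the result with the hash
 * table's divisor to pick the bucket.
 */
static unsigned int u32_fold_sketch(uint32_t word, uint32_t hmask, int fshift)
{
	return ntohl(word & hmask) >> fshift;
}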
(tp_c->hgenerator|0x800)<<20 : 0;  } @@ -354,7 +348,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)  	return 0;  } -static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)  {  	struct tc_u_knode **kp;  	struct tc_u_hnode *ht = key->ht_up; @@ -378,9 +372,9 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)  static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)  {  	struct tc_u_knode *n; -	unsigned h; +	unsigned int h; -	for (h=0; h<=ht->divisor; h++) { +	for (h = 0; h <= ht->divisor; h++) {  		while ((n = ht->ht[h]) != NULL) {  			ht->ht[h] = n->next; @@ -446,13 +440,13 @@ static void u32_destroy(struct tcf_proto *tp)  static int u32_delete(struct tcf_proto *tp, unsigned long arg)  { -	struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; +	struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;  	if (ht == NULL)  		return 0;  	if (TC_U32_KEY(ht->handle)) -		return u32_delete_key(tp, (struct tc_u_knode*)ht); +		return u32_delete_key(tp, (struct tc_u_knode *)ht);  	if (tp->root == ht)  		return -EINVAL; @@ -467,17 +461,25 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)  	return 0;  } +#define NR_U32_NODE (1<<12)  static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)  {  	struct tc_u_knode *n; -	unsigned i = 0x7FF; +	unsigned long i; +	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), +					GFP_KERNEL); +	if (!bitmap) +		return handle | 0xFFF; + +	for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) +		set_bit(TC_U32_NODE(n->handle), bitmap); -	for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) -		if (i < TC_U32_NODE(n->handle)) -			i = TC_U32_NODE(n->handle); -	i++; +	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); +	if (i >= NR_U32_NODE) +		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); -	return handle|(i>0xFFF ? 0xFFF : i); +	kfree(bitmap); +	return handle | (i >= NR_U32_NODE ? 
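/*
 * Annotation (editor's sketch, not kernel code): the new gen_new_kid() above
 * allocates a bitmap of NR_U32_NODE (1 << 12) bits, marks the node id of every
 * existing key in the bucket, and takes the first free id from 0x800 upward,
 * falling back to a search from 1 and finally to 0xFFF.  The removed code took
 * "largest existing id + 1", which could not reuse gaps left by deletions.
 * The same policy, written with a plain bool array instead of the kernel
 * bitmap helpers:
 */
#include <stdbool.h>

#define NODE_IDS	4096	/* 1 << 12, as NR_U32_NODE in the patch */

static unsigned int pick_node_id(const bool used[NODE_IDS])
{
	unsigned int i;

	/* Prefer the "automatic" range starting at 0x800... */
	for (i = 0x800; i < NODE_IDS; i++)
		if (!used[i])
			return i;
	/* ...then anything from 1 upward... */
	for (i = 1; i < NODE_IDS; i++)
		if (!used[i])
			return i;
	/* ...and give up with the legacy fallback value. */
	return 0xFFF;
}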
0xFFF : i);  }  static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -490,15 +492,16 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {  	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },  }; -static int u32_set_parms(struct tcf_proto *tp, unsigned long base, -			 struct tc_u_hnode *ht, +static int u32_set_parms(struct net *net, struct tcf_proto *tp, +			 unsigned long base, struct tc_u_hnode *ht,  			 struct tc_u_knode *n, struct nlattr **tb, -			 struct nlattr *est) +			 struct nlattr *est, bool ovr)  {  	int err;  	struct tcf_exts e; -	err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); +	tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); +	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);  	if (err < 0)  		return err; @@ -533,9 +536,11 @@ static int u32_set_parms(struct tcf_proto *tp, unsigned long base,  #ifdef CONFIG_NET_CLS_IND  	if (tb[TCA_U32_INDEV]) { -		err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]); -		if (err < 0) +		int ret; +		ret = tcf_change_indev(net, tb[TCA_U32_INDEV]); +		if (ret < 0)  			goto errout; +		n->ifindex = ret;  	}  #endif  	tcf_exts_change(tp, &n->exts, &e); @@ -546,9 +551,10 @@ errout:  	return err;  } -static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, +static int u32_change(struct net *net, struct sk_buff *in_skb, +		      struct tcf_proto *tp, unsigned long base, u32 handle,  		      struct nlattr **tca, -		      unsigned long *arg) +		      unsigned long *arg, bool ovr)  {  	struct tc_u_common *tp_c = tp->data;  	struct tc_u_hnode *ht; @@ -566,15 +572,17 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,  	if (err < 0)  		return err; -	if ((n = (struct tc_u_knode*)*arg) != NULL) { +	n = (struct tc_u_knode *)*arg; +	if (n) {  		if (TC_U32_KEY(n->handle) == 0)  			return -EINVAL; -		return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]); +		return u32_set_parms(net, tp, base, n->ht_up, n, tb, +				     tca[TCA_RATE], ovr);  	}  	if (tb[TCA_U32_DIVISOR]) { -		unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); +		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);  		if (--divisor > 0x100)  			return -EINVAL; @@ -585,7 +593,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,  			if (handle == 0)  				return -ENOMEM;  		} -		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); +		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);  		if (ht == NULL)  			return -ENOBUFS;  		ht->tp_c = tp_c; @@ -645,6 +653,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,  	n->ht_up = ht;  	n->handle = handle;  	n->fshift = s->hmask ? 
ffs(ntohl(s->hmask)) - 1 : 0; +	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);  #ifdef CONFIG_CLS_U32_MARK  	if (tb[TCA_U32_MARK]) { @@ -656,7 +665,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,  	}  #endif -	err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]); +	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);  	if (err == 0) {  		struct tc_u_knode **ins;  		for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) @@ -683,7 +692,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	struct tc_u_common *tp_c = tp->data;  	struct tc_u_hnode *ht;  	struct tc_u_knode *n; -	unsigned h; +	unsigned int h;  	if (arg->stop)  		return; @@ -714,10 +723,10 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)  	}  } -static int u32_dump(struct tcf_proto *tp, unsigned long fh, +static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,  		     struct sk_buff *skb, struct tcmsg *t)  { -	struct tc_u_knode *n = (struct tc_u_knode*)fh; +	struct tc_u_knode *n = (struct tc_u_knode *)fh;  	struct nlattr *nest;  	if (n == NULL) @@ -730,45 +739,57 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,  		goto nla_put_failure;  	if (TC_U32_KEY(n->handle) == 0) { -		struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; -		u32 divisor = ht->divisor+1; -		NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor); +		struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; +		u32 divisor = ht->divisor + 1; + +		if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) +			goto nla_put_failure;  	} else { -		NLA_PUT(skb, TCA_U32_SEL, -			sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), -			&n->sel); +		if (nla_put(skb, TCA_U32_SEL, +			    sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), +			    &n->sel)) +			goto nla_put_failure;  		if (n->ht_up) {  			u32 htid = n->handle & 0xFFFFF000; -			NLA_PUT_U32(skb, TCA_U32_HASH, htid); +			if (nla_put_u32(skb, TCA_U32_HASH, htid)) +				goto nla_put_failure;  		} -		if (n->res.classid) -			NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid); -		if (n->ht_down) -			NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle); +		if (n->res.classid && +		    nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) +			goto nla_put_failure; +		if (n->ht_down && +		    nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle)) +			goto nla_put_failure;  #ifdef CONFIG_CLS_U32_MARK -		if (n->mark.val || n->mark.mask) -			NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); +		if ((n->mark.val || n->mark.mask) && +		    nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark)) +			goto nla_put_failure;  #endif -		if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) +		if (tcf_exts_dump(skb, &n->exts) < 0)  			goto nla_put_failure;  #ifdef CONFIG_NET_CLS_IND -		if(strlen(n->indev)) -			NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev); +		if (n->ifindex) { +			struct net_device *dev; +			dev = __dev_get_by_index(net, n->ifindex); +			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) +				goto nla_put_failure; +		}  #endif  #ifdef CONFIG_CLS_U32_PERF -		NLA_PUT(skb, TCA_U32_PCNT, -		sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), -			n->pf); +		if (nla_put(skb, TCA_U32_PCNT, +			    sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), +			    n->pf)) +			goto nla_put_failure;  #endif  	}  	nla_nest_end(skb, nest);  	if (TC_U32_KEY(n->handle)) -		if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) +		if (tcf_exts_dump_stats(skb, &n->exts) < 0)  			goto nla_put_failure;  	return 
skb->len; diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c new file mode 100644 index 00000000000..bfd34e4c1af --- /dev/null +++ b/net/sched/em_canid.c @@ -0,0 +1,240 @@ +/* + * em_canid.c  Ematch rule to match CAN frames according to their CAN IDs + * + *              This program is free software; you can distribute it and/or + *              modify it under the terms of the GNU General Public License + *              as published by the Free Software Foundation; either version + *              2 of the License, or (at your option) any later version. + * + * Idea:       Oliver Hartkopp <oliver.hartkopp@volkswagen.de> + * Copyright:  (c) 2011 Czech Technical University in Prague + *             (c) 2011 Volkswagen Group Research + * Authors:    Michal Sojka <sojkam1@fel.cvut.cz> + *             Pavel Pisa <pisa@cmp.felk.cvut.cz> + *             Rostislav Lisovy <lisovy@gmail.cz> + * Funded by:  Volkswagen Group Research + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <net/pkt_cls.h> +#include <linux/can.h> + +#define EM_CAN_RULES_MAX 500 + +struct canid_match { +	/* For each SFF CAN ID (11 bit) there is one record in this bitfield */ +	DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); + +	int rules_count; +	int sff_rules_count; +	int eff_rules_count; + +	/* +	 * Raw rules copied from netlink message; Used for sending +	 * information to userspace (when 'tc filter show' is invoked) +	 * AND when matching EFF frames +	 */ +	struct can_filter rules_raw[]; +}; + +/** + * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. + */ +static canid_t em_canid_get_id(struct sk_buff *skb) +{ +	/* CAN ID is stored within the data field */ +	struct can_frame *cf = (struct can_frame *)skb->data; + +	return cf->can_id; +} + +static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id, +					u32 can_mask) +{ +	int i; + +	/* +	 * Limit can_mask and can_id to SFF range to +	 * protect against write after end of array +	 */ +	can_mask &= CAN_SFF_MASK; +	can_id &= can_mask; + +	/* Single frame */ +	if (can_mask == CAN_SFF_MASK) { +		set_bit(can_id, cm->match_sff); +		return; +	} + +	/* All frames */ +	if (can_mask == 0) { +		bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS)); +		return; +	} + +	/* +	 * Individual frame filter. +	 * Add record (set bit to 1) for each ID that +	 * conforms particular rule +	 */ +	for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) { +		if ((i & can_mask) == can_id) +			set_bit(i, cm->match_sff); +	} +} + +static inline struct canid_match *em_canid_priv(struct tcf_ematch *m) +{ +	return (struct canid_match *)m->data; +} + +static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, +			 struct tcf_pkt_info *info) +{ +	struct canid_match *cm = em_canid_priv(m); +	canid_t can_id; +	int match = 0; +	int i; +	const struct can_filter *lp; + +	can_id = em_canid_get_id(skb); + +	if (can_id & CAN_EFF_FLAG) { +		for (i = 0, lp = cm->rules_raw; +		     i < cm->eff_rules_count; i++, lp++) { +			if (!(((lp->can_id ^ can_id) & lp->can_mask))) { +				match = 1; +				break; +			} +		} +	} else { /* SFF */ +		can_id &= CAN_SFF_MASK; +		match = (test_bit(can_id, cm->match_sff) ? 
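/*
 * Annotation (editor's sketch with invented names, not taken from the new
 * em_canid.c above): em_canid_sff_match_add() expands each standard-frame
 * (can_id, can_mask) rule into one bit per matching 11-bit CAN id, so that
 * classifying an SFF frame is a single test_bit() instead of a walk over the
 * rule list; extended-frame (EFF) rules stay in rules_raw and are matched by
 * a linear scan.  The generic loop below also covers the full-mask and
 * empty-mask shortcuts that the patch special-cases.
 */
#include <stdbool.h>
#include <stdint.h>

#define SFF_IDS		2048U		/* 1 << 11 standard-frame CAN ids */
#define SFF_ID_MASK	0x7FFU

/* Expand one (id, mask) rule into per-id entries of a match table. */
static void sff_rule_add(bool match[SFF_IDS], uint32_t id, uint32_t mask)
{
	uint32_t i;

	mask &= SFF_ID_MASK;
	id &= mask;
	for (i = 0; i < SFF_IDS; i++)
		if ((i & mask) == id)
			match[i] = true;
}

/* O(1) lookup used on the fast path for standard frames. */
static bool sff_rule_match(const bool match[SFF_IDS], uint32_t can_id)
{
	return match[can_id & SFF_ID_MASK];
}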
1 : 0); +	} + +	return match; +} + +static int em_canid_change(struct tcf_proto *tp, void *data, int len, +			  struct tcf_ematch *m) +{ +	struct can_filter *conf = data; /* Array with rules */ +	struct canid_match *cm; +	struct canid_match *cm_old = (struct canid_match *)m->data; +	int i; + +	if (!len) +		return -EINVAL; + +	if (len % sizeof(struct can_filter)) +		return -EINVAL; + +	if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX) +		return -EINVAL; + +	cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL); +	if (!cm) +		return -ENOMEM; + +	cm->rules_count = len / sizeof(struct can_filter); + +	/* +	 * We need two for() loops for copying rules into two contiguous +	 * areas in rules_raw to process all eff rules with a simple loop. +	 * NB: The configuration interface supports sff and eff rules. +	 * We do not support filters here that match for the same can_id +	 * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123). +	 * For this (unusual case) two filters have to be specified. The +	 * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id. +	 */ + +	/* Fill rules_raw with EFF rules first */ +	for (i = 0; i < cm->rules_count; i++) { +		if (conf[i].can_id & CAN_EFF_FLAG) { +			memcpy(cm->rules_raw + cm->eff_rules_count, +				&conf[i], +				sizeof(struct can_filter)); + +			cm->eff_rules_count++; +		} +	} + +	/* append SFF frame rules */ +	for (i = 0; i < cm->rules_count; i++) { +		if (!(conf[i].can_id & CAN_EFF_FLAG)) { +			memcpy(cm->rules_raw +				+ cm->eff_rules_count +				+ cm->sff_rules_count, +				&conf[i], sizeof(struct can_filter)); + +			cm->sff_rules_count++; + +			em_canid_sff_match_add(cm, +				conf[i].can_id, conf[i].can_mask); +		} +	} + +	m->datalen = sizeof(struct canid_match) + len; +	m->data = (unsigned long)cm; + +	if (cm_old != NULL) { +		pr_err("canid: Configuring an existing ematch!\n"); +		kfree(cm_old); +	} + +	return 0; +} + +static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ +	struct canid_match *cm = em_canid_priv(m); + +	kfree(cm); +} + +static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m) +{ +	struct canid_match *cm = em_canid_priv(m); + +	/* +	 * When configuring this ematch 'rules_count' is set not to exceed +	 * 'rules_raw' array size +	 */ +	if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count, +	    &cm->rules_raw) < 0) +		return -EMSGSIZE; + +	return 0; +} + +static struct tcf_ematch_ops em_canid_ops = { +	.kind	  = TCF_EM_CANID, +	.change	  = em_canid_change, +	.match	  = em_canid_match, +	.destroy  = em_canid_destroy, +	.dump	  = em_canid_dump, +	.owner	  = THIS_MODULE, +	.link	  = LIST_HEAD_INIT(em_canid_ops.link) +}; + +static int __init init_em_canid(void) +{ +	return tcf_em_register(&em_canid_ops); +} + +static void __exit exit_em_canid(void) +{ +	tcf_em_unregister(&em_canid_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_canid); +module_exit(exit_em_canid); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index bc450397487..1c8360a2752 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -33,40 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,  		return 0;  	switch (cmp->align) { -		case TCF_EM_ALIGN_U8: -			val = *ptr; -			break; +	case TCF_EM_ALIGN_U8: +		val = *ptr; +		break; -		case TCF_EM_ALIGN_U16: -			val = get_unaligned_be16(ptr); +	case TCF_EM_ALIGN_U16: +		val = get_unaligned_be16(ptr); -			if (cmp_needs_transformation(cmp)) -				val = be16_to_cpu(val); -			break; +		
if (cmp_needs_transformation(cmp)) +			val = be16_to_cpu(val); +		break; -		case TCF_EM_ALIGN_U32: -			/* Worth checking boundries? The branching seems -			 * to get worse. Visit again. */ -			val = get_unaligned_be32(ptr); +	case TCF_EM_ALIGN_U32: +		/* Worth checking boundries? The branching seems +		 * to get worse. Visit again. +		 */ +		val = get_unaligned_be32(ptr); -			if (cmp_needs_transformation(cmp)) -				val = be32_to_cpu(val); -			break; +		if (cmp_needs_transformation(cmp)) +			val = be32_to_cpu(val); +		break; -		default: -			return 0; +	default: +		return 0;  	}  	if (cmp->mask)  		val &= cmp->mask;  	switch (cmp->opnd) { -		case TCF_EM_OPND_EQ: -			return val == cmp->val; -		case TCF_EM_OPND_LT: -			return val < cmp->val; -		case TCF_EM_OPND_GT: -			return val > cmp->val; +	case TCF_EM_OPND_EQ: +		return val == cmp->val; +	case TCF_EM_OPND_LT: +		return val < cmp->val; +	case TCF_EM_OPND_GT: +		return val > cmp->val;  	}  	return 0; diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c new file mode 100644 index 00000000000..527aeb7a3ff --- /dev/null +++ b/net/sched/em_ipset.c @@ -0,0 +1,136 @@ +/* + * net/sched/em_ipset.c	ipset ematch + * + * Copyright (c) 2012 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_set.h> +#include <linux/ipv6.h> +#include <net/ip.h> +#include <net/pkt_cls.h> + +static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len, +			   struct tcf_ematch *em) +{ +	struct xt_set_info *set = data; +	ip_set_id_t index; +	struct net *net = dev_net(qdisc_dev(tp->q)); + +	if (data_len != sizeof(*set)) +		return -EINVAL; + +	index = ip_set_nfnl_get_byindex(net, set->index); +	if (index == IPSET_INVALID_ID) +		return -ENOENT; + +	em->datalen = sizeof(*set); +	em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL); +	if (em->data) +		return 0; + +	ip_set_nfnl_put(net, index); +	return -ENOMEM; +} + +static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em) +{ +	const struct xt_set_info *set = (const void *) em->data; +	if (set) { +		ip_set_nfnl_put(dev_net(qdisc_dev(p->q)), set->index); +		kfree((void *) em->data); +	} +} + +static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, +			  struct tcf_pkt_info *info) +{ +	struct ip_set_adt_opt opt; +	struct xt_action_param acpar; +	const struct xt_set_info *set = (const void *) em->data; +	struct net_device *dev, *indev = NULL; +	int ret, network_offset; + +	switch (skb->protocol) { +	case htons(ETH_P_IP): +		acpar.family = NFPROTO_IPV4; +		if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) +			return 0; +		acpar.thoff = ip_hdrlen(skb); +		break; +	case htons(ETH_P_IPV6): +		acpar.family = NFPROTO_IPV6; +		if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) +			return 0; +		/* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */ +		acpar.thoff = sizeof(struct ipv6hdr); +		break; +	default: +		return 0; +	} + +	acpar.hooknum = 0; + +	opt.family = acpar.family; +	opt.dim = set->dim; +	opt.flags = set->flags; +	opt.cmdflags = 0; +	opt.ext.timeout = ~0u; + +	network_offset = skb_network_offset(skb); +	skb_pull(skb, network_offset); + +	dev = skb->dev; + +	rcu_read_lock(); + +	
if (dev && skb->skb_iif) +		indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + +	acpar.in      = indev ? indev : dev; +	acpar.out     = dev; + +	ret = ip_set_test(set->index, skb, &acpar, &opt); + +	rcu_read_unlock(); + +	skb_push(skb, network_offset); +	return ret; +} + +static struct tcf_ematch_ops em_ipset_ops = { +	.kind	  = TCF_EM_IPSET, +	.change	  = em_ipset_change, +	.destroy  = em_ipset_destroy, +	.match	  = em_ipset_match, +	.owner	  = THIS_MODULE, +	.link	  = LIST_HEAD_INIT(em_ipset_ops.link) +}; + +static int __init init_em_ipset(void) +{ +	return tcf_em_register(&em_ipset_ops); +} + +static void __exit exit_em_ipset(void) +{ +	tcf_em_unregister(&em_ipset_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("TC extended match for IP sets"); + +module_init(init_em_ipset); +module_exit(exit_em_ipset); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 34da5e29ea1..9b8c0b0e60d 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -47,7 +47,7 @@   * 	on the meta type. Obviously, the length of the data must also   * 	be provided for non-numeric types.   * - * 	Additionaly, type dependant modifiers such as shift operators + * 	Additionally, type dependent modifiers such as shift operators   * 	or mask may be applied to extend the functionaliy. As of now,   * 	the variable length type supports shifting the byte string to   * 	the right, eating up any number of octets and thus supporting @@ -73,21 +73,18 @@  #include <net/pkt_cls.h>  #include <net/sock.h> -struct meta_obj -{ +struct meta_obj {  	unsigned long		value;  	unsigned int		len;  }; -struct meta_value -{ +struct meta_value {  	struct tcf_meta_val	hdr;  	unsigned long		val;  	unsigned int		len;  }; -struct meta_match -{ +struct meta_match {  	struct meta_value	lvalue;  	struct meta_value	rvalue;  }; @@ -225,7 +222,7 @@ META_COLLECTOR(int_maclen)  META_COLLECTOR(int_rxhash)  { -	dst->value = skb_get_rxhash(skb); +	dst->value = skb_get_hash(skb);  }  /************************************************************************** @@ -255,7 +252,7 @@ META_COLLECTOR(int_rtclassid)  	if (unlikely(skb_dst(skb) == NULL))  		*err = -1;  	else -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  		dst->value = skb_dst(skb)->tclassid;  #else  		dst->value = 0; @@ -267,47 +264,59 @@ META_COLLECTOR(int_rtiif)  	if (unlikely(skb_rtable(skb) == NULL))  		*err = -1;  	else -		dst->value = skb_rtable(skb)->fl.iif; +		dst->value = inet_iif(skb);  }  /**************************************************************************   * Socket Attributes   **************************************************************************/ -#define SKIP_NONLOCAL(skb)			\ -	if (unlikely(skb->sk == NULL)) {	\ -		*err = -1;			\ -		return;				\ -	} +#define skip_nonlocal(skb) \ +	(unlikely(skb->sk == NULL))  META_COLLECTOR(int_sk_family)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_family;  }  META_COLLECTOR(int_sk_state)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_state;  }  META_COLLECTOR(int_sk_reuse)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_reuse;  }  META_COLLECTOR(int_sk_bound_if)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	/* No error if bound_dev_if is 0, legal userspace check */  	dst->value = 
skb->sk->sk_bound_dev_if;  }  META_COLLECTOR(var_sk_bound_if)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	if (skb->sk->sk_bound_dev_if == 0) {  		dst->value = (unsigned long) "any"; @@ -325,157 +334,226 @@ META_COLLECTOR(var_sk_bound_if)  META_COLLECTOR(int_sk_refcnt)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = atomic_read(&skb->sk->sk_refcnt);  }  META_COLLECTOR(int_sk_rcvbuf)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_rcvbuf;  }  META_COLLECTOR(int_sk_shutdown)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_shutdown;  }  META_COLLECTOR(int_sk_proto)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_protocol;  }  META_COLLECTOR(int_sk_type)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_type;  }  META_COLLECTOR(int_sk_rmem_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = sk_rmem_alloc_get(skb->sk);  }  META_COLLECTOR(int_sk_wmem_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = sk_wmem_alloc_get(skb->sk);  }  META_COLLECTOR(int_sk_omem_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = atomic_read(&skb->sk->sk_omem_alloc);  }  META_COLLECTOR(int_sk_rcv_qlen)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_receive_queue.qlen;  }  META_COLLECTOR(int_sk_snd_qlen)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_write_queue.qlen;  }  META_COLLECTOR(int_sk_wmem_queued)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_wmem_queued;  }  META_COLLECTOR(int_sk_fwd_alloc)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_forward_alloc;  }  META_COLLECTOR(int_sk_sndbuf)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_sndbuf;  }  META_COLLECTOR(int_sk_alloc)  { -	SKIP_NONLOCAL(skb); -	dst->value = skb->sk->sk_allocation; -} - -META_COLLECTOR(int_sk_route_caps) -{ -	SKIP_NONLOCAL(skb); -	dst->value = skb->sk->sk_route_caps; +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	} +	dst->value = (__force int) skb->sk->sk_allocation;  }  META_COLLECTOR(int_sk_hash)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_hash;  }  META_COLLECTOR(int_sk_lingertime)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_lingertime / HZ;  }  META_COLLECTOR(int_sk_err_qlen)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_error_queue.qlen;  }  META_COLLECTOR(int_sk_ack_bl)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_ack_backlog;  }  META_COLLECTOR(int_sk_max_ack_bl)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_max_ack_backlog;  }  META_COLLECTOR(int_sk_prio)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		
return; +	}  	dst->value = skb->sk->sk_priority;  }  META_COLLECTOR(int_sk_rcvlowat)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_rcvlowat;  }  META_COLLECTOR(int_sk_rcvtimeo)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_rcvtimeo / HZ;  }  META_COLLECTOR(int_sk_sndtimeo)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_sndtimeo / HZ;  }  META_COLLECTOR(int_sk_sendmsg_off)  { -	SKIP_NONLOCAL(skb); -	dst->value = skb->sk->sk_sndmsg_off; +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	} +	dst->value = skb->sk->sk_frag.offset;  }  META_COLLECTOR(int_sk_write_pend)  { -	SKIP_NONLOCAL(skb); +	if (skip_nonlocal(skb)) { +		*err = -1; +		return; +	}  	dst->value = skb->sk->sk_write_pending;  } @@ -483,8 +561,7 @@ META_COLLECTOR(int_sk_write_pend)   * Meta value collectors assignment table   **************************************************************************/ -struct meta_ops -{ +struct meta_ops {  	void		(*get)(struct sk_buff *, struct tcf_pkt_info *,  			       struct meta_value *, struct meta_obj *, int *);  }; @@ -494,7 +571,7 @@ struct meta_ops  /* Meta value operations table listing all meta value collectors and   * assigns them to a type and meta id. */ -static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { +static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {  	[TCF_META_TYPE_VAR] = {  		[META_ID(DEV)]			= META_FUNC(var_dev),  		[META_ID(SK_BOUND_IF)] 		= META_FUNC(var_sk_bound_if), @@ -534,7 +611,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {  		[META_ID(SK_ERR_QLEN)]		= META_FUNC(int_sk_err_qlen),  		[META_ID(SK_FORWARD_ALLOCS)]	= META_FUNC(int_sk_fwd_alloc),  		[META_ID(SK_ALLOCS)]		= META_FUNC(int_sk_alloc), -		[META_ID(SK_ROUTE_CAPS)]	= META_FUNC(int_sk_route_caps),  		[META_ID(SK_HASH)]		= META_FUNC(int_sk_hash),  		[META_ID(SK_LINGERTIME)]	= META_FUNC(int_sk_lingertime),  		[META_ID(SK_ACK_BACKLOG)]	= META_FUNC(int_sk_ack_bl), @@ -550,7 +626,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {  	}  }; -static inline struct meta_ops * meta_ops(struct meta_value *val) +static inline struct meta_ops *meta_ops(struct meta_value *val)  {  	return &__meta_ops[meta_type(val)][meta_id(val)];  } @@ -596,8 +672,9 @@ static void meta_var_apply_extras(struct meta_value *v,  static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)  { -	if (v->val && v->len) -		NLA_PUT(skb, tlv, v->len, (void *) v->val); +	if (v->val && v->len && +	    nla_put(skb, tlv, v->len, (void *) v->val)) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -647,10 +724,12 @@ static void meta_int_apply_extras(struct meta_value *v,  static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)  { -	if (v->len == sizeof(unsigned long)) -		NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val); -	else if (v->len == sizeof(u32)) { -		NLA_PUT_U32(skb, tlv, v->val); +	if (v->len == sizeof(unsigned long)) { +		if (nla_put(skb, tlv, sizeof(unsigned long), &v->val)) +			goto nla_put_failure; +	} else if (v->len == sizeof(u32)) { +		if (nla_put_u32(skb, tlv, v->val)) +			goto nla_put_failure;  	}  	return 0; @@ -663,8 +742,7 @@ nla_put_failure:   * Type specific operations table   **************************************************************************/ -struct meta_type_ops -{ 
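/*
 * Annotation (editor's illustration, invented types, not kernel code): most
 * of the em_meta churn above comes from replacing the SKIP_NONLOCAL() macro,
 * which contained a hidden "return", with an explicit check in every socket
 * meta collector.  The two styles side by side:
 */
#include <stddef.h>

struct pkt_sketch { void *sk; };

/* Old style: the early return is buried inside the macro. */
#define SKIP_NONLOCAL_SKETCH(p, err)		\
	if ((p)->sk == NULL) {			\
		*(err) = -1;			\
		return;				\
	}

static void collector_old(const struct pkt_sketch *p, long *val, int *err)
{
	SKIP_NONLOCAL_SKETCH(p, err);
	*val = 1;	/* would read a socket field here */
}

/* New style, as in the patch: the check and the return are visible. */
static void collector_new(const struct pkt_sketch *p, long *val, int *err)
{
	if (p->sk == NULL) {
		*err = -1;
		return;
	}
	*val = 1;
}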
+struct meta_type_ops {  	void	(*destroy)(struct meta_value *);  	int	(*compare)(struct meta_obj *, struct meta_obj *);  	int	(*change)(struct meta_value *, struct nlattr *); @@ -672,7 +750,7 @@ struct meta_type_ops  	int	(*dump)(struct sk_buff *, struct meta_value *, int);  }; -static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { +static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {  	[TCF_META_TYPE_VAR] = {  		.destroy = meta_var_destroy,  		.compare = meta_var_compare, @@ -688,7 +766,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {  	}  }; -static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) +static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)  {  	return &__meta_type_ops[meta_type(v)];  } @@ -713,7 +791,7 @@ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,  		return err;  	if (meta_type_ops(v)->apply_extras) -	    meta_type_ops(v)->apply_extras(v, dst); +		meta_type_ops(v)->apply_extras(v, dst);  	return 0;  } @@ -732,12 +810,12 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,  	r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);  	switch (meta->lvalue.hdr.op) { -		case TCF_EM_OPND_EQ: -			return !r; -		case TCF_EM_OPND_LT: -			return r < 0; -		case TCF_EM_OPND_GT: -			return r > 0; +	case TCF_EM_OPND_EQ: +		return !r; +	case TCF_EM_OPND_LT: +		return r < 0; +	case TCF_EM_OPND_GT: +		return r > 0;  	}  	return 0; @@ -771,7 +849,7 @@ static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)  static inline int meta_is_supported(struct meta_value *val)  { -	return (!meta_id(val) || meta_ops(val)->get); +	return !meta_id(val) || meta_ops(val)->get;  }  static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { @@ -802,8 +880,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len,  		goto errout;  	meta = kzalloc(sizeof(*meta), GFP_KERNEL); -	if (meta == NULL) +	if (meta == NULL) { +		err = -ENOMEM;  		goto errout; +	}  	memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));  	memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); @@ -844,7 +924,8 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)  	memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));  	memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); -	NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); +	if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr)) +		goto nla_put_failure;  	ops = meta_type_ops(&meta->lvalue);  	if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 1a4176aee6e..a3bed07a008 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -18,8 +18,7 @@  #include <linux/tc_ematch/tc_em_nbyte.h>  #include <net/pkt_cls.h> -struct nbyte_data -{ +struct nbyte_data {  	struct tcf_em_nbyte	hdr;  	char			pattern[0];  }; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index ea8f566e720..15d353d2e4b 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -19,8 +19,7 @@  #include <linux/tc_ematch/tc_em_text.h>  #include <net/pkt_cls.h> -struct text_match -{ +struct text_match {  	u16			from_offset;  	u16			to_offset;  	u8			from_layer; diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index 953f1479f7d..797bdb88c01 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,  	if (!tcf_valid_offset(skb, ptr, sizeof(u32)))  		
return 0; -	return !(((*(__be32*) ptr)  ^ key->val) & key->mask); +	return !(((*(__be32 *) ptr)  ^ key->val) & key->mask);  }  static struct tcf_ematch_ops em_u32_ops = { diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 5e37da961f8..3a633debb6d 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -93,7 +93,7 @@  static LIST_HEAD(ematch_ops);  static DEFINE_RWLOCK(ematch_mod_lock); -static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) +static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)  {  	struct tcf_ematch_ops *e = NULL; @@ -163,8 +163,8 @@ void tcf_em_unregister(struct tcf_ematch_ops *ops)  }  EXPORT_SYMBOL(tcf_em_unregister); -static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, -						   int index) +static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, +						  int index)  {  	return &tree->matches[index];  } @@ -184,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp,  	if (em_hdr->kind == TCF_EM_CONTAINER) {  		/* Special ematch called "container", carries an index -		 * referencing an external ematch sequence. */ +		 * referencing an external ematch sequence. +		 */  		u32 ref;  		if (data_len < sizeof(ref)) @@ -195,7 +196,8 @@ static int tcf_em_validate(struct tcf_proto *tp,  			goto errout;  		/* We do not allow backward jumps to avoid loops and jumps -		 * to our own position are of course illegal. */ +		 * to our own position are of course illegal. +		 */  		if (ref <= idx)  			goto errout; @@ -208,7 +210,8 @@ static int tcf_em_validate(struct tcf_proto *tp,  		 * which automatically releases the reference again, therefore  		 * the module MUST not be given back under any circumstances  		 * here. Be aware, the destroy function assumes that the -		 * module is held if the ops field is non zero. */ +		 * module is held if the ops field is non zero. +		 */  		em->ops = tcf_em_lookup(em_hdr->kind);  		if (em->ops == NULL) { @@ -221,7 +224,8 @@ static int tcf_em_validate(struct tcf_proto *tp,  			if (em->ops) {  				/* We dropped the RTNL mutex in order to  				 * perform the module load. Tell the caller -				 * to replay the request. */ +				 * to replay the request. +				 */  				module_put(em->ops->owner);  				err = -EAGAIN;  			} @@ -230,7 +234,8 @@ static int tcf_em_validate(struct tcf_proto *tp,  		}  		/* ematch module provides expected length of data, so we -		 * can do a basic sanity check. */ +		 * can do a basic sanity check. +		 */  		if (em->ops->datalen && data_len < em->ops->datalen)  			goto errout; @@ -246,7 +251,8 @@ static int tcf_em_validate(struct tcf_proto *tp,  			 * TCF_EM_SIMPLE may be specified stating that the  			 * data only consists of a u32 integer and the module  			 * does not expected a memory reference but rather -			 * the value carried. */ +			 * the value carried. +			 */  			if (em_hdr->flags & TCF_EM_SIMPLE) {  				if (data_len < sizeof(u32))  					goto errout; @@ -334,7 +340,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,  	 * The array of rt attributes is parsed in the order as they are  	 * provided, their type must be incremental from 1 to n. Even  	 * if it does not serve any real purpose, a failure of sticking -	 * to this policy will result in parsing failure. */ +	 * to this policy will result in parsing failure. 
+	 */  	for (idx = 0; nla_ok(rt_match, list_len); idx++) {  		err = -EINVAL; @@ -359,7 +366,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,  	/* Check if the number of matches provided by userspace actually  	 * complies with the array of matches. The number was used for  	 * the validation of references and a mismatch could lead to -	 * undefined references during the matching process. */ +	 * undefined references during the matching process. +	 */  	if (idx != tree_hdr->nmatches) {  		err = -EINVAL;  		goto errout_abort; @@ -433,7 +441,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)  	if (top_start == NULL)  		goto nla_put_failure; -	NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); +	if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) +		goto nla_put_failure;  	list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);  	if (list_start == NULL) @@ -449,7 +458,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)  			.flags = em->flags  		}; -		NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); +		if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) +			goto nla_put_failure;  		if (em->ops && em->ops->dump) {  			if (em->ops->dump(skb, em) < 0) @@ -478,6 +488,7 @@ static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,  			       struct tcf_pkt_info *info)  {  	int r = em->ops->match(skb, em, info); +  	return tcf_em_is_inverted(em) ? !r : r;  } @@ -526,9 +537,7 @@ pop_stack:  	return res;  stack_overflow: -	if (net_ratelimit()) -		printk(KERN_WARNING "tc ematch: local stack overflow," -			" increase NET_EMATCH_STACK\n"); +	net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n");  	return -1;  }  EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b22ca2d1ceb..58bed7599db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -135,7 +135,7 @@ static DEFINE_RWLOCK(qdisc_mod_lock);  static struct Qdisc_ops *qdisc_base; -/* Register/uregister queueing discipline */ +/* Register/unregister queueing discipline */  int register_qdisc(struct Qdisc_ops *qops)  { @@ -187,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops)  	int err = -ENOENT;  	write_lock(&qdisc_mod_lock); -	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) +	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)  		if (q == qops)  			break;  	if (q) { @@ -200,6 +200,58 @@ int unregister_qdisc(struct Qdisc_ops *qops)  }  EXPORT_SYMBOL(unregister_qdisc); +/* Get default qdisc if not otherwise specified */ +void qdisc_get_default(char *name, size_t len) +{ +	read_lock(&qdisc_mod_lock); +	strlcpy(name, default_qdisc_ops->id, len); +	read_unlock(&qdisc_mod_lock); +} + +static struct Qdisc_ops *qdisc_lookup_default(const char *name) +{ +	struct Qdisc_ops *q = NULL; + +	for (q = qdisc_base; q; q = q->next) { +		if (!strcmp(name, q->id)) { +			if (!try_module_get(q->owner)) +				q = NULL; +			break; +		} +	} + +	return q; +} + +/* Set new default qdisc to use */ +int qdisc_set_default(const char *name) +{ +	const struct Qdisc_ops *ops; + +	if (!capable(CAP_NET_ADMIN)) +		return -EPERM; + +	write_lock(&qdisc_mod_lock); +	ops = qdisc_lookup_default(name); +	if (!ops) { +		/* Not found, drop lock and try to load module */ +		write_unlock(&qdisc_mod_lock); +		request_module("sch_%s", name); +		write_lock(&qdisc_mod_lock); + +		ops = qdisc_lookup_default(name); +	} + +	if (ops) { +		/* Set new default */ +		
module_put(default_qdisc_ops->owner); +		default_qdisc_ops = ops; +	} +	write_unlock(&qdisc_mod_lock); + +	return ops ? 0 : -ENOENT; +} +  /* We know handle. Find qdisc among all qdisc's attached to device     (root qdisc, all its children, children of children etc.)   */ @@ -219,11 +271,16 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)  	return NULL;  } -static void qdisc_list_add(struct Qdisc *q) +void qdisc_list_add(struct Qdisc *q)  { -	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) -		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list); +	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { +		struct Qdisc *root = qdisc_dev(q)->qdisc; + +		WARN_ON_ONCE(root == &noop_qdisc); +		list_add_tail(&q->list, &root->list); +	}  } +EXPORT_SYMBOL(qdisc_list_add);  void qdisc_list_del(struct Qdisc *q)  { @@ -285,28 +342,70 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)  	return q;  } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value.  The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell.  If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. 
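
qdisc_set_default() above follows a lookup, drop the lock, request_module(), retake the lock, retry shape. The compressed userspace sketch below mirrors that flow with a stubbed registry and module loader in place of the real qdisc list, rwlock and request_module(); the names and structures are illustrative only.

/* Sketch of the lookup -> load-module -> retry flow in qdisc_set_default().
 * The registry, "module loading" and locking are stand-ins, not kernel APIs.
 */
#include <stdio.h>
#include <string.h>

struct ops { const char *id; struct ops *next; };

static struct ops pfifo = { "pfifo_fast", NULL };
static struct ops *registry = &pfifo;
static struct ops *default_ops = &pfifo;

static struct ops *lookup(const char *name)
{
	struct ops *q;

	for (q = registry; q; q = q->next)
		if (!strcmp(name, q->id))
			return q;
	return NULL;
}

/* stands in for request_module("sch_%s", name) registering new ops */
static struct ops fq_codel = { "fq_codel", NULL };
static void load_module(const char *name)
{
	if (!strcmp(name, "fq_codel")) {
		fq_codel.next = registry;
		registry = &fq_codel;
	}
}

static int set_default(const char *name)
{
	struct ops *ops = lookup(name);

	if (!ops) {		/* not registered: try to load it, then retry */
		load_module(name);
		ops = lookup(name);
	}
	if (ops)
		default_ops = ops;
	return ops ? 0 : -1;
}

int main(void)
{
	printf("set fq_codel -> %d, default is now %s\n",
	       set_default("fq_codel"), default_ops->id);
	return 0;
}

The second lookup matters because the module-load path has to drop qdisc_mod_lock; whatever the module registered while the lock was down only becomes visible to the retry.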
+ */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ +	int low       = roundup(r->mpu, 48); +	int high      = roundup(low+1, 48); +	int cell_low  = low >> r->cell_log; +	int cell_high = (high >> r->cell_log) - 1; + +	/* rtab is too inaccurate at rates > 100Mbit/s */ +	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { +		pr_debug("TC linklayer: Giving up ATM detection\n"); +		return TC_LINKLAYER_ETHERNET; +	} + +	if ((cell_high > cell_low) && (cell_high < 256) +	    && (rtab[cell_low] == rtab[cell_high])) { +		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", +			 cell_low, cell_high, rtab[cell_high]); +		return TC_LINKLAYER_ATM; +	} +	return TC_LINKLAYER_ETHERNET; +} +  static struct qdisc_rate_table *qdisc_rtab_list;  struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)  {  	struct qdisc_rate_table *rtab; +	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || +	    nla_len(tab) != TC_RTAB_SIZE) +		return NULL; +  	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { -		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { +		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && +		    !memcmp(&rtab->data, nla_data(tab), 1024)) {  			rtab->refcnt++;  			return rtab;  		}  	} -	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || -	    nla_len(tab) != TC_RTAB_SIZE) -		return NULL; -  	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);  	if (rtab) {  		rtab->rate = *r;  		rtab->refcnt = 1;  		memcpy(rtab->data, nla_data(tab), 1024); +		if (r->linklayer == TC_LINKLAYER_UNAWARE) +			r->linklayer = __detect_linklayer(r, rtab->data);  		rtab->next = qdisc_rtab_list;  		qdisc_rtab_list = rtab;  	} @@ -321,7 +420,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)  	if (!tab || --tab->refcnt)  		return; -	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { +	for (rtabp = &qdisc_rtab_list; +	     (rtab = *rtabp) != NULL; +	     rtabp = &rtab->next) {  		if (rtab == tab) {  			*rtabp = rtab->next;  			kfree(rtab); @@ -396,6 +497,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)  	return stab;  } +static void stab_kfree_rcu(struct rcu_head *head) +{ +	kfree(container_of(head, struct qdisc_size_table, rcu)); +} +  void qdisc_put_stab(struct qdisc_size_table *tab)  {  	if (!tab) @@ -405,7 +511,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)  	if (--tab->refcnt == 0) {  		list_del(&tab->list); -		kfree(tab); +		call_rcu_bh(&tab->rcu, stab_kfree_rcu);  	}  	spin_unlock(&qdisc_stab_lock); @@ -419,7 +525,8 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)  	nest = nla_nest_start(skb, TCA_STAB);  	if (nest == NULL)  		goto nla_put_failure; -	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts); +	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) +		goto nla_put_failure;  	nla_nest_end(skb, nest);  	return skb->len; @@ -428,7 +535,7 @@ nla_put_failure:  	return -1;  } -void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab) +void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)  {  	int pkt_len, slot; @@ -454,14 +561,13 @@ out:  		pkt_len = 1;  	qdisc_skb_cb(skb)->pkt_len = pkt_len;  } -EXPORT_SYMBOL(qdisc_calculate_pkt_len); +EXPORT_SYMBOL(__qdisc_calculate_pkt_len); -void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) +void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)  {  	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { -		printk(KERN_WARNING -	
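
The __detect_linklayer() heuristic added above compares two rate-table entries around the mpu: if both map onto the same 48-byte ATM cell they must hold the same cost in an ATM-built table, while an Ethernet-built table grows between them. The worked example below uses two synthetic tables (not real iproute2 output) to show that comparison; the cost units are arbitrary.

/* Worked example of the ATM-vs-Ethernet check in __detect_linklayer().
 * The two rate tables are synthetic, built only to show why
 * rtab[cell_low] == rtab[cell_high] for an ATM-shaped table.
 */
#include <stdio.h>

#define CELL_LOG 3			/* table buckets of (1 << 3) = 8 bytes */
#define MPU      64

static unsigned int roundup48(unsigned int x)
{
	return ((x + 47) / 48) * 48;
}

int main(void)
{
	unsigned int eth[256], atm[256];
	unsigned int low, high, cell_low, cell_high;
	int i;

	for (i = 0; i < 256; i++) {
		unsigned int sz = (i + 1) << CELL_LOG;

		eth[i] = sz;				/* cost grows linearly      */
		atm[i] = ((sz + 47) / 48) * 53;		/* whole 53-byte ATM cells  */
	}

	low       = roundup48(MPU);
	high      = roundup48(low + 1);
	cell_low  = low >> CELL_LOG;
	cell_high = (high >> CELL_LOG) - 1;

	printf("cells %u..%u: eth %u/%u  atm %u/%u\n",
	       cell_low, cell_high,
	       eth[cell_low], eth[cell_high],
	       atm[cell_low], atm[cell_high]);
	/* the eth entries differ  -> TC_LINKLAYER_ETHERNET
	 * the atm entries match   -> TC_LINKLAYER_ATM        */
	return 0;
}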
	       "%s: %s qdisc %X: is non-work-conserving?\n", -		       txt, qdisc->ops->id, qdisc->handle >> 16); +		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", +			txt, qdisc->ops->id, qdisc->handle >> 16);  		qdisc->flags |= TCQ_F_WARN_NONWC;  	}  } @@ -472,7 +578,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)  	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,  						 timer); -	wd->qdisc->flags &= ~TCQ_F_THROTTLED; +	qdisc_unthrottled(wd->qdisc);  	__netif_schedule(qdisc_root(wd->qdisc));  	return HRTIMER_NORESTART; @@ -486,25 +592,24 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)  }  EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)  { -	ktime_t time; -  	if (test_bit(__QDISC_STATE_DEACTIVATED,  		     &qdisc_root_sleeping(wd->qdisc)->state))  		return; -	wd->qdisc->flags |= TCQ_F_THROTTLED; -	time = ktime_set(0, 0); -	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); -	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); +	qdisc_throttled(wd->qdisc); + +	hrtimer_start(&wd->timer, +		      ns_to_ktime(expires), +		      HRTIMER_MODE_ABS);  } -EXPORT_SYMBOL(qdisc_watchdog_schedule); +EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);  void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)  {  	hrtimer_cancel(&wd->timer); -	wd->qdisc->flags &= ~TCQ_F_THROTTLED; +	qdisc_unthrottled(wd->qdisc);  }  EXPORT_SYMBOL(qdisc_watchdog_cancel); @@ -539,7 +644,7 @@ static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)  void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)  {  	struct Qdisc_class_common *cl; -	struct hlist_node *n, *next; +	struct hlist_node *next;  	struct hlist_head *nhash, *ohash;  	unsigned int nsize, nmask, osize;  	unsigned int i, h; @@ -558,7 +663,7 @@ void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)  	sch_tree_lock(sch);  	for (i = 0; i < osize; i++) { -		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) { +		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {  			h = qdisc_class_hash(cl->classid, nmask);  			hlist_add_head(&cl->hnode, &nhash[h]);  		} @@ -612,20 +717,24 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,  }  EXPORT_SYMBOL(qdisc_class_hash_remove); -/* Allocate an unique handle from space managed by kernel */ - +/* Allocate an unique handle from space managed by kernel + * Possible range is [8000-FFFF]:0000 (0x8000 values) + */  static u32 qdisc_alloc_handle(struct net_device *dev)  { -	int i = 0x10000; +	int i = 0x8000;  	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);  	do {  		autohandle += TC_H_MAKE(0x10000U, 0);  		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))  			autohandle = TC_H_MAKE(0x80000000U, 0); -	} while	(qdisc_lookup(dev, autohandle) && --i > 0); +		if (!qdisc_lookup(dev, autohandle)) +			return autohandle; +		cond_resched(); +	} while	(--i > 0); -	return i>0 ? 
autohandle : 0; +	return 0;  }  void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) @@ -633,9 +742,11 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)  	const struct Qdisc_class_ops *cops;  	unsigned long cl;  	u32 parentid; +	int drops;  	if (n == 0)  		return; +	drops = max_t(int, n, 0);  	while ((parentid = sch->parent)) {  		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))  			return; @@ -652,6 +763,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)  			cops->put(sch, cl);  		}  		sch->q.qlen -= n; +		sch->qstats.drops += drops;  	}  }  EXPORT_SYMBOL(qdisc_tree_decrease_qlen); @@ -823,6 +935,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,  				goto err_out3;  		}  		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); +		if (!netif_is_multiqueue(dev)) +			sch->flags |= TCQ_F_ONETXQUEUE;  	}  	sch->handle = handle; @@ -834,7 +948,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,  				err = PTR_ERR(stab);  				goto err_out4;  			} -			sch->stab = stab; +			rcu_assign_pointer(sch->stab, stab);  		}  		if (tca[TCA_RATE]) {  			spinlock_t *root_lock; @@ -874,7 +988,7 @@ err_out4:  	 * Any broken qdiscs that would require a ops->reset() here?  	 * The qdisc was never in action so it shouldn't be necessary.  	 */ -	qdisc_put_stab(sch->stab); +	qdisc_put_stab(rtnl_dereference(sch->stab));  	if (ops->destroy)  		ops->destroy(sch);  	goto err_out3; @@ -882,7 +996,7 @@ err_out4:  static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)  { -	struct qdisc_size_table *stab = NULL; +	struct qdisc_size_table *ostab, *stab = NULL;  	int err = 0;  	if (tca[TCA_OPTIONS]) { @@ -899,8 +1013,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)  			return PTR_ERR(stab);  	} -	qdisc_put_stab(sch->stab); -	sch->stab = stab; +	ostab = rtnl_dereference(sch->stab); +	rcu_assign_pointer(sch->stab, stab); +	qdisc_put_stab(ostab);  	if (tca[TCA_RATE]) {  		/* NB: ignores errors from replace_estimator @@ -915,9 +1030,8 @@ out:  	return 0;  } -struct check_loop_arg -{ -	struct qdisc_walker 	w; +struct check_loop_arg { +	struct qdisc_walker	w;  	struct Qdisc		*p;  	int			depth;  }; @@ -959,33 +1073,39 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)   * Delete/get qdisc.   
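
The qdisc_alloc_handle() change above caps the probe at 0x8000 automatic majors (8000: through ffff:, stepped by 0x10000) and returns as soon as a free handle is found. The sketch below replays that arithmetic in userspace; TC_H_MAKE/TC_H_MAJ are re-declared to match the uapi macros, and the "in use" check is a stub instead of qdisc_lookup().

/* Sketch of the automatic handle space walked by qdisc_alloc_handle(). */
#include <stdio.h>

#define TC_H_MAJ_MASK 0xFFFF0000U
#define TC_H_MIN_MASK 0x0000FFFFU
#define TC_H_MAJ(h)  ((h) & TC_H_MAJ_MASK)
#define TC_H_MAKE(maj, min) (((maj) & TC_H_MAJ_MASK) | ((min) & TC_H_MIN_MASK))
#define TC_H_ROOT   0xFFFFFFFFU

static int handle_in_use(unsigned int h)
{
	/* pretend 8001:0 and 8002:0 are already taken */
	return h == 0x80010000U || h == 0x80020000U;
}

static unsigned int alloc_handle(void)
{
	static unsigned int autohandle = TC_H_MAKE(0x80000000U, 0);
	int i = 0x8000;			/* at most 0x8000 distinct majors */

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!handle_in_use(autohandle))
			return autohandle;
	} while (--i > 0);

	return 0;			/* space exhausted */
}

int main(void)
{
	printf("%x:\n", TC_H_MAJ(alloc_handle()) >> 16);	/* prints 8003: */
	return 0;
}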
*/ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)  {  	struct net *net = sock_net(skb->sk); -	struct tcmsg *tcm = NLMSG_DATA(n); +	struct tcmsg *tcm = nlmsg_data(n);  	struct nlattr *tca[TCA_MAX + 1];  	struct net_device *dev; -	u32 clid = tcm->tcm_parent; +	u32 clid;  	struct Qdisc *q = NULL;  	struct Qdisc *p = NULL;  	int err; -	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) -		return -ENODEV; +	if ((n->nlmsg_type != RTM_GETQDISC) && +	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) +		return -EPERM;  	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);  	if (err < 0)  		return err; +	dev = __dev_get_by_index(net, tcm->tcm_ifindex); +	if (!dev) +		return -ENODEV; + +	clid = tcm->tcm_parent;  	if (clid) {  		if (clid != TC_H_ROOT) {  			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { -				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) +				p = qdisc_lookup(dev, TC_H_MAJ(clid)); +				if (!p)  					return -ENOENT;  				q = qdisc_leaf(p, clid); -			} else { /* ingress */ -				if (dev_ingress_queue(dev)) -					q = dev_ingress_queue(dev)->qdisc_sleeping; +			} else if (dev_ingress_queue(dev)) { +				q = dev_ingress_queue(dev)->qdisc_sleeping;  			}  		} else {  			q = dev->qdisc; @@ -996,7 +1116,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)  			return -EINVAL;  	} else { -		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) +		q = qdisc_lookup(dev, tcm->tcm_handle); +		if (!q)  			return -ENOENT;  	} @@ -1008,7 +1129,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  			return -EINVAL;  		if (q->handle == 0)  			return -ENOENT; -		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0) +		err = qdisc_graft(dev, p, skb, n, clid, NULL, q); +		if (err != 0)  			return err;  	} else {  		qdisc_notify(net, skb, n, clid, NULL, q); @@ -1017,10 +1139,10 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  }  /* -   Create/change qdisc. + * Create/change qdisc.   */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)  {  	struct net *net = sock_net(skb->sk);  	struct tcmsg *tcm; @@ -1030,28 +1152,33 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  	struct Qdisc *q, *p;  	int err; +	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) +		return -EPERM; +  replay:  	/* Reinit, just in case something touches this. 
*/ -	tcm = NLMSG_DATA(n); +	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); +	if (err < 0) +		return err; + +	tcm = nlmsg_data(n);  	clid = tcm->tcm_parent;  	q = p = NULL; -	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) +	dev = __dev_get_by_index(net, tcm->tcm_ifindex); +	if (!dev)  		return -ENODEV; -	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); -	if (err < 0) -		return err;  	if (clid) {  		if (clid != TC_H_ROOT) {  			if (clid != TC_H_INGRESS) { -				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) +				p = qdisc_lookup(dev, TC_H_MAJ(clid)); +				if (!p)  					return -ENOENT;  				q = qdisc_leaf(p, clid); -			} else { /* ingress */ -				if (dev_ingress_queue_create(dev)) -					q = dev_ingress_queue(dev)->qdisc_sleeping; +			} else if (dev_ingress_queue_create(dev)) { +				q = dev_ingress_queue(dev)->qdisc_sleeping;  			}  		} else {  			q = dev->qdisc; @@ -1063,13 +1190,14 @@ replay:  		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {  			if (tcm->tcm_handle) { -				if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) +				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))  					return -EEXIST;  				if (TC_H_MIN(tcm->tcm_handle))  					return -EINVAL; -				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) +				q = qdisc_lookup(dev, tcm->tcm_handle); +				if (!q)  					goto create_n_graft; -				if (n->nlmsg_flags&NLM_F_EXCL) +				if (n->nlmsg_flags & NLM_F_EXCL)  					return -EEXIST;  				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))  					return -EINVAL; @@ -1079,7 +1207,7 @@ replay:  				atomic_inc(&q->refcnt);  				goto graft;  			} else { -				if (q == NULL) +				if (!q)  					goto create_n_graft;  				/* This magic test requires explanation. @@ -1101,9 +1229,9 @@ replay:  				 *   For now we select create/graft, if  				 *   user gave KIND, which does not match existing.  				 
*/ -				if ((n->nlmsg_flags&NLM_F_CREATE) && -				    (n->nlmsg_flags&NLM_F_REPLACE) && -				    ((n->nlmsg_flags&NLM_F_EXCL) || +				if ((n->nlmsg_flags & NLM_F_CREATE) && +				    (n->nlmsg_flags & NLM_F_REPLACE) && +				    ((n->nlmsg_flags & NLM_F_EXCL) ||  				     (tca[TCA_KIND] &&  				      nla_strcmp(tca[TCA_KIND], q->ops->id))))  					goto create_n_graft; @@ -1118,7 +1246,7 @@ replay:  	/* Change qdisc parameters */  	if (q == NULL)  		return -ENOENT; -	if (n->nlmsg_flags&NLM_F_EXCL) +	if (n->nlmsg_flags & NLM_F_EXCL)  		return -EEXIST;  	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))  		return -EINVAL; @@ -1128,7 +1256,7 @@ replay:  	return err;  create_n_graft: -	if (!(n->nlmsg_flags&NLM_F_CREATE)) +	if (!(n->nlmsg_flags & NLM_F_CREATE))  		return -ENOENT;  	if (clid == TC_H_INGRESS) {  		if (dev_ingress_queue(dev)) @@ -1169,15 +1297,19 @@ graft:  }  static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, -			 u32 pid, u32 seq, u16 flags, int event) +			 u32 portid, u32 seq, u16 flags, int event)  {  	struct tcmsg *tcm;  	struct nlmsghdr  *nlh;  	unsigned char *b = skb_tail_pointer(skb);  	struct gnet_dump d; +	struct qdisc_size_table *stab; -	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); -	tcm = NLMSG_DATA(nlh); +	cond_resched(); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); +	if (!nlh) +		goto out_nlmsg_trim; +	tcm = nlmsg_data(nlh);  	tcm->tcm_family = AF_UNSPEC;  	tcm->tcm__pad1 = 0;  	tcm->tcm__pad2 = 0; @@ -1185,12 +1317,14 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,  	tcm->tcm_parent = clid;  	tcm->tcm_handle = q->handle;  	tcm->tcm_info = atomic_read(&q->refcnt); -	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); +	if (nla_put_string(skb, TCA_KIND, q->ops->id)) +		goto nla_put_failure;  	if (q->ops->dump && q->ops->dump(q, skb) < 0)  		goto nla_put_failure;  	q->qstats.qlen = q->q.qlen; -	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0) +	stab = rtnl_dereference(q->stab); +	if (stab && qdisc_dump_stab(skb, stab) < 0)  		goto nla_put_failure;  	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, @@ -1211,7 +1345,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,  	nlh->nlmsg_len = skb_tail_pointer(skb) - b;  	return skb->len; -nlmsg_failure: +out_nlmsg_trim:  nla_put_failure:  	nlmsg_trim(skb, b);  	return -1; @@ -1227,23 +1361,26 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb,  			struct Qdisc *old, struct Qdisc *new)  {  	struct sk_buff *skb; -	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; +	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb)  		return -ENOBUFS;  	if (old && !tc_qdisc_dump_ignore(old)) { -		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) +		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, +				  0, RTM_DELQDISC) < 0)  			goto err_out;  	}  	if (new && !tc_qdisc_dump_ignore(new)) { -		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) +		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, +				  old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)  			goto err_out;  	}  	if (skb->len) -		return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +		return rtnetlink_send(skb, net, portid, RTNLGRP_TC, +				      n->nlmsg_flags & NLM_F_ECHO);  err_out:  	kfree_skb(skb); @@ -1265,7 +1402,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,  		q_idx++;  	} else {  		if (!tc_qdisc_dump_ignore(q) && -		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, +		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,  				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)  			goto done;  		q_idx++; @@ -1275,8 +1412,8 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,  			q_idx++;  			continue;  		} -		if (!tc_qdisc_dump_ignore(q) &&  -		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, +		if (!tc_qdisc_dump_ignore(q) && +		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,  				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)  			goto done;  		q_idx++; @@ -1300,9 +1437,9 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)  	s_idx = cb->args[0];  	s_q_idx = q_idx = cb->args[1]; -	rcu_read_lock();  	idx = 0; -	for_each_netdev_rcu(net, dev) { +	ASSERT_RTNL(); +	for_each_netdev(net, dev) {  		struct netdev_queue *dev_queue;  		if (idx < s_idx) @@ -1325,8 +1462,6 @@ cont:  	}  done: -	rcu_read_unlock(); -  	cb->args[0] = idx;  	cb->args[1] = q_idx; @@ -1341,28 +1476,33 @@ done: -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)  {  	struct net *net = sock_net(skb->sk); -	struct tcmsg *tcm = NLMSG_DATA(n); +	struct tcmsg *tcm = nlmsg_data(n);  	struct nlattr *tca[TCA_MAX + 1];  	struct net_device *dev;  	struct Qdisc *q = NULL;  	const struct Qdisc_class_ops *cops;  	unsigned long cl = 0;  	unsigned long new_cl; -	u32 pid = tcm->tcm_parent; -	u32 clid = tcm->tcm_handle; -	u32 qid = TC_H_MAJ(clid); +	u32 portid; +	u32 clid; +	u32 qid;  	int err; -	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) -		return -ENODEV; +	if ((n->nlmsg_type != RTM_GETTCLASS) && +	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) +		return -EPERM;  	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);  	if (err < 0)  		return err; +	dev = __dev_get_by_index(net, tcm->tcm_ifindex); +	if (!dev) +		return -ENODEV; +  	/*  	   parent == TC_H_UNSPEC - unspecified parent.  	   parent == TC_H_ROOT   - class is root, which has no parent. @@ -1378,8 +1518,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  	/* Step 1. Determine qdisc handle X:0 */ -	if (pid != TC_H_ROOT) { -		u32 qid1 = TC_H_MAJ(pid); +	portid = tcm->tcm_parent; +	clid = tcm->tcm_handle; +	qid = TC_H_MAJ(clid); + +	if (portid != TC_H_ROOT) { +		u32 qid1 = TC_H_MAJ(portid);  		if (qid && qid1) {  			/* If both majors are known, they must be identical. */ @@ -1391,19 +1535,20 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  			qid = dev->qdisc->handle;  		/* Now qid is genuine qdisc handle consistent -		   both with parent and child. - -		   TC_H_MAJ(pid) still may be unspecified, complete it now. +		 * both with parent and child. +		 * +		 * TC_H_MAJ(portid) still may be unspecified, complete it now.  		 
*/ -		if (pid) -			pid = TC_H_MAKE(qid, pid); +		if (portid) +			portid = TC_H_MAKE(qid, portid);  	} else {  		if (qid == 0)  			qid = dev->qdisc->handle;  	}  	/* OK. Locate qdisc */ -	if ((q = qdisc_lookup(dev, qid)) == NULL) +	q = qdisc_lookup(dev, qid); +	if (!q)  		return -ENOENT;  	/* An check that it supports classes */ @@ -1413,7 +1558,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  	/* Now try to get class */  	if (clid == 0) { -		if (pid == TC_H_ROOT) +		if (portid == TC_H_ROOT)  			clid = qid;  	} else  		clid = TC_H_MAKE(qid, clid); @@ -1423,13 +1568,14 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  	if (cl == 0) {  		err = -ENOENT; -		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) +		if (n->nlmsg_type != RTM_NEWTCLASS || +		    !(n->nlmsg_flags & NLM_F_CREATE))  			goto out;  	} else {  		switch (n->nlmsg_type) {  		case RTM_NEWTCLASS:  			err = -EEXIST; -			if (n->nlmsg_flags&NLM_F_EXCL) +			if (n->nlmsg_flags & NLM_F_EXCL)  				goto out;  			break;  		case RTM_DELTCLASS: @@ -1451,7 +1597,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)  	new_cl = cl;  	err = -EOPNOTSUPP;  	if (cops->change) -		err = cops->change(q, clid, pid, tca, &new_cl); +		err = cops->change(q, clid, portid, tca, &new_cl);  	if (err == 0)  		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); @@ -1465,7 +1611,7 @@ out:  static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,  			  unsigned long cl, -			  u32 pid, u32 seq, u16 flags, int event) +			  u32 portid, u32 seq, u16 flags, int event)  {  	struct tcmsg *tcm;  	struct nlmsghdr  *nlh; @@ -1473,8 +1619,11 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,  	struct gnet_dump d;  	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; -	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); -	tcm = NLMSG_DATA(nlh); +	cond_resched(); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); +	if (!nlh) +		goto out_nlmsg_trim; +	tcm = nlmsg_data(nlh);  	tcm->tcm_family = AF_UNSPEC;  	tcm->tcm__pad1 = 0;  	tcm->tcm__pad2 = 0; @@ -1482,7 +1631,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,  	tcm->tcm_parent = q->handle;  	tcm->tcm_handle = q->handle;  	tcm->tcm_info = 0; -	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); +	if (nla_put_string(skb, TCA_KIND, q->ops->id)) +		goto nla_put_failure;  	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)  		goto nla_put_failure; @@ -1499,7 +1649,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,  	nlh->nlmsg_len = skb_tail_pointer(skb) - b;  	return skb->len; -nlmsg_failure: +out_nlmsg_trim:  nla_put_failure:  	nlmsg_trim(skb, b);  	return -1; @@ -1510,32 +1660,32 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,  			 unsigned long cl, int event)  {  	struct sk_buff *skb; -	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; +	u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0;  	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);  	if (!skb)  		return -ENOBUFS; -	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { +	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {  		kfree_skb(skb);  		return -EINVAL;  	} -	return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +	return rtnetlink_send(skb, net, portid, RTNLGRP_TC, +			      n->nlmsg_flags & NLM_F_ECHO);  } -struct qdisc_dump_args -{ -	struct qdisc_walker w; -	struct sk_buff *skb; -	struct netlink_callback *cb; +struct qdisc_dump_args { +	struct qdisc_walker	w; +	struct sk_buff		*skb; +	struct netlink_callback	*cb;  };  static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)  {  	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; -	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, +	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,  			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);  } @@ -1590,15 +1740,16 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,  static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)  { -	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); +	struct tcmsg *tcm = nlmsg_data(cb->nlh);  	struct net *net = sock_net(skb->sk);  	struct netdev_queue *dev_queue;  	struct net_device *dev;  	int t, s_t; -	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) +	if (nlmsg_len(cb->nlh) < sizeof(*tcm))  		return 0; -	if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) +	dev = dev_get_by_index(net, tcm->tcm_ifindex); +	if (!dev)  		return 0;  	s_t = cb->args[0]; @@ -1621,19 +1772,22 @@ done:  }  /* Main classifier routine: scans classifier chain attached -   to this qdisc, (optionally) tests for protocol and asks -   specific classifiers. + * to this qdisc, (optionally) tests for protocol and asks + * specific classifiers.   
*/ -int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp, +int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,  		       struct tcf_result *res)  {  	__be16 protocol = skb->protocol; -	int err = 0; +	int err;  	for (; tp; tp = tp->next) { -		if ((tp->protocol == protocol || -		     tp->protocol == htons(ETH_P_ALL)) && -		    (err = tp->classify(skb, tp, res)) >= 0) { +		if (tp->protocol != protocol && +		    tp->protocol != htons(ETH_P_ALL)) +			continue; +		err = tp->classify(skb, tp, res); + +		if (err >= 0) {  #ifdef CONFIG_NET_CLS_ACT  			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)  				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); @@ -1645,16 +1799,14 @@ int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,  }  EXPORT_SYMBOL(tc_classify_compat); -int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,  		struct tcf_result *res)  {  	int err = 0; -	__be16 protocol;  #ifdef CONFIG_NET_CLS_ACT -	struct tcf_proto *otp = tp; +	const struct tcf_proto *otp = tp;  reclassify:  #endif -	protocol = skb->protocol;  	err = tc_classify_compat(skb, tp, res);  #ifdef CONFIG_NET_CLS_ACT @@ -1663,12 +1815,10 @@ reclassify:  		tp = otp;  		if (verd++ >= MAX_REC_LOOP) { -			if (net_ratelimit()) -				printk(KERN_NOTICE -				       "%s: packet reclassify loop" -					  " rule prio %u protocol %02x\n", -				       tp->q->ops->id, -				       tp->prio & 0xffff, ntohs(tp->protocol)); +			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", +					       tp->q->ops->id, +					       tp->prio & 0xffff, +					       ntohs(tp->protocol));  			return TC_ACT_SHOT;  		}  		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); @@ -1728,7 +1878,7 @@ static int __net_init psched_net_init(struct net *net)  {  	struct proc_dir_entry *e; -	e = proc_net_fops_create(net, "psched", 0, &psched_fops); +	e = proc_create("psched", 0, net->proc_net, &psched_fops);  	if (e == NULL)  		return -ENOMEM; @@ -1737,7 +1887,7 @@ static int __net_init psched_net_init(struct net *net)  static void __net_exit psched_net_exit(struct net *net)  { -	proc_net_remove(net, "psched"); +	remove_proc_entry("psched", net->proc_net);  }  #else  static int __net_init psched_net_init(struct net *net) @@ -1761,22 +1911,23 @@ static int __init pktsched_init(void)  	err = register_pernet_subsys(&psched_net_ops);  	if (err) { -		printk(KERN_ERR "pktsched_init: " +		pr_err("pktsched_init: "  		       "cannot initialize per netns operations\n");  		return err;  	} +	register_qdisc(&pfifo_fast_ops);  	register_qdisc(&pfifo_qdisc_ops);  	register_qdisc(&bfifo_qdisc_ops);  	register_qdisc(&pfifo_head_drop_qdisc_ops);  	register_qdisc(&mq_qdisc_ops); -	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL); -	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL); -	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc); -	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL); -	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL); -	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass); +	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL); +	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); +	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); +	rtnl_register(PF_UNSPEC, 
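
tc_classify_compat() above is restructured so that filters whose protocol matches neither the packet nor the ETH_P_ALL wildcard are skipped with an early continue, and the first classify() verdict >= 0 wins. The stand-alone model below shows that chain walk; the filter struct and the two classifiers are illustrative stubs, not kernel types.

/* Stand-alone model of the filter-chain walk in tc_classify_compat(). */
#include <stdio.h>

#define ETH_P_ALL 0x0003
#define ETH_P_IP  0x0800
#define ETH_P_ARP 0x0806

struct filter {
	unsigned short protocol;
	int (*classify)(unsigned short proto, int *classid);
	struct filter *next;
};

static int match_ip(unsigned short proto, int *classid)
{
	(void)proto;
	*classid = 10;
	return 0;			/* >= 0: verdict reached */
}

static int match_any(unsigned short proto, int *classid)
{
	(void)proto;
	*classid = 99;
	return 0;
}

static int classify(struct filter *tp, unsigned short protocol, int *classid)
{
	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol && tp->protocol != ETH_P_ALL)
			continue;		/* wrong protocol, next filter */
		if (tp->classify(protocol, classid) >= 0)
			return 0;
	}
	return -1;			/* no filter claimed the packet */
}

int main(void)
{
	struct filter f2 = { ETH_P_ALL, match_any, NULL };
	struct filter f1 = { ETH_P_IP,  match_ip,  &f2 };
	int classid = 0;

	classify(&f1, ETH_P_ARP, &classid);	/* falls through to wildcard */
	printf("ARP -> class %d\n", classid);	/* 99 */
	classify(&f1, ETH_P_IP, &classid);
	printf("IP  -> class %d\n", classid);	/* 10 */
	return 0;
}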
RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);  	return 0;  } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 282540778aa..8449b337f9e 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -5,6 +5,7 @@  #include <linux/module.h>  #include <linux/slab.h>  #include <linux/init.h> +#include <linux/interrupt.h>  #include <linux/string.h>  #include <linux/errno.h>  #include <linux/skbuff.h> @@ -15,8 +16,6 @@  #include <net/netlink.h>  #include <net/pkt_sched.h> -extern struct socket *sockfd_lookup(int fd, int *err);	/* @@@ fix this */ -  /*   * The ATM queuing discipline provides a framework for invoking classifiers   * (aka "filters"), which in turn select classes of this queuing discipline. @@ -319,7 +318,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)  	 * creation), and one for the reference held when calling delete.  	 */  	if (flow->ref < 2) { -		printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n", flow->ref); +		pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);  		return -EINVAL;  	}  	if (flow->ref > 2) @@ -384,12 +383,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)  			}  		}  		flow = NULL; -	done: -		;		 +done: +		;  	} -	if (!flow) +	if (!flow) {  		flow = &p->link; -	else { +	} else {  		if (flow->vcc)  			ATM_SKB(skb)->atm_options = flow->vcc->atm_options;  		/*@@@ looks good ... but it's not supposed to work :-) */ @@ -422,10 +421,6 @@ drop: __maybe_unused  		}  		return ret;  	} -	sch->bstats.bytes += qdisc_pkt_len(skb); -	sch->bstats.packets++; -	flow->bstats.bytes += qdisc_pkt_len(skb); -	flow->bstats.packets++;  	/*  	 * Okay, this may seem weird. We pretend we've dropped the packet if  	 * it goes via ATM. The reason for this is that the outer qdisc @@ -473,6 +468,8 @@ static void sch_atm_dequeue(unsigned long data)  			if (unlikely(!skb))  				break; +			qdisc_bstats_update(sch, skb); +			bstats_update(&flow->bstats, skb);  			pr_debug("atm_tc_dequeue: sending on class %p\n", flow);  			/* remove any LL header somebody else has attached */  			skb_pull(skb, skb_network_offset(skb)); @@ -578,8 +575,7 @@ static void atm_tc_destroy(struct Qdisc *sch)  	list_for_each_entry_safe(flow, tmp, &p->flows, list) {  		if (flow->ref > 1) -			printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow, -			       flow->ref); +			pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);  		atm_tc_put(sch, (unsigned long)flow);  	}  	tasklet_kill(&p->task); @@ -603,27 +599,31 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,  	if (nest == NULL)  		goto nla_put_failure; -	NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr); +	if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) +		goto nla_put_failure;  	if (flow->vcc) {  		struct sockaddr_atmpvc pvc;  		int state; +		memset(&pvc, 0, sizeof(pvc));  		pvc.sap_family = AF_ATMPVC;  		pvc.sap_addr.itf = flow->vcc->dev ? 
flow->vcc->dev->number : -1;  		pvc.sap_addr.vpi = flow->vcc->vpi;  		pvc.sap_addr.vci = flow->vcc->vci; -		NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc); +		if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) +			goto nla_put_failure;  		state = ATM_VF2VS(flow->vcc->flags); -		NLA_PUT_U32(skb, TCA_ATM_STATE, state); +		if (nla_put_u32(skb, TCA_ATM_STATE, state)) +			goto nla_put_failure;  	} -	if (flow->excess) -		NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid); -	else { -		NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0); +	if (flow->excess) { +		if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid)) +			goto nla_put_failure; +	} else { +		if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) +			goto nla_put_failure;  	} - -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index eb763159086..ead526467cc 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -72,8 +72,7 @@  struct cbq_sched_data; -struct cbq_class -{ +struct cbq_class {  	struct Qdisc_class_common common;  	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */ @@ -131,7 +130,7 @@ struct cbq_class  	psched_time_t		penalized;  	struct gnet_stats_basic_packed bstats;  	struct gnet_stats_queue qstats; -	struct gnet_stats_rate_est rate_est; +	struct gnet_stats_rate_est64 rate_est;  	struct tc_cbq_xstats	xstats;  	struct tcf_proto	*filter_list; @@ -139,19 +138,18 @@ struct cbq_class  	int			refcnt;  	int			filters; -	struct cbq_class 	*defaults[TC_PRIO_MAX+1]; +	struct cbq_class	*defaults[TC_PRIO_MAX + 1];  }; -struct cbq_sched_data -{ +struct cbq_sched_data {  	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */ -	int			nclasses[TC_CBQ_MAXPRIO+1]; -	unsigned		quanta[TC_CBQ_MAXPRIO+1]; +	int			nclasses[TC_CBQ_MAXPRIO + 1]; +	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];  	struct cbq_class	link; -	unsigned		activemask; -	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes +	unsigned int		activemask; +	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes  								   with backlog */  #ifdef CONFIG_NET_CLS_ACT @@ -162,7 +160,7 @@ struct cbq_sched_data  	int			tx_len;  	psched_time_t		now;		/* Cached timestamp */  	psched_time_t		now_rt;		/* Cached real time */ -	unsigned		pmask; +	unsigned int		pmask;  	struct hrtimer		delay_timer;  	struct qdisc_watchdog	watchdog;	/* Watchdog timer, @@ -175,9 +173,9 @@ struct cbq_sched_data  }; -#define L2T(cl,len)	qdisc_l2t((cl)->R_tab,len) +#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len) -static __inline__ struct cbq_class * +static inline struct cbq_class *  cbq_class_lookup(struct cbq_sched_data *q, u32 classid)  {  	struct Qdisc_class_common *clc; @@ -193,25 +191,27 @@ cbq_class_lookup(struct cbq_sched_data *q, u32 classid)  static struct cbq_class *  cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)  { -	struct cbq_class *cl, *new; +	struct cbq_class *cl; -	for (cl = this->tparent; cl; cl = cl->tparent) -		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) -			return new; +	for (cl = this->tparent; cl; cl = cl->tparent) { +		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; +		if (new != NULL && new != this) +			return new; +	}  	return NULL;  }  #endif  /* Classify packet. The procedure is pretty complicated, but -   it allows us to combine link sharing and priority scheduling -   transparently. - -   Namely, you can put link sharing rules (f.e. 
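
The atm_tc_dump_class() hunk above adds a memset(&pvc, 0, sizeof(pvc)) before filling only the used members and copying the whole struct to userspace with nla_put(); without it, padding and unused bytes could carry stale stack contents out through netlink. The demo below uses an illustrative struct (not sockaddr_atmpvc) and assumes the usual 3 bytes of padding after a lone char member.

/* Why the added memset matters: a struct copied out wholesale can carry
 * padding bytes that were never written.  The layout here is illustrative.
 */
#include <stdio.h>
#include <string.h>

struct pvc_like {
	unsigned char family;		/* 1 byte, padding typically follows */
	int itf;
	int vpi;
	int vci;
};

static void fill(struct pvc_like *p, int zero_first)
{
	if (zero_first)
		memset(p, 0, sizeof(*p));
	p->family = 8;
	p->itf = 1;
	p->vpi = 0;
	p->vci = 100;
}

int main(void)
{
	unsigned char buf[sizeof(struct pvc_like)];
	struct pvc_like p;
	size_t i;

	memset(&p, 0xAA, sizeof(p));	/* stand-in for stale stack data */
	fill(&p, 0);
	memcpy(buf, &p, sizeof(p));	/* what nla_put() would copy out */
	for (i = 0; i < sizeof(buf); i++)
		printf("%02x ", buf[i]);
	printf(" <- 0xaa padding bytes escape without the memset\n");

	fill(&p, 1);
	memcpy(buf, &p, sizeof(p));
	for (i = 0; i < sizeof(buf); i++)
		printf("%02x ", buf[i]);
	printf(" <- fully initialised\n");
	return 0;
}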
route based) at root of CBQ, -   so that it resolves to split nodes. Then packets are classified -   by logical priority, or a more specific classifier may be attached -   to the split node. + * it allows us to combine link sharing and priority scheduling + * transparently. + * + * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + * so that it resolves to split nodes. Then packets are classified + * by logical priority, or a more specific classifier may be attached + * to the split node.   */  static struct cbq_class * @@ -227,7 +227,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)  	/*  	 *  Step 1. If skb->priority points to one of our classes, use it.  	 */ -	if (TC_H_MAJ(prio^sch->handle) == 0 && +	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&  	    (cl = cbq_class_lookup(q, prio)) != NULL)  		return cl; @@ -243,16 +243,18 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)  		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)  			goto fallback; -		if ((cl = (void*)res.class) == NULL) { +		cl = (void *)res.class; +		if (!cl) {  			if (TC_H_MAJ(res.classid))  				cl = cbq_class_lookup(q, res.classid); -			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) +			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)  				cl = defmap[TC_PRIO_BESTEFFORT]; -			if (cl == NULL || cl->level >= head->level) +			if (cl == NULL)  				goto fallback;  		} - +		if (cl->level >= head->level) +			goto fallback;  #ifdef CONFIG_NET_CLS_ACT  		switch (result) {  		case TC_ACT_QUEUED: @@ -282,7 +284,7 @@ fallback:  	 * Step 4. No success...  	 */  	if (TC_H_MAJ(prio) == 0 && -	    !(cl = head->defaults[prio&TC_PRIO_MAX]) && +	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&  	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))  		return head; @@ -290,12 +292,12 @@ fallback:  }  /* -   A packet has just been enqueued on the empty class. -   cbq_activate_class adds it to the tail of active class list -   of its priority band. + * A packet has just been enqueued on the empty class. + * cbq_activate_class adds it to the tail of active class list + * of its priority band.   */ -static __inline__ void cbq_activate_class(struct cbq_class *cl) +static inline void cbq_activate_class(struct cbq_class *cl)  {  	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);  	int prio = cl->cpriority; @@ -314,9 +316,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl)  }  /* -   Unlink class from active chain. -   Note that this same procedure is done directly in cbq_dequeue* -   during round-robin procedure. + * Unlink class from active chain. + * Note that this same procedure is done directly in cbq_dequeue* + * during round-robin procedure.   
*/  static void cbq_deactivate_class(struct cbq_class *this) @@ -350,7 +352,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)  {  	int toplevel = q->toplevel; -	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { +	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {  		psched_time_t now;  		psched_tdiff_t incr; @@ -363,7 +365,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)  				q->toplevel = cl->level;  				return;  			} -		} while ((cl=cl->borrow) != NULL && toplevel > cl->level); +		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);  	}  } @@ -390,8 +392,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	ret = qdisc_enqueue(skb, cl->q);  	if (ret == NET_XMIT_SUCCESS) {  		sch->q.qlen++; -		sch->bstats.packets++; -		sch->bstats.bytes += qdisc_pkt_len(skb);  		cbq_mark_toplevel(q, cl);  		if (!cl->next_alive)  			cbq_activate_class(cl); @@ -419,11 +419,11 @@ static void cbq_ovl_classic(struct cbq_class *cl)  		delay += cl->offtime;  		/* -		   Class goes to sleep, so that it will have no -		   chance to work avgidle. Let's forgive it 8) - -		   BTW cbq-2.0 has a crap in this -		   place, apparently they forgot to shift it by cl->ewma_log. +		 * Class goes to sleep, so that it will have no +		 * chance to work avgidle. Let's forgive it 8) +		 * +		 * BTW cbq-2.0 has a crap in this +		 * place, apparently they forgot to shift it by cl->ewma_log.  		 */  		if (cl->avgidle < 0)  			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); @@ -440,8 +440,8 @@ static void cbq_ovl_classic(struct cbq_class *cl)  		q->wd_expires = delay;  	/* Dirty work! We must schedule wakeups based on -	   real available rate, rather than leaf rate, -	   which may be tiny (even zero). +	 * real available rate, rather than leaf rate, +	 * which may be tiny (even zero).  	 
*/  	if (q->toplevel == TC_CBQ_MAXLEVEL) {  		struct cbq_class *b; @@ -461,7 +461,7 @@ static void cbq_ovl_classic(struct cbq_class *cl)  }  /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when -   they go overlimit + * they go overlimit   */  static void cbq_ovl_rclassic(struct cbq_class *cl) @@ -509,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl)  			cl->cpriority = TC_CBQ_MAXPRIO;  			q->pmask |= (1<<TC_CBQ_MAXPRIO); -			expires = ktime_set(0, 0); -			expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched)); +			expires = ns_to_ktime(PSCHED_TICKS2NS(sched));  			if (hrtimer_try_to_cancel(&q->delay_timer) &&  			    ktime_to_ns(ktime_sub(  					hrtimer_get_expires(&q->delay_timer), @@ -596,7 +595,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)  	struct Qdisc *sch = q->watchdog.qdisc;  	psched_time_t now;  	psched_tdiff_t delay = 0; -	unsigned pmask; +	unsigned int pmask;  	now = psched_get_time(); @@ -625,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)  		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);  	} -	sch->flags &= ~TCQ_F_THROTTLED; +	qdisc_unthrottled(sch);  	__netif_schedule(qdisc_root(sch));  	return HRTIMER_NORESTART;  } @@ -650,8 +649,6 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)  		ret = qdisc_enqueue(skb, cl->q);  		if (ret == NET_XMIT_SUCCESS) {  			sch->q.qlen++; -			sch->bstats.packets++; -			sch->bstats.bytes += qdisc_pkt_len(skb);  			if (!cl->next_alive)  				cbq_activate_class(cl);  			return 0; @@ -667,15 +664,15 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)  #endif  /* -   It is mission critical procedure. - -   We "regenerate" toplevel cutoff, if transmitting class -   has backlog and it is not regulated. It is not part of -   original CBQ description, but looks more reasonable. -   Probably, it is wrong. This question needs further investigation. -*/ + * It is mission critical procedure. + * + * We "regenerate" toplevel cutoff, if transmitting class + * has backlog and it is not regulated. It is not part of + * original CBQ description, but looks more reasonable. + * Probably, it is wrong. This question needs further investigation. + */ -static __inline__ void +static inline void  cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,  		    struct cbq_class *borrowed)  { @@ -686,7 +683,7 @@ cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,  					q->toplevel = borrowed->level;  					return;  				} -			} while ((borrowed=borrowed->borrow) != NULL); +			} while ((borrowed = borrowed->borrow) != NULL);  		}  #if 0  	/* It is not necessary now. Uncommenting it @@ -714,10 +711,10 @@ cbq_update(struct cbq_sched_data *q)  		cl->bstats.bytes += len;  		/* -		   (now - last) is total time between packet right edges. -		   (last_pktlen/rate) is "virtual" busy time, so that - -			 idle = (now - last) - last_pktlen/rate +		 * (now - last) is total time between packet right edges. +		 * (last_pktlen/rate) is "virtual" busy time, so that +		 * +		 *	idle = (now - last) - last_pktlen/rate  		 */  		idle = q->now - cl->last; @@ -727,9 +724,9 @@ cbq_update(struct cbq_sched_data *q)  			idle -= L2T(cl, len);  		/* true_avgidle := (1-W)*true_avgidle + W*idle, -		   where W=2^{-ewma_log}. But cl->avgidle is scaled: -		   cl->avgidle == true_avgidle/W, -		   hence: +		 * where W=2^{-ewma_log}. 
But cl->avgidle is scaled: +		 * cl->avgidle == true_avgidle/W, +		 * hence:  		 */  			avgidle += idle - (avgidle>>cl->ewma_log);  		} @@ -743,22 +740,22 @@ cbq_update(struct cbq_sched_data *q)  			cl->avgidle = avgidle;  			/* Calculate expected time, when this class -			   will be allowed to send. -			   It will occur, when: -			   (1-W)*true_avgidle + W*delay = 0, i.e. -			   idle = (1/W - 1)*(-true_avgidle) -			   or -			   idle = (1 - W)*(-cl->avgidle); +			 * will be allowed to send. +			 * It will occur, when: +			 * (1-W)*true_avgidle + W*delay = 0, i.e. +			 * idle = (1/W - 1)*(-true_avgidle) +			 * or +			 * idle = (1 - W)*(-cl->avgidle);  			 */  			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);  			/* -			   That is not all. -			   To maintain the rate allocated to the class, -			   we add to undertime virtual clock, -			   necessary to complete transmitted packet. -			   (len/phys_bandwidth has been already passed -			   to the moment of cbq_update) +			 * That is not all. +			 * To maintain the rate allocated to the class, +			 * we add to undertime virtual clock, +			 * necessary to complete transmitted packet. +			 * (len/phys_bandwidth has been already passed +			 * to the moment of cbq_update)  			 */  			idle -= L2T(&q->link, len); @@ -780,7 +777,7 @@ cbq_update(struct cbq_sched_data *q)  	cbq_update_toplevel(q, this, q->tx_borrowed);  } -static __inline__ struct cbq_class * +static inline struct cbq_class *  cbq_under_limit(struct cbq_class *cl)  {  	struct cbq_sched_data *q = qdisc_priv(cl->qdisc); @@ -796,16 +793,17 @@ cbq_under_limit(struct cbq_class *cl)  	do {  		/* It is very suspicious place. Now overlimit -		   action is generated for not bounded classes -		   only if link is completely congested. -		   Though it is in agree with ancestor-only paradigm, -		   it looks very stupid. Particularly, -		   it means that this chunk of code will either -		   never be called or result in strong amplification -		   of burstiness. Dangerous, silly, and, however, -		   no another solution exists. +		 * action is generated for not bounded classes +		 * only if link is completely congested. +		 * Though it is in agree with ancestor-only paradigm, +		 * it looks very stupid. Particularly, +		 * it means that this chunk of code will either +		 * never be called or result in strong amplification +		 * of burstiness. Dangerous, silly, and, however, +		 * no another solution exists.  		 */ -		if ((cl = cl->borrow) == NULL) { +		cl = cl->borrow; +		if (!cl) {  			this_cl->qstats.overlimits++;  			this_cl->overlimit(this_cl);  			return NULL; @@ -818,7 +816,7 @@ cbq_under_limit(struct cbq_class *cl)  	return cl;  } -static __inline__ struct sk_buff * +static inline struct sk_buff *  cbq_dequeue_prio(struct Qdisc *sch, int prio)  {  	struct cbq_sched_data *q = qdisc_priv(sch); @@ -842,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)  			if (cl->deficit <= 0) {  				/* Class exhausted its allotment per -				   this round. Switch to the next one. +				 * this round. Switch to the next one.  				 */  				deficit = 1;  				cl->deficit += cl->quantum; @@ -852,8 +850,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)  			skb = cl->q->dequeue(cl->q);  			/* Class did not give us any skb :-( -			   It could occur even if cl->q->q.qlen != 0 -			   f.e. if cl->q == "tbf" +			 * It could occur even if cl->q->q.qlen != 0 +			 * f.e. 
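
The reflowed comment above states the CBQ average-idle recurrence true_avgidle := (1-W)*true_avgidle + W*idle with W = 2^-ewma_log, and that cl->avgidle stores the scaled value true_avgidle/W, which turns the update into the integer form avgidle += idle - (avgidle >> ewma_log). The short numeric check below compares the integer recurrence against the floating-point one; the idle samples are made up for the example.

/* Numeric check of the scaling trick in cbq_update():
 *   true = (1 - W) * true + W * idle        (float form)
 *   avgidle += idle - (avgidle >> ewma_log)  (scaled integer form)
 */
#include <stdio.h>

int main(void)
{
	const int ewma_log = 5;			/* W = 1/32 */
	const double W = 1.0 / (1 << ewma_log);
	long avgidle = 0;			/* scaled: true_avgidle / W */
	double true_avgidle = 0.0;
	long idle_samples[] = { 100, 40, -20, 250, 0, 75 };
	size_t i;

	for (i = 0; i < sizeof(idle_samples) / sizeof(idle_samples[0]); i++) {
		long idle = idle_samples[i];

		avgidle += idle - (avgidle >> ewma_log);
		true_avgidle = (1.0 - W) * true_avgidle + W * idle;

		printf("idle=%4ld  scaled=%6ld  scaled*W=%8.3f  float=%8.3f\n",
		       idle, avgidle, avgidle * W, true_avgidle);
	}
	return 0;
}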
if cl->q == "tbf"  			 */  			if (skb == NULL)  				goto skip_class; @@ -882,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)  skip_class:  			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {  				/* Class is empty or penalized. -				   Unlink it from active chain. +				 * Unlink it from active chain.  				 */  				cl_prev->next_alive = cl->next_alive;  				cl->next_alive = NULL; @@ -921,14 +919,14 @@ next_class:  	return NULL;  } -static __inline__ struct sk_buff * +static inline struct sk_buff *  cbq_dequeue_1(struct Qdisc *sch)  {  	struct cbq_sched_data *q = qdisc_priv(sch);  	struct sk_buff *skb; -	unsigned activemask; +	unsigned int activemask; -	activemask = q->activemask&0xFF; +	activemask = q->activemask & 0xFF;  	while (activemask) {  		int prio = ffz(~activemask);  		activemask &= ~(1<<prio); @@ -953,19 +951,22 @@ cbq_dequeue(struct Qdisc *sch)  	if (q->tx_class) {  		psched_tdiff_t incr2;  		/* Time integrator. We calculate EOS time -		   by adding expected packet transmission time. -		   If real time is greater, we warp artificial clock, -		   so that: - -		   cbq_time = max(real_time, work); +		 * by adding expected packet transmission time. +		 * If real time is greater, we warp artificial clock, +		 * so that: +		 * +		 * cbq_time = max(real_time, work);  		 */  		incr2 = L2T(&q->link, q->tx_len);  		q->now += incr2;  		cbq_update(q);  		if ((incr -= incr2) < 0)  			incr = 0; +		q->now += incr; +	} else { +		if (now > q->now) +			q->now = now;  	} -	q->now += incr;  	q->now_rt = now;  	for (;;) { @@ -973,28 +974,29 @@ cbq_dequeue(struct Qdisc *sch)  		skb = cbq_dequeue_1(sch);  		if (skb) { +			qdisc_bstats_update(sch, skb);  			sch->q.qlen--; -			sch->flags &= ~TCQ_F_THROTTLED; +			qdisc_unthrottled(sch);  			return skb;  		}  		/* All the classes are overlimit. - -		   It is possible, if: - -		   1. Scheduler is empty. -		   2. Toplevel cutoff inhibited borrowing. -		   3. Root class is overlimit. - -		   Reset 2d and 3d conditions and retry. - -		   Note, that NS and cbq-2.0 are buggy, peeking -		   an arbitrary class is appropriate for ancestor-only -		   sharing, but not for toplevel algorithm. - -		   Our version is better, but slower, because it requires -		   two passes, but it is unavoidable with top-level sharing. -		*/ +		 * +		 * It is possible, if: +		 * +		 * 1. Scheduler is empty. +		 * 2. Toplevel cutoff inhibited borrowing. +		 * 3. Root class is overlimit. +		 * +		 * Reset 2d and 3d conditions and retry. +		 * +		 * Note, that NS and cbq-2.0 are buggy, peeking +		 * an arbitrary class is appropriate for ancestor-only +		 * sharing, but not for toplevel algorithm. +		 * +		 * Our version is better, but slower, because it requires +		 * two passes, but it is unavoidable with top-level sharing. +		 */  		if (q->toplevel == TC_CBQ_MAXLEVEL &&  		    q->link.undertime == PSCHED_PASTPERFECT) @@ -1005,7 +1007,8 @@ cbq_dequeue(struct Qdisc *sch)  	}  	/* No packets in scheduler or nobody wants to give them to us :-( -	   Sigh... start watchdog timer in the last case. */ +	 * Sigh... start watchdog timer in the last case. 
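
cbq_dequeue_1() above picks the next backlogged priority band with prio = ffz(~activemask) and clears it with activemask &= ~(1<<prio); ffz of the complement is simply the index of the lowest set bit. The tiny stand-alone version below re-implements ffz() with a portable loop, since it is a kernel bitop helper.

/* The priority scan in cbq_dequeue_1(): ffz(~mask) == lowest set bit of mask. */
#include <stdio.h>

static int ffz(unsigned long word)		/* find first zero bit */
{
	int bit = 0;

	while (word & 1) {
		word >>= 1;
		bit++;
	}
	return bit;
}

int main(void)
{
	unsigned int activemask = (1 << 1) | (1 << 3) | (1 << 6);

	while (activemask) {
		int prio = ffz(~activemask);	/* lowest active priority */

		activemask &= ~(1u << prio);
		printf("serve priority band %d\n", prio);	/* 1, 3, 6 */
	}
	return 0;
}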
+	 */  	if (sch->q.qlen) {  		sch->qstats.overlimits++; @@ -1027,36 +1030,38 @@ static void cbq_adjust_levels(struct cbq_class *this)  		int level = 0;  		struct cbq_class *cl; -		if ((cl = this->children) != NULL) { +		cl = this->children; +		if (cl) {  			do {  				if (cl->level > level)  					level = cl->level;  			} while ((cl = cl->sibling) != this->children);  		} -		this->level = level+1; +		this->level = level + 1;  	} while ((this = this->tparent) != NULL);  }  static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)  {  	struct cbq_class *cl; -	struct hlist_node *n;  	unsigned int h;  	if (q->quanta[prio] == 0)  		return;  	for (h = 0; h < q->clhash.hashsize; h++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { +		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {  			/* BUGGGG... Beware! This expression suffer of -			   arithmetic overflows! +			 * arithmetic overflows!  			 */  			if (cl->priority == prio) {  				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/  					q->quanta[prio];  			} -			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { -				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum); +			if (cl->quantum <= 0 || +			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { +				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", +					cl->common.classid, cl->quantum);  				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;  			}  		} @@ -1067,31 +1072,30 @@ static void cbq_sync_defmap(struct cbq_class *cl)  {  	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);  	struct cbq_class *split = cl->split; -	unsigned h; +	unsigned int h;  	int i;  	if (split == NULL)  		return; -	for (i=0; i<=TC_PRIO_MAX; i++) { -		if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) +	for (i = 0; i <= TC_PRIO_MAX; i++) { +		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))  			split->defaults[i] = NULL;  	} -	for (i=0; i<=TC_PRIO_MAX; i++) { +	for (i = 0; i <= TC_PRIO_MAX; i++) {  		int level = split->level;  		if (split->defaults[i])  			continue;  		for (h = 0; h < q->clhash.hashsize; h++) { -			struct hlist_node *n;  			struct cbq_class *c; -			hlist_for_each_entry(c, n, &q->clhash.hash[h], +			hlist_for_each_entry(c, &q->clhash.hash[h],  					     common.hnode) {  				if (c->split == split && c->level < level && -				    c->defmap&(1<<i)) { +				    c->defmap & (1<<i)) {  					split->defaults[i] = c;  					level = c->level;  				} @@ -1105,7 +1109,8 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma  	struct cbq_class *split = NULL;  	if (splitid == 0) { -		if ((split = cl->split) == NULL) +		split = cl->split; +		if (!split)  			return;  		splitid = split->common.classid;  	} @@ -1123,9 +1128,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma  		cl->defmap = 0;  		cbq_sync_defmap(cl);  		cl->split = split; -		cl->defmap = def&mask; +		cl->defmap = def & mask;  	} else -		cl->defmap = (cl->defmap&~mask)|(def&mask); +		cl->defmap = (cl->defmap & ~mask) | (def & mask);  	cbq_sync_defmap(cl);  } @@ -1138,7 +1143,7 @@ static void cbq_unlink_class(struct cbq_class *this)  	qdisc_class_hash_remove(&q->clhash, &this->common);  	if (this->tparent) { -		clp=&this->sibling; +		clp = &this->sibling;  		cl = *clp;  		do {  			if (cl == this) { @@ -1177,7 +1182,7 @@ static void cbq_link_class(struct cbq_class *this)  	}  } -static unsigned int cbq_drop(struct Qdisc* sch) +static unsigned int 
cbq_drop(struct Qdisc *sch)  {  	struct cbq_sched_data *q = qdisc_priv(sch);  	struct cbq_class *cl, *cl_head; @@ -1185,7 +1190,8 @@ static unsigned int cbq_drop(struct Qdisc* sch)  	unsigned int len;  	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { -		if ((cl_head = q->active[prio]) == NULL) +		cl_head = q->active[prio]; +		if (!cl_head)  			continue;  		cl = cl_head; @@ -1202,13 +1208,12 @@ static unsigned int cbq_drop(struct Qdisc* sch)  }  static void -cbq_reset(struct Qdisc* sch) +cbq_reset(struct Qdisc *sch)  {  	struct cbq_sched_data *q = qdisc_priv(sch);  	struct cbq_class *cl; -	struct hlist_node *n;  	int prio; -	unsigned h; +	unsigned int h;  	q->activemask = 0;  	q->pmask = 0; @@ -1224,7 +1229,7 @@ cbq_reset(struct Qdisc* sch)  		q->active[prio] = NULL;  	for (h = 0; h < q->clhash.hashsize; h++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { +		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {  			qdisc_reset(cl->q);  			cl->next_alive = NULL; @@ -1240,21 +1245,21 @@ cbq_reset(struct Qdisc* sch)  static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)  { -	if (lss->change&TCF_CBQ_LSS_FLAGS) { -		cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; -		cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; +	if (lss->change & TCF_CBQ_LSS_FLAGS) { +		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; +		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;  	} -	if (lss->change&TCF_CBQ_LSS_EWMA) +	if (lss->change & TCF_CBQ_LSS_EWMA)  		cl->ewma_log = lss->ewma_log; -	if (lss->change&TCF_CBQ_LSS_AVPKT) +	if (lss->change & TCF_CBQ_LSS_AVPKT)  		cl->avpkt = lss->avpkt; -	if (lss->change&TCF_CBQ_LSS_MINIDLE) +	if (lss->change & TCF_CBQ_LSS_MINIDLE)  		cl->minidle = -(long)lss->minidle; -	if (lss->change&TCF_CBQ_LSS_MAXIDLE) { +	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {  		cl->maxidle = lss->maxidle;  		cl->avgidle = lss->maxidle;  	} -	if (lss->change&TCF_CBQ_LSS_OFFTIME) +	if (lss->change & TCF_CBQ_LSS_OFFTIME)  		cl->offtime = lss->offtime;  	return 0;  } @@ -1282,10 +1287,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)  	if (wrr->weight)  		cl->weight = wrr->weight;  	if (wrr->priority) { -		cl->priority = wrr->priority-1; +		cl->priority = wrr->priority - 1;  		cl->cpriority = cl->priority;  		if (cl->priority >= cl->priority2) -			cl->priority2 = TC_CBQ_MAXPRIO-1; +			cl->priority2 = TC_CBQ_MAXPRIO - 1;  	}  	cbq_addprio(q, cl); @@ -1302,10 +1307,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)  		cl->overlimit = cbq_ovl_delay;  		break;  	case TC_CBQ_OVL_LOWPRIO: -		if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || -		    ovl->priority2-1 <= cl->priority) +		if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || +		    ovl->priority2 - 1 <= cl->priority)  			return -EINVAL; -		cl->priority2 = ovl->priority2-1; +		cl->priority2 = ovl->priority2 - 1;  		cl->overlimit = cbq_ovl_lowprio;  		break;  	case TC_CBQ_OVL_DROP: @@ -1384,9 +1389,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)  	if (!q->link.q)  		q->link.q = &noop_qdisc; -	q->link.priority = TC_CBQ_MAXPRIO-1; -	q->link.priority2 = TC_CBQ_MAXPRIO-1; -	q->link.cpriority = TC_CBQ_MAXPRIO-1; +	q->link.priority = TC_CBQ_MAXPRIO - 1; +	q->link.priority2 = TC_CBQ_MAXPRIO - 1; +	q->link.cpriority = TC_CBQ_MAXPRIO - 1;  	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;  	q->link.overlimit = cbq_ovl_classic;  	q->link.allot = psched_mtu(qdisc_dev(sch)); @@ -1417,11 
+1422,12 @@ put_rtab:  	return err;  } -static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)  {  	unsigned char *b = skb_tail_pointer(skb); -	NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); +	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -1429,7 +1435,7 @@ nla_put_failure:  	return -1;  } -static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)  {  	unsigned char *b = skb_tail_pointer(skb);  	struct tc_cbq_lssopt opt; @@ -1446,7 +1452,8 @@ static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)  	opt.minidle = (u32)(-cl->minidle);  	opt.offtime = cl->offtime;  	opt.change = ~0; -	NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); +	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -1454,17 +1461,19 @@ nla_put_failure:  	return -1;  } -static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)  {  	unsigned char *b = skb_tail_pointer(skb);  	struct tc_cbq_wrropt opt; +	memset(&opt, 0, sizeof(opt));  	opt.flags = 0;  	opt.allot = cl->allot; -	opt.priority = cl->priority+1; -	opt.cpriority = cl->cpriority+1; +	opt.priority = cl->priority + 1; +	opt.cpriority = cl->cpriority + 1;  	opt.weight = cl->weight; -	NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); +	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -1472,16 +1481,17 @@ nla_put_failure:  	return -1;  } -static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)  {  	unsigned char *b = skb_tail_pointer(skb);  	struct tc_cbq_ovl opt;  	opt.strategy = cl->ovl_strategy; -	opt.priority2 = cl->priority2+1; +	opt.priority2 = cl->priority2 + 1;  	opt.pad = 0;  	opt.penalty = cl->penalty; -	NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); +	if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -1489,7 +1499,7 @@ nla_put_failure:  	return -1;  } -static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)  {  	unsigned char *b = skb_tail_pointer(skb);  	struct tc_cbq_fopt opt; @@ -1498,7 +1508,8 @@ static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)  		opt.split = cl->split ? 
cl->split->common.classid : 0;  		opt.defmap = cl->defmap;  		opt.defchange = ~0; -		NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); +		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) +			goto nla_put_failure;  	}  	return skb->len; @@ -1508,7 +1519,7 @@ nla_put_failure:  }  #ifdef CONFIG_NET_CLS_ACT -static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)  {  	unsigned char *b = skb_tail_pointer(skb);  	struct tc_cbq_police opt; @@ -1517,7 +1528,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)  		opt.police = cl->police;  		opt.__res1 = 0;  		opt.__res2 = 0; -		NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); +		if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) +			goto nla_put_failure;  	}  	return skb->len; @@ -1551,8 +1563,7 @@ static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)  		goto nla_put_failure;  	if (cbq_dump_attr(skb, &q->link) < 0)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); @@ -1572,7 +1583,7 @@ static int  cbq_dump_class(struct Qdisc *sch, unsigned long arg,  	       struct sk_buff *skb, struct tcmsg *tcm)  { -	struct cbq_class *cl = (struct cbq_class*)arg; +	struct cbq_class *cl = (struct cbq_class *)arg;  	struct nlattr *nest;  	if (cl->tparent) @@ -1587,8 +1598,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg,  		goto nla_put_failure;  	if (cbq_dump_attr(skb, cl) < 0)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); @@ -1600,7 +1610,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,  	struct gnet_dump *d)  {  	struct cbq_sched_data *q = qdisc_priv(sch); -	struct cbq_class *cl = (struct cbq_class*)arg; +	struct cbq_class *cl = (struct cbq_class *)arg;  	cl->qstats.qlen = cl->q->q.qlen;  	cl->xstats.avgidle = cl->avgidle; @@ -1620,7 +1630,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,  static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,  		     struct Qdisc **old)  { -	struct cbq_class *cl = (struct cbq_class*)arg; +	struct cbq_class *cl = (struct cbq_class *)arg;  	if (new == NULL) {  		new = qdisc_create_dflt(sch->dev_queue, @@ -1643,10 +1653,9 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,  	return 0;  } -static struct Qdisc * -cbq_leaf(struct Qdisc *sch, unsigned long arg) +static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)  { -	struct cbq_class *cl = (struct cbq_class*)arg; +	struct cbq_class *cl = (struct cbq_class *)arg;  	return cl->q;  } @@ -1685,13 +1694,12 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)  		kfree(cl);  } -static void -cbq_destroy(struct Qdisc* sch) +static void cbq_destroy(struct Qdisc *sch)  {  	struct cbq_sched_data *q = qdisc_priv(sch); -	struct hlist_node *n, *next; +	struct hlist_node *next;  	struct cbq_class *cl; -	unsigned h; +	unsigned int h;  #ifdef CONFIG_NET_CLS_ACT  	q->rx_class = NULL; @@ -1702,11 +1710,11 @@ cbq_destroy(struct Qdisc* sch)  	 * be bound to classes which have been destroyed already. 
--TGR '04  	 */  	for (h = 0; h < q->clhash.hashsize; h++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) +		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)  			tcf_destroy_chain(&cl->filter_list);  	}  	for (h = 0; h < q->clhash.hashsize; h++) { -		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h], +		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],  					  common.hnode)  			cbq_destroy_class(sch, cl);  	} @@ -1715,7 +1723,7 @@ cbq_destroy(struct Qdisc* sch)  static void cbq_put(struct Qdisc *sch, unsigned long arg)  { -	struct cbq_class *cl = (struct cbq_class*)arg; +	struct cbq_class *cl = (struct cbq_class *)arg;  	if (--cl->refcnt == 0) {  #ifdef CONFIG_NET_CLS_ACT @@ -1738,7 +1746,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t  {  	int err;  	struct cbq_sched_data *q = qdisc_priv(sch); -	struct cbq_class *cl = (struct cbq_class*)*arg; +	struct cbq_class *cl = (struct cbq_class *)*arg;  	struct nlattr *opt = tca[TCA_OPTIONS];  	struct nlattr *tb[TCA_CBQ_MAX + 1];  	struct cbq_class *parent; @@ -1773,8 +1781,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t  						    qdisc_root_sleeping_lock(sch),  						    tca[TCA_RATE]);  			if (err) { -				if (rtab) -					qdisc_put_rtab(rtab); +				qdisc_put_rtab(rtab);  				return err;  			}  		} @@ -1830,13 +1837,14 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t  	if (classid) {  		err = -EINVAL; -		if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) +		if (TC_H_MAJ(classid ^ sch->handle) || +		    cbq_class_lookup(q, classid))  			goto failure;  	} else {  		int i; -		classid = TC_H_MAKE(sch->handle,0x8000); +		classid = TC_H_MAKE(sch->handle, 0x8000); -		for (i=0; i<0x8000; i++) { +		for (i = 0; i < 0x8000; i++) {  			if (++q->hgenerator >= 0x8000)  				q->hgenerator = 1;  			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) @@ -1893,11 +1901,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t  	cl->minidle = -0x7FFFFFFF;  	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));  	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); -	if (cl->ewma_log==0) +	if (cl->ewma_log == 0)  		cl->ewma_log = q->link.ewma_log; -	if (cl->maxidle==0) +	if (cl->maxidle == 0)  		cl->maxidle = q->link.maxidle; -	if (cl->avpkt==0) +	if (cl->avpkt == 0)  		cl->avpkt = q->link.avpkt;  	cl->overlimit = cbq_ovl_classic;  	if (tb[TCA_CBQ_OVL_STRATEGY]) @@ -1923,7 +1931,7 @@ failure:  static int cbq_delete(struct Qdisc *sch, unsigned long arg)  {  	struct cbq_sched_data *q = qdisc_priv(sch); -	struct cbq_class *cl = (struct cbq_class*)arg; +	struct cbq_class *cl = (struct cbq_class *)arg;  	unsigned int qlen;  	if (cl->filters || cl->children || cl == &q->link) @@ -1981,7 +1989,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,  				     u32 classid)  {  	struct cbq_sched_data *q = qdisc_priv(sch); -	struct cbq_class *p = (struct cbq_class*)parent; +	struct cbq_class *p = (struct cbq_class *)parent;  	struct cbq_class *cl = cbq_class_lookup(q, classid);  	if (cl) { @@ -1995,7 +2003,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,  static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)  { -	struct cbq_class *cl = (struct cbq_class*)arg; +	struct cbq_class *cl = (struct cbq_class *)arg;  	cl->filters--;  } @@ -2004,14 +2012,13 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)  
{  	struct cbq_sched_data *q = qdisc_priv(sch);  	struct cbq_class *cl; -	struct hlist_node *n; -	unsigned h; +	unsigned int h;  	if (arg->stop)  		return;  	for (h = 0; h < q->clhash.hashsize; h++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { +		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {  			if (arg->count < arg->skip) {  				arg->count++;  				continue; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c new file mode 100644 index 00000000000..ed30e436128 --- /dev/null +++ b/net/sched/sch_choke.c @@ -0,0 +1,632 @@ +/* + * net/sched/sch_choke.c	CHOKE scheduler + * + * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/red.h> +#include <net/flow_keys.h> + +/* +   CHOKe stateless AQM for fair bandwidth allocation +   ================================================= + +   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for +   unresponsive flows) is a variant of RED that penalizes misbehaving flows but +   maintains no flow state. The difference from RED is an additional step +   during the enqueuing process. If average queue size is over the +   low threshold (qmin), a packet is chosen at random from the queue. +   If both the new and chosen packet are from the same flow, both +   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it +   needs to access packets in queue randomly. It has a minimal class +   interface to allow overriding the builtin flow classifier with +   filters. + +   Source: +   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless +   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation", +   IEEE INFOCOM, 2000. + +   A. Tang, J. Wang, S. 
Low, "Understanding CHOKe: Throughput and Spatial +   Characteristics", IEEE/ACM Transactions on Networking, 2004 + + */ + +/* Upper bound on size of sk_buff table (packets) */ +#define CHOKE_MAX_QUEUE	(128*1024 - 1) + +struct choke_sched_data { +/* Parameters */ +	u32		 limit; +	unsigned char	 flags; + +	struct red_parms parms; + +/* Variables */ +	struct red_vars  vars; +	struct tcf_proto *filter_list; +	struct { +		u32	prob_drop;	/* Early probability drops */ +		u32	prob_mark;	/* Early probability marks */ +		u32	forced_drop;	/* Forced drops, qavg > max_thresh */ +		u32	forced_mark;	/* Forced marks, qavg > max_thresh */ +		u32	pdrop;          /* Drops due to queue limits */ +		u32	other;          /* Drops due to drop() calls */ +		u32	matched;	/* Drops to flow match */ +	} stats; + +	unsigned int	 head; +	unsigned int	 tail; + +	unsigned int	 tab_mask; /* size - 1 */ + +	struct sk_buff **tab; +}; + +/* number of elements in queue including holes */ +static unsigned int choke_len(const struct choke_sched_data *q) +{ +	return (q->tail - q->head) & q->tab_mask; +} + +/* Is ECN parameter configured */ +static int use_ecn(const struct choke_sched_data *q) +{ +	return q->flags & TC_RED_ECN; +} + +/* Should packets over max just be dropped (versus marked) */ +static int use_harddrop(const struct choke_sched_data *q) +{ +	return q->flags & TC_RED_HARDDROP; +} + +/* Move head pointer forward to skip over holes */ +static void choke_zap_head_holes(struct choke_sched_data *q) +{ +	do { +		q->head = (q->head + 1) & q->tab_mask; +		if (q->head == q->tail) +			break; +	} while (q->tab[q->head] == NULL); +} + +/* Move tail pointer backwards to reuse holes */ +static void choke_zap_tail_holes(struct choke_sched_data *q) +{ +	do { +		q->tail = (q->tail - 1) & q->tab_mask; +		if (q->head == q->tail) +			break; +	} while (q->tab[q->tail] == NULL); +} + +/* Drop packet from queue array by creating a "hole" */ +static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb = q->tab[idx]; + +	q->tab[idx] = NULL; + +	if (idx == q->head) +		choke_zap_head_holes(q); +	if (idx == q->tail) +		choke_zap_tail_holes(q); + +	sch->qstats.backlog -= qdisc_pkt_len(skb); +	qdisc_drop(skb, sch); +	qdisc_tree_decrease_qlen(sch, 1); +	--sch->q.qlen; +} + +struct choke_skb_cb { +	u16			classid; +	u8			keys_valid; +	struct flow_keys	keys; +}; + +static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) +{ +	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); +	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static inline void choke_set_classid(struct sk_buff *skb, u16 classid) +{ +	choke_skb_cb(skb)->classid = classid; +} + +static u16 choke_get_classid(const struct sk_buff *skb) +{ +	return choke_skb_cb(skb)->classid; +} + +/* + * Compare flow of two packets + *  Returns true only if source and destination address and port match. 
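An aside on why the unkeyed random match described at the top of this file is enough to discipline heavy flows: a packet arriving from a flow that currently holds a fraction s of the queued packets is matched by the random draw with probability roughly s, so the "drop both" step charges a flow roughly in proportion to the square of its backlog share, while a flow holding 1% of the queue is almost never hit. A back-of-the-envelope illustration (numbers invented, not taken from the CHOKe paper):

#include <stdio.h>

/* Illustration only: the probability that an arrival from a flow with
 * queue share s is paired with one of its own packets is ~s; if the flow
 * also sends ~s of the arriving traffic, ~s*s of all arrivals end in a
 * double drop charged to that flow.
 */
int main(void)
{
	static const double share[] = { 0.01, 0.05, 0.20, 0.50 };
	unsigned int i;

	for (i = 0; i < sizeof(share) / sizeof(share[0]); i++)
		printf("share %.2f: match prob %.2f, double-dropped arrivals %.4f\n",
		       share[i], share[i], share[i] * share[i]);
	return 0;
}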
+ *          false for special cases + */ +static bool choke_match_flow(struct sk_buff *skb1, +			     struct sk_buff *skb2) +{ +	if (skb1->protocol != skb2->protocol) +		return false; + +	if (!choke_skb_cb(skb1)->keys_valid) { +		choke_skb_cb(skb1)->keys_valid = 1; +		skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys); +	} + +	if (!choke_skb_cb(skb2)->keys_valid) { +		choke_skb_cb(skb2)->keys_valid = 1; +		skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys); +	} + +	return !memcmp(&choke_skb_cb(skb1)->keys, +		       &choke_skb_cb(skb2)->keys, +		       sizeof(struct flow_keys)); +} + +/* + * Classify flow using either: + *  1. pre-existing classification result in skb + *  2. fast internal classification + *  3. use TC filter based classification + */ +static bool choke_classify(struct sk_buff *skb, +			   struct Qdisc *sch, int *qerr) + +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	struct tcf_result res; +	int result; + +	result = tc_classify(skb, q->filter_list, &res); +	if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT +		switch (result) { +		case TC_ACT_STOLEN: +		case TC_ACT_QUEUED: +			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; +		case TC_ACT_SHOT: +			return false; +		} +#endif +		choke_set_classid(skb, TC_H_MIN(res.classid)); +		return true; +	} + +	return false; +} + +/* + * Select a packet at random from queue + * HACK: since queue can have holes from previous deletion; retry several + *   times to find a random skb but then just give up and return the head + * Will return NULL if queue is empty (q->head == q->tail) + */ +static struct sk_buff *choke_peek_random(const struct choke_sched_data *q, +					 unsigned int *pidx) +{ +	struct sk_buff *skb; +	int retrys = 3; + +	do { +		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask; +		skb = q->tab[*pidx]; +		if (skb) +			return skb; +	} while (--retrys > 0); + +	return q->tab[*pidx = q->head]; +} + +/* + * Compare new packet with random packet in queue + * returns true if matched and sets *pidx + */ +static bool choke_match_random(const struct choke_sched_data *q, +			       struct sk_buff *nskb, +			       unsigned int *pidx) +{ +	struct sk_buff *oskb; + +	if (q->head == q->tail) +		return false; + +	oskb = choke_peek_random(q, pidx); +	if (q->filter_list) +		return choke_get_classid(nskb) == choke_get_classid(oskb); + +	return choke_match_flow(oskb, nskb); +} + +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	const struct red_parms *p = &q->parms; +	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + +	if (q->filter_list) { +		/* If using external classifiers, get result and record it. */ +		if (!choke_classify(skb, sch, &ret)) +			goto other_drop;	/* Packet was eaten by filter */ +	} + +	choke_skb_cb(skb)->keys_valid = 0; +	/* Compute average queue usage (see RED) */ +	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen); +	if (red_is_idling(&q->vars)) +		red_end_of_idle_period(&q->vars); + +	/* Is queue small? 
*/ +	if (q->vars.qavg <= p->qth_min) +		q->vars.qcount = -1; +	else { +		unsigned int idx; + +		/* Draw a packet at random from queue and compare flow */ +		if (choke_match_random(q, skb, &idx)) { +			q->stats.matched++; +			choke_drop_by_idx(sch, idx); +			goto congestion_drop; +		} + +		/* Queue is large, always mark/drop */ +		if (q->vars.qavg > p->qth_max) { +			q->vars.qcount = -1; + +			sch->qstats.overlimits++; +			if (use_harddrop(q) || !use_ecn(q) || +			    !INET_ECN_set_ce(skb)) { +				q->stats.forced_drop++; +				goto congestion_drop; +			} + +			q->stats.forced_mark++; +		} else if (++q->vars.qcount) { +			if (red_mark_probability(p, &q->vars, q->vars.qavg)) { +				q->vars.qcount = 0; +				q->vars.qR = red_random(p); + +				sch->qstats.overlimits++; +				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { +					q->stats.prob_drop++; +					goto congestion_drop; +				} + +				q->stats.prob_mark++; +			} +		} else +			q->vars.qR = red_random(p); +	} + +	/* Admit new packet */ +	if (sch->q.qlen < q->limit) { +		q->tab[q->tail] = skb; +		q->tail = (q->tail + 1) & q->tab_mask; +		++sch->q.qlen; +		sch->qstats.backlog += qdisc_pkt_len(skb); +		return NET_XMIT_SUCCESS; +	} + +	q->stats.pdrop++; +	return qdisc_drop(skb, sch); + +congestion_drop: +	qdisc_drop(skb, sch); +	return NET_XMIT_CN; + +other_drop: +	if (ret & __NET_XMIT_BYPASS) +		sch->qstats.drops++; +	kfree_skb(skb); +	return ret; +} + +static struct sk_buff *choke_dequeue(struct Qdisc *sch) +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb; + +	if (q->head == q->tail) { +		if (!red_is_idling(&q->vars)) +			red_start_of_idle_period(&q->vars); +		return NULL; +	} + +	skb = q->tab[q->head]; +	q->tab[q->head] = NULL; +	choke_zap_head_holes(q); +	--sch->q.qlen; +	sch->qstats.backlog -= qdisc_pkt_len(skb); +	qdisc_bstats_update(sch, skb); + +	return skb; +} + +static unsigned int choke_drop(struct Qdisc *sch) +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	unsigned int len; + +	len = qdisc_queue_drop(sch); +	if (len > 0) +		q->stats.other++; +	else { +		if (!red_is_idling(&q->vars)) +			red_start_of_idle_period(&q->vars); +	} + +	return len; +} + +static void choke_reset(struct Qdisc *sch) +{ +	struct choke_sched_data *q = qdisc_priv(sch); + +	red_restart(&q->vars); +} + +static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { +	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) }, +	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE }, +	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 }, +}; + + +static void choke_free(void *addr) +{ +	kvfree(addr); +} + +static int choke_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_CHOKE_MAX + 1]; +	const struct tc_red_qopt *ctl; +	int err; +	struct sk_buff **old = NULL; +	unsigned int mask; +	u32 max_P; + +	if (opt == NULL) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy); +	if (err < 0) +		return err; + +	if (tb[TCA_CHOKE_PARMS] == NULL || +	    tb[TCA_CHOKE_STAB] == NULL) +		return -EINVAL; + +	max_P = tb[TCA_CHOKE_MAX_P] ? 
nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; + +	ctl = nla_data(tb[TCA_CHOKE_PARMS]); + +	if (ctl->limit > CHOKE_MAX_QUEUE) +		return -EINVAL; + +	mask = roundup_pow_of_two(ctl->limit + 1) - 1; +	if (mask != q->tab_mask) { +		struct sk_buff **ntab; + +		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), +			       GFP_KERNEL | __GFP_NOWARN); +		if (!ntab) +			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); +		if (!ntab) +			return -ENOMEM; + +		sch_tree_lock(sch); +		old = q->tab; +		if (old) { +			unsigned int oqlen = sch->q.qlen, tail = 0; + +			while (q->head != q->tail) { +				struct sk_buff *skb = q->tab[q->head]; + +				q->head = (q->head + 1) & q->tab_mask; +				if (!skb) +					continue; +				if (tail < mask) { +					ntab[tail++] = skb; +					continue; +				} +				sch->qstats.backlog -= qdisc_pkt_len(skb); +				--sch->q.qlen; +				qdisc_drop(skb, sch); +			} +			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen); +			q->head = 0; +			q->tail = tail; +		} + +		q->tab_mask = mask; +		q->tab = ntab; +	} else +		sch_tree_lock(sch); + +	q->flags = ctl->flags; +	q->limit = ctl->limit; + +	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, +		      ctl->Plog, ctl->Scell_log, +		      nla_data(tb[TCA_CHOKE_STAB]), +		      max_P); +	red_set_vars(&q->vars); + +	if (q->head == q->tail) +		red_end_of_idle_period(&q->vars); + +	sch_tree_unlock(sch); +	choke_free(old); +	return 0; +} + +static int choke_init(struct Qdisc *sch, struct nlattr *opt) +{ +	return choke_change(sch, opt); +} + +static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts = NULL; +	struct tc_red_qopt opt = { +		.limit		= q->limit, +		.flags		= q->flags, +		.qth_min	= q->parms.qth_min >> q->parms.Wlog, +		.qth_max	= q->parms.qth_max >> q->parms.Wlog, +		.Wlog		= q->parms.Wlog, +		.Plog		= q->parms.Plog, +		.Scell_log	= q->parms.Scell_log, +	}; + +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || +	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) +		goto nla_put_failure; +	return nla_nest_end(skb, opts); + +nla_put_failure: +	nla_nest_cancel(skb, opts); +	return -EMSGSIZE; +} + +static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct choke_sched_data *q = qdisc_priv(sch); +	struct tc_choke_xstats st = { +		.early	= q->stats.prob_drop + q->stats.forced_drop, +		.marked	= q->stats.prob_mark + q->stats.forced_mark, +		.pdrop	= q->stats.pdrop, +		.other	= q->stats.other, +		.matched = q->stats.matched, +	}; + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void choke_destroy(struct Qdisc *sch) +{ +	struct choke_sched_data *q = qdisc_priv(sch); + +	tcf_destroy_chain(&q->filter_list); +	choke_free(q->tab); +} + +static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg) +{ +	return NULL; +} + +static unsigned long choke_get(struct Qdisc *sch, u32 classid) +{ +	return 0; +} + +static void choke_put(struct Qdisc *q, unsigned long cl) +{ +} + +static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent, +				u32 classid) +{ +	return 0; +} + +static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl) +{ +	struct choke_sched_data *q = qdisc_priv(sch); + +	if (cl) +		return NULL; +	return &q->filter_list; +} + +static int choke_dump_class(struct Qdisc *sch, unsigned long cl, +			  struct sk_buff *skb, struct tcmsg *tcm) +{ +	tcm->tcm_handle |= TC_H_MIN(cl); +	return 
0; +} + +static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ +	if (!arg->stop) { +		if (arg->fn(sch, 1, arg) < 0) { +			arg->stop = 1; +			return; +		} +		arg->count++; +	} +} + +static const struct Qdisc_class_ops choke_class_ops = { +	.leaf		=	choke_leaf, +	.get		=	choke_get, +	.put		=	choke_put, +	.tcf_chain	=	choke_find_tcf, +	.bind_tcf	=	choke_bind, +	.unbind_tcf	=	choke_put, +	.dump		=	choke_dump_class, +	.walk		=	choke_walk, +}; + +static struct sk_buff *choke_peek_head(struct Qdisc *sch) +{ +	struct choke_sched_data *q = qdisc_priv(sch); + +	return (q->head != q->tail) ? q->tab[q->head] : NULL; +} + +static struct Qdisc_ops choke_qdisc_ops __read_mostly = { +	.id		=	"choke", +	.priv_size	=	sizeof(struct choke_sched_data), + +	.enqueue	=	choke_enqueue, +	.dequeue	=	choke_dequeue, +	.peek		=	choke_peek_head, +	.drop		=	choke_drop, +	.init		=	choke_init, +	.destroy	=	choke_destroy, +	.reset		=	choke_reset, +	.change		=	choke_change, +	.dump		=	choke_dump, +	.dump_stats	=	choke_dump_stats, +	.owner		=	THIS_MODULE, +}; + +static int __init choke_module_init(void) +{ +	return register_qdisc(&choke_qdisc_ops); +} + +static void __exit choke_module_exit(void) +{ +	unregister_qdisc(&choke_qdisc_ops); +} + +module_init(choke_module_init) +module_exit(choke_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c new file mode 100644 index 00000000000..2f9ab17db85 --- /dev/null +++ b/net/sched/sch_codel.c @@ -0,0 +1,276 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm + * + *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> + *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> + * + *  Implemented on linux by : + *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> + *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + *    notice, this list of conditions, and the following disclaimer, + *    without modification. + * 2. Redistributions in binary form must reproduce the above copyright + *    notice, this list of conditions and the following disclaimer in the + *    documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + *    derived from this software without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/prefetch.h> +#include <net/pkt_sched.h> +#include <net/codel.h> + + +#define DEFAULT_CODEL_LIMIT 1000 + +struct codel_sched_data { +	struct codel_params	params; +	struct codel_vars	vars; +	struct codel_stats	stats; +	u32			drop_overlimit; +}; + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ +	struct sk_buff *skb = __skb_dequeue(&sch->q); + +	prefetch(&skb->end); /* we'll need skb_shinfo() */ +	return skb; +} + +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +{ +	struct codel_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb; + +	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); + +	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, +	 * or HTB crashes. Defer it for next round. +	 */ +	if (q->stats.drop_count && sch->q.qlen) { +		qdisc_tree_decrease_qlen(sch, q->stats.drop_count); +		q->stats.drop_count = 0; +	} +	if (skb) +		qdisc_bstats_update(sch, skb); +	return skb; +} + +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct codel_sched_data *q; + +	if (likely(qdisc_qlen(sch) < sch->limit)) { +		codel_set_enqueue_time(skb); +		return qdisc_enqueue_tail(skb, sch); +	} +	q = qdisc_priv(sch); +	q->drop_overlimit++; +	return qdisc_drop(skb, sch); +} + +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { +	[TCA_CODEL_TARGET]	= { .type = NLA_U32 }, +	[TCA_CODEL_LIMIT]	= { .type = NLA_U32 }, +	[TCA_CODEL_INTERVAL]	= { .type = NLA_U32 }, +	[TCA_CODEL_ECN]		= { .type = NLA_U32 }, +}; + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct codel_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_CODEL_MAX + 1]; +	unsigned int qlen; +	int err; + +	if (!opt) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); +	if (err < 0) +		return err; + +	sch_tree_lock(sch); + +	if (tb[TCA_CODEL_TARGET]) { +		u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); + +		q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; +	} + +	if (tb[TCA_CODEL_INTERVAL]) { +		u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); + +		q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; +	} + +	if (tb[TCA_CODEL_LIMIT]) +		sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + +	if (tb[TCA_CODEL_ECN]) +		q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + +	qlen = sch->q.qlen; +	while (sch->q.qlen > sch->limit) { +		struct sk_buff *skb = __skb_dequeue(&sch->q); + +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		qdisc_drop(skb, sch); +	} +	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + +	sch_tree_unlock(sch); +	return 0; +} + +static int 
codel_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct codel_sched_data *q = qdisc_priv(sch); + +	sch->limit = DEFAULT_CODEL_LIMIT; + +	codel_params_init(&q->params); +	codel_vars_init(&q->vars); +	codel_stats_init(&q->stats); + +	if (opt) { +		int err = codel_change(sch, opt); + +		if (err) +			return err; +	} + +	if (sch->limit >= 1) +		sch->flags |= TCQ_F_CAN_BYPASS; +	else +		sch->flags &= ~TCQ_F_CAN_BYPASS; + +	return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct codel_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; + +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	if (nla_put_u32(skb, TCA_CODEL_TARGET, +			codel_time_to_us(q->params.target)) || +	    nla_put_u32(skb, TCA_CODEL_LIMIT, +			sch->limit) || +	    nla_put_u32(skb, TCA_CODEL_INTERVAL, +			codel_time_to_us(q->params.interval)) || +	    nla_put_u32(skb, TCA_CODEL_ECN, +			q->params.ecn)) +		goto nla_put_failure; + +	return nla_nest_end(skb, opts); + +nla_put_failure: +	nla_nest_cancel(skb, opts); +	return -1; +} + +static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	const struct codel_sched_data *q = qdisc_priv(sch); +	struct tc_codel_xstats st = { +		.maxpacket	= q->stats.maxpacket, +		.count		= q->vars.count, +		.lastcount	= q->vars.lastcount, +		.drop_overlimit = q->drop_overlimit, +		.ldelay		= codel_time_to_us(q->vars.ldelay), +		.dropping	= q->vars.dropping, +		.ecn_mark	= q->stats.ecn_mark, +	}; + +	if (q->vars.dropping) { +		codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); + +		if (delta >= 0) +			st.drop_next = codel_time_to_us(delta); +		else +			st.drop_next = -codel_time_to_us(-delta); +	} + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void codel_reset(struct Qdisc *sch) +{ +	struct codel_sched_data *q = qdisc_priv(sch); + +	qdisc_reset_queue(sch); +	codel_vars_init(&q->vars); +} + +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { +	.id		=	"codel", +	.priv_size	=	sizeof(struct codel_sched_data), + +	.enqueue	=	codel_qdisc_enqueue, +	.dequeue	=	codel_qdisc_dequeue, +	.peek		=	qdisc_peek_dequeued, +	.init		=	codel_init, +	.reset		=	codel_reset, +	.change 	=	codel_change, +	.dump		=	codel_dump, +	.dump_stats	=	codel_dump_stats, +	.owner		=	THIS_MODULE, +}; + +static int __init codel_module_init(void) +{ +	return register_qdisc(&codel_qdisc_ops); +} + +static void __exit codel_module_exit(void) +{ +	unregister_qdisc(&codel_qdisc_ops); +} + +module_init(codel_module_init) +module_exit(codel_module_exit) + +MODULE_DESCRIPTION("Controlled Delay queue discipline"); +MODULE_AUTHOR("Dave Taht"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index aa8b5313f8c..7bbbfe11219 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -25,7 +25,7 @@ struct drr_class {  	struct gnet_stats_basic_packed		bstats;  	struct gnet_stats_queue		qstats; -	struct gnet_stats_rate_est	rate_est; +	struct gnet_stats_rate_est64	rate_est;  	struct list_head		alist;  	struct Qdisc			*qdisc; @@ -260,7 +260,8 @@ static int drr_dump_class(struct Qdisc *sch, unsigned long arg,  	nest = nla_nest_start(skb, TCA_OPTIONS);  	if (nest == NULL)  		goto nla_put_failure; -	NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum); +	if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) +		goto nla_put_failure;  	return nla_nest_end(skb, nest);  nla_put_failure: @@ -292,14 +293,13 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker 
*arg)  {  	struct drr_sched *q = qdisc_priv(sch);  	struct drr_class *cl; -	struct hlist_node *n;  	unsigned int i;  	if (arg->stop)  		return;  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {  			if (arg->count < arg->skip) {  				arg->count++;  				continue; @@ -351,8 +351,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)  {  	struct drr_sched *q = qdisc_priv(sch);  	struct drr_class *cl; -	unsigned int len; -	int err; +	int err = 0;  	cl = drr_classify(skb, sch, &err);  	if (cl == NULL) { @@ -362,7 +361,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)  		return err;  	} -	len = qdisc_pkt_len(skb);  	err = qdisc_enqueue(skb, cl->qdisc);  	if (unlikely(err != NET_XMIT_SUCCESS)) {  		if (net_xmit_drop_count(err)) { @@ -377,11 +375,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)  		cl->deficit = cl->quantum;  	} -	cl->bstats.packets++; -	cl->bstats.bytes += len; -	sch->bstats.packets++; -	sch->bstats.bytes += len; -  	sch->q.qlen++;  	return err;  } @@ -398,8 +391,10 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)  	while (1) {  		cl = list_first_entry(&q->active, struct drr_class, alist);  		skb = cl->qdisc->ops->peek(cl->qdisc); -		if (skb == NULL) +		if (skb == NULL) { +			qdisc_warn_nonwc(__func__, cl->qdisc);  			goto out; +		}  		len = qdisc_pkt_len(skb);  		if (len <= cl->deficit) { @@ -407,6 +402,9 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)  			skb = qdisc_dequeue_peeked(cl->qdisc);  			if (cl->qdisc->q.qlen == 0)  				list_del(&cl->alist); + +			bstats_update(&cl->bstats, skb); +			qdisc_bstats_update(sch, skb);  			sch->q.qlen--;  			return skb;  		} @@ -454,11 +452,10 @@ static void drr_reset_qdisc(struct Qdisc *sch)  {  	struct drr_sched *q = qdisc_priv(sch);  	struct drr_class *cl; -	struct hlist_node *n;  	unsigned int i;  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {  			if (cl->qdisc->q.qlen)  				list_del(&cl->alist);  			qdisc_reset(cl->qdisc); @@ -471,13 +468,13 @@ static void drr_destroy_qdisc(struct Qdisc *sch)  {  	struct drr_sched *q = qdisc_priv(sch);  	struct drr_class *cl; -	struct hlist_node *n, *next; +	struct hlist_node *next;  	unsigned int i;  	tcf_destroy_chain(&q->filter_list);  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], +		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],  					  common.hnode)  			drr_destroy_class(sch, cl);  	} diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 1d295d62bb5..49d6ef338b5 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -47,7 +47,7 @@ struct dsmark_qdisc_data {  static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)  { -	return (index <= p->indices && index > 0); +	return index <= p->indices && index > 0;  }  /* ------------------------- Class/flow operations ------------------------- */ @@ -57,8 +57,8 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n", -		sch, p, new, old); +	pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", +		 __func__, sch, p, new, old);  	if (new == NULL) {  		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, @@ -85,8 
+85,8 @@ static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)  static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)  { -	pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n", -		sch, qdisc_priv(sch), classid); +	pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", +		 __func__, sch, qdisc_priv(sch), classid);  	return TC_H_MIN(classid) + 1;  } @@ -118,8 +118,8 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,  	int err = -EINVAL;  	u8 mask = 0; -	pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," -		"arg 0x%lx\n", sch, p, classid, parent, *arg); +	pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", +		 __func__, sch, p, classid, parent, *arg);  	if (!dsmark_valid_index(p, *arg)) {  		err = -ENOENT; @@ -137,10 +137,10 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,  		mask = nla_get_u8(tb[TCA_DSMARK_MASK]);  	if (tb[TCA_DSMARK_VALUE]) -		p->value[*arg-1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); +		p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);  	if (tb[TCA_DSMARK_MASK]) -		p->mask[*arg-1] = mask; +		p->mask[*arg - 1] = mask;  	err = 0; @@ -155,8 +155,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg)  	if (!dsmark_valid_index(p, arg))  		return -EINVAL; -	p->mask[arg-1] = 0xff; -	p->value[arg-1] = 0; +	p->mask[arg - 1] = 0xff; +	p->value[arg - 1] = 0;  	return 0;  } @@ -166,7 +166,8 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	int i; -	pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); +	pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", +		 __func__, sch, p, walker);  	if (walker->stop)  		return; @@ -175,7 +176,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)  		if (p->mask[i] == 0xff && !p->value[i])  			goto ignore;  		if (walker->count >= walker->skip) { -			if (walker->fn(sch, i+1, walker) < 0) { +			if (walker->fn(sch, i + 1, walker) < 0) {  				walker->stop = 1;  				break;  			} @@ -199,7 +200,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	int err; -	pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); +	pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);  	if (p->set_tc_index) {  		switch (skb->protocol) { @@ -260,15 +261,12 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)  		return err;  	} -	sch->bstats.bytes += qdisc_pkt_len(skb); -	sch->bstats.packets++;  	sch->q.qlen++;  	return NET_XMIT_SUCCESS;  drop: -	kfree_skb(skb); -	sch->qstats.drops++; +	qdisc_drop(skb, sch);  	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;  } @@ -278,12 +276,13 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)  	struct sk_buff *skb;  	u32 index; -	pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	skb = p->q->ops->dequeue(p->q);  	if (skb == NULL)  		return NULL; +	qdisc_bstats_update(sch, skb);  	sch->q.qlen--;  	index = skb->tc_index & (p->indices - 1); @@ -305,9 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)  		 * and don't need yet another qdisc as a bypass.  		 
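For reference, the per-class rewrite that dsmark performs on dequeue boils down to new_field = (old_field & mask) | value (carried out by the dsfield helpers in the kernel proper), so the mask==0xff, value==0 state that dsmark_delete() restores above leaves packets untouched, which is why dsmark_walk() skips such entries. A small illustrative program; the EF remarking values are hypothetical:

#include <stdio.h>

/* Userspace illustration of dsmark's remark rule: keep the bits selected
 * by 'mask' from the original DS byte and OR in 'value'.
 */
static unsigned char dsmark_remark(unsigned char old, unsigned char mask,
				   unsigned char value)
{
	return (unsigned char)((old & mask) | value);
}

int main(void)
{
	unsigned char old = 0x2a;	/* arbitrary incoming TOS/DS byte */
	/* keep only the two ECN bits (mask 0x03), force DSCP EF (46 << 2 = 0xb8) */
	unsigned char out = dsmark_remark(old, 0x03, 0xb8);

	printf("0x%02x -> 0x%02x\n", (unsigned)old, (unsigned)out);
	return 0;
}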
*/  		if (p->mask[index] != 0xff || p->value[index]) -			printk(KERN_WARNING -			       "dsmark_dequeue: unsupported protocol %d\n", -			       ntohs(skb->protocol)); +			pr_warn("%s: unsupported protocol %d\n", +				__func__, ntohs(skb->protocol));  		break;  	} @@ -318,7 +316,7 @@ static struct sk_buff *dsmark_peek(struct Qdisc *sch)  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	return p->q->ops->peek(p->q);  } @@ -328,7 +326,7 @@ static unsigned int dsmark_drop(struct Qdisc *sch)  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	unsigned int len; -	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	if (p->q->ops->drop == NULL)  		return 0; @@ -349,7 +347,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)  	u16 indices;  	u8 *mask; -	pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); +	pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);  	if (!opt)  		goto errout; @@ -387,7 +385,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)  	if (p->q == NULL)  		p->q = &noop_qdisc; -	pr_debug("dsmark_init: qdisc %p\n", p->q); +	pr_debug("%s: qdisc %p\n", __func__, p->q);  	err = 0;  errout: @@ -398,7 +396,7 @@ static void dsmark_reset(struct Qdisc *sch)  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	qdisc_reset(p->q);  	sch->q.qlen = 0;  } @@ -407,7 +405,7 @@ static void dsmark_destroy(struct Qdisc *sch)  {  	struct dsmark_qdisc_data *p = qdisc_priv(sch); -	pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p); +	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);  	tcf_destroy_chain(&p->filter_list);  	qdisc_destroy(p->q); @@ -420,19 +418,20 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,  	struct dsmark_qdisc_data *p = qdisc_priv(sch);  	struct nlattr *opts = NULL; -	pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl); +	pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);  	if (!dsmark_valid_index(p, cl))  		return -EINVAL; -	tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1); +	tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);  	tcm->tcm_info = p->q->handle;  	opts = nla_nest_start(skb, TCA_OPTIONS);  	if (opts == NULL)  		goto nla_put_failure; -	NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl-1]); -	NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl-1]); +	if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || +	    nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) +		goto nla_put_failure;  	return nla_nest_end(skb, opts); @@ -449,13 +448,16 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)  	opts = nla_nest_start(skb, TCA_OPTIONS);  	if (opts == NULL)  		goto nla_put_failure; -	NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); +	if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) +		goto nla_put_failure; -	if (p->default_index != NO_DEFAULT_INDEX) -		NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); +	if (p->default_index != NO_DEFAULT_INDEX && +	    nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) +		goto nla_put_failure; -	if (p->set_tc_index) -		NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); +	if (p->set_tc_index && +	    nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) +		goto nla_put_failure;  	return nla_nest_end(skb, 
opts); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 4dfecb0cba3..e15a9eb2908 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,46 +19,30 @@  /* 1 band FIFO pseudo-"scheduler" */ -struct fifo_sched_data +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)  { -	u32 limit; -}; - -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) -{ -	struct fifo_sched_data *q = qdisc_priv(sch); - -	if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= q->limit)) +	if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))  		return qdisc_enqueue_tail(skb, sch);  	return qdisc_reshape_fail(skb, sch);  } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)  { -	struct fifo_sched_data *q = qdisc_priv(sch); - -	if (likely(skb_queue_len(&sch->q) < q->limit)) +	if (likely(skb_queue_len(&sch->q) < sch->limit))  		return qdisc_enqueue_tail(skb, sch);  	return qdisc_reshape_fail(skb, sch);  } -static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)  { -	struct sk_buff *skb_head; -	struct fifo_sched_data *q = qdisc_priv(sch); - -	if (likely(skb_queue_len(&sch->q) < q->limit)) +	if (likely(skb_queue_len(&sch->q) < sch->limit))  		return qdisc_enqueue_tail(skb, sch);  	/* queue full, remove one skb to fulfill the limit */ -	skb_head = qdisc_dequeue_head(sch); -	sch->bstats.bytes -= qdisc_pkt_len(skb_head); -	sch->bstats.packets--; +	__qdisc_queue_drop_head(sch, &sch->q);  	sch->qstats.drops++; -	kfree_skb(skb_head); -  	qdisc_enqueue_tail(skb, sch);  	return NET_XMIT_CN; @@ -66,33 +50,43 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)  static int fifo_init(struct Qdisc *sch, struct nlattr *opt)  { -	struct fifo_sched_data *q = qdisc_priv(sch); +	bool bypass; +	bool is_bfifo = sch->ops == &bfifo_qdisc_ops;  	if (opt == NULL) {  		u32 limit = qdisc_dev(sch)->tx_queue_len ? 
: 1; -		if (sch->ops == &bfifo_qdisc_ops) +		if (is_bfifo)  			limit *= psched_mtu(qdisc_dev(sch)); -		q->limit = limit; +		sch->limit = limit;  	} else {  		struct tc_fifo_qopt *ctl = nla_data(opt);  		if (nla_len(opt) < sizeof(*ctl))  			return -EINVAL; -		q->limit = ctl->limit; +		sch->limit = ctl->limit;  	} +	if (is_bfifo) +		bypass = sch->limit >= psched_mtu(qdisc_dev(sch)); +	else +		bypass = sch->limit >= 1; + +	if (bypass) +		sch->flags |= TCQ_F_CAN_BYPASS; +	else +		sch->flags &= ~TCQ_F_CAN_BYPASS;  	return 0;  }  static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)  { -	struct fifo_sched_data *q = qdisc_priv(sch); -	struct tc_fifo_qopt opt = { .limit = q->limit }; +	struct tc_fifo_qopt opt = { .limit = sch->limit }; -	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -101,7 +95,7 @@ nla_put_failure:  struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {  	.id		=	"pfifo", -	.priv_size	=	sizeof(struct fifo_sched_data), +	.priv_size	=	0,  	.enqueue	=	pfifo_enqueue,  	.dequeue	=	qdisc_dequeue_head,  	.peek		=	qdisc_peek_head, @@ -116,7 +110,7 @@ EXPORT_SYMBOL(pfifo_qdisc_ops);  struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {  	.id		=	"bfifo", -	.priv_size	=	sizeof(struct fifo_sched_data), +	.priv_size	=	0,  	.enqueue	=	bfifo_enqueue,  	.dequeue	=	qdisc_dequeue_head,  	.peek		=	qdisc_peek_head, @@ -131,7 +125,7 @@ EXPORT_SYMBOL(bfifo_qdisc_ops);  struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {  	.id		=	"pfifo_head_drop", -	.priv_size	=	sizeof(struct fifo_sched_data), +	.priv_size	=	0,  	.enqueue	=	pfifo_tail_enqueue,  	.dequeue	=	qdisc_dequeue_head,  	.peek		=	qdisc_peek_head, diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 00000000000..ba32c2b005d --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,849 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + *  Copyright (C) 2013 Eric Dumazet <edumazet@google.com> + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + * + *  Meant to be mostly used for localy generated traffic : + *  Fast classification depends on skb->sk being set before reaching us. + *  If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + *  All packets belonging to a socket are considered as a 'flow'. + * + *  Flows are dynamically allocated and stored in a hash table of RB trees + *  They are also part of one Round Robin 'queues' (new or old flows) + * + *  Burst avoidance (aka pacing) capability : + * + *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + *  bunch of packets, and this packet scheduler adds delay between + *  packets to respect rate limitation. + * + *  enqueue() : + *   - lookup one RB tree (out of 1024 or more) to find the flow. + *     If non existent flow, create it, add it to the tree. + *     Add skb to the per flow list of skb (fifo). 
+ *   - Use a special fifo for high prio packets + * + *  dequeue() : serves flows in Round Robin + *  Note : When a flow becomes empty, we do not immediately remove it from + *  rb trees, for performance reasons (its expected to send additional packets, + *  or SLAB cache will reuse socket for another flow) + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sock.h> +#include <net/tcp_states.h> + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { +	struct sk_buff	*head;		/* list of skbs for this flow : first skb */ +	union { +		struct sk_buff *tail;	/* last skb in the list */ +		unsigned long  age;	/* jiffies when flow was emptied, for gc */ +	}; +	struct rb_node	fq_node; 	/* anchor in fq_root[] trees */ +	struct sock	*sk; +	int		qlen;		/* number of packets in flow queue */ +	int		credit; +	u32		socket_hash;	/* sk_hash */ +	struct fq_flow *next;		/* next pointer in RR lists, or &detached */ + +	struct rb_node  rate_node;	/* anchor in q->delayed tree */ +	u64		time_next_packet; +}; + +struct fq_flow_head { +	struct fq_flow *first; +	struct fq_flow *last; +}; + +struct fq_sched_data { +	struct fq_flow_head new_flows; + +	struct fq_flow_head old_flows; + +	struct rb_root	delayed;	/* for rate limited flows */ +	u64		time_next_delayed_flow; + +	struct fq_flow	internal;	/* for non classified or high prio packets */ +	u32		quantum; +	u32		initial_quantum; +	u32		flow_refill_delay; +	u32		flow_max_rate;	/* optional max rate per flow */ +	u32		flow_plimit;	/* max packets per flow */ +	struct rb_root	*fq_root; +	u8		rate_enable; +	u8		fq_trees_log; + +	u32		flows; +	u32		inactive_flows; +	u32		throttled_flows; + +	u64		stat_gc_flows; +	u64		stat_internal_packets; +	u64		stat_tcp_retrans; +	u64		stat_throttled; +	u64		stat_flows_plimit; +	u64		stat_pkts_too_long; +	u64		stat_allocation_errors; +	struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ +	f->next = &detached; +	f->age = jiffies; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ +	return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ +	struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + +	while (*p) { +		struct fq_flow *aux; + +		parent = *p; +		aux = container_of(parent, struct fq_flow, rate_node); +		if (f->time_next_packet >= aux->time_next_packet) +			p = &parent->rb_right; +		else +			p = &parent->rb_left; +	} +	rb_link_node(&f->rate_node, parent, p); +	rb_insert_color(&f->rate_node, &q->delayed); +	q->throttled_flows++; +	q->stat_throttled++; + +	f->next = &throttled; +	if (q->time_next_delayed_flow > f->time_next_packet) +		q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ +	if (head->first) +		head->last->next = flow; +	else +		head->first = flow; +	head->last = flow; +	flow->next = NULL; +} + +/* limit number of collected flows per round */ 
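An aside on fq_flow_set_throttled() above: paced flows are parked in the q->delayed rbtree, ordered by time_next_packet, while q->time_next_delayed_flow tracks the earliest release time so a single watchdog timer can be armed. The user-space sketch below mirrors only that ordering-plus-minimum invariant; it uses a sorted singly-linked list instead of the kernel rbtree, and the struct, sample timestamps and main() are illustrative, not part of the patch.

	/* Sketch: keep throttled flows sorted by release time and track the minimum. */
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	struct flow {
		uint64_t time_next_packet;	/* ns timestamp when the flow may send again */
		struct flow *next;
	};

	static struct flow *delayed;				/* sorted, earliest first */
	static uint64_t time_next_delayed_flow = ~0ULL;		/* value the watchdog would use */

	static void flow_set_throttled(struct flow *f)
	{
		struct flow **pp = &delayed;

		while (*pp && (*pp)->time_next_packet <= f->time_next_packet)
			pp = &(*pp)->next;			/* find the insertion point */
		f->next = *pp;
		*pp = f;
		if (time_next_delayed_flow > f->time_next_packet)
			time_next_delayed_flow = f->time_next_packet;
	}

	int main(void)
	{
		static const uint64_t samples[] = { 300, 100, 200 };	/* arbitrary demo values */

		for (int i = 0; i < 3; i++) {
			struct flow *f = calloc(1, sizeof(*f));

			if (!f)
				return 1;
			f->time_next_packet = samples[i];
			flow_set_throttled(f);
		}
		printf("arm watchdog for t=%llu\n",
		       (unsigned long long)time_next_delayed_flow);	/* prints 100 */
		return 0;
	}

The kernel keeps an rbtree so insertion stays O(log n) with many throttled flows; the list above only demonstrates the invariant that fq_check_throttled() later relies on when it re-activates flows in release order.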
+#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ +	return fq_flow_is_detached(f) && +	       time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, +		  struct rb_root *root, +		  struct sock *sk) +{ +	struct fq_flow *f, *tofree[FQ_GC_MAX]; +	struct rb_node **p, *parent; +	int fcnt = 0; + +	p = &root->rb_node; +	parent = NULL; +	while (*p) { +		parent = *p; + +		f = container_of(parent, struct fq_flow, fq_node); +		if (f->sk == sk) +			break; + +		if (fq_gc_candidate(f)) { +			tofree[fcnt++] = f; +			if (fcnt == FQ_GC_MAX) +				break; +		} + +		if (f->sk > sk) +			p = &parent->rb_right; +		else +			p = &parent->rb_left; +	} + +	q->flows -= fcnt; +	q->inactive_flows -= fcnt; +	q->stat_gc_flows += fcnt; +	while (fcnt) { +		struct fq_flow *f = tofree[--fcnt]; + +		rb_erase(&f->fq_node, root); +		kmem_cache_free(fq_flow_cachep, f); +	} +} + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ +	struct rb_node **p, *parent; +	struct sock *sk = skb->sk; +	struct rb_root *root; +	struct fq_flow *f; + +	/* warning: no starvation prevention... */ +	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) +		return &q->internal; + +	if (unlikely(!sk)) { +		/* By forcing low order bit to 1, we make sure to not +		 * collide with a local flow (socket pointers are word aligned) +		 */ +		sk = (struct sock *)(skb_get_hash(skb) | 1L); +	} + +	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + +	if (q->flows >= (2U << q->fq_trees_log) && +	    q->inactive_flows > q->flows/2) +		fq_gc(q, root, sk); + +	p = &root->rb_node; +	parent = NULL; +	while (*p) { +		parent = *p; + +		f = container_of(parent, struct fq_flow, fq_node); +		if (f->sk == sk) { +			/* socket might have been reallocated, so check +			 * if its sk_hash is the same. +			 * It not, we need to refill credit with +			 * initial quantum +			 */ +			if (unlikely(skb->sk && +				     f->socket_hash != sk->sk_hash)) { +				f->credit = q->initial_quantum; +				f->socket_hash = sk->sk_hash; +				f->time_next_packet = 0ULL; +			} +			return f; +		} +		if (f->sk > sk) +			p = &parent->rb_right; +		else +			p = &parent->rb_left; +	} + +	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); +	if (unlikely(!f)) { +		q->stat_allocation_errors++; +		return &q->internal; +	} +	fq_flow_set_detached(f); +	f->sk = sk; +	if (skb->sk) +		f->socket_hash = sk->sk_hash; +	f->credit = q->initial_quantum; + +	rb_link_node(&f->fq_node, parent, p); +	rb_insert_color(&f->fq_node, root); + +	q->flows++; +	q->inactive_flows++; +	return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +{ +	struct sk_buff *skb = flow->head; + +	if (skb) { +		flow->head = skb->next; +		skb->next = NULL; +		flow->qlen--; +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		sch->q.qlen--; +	} +	return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ +	return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. 
+ * head->  [retrans pkt 1] + *         [retrans pkt 2] + *         [ normal pkt 1] + *         [ normal pkt 2] + *         [ normal pkt 3] + * tail->  [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ +	struct sk_buff *prev, *head = flow->head; + +	skb->next = NULL; +	if (!head) { +		flow->head = skb; +		flow->tail = skb; +		return; +	} +	if (likely(!skb_is_retransmit(skb))) { +		flow->tail->next = skb; +		flow->tail = skb; +		return; +	} + +	/* This skb is a tcp retransmit, +	 * find the last retrans packet in the queue +	 */ +	prev = NULL; +	while (skb_is_retransmit(head)) { +		prev = head; +		head = head->next; +		if (!head) +			break; +	} +	if (!prev) { /* no rtx packet in queue, become the new head */ +		skb->next = flow->head; +		flow->head = skb; +	} else { +		if (prev == flow->tail) +			flow->tail = skb; +		else +			skb->next = prev->next; +		prev->next = skb; +	} +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	struct fq_flow *f; + +	if (unlikely(sch->q.qlen >= sch->limit)) +		return qdisc_drop(skb, sch); + +	f = fq_classify(skb, q); +	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { +		q->stat_flows_plimit++; +		return qdisc_drop(skb, sch); +	} + +	f->qlen++; +	if (skb_is_retransmit(skb)) +		q->stat_tcp_retrans++; +	sch->qstats.backlog += qdisc_pkt_len(skb); +	if (fq_flow_is_detached(f)) { +		fq_flow_add_tail(&q->new_flows, f); +		if (time_after(jiffies, f->age + q->flow_refill_delay)) +			f->credit = max_t(u32, f->credit, q->quantum); +		q->inactive_flows--; +		qdisc_unthrottled(sch); +	} + +	/* Note: this overwrites f->age */ +	flow_queue_add(f, skb); + +	if (unlikely(f == &q->internal)) { +		q->stat_internal_packets++; +		qdisc_unthrottled(sch); +	} +	sch->q.qlen++; + +	return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ +	struct rb_node *p; + +	if (q->time_next_delayed_flow > now) +		return; + +	q->time_next_delayed_flow = ~0ULL; +	while ((p = rb_first(&q->delayed)) != NULL) { +		struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + +		if (f->time_next_packet > now) { +			q->time_next_delayed_flow = f->time_next_packet; +			break; +		} +		rb_erase(p, &q->delayed); +		q->throttled_flows--; +		fq_flow_add_tail(&q->old_flows, f); +	} +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	u64 now = ktime_to_ns(ktime_get()); +	struct fq_flow_head *head; +	struct sk_buff *skb; +	struct fq_flow *f; +	u32 rate; + +	skb = fq_dequeue_head(sch, &q->internal); +	if (skb) +		goto out; +	fq_check_throttled(q, now); +begin: +	head = &q->new_flows; +	if (!head->first) { +		head = &q->old_flows; +		if (!head->first) { +			if (q->time_next_delayed_flow != ~0ULL) +				qdisc_watchdog_schedule_ns(&q->watchdog, +							   q->time_next_delayed_flow); +			return NULL; +		} +	} +	f = head->first; + +	if (f->credit <= 0) { +		f->credit += q->quantum; +		head->first = f->next; +		fq_flow_add_tail(&q->old_flows, f); +		goto begin; +	} + +	if (unlikely(f->head && now < f->time_next_packet)) { +		head->first = f->next; +		fq_flow_set_throttled(q, f); +		goto begin; +	} + +	skb = fq_dequeue_head(sch, f); +	if (!skb) { +		head->first = f->next; +		/* force a pass through old_flows to prevent starvation */ +		if ((head == &q->new_flows) && q->old_flows.first) { +			fq_flow_add_tail(&q->old_flows, f); +		} else { +			fq_flow_set_detached(f); +			q->inactive_flows++; +		} 
+		goto begin; +	} +	prefetch(&skb->end); +	f->time_next_packet = now; +	f->credit -= qdisc_pkt_len(skb); + +	if (f->credit > 0 || !q->rate_enable) +		goto out; + +	rate = q->flow_max_rate; +	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) +		rate = min(skb->sk->sk_pacing_rate, rate); + +	if (rate != ~0U) { +		u32 plen = max(qdisc_pkt_len(skb), q->quantum); +		u64 len = (u64)plen * NSEC_PER_SEC; + +		if (likely(rate)) +			do_div(len, rate); +		/* Since socket rate can change later, +		 * clamp the delay to 125 ms. +		 * TODO: maybe segment the too big skb, as in commit +		 * e43ac79a4bc ("sch_tbf: segment too big GSO packets") +		 */ +		if (unlikely(len > 125 * NSEC_PER_MSEC)) { +			len = 125 * NSEC_PER_MSEC; +			q->stat_pkts_too_long++; +		} + +		f->time_next_packet = now + len; +	} +out: +	qdisc_bstats_update(sch, skb); +	qdisc_unthrottled(sch); +	return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	struct rb_root *root; +	struct sk_buff *skb; +	struct rb_node *p; +	struct fq_flow *f; +	unsigned int idx; + +	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) +		kfree_skb(skb); + +	if (!q->fq_root) +		return; + +	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { +		root = &q->fq_root[idx]; +		while ((p = rb_first(root)) != NULL) { +			f = container_of(p, struct fq_flow, fq_node); +			rb_erase(p, root); + +			while ((skb = fq_dequeue_head(sch, f)) != NULL) +				kfree_skb(skb); + +			kmem_cache_free(fq_flow_cachep, f); +		} +	} +	q->new_flows.first	= NULL; +	q->old_flows.first	= NULL; +	q->delayed		= RB_ROOT; +	q->flows		= 0; +	q->inactive_flows	= 0; +	q->throttled_flows	= 0; +} + +static void fq_rehash(struct fq_sched_data *q, +		      struct rb_root *old_array, u32 old_log, +		      struct rb_root *new_array, u32 new_log) +{ +	struct rb_node *op, **np, *parent; +	struct rb_root *oroot, *nroot; +	struct fq_flow *of, *nf; +	int fcnt = 0; +	u32 idx; + +	for (idx = 0; idx < (1U << old_log); idx++) { +		oroot = &old_array[idx]; +		while ((op = rb_first(oroot)) != NULL) { +			rb_erase(op, oroot); +			of = container_of(op, struct fq_flow, fq_node); +			if (fq_gc_candidate(of)) { +				fcnt++; +				kmem_cache_free(fq_flow_cachep, of); +				continue; +			} +			nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + +			np = &nroot->rb_node; +			parent = NULL; +			while (*np) { +				parent = *np; + +				nf = container_of(parent, struct fq_flow, fq_node); +				BUG_ON(nf->sk == of->sk); + +				if (nf->sk > of->sk) +					np = &parent->rb_right; +				else +					np = &parent->rb_left; +			} + +			rb_link_node(&of->fq_node, parent, np); +			rb_insert_color(&of->fq_node, nroot); +		} +	} +	q->flows -= fcnt; +	q->inactive_flows -= fcnt; +	q->stat_gc_flows += fcnt; +} + +static void *fq_alloc_node(size_t sz, int node) +{ +	void *ptr; + +	ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); +	if (!ptr) +		ptr = vmalloc_node(sz, node); +	return ptr; +} + +static void fq_free(void *addr) +{ +	kvfree(addr); +} + +static int fq_resize(struct Qdisc *sch, u32 log) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	struct rb_root *array; +	void *old_fq_root; +	u32 idx; + +	if (q->fq_root && log == q->fq_trees_log) +		return 0; + +	/* If XPS was setup, we can allocate memory on right NUMA node */ +	array = fq_alloc_node(sizeof(struct rb_root) << log, +			      netdev_queue_numa_node_read(sch->dev_queue)); +	if (!array) +		return -ENOMEM; + +	for (idx = 0; idx < (1U << log); idx++) +		array[idx] = RB_ROOT; + +	
sch_tree_lock(sch); + +	old_fq_root = q->fq_root; +	if (old_fq_root) +		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); + +	q->fq_root = array; +	q->fq_trees_log = log; + +	sch_tree_unlock(sch); + +	fq_free(old_fq_root); + +	return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { +	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 }, +	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 }, +	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 }, +	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 }, +	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 }, +	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 }, +	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 }, +	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 }, +	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_FQ_MAX + 1]; +	int err, drop_count = 0; +	u32 fq_log; + +	if (!opt) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); +	if (err < 0) +		return err; + +	sch_tree_lock(sch); + +	fq_log = q->fq_trees_log; + +	if (tb[TCA_FQ_BUCKETS_LOG]) { +		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + +		if (nval >= 1 && nval <= ilog2(256*1024)) +			fq_log = nval; +		else +			err = -EINVAL; +	} +	if (tb[TCA_FQ_PLIMIT]) +		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + +	if (tb[TCA_FQ_FLOW_PLIMIT]) +		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + +	if (tb[TCA_FQ_QUANTUM]) +		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + +	if (tb[TCA_FQ_INITIAL_QUANTUM]) +		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + +	if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) +		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", +				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); + +	if (tb[TCA_FQ_FLOW_MAX_RATE]) +		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + +	if (tb[TCA_FQ_RATE_ENABLE]) { +		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + +		if (enable <= 1) +			q->rate_enable = enable; +		else +			err = -EINVAL; +	} + +	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { +		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; + +		q->flow_refill_delay = usecs_to_jiffies(usecs_delay); +	} + +	if (!err) { +		sch_tree_unlock(sch); +		err = fq_resize(sch, fq_log); +		sch_tree_lock(sch); +	} +	while (sch->q.qlen > sch->limit) { +		struct sk_buff *skb = fq_dequeue(sch); + +		if (!skb) +			break; +		kfree_skb(skb); +		drop_count++; +	} +	qdisc_tree_decrease_qlen(sch, drop_count); + +	sch_tree_unlock(sch); +	return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ +	struct fq_sched_data *q = qdisc_priv(sch); + +	fq_reset(sch); +	fq_free(q->fq_root); +	qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	int err; + +	sch->limit		= 10000; +	q->flow_plimit		= 100; +	q->quantum		= 2 * psched_mtu(qdisc_dev(sch)); +	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch)); +	q->flow_refill_delay	= msecs_to_jiffies(40); +	q->flow_max_rate	= ~0U; +	q->rate_enable		= 1; +	q->new_flows.first	= NULL; +	q->old_flows.first	= NULL; +	q->delayed		= RB_ROOT; +	q->fq_root		= NULL; +	q->fq_trees_log		= ilog2(1024); +	qdisc_watchdog_init(&q->watchdog, sch); + +	if (opt) +		err = fq_change(sch, opt); +	else +		err = fq_resize(sch, q->fq_trees_log); + +	return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; + +	opts = nla_nest_start(skb, 
TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + +	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || +	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || +	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || +	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || +	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || +	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || +	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, +			jiffies_to_usecs(q->flow_refill_delay)) || +	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) +		goto nla_put_failure; + +	return nla_nest_end(skb, opts); + +nla_put_failure: +	return -1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct fq_sched_data *q = qdisc_priv(sch); +	u64 now = ktime_to_ns(ktime_get()); +	struct tc_fq_qd_stats st = { +		.gc_flows		= q->stat_gc_flows, +		.highprio_packets	= q->stat_internal_packets, +		.tcp_retrans		= q->stat_tcp_retrans, +		.throttled		= q->stat_throttled, +		.flows_plimit		= q->stat_flows_plimit, +		.pkts_too_long		= q->stat_pkts_too_long, +		.allocation_errors	= q->stat_allocation_errors, +		.flows			= q->flows, +		.inactive_flows		= q->inactive_flows, +		.throttled_flows	= q->throttled_flows, +		.time_next_delayed_flow	= q->time_next_delayed_flow - now, +	}; + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { +	.id		=	"fq", +	.priv_size	=	sizeof(struct fq_sched_data), + +	.enqueue	=	fq_enqueue, +	.dequeue	=	fq_dequeue, +	.peek		=	qdisc_peek_dequeued, +	.init		=	fq_init, +	.reset		=	fq_reset, +	.destroy	=	fq_destroy, +	.change		=	fq_change, +	.dump		=	fq_dump, +	.dump_stats	=	fq_dump_stats, +	.owner		=	THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ +	int ret; + +	fq_flow_cachep = kmem_cache_create("fq_flow_cache", +					   sizeof(struct fq_flow), +					   0, 0, NULL); +	if (!fq_flow_cachep) +		return -ENOMEM; + +	ret = register_qdisc(&fq_qdisc_ops); +	if (ret) +		kmem_cache_destroy(fq_flow_cachep); +	return ret; +} + +static void __exit fq_module_exit(void) +{ +	unregister_qdisc(&fq_qdisc_ops); +	kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c new file mode 100644 index 00000000000..063b726bf1f --- /dev/null +++ b/net/sched/sch_fq_codel.c @@ -0,0 +1,620 @@ +/* + * Fair Queue CoDel discipline + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + * + *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/codel.h> + +/*	Fair Queue CoDel. + * + * Principles : + * Packets are classified (internal classifier or external) on flows. 
+ * This is a Stochastic model (as we use a hash, several flows + *			       might be hashed on same slot) + * Each flow has a CoDel managed queue. + * Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * + * For a given flow, packets are not reordered (CoDel uses a FIFO) + * head drops only. + * ECN capability is on by default. + * Low memory footprint (64 bytes per flow) + */ + +struct fq_codel_flow { +	struct sk_buff	  *head; +	struct sk_buff	  *tail; +	struct list_head  flowchain; +	int		  deficit; +	u32		  dropped; /* number of drops (or ECN marks) on this flow */ +	struct codel_vars cvars; +}; /* please try to keep this structure <= 64 bytes */ + +struct fq_codel_sched_data { +	struct tcf_proto *filter_list;	/* optional external classifier */ +	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */ +	u32		*backlogs;	/* backlog table [flows_cnt] */ +	u32		flows_cnt;	/* number of flows */ +	u32		perturbation;	/* hash perturbation */ +	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */ +	struct codel_params cparams; +	struct codel_stats cstats; +	u32		drop_overlimit; +	u32		new_flow_count; + +	struct list_head new_flows;	/* list of new flows */ +	struct list_head old_flows;	/* list of old flows */ +}; + +static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, +				  const struct sk_buff *skb) +{ +	struct flow_keys keys; +	unsigned int hash; + +	skb_flow_dissect(skb, &keys); +	hash = jhash_3words((__force u32)keys.dst, +			    (__force u32)keys.src ^ keys.ip_proto, +			    (__force u32)keys.ports, q->perturbation); +	return ((u64)hash * q->flows_cnt) >> 32; +} + +static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, +				      int *qerr) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	struct tcf_result res; +	int result; + +	if (TC_H_MAJ(skb->priority) == sch->handle && +	    TC_H_MIN(skb->priority) > 0 && +	    TC_H_MIN(skb->priority) <= q->flows_cnt) +		return TC_H_MIN(skb->priority); + +	if (!q->filter_list) +		return fq_codel_hash(q, skb) + 1; + +	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; +	result = tc_classify(skb, q->filter_list, &res); +	if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT +		switch (result) { +		case TC_ACT_STOLEN: +		case TC_ACT_QUEUED: +			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; +		case TC_ACT_SHOT: +			return 0; +		} +#endif +		if (TC_H_MIN(res.classid) <= q->flows_cnt) +			return TC_H_MIN(res.classid); +	} +	return 0; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) +{ +	struct sk_buff *skb = flow->head; + +	flow->head = skb->next; +	skb->next = NULL; +	return skb; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_codel_flow *flow, +				  struct sk_buff *skb) +{ +	if (flow->head == NULL) +		flow->head = skb; +	else +		flow->tail->next = skb; +	flow->tail = skb; +	skb->next = NULL; +} + +static unsigned int fq_codel_drop(struct Qdisc *sch) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb; +	unsigned int maxbacklog = 0, idx = 0, i, len; +	struct fq_codel_flow *flow; + +	/* Queue is full! Find the fat flow and drop packet from it. +	 * This might sound expensive, but with 1024 flows, we scan +	 * 4KB of memory, and we dont need to handle a complex tree +	 * in fast path (packet queue/enqueue) with many cache misses. 
+	 */ +	for (i = 0; i < q->flows_cnt; i++) { +		if (q->backlogs[i] > maxbacklog) { +			maxbacklog = q->backlogs[i]; +			idx = i; +		} +	} +	flow = &q->flows[idx]; +	skb = dequeue_head(flow); +	len = qdisc_pkt_len(skb); +	q->backlogs[idx] -= len; +	kfree_skb(skb); +	sch->q.qlen--; +	sch->qstats.drops++; +	sch->qstats.backlog -= len; +	flow->dropped++; +	return idx; +} + +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	unsigned int idx; +	struct fq_codel_flow *flow; +	int uninitialized_var(ret); + +	idx = fq_codel_classify(skb, sch, &ret); +	if (idx == 0) { +		if (ret & __NET_XMIT_BYPASS) +			sch->qstats.drops++; +		kfree_skb(skb); +		return ret; +	} +	idx--; + +	codel_set_enqueue_time(skb); +	flow = &q->flows[idx]; +	flow_queue_add(flow, skb); +	q->backlogs[idx] += qdisc_pkt_len(skb); +	sch->qstats.backlog += qdisc_pkt_len(skb); + +	if (list_empty(&flow->flowchain)) { +		list_add_tail(&flow->flowchain, &q->new_flows); +		q->new_flow_count++; +		flow->deficit = q->quantum; +		flow->dropped = 0; +	} +	if (++sch->q.qlen <= sch->limit) +		return NET_XMIT_SUCCESS; + +	q->drop_overlimit++; +	/* Return Congestion Notification only if we dropped a packet +	 * from this flow. +	 */ +	if (fq_codel_drop(sch) == idx) +		return NET_XMIT_CN; + +	/* As we dropped a packet, better let upper stack know this */ +	qdisc_tree_decrease_qlen(sch, 1); +	return NET_XMIT_SUCCESS; +} + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	struct fq_codel_flow *flow; +	struct sk_buff *skb = NULL; + +	flow = container_of(vars, struct fq_codel_flow, cvars); +	if (flow->head) { +		skb = dequeue_head(flow); +		q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); +		sch->q.qlen--; +	} +	return skb; +} + +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb; +	struct fq_codel_flow *flow; +	struct list_head *head; +	u32 prev_drop_count, prev_ecn_mark; + +begin: +	head = &q->new_flows; +	if (list_empty(head)) { +		head = &q->old_flows; +		if (list_empty(head)) +			return NULL; +	} +	flow = list_first_entry(head, struct fq_codel_flow, flowchain); + +	if (flow->deficit <= 0) { +		flow->deficit += q->quantum; +		list_move_tail(&flow->flowchain, &q->old_flows); +		goto begin; +	} + +	prev_drop_count = q->cstats.drop_count; +	prev_ecn_mark = q->cstats.ecn_mark; + +	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, +			    dequeue); + +	flow->dropped += q->cstats.drop_count - prev_drop_count; +	flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; + +	if (!skb) { +		/* force a pass through old_flows to prevent starvation */ +		if ((head == &q->new_flows) && !list_empty(&q->old_flows)) +			list_move_tail(&flow->flowchain, &q->old_flows); +		else +			list_del_init(&flow->flowchain); +		goto begin; +	} +	qdisc_bstats_update(sch, skb); +	flow->deficit -= qdisc_pkt_len(skb); +	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, +	 * or HTB crashes. Defer it for next round. 
+	 */ +	if (q->cstats.drop_count && sch->q.qlen) { +		qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); +		q->cstats.drop_count = 0; +	} +	return skb; +} + +static void fq_codel_reset(struct Qdisc *sch) +{ +	struct sk_buff *skb; + +	while ((skb = fq_codel_dequeue(sch)) != NULL) +		kfree_skb(skb); +} + +static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { +	[TCA_FQ_CODEL_TARGET]	= { .type = NLA_U32 }, +	[TCA_FQ_CODEL_LIMIT]	= { .type = NLA_U32 }, +	[TCA_FQ_CODEL_INTERVAL]	= { .type = NLA_U32 }, +	[TCA_FQ_CODEL_ECN]	= { .type = NLA_U32 }, +	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 }, +	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 }, +}; + +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; +	int err; + +	if (!opt) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); +	if (err < 0) +		return err; +	if (tb[TCA_FQ_CODEL_FLOWS]) { +		if (q->flows) +			return -EINVAL; +		q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); +		if (!q->flows_cnt || +		    q->flows_cnt > 65536) +			return -EINVAL; +	} +	sch_tree_lock(sch); + +	if (tb[TCA_FQ_CODEL_TARGET]) { +		u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); + +		q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; +	} + +	if (tb[TCA_FQ_CODEL_INTERVAL]) { +		u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); + +		q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; +	} + +	if (tb[TCA_FQ_CODEL_LIMIT]) +		sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + +	if (tb[TCA_FQ_CODEL_ECN]) +		q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + +	if (tb[TCA_FQ_CODEL_QUANTUM]) +		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + +	while (sch->q.qlen > sch->limit) { +		struct sk_buff *skb = fq_codel_dequeue(sch); + +		kfree_skb(skb); +		q->cstats.drop_count++; +	} +	qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); +	q->cstats.drop_count = 0; + +	sch_tree_unlock(sch); +	return 0; +} + +static void *fq_codel_zalloc(size_t sz) +{ +	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + +	if (!ptr) +		ptr = vzalloc(sz); +	return ptr; +} + +static void fq_codel_free(void *addr) +{ +	kvfree(addr); +} + +static void fq_codel_destroy(struct Qdisc *sch) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); + +	tcf_destroy_chain(&q->filter_list); +	fq_codel_free(q->backlogs); +	fq_codel_free(q->flows); +} + +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	int i; + +	sch->limit = 10*1024; +	q->flows_cnt = 1024; +	q->quantum = psched_mtu(qdisc_dev(sch)); +	q->perturbation = prandom_u32(); +	INIT_LIST_HEAD(&q->new_flows); +	INIT_LIST_HEAD(&q->old_flows); +	codel_params_init(&q->cparams); +	codel_stats_init(&q->cstats); +	q->cparams.ecn = true; + +	if (opt) { +		int err = fq_codel_change(sch, opt); +		if (err) +			return err; +	} + +	if (!q->flows) { +		q->flows = fq_codel_zalloc(q->flows_cnt * +					   sizeof(struct fq_codel_flow)); +		if (!q->flows) +			return -ENOMEM; +		q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); +		if (!q->backlogs) { +			fq_codel_free(q->flows); +			return -ENOMEM; +		} +		for (i = 0; i < q->flows_cnt; i++) { +			struct fq_codel_flow *flow = q->flows + i; + +			INIT_LIST_HEAD(&flow->flowchain); +			codel_vars_init(&flow->cvars); +		} +	} +	if (sch->limit >= 1) +		sch->flags |= TCQ_F_CAN_BYPASS; +	else +		sch->flags &= ~TCQ_F_CAN_BYPASS; +	return 0; +} 
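One detail of fq_codel_hash() earlier in this file is easy to miss: the jhash result is folded into a bucket index with ((u64)hash * q->flows_cnt) >> 32 rather than a modulo, which maps a uniform 32-bit hash onto [0, flows_cnt) with one multiply and no divide. Below is a stand-alone sketch of just that scaling, using the 1024-flow default set in fq_codel_init() above; the sample hash values are made up.

	#include <stdio.h>
	#include <stdint.h>

	/* Map a 32-bit hash to a bucket in [0, n): take the top 32 bits of hash * n. */
	static unsigned int hash_to_bucket(uint32_t hash, uint32_t n)
	{
		return (unsigned int)(((uint64_t)hash * n) >> 32);
	}

	int main(void)
	{
		const uint32_t flows_cnt = 1024;	/* fq_codel default */
		static const uint32_t samples[] = { 0x00000000u, 0x80000000u, 0xffffffffu };

		for (int i = 0; i < 3; i++)
			printf("hash %#010x -> bucket %u/%u\n", samples[i],
			       hash_to_bucket(samples[i], flows_cnt), flows_cnt);
		return 0;	/* buckets 0, 512 and 1023 */
	}

The scaling works for any flows_cnt, not just powers of two, which fits with fq_codel_change() above only bounding the value to 65536 rather than forcing a power-of-two table size.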
+ +static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; + +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, +			codel_time_to_us(q->cparams.target)) || +	    nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, +			sch->limit) || +	    nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, +			codel_time_to_us(q->cparams.interval)) || +	    nla_put_u32(skb, TCA_FQ_CODEL_ECN, +			q->cparams.ecn) || +	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, +			q->quantum) || +	    nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, +			q->flows_cnt)) +		goto nla_put_failure; + +	return nla_nest_end(skb, opts); + +nla_put_failure: +	return -1; +} + +static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	struct tc_fq_codel_xstats st = { +		.type				= TCA_FQ_CODEL_XSTATS_QDISC, +	}; +	struct list_head *pos; + +	st.qdisc_stats.maxpacket = q->cstats.maxpacket; +	st.qdisc_stats.drop_overlimit = q->drop_overlimit; +	st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; +	st.qdisc_stats.new_flow_count = q->new_flow_count; + +	list_for_each(pos, &q->new_flows) +		st.qdisc_stats.new_flows_len++; + +	list_for_each(pos, &q->old_flows) +		st.qdisc_stats.old_flows_len++; + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) +{ +	return NULL; +} + +static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) +{ +	return 0; +} + +static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, +			      u32 classid) +{ +	/* we cannot bypass queue discipline anymore */ +	sch->flags &= ~TCQ_F_CAN_BYPASS; +	return 0; +} + +static void fq_codel_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); + +	if (cl) +		return NULL; +	return &q->filter_list; +} + +static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, +			  struct sk_buff *skb, struct tcmsg *tcm) +{ +	tcm->tcm_handle |= TC_H_MIN(cl); +	return 0; +} + +static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, +				     struct gnet_dump *d) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	u32 idx = cl - 1; +	struct gnet_stats_queue qs = { 0 }; +	struct tc_fq_codel_xstats xstats; + +	if (idx < q->flows_cnt) { +		const struct fq_codel_flow *flow = &q->flows[idx]; +		const struct sk_buff *skb = flow->head; + +		memset(&xstats, 0, sizeof(xstats)); +		xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; +		xstats.class_stats.deficit = flow->deficit; +		xstats.class_stats.ldelay = +			codel_time_to_us(flow->cvars.ldelay); +		xstats.class_stats.count = flow->cvars.count; +		xstats.class_stats.lastcount = flow->cvars.lastcount; +		xstats.class_stats.dropping = flow->cvars.dropping; +		if (flow->cvars.dropping) { +			codel_tdiff_t delta = flow->cvars.drop_next - +					      codel_get_time(); + +			xstats.class_stats.drop_next = (delta >= 0) ? 
+				codel_time_to_us(delta) : +				-codel_time_to_us(-delta); +		} +		while (skb) { +			qs.qlen++; +			skb = skb->next; +		} +		qs.backlog = q->backlogs[idx]; +		qs.drops = flow->dropped; +	} +	if (gnet_stats_copy_queue(d, &qs) < 0) +		return -1; +	if (idx < q->flows_cnt) +		return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +	return 0; +} + +static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ +	struct fq_codel_sched_data *q = qdisc_priv(sch); +	unsigned int i; + +	if (arg->stop) +		return; + +	for (i = 0; i < q->flows_cnt; i++) { +		if (list_empty(&q->flows[i].flowchain) || +		    arg->count < arg->skip) { +			arg->count++; +			continue; +		} +		if (arg->fn(sch, i + 1, arg) < 0) { +			arg->stop = 1; +			break; +		} +		arg->count++; +	} +} + +static const struct Qdisc_class_ops fq_codel_class_ops = { +	.leaf		=	fq_codel_leaf, +	.get		=	fq_codel_get, +	.put		=	fq_codel_put, +	.tcf_chain	=	fq_codel_find_tcf, +	.bind_tcf	=	fq_codel_bind, +	.unbind_tcf	=	fq_codel_put, +	.dump		=	fq_codel_dump_class, +	.dump_stats	=	fq_codel_dump_class_stats, +	.walk		=	fq_codel_walk, +}; + +static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { +	.cl_ops		=	&fq_codel_class_ops, +	.id		=	"fq_codel", +	.priv_size	=	sizeof(struct fq_codel_sched_data), +	.enqueue	=	fq_codel_enqueue, +	.dequeue	=	fq_codel_dequeue, +	.peek		=	qdisc_peek_dequeued, +	.drop		=	fq_codel_drop, +	.init		=	fq_codel_init, +	.reset		=	fq_codel_reset, +	.destroy	=	fq_codel_destroy, +	.change		=	fq_codel_change, +	.dump		=	fq_codel_dump, +	.dump_stats =	fq_codel_dump_stats, +	.owner		=	THIS_MODULE, +}; + +static int __init fq_codel_module_init(void) +{ +	return register_qdisc(&fq_codel_qdisc_ops); +} + +static void __exit fq_codel_module_exit(void) +{ +	unregister_qdisc(&fq_codel_qdisc_ops); +} + +module_init(fq_codel_module_init) +module_exit(fq_codel_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0918834ee4a..e1543b03e39 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,9 +25,15 @@  #include <linux/rcupdate.h>  #include <linux/list.h>  #include <linux/slab.h> +#include <linux/if_vlan.h> +#include <net/sch_generic.h>  #include <net/pkt_sched.h>  #include <net/dst.h> +/* Qdisc to use by default */ +const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; +EXPORT_SYMBOL(default_qdisc_ops); +  /* Main transmission queue. */  /* Modifications to data participating in scheduling must be protected with @@ -53,20 +59,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)  static inline struct sk_buff *dequeue_skb(struct Qdisc *q)  {  	struct sk_buff *skb = q->gso_skb; +	const struct netdev_queue *txq = q->dev_queue;  	if (unlikely(skb)) { -		struct net_device *dev = qdisc_dev(q); -		struct netdev_queue *txq; -  		/* check the reason of requeuing without tx lock first */ -		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); -		if (!netif_tx_queue_frozen_or_stopped(txq)) { +		txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); +		if (!netif_xmit_frozen_or_stopped(txq)) {  			q->gso_skb = NULL;  			q->q.qlen--;  		} else  			skb = NULL;  	} else { -		skb = q->dequeue(q); +		if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) +			skb = q->dequeue(q);  	}  	return skb; @@ -86,9 +91,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,  		 * deadloop is detected. Return OK to try the next skb.  		 
*/  		kfree_skb(skb); -		if (net_ratelimit()) -			printk(KERN_WARNING "Dead loop on netdevice %s, " -			       "fix it urgently!\n", dev_queue->dev->name); +		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", +				     dev_queue->dev->name);  		ret = qdisc_qlen(q);  	} else {  		/* @@ -121,7 +125,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,  	spin_unlock(root_lock);  	HARD_TX_LOCK(dev, txq, smp_processor_id()); -	if (!netif_tx_queue_frozen_or_stopped(txq)) +	if (!netif_xmit_frozen_or_stopped(txq))  		ret = dev_hard_start_xmit(skb, dev, txq);  	HARD_TX_UNLOCK(dev, txq); @@ -136,14 +140,14 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,  		ret = handle_dev_cpu_collision(skb, txq, q);  	} else {  		/* Driver returned NETDEV_TX_BUSY - requeue skb */ -		if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) -			printk(KERN_WARNING "BUG %s code %d qlen %d\n", -			       dev->name, ret, q->q.qlen); +		if (unlikely(ret != NETDEV_TX_BUSY)) +			net_warn_ratelimited("BUG %s code %d qlen %d\n", +					     dev->name, ret, q->q.qlen);  		ret = dev_requeue_skb(skb, q);  	} -	if (ret && netif_tx_queue_frozen_or_stopped(txq)) +	if (ret && netif_xmit_frozen_or_stopped(txq))  		ret = 0;  	return ret; @@ -189,15 +193,15 @@ static inline int qdisc_restart(struct Qdisc *q)  void __qdisc_run(struct Qdisc *q)  { -	unsigned long start_time = jiffies; +	int quota = weight_p;  	while (qdisc_restart(q)) {  		/* -		 * Postpone processing if -		 * 1. another process needs the CPU; -		 * 2. we've been doing it for too long. +		 * Ordered by possible occurrence: Postpone processing if +		 * 1. we've exceeded packet quota +		 * 2. another process needs the CPU;  		 */ -		if (need_resched() || jiffies != start_time) { +		if (--quota <= 0 || need_resched()) {  			__netif_schedule(q);  			break;  		} @@ -208,15 +212,19 @@ void __qdisc_run(struct Qdisc *q)  unsigned long dev_trans_start(struct net_device *dev)  { -	unsigned long val, res = dev->trans_start; +	unsigned long val, res;  	unsigned int i; +	if (is_vlan_dev(dev)) +		dev = vlan_dev_real_dev(dev); +	res = dev->trans_start;  	for (i = 0; i < dev->num_tx_queues; i++) {  		val = netdev_get_tx_queue(dev, i)->trans_start;  		if (val && time_after(val, res))  			res = val;  	}  	dev->trans_start = res; +  	return res;  }  EXPORT_SYMBOL(dev_trans_start); @@ -242,18 +250,18 @@ static void dev_watchdog(unsigned long arg)  				 * old device drivers set dev->trans_start  				 */  				trans_start = txq->trans_start ? 
: dev->trans_start; -				if (netif_tx_queue_stopped(txq) && +				if (netif_xmit_stopped(txq) &&  				    time_after(jiffies, (trans_start +  							 dev->watchdog_timeo))) {  					some_queue_timedout = 1; +					txq->trans_timeout++;  					break;  				}  			}  			if (some_queue_timedout) { -				char drivername[64];  				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", -				       dev->name, netdev_drivername(dev, drivername, 64), i); +				       dev->name, netdev_drivername(dev), i);  				dev->netdev_ops->ndo_tx_timeout(dev);  			}  			if (!mod_timer(&dev->watchdog_timer, @@ -302,6 +310,7 @@ void netif_carrier_on(struct net_device *dev)  	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {  		if (dev->reg_state == NETREG_UNINITIALIZED)  			return; +		atomic_inc(&dev->carrier_changes);  		linkwatch_fire_event(dev);  		if (netif_running(dev))  			__netdev_watchdog_up(dev); @@ -320,41 +329,24 @@ void netif_carrier_off(struct net_device *dev)  	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {  		if (dev->reg_state == NETREG_UNINITIALIZED)  			return; +		atomic_inc(&dev->carrier_changes);  		linkwatch_fire_event(dev);  	}  }  EXPORT_SYMBOL(netif_carrier_off); -/** - * 	netif_notify_peers - notify network peers about existence of @dev - * 	@dev: network device - * - * Generate traffic such that interested network peers are aware of - * @dev, such as by generating a gratuitous ARP. This may be used when - * a device wants to inform the rest of the network about some sort of - * reconfiguration such as a failover event or virtual machine - * migration. - */ -void netif_notify_peers(struct net_device *dev) -{ -	rtnl_lock(); -	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); -	rtnl_unlock(); -} -EXPORT_SYMBOL(netif_notify_peers); -  /* "NOOP" scheduler: the best scheduler, recommended for all interfaces     under all circumstances. It is difficult to invent anything faster or     cheaper.   */ -static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)  {  	kfree_skb(skb);  	return NET_XMIT_CN;  } -static struct sk_buff *noop_dequeue(struct Qdisc * qdisc) +static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)  {  	return NULL;  } @@ -412,8 +404,9 @@ static struct Qdisc noqueue_qdisc = {  }; -static const u8 prio2band[TC_PRIO_MAX+1] = -	{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; +static const u8 prio2band[TC_PRIO_MAX + 1] = { +	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +};  /* 3-band FIFO queue: old style, but should be a bit faster than     generic prio+fifo combination. 
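The reworked prio2band[] table above is the whole classification step of pfifo_fast: skb->priority is masked with TC_PRIO_MAX and looked up to pick one of three bands, band 0 being served first. A user-space sketch of that lookup follows; the mask value of 15 and the named priority values in main() come from the uapi headers and should be treated as assumptions rather than part of this patch.

	#include <stdio.h>

	/* Same table as prio2band[] above: priority -> pfifo_fast band (0 = served first). */
	static const unsigned char prio2band[16] = {
		1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
	};

	static int priority_to_band(unsigned int priority)
	{
		return prio2band[priority & 15];	/* & TC_PRIO_MAX in the kernel */
	}

	int main(void)
	{
		/* assumed: TC_PRIO_BESTEFFORT=0, TC_PRIO_BULK=2, TC_PRIO_INTERACTIVE=6, TC_PRIO_CONTROL=7 */
		static const unsigned int prios[] = { 0, 2, 6, 7 };

		for (int i = 0; i < 4; i++)
			printf("priority %u -> band %d\n", prios[i], priority_to_band(prios[i]));
		return 0;	/* bands 1, 2, 0, 0 */
	}

Interactive and control traffic land in band 0 and bulk in band 2, matching the order pfifo_fast_dequeue() walks via bitmap2band in the next hunk.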
@@ -445,7 +438,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,  	return priv->q + band;  } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)  {  	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {  		int band = prio2band[skb->priority & TC_PRIO_MAX]; @@ -460,7 +453,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)  	return qdisc_drop(skb, qdisc);  } -static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)  {  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);  	int band = bitmap2band[priv->bitmap]; @@ -479,7 +472,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)  	return NULL;  } -static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)  {  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);  	int band = bitmap2band[priv->bitmap]; @@ -493,7 +486,7 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)  	return NULL;  } -static void pfifo_fast_reset(struct Qdisc* qdisc) +static void pfifo_fast_reset(struct Qdisc *qdisc)  {  	int prio;  	struct pfifo_fast_priv *priv = qdisc_priv(qdisc); @@ -510,8 +503,9 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)  {  	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; -	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); -	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); +	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len;  nla_put_failure: @@ -526,6 +520,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)  	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)  		skb_queue_head_init(band2list(priv, prio)); +	/* Can by-pass the queue discipline */ +	qdisc->flags |= TCQ_F_CAN_BYPASS;  	return 0;  } @@ -541,17 +537,16 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {  	.owner		=	THIS_MODULE,  }; +static struct lock_class_key qdisc_tx_busylock; +  struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, -			  struct Qdisc_ops *ops) +			  const struct Qdisc_ops *ops)  {  	void *p;  	struct Qdisc *sch; -	unsigned int size; +	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;  	int err = -ENOBUFS; - -	/* ensure that the Qdisc and the private data are 64-byte aligned */ -	size = QDISC_ALIGN(sizeof(*sch)); -	size += ops->priv_size + (QDISC_ALIGNTO - 1); +	struct net_device *dev = dev_queue->dev;  	p = kzalloc_node(size, GFP_KERNEL,  			 netdev_queue_numa_node_read(dev_queue)); @@ -559,16 +554,28 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,  	if (!p)  		goto errout;  	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); -	sch->padded = (char *) sch - (char *) p; - +	/* if we got non aligned memory, ask more and do alignment ourself */ +	if (sch != p) { +		kfree(p); +		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL, +				 netdev_queue_numa_node_read(dev_queue)); +		if (!p) +			goto errout; +		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); +		sch->padded = (char *) sch - (char *) p; +	}  	INIT_LIST_HEAD(&sch->list);  	skb_queue_head_init(&sch->q); +  	spin_lock_init(&sch->busylock); +	lockdep_set_class(&sch->busylock, +			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); +  	sch->ops = ops;  	sch->enqueue = ops->enqueue;  	sch->dequeue = ops->dequeue;  	
sch->dev_queue = dev_queue; -	dev_hold(qdisc_dev(sch)); +	dev_hold(dev);  	atomic_set(&sch->refcnt, 1);  	return sch; @@ -577,10 +584,14 @@ errout:  }  struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, -				struct Qdisc_ops *ops, unsigned int parentid) +				const struct Qdisc_ops *ops, +				unsigned int parentid)  {  	struct Qdisc *sch; +	if (!try_module_get(ops->owner)) +		goto errout; +  	sch = qdisc_alloc(dev_queue, ops);  	if (IS_ERR(sch))  		goto errout; @@ -630,7 +641,7 @@ void qdisc_destroy(struct Qdisc *qdisc)  #ifdef CONFIG_NET_SCHED  	qdisc_list_del(qdisc); -	qdisc_put_stab(qdisc->stab); +	qdisc_put_stab(rtnl_dereference(qdisc->stab));  #endif  	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);  	if (ops->reset) @@ -674,25 +685,23 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,  	return oqdisc;  } +EXPORT_SYMBOL(dev_graft_qdisc);  static void attach_one_default_qdisc(struct net_device *dev,  				     struct netdev_queue *dev_queue,  				     void *_unused)  { -	struct Qdisc *qdisc; +	struct Qdisc *qdisc = &noqueue_qdisc;  	if (dev->tx_queue_len) {  		qdisc = qdisc_create_dflt(dev_queue, -					  &pfifo_fast_ops, TC_H_ROOT); +					  default_qdisc_ops, TC_H_ROOT);  		if (!qdisc) { -			printk(KERN_INFO "%s: activation failed\n", dev->name); +			netdev_info(dev, "activation failed\n");  			return;  		} - -		/* Can by-pass the queue discipline for default qdisc */ -		qdisc->flags |= TCQ_F_CAN_BYPASS; -	} else { -		qdisc =  &noqueue_qdisc; +		if (!netif_is_multiqueue(dev)) +			qdisc->flags |= TCQ_F_ONETXQUEUE;  	}  	dev_queue->qdisc_sleeping = qdisc;  } @@ -711,8 +720,8 @@ static void attach_default_qdiscs(struct net_device *dev)  	} else {  		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);  		if (qdisc) { -			qdisc->ops->attach(qdisc);  			dev->qdisc = qdisc; +			qdisc->ops->attach(qdisc);  		}  	}  } @@ -739,9 +748,8 @@ void dev_activate(struct net_device *dev)  	int need_watchdog;  	/* No queueing discipline is attached to device; -	   create default one i.e. pfifo_fast for devices, -	   which need queueing and noqueue_qdisc for -	   virtual interfaces +	 * create default one for devices, which need queueing +	 * and noqueue_qdisc for virtual interfaces  	 */  	if (dev->qdisc == &noop_qdisc) @@ -761,6 +769,7 @@ void dev_activate(struct net_device *dev)  		dev_watchdog_up(dev);  	}  } +EXPORT_SYMBOL(dev_activate);  static void dev_deactivate_queue(struct net_device *dev,  				 struct netdev_queue *dev_queue, @@ -810,21 +819,51 @@ static bool some_qdisc_is_busy(struct net_device *dev)  	return false;  } -void dev_deactivate(struct net_device *dev) +/** + * 	dev_deactivate_many - deactivate transmissions on several devices + * 	@head: list of devices to deactivate + * + *	This function returns only when all outstanding transmissions + *	have completed, unless all devices are in dismantle phase. + */ +void dev_deactivate_many(struct list_head *head)  { -	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); -	if (dev_ingress_queue(dev)) -		dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc); +	struct net_device *dev; +	bool sync_needed = false; -	dev_watchdog_down(dev); +	list_for_each_entry(dev, head, close_list) { +		netdev_for_each_tx_queue(dev, dev_deactivate_queue, +					 &noop_qdisc); +		if (dev_ingress_queue(dev)) +			dev_deactivate_queue(dev, dev_ingress_queue(dev), +					     &noop_qdisc); -	/* Wait for outstanding qdisc-less dev_queue_xmit calls. 
*/ -	synchronize_rcu(); +		dev_watchdog_down(dev); +		sync_needed |= !dev->dismantle; +	} + +	/* Wait for outstanding qdisc-less dev_queue_xmit calls. +	 * This is avoided if all devices are in dismantle phase : +	 * Caller will call synchronize_net() for us +	 */ +	if (sync_needed) +		synchronize_net();  	/* Wait for outstanding qdisc_run calls. */ -	while (some_qdisc_is_busy(dev)) -		yield(); +	list_for_each_entry(dev, head, close_list) +		while (some_qdisc_is_busy(dev)) +			yield(); +} + +void dev_deactivate(struct net_device *dev) +{ +	LIST_HEAD(single); + +	list_add(&dev->close_list, &single); +	dev_deactivate_many(&single); +	list_del(&single);  } +EXPORT_SYMBOL(dev_deactivate);  static void dev_init_scheduler_queue(struct net_device *dev,  				     struct netdev_queue *dev_queue, @@ -871,3 +910,39 @@ void dev_shutdown(struct net_device *dev)  	WARN_ON(timer_pending(&dev->watchdog_timer));  } + +void psched_ratecfg_precompute(struct psched_ratecfg *r, +			       const struct tc_ratespec *conf, +			       u64 rate64) +{ +	memset(r, 0, sizeof(*r)); +	r->overhead = conf->overhead; +	r->rate_bytes_ps = max_t(u64, conf->rate, rate64); +	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); +	r->mult = 1; +	/* +	 * The deal here is to replace a divide by a reciprocal one +	 * in fast path (a reciprocal divide is a multiply and a shift) +	 * +	 * Normal formula would be : +	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps +	 * +	 * We compute mult/shift to use instead : +	 *  time_in_ns = (len * mult) >> shift; +	 * +	 * We try to get the highest possible mult value for accuracy, +	 * but have to make sure no overflows will ever happen. +	 */ +	if (r->rate_bytes_ps > 0) { +		u64 factor = NSEC_PER_SEC; + +		for (;;) { +			r->mult = div64_u64(factor, r->rate_bytes_ps); +			if (r->mult & (1U << 31) || factor & (1ULL << 63)) +				break; +			factor <<= 1; +			r->shift++; +		} +	} +} +EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 51dcc2aa5c9..12cbc09157f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -32,16 +32,16 @@  struct gred_sched_data;  struct gred_sched; -struct gred_sched_data -{ +struct gred_sched_data {  	u32		limit;		/* HARD maximal queue length	*/ -	u32      	DP;		/* the drop pramaters */ +	u32		DP;		/* the drop parameters */  	u32		bytesin;	/* bytes seen on virtualQ so far*/  	u32		packetsin;	/* packets seen on virtualQ so far*/  	u32		backlog;	/* bytes on the virtualQ */  	u8		prio;		/* the prio of this vq */  	struct red_parms parms; +	struct red_vars  vars;  	struct red_stats stats;  }; @@ -50,14 +50,13 @@ enum {  	GRED_RIO_MODE,  }; -struct gred_sched -{ +struct gred_sched {  	struct gred_sched_data *tab[MAX_DPs];  	unsigned long	flags;  	u32		red_flags;  	u32 		DPs;  	u32 		def; -	struct red_parms wred_set; +	struct red_vars wred_set;  };  static inline int gred_wred_mode(struct gred_sched *table) @@ -103,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch)  		if (q == NULL)  			continue; -		for (n = 0; n < table->DPs; n++) -			if (table->tab[n] && table->tab[n] != q && -			    table->tab[n]->prio == q->prio) +		for (n = i + 1; n < table->DPs; n++) +			if (table->tab[n] && table->tab[n]->prio == q->prio)  				return 1;  	} @@ -127,17 +125,18 @@ static inline u16 tc_index_to_dp(struct sk_buff *skb)  	return skb->tc_index & GRED_VQ_MASK;  } -static inline void gred_load_wred_set(struct gred_sched *table, +static inline void gred_load_wred_set(const struct gred_sched *table,  				      
struct gred_sched_data *q)  { -	q->parms.qavg = table->wred_set.qavg; -	q->parms.qidlestart = table->wred_set.qidlestart; +	q->vars.qavg = table->wred_set.qavg; +	q->vars.qidlestart = table->wred_set.qidlestart;  }  static inline void gred_store_wred_set(struct gred_sched *table,  				       struct gred_sched_data *q)  { -	table->wred_set.qavg = q->parms.qavg; +	table->wred_set.qavg = q->vars.qavg; +	table->wred_set.qidlestart = q->vars.qidlestart;  }  static inline int gred_use_ecn(struct gred_sched *t) @@ -150,17 +149,18 @@ static inline int gred_use_harddrop(struct gred_sched *t)  	return t->red_flags & TC_RED_HARDDROP;  } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)  { -	struct gred_sched_data *q=NULL; -	struct gred_sched *t= qdisc_priv(sch); +	struct gred_sched_data *q = NULL; +	struct gred_sched *t = qdisc_priv(sch);  	unsigned long qavg = 0;  	u16 dp = tc_index_to_dp(skb); -	if (dp >= t->DPs  || (q = t->tab[dp]) == NULL) { +	if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {  		dp = t->def; -		if ((q = t->tab[dp]) == NULL) { +		q = t->tab[dp]; +		if (!q) {  			/* Pass through packets not assigned to a DP  			 * if no default DP has been configured. This  			 * allows for DP flows to be left untouched. @@ -171,19 +171,19 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)  				goto drop;  		} -		/* fix tc_index? --could be controvesial but needed for +		/* fix tc_index? --could be controversial but needed for  		   requeueing */  		skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;  	} -	/* sum up all the qaves of prios <= to ours to get the new qave */ +	/* sum up all the qaves of prios < ours to get the new qave */  	if (!gred_wred_mode(t) && gred_rio_mode(t)) {  		int i;  		for (i = 0; i < t->DPs; i++) {  			if (t->tab[i] && t->tab[i]->prio < q->prio && -			    !red_is_idling(&t->tab[i]->parms)) -				qavg +=t->tab[i]->parms.qavg; +			    !red_is_idling(&t->tab[i]->vars)) +				qavg += t->tab[i]->vars.qavg;  		}  	} @@ -194,37 +194,39 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)  	if (gred_wred_mode(t))  		gred_load_wred_set(t, q); -	q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); +	q->vars.qavg = red_calc_qavg(&q->parms, +				     &q->vars, +				     gred_backlog(t, q, sch)); -	if (red_is_idling(&q->parms)) -		red_end_of_idle_period(&q->parms); +	if (red_is_idling(&q->vars)) +		red_end_of_idle_period(&q->vars);  	if (gred_wred_mode(t))  		gred_store_wred_set(t, q); -	switch (red_action(&q->parms, q->parms.qavg + qavg)) { -		case RED_DONT_MARK: -			break; +	switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) { +	case RED_DONT_MARK: +		break; -		case RED_PROB_MARK: -			sch->qstats.overlimits++; -			if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { -				q->stats.prob_drop++; -				goto congestion_drop; -			} +	case RED_PROB_MARK: +		sch->qstats.overlimits++; +		if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { +			q->stats.prob_drop++; +			goto congestion_drop; +		} -			q->stats.prob_mark++; -			break; +		q->stats.prob_mark++; +		break; -		case RED_HARD_MARK: -			sch->qstats.overlimits++; -			if (gred_use_harddrop(t) || !gred_use_ecn(t) || -			    !INET_ECN_set_ce(skb)) { -				q->stats.forced_drop++; -				goto congestion_drop; -			} -			q->stats.forced_mark++; -			break; +	case RED_HARD_MARK: +		sch->qstats.overlimits++; +		if (gred_use_harddrop(t) || !gred_use_ecn(t) || +		    !INET_ECN_set_ce(skb)) { +			q->stats.forced_drop++; +			goto 
congestion_drop; +		} +		q->stats.forced_mark++; +		break;  	}  	if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { @@ -241,7 +243,7 @@ congestion_drop:  	return NET_XMIT_CN;  } -static struct sk_buff *gred_dequeue(struct Qdisc* sch) +static struct sk_buff *gred_dequeue(struct Qdisc *sch)  {  	struct sk_buff *skb;  	struct gred_sched *t = qdisc_priv(sch); @@ -253,27 +255,27 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)  		u16 dp = tc_index_to_dp(skb);  		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { -			if (net_ratelimit()) -				printk(KERN_WARNING "GRED: Unable to relocate " -				       "VQ 0x%x after dequeue, screwing up " -				       "backlog.\n", tc_index_to_dp(skb)); +			net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n", +					     tc_index_to_dp(skb));  		} else {  			q->backlog -= qdisc_pkt_len(skb); -			if (!q->backlog && !gred_wred_mode(t)) -				red_start_of_idle_period(&q->parms); +			if (gred_wred_mode(t)) { +				if (!sch->qstats.backlog) +					red_start_of_idle_period(&t->wred_set); +			} else { +				if (!q->backlog) +					red_start_of_idle_period(&q->vars); +			}  		}  		return skb;  	} -	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) -		red_start_of_idle_period(&t->wred_set); -  	return NULL;  } -static unsigned int gred_drop(struct Qdisc* sch) +static unsigned int gred_drop(struct Qdisc *sch)  {  	struct sk_buff *skb;  	struct gred_sched *t = qdisc_priv(sch); @@ -285,30 +287,29 @@ static unsigned int gred_drop(struct Qdisc* sch)  		u16 dp = tc_index_to_dp(skb);  		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { -			if (net_ratelimit()) -				printk(KERN_WARNING "GRED: Unable to relocate " -				       "VQ 0x%x while dropping, screwing up " -				       "backlog.\n", tc_index_to_dp(skb)); +			net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", +					     tc_index_to_dp(skb));  		} else {  			q->backlog -= len;  			q->stats.other++; -			if (!q->backlog && !gred_wred_mode(t)) -				red_start_of_idle_period(&q->parms); +			if (gred_wred_mode(t)) { +				if (!sch->qstats.backlog) +					red_start_of_idle_period(&t->wred_set); +			} else { +				if (!q->backlog) +					red_start_of_idle_period(&q->vars); +			}  		}  		qdisc_drop(skb, sch);  		return len;  	} -	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) -		red_start_of_idle_period(&t->wred_set); -  	return 0; -  } -static void gred_reset(struct Qdisc* sch) +static void gred_reset(struct Qdisc *sch)  {  	int i;  	struct gred_sched *t = qdisc_priv(sch); @@ -321,7 +322,7 @@ static void gred_reset(struct Qdisc* sch)  		if (!q)  			continue; -		red_restart(&q->parms); +		red_restart(&q->vars);  		q->backlog = 0;  	}  } @@ -369,8 +370,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)  	for (i = table->DPs; i < MAX_DPs; i++) {  		if (table->tab[i]) { -			printk(KERN_WARNING "GRED: Warning: Destroying " -			       "shadowed VQ 0x%x\n", i); +			pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n", +				i);  			gred_destroy_vq(table->tab[i]);  			table->tab[i] = NULL;  		} @@ -380,29 +381,31 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)  }  static inline int gred_change_vq(struct Qdisc *sch, int dp, -				 struct tc_gred_qopt *ctl, int prio, u8 *stab) +				 struct tc_gred_qopt *ctl, int prio, +				 u8 *stab, u32 max_P, +				 struct gred_sched_data **prealloc)  {  	struct gred_sched *table = qdisc_priv(sch); -	struct gred_sched_data *q; +	struct gred_sched_data 
*q = table->tab[dp]; -	if (table->tab[dp] == NULL) { -		table->tab[dp] = kzalloc(sizeof(*q), GFP_KERNEL); -		if (table->tab[dp] == NULL) +	if (!q) { +		table->tab[dp] = q = *prealloc; +		*prealloc = NULL; +		if (!q)  			return -ENOMEM;  	} -	q = table->tab[dp];  	q->DP = dp;  	q->prio = prio;  	q->limit = ctl->limit;  	if (q->backlog == 0) -		red_end_of_idle_period(&q->parms); +		red_end_of_idle_period(&q->vars);  	red_set_parms(&q->parms,  		      ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, -		      ctl->Scell_log, stab); - +		      ctl->Scell_log, stab, max_P); +	red_set_vars(&q->vars);  	return 0;  } @@ -410,6 +413,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {  	[TCA_GRED_PARMS]	= { .len = sizeof(struct tc_gred_qopt) },  	[TCA_GRED_STAB]		= { .len = 256 },  	[TCA_GRED_DPS]		= { .len = sizeof(struct tc_gred_sopt) }, +	[TCA_GRED_MAX_P]	= { .type = NLA_U32 },  };  static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -419,6 +423,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)  	struct nlattr *tb[TCA_GRED_MAX + 1];  	int err, prio = GRED_DEF_PRIO;  	u8 *stab; +	u32 max_P; +	struct gred_sched_data *prealloc;  	if (opt == NULL)  		return -EINVAL; @@ -434,6 +440,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)  	    tb[TCA_GRED_STAB] == NULL)  		return -EINVAL; +	max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; +  	err = -EINVAL;  	ctl = nla_data(tb[TCA_GRED_PARMS]);  	stab = nla_data(tb[TCA_GRED_STAB]); @@ -456,9 +464,10 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)  			prio = ctl->prio;  	} +	prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);  	sch_tree_lock(sch); -	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); +	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);  	if (err < 0)  		goto errout_locked; @@ -472,6 +481,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt)  errout_locked:  	sch_tree_unlock(sch); +	kfree(prealloc);  errout:  	return err;  } @@ -499,6 +509,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)  	struct gred_sched *table = qdisc_priv(sch);  	struct nlattr *parms, *opts = NULL;  	int i; +	u32 max_p[MAX_DPs];  	struct tc_gred_sopt sopt = {  		.DPs	= table->DPs,  		.def_DP	= table->def, @@ -509,7 +520,17 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)  	opts = nla_nest_start(skb, TCA_OPTIONS);  	if (opts == NULL)  		goto nla_put_failure; -	NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); +	if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt)) +		goto nla_put_failure; + +	for (i = 0; i < MAX_DPs; i++) { +		struct gred_sched_data *q = table->tab[i]; + +		max_p[i] = q ? 
q->parms.max_P : 0; +	} +	if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) +		goto nla_put_failure; +  	parms = nla_nest_start(skb, TCA_GRED_PARMS);  	if (parms == NULL)  		goto nla_put_failure; @@ -517,6 +538,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)  	for (i = 0; i < MAX_DPs; i++) {  		struct gred_sched_data *q = table->tab[i];  		struct tc_gred_qopt opt; +		unsigned long qavg;  		memset(&opt, 0, sizeof(opt)); @@ -545,13 +567,12 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)  		opt.packets	= q->packetsin;  		opt.bytesin	= q->bytesin; -		if (gred_wred_mode(table)) { -			q->parms.qidlestart = -				table->tab[table->def]->parms.qidlestart; -			q->parms.qavg = table->tab[table->def]->parms.qavg; -		} +		if (gred_wred_mode(table)) +			gred_load_wred_set(table, q); -		opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); +		qavg = red_calc_qavg(&q->parms, &q->vars, +				     q->vars.qavg >> q->parms.Wlog); +		opt.qave = qavg >> q->parms.Wlog;  append_opt:  		if (nla_append(skb, sizeof(opt), &opt) < 0) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 069c62b7bb3..ec8aeaac1dd 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -81,8 +81,7 @@   *   that are expensive on 32-bit architectures.   */ -struct internal_sc -{ +struct internal_sc {  	u64	sm1;	/* scaled slope of the 1st segment */  	u64	ism1;	/* scaled inverse-slope of the 1st segment */  	u64	dx;	/* the x-projection of the 1st segment */ @@ -92,8 +91,7 @@ struct internal_sc  };  /* runtime service curve */ -struct runtime_sc -{ +struct runtime_sc {  	u64	x;	/* current starting position on x-axis */  	u64	y;	/* current starting position on y-axis */  	u64	sm1;	/* scaled slope of the 1st segment */ @@ -104,21 +102,19 @@ struct runtime_sc  	u64	ism2;	/* scaled inverse-slope of the 2nd segment */  }; -enum hfsc_class_flags -{ +enum hfsc_class_flags {  	HFSC_RSC = 0x1,  	HFSC_FSC = 0x2,  	HFSC_USC = 0x4  }; -struct hfsc_class -{ +struct hfsc_class {  	struct Qdisc_class_common cl_common;  	unsigned int	refcnt;		/* usage count */  	struct gnet_stats_basic_packed bstats;  	struct gnet_stats_queue qstats; -	struct gnet_stats_rate_est rate_est; +	struct gnet_stats_rate_est64 rate_est;  	unsigned int	level;		/* class level in hierarchy */  	struct tcf_proto *filter_list;	/* filter list */  	unsigned int	filter_cnt;	/* filter count */ @@ -140,8 +136,8 @@ struct hfsc_class  	u64	cl_cumul;		/* cumulative work in bytes done by  					   real-time criteria */ -	u64 	cl_d;			/* deadline*/ -	u64 	cl_e;			/* eligible time */ +	u64	cl_d;			/* deadline*/ +	u64	cl_e;			/* eligible time */  	u64	cl_vt;			/* virtual time */  	u64	cl_f;			/* time when this class will fit for  					   link-sharing, max(myf, cfmin) */ @@ -176,8 +172,7 @@ struct hfsc_class  	unsigned long	cl_nactive;	/* number of active children */  }; -struct hfsc_sched -{ +struct hfsc_sched {  	u16	defcls;				/* default class id */  	struct hfsc_class root;			/* root class */  	struct Qdisc_class_hash clhash;		/* class hash */ @@ -693,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len)  		if (go_active) {  			n = rb_last(&cl->cl_parent->vt_tree);  			if (n != NULL) { -				max_cl = rb_entry(n, struct hfsc_class,vt_node); +				max_cl = rb_entry(n, struct hfsc_class, vt_node);  				/*  				 * set vt to the average of the min and max  				 * classes.  
if the parent's period didn't @@ -1177,8 +1172,10 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)  			return NULL;  		}  #endif -		if ((cl = (struct hfsc_class *)res.class) == NULL) { -			if ((cl = hfsc_find_class(res.classid, sch)) == NULL) +		cl = (struct hfsc_class *)res.class; +		if (!cl) { +			cl = hfsc_find_class(res.classid, sch); +			if (!cl)  				break; /* filter selected invalid classid */  			if (cl->level >= head->level)  				break; /* filter may only point downwards */ @@ -1308,7 +1305,8 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)  	tsc.m1 = sm2m(sc->sm1);  	tsc.d  = dx2d(sc->dx);  	tsc.m2 = sm2m(sc->sm2); -	NLA_PUT(skb, attr, sizeof(tsc), &tsc); +	if (nla_put(skb, attr, sizeof(tsc), &tsc)) +		goto nla_put_failure;  	return skb->len; @@ -1316,7 +1314,7 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)  	return -1;  } -static inline int +static int  hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)  {  	if ((cl->cl_flags & HFSC_RSC) && @@ -1355,8 +1353,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,  		goto nla_put_failure;  	if (hfsc_dump_curves(skb, cl) < 0)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);   nla_put_failure:  	nla_nest_cancel(skb, nest); @@ -1371,6 +1368,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,  	struct tc_hfsc_stats xstats;  	cl->qstats.qlen = cl->qdisc->q.qlen; +	cl->qstats.backlog = cl->qdisc->qstats.backlog;  	xstats.level   = cl->level;  	xstats.period  = cl->cl_vtperiod;  	xstats.work    = cl->cl_total; @@ -1390,7 +1388,6 @@ static void  hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)  {  	struct hfsc_sched *q = qdisc_priv(sch); -	struct hlist_node *n;  	struct hfsc_class *cl;  	unsigned int i; @@ -1398,7 +1395,7 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)  		return;  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], +		hlist_for_each_entry(cl, &q->clhash.hash[i],  				     cl_common.hnode) {  			if (arg->count < arg->skip) {  				arg->count++; @@ -1420,7 +1417,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch)  	struct hfsc_class *cl;  	u64 next_time = 0; -	if ((cl = eltree_get_minel(q)) != NULL) +	cl = eltree_get_minel(q); +	if (cl)  		next_time = cl->cl_e;  	if (q->root.cl_cfmin != 0) {  		if (next_time == 0 || next_time > q->root.cl_cfmin) @@ -1523,11 +1521,10 @@ hfsc_reset_qdisc(struct Qdisc *sch)  {  	struct hfsc_sched *q = qdisc_priv(sch);  	struct hfsc_class *cl; -	struct hlist_node *n;  	unsigned int i;  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) +		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)  			hfsc_reset_class(cl);  	}  	q->eligible = RB_ROOT; @@ -1540,16 +1537,16 @@ static void  hfsc_destroy_qdisc(struct Qdisc *sch)  {  	struct hfsc_sched *q = qdisc_priv(sch); -	struct hlist_node *n, *next; +	struct hlist_node *next;  	struct hfsc_class *cl;  	unsigned int i;  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) +		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)  			tcf_destroy_chain(&cl->filter_list);  	}  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], +		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],  					  cl_common.hnode)  			hfsc_destroy_class(sch, cl);  	} @@ 
-1563,9 +1560,18 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)  	struct hfsc_sched *q = qdisc_priv(sch);  	unsigned char *b = skb_tail_pointer(skb);  	struct tc_hfsc_qopt qopt; +	struct hfsc_class *cl; +	unsigned int i; + +	sch->qstats.backlog = 0; +	for (i = 0; i < q->clhash.hashsize; i++) { +		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) +			sch->qstats.backlog += cl->qdisc->qstats.backlog; +	}  	qopt.defcls = q->defcls; -	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); +	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) +		goto nla_put_failure;  	return skb->len;   nla_put_failure: @@ -1599,10 +1605,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	if (cl->qdisc->q.qlen == 1)  		set_active(cl, qdisc_pkt_len(skb)); -	cl->bstats.packets++; -	cl->bstats.bytes += qdisc_pkt_len(skb); -	sch->bstats.packets++; -	sch->bstats.bytes += qdisc_pkt_len(skb);  	sch->q.qlen++;  	return NET_XMIT_SUCCESS; @@ -1628,7 +1630,8 @@ hfsc_dequeue(struct Qdisc *sch)  	 * find the class with the minimum deadline among  	 * the eligible classes.  	 */ -	if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { +	cl = eltree_get_mindl(q, cur_time); +	if (cl) {  		realtime = 1;  	} else {  		/* @@ -1649,6 +1652,7 @@ hfsc_dequeue(struct Qdisc *sch)  		return NULL;  	} +	bstats_update(&cl->bstats, skb);  	update_vf(cl, qdisc_pkt_len(skb), cur_time);  	if (realtime)  		cl->cl_cumul += qdisc_pkt_len(skb); @@ -1667,7 +1671,8 @@ hfsc_dequeue(struct Qdisc *sch)  		set_passive(cl);  	} -	sch->flags &= ~TCQ_F_THROTTLED; +	qdisc_unthrottled(sch); +	qdisc_bstats_update(sch, skb);  	sch->q.qlen--;  	return skb; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c new file mode 100644 index 00000000000..d85b6812a7d --- /dev/null +++ b/net/sched/sch_hhf.c @@ -0,0 +1,740 @@ +/* net/sched/sch_hhf.c		Heavy-Hitter Filter (HHF) + * + * Copyright (C) 2013 Terry Lam <vtlam@google.com> + * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> + */ + +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <net/flow_keys.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/*	Heavy-Hitter Filter (HHF) + * + * Principles : + * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter + * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified + * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. + * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, + * in which the heavy-hitter bucket is served with less weight. + * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) + * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have + * higher share of bandwidth. + * + * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the + * following paper: + * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and + * Accounting", in ACM SIGCOMM, 2002. + * + * Conceptually, a multi-stage filter comprises k independent hash functions + * and k counter arrays. Packets are indexed into k counter arrays by k hash + * functions, respectively. The counters are then increased by the packet sizes. + * Therefore, + *    - For a heavy-hitter flow: *all* of its k array counters must be large. 
+ *    - For a non-heavy-hitter flow: some of its k array counters can be large + *      due to hash collision with other small flows; however, with high + *      probability, not *all* k counters are large. + * + * By the design of the multi-stage filter algorithm, the false negative rate + * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is + * susceptible to false positives (non-heavy-hitters mistakenly classified as + * heavy-hitters). + * Therefore, we also implement the following optimizations to reduce false + * positives by avoiding unnecessary increment of the counter values: + *    - Optimization O1: once a heavy-hitter is identified, its bytes are not + *        accounted in the array counters. This technique is called "shielding" + *        in Section 3.3.1 of [EV02]. + *    - Optimization O2: conservative update of counters + *                       (Section 3.3.2 of [EV02]), + *        New counter value = max {old counter value, + *                                 smallest counter value + packet bytes} + * + * Finally, we refresh the counters periodically since otherwise the counter + * values will keep accumulating. + * + * Once a flow is classified as heavy-hitter, we also save its per-flow state + * in an exact-matching flow table so that its subsequent packets can be + * dispatched to the heavy-hitter bucket accordingly. + * + * + * At a high level, this qdisc works as follows: + * Given a packet p: + *   - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching + *     heavy-hitter flow table, denoted table T, then send p to the heavy-hitter + *     bucket. + *   - Otherwise, forward p to the multi-stage filter, denoted filter F + *        + If F decides that p belongs to a non-heavy-hitter flow, then send p + *          to the non-heavy-hitter bucket. + *        + Otherwise, if F decides that p belongs to a new heavy-hitter flow, + *          then set up a new flow entry for the flow-id of p in the table T and + *          send p to the heavy-hitter bucket. + * + * In this implementation: + *   - T is a fixed-size hash-table with 1024 entries. Hash collision is + *     resolved by linked-list chaining. + *   - F has four counter arrays, each array containing 1024 32-bit counters. + *     That means 4 * 1024 * 32 bits = 16KB of memory. + *   - Since each array in F contains 1024 counters, 10 bits are sufficient to + *     index into each array. + *     Hence, instead of having four hash functions, we chop the 32-bit + *     skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is + *     computed as XOR sum of those three chunks. + *   - We need to clear the counter arrays periodically; however, directly + *     memsetting 16KB of memory can lead to cache eviction and unwanted delay. + *     So by representing each counter by a valid bit, we only need to reset + *     4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. + *   - The Deficit Round Robin engine is taken from fq_codel implementation + *     (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to + *     fq_codel_flow in fq_codel implementation. 
+ * + */ + +/* Non-configurable parameters */ +#define HH_FLOWS_CNT	 1024  /* number of entries in exact-matching table T */ +#define HHF_ARRAYS_CNT	 4     /* number of arrays in multi-stage filter F */ +#define HHF_ARRAYS_LEN	 1024  /* number of counters in each array of F */ +#define HHF_BIT_MASK_LEN 10    /* masking 10 bits */ +#define HHF_BIT_MASK	 0x3FF /* bitmask of 10 bits */ + +#define WDRR_BUCKET_CNT  2     /* two buckets for Weighted DRR */ +enum wdrr_bucket_idx { +	WDRR_BUCKET_FOR_HH	= 0, /* bucket id for heavy-hitters */ +	WDRR_BUCKET_FOR_NON_HH	= 1  /* bucket id for non-heavy-hitters */ +}; + +#define hhf_time_before(a, b)	\ +	(typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) + +/* Heavy-hitter per-flow state */ +struct hh_flow_state { +	u32		 hash_id;	/* hash of flow-id (e.g. TCP 5-tuple) */ +	u32		 hit_timestamp;	/* last time heavy-hitter was seen */ +	struct list_head flowchain;	/* chaining under hash collision */ +}; + +/* Weighted Deficit Round Robin (WDRR) scheduler */ +struct wdrr_bucket { +	struct sk_buff	  *head; +	struct sk_buff	  *tail; +	struct list_head  bucketchain; +	int		  deficit; +}; + +struct hhf_sched_data { +	struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; +	u32		   perturbation;   /* hash perturbation */ +	u32		   quantum;        /* psched_mtu(qdisc_dev(sch)); */ +	u32		   drop_overlimit; /* number of times max qdisc packet +					    * limit was hit +					    */ +	struct list_head   *hh_flows;       /* table T (currently active HHs) */ +	u32		   hh_flows_limit;            /* max active HH allocs */ +	u32		   hh_flows_overlimit; /* num of disallowed HH allocs */ +	u32		   hh_flows_total_cnt;          /* total admitted HHs */ +	u32		   hh_flows_current_cnt;        /* total current HHs  */ +	u32		   *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ +	u32		   hhf_arrays_reset_timestamp;  /* last time hhf_arrays +							 * was reset +							 */ +	unsigned long	   *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits +							     * of hhf_arrays +							     */ +	/* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ +	struct list_head   new_buckets; /* list of new buckets */ +	struct list_head   old_buckets; /* list of old buckets */ + +	/* Configurable HHF parameters */ +	u32		   hhf_reset_timeout; /* interval to reset counter +					       * arrays in filter F +					       * (default 40ms) +					       */ +	u32		   hhf_admit_bytes;   /* counter thresh to classify as +					       * HH (default 128KB). +					       * With these default values, +					       * 128KB / 40ms = 25 Mbps +					       * i.e., we expect to capture HHs +					       * sending > 25 Mbps. +					       */ +	u32		   hhf_evict_timeout; /* aging threshold to evict idle +					       * HHs out of table T. This should +					       * be large enough to avoid +					       * reordering during HH eviction. 
+					       * (default 1s) +					       */ +	u32		   hhf_non_hh_weight; /* WDRR weight for non-HHs +					       * (default 2, +					       *  i.e., non-HH : HH = 2 : 1) +					       */ +}; + +static u32 hhf_time_stamp(void) +{ +	return jiffies; +} + +static unsigned int skb_hash(const struct hhf_sched_data *q, +			     const struct sk_buff *skb) +{ +	struct flow_keys keys; +	unsigned int hash; + +	if (skb->sk && skb->sk->sk_hash) +		return skb->sk->sk_hash; + +	skb_flow_dissect(skb, &keys); +	hash = jhash_3words((__force u32)keys.dst, +			    (__force u32)keys.src ^ keys.ip_proto, +			    (__force u32)keys.ports, q->perturbation); +	return hash; +} + +/* Looks up a heavy-hitter flow in a chaining list of table T. */ +static struct hh_flow_state *seek_list(const u32 hash, +				       struct list_head *head, +				       struct hhf_sched_data *q) +{ +	struct hh_flow_state *flow, *next; +	u32 now = hhf_time_stamp(); + +	if (list_empty(head)) +		return NULL; + +	list_for_each_entry_safe(flow, next, head, flowchain) { +		u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + +		if (hhf_time_before(prev, now)) { +			/* Delete expired heavy-hitters, but preserve one entry +			 * to avoid kzalloc() when next time this slot is hit. +			 */ +			if (list_is_last(&flow->flowchain, head)) +				return NULL; +			list_del(&flow->flowchain); +			kfree(flow); +			q->hh_flows_current_cnt--; +		} else if (flow->hash_id == hash) { +			return flow; +		} +	} +	return NULL; +} + +/* Returns a flow state entry for a new heavy-hitter.  Either reuses an expired + * entry or dynamically alloc a new entry. + */ +static struct hh_flow_state *alloc_new_hh(struct list_head *head, +					  struct hhf_sched_data *q) +{ +	struct hh_flow_state *flow; +	u32 now = hhf_time_stamp(); + +	if (!list_empty(head)) { +		/* Find an expired heavy-hitter flow entry. */ +		list_for_each_entry(flow, head, flowchain) { +			u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; + +			if (hhf_time_before(prev, now)) +				return flow; +		} +	} + +	if (q->hh_flows_current_cnt >= q->hh_flows_limit) { +		q->hh_flows_overlimit++; +		return NULL; +	} +	/* Create new entry. */ +	flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); +	if (!flow) +		return NULL; + +	q->hh_flows_current_cnt++; +	INIT_LIST_HEAD(&flow->flowchain); +	list_add_tail(&flow->flowchain, head); + +	return flow; +} + +/* Assigns packets to WDRR buckets.  Implements a multi-stage filter to + * classify heavy-hitters. + */ +static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	u32 tmp_hash, hash; +	u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; +	struct hh_flow_state *flow; +	u32 pkt_len, min_hhf_val; +	int i; +	u32 prev; +	u32 now = hhf_time_stamp(); + +	/* Reset the HHF counter arrays if this is the right time. */ +	prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; +	if (hhf_time_before(prev, now)) { +		for (i = 0; i < HHF_ARRAYS_CNT; i++) +			bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); +		q->hhf_arrays_reset_timestamp = now; +	} + +	/* Get hashed flow-id of the skb. */ +	hash = skb_hash(q, skb); + +	/* Check if this packet belongs to an already established HH flow. */ +	flow_pos = hash & HHF_BIT_MASK; +	flow = seek_list(hash, &q->hh_flows[flow_pos], q); +	if (flow) { /* found its HH flow */ +		flow->hit_timestamp = now; +		return WDRR_BUCKET_FOR_HH; +	} + +	/* Now pass the packet through the multi-stage filter. 
*/ +	tmp_hash = hash; +	xorsum = 0; +	for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { +		/* Split the skb_hash into three 10-bit chunks. */ +		filter_pos[i] = tmp_hash & HHF_BIT_MASK; +		xorsum ^= filter_pos[i]; +		tmp_hash >>= HHF_BIT_MASK_LEN; +	} +	/* The last chunk is computed as XOR sum of other chunks. */ +	filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; + +	pkt_len = qdisc_pkt_len(skb); +	min_hhf_val = ~0U; +	for (i = 0; i < HHF_ARRAYS_CNT; i++) { +		u32 val; + +		if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { +			q->hhf_arrays[i][filter_pos[i]] = 0; +			__set_bit(filter_pos[i], q->hhf_valid_bits[i]); +		} + +		val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; +		if (min_hhf_val > val) +			min_hhf_val = val; +	} + +	/* Found a new HH iff all counter values > HH admit threshold. */ +	if (min_hhf_val > q->hhf_admit_bytes) { +		/* Just captured a new heavy-hitter. */ +		flow = alloc_new_hh(&q->hh_flows[flow_pos], q); +		if (!flow) /* memory alloc problem */ +			return WDRR_BUCKET_FOR_NON_HH; +		flow->hash_id = hash; +		flow->hit_timestamp = now; +		q->hh_flows_total_cnt++; + +		/* By returning without updating counters in q->hhf_arrays, +		 * we implicitly implement "shielding" (see Optimization O1). +		 */ +		return WDRR_BUCKET_FOR_HH; +	} + +	/* Conservative update of HHF arrays (see Optimization O2). */ +	for (i = 0; i < HHF_ARRAYS_CNT; i++) { +		if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) +			q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; +	} +	return WDRR_BUCKET_FOR_NON_HH; +} + +/* Removes one skb from head of bucket. */ +static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) +{ +	struct sk_buff *skb = bucket->head; + +	bucket->head = skb->next; +	skb->next = NULL; +	return skb; +} + +/* Tail-adds skb to bucket. */ +static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) +{ +	if (bucket->head == NULL) +		bucket->head = skb; +	else +		bucket->tail->next = skb; +	bucket->tail = skb; +	skb->next = NULL; +} + +static unsigned int hhf_drop(struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct wdrr_bucket *bucket; + +	/* Always try to drop from heavy-hitters first. */ +	bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; +	if (!bucket->head) +		bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; + +	if (bucket->head) { +		struct sk_buff *skb = dequeue_head(bucket); + +		sch->q.qlen--; +		sch->qstats.drops++; +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		kfree_skb(skb); +	} + +	/* Return id of the bucket from which the packet was dropped. */ +	return bucket - q->buckets; +} + +static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	enum wdrr_bucket_idx idx; +	struct wdrr_bucket *bucket; + +	idx = hhf_classify(skb, sch); + +	bucket = &q->buckets[idx]; +	bucket_add(bucket, skb); +	sch->qstats.backlog += qdisc_pkt_len(skb); + +	if (list_empty(&bucket->bucketchain)) { +		unsigned int weight; + +		/* The logic of new_buckets vs. old_buckets is the same as +		 * new_flows vs. old_flows in the implementation of fq_codel, +		 * i.e., short bursts of non-HHs should have strict priority. +		 */ +		if (idx == WDRR_BUCKET_FOR_HH) { +			/* Always move heavy-hitters to old bucket. 
*/ +			weight = 1; +			list_add_tail(&bucket->bucketchain, &q->old_buckets); +		} else { +			weight = q->hhf_non_hh_weight; +			list_add_tail(&bucket->bucketchain, &q->new_buckets); +		} +		bucket->deficit = weight * q->quantum; +	} +	if (++sch->q.qlen <= sch->limit) +		return NET_XMIT_SUCCESS; + +	q->drop_overlimit++; +	/* Return Congestion Notification only if we dropped a packet from this +	 * bucket. +	 */ +	if (hhf_drop(sch) == idx) +		return NET_XMIT_CN; + +	/* As we dropped a packet, better let upper stack know this. */ +	qdisc_tree_decrease_qlen(sch, 1); +	return NET_XMIT_SUCCESS; +} + +static struct sk_buff *hhf_dequeue(struct Qdisc *sch) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb = NULL; +	struct wdrr_bucket *bucket; +	struct list_head *head; + +begin: +	head = &q->new_buckets; +	if (list_empty(head)) { +		head = &q->old_buckets; +		if (list_empty(head)) +			return NULL; +	} +	bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); + +	if (bucket->deficit <= 0) { +		int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? +			      1 : q->hhf_non_hh_weight; + +		bucket->deficit += weight * q->quantum; +		list_move_tail(&bucket->bucketchain, &q->old_buckets); +		goto begin; +	} + +	if (bucket->head) { +		skb = dequeue_head(bucket); +		sch->q.qlen--; +		sch->qstats.backlog -= qdisc_pkt_len(skb); +	} + +	if (!skb) { +		/* Force a pass through old_buckets to prevent starvation. */ +		if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) +			list_move_tail(&bucket->bucketchain, &q->old_buckets); +		else +			list_del_init(&bucket->bucketchain); +		goto begin; +	} +	qdisc_bstats_update(sch, skb); +	bucket->deficit -= qdisc_pkt_len(skb); + +	return skb; +} + +static void hhf_reset(struct Qdisc *sch) +{ +	struct sk_buff *skb; + +	while ((skb = hhf_dequeue(sch)) != NULL) +		kfree_skb(skb); +} + +static void *hhf_zalloc(size_t sz) +{ +	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + +	if (!ptr) +		ptr = vzalloc(sz); + +	return ptr; +} + +static void hhf_free(void *addr) +{ +	kvfree(addr); +} + +static void hhf_destroy(struct Qdisc *sch) +{ +	int i; +	struct hhf_sched_data *q = qdisc_priv(sch); + +	for (i = 0; i < HHF_ARRAYS_CNT; i++) { +		hhf_free(q->hhf_arrays[i]); +		hhf_free(q->hhf_valid_bits[i]); +	} + +	for (i = 0; i < HH_FLOWS_CNT; i++) { +		struct hh_flow_state *flow, *next; +		struct list_head *head = &q->hh_flows[i]; + +		if (list_empty(head)) +			continue; +		list_for_each_entry_safe(flow, next, head, flowchain) { +			list_del(&flow->flowchain); +			kfree(flow); +		} +	} +	hhf_free(q->hh_flows); +} + +static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { +	[TCA_HHF_BACKLOG_LIMIT]	 = { .type = NLA_U32 }, +	[TCA_HHF_QUANTUM]	 = { .type = NLA_U32 }, +	[TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, +	[TCA_HHF_RESET_TIMEOUT]	 = { .type = NLA_U32 }, +	[TCA_HHF_ADMIT_BYTES]	 = { .type = NLA_U32 }, +	[TCA_HHF_EVICT_TIMEOUT]	 = { .type = NLA_U32 }, +	[TCA_HHF_NON_HH_WEIGHT]	 = { .type = NLA_U32 }, +}; + +static int hhf_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_HHF_MAX + 1]; +	unsigned int qlen; +	int err; +	u64 non_hh_quantum; +	u32 new_quantum = q->quantum; +	u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; + +	if (!opt) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); +	if (err < 0) +		return err; + +	if (tb[TCA_HHF_QUANTUM]) +		new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); + +	if (tb[TCA_HHF_NON_HH_WEIGHT]) +		
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); + +	non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; +	if (non_hh_quantum > INT_MAX) +		return -EINVAL; + +	sch_tree_lock(sch); + +	if (tb[TCA_HHF_BACKLOG_LIMIT]) +		sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + +	q->quantum = new_quantum; +	q->hhf_non_hh_weight = new_hhf_non_hh_weight; + +	if (tb[TCA_HHF_HH_FLOWS_LIMIT]) +		q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + +	if (tb[TCA_HHF_RESET_TIMEOUT]) { +		u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); + +		q->hhf_reset_timeout = usecs_to_jiffies(us); +	} + +	if (tb[TCA_HHF_ADMIT_BYTES]) +		q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + +	if (tb[TCA_HHF_EVICT_TIMEOUT]) { +		u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); + +		q->hhf_evict_timeout = usecs_to_jiffies(us); +	} + +	qlen = sch->q.qlen; +	while (sch->q.qlen > sch->limit) { +		struct sk_buff *skb = hhf_dequeue(sch); + +		kfree_skb(skb); +	} +	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + +	sch_tree_unlock(sch); +	return 0; +} + +static int hhf_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	int i; + +	sch->limit = 1000; +	q->quantum = psched_mtu(qdisc_dev(sch)); +	q->perturbation = prandom_u32(); +	INIT_LIST_HEAD(&q->new_buckets); +	INIT_LIST_HEAD(&q->old_buckets); + +	/* Configurable HHF parameters */ +	q->hhf_reset_timeout = HZ / 25; /* 40  ms */ +	q->hhf_admit_bytes = 131072;    /* 128 KB */ +	q->hhf_evict_timeout = HZ;      /* 1  sec */ +	q->hhf_non_hh_weight = 2; + +	if (opt) { +		int err = hhf_change(sch, opt); + +		if (err) +			return err; +	} + +	if (!q->hh_flows) { +		/* Initialize heavy-hitter flow table. */ +		q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * +					 sizeof(struct list_head)); +		if (!q->hh_flows) +			return -ENOMEM; +		for (i = 0; i < HH_FLOWS_CNT; i++) +			INIT_LIST_HEAD(&q->hh_flows[i]); + +		/* Cap max active HHs at twice len of hh_flows table. */ +		q->hh_flows_limit = 2 * HH_FLOWS_CNT; +		q->hh_flows_overlimit = 0; +		q->hh_flows_total_cnt = 0; +		q->hh_flows_current_cnt = 0; + +		/* Initialize heavy-hitter filter arrays. */ +		for (i = 0; i < HHF_ARRAYS_CNT; i++) { +			q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * +						      sizeof(u32)); +			if (!q->hhf_arrays[i]) { +				hhf_destroy(sch); +				return -ENOMEM; +			} +		} +		q->hhf_arrays_reset_timestamp = hhf_time_stamp(); + +		/* Initialize valid bits of heavy-hitter filter arrays. */ +		for (i = 0; i < HHF_ARRAYS_CNT; i++) { +			q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / +							  BITS_PER_BYTE); +			if (!q->hhf_valid_bits[i]) { +				hhf_destroy(sch); +				return -ENOMEM; +			} +		} + +		/* Initialize Weighted DRR buckets. 
*/ +		for (i = 0; i < WDRR_BUCKET_CNT; i++) { +			struct wdrr_bucket *bucket = q->buckets + i; + +			INIT_LIST_HEAD(&bucket->bucketchain); +		} +	} + +	return 0; +} + +static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; + +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || +	    nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || +	    nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || +	    nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, +			jiffies_to_usecs(q->hhf_reset_timeout)) || +	    nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || +	    nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, +			jiffies_to_usecs(q->hhf_evict_timeout)) || +	    nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) +		goto nla_put_failure; + +	return nla_nest_end(skb, opts); + +nla_put_failure: +	return -1; +} + +static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct hhf_sched_data *q = qdisc_priv(sch); +	struct tc_hhf_xstats st = { +		.drop_overlimit = q->drop_overlimit, +		.hh_overlimit	= q->hh_flows_overlimit, +		.hh_tot_count	= q->hh_flows_total_cnt, +		.hh_cur_count	= q->hh_flows_current_cnt, +	}; + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { +	.id		=	"hhf", +	.priv_size	=	sizeof(struct hhf_sched_data), + +	.enqueue	=	hhf_enqueue, +	.dequeue	=	hhf_dequeue, +	.peek		=	qdisc_peek_dequeued, +	.drop		=	hhf_drop, +	.init		=	hhf_init, +	.reset		=	hhf_reset, +	.destroy	=	hhf_destroy, +	.change		=	hhf_change, +	.dump		=	hhf_dump, +	.dump_stats	=	hhf_dump_stats, +	.owner		=	THIS_MODULE, +}; + +static int __init hhf_module_init(void) +{ +	return register_qdisc(&hhf_qdisc_ops); +} + +static void __exit hhf_module_exit(void) +{ +	unregister_qdisc(&hhf_qdisc_ops); +} + +module_init(hhf_module_init) +module_exit(hhf_module_exit) +MODULE_AUTHOR("Terry Lam"); +MODULE_AUTHOR("Nandita Dukkipati"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 01b519d6c52..9f949abcace 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -38,6 +38,7 @@  #include <linux/workqueue.h>  #include <linux/slab.h>  #include <net/netlink.h> +#include <net/sch_generic.h>  #include <net/pkt_sched.h>  /* HTB algorithm. @@ -64,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f  module_param    (htb_hysteresis, int, 0640);  MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate"); +static int htb_rate_est = 0; /* htb classes have a default rate estimator */ +module_param(htb_rate_est, int, 0640); +MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes"); +  /* used internaly to keep status of single class */  enum htb_cmode {  	HTB_CANT_SEND,		/* class can't send and can't borrow */ @@ -71,94 +76,105 @@ enum htb_cmode {  	HTB_CAN_SEND		/* class can send */  }; -/* interior & leaf nodes; props specific to leaves are marked L: */ +struct htb_prio { +	union { +		struct rb_root	row; +		struct rb_root	feed; +	}; +	struct rb_node	*ptr; +	/* When class changes from state 1->2 and disconnects from +	 * parent's feed then we lost ptr value and start from the +	 * first child again. Here we store classid of the +	 * last valid ptr (used when ptr is NULL). 
+	 */ +	u32		last_ptr_id; +}; + +/* interior & leaf nodes; props specific to leaves are marked L: + * To reduce false sharing, place mostly read fields at beginning, + * and mostly written ones at the end. + */  struct htb_class {  	struct Qdisc_class_common common; -	/* general class parameters */ -	struct gnet_stats_basic_packed bstats; -	struct gnet_stats_queue qstats; -	struct gnet_stats_rate_est rate_est; -	struct tc_htb_xstats xstats;	/* our special stats */ -	int refcnt;		/* usage count of this class */ +	struct psched_ratecfg	rate; +	struct psched_ratecfg	ceil; +	s64			buffer, cbuffer;/* token bucket depth/rate */ +	s64			mbuffer;	/* max wait time */ +	u32			prio;		/* these two are used only by leaves... */ +	int			quantum;	/* but stored for parent-to-leaf return */ + +	struct tcf_proto	*filter_list;	/* class attached filters */ +	int			filter_cnt; +	int			refcnt;		/* usage count of this class */ -	/* topology */ -	int level;		/* our level (see above) */ -	unsigned int children; -	struct htb_class *parent;	/* parent class */ +	int			level;		/* our level (see above) */ +	unsigned int		children; +	struct htb_class	*parent;	/* parent class */ -	int prio;		/* these two are used only by leaves... */ -	int quantum;		/* but stored for parent-to-leaf return */ +	struct gnet_stats_rate_est64 rate_est; + +	/* +	 * Written often fields +	 */ +	struct gnet_stats_basic_packed bstats; +	struct gnet_stats_queue	qstats; +	struct tc_htb_xstats	xstats;	/* our special stats */ + +	/* token bucket parameters */ +	s64			tokens, ctokens;/* current number of tokens */ +	s64			t_c;		/* checkpoint time */  	union {  		struct htb_class_leaf { -			struct Qdisc *q; -			int deficit[TC_HTB_MAXDEPTH];  			struct list_head drop_list; +			int		deficit[TC_HTB_MAXDEPTH]; +			struct Qdisc	*q;  		} leaf;  		struct htb_class_inner { -			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */ -			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */ -			/* When class changes from state 1->2 and disconnects from -			   parent's feed then we lost ptr value and start from the -			   first child again. Here we store classid of the -			   last valid ptr (used when ptr is NULL). 
*/ -			u32 last_ptr_id[TC_HTB_NUMPRIO]; +			struct htb_prio clprio[TC_HTB_NUMPRIO];  		} inner;  	} un; -	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */ -	struct rb_node pq_node;	/* node for event queue */ -	psched_time_t pq_key; - -	int prio_activity;	/* for which prios are we active */ -	enum htb_cmode cmode;	/* current mode of the class */ +	s64			pq_key; -	/* class attached filters */ -	struct tcf_proto *filter_list; -	int filter_cnt; +	int			prio_activity;	/* for which prios are we active */ +	enum htb_cmode		cmode;		/* current mode of the class */ +	struct rb_node		pq_node;	/* node for event queue */ +	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */ +}; -	/* token bucket parameters */ -	struct qdisc_rate_table *rate;	/* rate table of the class itself */ -	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */ -	long buffer, cbuffer;	/* token bucket depth/rate */ -	psched_tdiff_t mbuffer;	/* max wait time */ -	long tokens, ctokens;	/* current number of tokens */ -	psched_time_t t_c;	/* checkpoint time */ +struct htb_level { +	struct rb_root	wait_pq; +	struct htb_prio hprio[TC_HTB_NUMPRIO];  };  struct htb_sched {  	struct Qdisc_class_hash clhash; -	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ +	int			defcls;		/* class where unclassified flows go to */ +	int			rate2quantum;	/* quant = rate / rate2quantum */ -	/* self list - roots of self generating tree */ -	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; -	int row_mask[TC_HTB_MAXDEPTH]; -	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; -	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - -	/* self wait list - roots of wait PQs per row */ -	struct rb_root wait_pq[TC_HTB_MAXDEPTH]; +	/* filters for qdisc itself */ +	struct tcf_proto	*filter_list; -	/* time of nearest event per level (row) */ -	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH]; +#define HTB_WARN_TOOMANYEVENTS	0x1 +	unsigned int		warned;	/* only one warning */ +	int			direct_qlen; +	struct work_struct	work; -	int defcls;		/* class where unclassified flows go to */ +	/* non shaped skbs; let them go directly thru */ +	struct sk_buff_head	direct_queue; +	long			direct_pkts; -	/* filters for qdisc itself */ -	struct tcf_proto *filter_list; +	struct qdisc_watchdog	watchdog; -	int rate2quantum;	/* quant = rate / rate2quantum */ -	psched_time_t now;	/* cached dequeue time */ -	struct qdisc_watchdog watchdog; +	s64			now;	/* cached dequeue time */ +	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ -	/* non shaped skbs; let them go directly thru */ -	struct sk_buff_head direct_queue; -	int direct_qlen;	/* max qlen of above */ +	/* time of nearest event per level (row) */ +	s64			near_ev_cache[TC_HTB_MAXDEPTH]; -	long direct_pkts; +	int			row_mask[TC_HTB_MAXDEPTH]; -#define HTB_WARN_TOOMANYEVENTS	0x1 -	unsigned int warned;	/* only one warning */ -	struct work_struct work; +	struct htb_level	hlevel[TC_HTB_MAXDEPTH];  };  /* find class in global hash table using given handle */ @@ -182,10 +198,10 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)   * filters in qdisc and in inner nodes (if higher filter points to the inner   * node). If we end up with classid MAJOR:0 we enqueue the skb into special   * internal fifo (direct). These packets then go directly thru. If we still - * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull + * have no valid leaf we try to use MAJOR:default leaf. 
It still unsuccessful   * then finish and return direct queue.   */ -#define HTB_DIRECT (struct htb_class*)-1 +#define HTB_DIRECT ((struct htb_class *)-1L)  static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,  				      int *qerr) @@ -197,15 +213,22 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,  	int result;  	/* allow to select class by setting skb->priority to valid classid; -	   note that nfmark can be used too by attaching filter fw with no -	   rules in it */ +	 * note that nfmark can be used too by attaching filter fw with no +	 * rules in it +	 */  	if (skb->priority == sch->handle)  		return HTB_DIRECT;	/* X:0 (direct flow) selected */ -	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0) -		return cl; +	cl = htb_find(skb->priority, sch); +	if (cl) { +		if (cl->level == 0) +			return cl; +		/* Start with inner filter chain if a non-leaf class is selected */ +		tcf = cl->filter_list; +	} else { +		tcf = q->filter_list; +	}  	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -	tcf = q->filter_list;  	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {  #ifdef CONFIG_NET_CLS_ACT  		switch (result) { @@ -216,10 +239,12 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,  			return NULL;  		}  #endif -		if ((cl = (void *)res.class) == NULL) { +		cl = (void *)res.class; +		if (!cl) {  			if (res.classid == sch->handle)  				return HTB_DIRECT;	/* X:0 (direct flow) */ -			if ((cl = htb_find(res.classid, sch)) == NULL) +			cl = htb_find(res.classid, sch); +			if (!cl)  				break;	/* filter selected invalid classid */  		}  		if (!cl->level) @@ -268,9 +293,9 @@ static void htb_add_to_id_tree(struct rb_root *root,   * already in the queue.   */  static void htb_add_to_wait_tree(struct htb_sched *q, -				 struct htb_class *cl, long delay) +				 struct htb_class *cl, s64 delay)  { -	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; +	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;  	cl->pq_key = q->now + delay;  	if (cl->pq_key == q->now) @@ -290,7 +315,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q,  			p = &parent->rb_left;  	}  	rb_link_node(&cl->pq_node, parent, p); -	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); +	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);  }  /** @@ -317,7 +342,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q,  	while (mask) {  		int prio = ffz(~mask);  		mask &= ~(1 << prio); -		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio); +		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);  	}  } @@ -343,16 +368,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q,  						 struct htb_class *cl, int mask)  {  	int m = 0; +	struct htb_level *hlevel = &q->hlevel[cl->level];  	while (mask) {  		int prio = ffz(~mask); +		struct htb_prio *hprio = &hlevel->hprio[prio];  		mask &= ~(1 << prio); -		if (q->ptr[cl->level][prio] == cl->node + prio) -			htb_next_rb_node(q->ptr[cl->level] + prio); +		if (hprio->ptr == cl->node + prio) +			htb_next_rb_node(&hprio->ptr); -		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio); -		if (!q->row[cl->level][prio].rb_node) +		htb_safe_rb_erase(cl->node + prio, &hprio->row); +		if (!hprio->row.rb_node)  			m |= 1 << prio;  	}  	q->row_mask[cl->level] &= ~m; @@ -376,12 +403,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)  			int prio = ffz(~m);  			m &= ~(1 << 
prio); -			if (p->un.inner.feed[prio].rb_node) +			if (p->un.inner.clprio[prio].feed.rb_node)  				/* parent already has its feed in use so that -				   reset bit in mask as parent is already ok */ +				 * reset bit in mask as parent is already ok +				 */  				mask &= ~(1 << prio); -			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio); +			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);  		}  		p->prio_activity |= mask;  		cl = p; @@ -411,17 +439,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)  			int prio = ffz(~m);  			m &= ~(1 << prio); -			if (p->un.inner.ptr[prio] == cl->node + prio) { +			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {  				/* we are removing child which is pointed to from -				   parent feed - forget the pointer but remember -				   classid */ -				p->un.inner.last_ptr_id[prio] = cl->common.classid; -				p->un.inner.ptr[prio] = NULL; +				 * parent feed - forget the pointer but remember +				 * classid +				 */ +				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; +				p->un.inner.clprio[prio].ptr = NULL;  			} -			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio); +			htb_safe_rb_erase(cl->node + prio, +					  &p->un.inner.clprio[prio].feed); -			if (!p->un.inner.feed[prio].rb_node) +			if (!p->un.inner.clprio[prio].feed.rb_node)  				mask |= 1 << prio;  		} @@ -434,14 +464,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)  		htb_remove_class_from_row(q, cl, mask);  } -static inline long htb_lowater(const struct htb_class *cl) +static inline s64 htb_lowater(const struct htb_class *cl)  {  	if (htb_hysteresis)  		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;  	else  		return 0;  } -static inline long htb_hiwater(const struct htb_class *cl) +static inline s64 htb_hiwater(const struct htb_class *cl)  {  	if (htb_hysteresis)  		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0; @@ -462,9 +492,9 @@ static inline long htb_hiwater(const struct htb_class *cl)   * mode transitions per time unit. The speed gain is about 1/6.   */  static inline enum htb_cmode -htb_class_mode(struct htb_class *cl, long *diff) +htb_class_mode(struct htb_class *cl, s64 *diff)  { -	long toks; +	s64 toks;  	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {  		*diff = -toks; @@ -488,7 +518,7 @@ htb_class_mode(struct htb_class *cl, long *diff)   * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).   
*/  static void -htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)  {  	enum htb_cmode new_mode = htb_class_mode(cl, diff); @@ -551,9 +581,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)  			__skb_queue_tail(&q->direct_queue, skb);  			q->direct_pkts++;  		} else { -			kfree_skb(skb); -			sch->qstats.drops++; -			return NET_XMIT_DROP; +			return qdisc_drop(skb, sch);  		}  #ifdef CONFIG_NET_CLS_ACT  	} else if (!cl) { @@ -569,38 +597,33 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)  		}  		return ret;  	} else { -		cl->bstats.packets += -			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1; -		cl->bstats.bytes += qdisc_pkt_len(skb);  		htb_activate(q, cl);  	}  	sch->q.qlen++; -	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1; -	sch->bstats.bytes += qdisc_pkt_len(skb);  	return NET_XMIT_SUCCESS;  } -static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)  { -	long toks = diff + cl->tokens; +	s64 toks = diff + cl->tokens;  	if (toks > cl->buffer)  		toks = cl->buffer; -	toks -= (long) qdisc_l2t(cl->rate, bytes); +	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);  	if (toks <= -cl->mbuffer)  		toks = 1 - cl->mbuffer;  	cl->tokens = toks;  } -static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)  { -	long toks = diff + cl->ctokens; +	s64 toks = diff + cl->ctokens;  	if (toks > cl->cbuffer)  		toks = cl->cbuffer; -	toks -= (long) qdisc_l2t(cl->ceil, bytes); +	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);  	if (toks <= -cl->mbuffer)  		toks = 1 - cl->mbuffer; @@ -623,10 +646,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,  {  	int bytes = qdisc_pkt_len(skb);  	enum htb_cmode old_mode; -	long diff; +	s64 diff;  	while (cl) { -		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); +		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);  		if (cl->level >= level) {  			if (cl->level == level)  				cl->xstats.lends++; @@ -643,17 +666,15 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,  		htb_change_class_mode(q, cl, &diff);  		if (old_mode != cl->cmode) {  			if (old_mode != HTB_CAN_SEND) -				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); +				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);  			if (cl->cmode != HTB_CAN_SEND)  				htb_add_to_wait_tree(q, cl, diff);  		} -		/* update byte stats except for leaves which are already updated */ -		if (cl->level) { -			cl->bstats.bytes += bytes; -			cl->bstats.packets += skb_is_gso(skb)? -					skb_shinfo(skb)->gso_segs:1; -		} +		/* update basic stats except for leaves which are already updated */ +		if (cl->level) +			bstats_update(&cl->bstats, skb); +  		cl = cl->parent;  	}  } @@ -665,17 +686,20 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,   * next pending event (0 for no event in pq, q->now for too many events).   * Note: Applied are events whose have cl->pq_key <= q->now.   
*/ -static psched_time_t htb_do_events(struct htb_sched *q, int level, -				   unsigned long start) +static s64 htb_do_events(struct htb_sched *q, const int level, +			 unsigned long start)  {  	/* don't run for longer than 2 jiffies; 2 is used instead of -	   1 to simplify things when jiffy is going to be incremented -	   too soon */ +	 * 1 to simplify things when jiffy is going to be incremented +	 * too soon +	 */  	unsigned long stop_at = start + 2; +	struct rb_root *wait_pq = &q->hlevel[level].wait_pq; +  	while (time_before(jiffies, stop_at)) {  		struct htb_class *cl; -		long diff; -		struct rb_node *p = rb_first(&q->wait_pq[level]); +		s64 diff; +		struct rb_node *p = rb_first(wait_pq);  		if (!p)  			return 0; @@ -684,8 +708,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,  		if (cl->pq_key > q->now)  			return cl->pq_key; -		htb_safe_rb_erase(p, q->wait_pq + level); -		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); +		htb_safe_rb_erase(p, wait_pq); +		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);  		htb_change_class_mode(q, cl, &diff);  		if (cl->cmode != HTB_CAN_SEND)  			htb_add_to_wait_tree(q, cl, diff); @@ -693,7 +717,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,  	/* too much load - let's continue after a break for scheduling */  	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { -		printk(KERN_WARNING "htb: too many events!\n"); +		pr_warn("htb: too many events!\n");  		q->warned |= HTB_WARN_TOOMANYEVENTS;  	} @@ -701,7 +725,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,  }  /* Returns class->node+prio from id-tree where classe's id is >= id. NULL -   is no such one exists. */ + * is no such one exists. + */  static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,  					      u32 id)  { @@ -727,8 +752,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,   *   * Find leaf where current feed pointers points to.   
*/ -static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, -					 struct rb_node **pptr, u32 * pid) +static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)  {  	int i;  	struct { @@ -737,20 +761,22 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,  		u32 *pid;  	} stk[TC_HTB_MAXDEPTH], *sp = stk; -	BUG_ON(!tree->rb_node); -	sp->root = tree->rb_node; -	sp->pptr = pptr; -	sp->pid = pid; +	BUG_ON(!hprio->row.rb_node); +	sp->root = hprio->row.rb_node; +	sp->pptr = &hprio->ptr; +	sp->pid = &hprio->last_ptr_id;  	for (i = 0; i < 65535; i++) {  		if (!*sp->pptr && *sp->pid) {  			/* ptr was invalidated but id is valid - try to recover -			   the original or next ptr */ +			 * the original or next ptr +			 */  			*sp->pptr =  			    htb_id_find_next_upper(prio, sp->root, *sp->pid);  		}  		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it -				   can become out of date quickly */ +				 * can become out of date quickly +				 */  		if (!*sp->pptr) {	/* we are at right end; rewind & go up */  			*sp->pptr = sp->root;  			while ((*sp->pptr)->rb_left) @@ -765,12 +791,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,  			}  		} else {  			struct htb_class *cl; +			struct htb_prio *clp; +  			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);  			if (!cl->level)  				return cl; -			(++sp)->root = cl->un.inner.feed[prio].rb_node; -			sp->pptr = cl->un.inner.ptr + prio; -			sp->pid = cl->un.inner.last_ptr_id + prio; +			clp = &cl->un.inner.clprio[prio]; +			(++sp)->root = clp->feed.rb_node; +			sp->pptr = &clp->ptr; +			sp->pid = &clp->last_ptr_id;  		}  	}  	WARN_ON(1); @@ -778,16 +807,18 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,  }  /* dequeues packet at given priority and level; call only if -   you are sure that there is active class at prio/level */ -static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio, -					int level) + * you are sure that there is active class at prio/level + */ +static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, +					const int level)  {  	struct sk_buff *skb = NULL;  	struct htb_class *cl, *start; +	struct htb_level *hlevel = &q->hlevel[level]; +	struct htb_prio *hprio = &hlevel->hprio[prio]; +  	/* look initial class up in the row */ -	start = cl = htb_lookup_leaf(q->row[level] + prio, prio, -				     q->ptr[level] + prio, -				     q->last_ptr_id[level] + prio); +	start = cl = htb_lookup_leaf(hprio, prio);  	do {  next: @@ -795,9 +826,10 @@ next:  			return NULL;  		/* class can be empty - it is unlikely but can be true if leaf -		   qdisc drops packets in enqueue routine or if someone used -		   graft operation on the leaf since last dequeue; -		   simply deactivate and skip such class */ +		 * qdisc drops packets in enqueue routine or if someone used +		 * graft operation on the leaf since last dequeue; +		 * simply deactivate and skip such class +		 */  		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {  			struct htb_class *next;  			htb_deactivate(q, cl); @@ -806,9 +838,7 @@ next:  			if ((q->row_mask[level] & (1 << prio)) == 0)  				return NULL; -			next = htb_lookup_leaf(q->row[level] + prio, -					       prio, q->ptr[level] + prio, -					       q->last_ptr_id[level] + prio); +			next = htb_lookup_leaf(hprio, prio);  			if (cl == start)	/* fix start if we just deleted it */  				start = next; @@ -821,23 +851,23 @@ next:  			break;  		qdisc_warn_nonwc("htb", cl->un.leaf.q); -		
htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> -				  ptr[0]) + prio); -		cl = htb_lookup_leaf(q->row[level] + prio, prio, -				     q->ptr[level] + prio, -				     q->last_ptr_id[level] + prio); +		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: +					 &q->hlevel[0].hprio[prio].ptr); +		cl = htb_lookup_leaf(hprio, prio);  	} while (cl != start);  	if (likely(skb != NULL)) { +		bstats_update(&cl->bstats, skb);  		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);  		if (cl->un.leaf.deficit[level] < 0) {  			cl->un.leaf.deficit[level] += cl->quantum; -			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> -					  ptr[0]) + prio); +			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : +						 &q->hlevel[0].hprio[prio].ptr);  		}  		/* this used to be after charge_class but this constelation -		   gives us slightly better performance */ +		 * gives us slightly better performance +		 */  		if (!cl->un.leaf.q->q.qlen)  			htb_deactivate(q, cl);  		htb_charge_class(q, cl, level, skb); @@ -847,39 +877,40 @@ next:  static struct sk_buff *htb_dequeue(struct Qdisc *sch)  { -	struct sk_buff *skb = NULL; +	struct sk_buff *skb;  	struct htb_sched *q = qdisc_priv(sch);  	int level; -	psched_time_t next_event; +	s64 next_event;  	unsigned long start_at;  	/* try to dequeue direct packets as high prio (!) to minimize cpu work */  	skb = __skb_dequeue(&q->direct_queue);  	if (skb != NULL) { -		sch->flags &= ~TCQ_F_THROTTLED; +ok: +		qdisc_bstats_update(sch, skb); +		qdisc_unthrottled(sch);  		sch->q.qlen--;  		return skb;  	}  	if (!sch->q.qlen)  		goto fin; -	q->now = psched_get_time(); +	q->now = ktime_to_ns(ktime_get());  	start_at = jiffies; -	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; +	next_event = q->now + 5LLU * NSEC_PER_SEC;  	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {  		/* common case optimization - skip event handler quickly */  		int m; -		psched_time_t event; +		s64 event = q->near_ev_cache[level]; -		if (q->now >= q->near_ev_cache[level]) { +		if (q->now >= event) {  			event = htb_do_events(q, level, start_at);  			if (!event) -				event = q->now + PSCHED_TICKS_PER_SEC; +				event = q->now + NSEC_PER_SEC;  			q->near_ev_cache[level] = event; -		} else -			event = q->near_ev_cache[level]; +		}  		if (next_event > event)  			next_event = event; @@ -887,20 +918,25 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)  		m = ~q->row_mask[level];  		while (m != (int)(-1)) {  			int prio = ffz(m); +  			m |= 1 << prio;  			skb = htb_dequeue_tree(q, prio, level); -			if (likely(skb != NULL)) { -				sch->q.qlen--; -				sch->flags &= ~TCQ_F_THROTTLED; -				goto fin; -			} +			if (likely(skb != NULL)) +				goto ok;  		}  	}  	sch->qstats.overlimits++; -	if (likely(next_event > q->now)) -		qdisc_watchdog_schedule(&q->watchdog, next_event); -	else +	if (likely(next_event > q->now)) { +		if (!test_bit(__QDISC_STATE_DEACTIVATED, +			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { +			ktime_t time = ns_to_ktime(next_event); +			qdisc_throttled(q->watchdog.qdisc); +			hrtimer_start(&q->watchdog.timer, time, +				      HRTIMER_MODE_ABS); +		} +	} else {  		schedule_work(&q->work); +	}  fin:  	return skb;  } @@ -935,11 +971,10 @@ static void htb_reset(struct Qdisc *sch)  {  	struct htb_sched *q = qdisc_priv(sch);  	struct htb_class *cl; -	struct hlist_node *n;  	unsigned int i;  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { +		hlist_for_each_entry(cl, &q->clhash.hash[i], 
common.hnode) {  			if (cl->level)  				memset(&cl->un.inner, 0, sizeof(cl->un.inner));  			else { @@ -955,10 +990,8 @@ static void htb_reset(struct Qdisc *sch)  	qdisc_watchdog_cancel(&q->watchdog);  	__skb_queue_purge(&q->direct_queue);  	sch->q.qlen = 0; -	memset(q->row, 0, sizeof(q->row)); +	memset(q->hlevel, 0, sizeof(q->hlevel));  	memset(q->row_mask, 0, sizeof(q->row_mask)); -	memset(q->wait_pq, 0, sizeof(q->wait_pq)); -	memset(q->ptr, 0, sizeof(q->ptr));  	for (i = 0; i < TC_HTB_NUMPRIO; i++)  		INIT_LIST_HEAD(q->drops + i);  } @@ -968,6 +1001,9 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {  	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },  	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },  	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, +	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, +	[TCA_HTB_RATE64] = { .type = NLA_U64 }, +	[TCA_HTB_CEIL64] = { .type = NLA_U64 },  };  static void htb_work_func(struct work_struct *work) @@ -981,7 +1017,7 @@ static void htb_work_func(struct work_struct *work)  static int htb_init(struct Qdisc *sch, struct nlattr *opt)  {  	struct htb_sched *q = qdisc_priv(sch); -	struct nlattr *tb[TCA_HTB_INIT + 1]; +	struct nlattr *tb[TCA_HTB_MAX + 1];  	struct tc_htb_glob *gopt;  	int err;  	int i; @@ -989,21 +1025,16 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)  	if (!opt)  		return -EINVAL; -	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy); +	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);  	if (err < 0)  		return err; -	if (tb[TCA_HTB_INIT] == NULL) { -		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); +	if (!tb[TCA_HTB_INIT])  		return -EINVAL; -	} +  	gopt = nla_data(tb[TCA_HTB_INIT]); -	if (gopt->version != HTB_VER >> 16) { -		printk(KERN_ERR -		       "HTB: need tc/htb version %d (minor is %d), you have %d\n", -		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version); +	if (gopt->version != HTB_VER >> 16)  		return -EINVAL; -	}  	err = qdisc_class_hash_init(&q->clhash);  	if (err < 0) @@ -1015,10 +1046,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)  	INIT_WORK(&q->work, htb_work_func);  	skb_queue_head_init(&q->direct_queue); -	q->direct_qlen = qdisc_dev(sch)->tx_queue_len; -	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */ -		q->direct_qlen = 2; - +	if (tb[TCA_HTB_DIRECT_QLEN]) +		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); +	else { +		q->direct_qlen = qdisc_dev(sch)->tx_queue_len; +		if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */ +			q->direct_qlen = 2; +	}  	if ((q->rate2quantum = gopt->rate2quantum) < 1)  		q->rate2quantum = 1;  	q->defcls = gopt->defcls; @@ -1028,12 +1062,13 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)  static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)  { -	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);  	struct htb_sched *q = qdisc_priv(sch);  	struct nlattr *nest;  	struct tc_htb_glob gopt; -	spin_lock_bh(root_lock); +	/* Its safe to not acquire qdisc lock. As we hold RTNL, +	 * no change can happen on the qdisc parameters. 
+	 */  	gopt.direct_pkts = q->direct_pkts;  	gopt.version = HTB_VER; @@ -1044,14 +1079,13 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)  	nest = nla_nest_start(skb, TCA_OPTIONS);  	if (nest == NULL)  		goto nla_put_failure; -	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); -	nla_nest_end(skb, nest); +	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || +	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen)) +		goto nla_put_failure; -	spin_unlock_bh(root_lock); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure: -	spin_unlock_bh(root_lock);  	nla_nest_cancel(skb, nest);  	return -1;  } @@ -1060,11 +1094,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,  			  struct sk_buff *skb, struct tcmsg *tcm)  {  	struct htb_class *cl = (struct htb_class *)arg; -	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);  	struct nlattr *nest;  	struct tc_htb_opt opt; -	spin_lock_bh(root_lock); +	/* Its safe to not acquire qdisc lock. As we hold RTNL, +	 * no change can happen on the class parameters. +	 */  	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;  	tcm->tcm_handle = cl->common.classid;  	if (!cl->level && cl->un.leaf.q) @@ -1076,21 +1111,25 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,  	memset(&opt, 0, sizeof(opt)); -	opt.rate = cl->rate->rate; -	opt.buffer = cl->buffer; -	opt.ceil = cl->ceil->rate; -	opt.cbuffer = cl->cbuffer; +	psched_ratecfg_getrate(&opt.rate, &cl->rate); +	opt.buffer = PSCHED_NS2TICKS(cl->buffer); +	psched_ratecfg_getrate(&opt.ceil, &cl->ceil); +	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);  	opt.quantum = cl->quantum;  	opt.prio = cl->prio;  	opt.level = cl->level; -	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure; +	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && +	    nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) +		goto nla_put_failure; +	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && +	    nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) +		goto nla_put_failure; -	nla_nest_end(skb, nest); -	spin_unlock_bh(root_lock); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure: -	spin_unlock_bh(root_lock);  	nla_nest_cancel(skb, nest);  	return -1;  } @@ -1102,8 +1141,8 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)  	if (!cl->level && cl->un.leaf.q)  		cl->qstats.qlen = cl->un.leaf.q->q.qlen; -	cl->xstats.tokens = cl->tokens; -	cl->xstats.ctokens = cl->ctokens; +	cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); +	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);  	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||  	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || @@ -1177,7 +1216,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,  	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);  	if (parent->cmode != HTB_CAN_SEND) -		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level); +		htb_safe_rb_erase(&parent->pq_node, +				  &q->hlevel[parent->level].wait_pq);  	parent->level = 0;  	memset(&parent->un.inner, 0, sizeof(parent->un.inner)); @@ -1185,7 +1225,7 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,  	parent->un.leaf.q = new_q ? 
new_q : &noop_qdisc;  	parent->tokens = parent->buffer;  	parent->ctokens = parent->cbuffer; -	parent->t_c = psched_get_time(); +	parent->t_c = ktime_to_ns(ktime_get());  	parent->cmode = HTB_CAN_SEND;  } @@ -1196,9 +1236,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)  		qdisc_destroy(cl->un.leaf.q);  	}  	gen_kill_estimator(&cl->bstats, &cl->rate_est); -	qdisc_put_rtab(cl->rate); -	qdisc_put_rtab(cl->ceil); -  	tcf_destroy_chain(&cl->filter_list);  	kfree(cl);  } @@ -1206,24 +1243,25 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)  static void htb_destroy(struct Qdisc *sch)  {  	struct htb_sched *q = qdisc_priv(sch); -	struct hlist_node *n, *next; +	struct hlist_node *next;  	struct htb_class *cl;  	unsigned int i;  	cancel_work_sync(&q->work);  	qdisc_watchdog_cancel(&q->watchdog);  	/* This line used to be after htb_destroy_class call below -	   and surprisingly it worked in 2.4. But it must precede it -	   because filter need its target class alive to be able to call -	   unbind_filter on it (without Oops). */ +	 * and surprisingly it worked in 2.4. But it must precede it +	 * because filter need its target class alive to be able to call +	 * unbind_filter on it (without Oops). +	 */  	tcf_destroy_chain(&q->filter_list);  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)  			tcf_destroy_chain(&cl->filter_list);  	}  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], +		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],  					  common.hnode)  			htb_destroy_class(sch, cl);  	} @@ -1239,9 +1277,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)  	struct Qdisc *new_q = NULL;  	int last_child = 0; -	// TODO: why don't allow to delete subtree ? references ? does -	// tc subsys quarantee us that in htb_destroy it holds no class -	// refs so that we can remove children safely there ? +	/* TODO: why don't allow to delete subtree ? references ? does +	 * tc subsys guarantee us that in htb_destroy it holds no class +	 * refs so that we can remove children safely there ? +	 */  	if (cl->children || cl->filter_cnt)  		return -EBUSY; @@ -1268,7 +1307,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)  		htb_deactivate(q, cl);  	if (cl->cmode != HTB_CAN_SEND) -		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); +		htb_safe_rb_erase(&cl->pq_node, +				  &q->hlevel[cl->level].wait_pq);  	if (last_child)  		htb_parent_to_leaf(q, cl, new_q); @@ -1299,9 +1339,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  	struct htb_sched *q = qdisc_priv(sch);  	struct htb_class *cl = (struct htb_class *)*arg, *parent;  	struct nlattr *opt = tca[TCA_OPTIONS]; -	struct qdisc_rate_table *rtab = NULL, *ctab = NULL; -	struct nlattr *tb[__TCA_HTB_MAX]; +	struct nlattr *tb[TCA_HTB_MAX + 1];  	struct tc_htb_opt *hopt; +	u64 rate64, ceil64;  	/* extract all subattrs from opt attr */  	if (!opt) @@ -1318,12 +1358,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  	parent = parentid == TC_H_ROOT ? 
NULL : htb_find(parentid, sch);  	hopt = nla_data(tb[TCA_HTB_PARMS]); - -	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); -	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); -	if (!rtab || !ctab) +	if (!hopt->rate.rate || !hopt->ceil.rate)  		goto failure; +	/* Keeping backward compatible with rate_table based iproute2 tc */ +	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) +		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB])); + +	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) +		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB])); +  	if (!cl) {		/* new class */  		struct Qdisc *new_q;  		int prio; @@ -1349,19 +1393,22 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  		/* check maximal depth */  		if (parent && parent->parent && parent->parent->level < 2) { -			printk(KERN_ERR "htb: tree is too deep\n"); +			pr_err("htb: tree is too deep\n");  			goto failure;  		}  		err = -ENOBUFS; -		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL) +		cl = kzalloc(sizeof(*cl), GFP_KERNEL); +		if (!cl)  			goto failure; -		err = gen_new_estimator(&cl->bstats, &cl->rate_est, -					qdisc_root_sleeping_lock(sch), -					tca[TCA_RATE] ? : &est.nla); -		if (err) { -			kfree(cl); -			goto failure; +		if (htb_rate_est || tca[TCA_RATE]) { +			err = gen_new_estimator(&cl->bstats, &cl->rate_est, +						qdisc_root_sleeping_lock(sch), +						tca[TCA_RATE] ? : &est.nla); +			if (err) { +				kfree(cl); +				goto failure; +			}  		}  		cl->refcnt = 1; @@ -1373,8 +1420,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  			RB_CLEAR_NODE(&cl->node[prio]);  		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) -		   so that can't be used inside of sch_tree_lock -		   -- thanks to Karlis Peisenieks */ +		 * so that can't be used inside of sch_tree_lock +		 * -- thanks to Karlis Peisenieks +		 */  		new_q = qdisc_create_dflt(sch->dev_queue,  					  &pfifo_qdisc_ops, classid);  		sch_tree_lock(sch); @@ -1390,7 +1438,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  			/* remove from evt list because of level change */  			if (parent->cmode != HTB_CAN_SEND) { -				htb_safe_rb_erase(&parent->pq_node, q->wait_pq); +				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);  				parent->cmode = HTB_CAN_SEND;  			}  			parent->level = (parent->parent ? parent->parent->level @@ -1404,10 +1452,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  		cl->parent = parent;  		/* set class to be in HTB_CAN_SEND state */ -		cl->tokens = hopt->buffer; -		cl->ctokens = hopt->cbuffer; -		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */ -		cl->t_c = psched_get_time(); +		cl->tokens = PSCHED_TICKS2NS(hopt->buffer); +		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); +		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */ +		cl->t_c = ktime_to_ns(ktime_get());  		cl->cmode = HTB_CAN_SEND;  		/* attach to the hash list and parent's family */ @@ -1425,20 +1473,30 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  		sch_tree_lock(sch);  	} +	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + +	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + +	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); +	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); +  	/* it used to be a nasty bug here, we have to check that node -	   is really leaf before changing cl->un.leaf ! */ +	 * is really leaf before changing cl->un.leaf ! 
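+	 * (un.leaf and un.inner share a single union inside struct htb_class,
+	 * so writing un.leaf members on an inner class would clobber its
+	 * per-priority feed trees)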
+	 */  	if (!cl->level) { -		cl->quantum = rtab->rate.rate / q->rate2quantum; +		u64 quantum = cl->rate.rate_bytes_ps; + +		do_div(quantum, q->rate2quantum); +		cl->quantum = min_t(u64, quantum, INT_MAX); +  		if (!hopt->quantum && cl->quantum < 1000) { -			printk(KERN_WARNING -			       "HTB: quantum of class %X is small. Consider r2q change.\n", -			       cl->common.classid); +			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", +				cl->common.classid);  			cl->quantum = 1000;  		}  		if (!hopt->quantum && cl->quantum > 200000) { -			printk(KERN_WARNING -			       "HTB: quantum of class %X is big. Consider r2q change.\n", -			       cl->common.classid); +			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n", +				cl->common.classid);  			cl->quantum = 200000;  		}  		if (hopt->quantum) @@ -1447,14 +1505,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  			cl->prio = TC_HTB_NUMPRIO - 1;  	} -	cl->buffer = hopt->buffer; -	cl->cbuffer = hopt->cbuffer; -	if (cl->rate) -		qdisc_put_rtab(cl->rate); -	cl->rate = rtab; -	if (cl->ceil) -		qdisc_put_rtab(cl->ceil); -	cl->ceil = ctab; +	cl->buffer = PSCHED_TICKS2NS(hopt->buffer); +	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); +  	sch_tree_unlock(sch);  	qdisc_class_hash_grow(sch, &q->clhash); @@ -1463,10 +1516,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,  	return 0;  failure: -	if (rtab) -		qdisc_put_rtab(rtab); -	if (ctab) -		qdisc_put_rtab(ctab);  	return err;  } @@ -1485,13 +1534,13 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,  	struct htb_class *cl = htb_find(classid, sch);  	/*if (cl && !cl->level) return 0; -	   The line above used to be there to prevent attaching filters to -	   leaves. But at least tc_index filter uses this just to get class -	   for other reasons so that we have to allow for it. -	   ---- -	   19.6.2002 As Werner explained it is ok - bind filter is just -	   another way to "lock" the class - unlike "get" this lock can -	   be broken by class during destroy IIUC. +	 * The line above used to be there to prevent attaching filters to +	 * leaves. But at least tc_index filter uses this just to get class +	 * for other reasons so that we have to allow for it. +	 * ---- +	 * 19.6.2002 As Werner explained it is ok - bind filter is just +	 * another way to "lock" the class - unlike "get" this lock can +	 * be broken by class during destroy IIUC.  	 
*/  	if (cl)  		cl->filter_cnt++; @@ -1510,14 +1559,13 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)  {  	struct htb_sched *q = qdisc_priv(sch);  	struct htb_class *cl; -	struct hlist_node *n;  	unsigned int i;  	if (arg->stop)  		return;  	for (i = 0; i < q->clhash.hashsize; i++) { -		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {  			if (arg->count < arg->skip) {  				arg->count++;  				continue; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index f10e34a6844..62871c14e1f 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -63,8 +63,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	result = tc_classify(skb, p->filter_list, &res); -	sch->bstats.packets++; -	sch->bstats.bytes += qdisc_pkt_len(skb); +	qdisc_bstats_update(sch, skb);  	switch (result) {  	case TC_ACT_SHOT:  		result = TC_ACT_SHOT; @@ -101,8 +100,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)  	nest = nla_nest_start(skb, TCA_OPTIONS);  	if (nest == NULL)  		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index ecc302f4d2a..a8b2864a696 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -11,6 +11,7 @@  #include <linux/types.h>  #include <linux/slab.h>  #include <linux/kernel.h> +#include <linux/export.h>  #include <linux/string.h>  #include <linux/errno.h>  #include <linux/skbuff.h> @@ -56,13 +57,13 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)  	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {  		dev_queue = netdev_get_tx_queue(dev, ntx); -		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, +		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,  					  TC_H_MAKE(TC_H_MAJ(sch->handle),  						    TC_H_MIN(ntx + 1)));  		if (qdisc == NULL)  			goto err; -		qdisc->flags |= TCQ_F_CAN_BYPASS;  		priv->qdiscs[ntx] = qdisc; +		qdisc->flags |= TCQ_F_ONETXQUEUE;  	}  	sch->flags |= TCQ_F_MQROOT; @@ -77,14 +78,19 @@ static void mq_attach(struct Qdisc *sch)  {  	struct net_device *dev = qdisc_dev(sch);  	struct mq_sched *priv = qdisc_priv(sch); -	struct Qdisc *qdisc; +	struct Qdisc *qdisc, *old;  	unsigned int ntx;  	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {  		qdisc = priv->qdiscs[ntx]; -		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); -		if (qdisc) -			qdisc_destroy(qdisc); +		old = dev_graft_qdisc(qdisc->dev_queue, qdisc); +		if (old) +			qdisc_destroy(old); +#ifdef CONFIG_NET_SCHED +		if (ntx < dev->real_num_tx_queues) +			qdisc_list_add(qdisc); +#endif +  	}  	kfree(priv->qdiscs);  	priv->qdiscs = NULL; @@ -150,7 +156,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,  		dev_deactivate(dev);  	*old = dev_graft_qdisc(dev_queue, new); - +	if (new) +		new->flags |= TCQ_F_ONETXQUEUE;  	if (dev->flags & IFF_UP)  		dev_activate(dev);  	return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c new file mode 100644 index 00000000000..6749e2f540d --- /dev/null +++ b/net/sched/sch_mqprio.c @@ -0,0 +1,426 @@ +/* + * net/sched/sch_mqprio.c + * + * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/module.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> + +struct mqprio_sched { +	struct Qdisc		**qdiscs; +	int hw_owned; +}; + +static void mqprio_destroy(struct Qdisc *sch) +{ +	struct net_device *dev = qdisc_dev(sch); +	struct mqprio_sched *priv = qdisc_priv(sch); +	unsigned int ntx; + +	if (priv->qdiscs) { +		for (ntx = 0; +		     ntx < dev->num_tx_queues && priv->qdiscs[ntx]; +		     ntx++) +			qdisc_destroy(priv->qdiscs[ntx]); +		kfree(priv->qdiscs); +	} + +	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) +		dev->netdev_ops->ndo_setup_tc(dev, 0); +	else +		netdev_set_num_tc(dev, 0); +} + +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ +	int i, j; + +	/* Verify num_tc is not out of max range */ +	if (qopt->num_tc > TC_MAX_QUEUE) +		return -EINVAL; + +	/* Verify priority mapping uses valid tcs */ +	for (i = 0; i < TC_BITMASK + 1; i++) { +		if (qopt->prio_tc_map[i] >= qopt->num_tc) +			return -EINVAL; +	} + +	/* net_device does not support requested operation */ +	if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) +		return -EINVAL; + +	/* if hw owned qcount and qoffset are taken from LLD so +	 * no reason to verify them here +	 */ +	if (qopt->hw) +		return 0; + +	for (i = 0; i < qopt->num_tc; i++) { +		unsigned int last = qopt->offset[i] + qopt->count[i]; + +		/* Verify the queue count is in tx range being equal to the +		 * real_num_tx_queues indicates the last queue is in use. +		 */ +		if (qopt->offset[i] >= dev->real_num_tx_queues || +		    !qopt->count[i] || +		    last > dev->real_num_tx_queues) +			return -EINVAL; + +		/* Verify that the offset and counts do not overlap */ +		for (j = i + 1; j < qopt->num_tc; j++) { +			if (last > qopt->offset[j]) +				return -EINVAL; +		} +	} + +	return 0; +} + +static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct net_device *dev = qdisc_dev(sch); +	struct mqprio_sched *priv = qdisc_priv(sch); +	struct netdev_queue *dev_queue; +	struct Qdisc *qdisc; +	int i, err = -EOPNOTSUPP; +	struct tc_mqprio_qopt *qopt = NULL; + +	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); +	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); + +	if (sch->parent != TC_H_ROOT) +		return -EOPNOTSUPP; + +	if (!netif_is_multiqueue(dev)) +		return -EOPNOTSUPP; + +	if (!opt || nla_len(opt) < sizeof(*qopt)) +		return -EINVAL; + +	qopt = nla_data(opt); +	if (mqprio_parse_opt(dev, qopt)) +		return -EINVAL; + +	/* pre-allocate qdisc, attachment can't fail */ +	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), +			       GFP_KERNEL); +	if (priv->qdiscs == NULL) { +		err = -ENOMEM; +		goto err; +	} + +	for (i = 0; i < dev->num_tx_queues; i++) { +		dev_queue = netdev_get_tx_queue(dev, i); +		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, +					  TC_H_MAKE(TC_H_MAJ(sch->handle), +						    TC_H_MIN(i + 1))); +		if (qdisc == NULL) { +			err = -ENOMEM; +			goto err; +		} +		priv->qdiscs[i] = qdisc; +		qdisc->flags |= TCQ_F_ONETXQUEUE; +	} + +	/* If the mqprio options indicate that hardware should own +	 * the queue mapping then run ndo_setup_tc otherwise use the +	 * supplied and verified mapping +	 */ +	if (qopt->hw) { +		priv->hw_owned = 1; +		err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); +		if (err) +			goto err; +	} else { +		netdev_set_num_tc(dev, qopt->num_tc); +		for (i = 
0; i < qopt->num_tc; i++) +			netdev_set_tc_queue(dev, i, +					    qopt->count[i], qopt->offset[i]); +	} + +	/* Always use supplied priority mappings */ +	for (i = 0; i < TC_BITMASK + 1; i++) +		netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]); + +	sch->flags |= TCQ_F_MQROOT; +	return 0; + +err: +	mqprio_destroy(sch); +	return err; +} + +static void mqprio_attach(struct Qdisc *sch) +{ +	struct net_device *dev = qdisc_dev(sch); +	struct mqprio_sched *priv = qdisc_priv(sch); +	struct Qdisc *qdisc, *old; +	unsigned int ntx; + +	/* Attach underlying qdisc */ +	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { +		qdisc = priv->qdiscs[ntx]; +		old = dev_graft_qdisc(qdisc->dev_queue, qdisc); +		if (old) +			qdisc_destroy(old); +		if (ntx < dev->real_num_tx_queues) +			qdisc_list_add(qdisc); +	} +	kfree(priv->qdiscs); +	priv->qdiscs = NULL; +} + +static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, +					     unsigned long cl) +{ +	struct net_device *dev = qdisc_dev(sch); +	unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + +	if (ntx >= dev->num_tx_queues) +		return NULL; +	return netdev_get_tx_queue(dev, ntx); +} + +static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, +		    struct Qdisc **old) +{ +	struct net_device *dev = qdisc_dev(sch); +	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + +	if (!dev_queue) +		return -EINVAL; + +	if (dev->flags & IFF_UP) +		dev_deactivate(dev); + +	*old = dev_graft_qdisc(dev_queue, new); + +	if (new) +		new->flags |= TCQ_F_ONETXQUEUE; + +	if (dev->flags & IFF_UP) +		dev_activate(dev); + +	return 0; +} + +static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct net_device *dev = qdisc_dev(sch); +	struct mqprio_sched *priv = qdisc_priv(sch); +	unsigned char *b = skb_tail_pointer(skb); +	struct tc_mqprio_qopt opt = { 0 }; +	struct Qdisc *qdisc; +	unsigned int i; + +	sch->q.qlen = 0; +	memset(&sch->bstats, 0, sizeof(sch->bstats)); +	memset(&sch->qstats, 0, sizeof(sch->qstats)); + +	for (i = 0; i < dev->num_tx_queues; i++) { +		qdisc = netdev_get_tx_queue(dev, i)->qdisc; +		spin_lock_bh(qdisc_lock(qdisc)); +		sch->q.qlen		+= qdisc->q.qlen; +		sch->bstats.bytes	+= qdisc->bstats.bytes; +		sch->bstats.packets	+= qdisc->bstats.packets; +		sch->qstats.qlen	+= qdisc->qstats.qlen; +		sch->qstats.backlog	+= qdisc->qstats.backlog; +		sch->qstats.drops	+= qdisc->qstats.drops; +		sch->qstats.requeues	+= qdisc->qstats.requeues; +		sch->qstats.overlimits	+= qdisc->qstats.overlimits; +		spin_unlock_bh(qdisc_lock(qdisc)); +	} + +	opt.num_tc = netdev_get_num_tc(dev); +	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); +	opt.hw = priv->hw_owned; + +	for (i = 0; i < netdev_get_num_tc(dev); i++) { +		opt.count[i] = dev->tc_to_txq[i].count; +		opt.offset[i] = dev->tc_to_txq[i].offset; +	} + +	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) +		goto nla_put_failure; + +	return skb->len; +nla_put_failure: +	nlmsg_trim(skb, b); +	return -1; +} + +static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) +{ +	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + +	if (!dev_queue) +		return NULL; + +	return dev_queue->qdisc_sleeping; +} + +static unsigned long mqprio_get(struct Qdisc *sch, u32 classid) +{ +	struct net_device *dev = qdisc_dev(sch); +	unsigned int ntx = TC_H_MIN(classid); + +	if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) +		return 0; +	return ntx; +} + +static void mqprio_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int 
mqprio_dump_class(struct Qdisc *sch, unsigned long cl, +			 struct sk_buff *skb, struct tcmsg *tcm) +{ +	struct net_device *dev = qdisc_dev(sch); + +	if (cl <= netdev_get_num_tc(dev)) { +		tcm->tcm_parent = TC_H_ROOT; +		tcm->tcm_info = 0; +	} else { +		int i; +		struct netdev_queue *dev_queue; + +		dev_queue = mqprio_queue_get(sch, cl); +		tcm->tcm_parent = 0; +		for (i = 0; i < netdev_get_num_tc(dev); i++) { +			struct netdev_tc_txq tc = dev->tc_to_txq[i]; +			int q_idx = cl - netdev_get_num_tc(dev); + +			if (q_idx > tc.offset && +			    q_idx <= tc.offset + tc.count) { +				tcm->tcm_parent = +					TC_H_MAKE(TC_H_MAJ(sch->handle), +						  TC_H_MIN(i + 1)); +				break; +			} +		} +		tcm->tcm_info = dev_queue->qdisc_sleeping->handle; +	} +	tcm->tcm_handle |= TC_H_MIN(cl); +	return 0; +} + +static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, +				   struct gnet_dump *d) +	__releases(d->lock) +	__acquires(d->lock) +{ +	struct net_device *dev = qdisc_dev(sch); + +	if (cl <= netdev_get_num_tc(dev)) { +		int i; +		struct Qdisc *qdisc; +		struct gnet_stats_queue qstats = {0}; +		struct gnet_stats_basic_packed bstats = {0}; +		struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + +		/* Drop lock here it will be reclaimed before touching +		 * statistics this is required because the d->lock we +		 * hold here is the look on dev_queue->qdisc_sleeping +		 * also acquired below. +		 */ +		spin_unlock_bh(d->lock); + +		for (i = tc.offset; i < tc.offset + tc.count; i++) { +			qdisc = netdev_get_tx_queue(dev, i)->qdisc; +			spin_lock_bh(qdisc_lock(qdisc)); +			bstats.bytes      += qdisc->bstats.bytes; +			bstats.packets    += qdisc->bstats.packets; +			qstats.qlen       += qdisc->qstats.qlen; +			qstats.backlog    += qdisc->qstats.backlog; +			qstats.drops      += qdisc->qstats.drops; +			qstats.requeues   += qdisc->qstats.requeues; +			qstats.overlimits += qdisc->qstats.overlimits; +			spin_unlock_bh(qdisc_lock(qdisc)); +		} +		/* Reclaim root sleeping lock before completing stats */ +		spin_lock_bh(d->lock); +		if (gnet_stats_copy_basic(d, &bstats) < 0 || +		    gnet_stats_copy_queue(d, &qstats) < 0) +			return -1; +	} else { +		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + +		sch = dev_queue->qdisc_sleeping; +		sch->qstats.qlen = sch->q.qlen; +		if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || +		    gnet_stats_copy_queue(d, &sch->qstats) < 0) +			return -1; +	} +	return 0; +} + +static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ +	struct net_device *dev = qdisc_dev(sch); +	unsigned long ntx; + +	if (arg->stop) +		return; + +	/* Walk hierarchy with a virtual class per tc */ +	arg->count = arg->skip; +	for (ntx = arg->skip; +	     ntx < dev->num_tx_queues + netdev_get_num_tc(dev); +	     ntx++) { +		if (arg->fn(sch, ntx + 1, arg) < 0) { +			arg->stop = 1; +			break; +		} +		arg->count++; +	} +} + +static const struct Qdisc_class_ops mqprio_class_ops = { +	.graft		= mqprio_graft, +	.leaf		= mqprio_leaf, +	.get		= mqprio_get, +	.put		= mqprio_put, +	.walk		= mqprio_walk, +	.dump		= mqprio_dump_class, +	.dump_stats	= mqprio_dump_class_stats, +}; + +static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { +	.cl_ops		= &mqprio_class_ops, +	.id		= "mqprio", +	.priv_size	= sizeof(struct mqprio_sched), +	.init		= mqprio_init, +	.destroy	= mqprio_destroy, +	.attach		= mqprio_attach, +	.dump		= mqprio_dump, +	.owner		= THIS_MODULE, +}; + +static int __init mqprio_module_init(void) +{ +	return register_qdisc(&mqprio_qdisc_ops); +} + +static void __exit 
mqprio_module_exit(void) +{ +	unregister_qdisc(&mqprio_qdisc_ops); +} + +module_init(mqprio_module_init); +module_exit(mqprio_module_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 32690deab5d..afb050a735f 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -11,8 +11,7 @@   * more details.   *   * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. + * this program; if not, see <http://www.gnu.org/licenses/>.   *   * Author: Alexander Duyck <alexander.h.duyck@intel.com>   */ @@ -83,8 +82,6 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	ret = qdisc_enqueue(skb, qdisc);  	if (ret == NET_XMIT_SUCCESS) { -		sch->bstats.bytes += qdisc_pkt_len(skb); -		sch->bstats.packets++;  		sch->q.qlen++;  		return NET_XMIT_SUCCESS;  	} @@ -109,10 +106,12 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)  		/* Check that target subqueue is available before  		 * pulling an skb to avoid head-of-line blocking.  		 */ -		if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) { +		if (!netif_xmit_stopped( +		    netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {  			qdisc = q->queues[q->curband];  			skb = qdisc->dequeue(qdisc);  			if (skb) { +				qdisc_bstats_update(sch, skb);  				sch->q.qlen--;  				return skb;  			} @@ -139,7 +138,8 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch)  		/* Check that target subqueue is available before  		 * pulling an skb to avoid head-of-line blocking.  		 */ -		if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) { +		if (!netif_xmit_stopped( +		    netdev_get_tx_queue(qdisc_dev(sch), curband))) {  			qdisc = q->queues[curband];  			skb = qdisc->ops->peek(qdisc);  			if (skb) @@ -157,7 +157,7 @@ static unsigned int multiq_drop(struct Qdisc *sch)  	unsigned int len;  	struct Qdisc *qdisc; -	for (band = q->bands-1; band >= 0; band--) { +	for (band = q->bands - 1; band >= 0; band--) {  		qdisc = q->queues[band];  		if (qdisc->ops->drop) {  			len = qdisc->ops->drop(qdisc); @@ -266,7 +266,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)  	for (i = 0; i < q->max_bands; i++)  		q->queues[i] = &noop_qdisc; -	err = multiq_tune(sch,opt); +	err = multiq_tune(sch, opt);  	if (err)  		kfree(q->queues); @@ -283,7 +283,8 @@ static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)  	opt.bands = q->bands;  	opt.max_bands = q->max_bands; -	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len; @@ -347,7 +348,7 @@ static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,  	struct multiq_sched_data *q = qdisc_priv(sch);  	tcm->tcm_handle |= TC_H_MIN(cl); -	tcm->tcm_info = q->queues[cl-1]->handle; +	tcm->tcm_info = q->queues[cl - 1]->handle;  	return 0;  } @@ -379,7 +380,7 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)  			arg->count++;  			continue;  		} -		if (arg->fn(sch, band+1, arg) < 0) { +		if (arg->fn(sch, band + 1, arg) < 0) {  			arg->stop = 1;  			break;  		} diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e5593c083a7..111d70fddae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -13,18 +13,23 @@   *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>   */ +#include <linux/mm.h>  #include <linux/module.h>  #include <linux/slab.h>  #include 
<linux/types.h>  #include <linux/kernel.h>  #include <linux/errno.h>  #include <linux/skbuff.h> +#include <linux/vmalloc.h>  #include <linux/rtnetlink.h> +#include <linux/reciprocal_div.h> +#include <linux/rbtree.h>  #include <net/netlink.h>  #include <net/pkt_sched.h> +#include <net/inet_ecn.h> -#define VERSION "1.2" +#define VERSION "1.3"  /*	Network Emulation Queuing algorithm.  	==================================== @@ -47,22 +52,47 @@  	 layering other disciplines.  It does not need to do bandwidth  	 control either since that can be handled by using token  	 bucket or other rate control. + +     Correlated Loss Generator models + +	Added generation of correlated loss according to the +	"Gilbert-Elliot" model, a 4-state markov model. + +	References: +	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG +	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general +	and intuitive loss model for packet networks and its implementation +	in the Netem module in the Linux kernel", available in [1] + +	Authors: Stefano Salsano <stefano.salsano at uniroma2.it +		 Fabio Ludovici <fabio.ludovici at yahoo.it>  */  struct netem_sched_data { +	/* internal t(ime)fifo qdisc uses t_root and sch->limit */ +	struct rb_root t_root; + +	/* optional qdisc for classful handling (NULL at netem init) */  	struct Qdisc	*qdisc; +  	struct qdisc_watchdog watchdog;  	psched_tdiff_t latency;  	psched_tdiff_t jitter;  	u32 loss; +	u32 ecn;  	u32 limit;  	u32 counter;  	u32 gap;  	u32 duplicate;  	u32 reorder;  	u32 corrupt; +	u64 rate; +	s32 packet_overhead; +	u32 cell_size; +	struct reciprocal_value cell_size_reciprocal; +	s32 cell_overhead;  	struct crndstate {  		u32 last; @@ -73,17 +103,75 @@ struct netem_sched_data {  		u32  size;  		s16 table[0];  	} *delay_dist; + +	enum  { +		CLG_RANDOM, +		CLG_4_STATES, +		CLG_GILB_ELL, +	} loss_model; + +	enum { +		TX_IN_GAP_PERIOD = 1, +		TX_IN_BURST_PERIOD, +		LOST_IN_GAP_PERIOD, +		LOST_IN_BURST_PERIOD, +	} _4_state_model; + +	enum { +		GOOD_STATE = 1, +		BAD_STATE, +	} GE_state_model; + +	/* Correlated Loss Generation models */ +	struct clgstate { +		/* state of the Markov chain */ +		u8 state; + +		/* 4-states and Gilbert-Elliot models */ +		u32 a1;	/* p13 for 4-states or p for GE */ +		u32 a2;	/* p31 for 4-states or r for GE */ +		u32 a3;	/* p32 for 4-states or h for GE */ +		u32 a4;	/* p14 for 4-states or 1-k for GE */ +		u32 a5; /* p23 used only in 4-states */ +	} clg; +  }; -/* Time stamp put into socket buffer control block */ +/* Time stamp put into socket buffer control block + * Only valid when skbs are in our internal t(ime)fifo queue. + */  struct netem_skb_cb {  	psched_time_t	time_to_send; +	ktime_t		tstamp_save;  }; +/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp + * to hold a rb_node structure. + * + * If struct sk_buff layout is changed, the following checks will complain. 
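+ *
+ * Concretely, netem_rb_node() below just returns (struct rb_node *)&skb->next,
+ * so the time-sorted queue can link skbs directly and needs no extra
+ * per-packet allocation.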
+ */ +static struct rb_node *netem_rb_node(struct sk_buff *skb) +{ +	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0); +	BUILD_BUG_ON(offsetof(struct sk_buff, prev) != +		     offsetof(struct sk_buff, next) + sizeof(skb->next)); +	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) != +		     offsetof(struct sk_buff, prev) + sizeof(skb->prev)); +	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) + +					      sizeof(skb->prev) + +					      sizeof(skb->tstamp)); +	return (struct rb_node *)&skb->next; +} + +static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) +{ +	return (struct sk_buff *)rb; +} +  static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)  { -	BUILD_BUG_ON(sizeof(skb->cb) < -		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb)); +	/* we assume we can use skb next/prev/tstamp as storage for rb_node */ +	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));  	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;  } @@ -93,7 +181,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)  static void init_crandom(struct crndstate *state, unsigned long rho)  {  	state->rho = rho; -	state->last = net_random(); +	state->last = prandom_u32();  }  /* get_crandom - correlated random number generator @@ -106,15 +194,133 @@ static u32 get_crandom(struct crndstate *state)  	unsigned long answer;  	if (state->rho == 0)	/* no correlation */ -		return net_random(); +		return prandom_u32(); -	value = net_random(); +	value = prandom_u32();  	rho = (u64)state->rho + 1;  	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;  	state->last = answer;  	return answer;  } +/* loss_4state - 4-state model loss generator + * Generates losses according to the 4-state Markov chain adopted in + * the GI (General and Intuitive) loss model. + */ +static bool loss_4state(struct netem_sched_data *q) +{ +	struct clgstate *clg = &q->clg; +	u32 rnd = prandom_u32(); + +	/* +	 * Makes a comparison between rnd and the transition +	 * probabilities outgoing from the current state, then decides the +	 * next state and if the next packet has to be transmitted or lost. 
+	 * The four states correspond to: +	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period +	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period +	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period +	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period +	 */ +	switch (clg->state) { +	case TX_IN_GAP_PERIOD: +		if (rnd < clg->a4) { +			clg->state = LOST_IN_BURST_PERIOD; +			return true; +		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { +			clg->state = LOST_IN_GAP_PERIOD; +			return true; +		} else if (clg->a1 + clg->a4 < rnd) { +			clg->state = TX_IN_GAP_PERIOD; +		} + +		break; +	case TX_IN_BURST_PERIOD: +		if (rnd < clg->a5) { +			clg->state = LOST_IN_GAP_PERIOD; +			return true; +		} else { +			clg->state = TX_IN_BURST_PERIOD; +		} + +		break; +	case LOST_IN_GAP_PERIOD: +		if (rnd < clg->a3) +			clg->state = TX_IN_BURST_PERIOD; +		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { +			clg->state = TX_IN_GAP_PERIOD; +		} else if (clg->a2 + clg->a3 < rnd) { +			clg->state = LOST_IN_GAP_PERIOD; +			return true; +		} +		break; +	case LOST_IN_BURST_PERIOD: +		clg->state = TX_IN_GAP_PERIOD; +		break; +	} + +	return false; +} + +/* loss_gilb_ell - Gilbert-Elliot model loss generator + * Generates losses according to the Gilbert-Elliot loss model or + * its special cases  (Gilbert or Simple Gilbert) + * + * Makes a comparison between random number and the transition + * probabilities outgoing from the current state, then decides the + * next state. A second random number is extracted and the comparison + * with the loss probability of the current state decides if the next + * packet will be transmitted or lost. + */ +static bool loss_gilb_ell(struct netem_sched_data *q) +{ +	struct clgstate *clg = &q->clg; + +	switch (clg->state) { +	case GOOD_STATE: +		if (prandom_u32() < clg->a1) +			clg->state = BAD_STATE; +		if (prandom_u32() < clg->a4) +			return true; +		break; +	case BAD_STATE: +		if (prandom_u32() < clg->a2) +			clg->state = GOOD_STATE; +		if (prandom_u32() > clg->a3) +			return true; +	} + +	return false; +} + +static bool loss_event(struct netem_sched_data *q) +{ +	switch (q->loss_model) { +	case CLG_RANDOM: +		/* Random packet drop 0 => none, ~0 => all */ +		return q->loss && q->loss >= get_crandom(&q->loss_cor); + +	case CLG_4_STATES: +		/* 4state loss model algorithm (used also for GI model) +		* Extracts a value from the markov 4 state loss generator, +		* if it is 1 drops a packet and if needed writes the event in +		* the kernel logs +		*/ +		return loss_4state(q); + +	case CLG_GILB_ELL: +		/* Gilbert-Elliot loss model algorithm +		* Extracts a value from the Gilbert-Elliot loss generator, +		* if it is 1 drops a packet and if needed writes the event in +		* the kernel logs +		*/ +		return loss_gilb_ell(q); +	} + +	return false;	/* not reached */ +} + +  /* tabledist - return a pseudo-randomly distributed value with mean mu and   * std deviation sigma.  Uses table lookup to approximate the desired   * distribution, and a uniformly-distributed pseudo-random source. 
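
The two loss generators added above drive netem's drop decision from a small Markov chain instead of a single Bernoulli probability: loss_4state() implements the four-state GI model (the fourth state in its comment, successful transmission during a burst period, is TX_IN_BURST_PERIOD in the enum), and loss_gilb_ell() the two-state Gilbert-Elliot model. All parameters are u32 probabilities scaled so that 0xffffffff means 1.0, which is why they are compared directly against prandom_u32(). Below is a minimal userspace sketch of the Gilbert-Elliot variant only, with a trivial xorshift32() standing in for prandom_u32(); it illustrates the state machine, it is not the kernel code itself.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct gilb_ell {
	bool bad;	/* current state: good (false) or bad (true) */
	uint32_t p;	/* a1: P(good -> bad) */
	uint32_t r;	/* a2: P(bad -> good) */
	uint32_t h;	/* a3: P(transmit | bad) */
	uint32_t k1;	/* a4: P(lose | good), i.e. 1 - k */
};

static uint32_t rand32(void)
{
	static uint32_t s = 0x9e3779b9u;	/* xorshift32, demo only */
	s ^= s << 13;
	s ^= s >> 17;
	s ^= s << 5;
	return s;
}

/* mirrors loss_gilb_ell(): returns true if this packet should be lost */
static bool lose_packet(struct gilb_ell *g)
{
	if (!g->bad) {
		if (rand32() < g->p)
			g->bad = true;
		return rand32() < g->k1;
	}
	if (rand32() < g->r)
		g->bad = false;
	return rand32() > g->h;
}

int main(void)
{
	/* 5% chance to enter a bad period, 20% to leave it,
	 * 50% delivery while bad, 1% loss while good
	 */
	struct gilb_ell g = {
		.p  = (uint32_t)(0.05 * 4294967295.0),
		.r  = (uint32_t)(0.20 * 4294967295.0),
		.h  = (uint32_t)(0.50 * 4294967295.0),
		.k1 = (uint32_t)(0.01 * 4294967295.0),
	};
	unsigned long lost = 0, n = 1000000, i;

	for (i = 0; i < n; i++)
		lost += lose_packet(&g);
	printf("lost %lu of %lu packets\n", lost, n);
	return 0;
}

With these numbers the chain spends about p/(p+r) = 20% of the time in the bad state, so the long-run loss rate works out to roughly 0.2*(1-h) + 0.8*k1, i.e. around 10.8%.
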
@@ -146,6 +352,62 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,  	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;  } +static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) +{ +	u64 ticks; + +	len += q->packet_overhead; + +	if (q->cell_size) { +		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); + +		if (len > cells * q->cell_size)	/* extra cell needed for remainder */ +			cells++; +		len = cells * (q->cell_size + q->cell_overhead); +	} + +	ticks = (u64)len * NSEC_PER_SEC; + +	do_div(ticks, q->rate); +	return PSCHED_NS2TICKS(ticks); +} + +static void tfifo_reset(struct Qdisc *sch) +{ +	struct netem_sched_data *q = qdisc_priv(sch); +	struct rb_node *p; + +	while ((p = rb_first(&q->t_root))) { +		struct sk_buff *skb = netem_rb_to_skb(p); + +		rb_erase(p, &q->t_root); +		skb->next = NULL; +		skb->prev = NULL; +		kfree_skb(skb); +	} +} + +static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ +	struct netem_sched_data *q = qdisc_priv(sch); +	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; +	struct rb_node **p = &q->t_root.rb_node, *parent = NULL; + +	while (*p) { +		struct sk_buff *skb; + +		parent = *p; +		skb = netem_rb_to_skb(parent); +		if (tnext >= netem_skb_cb(skb)->time_to_send) +			p = &parent->rb_right; +		else +			p = &parent->rb_left; +	} +	rb_link_node(netem_rb_node(nskb), parent, p); +	rb_insert_color(netem_rb_node(nskb), &q->t_root); +	sch->q.qlen++; +} +  /*   * Insert one skb into qdisc.   * Note: parent depends on return value to account for queue length. @@ -158,26 +420,30 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	/* We don't fill cb now as skb_unshare() may invalidate it */  	struct netem_skb_cb *cb;  	struct sk_buff *skb2; -	int ret;  	int count = 1; -	pr_debug("netem_enqueue skb=%p\n", skb); -  	/* Random duplication */  	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))  		++count; -	/* Random packet drop 0 => none, ~0 => all */ -	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) -		--count; - +	/* Drop packet? */ +	if (loss_event(q)) { +		if (q->ecn && INET_ECN_set_ce(skb)) +			sch->qstats.drops++; /* mark packet */ +		else +			--count; +	}  	if (count == 0) {  		sch->qstats.drops++;  		kfree_skb(skb);  		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;  	} -	skb_orphan(skb); +	/* If a delay is expected, orphan the skb. 
(orphaning usually takes +	 * place at TX completion time, so _before_ the link transit delay) +	 */ +	if (q->latency || q->jitter) +		skb_orphan_partial(skb);  	/*  	 * If we need to duplicate packet, then re-insert at top of the @@ -202,17 +468,21 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {  		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||  		    (skb->ip_summed == CHECKSUM_PARTIAL && -		     skb_checksum_help(skb))) { -			sch->qstats.drops++; -			return NET_XMIT_DROP; -		} +		     skb_checksum_help(skb))) +			return qdisc_drop(skb, sch); -		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); +		skb->data[prandom_u32() % skb_headlen(skb)] ^= +			1<<(prandom_u32() % 8);  	} +	if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) +		return qdisc_reshape_fail(skb, sch); + +	sch->qstats.backlog += qdisc_pkt_len(skb); +  	cb = netem_skb_cb(skb); -	if (q->gap == 0 || 		/* not doing reordering */ -	    q->counter < q->gap || 	/* inside last reordering gap */ +	if (q->gap == 0 ||		/* not doing reordering */ +	    q->counter < q->gap - 1 ||	/* inside last reordering gap */  	    q->reorder < get_crandom(&q->reorder_cor)) {  		psched_time_t now;  		psched_tdiff_t delay; @@ -221,9 +491,32 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)  				  &q->delay_cor, q->delay_dist);  		now = psched_get_time(); + +		if (q->rate) { +			struct sk_buff *last; + +			if (!skb_queue_empty(&sch->q)) +				last = skb_peek_tail(&sch->q); +			else +				last = netem_rb_to_skb(rb_last(&q->t_root)); +			if (last) { +				/* +				 * Last packet in queue is reference point (now), +				 * calculate this time bonus and subtract +				 * from delay. +				 */ +				delay -= netem_skb_cb(last)->time_to_send - now; +				delay = max_t(psched_tdiff_t, 0, delay); +				now = netem_skb_cb(last)->time_to_send; +			} + +			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); +		} +  		cb->time_to_send = now + delay; +		cb->tstamp_save = skb->tstamp;  		++q->counter; -		ret = qdisc_enqueue(skb, q->qdisc); +		tfifo_enqueue(skb, sch);  	} else {  		/*  		 * Do re-ordering by putting one out of N packets at the front @@ -232,33 +525,40 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)  		cb->time_to_send = psched_get_time();  		q->counter = 0; -		__skb_queue_head(&q->qdisc->q, skb); -		q->qdisc->qstats.backlog += qdisc_pkt_len(skb); -		q->qdisc->qstats.requeues++; -		ret = NET_XMIT_SUCCESS; -	} - -	if (likely(ret == NET_XMIT_SUCCESS)) { -		sch->q.qlen++; -		sch->bstats.bytes += qdisc_pkt_len(skb); -		sch->bstats.packets++; -	} else if (net_xmit_drop_count(ret)) { -		sch->qstats.drops++; +		__skb_queue_head(&sch->q, skb); +		sch->qstats.requeues++;  	} -	pr_debug("netem: enqueue ret %d\n", ret); -	return ret; +	return NET_XMIT_SUCCESS;  } -static unsigned int netem_drop(struct Qdisc* sch) +static unsigned int netem_drop(struct Qdisc *sch)  {  	struct netem_sched_data *q = qdisc_priv(sch); -	unsigned int len = 0; +	unsigned int len; -	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { -		sch->q.qlen--; -		sch->qstats.drops++; +	len = qdisc_queue_drop(sch); + +	if (!len) { +		struct rb_node *p = rb_first(&q->t_root); + +		if (p) { +			struct sk_buff *skb = netem_rb_to_skb(p); + +			rb_erase(p, &q->t_root); +			sch->q.qlen--; +			skb->next = NULL; +			skb->prev = NULL; +			len = qdisc_pkt_len(skb); +			sch->qstats.backlog -= len; +			kfree_skb(skb); +		}  	} +	if (!len && q->qdisc && 
q->qdisc->ops->drop) +	    len = q->qdisc->ops->drop(q->qdisc); +	if (len) +		sch->qstats.drops++; +  	return len;  } @@ -266,20 +566,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)  {  	struct netem_sched_data *q = qdisc_priv(sch);  	struct sk_buff *skb; +	struct rb_node *p; -	if (sch->flags & TCQ_F_THROTTLED) +	if (qdisc_is_throttled(sch))  		return NULL; -	skb = q->qdisc->ops->peek(q->qdisc); +tfifo_dequeue: +	skb = __skb_dequeue(&sch->q);  	if (skb) { -		const struct netem_skb_cb *cb = netem_skb_cb(skb); -		psched_time_t now = psched_get_time(); +deliver: +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		qdisc_unthrottled(sch); +		qdisc_bstats_update(sch, skb); +		return skb; +	} +	p = rb_first(&q->t_root); +	if (p) { +		psched_time_t time_to_send; + +		skb = netem_rb_to_skb(p);  		/* if more time remaining? */ -		if (cb->time_to_send <= now) { -			skb = qdisc_dequeue_peeked(q->qdisc); -			if (unlikely(!skb)) -				return NULL; +		time_to_send = netem_skb_cb(skb)->time_to_send; +		if (time_to_send <= psched_get_time()) { +			rb_erase(p, &q->t_root); + +			sch->q.qlen--; +			skb->next = NULL; +			skb->prev = NULL; +			skb->tstamp = netem_skb_cb(skb)->tstamp_save;  #ifdef CONFIG_NET_CLS_ACT  			/* @@ -289,14 +604,34 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)  			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)  				skb->tstamp.tv64 = 0;  #endif -			pr_debug("netem_dequeue: return skb=%p\n", skb); -			sch->q.qlen--; -			return skb; + +			if (q->qdisc) { +				int err = qdisc_enqueue(skb, q->qdisc); + +				if (unlikely(err != NET_XMIT_SUCCESS)) { +					if (net_xmit_drop_count(err)) { +						sch->qstats.drops++; +						qdisc_tree_decrease_qlen(sch, 1); +					} +				} +				goto tfifo_dequeue; +			} +			goto deliver;  		} -		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); +		if (q->qdisc) { +			skb = q->qdisc->ops->dequeue(q->qdisc); +			if (skb) +				goto deliver; +		} +		qdisc_watchdog_schedule(&q->watchdog, time_to_send);  	} +	if (q->qdisc) { +		skb = q->qdisc->ops->dequeue(q->qdisc); +		if (skb) +			goto deliver; +	}  	return NULL;  } @@ -304,11 +639,18 @@ static void netem_reset(struct Qdisc *sch)  {  	struct netem_sched_data *q = qdisc_priv(sch); -	qdisc_reset(q->qdisc); -	sch->q.qlen = 0; +	qdisc_reset_queue(sch); +	tfifo_reset(sch); +	if (q->qdisc) +		qdisc_reset(q->qdisc);  	qdisc_watchdog_cancel(&q->watchdog);  } +static void dist_free(struct disttable *d) +{ +	kvfree(d); +} +  /*   * Distribution data is a variable size payload containing   * signed 16 bit values. 
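
The hunk that follows reworks get_dist_table(), which loads this table of signed 16-bit samples; tabledist() (whose return statement appears as context above) then picks one entry with a correlated random value and rescales it around the configured mean and jitter. A small userspace rendering of that arithmetic is sketched below, assuming NETEM_DIST_SCALE has its usual UAPI value of 8192; like the kernel helper, it falls back to a uniform distribution when no table was loaded.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define DIST_SCALE 8192		/* assumed to match NETEM_DIST_SCALE */

/* same split/rounding as the kernel's tabledist() to avoid overflow */
static long tabledist(long mu, long sigma, const int16_t *table, size_t size,
		      uint32_t rnd)
{
	long t, x;

	if (sigma == 0)
		return mu;
	if (!table)	/* no table loaded: uniform in [mu - sigma, mu + sigma) */
		return (long)(rnd % (uint32_t)(2 * sigma)) - sigma + mu;

	t = table[rnd % size];
	x = (sigma % DIST_SCALE) * t;
	x += (x >= 0) ? DIST_SCALE / 2 : -DIST_SCALE / 2;
	return x / DIST_SCALE + (sigma / DIST_SCALE) * t + mu;
}

int main(void)
{
	/* toy "distribution": three equally likely samples at -1, 0 and +1 sigma */
	const int16_t table[] = { -DIST_SCALE, 0, DIST_SCALE };
	long mu = 100000, sigma = 10000;	/* e.g. 100ms +/- 10ms, in usec */
	uint32_t rnd;

	for (rnd = 0; rnd < 3; rnd++)
		printf("delay = %ld us\n", tabledist(mu, sigma, table, 3, rnd));
	return 0;
}

With the toy three-entry table the program prints 90000, 100000 and 110000, i.e. mu - sigma, mu and mu + sigma; a real table shipped by tc (pareto, paretonormal, experimental) simply has many more samples drawn from the desired shape, normalised to a standard deviation of NETEM_DIST_SCALE.
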
@@ -316,16 +658,20 @@ static void netem_reset(struct Qdisc *sch)  static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)  {  	struct netem_sched_data *q = qdisc_priv(sch); -	unsigned long n = nla_len(attr)/sizeof(__s16); +	size_t n = nla_len(attr)/sizeof(__s16);  	const __s16 *data = nla_data(attr);  	spinlock_t *root_lock;  	struct disttable *d;  	int i; +	size_t s; -	if (n > 65536) +	if (n > NETEM_DIST_MAX)  		return -EINVAL; -	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); +	s = sizeof(struct disttable) + n * sizeof(s16); +	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); +	if (!d) +		d = vmalloc(s);  	if (!d)  		return -ENOMEM; @@ -336,15 +682,15 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)  	root_lock = qdisc_root_sleeping_lock(sch);  	spin_lock_bh(root_lock); -	kfree(q->delay_dist); -	q->delay_dist = d; +	swap(q->delay_dist, d);  	spin_unlock_bh(root_lock); + +	dist_free(d);  	return 0;  } -static void get_correlation(struct Qdisc *sch, const struct nlattr *attr) +static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct tc_netem_corr *c = nla_data(attr);  	init_crandom(&q->delay_cor, c->delay_corr); @@ -352,28 +698,98 @@ static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)  	init_crandom(&q->dup_cor, c->dup_corr);  } -static void get_reorder(struct Qdisc *sch, const struct nlattr *attr) +static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct tc_netem_reorder *r = nla_data(attr);  	q->reorder = r->probability;  	init_crandom(&q->reorder_cor, r->correlation);  } -static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) +static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)  { -	struct netem_sched_data *q = qdisc_priv(sch);  	const struct tc_netem_corrupt *r = nla_data(attr);  	q->corrupt = r->probability;  	init_crandom(&q->corrupt_cor, r->correlation);  } +static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) +{ +	const struct tc_netem_rate *r = nla_data(attr); + +	q->rate = r->rate; +	q->packet_overhead = r->packet_overhead; +	q->cell_size = r->cell_size; +	q->cell_overhead = r->cell_overhead; +	if (q->cell_size) +		q->cell_size_reciprocal = reciprocal_value(q->cell_size); +	else +		q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; +} + +static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) +{ +	const struct nlattr *la; +	int rem; + +	nla_for_each_nested(la, attr, rem) { +		u16 type = nla_type(la); + +		switch (type) { +		case NETEM_LOSS_GI: { +			const struct tc_netem_gimodel *gi = nla_data(la); + +			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { +				pr_info("netem: incorrect gi model size\n"); +				return -EINVAL; +			} + +			q->loss_model = CLG_4_STATES; + +			q->clg.state = TX_IN_GAP_PERIOD; +			q->clg.a1 = gi->p13; +			q->clg.a2 = gi->p31; +			q->clg.a3 = gi->p32; +			q->clg.a4 = gi->p14; +			q->clg.a5 = gi->p23; +			break; +		} + +		case NETEM_LOSS_GE: { +			const struct tc_netem_gemodel *ge = nla_data(la); + +			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { +				pr_info("netem: incorrect ge model size\n"); +				return -EINVAL; +			} + +			q->loss_model = CLG_GILB_ELL; +			q->clg.state = GOOD_STATE; +			q->clg.a1 = ge->p; +			q->clg.a2 = ge->r; +			q->clg.a3 = ge->h; +			q->clg.a4 = ge->k1; +			break; +		} + +		
default: +			pr_info("netem: unknown loss type %u\n", type); +			return -EINVAL; +		} +	} + +	return 0; +} +  static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {  	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },  	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },  	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) }, +	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) }, +	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED }, +	[TCA_NETEM_ECN]		= { .type = NLA_U32 }, +	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },  };  static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -381,11 +797,15 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,  {  	int nested_len = nla_len(nla) - NLA_ALIGN(len); -	if (nested_len < 0) +	if (nested_len < 0) { +		pr_info("netem: invalid attributes len %d\n", nested_len);  		return -EINVAL; +	} +  	if (nested_len >= nla_attr_size(0))  		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),  				 nested_len, policy); +  	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));  	return 0;  } @@ -396,6 +816,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)  	struct netem_sched_data *q = qdisc_priv(sch);  	struct nlattr *tb[TCA_NETEM_MAX + 1];  	struct tc_netem_qopt *qopt; +	struct clgstate old_clg; +	int old_loss_model = CLG_RANDOM;  	int ret;  	if (opt == NULL) @@ -406,12 +828,35 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)  	if (ret < 0)  		return ret; -	ret = fifo_set_limit(q->qdisc, qopt->limit); -	if (ret) { -		pr_debug("netem: can't set fifo limit\n"); -		return ret; +	/* backup q->clg and q->loss_model */ +	old_clg = q->clg; +	old_loss_model = q->loss_model; + +	if (tb[TCA_NETEM_LOSS]) { +		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); +		if (ret) { +			q->loss_model = old_loss_model; +			return ret; +		} +	} else { +		q->loss_model = CLG_RANDOM;  	} +	if (tb[TCA_NETEM_DELAY_DIST]) { +		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); +		if (ret) { +			/* recover clg and loss_model, in case of +			 * q->clg and q->loss_model were modified +			 * in get_loss_clg() +			 */ +			q->clg = old_clg; +			q->loss_model = old_loss_model; +			return ret; +		} +	} + +	sch->limit = qopt->limit; +  	q->latency = qopt->latency;  	q->jitter = qopt->jitter;  	q->limit = qopt->limit; @@ -427,107 +872,27 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)  		q->reorder = ~0;  	if (tb[TCA_NETEM_CORR]) -		get_correlation(sch, tb[TCA_NETEM_CORR]); - -	if (tb[TCA_NETEM_DELAY_DIST]) { -		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); -		if (ret) -			return ret; -	} +		get_correlation(q, tb[TCA_NETEM_CORR]);  	if (tb[TCA_NETEM_REORDER]) -		get_reorder(sch, tb[TCA_NETEM_REORDER]); +		get_reorder(q, tb[TCA_NETEM_REORDER]);  	if (tb[TCA_NETEM_CORRUPT]) -		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); +		get_corrupt(q, tb[TCA_NETEM_CORRUPT]); -	return 0; -} +	if (tb[TCA_NETEM_RATE]) +		get_rate(q, tb[TCA_NETEM_RATE]); -/* - * Special case version of FIFO queue for use by netem. 
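get_loss_clg() above only parses the two correlated-loss models; the per-packet drop decision lives elsewhere in the file. As a rough userspace illustration of the Gilbert-Elliott scheme behind NETEM_LOSS_GE (two states, transition probabilities p and r, and a loss probability per state), here is a hedged sketch; how netem maps h and k1 onto the per-state loss probabilities is not shown in this hunk, so the field names below are generic:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

enum { GOOD_STATE, BAD_STATE };

struct ge_model {
	int state;
	uint32_t p;		/* GOOD -> BAD transition probability */
	uint32_t r;		/* BAD -> GOOD transition probability */
	uint32_t loss_good;	/* loss probability while in GOOD */
	uint32_t loss_bad;	/* loss probability while in BAD */
};

/* Crude 32-bit random source, good enough for a sketch; probabilities
 * are scaled to the full u32 range, as in the qdisc.
 */
static uint32_t rnd32(void)
{
	return ((uint32_t)rand() << 16) ^ (uint32_t)rand();
}

/* One per-packet step: maybe change state, then decide drop or keep. */
static bool ge_step(struct ge_model *g)
{
	if (g->state == GOOD_STATE) {
		if (rnd32() < g->p)
			g->state = BAD_STATE;
		return rnd32() < g->loss_good;
	}
	if (rnd32() < g->r)
		g->state = GOOD_STATE;
	return rnd32() < g->loss_bad;
}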
- * It queues in order based on timestamps in skb's - */ -struct fifo_sched_data { -	u32 limit; -	psched_time_t oldest; -}; - -static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) -{ -	struct fifo_sched_data *q = qdisc_priv(sch); -	struct sk_buff_head *list = &sch->q; -	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; -	struct sk_buff *skb; - -	if (likely(skb_queue_len(list) < q->limit)) { -		/* Optimize for add at tail */ -		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) { -			q->oldest = tnext; -			return qdisc_enqueue_tail(nskb, sch); -		} - -		skb_queue_reverse_walk(list, skb) { -			const struct netem_skb_cb *cb = netem_skb_cb(skb); - -			if (tnext >= cb->time_to_send) -				break; -		} - -		__skb_queue_after(list, skb, nskb); - -		sch->qstats.backlog += qdisc_pkt_len(nskb); -		sch->bstats.bytes += qdisc_pkt_len(nskb); -		sch->bstats.packets++; - -		return NET_XMIT_SUCCESS; -	} - -	return qdisc_reshape_fail(nskb, sch); -} - -static int tfifo_init(struct Qdisc *sch, struct nlattr *opt) -{ -	struct fifo_sched_data *q = qdisc_priv(sch); - -	if (opt) { -		struct tc_fifo_qopt *ctl = nla_data(opt); -		if (nla_len(opt) < sizeof(*ctl)) -			return -EINVAL; - -		q->limit = ctl->limit; -	} else -		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); - -	q->oldest = PSCHED_PASTPERFECT; -	return 0; -} - -static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) -{ -	struct fifo_sched_data *q = qdisc_priv(sch); -	struct tc_fifo_qopt opt = { .limit = q->limit }; +	if (tb[TCA_NETEM_RATE64]) +		q->rate = max_t(u64, q->rate, +				nla_get_u64(tb[TCA_NETEM_RATE64])); -	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); -	return skb->len; +	if (tb[TCA_NETEM_ECN]) +		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); -nla_put_failure: -	return -1; +	return ret;  } -static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = { -	.id		=	"tfifo", -	.priv_size	=	sizeof(struct fifo_sched_data), -	.enqueue	=	tfifo_enqueue, -	.dequeue	=	qdisc_dequeue_head, -	.peek		=	qdisc_peek_head, -	.drop		=	qdisc_queue_drop, -	.init		=	tfifo_init, -	.reset		=	qdisc_reset_queue, -	.change		=	tfifo_init, -	.dump		=	tfifo_dump, -}; -  static int netem_init(struct Qdisc *sch, struct nlattr *opt)  {  	struct netem_sched_data *q = qdisc_priv(sch); @@ -538,18 +903,10 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)  	qdisc_watchdog_init(&q->watchdog, sch); -	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops, -				     TC_H_MAKE(sch->handle, 1)); -	if (!q->qdisc) { -		pr_debug("netem: qdisc create failed\n"); -		return -ENOMEM; -	} - +	q->loss_model = CLG_RANDOM;  	ret = netem_change(sch, opt); -	if (ret) { -		pr_debug("netem: change failed\n"); -		qdisc_destroy(q->qdisc); -	} +	if (ret) +		pr_info("netem: change failed\n");  	return ret;  } @@ -558,19 +915,70 @@ static void netem_destroy(struct Qdisc *sch)  	struct netem_sched_data *q = qdisc_priv(sch);  	qdisc_watchdog_cancel(&q->watchdog); -	qdisc_destroy(q->qdisc); -	kfree(q->delay_dist); +	if (q->qdisc) +		qdisc_destroy(q->qdisc); +	dist_free(q->delay_dist); +} + +static int dump_loss_model(const struct netem_sched_data *q, +			   struct sk_buff *skb) +{ +	struct nlattr *nest; + +	nest = nla_nest_start(skb, TCA_NETEM_LOSS); +	if (nest == NULL) +		goto nla_put_failure; + +	switch (q->loss_model) { +	case CLG_RANDOM: +		/* legacy loss model */ +		nla_nest_cancel(skb, nest); +		return 0;	/* no data */ + +	case CLG_4_STATES: { +		struct tc_netem_gimodel gi = { +			.p13 = q->clg.a1, +			.p31 = q->clg.a2, +			.p32 = q->clg.a3, +			.p14 = 
q->clg.a4, +			.p23 = q->clg.a5, +		}; + +		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) +			goto nla_put_failure; +		break; +	} +	case CLG_GILB_ELL: { +		struct tc_netem_gemodel ge = { +			.p = q->clg.a1, +			.r = q->clg.a2, +			.h = q->clg.a3, +			.k1 = q->clg.a4, +		}; + +		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) +			goto nla_put_failure; +		break; +	} +	} + +	nla_nest_end(skb, nest); +	return 0; + +nla_put_failure: +	nla_nest_cancel(skb, nest); +	return -1;  }  static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)  {  	const struct netem_sched_data *q = qdisc_priv(sch); -	unsigned char *b = skb_tail_pointer(skb); -	struct nlattr *nla = (struct nlattr *) b; +	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);  	struct tc_netem_qopt qopt;  	struct tc_netem_corr cor;  	struct tc_netem_reorder reorder;  	struct tc_netem_corrupt corrupt; +	struct tc_netem_rate rate;  	qopt.latency = q->latency;  	qopt.jitter = q->jitter; @@ -578,32 +986,121 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)  	qopt.loss = q->loss;  	qopt.gap = q->gap;  	qopt.duplicate = q->duplicate; -	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); +	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) +		goto nla_put_failure;  	cor.delay_corr = q->delay_cor.rho;  	cor.loss_corr = q->loss_cor.rho;  	cor.dup_corr = q->dup_cor.rho; -	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); +	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) +		goto nla_put_failure;  	reorder.probability = q->reorder;  	reorder.correlation = q->reorder_cor.rho; -	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); +	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) +		goto nla_put_failure;  	corrupt.probability = q->corrupt;  	corrupt.correlation = q->corrupt_cor.rho; -	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); +	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) +		goto nla_put_failure; -	nla->nla_len = skb_tail_pointer(skb) - b; +	if (q->rate >= (1ULL << 32)) { +		if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate)) +			goto nla_put_failure; +		rate.rate = ~0U; +	} else { +		rate.rate = q->rate; +	} +	rate.packet_overhead = q->packet_overhead; +	rate.cell_size = q->cell_size; +	rate.cell_overhead = q->cell_overhead; +	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) +		goto nla_put_failure; + +	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) +		goto nla_put_failure; + +	if (dump_loss_model(q, skb) != 0) +		goto nla_put_failure; -	return skb->len; +	return nla_nest_end(skb, nla);  nla_put_failure: -	nlmsg_trim(skb, b); +	nlmsg_trim(skb, nla);  	return -1;  } +static int netem_dump_class(struct Qdisc *sch, unsigned long cl, +			  struct sk_buff *skb, struct tcmsg *tcm) +{ +	struct netem_sched_data *q = qdisc_priv(sch); + +	if (cl != 1 || !q->qdisc) 	/* only one class */ +		return -ENOENT; + +	tcm->tcm_handle |= TC_H_MIN(1); +	tcm->tcm_info = q->qdisc->handle; + +	return 0; +} + +static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, +		     struct Qdisc **old) +{ +	struct netem_sched_data *q = qdisc_priv(sch); + +	sch_tree_lock(sch); +	*old = q->qdisc; +	q->qdisc = new; +	if (*old) { +		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); +		qdisc_reset(*old); +	} +	sch_tree_unlock(sch); + +	return 0; +} + +static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) +{ +	struct netem_sched_data *q = qdisc_priv(sch); +	return q->qdisc; +} + +static unsigned long netem_get(struct Qdisc *sch, u32 classid) +{ +	
return 1; +} + +static void netem_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ +	if (!walker->stop) { +		if (walker->count >= walker->skip) +			if (walker->fn(sch, 1, walker) < 0) { +				walker->stop = 1; +				return; +			} +		walker->count++; +	} +} + +static const struct Qdisc_class_ops netem_class_ops = { +	.graft		=	netem_graft, +	.leaf		=	netem_leaf, +	.get		=	netem_get, +	.put		=	netem_put, +	.walk		=	netem_walk, +	.dump		=	netem_dump_class, +}; +  static struct Qdisc_ops netem_qdisc_ops __read_mostly = {  	.id		=	"netem", +	.cl_ops		=	&netem_class_ops,  	.priv_size	=	sizeof(struct netem_sched_data),  	.enqueue	=	netem_enqueue,  	.dequeue	=	netem_dequeue, diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c new file mode 100644 index 00000000000..fefeeb73f15 --- /dev/null +++ b/net/sched/sch_pie.c @@ -0,0 +1,566 @@ +/* Copyright (C) 2013 Cisco Systems, Inc, 2013. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * Author: Vijay Subramanian <vijaynsu@cisco.com> + * Author: Mythili Prabhu <mysuryan@cisco.com> + * + * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> + * University of Oslo, Norway. + * + * References: + * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 + * IEEE  Conference on High Performance Switching and Routing 2013 : + * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +#define QUEUE_THRESHOLD 10000 +#define DQCOUNT_INVALID -1 +#define MAX_PROB  0xffffffff +#define PIE_SCALE 8 + +/* parameters used */ +struct pie_params { +	psched_time_t target;	/* user specified target delay in pschedtime */ +	u32 tupdate;		/* timer frequency (in jiffies) */ +	u32 limit;		/* number of packets that can be enqueued */ +	u32 alpha;		/* alpha and beta are between 0 and 32 */ +	u32 beta;		/* and are used for shift relative to 1 */ +	bool ecn;		/* true if ecn is enabled */ +	bool bytemode;		/* to scale drop early prob based on pkt size */ +}; + +/* variables used */ +struct pie_vars { +	u32 prob;		/* probability but scaled by u32 limit. 
*/ +	psched_time_t burst_time; +	psched_time_t qdelay; +	psched_time_t qdelay_old; +	u64 dq_count;		/* measured in bytes */ +	psched_time_t dq_tstamp;	/* drain rate */ +	u32 avg_dq_rate;	/* bytes per pschedtime tick,scaled */ +	u32 qlen_old;		/* in bytes */ +}; + +/* statistics gathering */ +struct pie_stats { +	u32 packets_in;		/* total number of packets enqueued */ +	u32 dropped;		/* packets dropped due to pie_action */ +	u32 overlimit;		/* dropped due to lack of space in queue */ +	u32 maxq;		/* maximum queue size */ +	u32 ecn_mark;		/* packets marked with ECN */ +}; + +/* private data for the Qdisc */ +struct pie_sched_data { +	struct pie_params params; +	struct pie_vars vars; +	struct pie_stats stats; +	struct timer_list adapt_timer; +}; + +static void pie_params_init(struct pie_params *params) +{ +	params->alpha = 2; +	params->beta = 20; +	params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC);	/* 30 ms */ +	params->limit = 1000;	/* default of 1000 packets */ +	params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC);	/* 20 ms */ +	params->ecn = false; +	params->bytemode = false; +} + +static void pie_vars_init(struct pie_vars *vars) +{ +	vars->dq_count = DQCOUNT_INVALID; +	vars->avg_dq_rate = 0; +	/* default of 100 ms in pschedtime */ +	vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); +} + +static bool drop_early(struct Qdisc *sch, u32 packet_size) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	u32 rnd; +	u32 local_prob = q->vars.prob; +	u32 mtu = psched_mtu(qdisc_dev(sch)); + +	/* If there is still burst allowance left skip random early drop */ +	if (q->vars.burst_time > 0) +		return false; + +	/* If current delay is less than half of target, and +	 * if drop prob is low already, disable early_drop +	 */ +	if ((q->vars.qdelay < q->params.target / 2) +	    && (q->vars.prob < MAX_PROB / 5)) +		return false; + +	/* If we have fewer than 2 mtu-sized packets, disable drop_early, +	 * similar to min_th in RED +	 */ +	if (sch->qstats.backlog < 2 * mtu) +		return false; + +	/* If bytemode is turned on, use packet size to compute new +	 * probablity. Smaller packets will have lower drop prob in this case +	 */ +	if (q->params.bytemode && packet_size <= mtu) +		local_prob = (local_prob / mtu) * packet_size; +	else +		local_prob = q->vars.prob; + +	rnd = prandom_u32(); +	if (rnd < local_prob) +		return true; + +	return false; +} + +static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	bool enqueue = false; + +	if (unlikely(qdisc_qlen(sch) >= sch->limit)) { +		q->stats.overlimit++; +		goto out; +	} + +	if (!drop_early(sch, skb->len)) { +		enqueue = true; +	} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && +		   INET_ECN_set_ce(skb)) { +		/* If packet is ecn capable, mark it if drop probability +		 * is lower than 10%, else drop it. 
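The bytemode branch of drop_early() above scales the drop probability by packet size relative to the device MTU, so small packets are dropped (or ECN-marked) proportionally less often. A quick worked instance, assuming a 1500-byte MTU:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t prob = 0x20000000u;	/* ~12.5% of MAX_PROB (0xffffffff) */
	uint32_t mtu = 1500, pkt_len = 300;
	uint32_t local_prob = (prob / mtu) * pkt_len;

	/* A 300-byte packet sees roughly one fifth of the full-size rate. */
	printf("full %u scaled %u\n", prob, local_prob);
	return 0;
}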
+		 */ +		q->stats.ecn_mark++; +		enqueue = true; +	} + +	/* we can enqueue the packet */ +	if (enqueue) { +		q->stats.packets_in++; +		if (qdisc_qlen(sch) > q->stats.maxq) +			q->stats.maxq = qdisc_qlen(sch); + +		return qdisc_enqueue_tail(skb, sch); +	} + +out: +	q->stats.dropped++; +	return qdisc_drop(skb, sch); +} + +static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { +	[TCA_PIE_TARGET] = {.type = NLA_U32}, +	[TCA_PIE_LIMIT] = {.type = NLA_U32}, +	[TCA_PIE_TUPDATE] = {.type = NLA_U32}, +	[TCA_PIE_ALPHA] = {.type = NLA_U32}, +	[TCA_PIE_BETA] = {.type = NLA_U32}, +	[TCA_PIE_ECN] = {.type = NLA_U32}, +	[TCA_PIE_BYTEMODE] = {.type = NLA_U32}, +}; + +static int pie_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	struct nlattr *tb[TCA_PIE_MAX + 1]; +	unsigned int qlen; +	int err; + +	if (!opt) +		return -EINVAL; + +	err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); +	if (err < 0) +		return err; + +	sch_tree_lock(sch); + +	/* convert from microseconds to pschedtime */ +	if (tb[TCA_PIE_TARGET]) { +		/* target is in us */ +		u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); + +		/* convert to pschedtime */ +		q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); +	} + +	/* tupdate is in jiffies */ +	if (tb[TCA_PIE_TUPDATE]) +		q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + +	if (tb[TCA_PIE_LIMIT]) { +		u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); + +		q->params.limit = limit; +		sch->limit = limit; +	} + +	if (tb[TCA_PIE_ALPHA]) +		q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + +	if (tb[TCA_PIE_BETA]) +		q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + +	if (tb[TCA_PIE_ECN]) +		q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + +	if (tb[TCA_PIE_BYTEMODE]) +		q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + +	/* Drop excess packets if new limit is lower */ +	qlen = sch->q.qlen; +	while (sch->q.qlen > sch->limit) { +		struct sk_buff *skb = __skb_dequeue(&sch->q); + +		sch->qstats.backlog -= qdisc_pkt_len(skb); +		qdisc_drop(skb, sch); +	} +	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + +	sch_tree_unlock(sch); +	return 0; +} + +static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) +{ + +	struct pie_sched_data *q = qdisc_priv(sch); +	int qlen = sch->qstats.backlog;	/* current queue size in bytes */ + +	/* If current queue is about 10 packets or more and dq_count is unset +	 * we have enough packets to calculate the drain rate. Save +	 * current time as dq_tstamp and start measurement cycle. +	 */ +	if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { +		q->vars.dq_tstamp = psched_get_time(); +		q->vars.dq_count = 0; +	} + +	/* Calculate the average drain rate from this value.  If queue length +	 * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset +	 * the dq_count to -1 as we don't have enough packets to calculate the +	 * drain rate anymore The following if block is entered only when we +	 * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) +	 * and we calculate the drain rate for the threshold here.  dq_count is +	 * in bytes, time difference in psched_time, hence rate is in +	 * bytes/psched_time. 
+	 */ +	if (q->vars.dq_count != DQCOUNT_INVALID) { +		q->vars.dq_count += skb->len; + +		if (q->vars.dq_count >= QUEUE_THRESHOLD) { +			psched_time_t now = psched_get_time(); +			u32 dtime = now - q->vars.dq_tstamp; +			u32 count = q->vars.dq_count << PIE_SCALE; + +			if (dtime == 0) +				return; + +			count = count / dtime; + +			if (q->vars.avg_dq_rate == 0) +				q->vars.avg_dq_rate = count; +			else +				q->vars.avg_dq_rate = +				    (q->vars.avg_dq_rate - +				     (q->vars.avg_dq_rate >> 3)) + (count >> 3); + +			/* If the queue has receded below the threshold, we hold +			 * on to the last drain rate calculated, else we reset +			 * dq_count to 0 to re-enter the if block when the next +			 * packet is dequeued +			 */ +			if (qlen < QUEUE_THRESHOLD) +				q->vars.dq_count = DQCOUNT_INVALID; +			else { +				q->vars.dq_count = 0; +				q->vars.dq_tstamp = psched_get_time(); +			} + +			if (q->vars.burst_time > 0) { +				if (q->vars.burst_time > dtime) +					q->vars.burst_time -= dtime; +				else +					q->vars.burst_time = 0; +			} +		} +	} +} + +static void calculate_probability(struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	u32 qlen = sch->qstats.backlog;	/* queue size in bytes */ +	psched_time_t qdelay = 0;	/* in pschedtime */ +	psched_time_t qdelay_old = q->vars.qdelay;	/* in pschedtime */ +	s32 delta = 0;		/* determines the change in probability */ +	u32 oldprob; +	u32 alpha, beta; +	bool update_prob = true; + +	q->vars.qdelay_old = q->vars.qdelay; + +	if (q->vars.avg_dq_rate > 0) +		qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; +	else +		qdelay = 0; + +	/* If qdelay is zero and qlen is not, it means qlen is very small, less +	 * than dequeue_rate, so we do not update probabilty in this round +	 */ +	if (qdelay == 0 && qlen != 0) +		update_prob = false; + +	/* In the algorithm, alpha and beta are between 0 and 2 with typical +	 * value for alpha as 0.125. In this implementation, we use values 0-32 +	 * passed from user space to represent this. Also, alpha and beta have +	 * unit of HZ and need to be scaled before they can used to update +	 * probability. alpha/beta are updated locally below by 1) scaling them +	 * appropriately 2) scaling down by 16 to come to 0-2 range. +	 * Please see paper for details. +	 * +	 * We scale alpha and beta differently depending on whether we are in +	 * light, medium or high dropping mode. 
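pie_process_dequeue() above estimates the drain rate only while at least QUEUE_THRESHOLD bytes are queued: it counts dequeued bytes, divides by the elapsed psched time, and folds the result into an exponentially weighted average with weight 1/8. The same arithmetic pulled out into a standalone helper (a sketch of the hunk above, not kernel code):

#include <stdint.h>

#define PIE_SCALE 8

/* avg and the return value are in bytes per psched tick, scaled by
 * 2^PIE_SCALE; dq_bytes is what was dequeued over dtime_ticks of
 * psched time.
 */
static uint32_t update_avg_dq_rate(uint32_t avg, uint64_t dq_bytes,
				   uint32_t dtime_ticks)
{
	uint32_t count;

	if (dtime_ticks == 0)
		return avg;		/* too soon to measure */

	count = (uint32_t)((dq_bytes << PIE_SCALE) / dtime_ticks);
	if (avg == 0)
		return count;		/* first sample seeds the average */
	return (avg - (avg >> 3)) + (count >> 3);
}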
+	 */ +	if (q->vars.prob < MAX_PROB / 100) { +		alpha = +		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; +		beta = +		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; +	} else if (q->vars.prob < MAX_PROB / 10) { +		alpha = +		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; +		beta = +		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; +	} else { +		alpha = +		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; +		beta = +		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; +	} + +	/* alpha and beta should be between 0 and 32, in multiples of 1/16 */ +	delta += alpha * ((qdelay - q->params.target)); +	delta += beta * ((qdelay - qdelay_old)); + +	oldprob = q->vars.prob; + +	/* to ensure we increase probability in steps of no more than 2% */ +	if (delta > (s32) (MAX_PROB / (100 / 2)) && +	    q->vars.prob >= MAX_PROB / 10) +		delta = (MAX_PROB / 100) * 2; + +	/* Non-linear drop: +	 * Tune drop probability to increase quickly for high delays(>= 250ms) +	 * 250ms is derived through experiments and provides error protection +	 */ + +	if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) +		delta += MAX_PROB / (100 / 2); + +	q->vars.prob += delta; + +	if (delta > 0) { +		/* prevent overflow */ +		if (q->vars.prob < oldprob) { +			q->vars.prob = MAX_PROB; +			/* Prevent normalization error. If probability is at +			 * maximum value already, we normalize it here, and +			 * skip the check to do a non-linear drop in the next +			 * section. +			 */ +			update_prob = false; +		} +	} else { +		/* prevent underflow */ +		if (q->vars.prob > oldprob) +			q->vars.prob = 0; +	} + +	/* Non-linear drop in probability: Reduce drop probability quickly if +	 * delay is 0 for 2 consecutive Tupdate periods. +	 */ + +	if ((qdelay == 0) && (qdelay_old == 0) && update_prob) +		q->vars.prob = (q->vars.prob * 98) / 100; + +	q->vars.qdelay = qdelay; +	q->vars.qlen_old = qlen; + +	/* We restart the measurement cycle if the following conditions are met +	 * 1. If the delay has been low for 2 consecutive Tupdate periods +	 * 2. Calculated drop probability is zero +	 * 3. We have atleast one estimate for the avg_dq_rate ie., +	 *    is a non-zero value +	 */ +	if ((q->vars.qdelay < q->params.target / 2) && +	    (q->vars.qdelay_old < q->params.target / 2) && +	    (q->vars.prob == 0) && +	    (q->vars.avg_dq_rate > 0)) +		pie_vars_init(&q->vars); +} + +static void pie_timer(unsigned long arg) +{ +	struct Qdisc *sch = (struct Qdisc *)arg; +	struct pie_sched_data *q = qdisc_priv(sch); +	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + +	spin_lock(root_lock); +	calculate_probability(sch); + +	/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. 
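calculate_probability() above is a proportional-integral step: the drop probability moves by alpha times the deviation of the current queueing delay from the target, plus beta times the change in delay since the last update, with alpha and beta pre-scaled according to the current drop regime. Stripped of the kernel's scaling and overflow handling, the core update looks roughly like this (a sketch, not the exact arithmetic):

#include <stdint.h>

/* alpha and beta are assumed to be already converted to probability
 * units per delay tick; the kernel derives them from the 0-32 values
 * configured over netlink.
 */
static uint32_t pie_update_prob(uint32_t prob, int64_t qdelay,
				int64_t qdelay_old, int64_t target,
				int64_t alpha, int64_t beta)
{
	int64_t delta = alpha * (qdelay - target) +
			beta  * (qdelay - qdelay_old);
	int64_t next = (int64_t)prob + delta;

	if (next < 0)			/* clamp instead of wrapping */
		next = 0;
	if (next > (int64_t)UINT32_MAX)
		next = UINT32_MAX;
	return (uint32_t)next;
}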
*/ +	if (q->params.tupdate) +		mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); +	spin_unlock(root_lock); + +} + +static int pie_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct pie_sched_data *q = qdisc_priv(sch); + +	pie_params_init(&q->params); +	pie_vars_init(&q->vars); +	sch->limit = q->params.limit; + +	setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); +	mod_timer(&q->adapt_timer, jiffies + HZ / 2); + +	if (opt) { +		int err = pie_change(sch, opt); + +		if (err) +			return err; +	} + +	return 0; +} + +static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; + +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; + +	/* convert target from pschedtime to us */ +	if (nla_put_u32(skb, TCA_PIE_TARGET, +			((u32) PSCHED_TICKS2NS(q->params.target)) / +			NSEC_PER_USEC) || +	    nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || +	    nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || +	    nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || +	    nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || +	    nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || +	    nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) +		goto nla_put_failure; + +	return nla_nest_end(skb, opts); + +nla_put_failure: +	nla_nest_cancel(skb, opts); +	return -1; + +} + +static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	struct tc_pie_xstats st = { +		.prob		= q->vars.prob, +		.delay		= ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / +				   NSEC_PER_USEC, +		/* unscale and return dq_rate in bytes per sec */ +		.avg_dq_rate	= q->vars.avg_dq_rate * +				  (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, +		.packets_in	= q->stats.packets_in, +		.overlimit	= q->stats.overlimit, +		.maxq		= q->stats.maxq, +		.dropped	= q->stats.dropped, +		.ecn_mark	= q->stats.ecn_mark, +	}; + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) +{ +	struct sk_buff *skb; +	skb = __qdisc_dequeue_head(sch, &sch->q); + +	if (!skb) +		return NULL; + +	pie_process_dequeue(sch, skb); +	return skb; +} + +static void pie_reset(struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	qdisc_reset_queue(sch); +	pie_vars_init(&q->vars); +} + +static void pie_destroy(struct Qdisc *sch) +{ +	struct pie_sched_data *q = qdisc_priv(sch); +	q->params.tupdate = 0; +	del_timer_sync(&q->adapt_timer); +} + +static struct Qdisc_ops pie_qdisc_ops __read_mostly = { +	.id = "pie", +	.priv_size	= sizeof(struct pie_sched_data), +	.enqueue	= pie_qdisc_enqueue, +	.dequeue	= pie_qdisc_dequeue, +	.peek		= qdisc_peek_dequeued, +	.init		= pie_init, +	.destroy	= pie_destroy, +	.reset		= pie_reset, +	.change		= pie_change, +	.dump		= pie_dump, +	.dump_stats	= pie_dump_stats, +	.owner		= THIS_MODULE, +}; + +static int __init pie_module_init(void) +{ +	return register_qdisc(&pie_qdisc_ops); +} + +static void __exit pie_module_exit(void) +{ +	unregister_qdisc(&pie_qdisc_ops); +} + +module_init(pie_module_init); +module_exit(pie_module_exit); + +MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); +MODULE_AUTHOR("Vijay Subramanian"); +MODULE_AUTHOR("Mythili Prabhu"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 00000000000..89f8fcf73f1 --- /dev/null +++ b/net/sched/sch_plug.c @@ -0,0 +1,233 @@ +/* + * sch_plug.c Queue 
traffic until an explicit release command + * + *             This program is free software; you can redistribute it and/or + *             modify it under the terms of the GNU General Public License + *             as published by the Free Software Foundation; either version + *             2 of the License, or (at your option) any later version. + * + * There are two ways to use this qdisc: + * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating + *    sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. + * + * 2. For network output buffering (a.k.a output commit) functionality. + *    Output commit property is commonly used by applications using checkpoint + *    based fault-tolerance to ensure that the checkpoint from which a system + *    is being restored is consistent w.r.t outside world. + * + *    Consider for e.g. Remus - a Virtual Machine checkpointing system, + *    wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated + *    asynchronously to the backup host, while the VM continues executing the + *    next epoch speculatively. + * + *    The following is a typical sequence of output buffer operations: + *       1.At epoch i, start_buffer(i) + *       2. At end of epoch i (i.e. after 50ms): + *          2.1 Stop VM and take checkpoint(i). + *          2.2 start_buffer(i+1) and Resume VM + *       3. While speculatively executing epoch(i+1), asynchronously replicate + *          checkpoint(i) to backup host. + *       4. When checkpoint_ack(i) is received from backup, release_buffer(i) + *    Thus, this Qdisc would receive the following sequence of commands: + *       TCQ_PLUG_BUFFER (epoch i) + *       .. TCQ_PLUG_BUFFER (epoch i+1) + *       ....TCQ_PLUG_RELEASE_ONE (epoch i) + *       ......TCQ_PLUG_BUFFER (epoch i+2) + *       ........ + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +/* + * State of the queue, when used for network output buffering: + * + *                 plug(i+1)            plug(i)          head + * ------------------+--------------------+----------------> + *                   |                    | + *                   |                    | + * pkts_current_epoch| pkts_last_epoch    |pkts_to_release + * ----------------->|<--------+--------->|+---------------> + *                   v                    v + * + */ + +struct plug_sched_data { +	/* If true, the dequeue function releases all packets +	 * from head to end of the queue. The queue turns into +	 * a pass-through queue for newly arriving packets. +	 */ +	bool unplug_indefinite; + +	/* Queue Limit in bytes */ +	u32 limit; + +	/* Number of packets (output) from the current speculatively +	 * executing epoch. +	 */ +	u32 pkts_current_epoch; + +	/* Number of packets corresponding to the recently finished +	 * epoch. These will be released when we receive a +	 * TCQ_PLUG_RELEASE_ONE command. This command is typically +	 * issued after committing a checkpoint at the target. +	 */ +	u32 pkts_last_epoch; + +	/* +	 * Number of packets from the head of the queue, that can +	 * be released (committed checkpoint). 
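The three counters in plug_sched_data implement the epoch bookkeeping described in the header comment: each TCQ_PLUG_BUFFER closes the epoch being counted, and a later TCQ_PLUG_RELEASE_ONE moves that closed epoch's packet count into the set the dequeue path is allowed to drain. A compact model of just that state machine, mirroring the plug_change() and plug_dequeue() code further down (names are invented for the sketch):

#include <stdint.h>

struct plug_model {
	uint32_t pkts_current_epoch;	/* enqueued since the last plug */
	uint32_t pkts_last_epoch;	/* closed epoch, awaiting its ack */
	uint32_t pkts_to_release;	/* allowed past the dequeue gate */
};

/* TCQ_PLUG_BUFFER: start a new epoch, remember the size of the old one. */
static void model_plug_buffer(struct plug_model *m)
{
	m->pkts_last_epoch = m->pkts_current_epoch;
	m->pkts_current_epoch = 0;
}

/* TCQ_PLUG_RELEASE_ONE: the checkpoint was acked, let its packets out. */
static void model_release_one(struct plug_model *m)
{
	m->pkts_to_release += m->pkts_last_epoch;
	m->pkts_last_epoch = 0;
}

/* Dequeue gate: one packet may pass per remaining release credit. */
static int model_may_dequeue(struct plug_model *m)
{
	if (!m->pkts_to_release)
		return 0;
	m->pkts_to_release--;
	return 1;
}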
+	 */ +	u32 pkts_to_release; +}; + +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct plug_sched_data *q = qdisc_priv(sch); + +	if (likely(sch->qstats.backlog + skb->len <= q->limit)) { +		if (!q->unplug_indefinite) +			q->pkts_current_epoch++; +		return qdisc_enqueue_tail(skb, sch); +	} + +	return qdisc_reshape_fail(skb, sch); +} + +static struct sk_buff *plug_dequeue(struct Qdisc *sch) +{ +	struct plug_sched_data *q = qdisc_priv(sch); + +	if (qdisc_is_throttled(sch)) +		return NULL; + +	if (!q->unplug_indefinite) { +		if (!q->pkts_to_release) { +			/* No more packets to dequeue. Block the queue +			 * and wait for the next release command. +			 */ +			qdisc_throttled(sch); +			return NULL; +		} +		q->pkts_to_release--; +	} + +	return qdisc_dequeue_head(sch); +} + +static int plug_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct plug_sched_data *q = qdisc_priv(sch); + +	q->pkts_current_epoch = 0; +	q->pkts_last_epoch = 0; +	q->pkts_to_release = 0; +	q->unplug_indefinite = false; + +	if (opt == NULL) { +		/* We will set a default limit of 100 pkts (~150kB) +		 * in case tx_queue_len is not available. The +		 * default value is completely arbitrary. +		 */ +		u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; +		q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); +	} else { +		struct tc_plug_qopt *ctl = nla_data(opt); + +		if (nla_len(opt) < sizeof(*ctl)) +			return -EINVAL; + +		q->limit = ctl->limit; +	} + +	qdisc_throttled(sch); +	return 0; +} + +/* Receives 4 types of messages: + * TCQ_PLUG_BUFFER: Inset a plug into the queue and + *  buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + *   to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + *   Stop buffering packets until the next TCQ_PLUG_BUFFER + *   command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ +static int plug_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct plug_sched_data *q = qdisc_priv(sch); +	struct tc_plug_qopt *msg; + +	if (opt == NULL) +		return -EINVAL; + +	msg = nla_data(opt); +	if (nla_len(opt) < sizeof(*msg)) +		return -EINVAL; + +	switch (msg->action) { +	case TCQ_PLUG_BUFFER: +		/* Save size of the current buffer */ +		q->pkts_last_epoch = q->pkts_current_epoch; +		q->pkts_current_epoch = 0; +		if (q->unplug_indefinite) +			qdisc_throttled(sch); +		q->unplug_indefinite = false; +		break; +	case TCQ_PLUG_RELEASE_ONE: +		/* Add packets from the last complete buffer to the +		 * packets to be released set. 
+		 */ +		q->pkts_to_release += q->pkts_last_epoch; +		q->pkts_last_epoch = 0; +		qdisc_unthrottled(sch); +		netif_schedule_queue(sch->dev_queue); +		break; +	case TCQ_PLUG_RELEASE_INDEFINITE: +		q->unplug_indefinite = true; +		q->pkts_to_release = 0; +		q->pkts_last_epoch = 0; +		q->pkts_current_epoch = 0; +		qdisc_unthrottled(sch); +		netif_schedule_queue(sch->dev_queue); +		break; +	case TCQ_PLUG_LIMIT: +		/* Limit is supplied in bytes */ +		q->limit = msg->limit; +		break; +	default: +		return -EINVAL; +	} + +	return 0; +} + +static struct Qdisc_ops plug_qdisc_ops __read_mostly = { +	.id          =       "plug", +	.priv_size   =       sizeof(struct plug_sched_data), +	.enqueue     =       plug_enqueue, +	.dequeue     =       plug_dequeue, +	.peek        =       qdisc_peek_head, +	.init        =       plug_init, +	.change      =       plug_change, +	.owner       =       THIS_MODULE, +}; + +static int __init plug_module_init(void) +{ +	return register_qdisc(&plug_qdisc_ops); +} + +static void __exit plug_module_exit(void) +{ +	unregister_qdisc(&plug_qdisc_ops); +} +module_init(plug_module_init) +module_exit(plug_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index b1c95bce33c..79359b69ad8 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -22,8 +22,7 @@  #include <net/pkt_sched.h> -struct prio_sched_data -{ +struct prio_sched_data {  	int bands;  	struct tcf_proto *filter_list;  	u8  prio2band[TC_PRIO_MAX+1]; @@ -54,7 +53,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)  		if (!q->filter_list || err < 0) {  			if (TC_H_MAJ(band))  				band = 0; -			return q->queues[q->prio2band[band&TC_PRIO_MAX]]; +			return q->queues[q->prio2band[band & TC_PRIO_MAX]];  		}  		band = res.classid;  	} @@ -84,8 +83,6 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	ret = qdisc_enqueue(skb, qdisc);  	if (ret == NET_XMIT_SUCCESS) { -		sch->bstats.bytes += qdisc_pkt_len(skb); -		sch->bstats.packets++;  		sch->q.qlen++;  		return NET_XMIT_SUCCESS;  	} @@ -108,15 +105,16 @@ static struct sk_buff *prio_peek(struct Qdisc *sch)  	return NULL;  } -static struct sk_buff *prio_dequeue(struct Qdisc* sch) +static struct sk_buff *prio_dequeue(struct Qdisc *sch)  {  	struct prio_sched_data *q = qdisc_priv(sch);  	int prio;  	for (prio = 0; prio < q->bands; prio++) {  		struct Qdisc *qdisc = q->queues[prio]; -		struct sk_buff *skb = qdisc->dequeue(qdisc); +		struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);  		if (skb) { +			qdisc_bstats_update(sch, skb);  			sch->q.qlen--;  			return skb;  		} @@ -125,7 +123,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)  } -static unsigned int prio_drop(struct Qdisc* sch) +static unsigned int prio_drop(struct Qdisc *sch)  {  	struct prio_sched_data *q = qdisc_priv(sch);  	int prio; @@ -144,24 +142,24 @@ static unsigned int prio_drop(struct Qdisc* sch)  static void -prio_reset(struct Qdisc* sch) +prio_reset(struct Qdisc *sch)  {  	int prio;  	struct prio_sched_data *q = qdisc_priv(sch); -	for (prio=0; prio<q->bands; prio++) +	for (prio = 0; prio < q->bands; prio++)  		qdisc_reset(q->queues[prio]);  	sch->q.qlen = 0;  }  static void -prio_destroy(struct Qdisc* sch) +prio_destroy(struct Qdisc *sch)  {  	int prio;  	struct prio_sched_data *q = qdisc_priv(sch);  	tcf_destroy_chain(&q->filter_list); -	for (prio=0; prio<q->bands; prio++) +	for (prio = 0; prio < q->bands; prio++)  		qdisc_destroy(q->queues[prio]);  } @@ -178,7 +176,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr 
*opt)  	if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)  		return -EINVAL; -	for (i=0; i<=TC_PRIO_MAX; i++) { +	for (i = 0; i <= TC_PRIO_MAX; i++) {  		if (qopt->priomap[i] >= qopt->bands)  			return -EINVAL;  	} @@ -187,7 +185,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)  	q->bands = qopt->bands;  	memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); -	for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { +	for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {  		struct Qdisc *child = q->queues[i];  		q->queues[i] = &noop_qdisc;  		if (child != &noop_qdisc) { @@ -197,9 +195,10 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)  	}  	sch_tree_unlock(sch); -	for (i=0; i<q->bands; i++) { +	for (i = 0; i < q->bands; i++) {  		if (q->queues[i] == &noop_qdisc) {  			struct Qdisc *child, *old; +  			child = qdisc_create_dflt(sch->dev_queue,  						  &pfifo_qdisc_ops,  						  TC_H_MAKE(sch->handle, i + 1)); @@ -225,7 +224,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)  	struct prio_sched_data *q = qdisc_priv(sch);  	int i; -	for (i=0; i<TCQ_PRIO_BANDS; i++) +	for (i = 0; i < TCQ_PRIO_BANDS; i++)  		q->queues[i] = &noop_qdisc;  	if (opt == NULL) { @@ -233,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)  	} else {  		int err; -		if ((err= prio_tune(sch, opt)) != 0) +		if ((err = prio_tune(sch, opt)) != 0)  			return err;  	}  	return 0; @@ -246,9 +245,10 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)  	struct tc_prio_qopt opt;  	opt.bands = q->bands; -	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); +	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); -	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len; @@ -343,7 +343,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)  			arg->count++;  			continue;  		} -		if (arg->fn(sch, prio+1, arg) < 0) { +		if (arg->fn(sch, prio + 1, arg) < 0) {  			arg->stop = 1;  			break;  		} @@ -351,7 +351,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)  	}  } -static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl)  {  	struct prio_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c new file mode 100644 index 00000000000..8056fb4e618 --- /dev/null +++ b/net/sched/sch_qfq.c @@ -0,0 +1,1582 @@ +/* + * net/sched/sch_qfq.c         Quick Fair Queueing Plus Scheduler. + * + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * Copyright (c) 2012 Paolo Valente. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + + +/*  Quick Fair Queueing Plus +    ======================== + +    Sources: + +    [1] Paolo Valente, +    "Reducing the Execution Time of Fair-Queueing Schedulers." +    http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf + +    Sources for QFQ: + +    [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient +    Packet Scheduling with Tight Bandwidth Distribution Guarantees." 
+ +    See also: +    http://retis.sssup.it/~fabio/linux/qfq/ + */ + +/* + +  QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES +  classes. Each aggregate is timestamped with a virtual start time S +  and a virtual finish time F, and scheduled according to its +  timestamps. S and F are computed as a function of a system virtual +  time function V. The classes within each aggregate are instead +  scheduled with DRR. + +  To speed up operations, QFQ+ divides also aggregates into a limited +  number of groups. Which group a class belongs to depends on the +  ratio between the maximum packet length for the class and the weight +  of the class. Groups have their own S and F. In the end, QFQ+ +  schedules groups, then aggregates within groups, then classes within +  aggregates. See [1] and [2] for a full description. + +  Virtual time computations. + +  S, F and V are all computed in fixed point arithmetic with +  FRAC_BITS decimal bits. + +  QFQ_MAX_INDEX is the maximum index allowed for a group. We need +	one bit per index. +  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. + +  The layout of the bits is as below: + +                   [ MTU_SHIFT ][      FRAC_BITS    ] +                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ] +				 ^.__grp->index = 0 +				 *.__grp->slot_shift + +  where MIN_SLOT_SHIFT is derived by difference from the others. + +  The max group index corresponds to Lmax/w_min, where +  Lmax=1<<MTU_SHIFT, w_min = 1 . +  From this, and knowing how many groups (MAX_INDEX) we want, +  we can derive the shift corresponding to each group. + +  Because we often need to compute +	F = S + len/w_i  and V = V + len/wsum +  instead of storing w_i store the value +	inv_w = (1<<FRAC_BITS)/w_i +  so we can do F = S + len * inv_w * wsum. +  We use W_TOT in the formulas so we can easily move between +  static and adaptive weight sum. + +  The per-scheduler-instance data contain all the data structures +  for the scheduler: bitmaps and bucket lists. + + */ + +/* + * Maximum number of consecutive slots occupied by backlogged classes + * inside a group. + */ +#define QFQ_MAX_SLOTS	32 + +/* + * Shifts used for aggregate<->group mapping.  We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the + * group with the smallest index that can support the L_i / r_i configured + * for the classes in the aggregate. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + */ +#define QFQ_MAX_INDEX		24 +#define QFQ_MAX_WSHIFT		10 + +#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ +#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT) + +#define FRAC_BITS		30	/* fixed point arithmetic */ +#define ONE_FP			(1UL << FRAC_BITS) + +#define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */ +#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */ + +#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */ + +/* + * Possible group states.  These values are used as indexes for the bitmaps + * array of struct qfq_queue. + */ +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; + +struct qfq_group; + +struct qfq_aggregate; + +struct qfq_class { +	struct Qdisc_class_common common; + +	unsigned int refcnt; +	unsigned int filter_cnt; + +	struct gnet_stats_basic_packed bstats; +	struct gnet_stats_queue qstats; +	struct gnet_stats_rate_est64 rate_est; +	struct Qdisc *qdisc; +	struct list_head alist;		/* Link for active-classes list. 
*/ +	struct qfq_aggregate *agg;	/* Parent aggregate. */ +	int deficit;			/* DRR deficit counter. */ +}; + +struct qfq_aggregate { +	struct hlist_node next;	/* Link for the slot list. */ +	u64 S, F;		/* flow timestamps (exact) */ + +	/* group we belong to. In principle we would need the index, +	 * which is log_2(lmax/weight), but we never reference it +	 * directly, only the group. +	 */ +	struct qfq_group *grp; + +	/* these are copied from the flowset. */ +	u32	class_weight; /* Weight of each class in this aggregate. */ +	/* Max pkt size for the classes in this aggregate, DRR quantum. */ +	int	lmax; + +	u32	inv_w;	    /* ONE_FP/(sum of weights of classes in aggr.). */ +	u32	budgetmax;  /* Max budget for this aggregate. */ +	u32	initial_budget, budget;     /* Initial and current budget. */ + +	int		  num_classes;	/* Number of classes in this aggr. */ +	struct list_head  active;	/* DRR queue of active classes. */ + +	struct hlist_node nonfull_next;	/* See nonfull_aggs in qfq_sched. */ +}; + +struct qfq_group { +	u64 S, F;			/* group timestamps (approx). */ +	unsigned int slot_shift;	/* Slot shift. */ +	unsigned int index;		/* Group index. */ +	unsigned int front;		/* Index of the front slot. */ +	unsigned long full_slots;	/* non-empty slots */ + +	/* Array of RR lists of active aggregates. */ +	struct hlist_head slots[QFQ_MAX_SLOTS]; +}; + +struct qfq_sched { +	struct tcf_proto *filter_list; +	struct Qdisc_class_hash clhash; + +	u64			oldV, V;	/* Precise virtual times. */ +	struct qfq_aggregate	*in_serv_agg;   /* Aggregate being served. */ +	u32			num_active_agg; /* Num. of active aggregates */ +	u32			wsum;		/* weight sum */ +	u32			iwsum;		/* inverse weight sum */ + +	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */ +	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ +	u32 min_slot_shift;	/* Index of the group-0 bit in the bitmaps. */ + +	u32 max_agg_classes;		/* Max number of classes per aggr. */ +	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ +}; + +/* + * Possible reasons why the timestamps of an aggregate are updated + * enqueue: the aggregate switches from idle to active and must scheduled + *	    for service + * requeue: the aggregate finishes its budget, so it stops being served and + *	    must be rescheduled for service + */ +enum update_reason {enqueue, requeue}; + +static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct Qdisc_class_common *clc; + +	clc = qdisc_class_find(&q->clhash, classid); +	if (clc == NULL) +		return NULL; +	return container_of(clc, struct qfq_class, common); +} + +static void qfq_purge_queue(struct qfq_class *cl) +{ +	unsigned int len = cl->qdisc->q.qlen; + +	qdisc_reset(cl->qdisc); +	qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { +	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, +	[TCA_QFQ_LMAX] = { .type = NLA_U32 }, +}; + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. 
+ */ +static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) +{ +	u64 slot_size = (u64)maxlen * inv_w; +	unsigned long size_map; +	int index = 0; + +	size_map = slot_size >> min_slot_shift; +	if (!size_map) +		goto out; + +	index = __fls(size_map) + 1;	/* basically a log_2 */ +	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); + +	if (index < 0) +		index = 0; +out: +	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n", +		 (unsigned long) ONE_FP/inv_w, maxlen, index); + +	return index; +} + +static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); +static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, +			     enum update_reason); + +static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, +			 u32 lmax, u32 weight) +{ +	INIT_LIST_HEAD(&agg->active); +	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + +	agg->lmax = lmax; +	agg->class_weight = weight; +} + +static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, +					  u32 lmax, u32 weight) +{ +	struct qfq_aggregate *agg; + +	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) +		if (agg->lmax == lmax && agg->class_weight == weight) +			return agg; + +	return NULL; +} + + +/* Update aggregate as a function of the new number of classes. */ +static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, +			   int new_num_classes) +{ +	u32 new_agg_weight; + +	if (new_num_classes == q->max_agg_classes) +		hlist_del_init(&agg->nonfull_next); + +	if (agg->num_classes > new_num_classes && +	    new_num_classes == q->max_agg_classes - 1) /* agg no more full */ +		hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + +	/* The next assignment may let +	 * agg->initial_budget > agg->budgetmax +	 * hold, we will take it into account in charge_actual_service(). +	 */ +	agg->budgetmax = new_num_classes * agg->lmax; +	new_agg_weight = agg->class_weight * new_num_classes; +	agg->inv_w = ONE_FP/new_agg_weight; + +	if (agg->grp == NULL) { +		int i = qfq_calc_index(agg->inv_w, agg->budgetmax, +				       q->min_slot_shift); +		agg->grp = &q->groups[i]; +	} + +	q->wsum += +		(int) agg->class_weight * (new_num_classes - agg->num_classes); +	q->iwsum = ONE_FP / q->wsum; + +	agg->num_classes = new_num_classes; +} + +/* Add class to aggregate. */ +static void qfq_add_to_agg(struct qfq_sched *q, +			   struct qfq_aggregate *agg, +			   struct qfq_class *cl) +{ +	cl->agg = agg; + +	qfq_update_agg(q, agg, agg->num_classes+1); +	if (cl->qdisc->q.qlen > 0) { /* adding an active class */ +		list_add_tail(&cl->alist, &agg->active); +		if (list_first_entry(&agg->active, struct qfq_class, alist) == +		    cl && q->in_serv_agg != agg) /* agg was inactive */ +			qfq_activate_agg(q, agg, enqueue); /* schedule agg */ +	} +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); + +static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ +	if (!hlist_unhashed(&agg->nonfull_next)) +		hlist_del_init(&agg->nonfull_next); +	q->wsum -= agg->class_weight; +	if (q->wsum != 0) +		q->iwsum = ONE_FP / q->wsum; + +	if (q->in_serv_agg == agg) +		q->in_serv_agg = qfq_choose_next_agg(q); +	kfree(agg); +} + +/* Deschedule class from within its parent aggregate. 
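qfq_calc_index() above turns a class's (weight, max packet length) pair into a group index, essentially log2(maxlen/weight) after the FRAC_BITS fixed-point scaling. The same computation in standalone form, with __builtin_clzll standing in for the kernel's __fls() and an illustrative min_slot_shift (the qdisc derives its own value from FRAC_BITS, QFQ_MTU_SHIFT and QFQ_MAX_INDEX):

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS	30
#define ONE_FP		(1ULL << FRAC_BITS)

static int fls64_sketch(uint64_t x)
{
	return 63 - __builtin_clzll(x);		/* x must be non-zero */
}

static int calc_index(uint32_t inv_w, unsigned int maxlen,
		      uint32_t min_slot_shift)
{
	uint64_t slot_size = (uint64_t)maxlen * inv_w;
	uint64_t size_map = slot_size >> min_slot_shift;
	int index;

	if (!size_map)
		return 0;

	index = fls64_sketch(size_map) + 1;	/* basically a log_2 */
	/* exact powers of two fit one group lower */
	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
	return index < 0 ? 0 : index;
}

int main(void)
{
	uint32_t inv_w = (uint32_t)(ONE_FP / 10);	/* class weight 10 */

	printf("group index = %d\n", calc_index(inv_w, 1514, 22));
	return 0;
}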
*/ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ +	struct qfq_aggregate *agg = cl->agg; + + +	list_del(&cl->alist); /* remove from RR queue of the aggregate */ +	if (list_empty(&agg->active)) /* agg is now inactive */ +		qfq_deactivate_agg(q, agg); +} + +/* Remove class from its parent aggregate. */ +static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ +	struct qfq_aggregate *agg = cl->agg; + +	cl->agg = NULL; +	if (agg->num_classes == 1) { /* agg being emptied, destroy it */ +		qfq_destroy_agg(q, agg); +		return; +	} +	qfq_update_agg(q, agg, agg->num_classes-1); +} + +/* Deschedule class and remove it from its parent aggregate. */ +static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ +	if (cl->qdisc->q.qlen > 0) /* class is active */ +		qfq_deactivate_class(q, cl); + +	qfq_rm_from_agg(q, cl); +} + +/* Move class to a new aggregate, matching the new class weight and/or lmax */ +static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, +			   u32 lmax) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + +	if (new_agg == NULL) { /* create new aggregate */ +		new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); +		if (new_agg == NULL) +			return -ENOBUFS; +		qfq_init_agg(q, new_agg, lmax, weight); +	} +	qfq_deact_rm_from_agg(q, cl); +	qfq_add_to_agg(q, new_agg, cl); + +	return 0; +} + +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, +			    struct nlattr **tca, unsigned long *arg) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl = (struct qfq_class *)*arg; +	bool existing = false; +	struct nlattr *tb[TCA_QFQ_MAX + 1]; +	struct qfq_aggregate *new_agg = NULL; +	u32 weight, lmax, inv_w; +	int err; +	int delta_w; + +	if (tca[TCA_OPTIONS] == NULL) { +		pr_notice("qfq: no options\n"); +		return -EINVAL; +	} + +	err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy); +	if (err < 0) +		return err; + +	if (tb[TCA_QFQ_WEIGHT]) { +		weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); +		if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) { +			pr_notice("qfq: invalid weight %u\n", weight); +			return -EINVAL; +		} +	} else +		weight = 1; + +	if (tb[TCA_QFQ_LMAX]) { +		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); +		if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { +			pr_notice("qfq: invalid max length %u\n", lmax); +			return -EINVAL; +		} +	} else +		lmax = psched_mtu(qdisc_dev(sch)); + +	inv_w = ONE_FP / weight; +	weight = ONE_FP / inv_w; + +	if (cl != NULL && +	    lmax == cl->agg->lmax && +	    weight == cl->agg->class_weight) +		return 0; /* nothing to change */ + +	delta_w = weight - (cl ? 
cl->agg->class_weight : 0); + +	if (q->wsum + delta_w > QFQ_MAX_WSUM) { +		pr_notice("qfq: total weight out of range (%d + %u)\n", +			  delta_w, q->wsum); +		return -EINVAL; +	} + +	if (cl != NULL) { /* modify existing class */ +		if (tca[TCA_RATE]) { +			err = gen_replace_estimator(&cl->bstats, &cl->rate_est, +						    qdisc_root_sleeping_lock(sch), +						    tca[TCA_RATE]); +			if (err) +				return err; +		} +		existing = true; +		goto set_change_agg; +	} + +	/* create and init new class */ +	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); +	if (cl == NULL) +		return -ENOBUFS; + +	cl->refcnt = 1; +	cl->common.classid = classid; +	cl->deficit = lmax; + +	cl->qdisc = qdisc_create_dflt(sch->dev_queue, +				      &pfifo_qdisc_ops, classid); +	if (cl->qdisc == NULL) +		cl->qdisc = &noop_qdisc; + +	if (tca[TCA_RATE]) { +		err = gen_new_estimator(&cl->bstats, &cl->rate_est, +					qdisc_root_sleeping_lock(sch), +					tca[TCA_RATE]); +		if (err) +			goto destroy_class; +	} + +	sch_tree_lock(sch); +	qdisc_class_hash_insert(&q->clhash, &cl->common); +	sch_tree_unlock(sch); + +	qdisc_class_hash_grow(sch, &q->clhash); + +set_change_agg: +	sch_tree_lock(sch); +	new_agg = qfq_find_agg(q, lmax, weight); +	if (new_agg == NULL) { /* create new aggregate */ +		sch_tree_unlock(sch); +		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); +		if (new_agg == NULL) { +			err = -ENOBUFS; +			gen_kill_estimator(&cl->bstats, &cl->rate_est); +			goto destroy_class; +		} +		sch_tree_lock(sch); +		qfq_init_agg(q, new_agg, lmax, weight); +	} +	if (existing) +		qfq_deact_rm_from_agg(q, cl); +	qfq_add_to_agg(q, new_agg, cl); +	sch_tree_unlock(sch); + +	*arg = (unsigned long)cl; +	return 0; + +destroy_class: +	qdisc_destroy(cl->qdisc); +	kfree(cl); +	return err; +} + +static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) +{ +	struct qfq_sched *q = qdisc_priv(sch); + +	qfq_rm_from_agg(q, cl); +	gen_kill_estimator(&cl->bstats, &cl->rate_est); +	qdisc_destroy(cl->qdisc); +	kfree(cl); +} + +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl = (struct qfq_class *)arg; + +	if (cl->filter_cnt > 0) +		return -EBUSY; + +	sch_tree_lock(sch); + +	qfq_purge_queue(cl); +	qdisc_class_hash_remove(&q->clhash, &cl->common); + +	BUG_ON(--cl->refcnt == 0); +	/* +	 * This shouldn't happen: we "hold" one cops->get() when called +	 * from tc_ctl_tclass; the destroy method is done from cops->put(). 
+	 */ + +	sch_tree_unlock(sch); +	return 0; +} + +static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid) +{ +	struct qfq_class *cl = qfq_find_class(sch, classid); + +	if (cl != NULL) +		cl->refcnt++; + +	return (unsigned long)cl; +} + +static void qfq_put_class(struct Qdisc *sch, unsigned long arg) +{ +	struct qfq_class *cl = (struct qfq_class *)arg; + +	if (--cl->refcnt == 0) +		qfq_destroy_class(sch, cl); +} + +static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ +	struct qfq_sched *q = qdisc_priv(sch); + +	if (cl) +		return NULL; + +	return &q->filter_list; +} + +static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, +				  u32 classid) +{ +	struct qfq_class *cl = qfq_find_class(sch, classid); + +	if (cl != NULL) +		cl->filter_cnt++; + +	return (unsigned long)cl; +} + +static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ +	struct qfq_class *cl = (struct qfq_class *)arg; + +	cl->filter_cnt--; +} + +static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, +			   struct Qdisc *new, struct Qdisc **old) +{ +	struct qfq_class *cl = (struct qfq_class *)arg; + +	if (new == NULL) { +		new = qdisc_create_dflt(sch->dev_queue, +					&pfifo_qdisc_ops, cl->common.classid); +		if (new == NULL) +			new = &noop_qdisc; +	} + +	sch_tree_lock(sch); +	qfq_purge_queue(cl); +	*old = cl->qdisc; +	cl->qdisc = new; +	sch_tree_unlock(sch); +	return 0; +} + +static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg) +{ +	struct qfq_class *cl = (struct qfq_class *)arg; + +	return cl->qdisc; +} + +static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, +			  struct sk_buff *skb, struct tcmsg *tcm) +{ +	struct qfq_class *cl = (struct qfq_class *)arg; +	struct nlattr *nest; + +	tcm->tcm_parent	= TC_H_ROOT; +	tcm->tcm_handle	= cl->common.classid; +	tcm->tcm_info	= cl->qdisc->handle; + +	nest = nla_nest_start(skb, TCA_OPTIONS); +	if (nest == NULL) +		goto nla_put_failure; +	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || +	    nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) +		goto nla_put_failure; +	return nla_nest_end(skb, nest); + +nla_put_failure: +	nla_nest_cancel(skb, nest); +	return -EMSGSIZE; +} + +static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, +				struct gnet_dump *d) +{ +	struct qfq_class *cl = (struct qfq_class *)arg; +	struct tc_qfq_stats xstats; + +	memset(&xstats, 0, sizeof(xstats)); +	cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + +	xstats.weight = cl->agg->class_weight; +	xstats.lmax = cl->agg->lmax; + +	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || +	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || +	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) +		return -1; + +	return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl; +	unsigned int i; + +	if (arg->stop) +		return; + +	for (i = 0; i < q->clhash.hashsize; i++) { +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { +			if (arg->count < arg->skip) { +				arg->count++; +				continue; +			} +			if (arg->fn(sch, (unsigned long)cl, arg) < 0) { +				arg->stop = 1; +				return; +			} +			arg->count++; +		} +	} +} + +static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, +				      int *qerr) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl; +	struct tcf_result res; +	int result; + +	if 
(TC_H_MAJ(skb->priority ^ sch->handle) == 0) { +		pr_debug("qfq_classify: found %d\n", skb->priority); +		cl = qfq_find_class(sch, skb->priority); +		if (cl != NULL) +			return cl; +	} + +	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; +	result = tc_classify(skb, q->filter_list, &res); +	if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT +		switch (result) { +		case TC_ACT_QUEUED: +		case TC_ACT_STOLEN: +			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; +		case TC_ACT_SHOT: +			return NULL; +		} +#endif +		cl = (struct qfq_class *)res.class; +		if (cl == NULL) +			cl = qfq_find_class(sch, res.classid); +		return cl; +	} + +	return NULL; +} + +/* Generic comparison function, handling wraparound. */ +static inline int qfq_gt(u64 a, u64 b) +{ +	return (s64)(a - b) > 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline u64 qfq_round_down(u64 ts, unsigned int shift) +{ +	return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, +					unsigned long bitmap) +{ +	int index = __ffs(bitmap); +	return &q->groups[index]; +} +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long mask_from(unsigned long bitmap, int from) +{ +	return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp) +{ +	/* if S > V we are not eligible */ +	unsigned int state = qfq_gt(grp->S, q->V); +	unsigned long mask = mask_from(q->bitmaps[ER], grp->index); +	struct qfq_group *next; + +	if (mask) { +		next = qfq_ffs(q, mask); +		if (qfq_gt(grp->F, next->F)) +			state |= EB; +	} + +	return state; +} + + +/* + * In principle + *	q->bitmaps[dst] |= q->bitmaps[src] & mask; + *	q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, +				   int src, int dst) +{ +	q->bitmaps[dst] |= q->bitmaps[src] & mask; +	q->bitmaps[src] &= ~mask; +} + +static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) +{ +	unsigned long mask = mask_from(q->bitmaps[ER], index + 1); +	struct qfq_group *next; + +	if (mask) { +		next = qfq_ffs(q, mask); +		if (!qfq_gt(next->F, old_F)) +			return; +	} + +	mask = (1UL << index) - 1; +	qfq_move_groups(q, mask, EB, ER); +	qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * +	old_V ^= q->V; +	old_V >>= q->min_slot_shift; +	if (old_V) { +		... +	} + * + */ +static void qfq_make_eligible(struct qfq_sched *q) +{ +	unsigned long vslot = q->V >> q->min_slot_shift; +	unsigned long old_vslot = q->oldV >> q->min_slot_shift; + +	if (vslot != old_vslot) { +		unsigned long mask; +		int last_flip_pos = fls(vslot ^ old_vslot); + +		if (last_flip_pos > 31) /* higher than the number of groups */ +			mask = ~0UL;    /* make all groups eligible */ +		else +			mask = (1UL << last_flip_pos) - 1; + +		qfq_move_groups(q, mask, IR, ER); +		qfq_move_groups(q, mask, IB, EB); +	} +} + +/* + * The index of the slot in which the input aggregate agg is to be + * inserted must not be higher than QFQ_MAX_SLOTS-2. 
There is a '-2' + * and not a '-1' because the start time of the group may be moved + * backward by one slot after the aggregate has been inserted, and + * this would cause non-empty slots to be right-shifted by one + * position. + * + * QFQ+ fully satisfies this bound to the slot index if the parameters + * of the classes are not changed dynamically, and if QFQ+ never + * happens to postpone the service of agg unjustly, i.e., it never + * happens that the aggregate becomes backlogged and eligible, or just + * eligible, while an aggregate with a higher approximated finish time + * is being served. In particular, in this case QFQ+ guarantees that + * the timestamps of agg are low enough that the slot index is never + * higher than 2. Unfortunately, QFQ+ cannot provide the same + * guarantee if it happens to unjustly postpone the service of agg, or + * if the parameters of some class are changed. + * + * As for the first event, i.e., an out-of-order service, the + * upper bound to the slot index guaranteed by QFQ+ grows to + * 2 + + * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * + * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1. + * + * The following function deals with this problem by backward-shifting + * the timestamps of agg, if needed, so as to guarantee that the slot + * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may + * cause the service of other aggregates to be postponed, yet the + * worst-case guarantees of these aggregates are not violated.  In + * fact, in case of no out-of-order service, the timestamps of agg + * would have been even lower than they are after the backward shift, + * because QFQ+ would have guaranteed a maximum value equal to 2 for + * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose + * service is postponed because of the backward-shift would have + * however waited for the service of agg before being served. + * + * The other event that may cause the slot index to be higher than 2 + * for agg is a recent change of the parameters of some class. If the + * weight of a class is increased or the lmax (max_pkt_size) of the + * class is decreased, then a new aggregate with smaller slot size + * than the original parent aggregate of the class may happen to be + * activated. The activation of this aggregate should be properly + * delayed to when the service of the class has finished in the ideal + * system tracked by QFQ+. If the activation of the aggregate is not + * delayed to this reference time instant, then this aggregate may be + * unjustly served before other aggregates waiting for service. This + * may cause the above bound to the slot index to be violated for some + * of these unlucky aggregates. + * + * Instead of delaying the activation of the new aggregate, which is + * quite complex, the above-discussed capping of the slot index is + * used to handle also the consequences of a change of the parameters + * of a class. 
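
Most of the timestamp handling in the insertion code below rests on the qfq_gt() and qfq_round_down() helpers defined earlier. A standalone check of their behaviour; only the two helpers mirror the patch, while the u64/s64 typedefs and the test values are editorial stand-ins.

/* Illustration only -- wraparound-safe comparison and slot rounding. */
#include <assert.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t  s64;

/* "a later than b", correct even after the 64-bit counter wraps. */
static int qfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}

/* Round a timestamp down to a multiple of 2^shift (the slot size). */
static u64 qfq_round_down(u64 ts, unsigned int shift)
{
	return ts & ~((1ULL << shift) - 1);
}

int main(void)
{
	/* Plain case: 1000 is later than 10. */
	assert(qfq_gt(1000, 10));

	/* Wraparound: a timestamp just past 0 is still "later" than one
	 * taken just before the counter wrapped. */
	assert(qfq_gt(5, UINT64_MAX - 5));

	/* Rounding to 1024-unit slots (shift = 10). */
	assert(qfq_round_down(5000, 10) == 4096);
	return 0;
}
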
+ */ +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, +			    u64 roundedS) +{ +	u64 slot = (roundedS - grp->S) >> grp->slot_shift; +	unsigned int i; /* slot index in the bucket list */ + +	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { +		u64 deltaS = roundedS - grp->S - +			((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); +		agg->S -= deltaS; +		agg->F -= deltaS; +		slot = QFQ_MAX_SLOTS - 2; +	} + +	i = (grp->front + slot) % QFQ_MAX_SLOTS; + +	hlist_add_head(&agg->next, &grp->slots[i]); +	__set_bit(slot, &grp->full_slots); +} + +/* Maybe introduce hlist_first_entry?? */ +static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) +{ +	return hlist_entry(grp->slots[grp->front].first, +			   struct qfq_aggregate, next); +} + +/* + * remove the entry from the slot + */ +static void qfq_front_slot_remove(struct qfq_group *grp) +{ +	struct qfq_aggregate *agg = qfq_slot_head(grp); + +	BUG_ON(!agg); +	hlist_del(&agg->next); +	if (hlist_empty(&grp->slots[grp->front])) +		__clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first aggregate in the first non-empty bucket of the + * group. As a side effect, adjusts the bucket list so the first + * non-empty bucket is at position 0 in full_slots. + */ +static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) +{ +	unsigned int i; + +	pr_debug("qfq slot_scan: grp %u full %#lx\n", +		 grp->index, grp->full_slots); + +	if (grp->full_slots == 0) +		return NULL; + +	i = __ffs(grp->full_slots);  /* zero based */ +	if (i > 0) { +		grp->front = (grp->front + i) % QFQ_MAX_SLOTS; +		grp->full_slots >>= i; +	} + +	return qfq_slot_head(grp); +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) +{ +	unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + +	grp->full_slots <<= i; +	grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + +static void qfq_update_eligible(struct qfq_sched *q) +{ +	struct qfq_group *grp; +	unsigned long ineligible; + +	ineligible = q->bitmaps[IR] | q->bitmaps[IB]; +	if (ineligible) { +		if (!q->bitmaps[ER]) { +			grp = qfq_ffs(q, ineligible); +			if (qfq_gt(grp->S, q->V)) +				q->V = grp->S; +		} +		qfq_make_eligible(q); +	} +} + +/* Dequeue head packet of the head class in the DRR queue of the aggregate. 
*/ +static void agg_dequeue(struct qfq_aggregate *agg, +			struct qfq_class *cl, unsigned int len) +{ +	qdisc_dequeue_peeked(cl->qdisc); + +	cl->deficit -= (int) len; + +	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ +		list_del(&cl->alist); +	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { +		cl->deficit += agg->lmax; +		list_move_tail(&cl->alist, &agg->active); +	} +} + +static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, +					   struct qfq_class **cl, +					   unsigned int *len) +{ +	struct sk_buff *skb; + +	*cl = list_first_entry(&agg->active, struct qfq_class, alist); +	skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); +	if (skb == NULL) +		WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); +	else +		*len = qdisc_pkt_len(skb); + +	return skb; +} + +/* Update F according to the actual service received by the aggregate. */ +static inline void charge_actual_service(struct qfq_aggregate *agg) +{ +	/* Compute the service received by the aggregate, taking into +	 * account that, after decreasing the number of classes in +	 * agg, it may happen that +	 * agg->initial_budget - agg->budget > agg->bugdetmax +	 */ +	u32 service_received = min(agg->budgetmax, +				   agg->initial_budget - agg->budget); + +	agg->F = agg->S + (u64)service_received * agg->inv_w; +} + +/* Assign a reasonable start time for a new aggregate in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) +{ +	unsigned long mask; +	u64 limit, roundedF; +	int slot_shift = agg->grp->slot_shift; + +	roundedF = qfq_round_down(agg->F, slot_shift); +	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); + +	if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { +		/* timestamp was stale */ +		mask = mask_from(q->bitmaps[ER], agg->grp->index); +		if (mask) { +			struct qfq_group *next = qfq_ffs(q, mask); +			if (qfq_gt(roundedF, next->F)) { +				if (qfq_gt(limit, next->F)) +					agg->S = next->F; +				else /* preserve timestamp correctness */ +					agg->S = limit; +				return; +			} +		} +		agg->S = q->V; +	} else  /* timestamp is not stale */ +		agg->S = agg->F; +} + +/* Update the timestamps of agg before scheduling/rescheduling it for + * service.  In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. 
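
Both finish-time updates above take the form F = S + len * inv_w, so virtual time advances by len/weight. A small numeric sketch; the finish_time() helper, the FRAC_BITS value of 30 and the ONE_FP/weight form of inv_w are assumptions made for illustration, since inv_w is assigned outside this hunk.

/* Illustration only -- weighted virtual finish times, F = S + len * inv_w. */
#include <assert.h>
#include <stdint.h>

#define FRAC_BITS 30                        /* assumed value, for illustration */
#define ONE_FP    (1ULL << FRAC_BITS)

static uint64_t finish_time(uint64_t S, uint32_t len, uint32_t weight)
{
	uint64_t inv_w = ONE_FP / weight;   /* fixed-point 1/weight */

	return S + (uint64_t)len * inv_w;
}

int main(void)
{
	/* Two aggregates start at the same virtual time and each charge a
	 * 1000-byte budget; the weight-4 one gets the earlier finish time,
	 * so it is scheduled proportionally more often. */
	uint64_t f1 = finish_time(0, 1000, 1);
	uint64_t f4 = finish_time(0, 1000, 4);

	assert(f4 < f1);
	assert(f1 / f4 == 4);    /* 1000 * ONE_FP vs 250 * ONE_FP */
	return 0;
}
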
+ */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, +		    struct qfq_aggregate *agg, enum update_reason reason) +{ +	if (reason != requeue) +		qfq_update_start(q, agg); +	else /* just charge agg for the service received */ +		agg->S = agg->F; + +	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} + +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); + +static struct sk_buff *qfq_dequeue(struct Qdisc *sch) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_aggregate *in_serv_agg = q->in_serv_agg; +	struct qfq_class *cl; +	struct sk_buff *skb = NULL; +	/* next-packet len, 0 means no more active classes in in-service agg */ +	unsigned int len = 0; + +	if (in_serv_agg == NULL) +		return NULL; + +	if (!list_empty(&in_serv_agg->active)) +		skb = qfq_peek_skb(in_serv_agg, &cl, &len); + +	/* +	 * If there are no active classes in the in-service aggregate, +	 * or if the aggregate has not enough budget to serve its next +	 * class, then choose the next aggregate to serve. +	 */ +	if (len == 0 || in_serv_agg->budget < len) { +		charge_actual_service(in_serv_agg); + +		/* recharge the budget of the aggregate */ +		in_serv_agg->initial_budget = in_serv_agg->budget = +			in_serv_agg->budgetmax; + +		if (!list_empty(&in_serv_agg->active)) { +			/* +			 * Still active: reschedule for +			 * service. Possible optimization: if no other +			 * aggregate is active, then there is no point +			 * in rescheduling this aggregate, and we can +			 * just keep it as the in-service one. This +			 * should be however a corner case, and to +			 * handle it, we would need to maintain an +			 * extra num_active_aggs field. +			*/ +			qfq_update_agg_ts(q, in_serv_agg, requeue); +			qfq_schedule_agg(q, in_serv_agg); +		} else if (sch->q.qlen == 0) { /* no aggregate to serve */ +			q->in_serv_agg = NULL; +			return NULL; +		} + +		/* +		 * If we get here, there are other aggregates queued: +		 * choose the new aggregate to serve. +		 */ +		in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); +		skb = qfq_peek_skb(in_serv_agg, &cl, &len); +	} +	if (!skb) +		return NULL; + +	sch->q.qlen--; +	qdisc_bstats_update(sch, skb); + +	agg_dequeue(in_serv_agg, cl, len); +	/* If lmax is lowered, through qfq_change_class, for a class +	 * owning pending packets with larger size than the new value +	 * of lmax, then the following condition may hold. 
+	 */ +	if (unlikely(in_serv_agg->budget < len)) +		in_serv_agg->budget = 0; +	else +		in_serv_agg->budget -= len; + +	q->V += (u64)len * q->iwsum; +	pr_debug("qfq dequeue: len %u F %lld now %lld\n", +		 len, (unsigned long long) in_serv_agg->F, +		 (unsigned long long) q->V); + +	return skb; +} + +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) +{ +	struct qfq_group *grp; +	struct qfq_aggregate *agg, *new_front_agg; +	u64 old_F; + +	qfq_update_eligible(q); +	q->oldV = q->V; + +	if (!q->bitmaps[ER]) +		return NULL; + +	grp = qfq_ffs(q, q->bitmaps[ER]); +	old_F = grp->F; + +	agg = qfq_slot_head(grp); + +	/* agg starts to be served, remove it from schedule */ +	qfq_front_slot_remove(grp); + +	new_front_agg = qfq_slot_scan(grp); + +	if (new_front_agg == NULL) /* group is now inactive, remove from ER */ +		__clear_bit(grp->index, &q->bitmaps[ER]); +	else { +		u64 roundedS = qfq_round_down(new_front_agg->S, +					      grp->slot_shift); +		unsigned int s; + +		if (grp->S == roundedS) +			return agg; +		grp->S = roundedS; +		grp->F = roundedS + (2ULL << grp->slot_shift); +		__clear_bit(grp->index, &q->bitmaps[ER]); +		s = qfq_calc_state(q, grp); +		__set_bit(grp->index, &q->bitmaps[s]); +	} + +	qfq_unblock_groups(q, grp->index, old_F); + +	return agg; +} + +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl; +	struct qfq_aggregate *agg; +	int err = 0; + +	cl = qfq_classify(skb, sch, &err); +	if (cl == NULL) { +		if (err & __NET_XMIT_BYPASS) +			sch->qstats.drops++; +		kfree_skb(skb); +		return err; +	} +	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); + +	if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { +		pr_debug("qfq: increasing maxpkt from %u to %u for class %u", +			 cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); +		err = qfq_change_agg(sch, cl, cl->agg->class_weight, +				     qdisc_pkt_len(skb)); +		if (err) +			return err; +	} + +	err = qdisc_enqueue(skb, cl->qdisc); +	if (unlikely(err != NET_XMIT_SUCCESS)) { +		pr_debug("qfq_enqueue: enqueue failed %d\n", err); +		if (net_xmit_drop_count(err)) { +			cl->qstats.drops++; +			sch->qstats.drops++; +		} +		return err; +	} + +	bstats_update(&cl->bstats, skb); +	++sch->q.qlen; + +	agg = cl->agg; +	/* if the queue was not empty, then done here */ +	if (cl->qdisc->q.qlen != 1) { +		if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && +		    list_first_entry(&agg->active, struct qfq_class, alist) +		    == cl && cl->deficit < qdisc_pkt_len(skb)) +			list_move_tail(&cl->alist, &agg->active); + +		return err; +	} + +	/* schedule class for service within the aggregate */ +	cl->deficit = agg->lmax; +	list_add_tail(&cl->alist, &agg->active); + +	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || +	    q->in_serv_agg == agg) +		return err; /* non-empty or in service, nothing else to do */ + +	qfq_activate_agg(q, agg, enqueue); + +	return err; +} + +/* + * Schedule aggregate according to its timestamps. + */ +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ +	struct qfq_group *grp = agg->grp; +	u64 roundedS; +	int s; + +	roundedS = qfq_round_down(agg->S, grp->slot_shift); + +	/* +	 * Insert agg in the correct bucket. +	 * If agg->S >= grp->S we don't need to adjust the +	 * bucket list and simply go to the insertion phase. +	 * Otherwise grp->S is decreasing, we must make room +	 * in the bucket list, and also recompute the group state. 
+	 * Finally, if there were no flows in this group and nobody +	 * was in ER make sure to adjust V. +	 */ +	if (grp->full_slots) { +		if (!qfq_gt(grp->S, agg->S)) +			goto skip_update; + +		/* create a slot for this agg->S */ +		qfq_slot_rotate(grp, roundedS); +		/* group was surely ineligible, remove */ +		__clear_bit(grp->index, &q->bitmaps[IR]); +		__clear_bit(grp->index, &q->bitmaps[IB]); +	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && +		   q->in_serv_agg == NULL) +		q->V = roundedS; + +	grp->S = roundedS; +	grp->F = roundedS + (2ULL << grp->slot_shift); +	s = qfq_calc_state(q, grp); +	__set_bit(grp->index, &q->bitmaps[s]); + +	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", +		 s, q->bitmaps[s], +		 (unsigned long long) agg->S, +		 (unsigned long long) agg->F, +		 (unsigned long long) q->V); + +skip_update: +	qfq_slot_insert(grp, agg, roundedS); +} + + +/* Update agg ts and schedule agg for service */ +static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, +			     enum update_reason reason) +{ +	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ + +	qfq_update_agg_ts(q, agg, reason); +	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ +		q->in_serv_agg = agg; /* start serving this aggregate */ +		 /* update V: to be in service, agg must be eligible */ +		q->oldV = q->V = agg->S; +	} else if (agg != q->in_serv_agg) +		qfq_schedule_agg(q, agg); +} + +static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, +			    struct qfq_aggregate *agg) +{ +	unsigned int i, offset; +	u64 roundedS; + +	roundedS = qfq_round_down(agg->S, grp->slot_shift); +	offset = (roundedS - grp->S) >> grp->slot_shift; + +	i = (grp->front + offset) % QFQ_MAX_SLOTS; + +	hlist_del(&agg->next); +	if (hlist_empty(&grp->slots[i])) +		__clear_bit(offset, &grp->full_slots); +} + +/* + * Called to forcibly deschedule an aggregate.  If the aggregate is + * not in the front bucket, or if the latter has other aggregates in + * the front bucket, we can simply remove the aggregate with no other + * side effects. + * Otherwise we must propagate the event up. 
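
The deactivation code below, like the scheduling code above it, shuffles groups between the four per-state bitmaps. The encoding given earlier for qfq_calc_state() (ER=0, IR=1, EB=2, IB=3) is simply "ineligible" in bit 0 and "blocked" in bit 1; a toy rendering, with the hypothetical toy_calc_state() standing in for the real helper.

/* Illustration only -- the ER/IR/EB/IB encoding used by the group bitmaps. */
#include <assert.h>

enum toy_state { ER = 0, IR = 1, EB = 2, IB = 3 };  /* same values as above */

/* bit 0: group start time S is ahead of the virtual clock V (Ineligible)
 * bit 1: some group already in ER finishes earlier than we do (Blocked) */
static int toy_calc_state(int ineligible, int blocked)
{
	return (ineligible ? 1 : 0) | (blocked ? 2 : 0);
}

int main(void)
{
	assert(toy_calc_state(0, 0) == ER);
	assert(toy_calc_state(1, 0) == IR);
	assert(toy_calc_state(0, 1) == EB);
	assert(toy_calc_state(1, 1) == IB);
	return 0;
}
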
+ */ +static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ +	struct qfq_group *grp = agg->grp; +	unsigned long mask; +	u64 roundedS; +	int s; + +	if (agg == q->in_serv_agg) { +		charge_actual_service(agg); +		q->in_serv_agg = qfq_choose_next_agg(q); +		return; +	} + +	agg->F = agg->S; +	qfq_slot_remove(q, grp, agg); + +	if (!grp->full_slots) { +		__clear_bit(grp->index, &q->bitmaps[IR]); +		__clear_bit(grp->index, &q->bitmaps[EB]); +		__clear_bit(grp->index, &q->bitmaps[IB]); + +		if (test_bit(grp->index, &q->bitmaps[ER]) && +		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { +			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); +			if (mask) +				mask = ~((1UL << __fls(mask)) - 1); +			else +				mask = ~0UL; +			qfq_move_groups(q, mask, EB, ER); +			qfq_move_groups(q, mask, IB, IR); +		} +		__clear_bit(grp->index, &q->bitmaps[ER]); +	} else if (hlist_empty(&grp->slots[grp->front])) { +		agg = qfq_slot_scan(grp); +		roundedS = qfq_round_down(agg->S, grp->slot_shift); +		if (grp->S != roundedS) { +			__clear_bit(grp->index, &q->bitmaps[ER]); +			__clear_bit(grp->index, &q->bitmaps[IR]); +			__clear_bit(grp->index, &q->bitmaps[EB]); +			__clear_bit(grp->index, &q->bitmaps[IB]); +			grp->S = roundedS; +			grp->F = roundedS + (2ULL << grp->slot_shift); +			s = qfq_calc_state(q, grp); +			__set_bit(grp->index, &q->bitmaps[s]); +		} +	} +} + +static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl = (struct qfq_class *)arg; + +	if (cl->qdisc->q.qlen == 0) +		qfq_deactivate_class(q, cl); +} + +static unsigned int qfq_drop_from_slot(struct qfq_sched *q, +				       struct hlist_head *slot) +{ +	struct qfq_aggregate *agg; +	struct qfq_class *cl; +	unsigned int len; + +	hlist_for_each_entry(agg, slot, next) { +		list_for_each_entry(cl, &agg->active, alist) { + +			if (!cl->qdisc->ops->drop) +				continue; + +			len = cl->qdisc->ops->drop(cl->qdisc); +			if (len > 0) { +				if (cl->qdisc->q.qlen == 0) +					qfq_deactivate_class(q, cl); + +				return len; +			} +		} +	} +	return 0; +} + +static unsigned int qfq_drop(struct Qdisc *sch) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_group *grp; +	unsigned int i, j, len; + +	for (i = 0; i <= QFQ_MAX_INDEX; i++) { +		grp = &q->groups[i]; +		for (j = 0; j < QFQ_MAX_SLOTS; j++) { +			len = qfq_drop_from_slot(q, &grp->slots[j]); +			if (len > 0) { +				sch->q.qlen--; +				return len; +			} +		} + +	} + +	return 0; +} + +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_group *grp; +	int i, j, err; +	u32 max_cl_shift, maxbudg_shift, max_classes; + +	err = qdisc_class_hash_init(&q->clhash); +	if (err < 0) +		return err; + +	if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) +		max_classes = QFQ_MAX_AGG_CLASSES; +	else +		max_classes = qdisc_dev(sch)->tx_queue_len + 1; +	/* max_cl_shift = floor(log_2(max_classes)) */ +	max_cl_shift = __fls(max_classes); +	q->max_agg_classes = 1<<max_cl_shift; + +	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */ +	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; +	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; + +	for (i = 0; i <= QFQ_MAX_INDEX; i++) { +		grp = &q->groups[i]; +		grp->index = i; +		grp->slot_shift = q->min_slot_shift + i; +		for (j = 0; j < QFQ_MAX_SLOTS; j++) +			INIT_HLIST_HEAD(&grp->slots[j]); +	} + +	INIT_HLIST_HEAD(&q->nonfull_aggs); + +	return 0; +} + +static void qfq_reset_qdisc(struct 
Qdisc *sch) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl; +	unsigned int i; + +	for (i = 0; i < q->clhash.hashsize; i++) { +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { +			if (cl->qdisc->q.qlen > 0) +				qfq_deactivate_class(q, cl); + +			qdisc_reset(cl->qdisc); +		} +	} +	sch->q.qlen = 0; +} + +static void qfq_destroy_qdisc(struct Qdisc *sch) +{ +	struct qfq_sched *q = qdisc_priv(sch); +	struct qfq_class *cl; +	struct hlist_node *next; +	unsigned int i; + +	tcf_destroy_chain(&q->filter_list); + +	for (i = 0; i < q->clhash.hashsize; i++) { +		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], +					  common.hnode) { +			qfq_destroy_class(sch, cl); +		} +	} +	qdisc_class_hash_destroy(&q->clhash); +} + +static const struct Qdisc_class_ops qfq_class_ops = { +	.change		= qfq_change_class, +	.delete		= qfq_delete_class, +	.get		= qfq_get_class, +	.put		= qfq_put_class, +	.tcf_chain	= qfq_tcf_chain, +	.bind_tcf	= qfq_bind_tcf, +	.unbind_tcf	= qfq_unbind_tcf, +	.graft		= qfq_graft_class, +	.leaf		= qfq_class_leaf, +	.qlen_notify	= qfq_qlen_notify, +	.dump		= qfq_dump_class, +	.dump_stats	= qfq_dump_class_stats, +	.walk		= qfq_walk, +}; + +static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { +	.cl_ops		= &qfq_class_ops, +	.id		= "qfq", +	.priv_size	= sizeof(struct qfq_sched), +	.enqueue	= qfq_enqueue, +	.dequeue	= qfq_dequeue, +	.peek		= qdisc_peek_dequeued, +	.drop		= qfq_drop, +	.init		= qfq_init_qdisc, +	.reset		= qfq_reset_qdisc, +	.destroy	= qfq_destroy_qdisc, +	.owner		= THIS_MODULE, +}; + +static int __init qfq_init(void) +{ +	return register_qdisc(&qfq_qdisc_ops); +} + +static void __exit qfq_exit(void) +{ +	unregister_qdisc(&qfq_qdisc_ops); +} + +module_init(qfq_init); +module_exit(qfq_exit); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 8d42bb3ba54..633e32defdc 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -36,11 +36,12 @@  	if RED works correctly.   
*/ -struct red_sched_data -{ +struct red_sched_data {  	u32			limit;		/* HARD maximal queue length */  	unsigned char		flags; +	struct timer_list	adapt_timer;  	struct red_parms	parms; +	struct red_vars		vars;  	struct red_stats	stats;  	struct Qdisc		*qdisc;  }; @@ -55,47 +56,47 @@ static inline int red_use_harddrop(struct red_sched_data *q)  	return q->flags & TC_RED_HARDDROP;  } -static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)  {  	struct red_sched_data *q = qdisc_priv(sch);  	struct Qdisc *child = q->qdisc;  	int ret; -	q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog); - -	if (red_is_idling(&q->parms)) -		red_end_of_idle_period(&q->parms); - -	switch (red_action(&q->parms, q->parms.qavg)) { -		case RED_DONT_MARK: -			break; - -		case RED_PROB_MARK: -			sch->qstats.overlimits++; -			if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { -				q->stats.prob_drop++; -				goto congestion_drop; -			} - -			q->stats.prob_mark++; -			break; - -		case RED_HARD_MARK: -			sch->qstats.overlimits++; -			if (red_use_harddrop(q) || !red_use_ecn(q) || -			    !INET_ECN_set_ce(skb)) { -				q->stats.forced_drop++; -				goto congestion_drop; -			} - -			q->stats.forced_mark++; -			break; +	q->vars.qavg = red_calc_qavg(&q->parms, +				     &q->vars, +				     child->qstats.backlog); + +	if (red_is_idling(&q->vars)) +		red_end_of_idle_period(&q->vars); + +	switch (red_action(&q->parms, &q->vars, q->vars.qavg)) { +	case RED_DONT_MARK: +		break; + +	case RED_PROB_MARK: +		sch->qstats.overlimits++; +		if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { +			q->stats.prob_drop++; +			goto congestion_drop; +		} + +		q->stats.prob_mark++; +		break; + +	case RED_HARD_MARK: +		sch->qstats.overlimits++; +		if (red_use_harddrop(q) || !red_use_ecn(q) || +		    !INET_ECN_set_ce(skb)) { +			q->stats.forced_drop++; +			goto congestion_drop; +		} + +		q->stats.forced_mark++; +		break;  	}  	ret = qdisc_enqueue(skb, child);  	if (likely(ret == NET_XMIT_SUCCESS)) { -		sch->bstats.bytes += qdisc_pkt_len(skb); -		sch->bstats.packets++;  		sch->q.qlen++;  	} else if (net_xmit_drop_count(ret)) {  		q->stats.pdrop++; @@ -108,22 +109,24 @@ congestion_drop:  	return NET_XMIT_CN;  } -static struct sk_buff * red_dequeue(struct Qdisc* sch) +static struct sk_buff *red_dequeue(struct Qdisc *sch)  {  	struct sk_buff *skb;  	struct red_sched_data *q = qdisc_priv(sch);  	struct Qdisc *child = q->qdisc;  	skb = child->dequeue(child); -	if (skb) +	if (skb) { +		qdisc_bstats_update(sch, skb);  		sch->q.qlen--; -	else if (!red_is_idling(&q->parms)) -		red_start_of_idle_period(&q->parms); - +	} else { +		if (!red_is_idling(&q->vars)) +			red_start_of_idle_period(&q->vars); +	}  	return skb;  } -static struct sk_buff * red_peek(struct Qdisc* sch) +static struct sk_buff *red_peek(struct Qdisc *sch)  {  	struct red_sched_data *q = qdisc_priv(sch);  	struct Qdisc *child = q->qdisc; @@ -131,7 +134,7 @@ static struct sk_buff * red_peek(struct Qdisc* sch)  	return child->ops->peek(child);  } -static unsigned int red_drop(struct Qdisc* sch) +static unsigned int red_drop(struct Qdisc *sch)  {  	struct red_sched_data *q = qdisc_priv(sch);  	struct Qdisc *child = q->qdisc; @@ -144,30 +147,33 @@ static unsigned int red_drop(struct Qdisc* sch)  		return len;  	} -	if (!red_is_idling(&q->parms)) -		red_start_of_idle_period(&q->parms); +	if (!red_is_idling(&q->vars)) +		red_start_of_idle_period(&q->vars);  	return 0;  } -static void red_reset(struct Qdisc* sch) +static void 
red_reset(struct Qdisc *sch)  {  	struct red_sched_data *q = qdisc_priv(sch);  	qdisc_reset(q->qdisc);  	sch->q.qlen = 0; -	red_restart(&q->parms); +	red_restart(&q->vars);  }  static void red_destroy(struct Qdisc *sch)  {  	struct red_sched_data *q = qdisc_priv(sch); + +	del_timer_sync(&q->adapt_timer);  	qdisc_destroy(q->qdisc);  }  static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {  	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },  	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE }, +	[TCA_RED_MAX_P] = { .type = NLA_U32 },  };  static int red_change(struct Qdisc *sch, struct nlattr *opt) @@ -177,6 +183,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)  	struct tc_red_qopt *ctl;  	struct Qdisc *child = NULL;  	int err; +	u32 max_P;  	if (opt == NULL)  		return -EINVAL; @@ -189,6 +196,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)  	    tb[TCA_RED_STAB] == NULL)  		return -EINVAL; +	max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; +  	ctl = nla_data(tb[TCA_RED_PARMS]);  	if (ctl->limit > 0) { @@ -206,22 +215,42 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)  		q->qdisc = child;  	} -	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, -				 ctl->Plog, ctl->Scell_log, -				 nla_data(tb[TCA_RED_STAB])); +	red_set_parms(&q->parms, +		      ctl->qth_min, ctl->qth_max, ctl->Wlog, +		      ctl->Plog, ctl->Scell_log, +		      nla_data(tb[TCA_RED_STAB]), +		      max_P); +	red_set_vars(&q->vars); -	if (skb_queue_empty(&sch->q)) -		red_end_of_idle_period(&q->parms); +	del_timer(&q->adapt_timer); +	if (ctl->flags & TC_RED_ADAPTATIVE) +		mod_timer(&q->adapt_timer, jiffies + HZ/2); + +	if (!q->qdisc->q.qlen) +		red_start_of_idle_period(&q->vars);  	sch_tree_unlock(sch);  	return 0;  } -static int red_init(struct Qdisc* sch, struct nlattr *opt) +static inline void red_adaptative_timer(unsigned long arg) +{ +	struct Qdisc *sch = (struct Qdisc *)arg; +	struct red_sched_data *q = qdisc_priv(sch); +	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + +	spin_lock(root_lock); +	red_adaptative_algo(&q->parms, &q->vars); +	mod_timer(&q->adapt_timer, jiffies + HZ/2); +	spin_unlock(root_lock); +} + +static int red_init(struct Qdisc *sch, struct nlattr *opt)  {  	struct red_sched_data *q = qdisc_priv(sch);  	q->qdisc = &noop_qdisc; +	setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch);  	return red_change(sch, opt);  } @@ -239,10 +268,13 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)  		.Scell_log	= q->parms.Scell_log,  	}; +	sch->qstats.backlog = q->qdisc->qstats.backlog;  	opts = nla_nest_start(skb, TCA_OPTIONS);  	if (opts == NULL)  		goto nla_put_failure; -	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || +	    nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P)) +		goto nla_put_failure;  	return nla_nest_end(skb, opts);  nla_put_failure: diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c new file mode 100644 index 00000000000..9b0f7093d97 --- /dev/null +++ b/net/sched/sch_sfb.c @@ -0,0 +1,725 @@ +/* + * net/sched/sch_sfb.c	  Stochastic Fair Blue + * + * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * W. Feng, D. Kandlur, D. Saha, K. Shin. 
Blue: + * A New Class of Active Queue Management Algorithms. + * U. Michigan CSE-TR-387-99, April 1999. + * + * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <net/ip.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/flow_keys.h> + +/* + * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) + * This implementation uses L = 8 and N = 16 + * This permits us to split one 32bit hash (provided per packet by rxhash or + * external classifier) into 8 subhashes of 4 bits. + */ +#define SFB_BUCKET_SHIFT 4 +#define SFB_NUMBUCKETS	(1 << SFB_BUCKET_SHIFT) /* N bins per Level */ +#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) +#define SFB_LEVELS	(32 / SFB_BUCKET_SHIFT) /* L */ + +/* SFB algo uses a virtual queue, named "bin" */ +struct sfb_bucket { +	u16		qlen; /* length of virtual queue */ +	u16		p_mark; /* marking probability */ +}; + +/* We use a double buffering right before hash change + * (Section 4.4 of SFB reference : moving hash functions) + */ +struct sfb_bins { +	u32		  perturbation; /* jhash perturbation */ +	struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; +}; + +struct sfb_sched_data { +	struct Qdisc	*qdisc; +	struct tcf_proto *filter_list; +	unsigned long	rehash_interval; +	unsigned long	warmup_time;	/* double buffering warmup time in jiffies */ +	u32		max; +	u32		bin_size;	/* maximum queue length per bin */ +	u32		increment;	/* d1 */ +	u32		decrement;	/* d2 */ +	u32		limit;		/* HARD maximal queue length */ +	u32		penalty_rate; +	u32		penalty_burst; +	u32		tokens_avail; +	unsigned long	rehash_time; +	unsigned long	token_time; + +	u8		slot;		/* current active bins (0 or 1) */ +	bool		double_buffering; +	struct sfb_bins bins[2]; + +	struct { +		u32	earlydrop; +		u32	penaltydrop; +		u32	bucketdrop; +		u32	queuedrop; +		u32	childdrop;	/* drops in child qdisc */ +		u32	marked;		/* ECN mark */ +	} stats; +}; + +/* + * Each queued skb might be hashed on one or two bins + * We store in skb_cb the two hash values. + * (A zero value means double buffering was not used) + */ +struct sfb_skb_cb { +	u32 hashes[2]; +}; + +static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) +{ +	qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); +	return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; +} + +/* + * If using 'internal' SFB flow classifier, hash comes from skb rxhash + * If using external classifier, hash comes from the classid. + */ +static u32 sfb_hash(const struct sk_buff *skb, u32 slot) +{ +	return sfb_skb_cb(skb)->hashes[slot]; +} + +/* Probabilities are coded as Q0.16 fixed-point values, + * with 0xFFFF representing 65535/65536 (almost 1.0) + * Addition and subtraction are saturating in [0, 65535] + */ +static u32 prob_plus(u32 p1, u32 p2) +{ +	u32 res = p1 + p2; + +	return min_t(u32, res, SFB_MAX_PROB); +} + +static u32 prob_minus(u32 p1, u32 p2) +{ +	return p1 > p2 ? 
p1 - p2 : 0; +} + +static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) +{ +	int i; +	struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + +	for (i = 0; i < SFB_LEVELS; i++) { +		u32 hash = sfbhash & SFB_BUCKET_MASK; + +		sfbhash >>= SFB_BUCKET_SHIFT; +		if (b[hash].qlen < 0xFFFF) +			b[hash].qlen++; +		b += SFB_NUMBUCKETS; /* next level */ +	} +} + +static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ +	u32 sfbhash; + +	sfbhash = sfb_hash(skb, 0); +	if (sfbhash) +		increment_one_qlen(sfbhash, 0, q); + +	sfbhash = sfb_hash(skb, 1); +	if (sfbhash) +		increment_one_qlen(sfbhash, 1, q); +} + +static void decrement_one_qlen(u32 sfbhash, u32 slot, +			       struct sfb_sched_data *q) +{ +	int i; +	struct sfb_bucket *b = &q->bins[slot].bins[0][0]; + +	for (i = 0; i < SFB_LEVELS; i++) { +		u32 hash = sfbhash & SFB_BUCKET_MASK; + +		sfbhash >>= SFB_BUCKET_SHIFT; +		if (b[hash].qlen > 0) +			b[hash].qlen--; +		b += SFB_NUMBUCKETS; /* next level */ +	} +} + +static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +{ +	u32 sfbhash; + +	sfbhash = sfb_hash(skb, 0); +	if (sfbhash) +		decrement_one_qlen(sfbhash, 0, q); + +	sfbhash = sfb_hash(skb, 1); +	if (sfbhash) +		decrement_one_qlen(sfbhash, 1, q); +} + +static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ +	b->p_mark = prob_minus(b->p_mark, q->decrement); +} + +static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) +{ +	b->p_mark = prob_plus(b->p_mark, q->increment); +} + +static void sfb_zero_all_buckets(struct sfb_sched_data *q) +{ +	memset(&q->bins, 0, sizeof(q->bins)); +} + +/* + * compute max qlen, max p_mark, and avg p_mark + */ +static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) +{ +	int i; +	u32 qlen = 0, prob = 0, totalpm = 0; +	const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; + +	for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { +		if (qlen < b->qlen) +			qlen = b->qlen; +		totalpm += b->p_mark; +		if (prob < b->p_mark) +			prob = b->p_mark; +		b++; +	} +	*prob_r = prob; +	*avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); +	return qlen; +} + + +static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) +{ +	q->bins[slot].perturbation = prandom_u32(); +} + +static void sfb_swap_slot(struct sfb_sched_data *q) +{ +	sfb_init_perturbation(q->slot, q); +	q->slot ^= 1; +	q->double_buffering = false; +} + +/* Non elastic flows are allowed to use part of the bandwidth, expressed + * in "penalty_rate" packets per second, with "penalty_burst" burst + */ +static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) +{ +	if (q->penalty_rate == 0 || q->penalty_burst == 0) +		return true; + +	if (q->tokens_avail < 1) { +		unsigned long age = min(10UL * HZ, jiffies - q->token_time); + +		q->tokens_avail = (age * q->penalty_rate) / HZ; +		if (q->tokens_avail > q->penalty_burst) +			q->tokens_avail = q->penalty_burst; +		q->token_time = jiffies; +		if (q->tokens_avail < 1) +			return true; +	} + +	q->tokens_avail--; +	return false; +} + +static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q, +			 int *qerr, u32 *salt) +{ +	struct tcf_result res; +	int result; + +	result = tc_classify(skb, q->filter_list, &res); +	if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT +		switch (result) { +		case TC_ACT_STOLEN: +		case TC_ACT_QUEUED: +			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; +		case TC_ACT_SHOT: +			return false; +		} +#endif +		*salt = 
TC_H_MIN(res.classid); +		return true; +	} +	return false; +} + +static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + +	struct sfb_sched_data *q = qdisc_priv(sch); +	struct Qdisc *child = q->qdisc; +	int i; +	u32 p_min = ~0; +	u32 minqlen = ~0; +	u32 r, slot, salt, sfbhash; +	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; +	struct flow_keys keys; + +	if (unlikely(sch->q.qlen >= q->limit)) { +		sch->qstats.overlimits++; +		q->stats.queuedrop++; +		goto drop; +	} + +	if (q->rehash_interval > 0) { +		unsigned long limit = q->rehash_time + q->rehash_interval; + +		if (unlikely(time_after(jiffies, limit))) { +			sfb_swap_slot(q); +			q->rehash_time = jiffies; +		} else if (unlikely(!q->double_buffering && q->warmup_time > 0 && +				    time_after(jiffies, limit - q->warmup_time))) { +			q->double_buffering = true; +		} +	} + +	if (q->filter_list) { +		/* If using external classifiers, get result and record it. */ +		if (!sfb_classify(skb, q, &ret, &salt)) +			goto other_drop; +		keys.src = salt; +		keys.dst = 0; +		keys.ports = 0; +	} else { +		skb_flow_dissect(skb, &keys); +	} + +	slot = q->slot; + +	sfbhash = jhash_3words((__force u32)keys.dst, +			       (__force u32)keys.src, +			       (__force u32)keys.ports, +			       q->bins[slot].perturbation); +	if (!sfbhash) +		sfbhash = 1; +	sfb_skb_cb(skb)->hashes[slot] = sfbhash; + +	for (i = 0; i < SFB_LEVELS; i++) { +		u32 hash = sfbhash & SFB_BUCKET_MASK; +		struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + +		sfbhash >>= SFB_BUCKET_SHIFT; +		if (b->qlen == 0) +			decrement_prob(b, q); +		else if (b->qlen >= q->bin_size) +			increment_prob(b, q); +		if (minqlen > b->qlen) +			minqlen = b->qlen; +		if (p_min > b->p_mark) +			p_min = b->p_mark; +	} + +	slot ^= 1; +	sfb_skb_cb(skb)->hashes[slot] = 0; + +	if (unlikely(minqlen >= q->max)) { +		sch->qstats.overlimits++; +		q->stats.bucketdrop++; +		goto drop; +	} + +	if (unlikely(p_min >= SFB_MAX_PROB)) { +		/* Inelastic flow */ +		if (q->double_buffering) { +			sfbhash = jhash_3words((__force u32)keys.dst, +					       (__force u32)keys.src, +					       (__force u32)keys.ports, +					       q->bins[slot].perturbation); +			if (!sfbhash) +				sfbhash = 1; +			sfb_skb_cb(skb)->hashes[slot] = sfbhash; + +			for (i = 0; i < SFB_LEVELS; i++) { +				u32 hash = sfbhash & SFB_BUCKET_MASK; +				struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; + +				sfbhash >>= SFB_BUCKET_SHIFT; +				if (b->qlen == 0) +					decrement_prob(b, q); +				else if (b->qlen >= q->bin_size) +					increment_prob(b, q); +			} +		} +		if (sfb_rate_limit(skb, q)) { +			sch->qstats.overlimits++; +			q->stats.penaltydrop++; +			goto drop; +		} +		goto enqueue; +	} + +	r = prandom_u32() & SFB_MAX_PROB; + +	if (unlikely(r < p_min)) { +		if (unlikely(p_min > SFB_MAX_PROB / 2)) { +			/* If we're marking that many packets, then either +			 * this flow is unresponsive, or we're badly congested. +			 * In either case, we want to start dropping packets. 
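
The mark-or-drop decision above operates on Q0.16 probabilities: p_min is compared against a random value in [0, SFB_MAX_PROB], and once p_min exceeds 1/2 a growing share of the would-be marks turn into drops. A standalone sketch of that arithmetic; it takes SFB_MAX_PROB as 0xFFFF (implied by the Q0.16 comment earlier) and leaves out the ECN-capability check.

/* Illustration only -- SFB's Q0.16 probabilities and the mark/drop split. */
#include <assert.h>
#include <stdint.h>

#define SFB_MAX_PROB 0xFFFF            /* ~1.0 in Q0.16, per the comment above */

/* Saturating add/subtract, as in prob_plus()/prob_minus(). */
static uint32_t prob_plus(uint32_t p1, uint32_t p2)
{
	uint32_t res = p1 + p2;

	return res < SFB_MAX_PROB ? res : SFB_MAX_PROB;
}

static uint32_t prob_minus(uint32_t p1, uint32_t p2)
{
	return p1 > p2 ? p1 - p2 : 0;
}

/* 0 = pass untouched, 1 = ECN-mark, 2 = early drop (r is random in Q0.16). */
static int toy_decision(uint32_t p_min, uint32_t r)
{
	if (r >= p_min)
		return 0;
	if (p_min > SFB_MAX_PROB / 2 && r < (p_min - SFB_MAX_PROB / 2) * 2)
		return 2;
	return 1;
}

int main(void)
{
	assert(prob_plus(0xFFF0, 0x0100) == SFB_MAX_PROB);  /* saturates high */
	assert(prob_minus(0x0010, 0x0100) == 0);            /* saturates low  */

	/* p_min = 0.75: packets with r below ~0.5 are dropped outright,
	 * those between ~0.5 and 0.75 are marked, the rest pass. */
	assert(toy_decision(0xC000, 0x2000) == 2);
	assert(toy_decision(0xC000, 0xA000) == 1);
	assert(toy_decision(0xC000, 0xE000) == 0);
	return 0;
}
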
+			 */ +			if (r < (p_min - SFB_MAX_PROB / 2) * 2) { +				q->stats.earlydrop++; +				goto drop; +			} +		} +		if (INET_ECN_set_ce(skb)) { +			q->stats.marked++; +		} else { +			q->stats.earlydrop++; +			goto drop; +		} +	} + +enqueue: +	ret = qdisc_enqueue(skb, child); +	if (likely(ret == NET_XMIT_SUCCESS)) { +		sch->q.qlen++; +		increment_qlen(skb, q); +	} else if (net_xmit_drop_count(ret)) { +		q->stats.childdrop++; +		sch->qstats.drops++; +	} +	return ret; + +drop: +	qdisc_drop(skb, sch); +	return NET_XMIT_CN; +other_drop: +	if (ret & __NET_XMIT_BYPASS) +		sch->qstats.drops++; +	kfree_skb(skb); +	return ret; +} + +static struct sk_buff *sfb_dequeue(struct Qdisc *sch) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); +	struct Qdisc *child = q->qdisc; +	struct sk_buff *skb; + +	skb = child->dequeue(q->qdisc); + +	if (skb) { +		qdisc_bstats_update(sch, skb); +		sch->q.qlen--; +		decrement_qlen(skb, q); +	} + +	return skb; +} + +static struct sk_buff *sfb_peek(struct Qdisc *sch) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); +	struct Qdisc *child = q->qdisc; + +	return child->ops->peek(child); +} + +/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ + +static void sfb_reset(struct Qdisc *sch) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); + +	qdisc_reset(q->qdisc); +	sch->q.qlen = 0; +	q->slot = 0; +	q->double_buffering = false; +	sfb_zero_all_buckets(q); +	sfb_init_perturbation(0, q); +} + +static void sfb_destroy(struct Qdisc *sch) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); + +	tcf_destroy_chain(&q->filter_list); +	qdisc_destroy(q->qdisc); +} + +static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { +	[TCA_SFB_PARMS]	= { .len = sizeof(struct tc_sfb_qopt) }, +}; + +static const struct tc_sfb_qopt sfb_default_ops = { +	.rehash_interval = 600 * MSEC_PER_SEC, +	.warmup_time = 60 * MSEC_PER_SEC, +	.limit = 0, +	.max = 25, +	.bin_size = 20, +	.increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ +	.decrement = (SFB_MAX_PROB + 3000) / 6000, +	.penalty_rate = 10, +	.penalty_burst = 20, +}; + +static int sfb_change(struct Qdisc *sch, struct nlattr *opt) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); +	struct Qdisc *child; +	struct nlattr *tb[TCA_SFB_MAX + 1]; +	const struct tc_sfb_qopt *ctl = &sfb_default_ops; +	u32 limit; +	int err; + +	if (opt) { +		err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy); +		if (err < 0) +			return -EINVAL; + +		if (tb[TCA_SFB_PARMS] == NULL) +			return -EINVAL; + +		ctl = nla_data(tb[TCA_SFB_PARMS]); +	} + +	limit = ctl->limit; +	if (limit == 0) +		limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + +	child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); +	if (IS_ERR(child)) +		return PTR_ERR(child); + +	sch_tree_lock(sch); + +	qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen); +	qdisc_destroy(q->qdisc); +	q->qdisc = child; + +	q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); +	q->warmup_time = msecs_to_jiffies(ctl->warmup_time); +	q->rehash_time = jiffies; +	q->limit = limit; +	q->increment = ctl->increment; +	q->decrement = ctl->decrement; +	q->max = ctl->max; +	q->bin_size = ctl->bin_size; +	q->penalty_rate = ctl->penalty_rate; +	q->penalty_burst = ctl->penalty_burst; +	q->tokens_avail = ctl->penalty_burst; +	q->token_time = jiffies; + +	q->slot = 0; +	q->double_buffering = false; +	sfb_zero_all_buckets(q); +	sfb_init_perturbation(0, q); +	sfb_init_perturbation(1, q); + +	sch_tree_unlock(sch); + +	return 0; +} + +static int sfb_init(struct Qdisc *sch, struct nlattr *opt) +{ +	struct 
sfb_sched_data *q = qdisc_priv(sch); + +	q->qdisc = &noop_qdisc; +	return sfb_change(sch, opt); +} + +static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); +	struct nlattr *opts; +	struct tc_sfb_qopt opt = { +		.rehash_interval = jiffies_to_msecs(q->rehash_interval), +		.warmup_time = jiffies_to_msecs(q->warmup_time), +		.limit = q->limit, +		.max = q->max, +		.bin_size = q->bin_size, +		.increment = q->increment, +		.decrement = q->decrement, +		.penalty_rate = q->penalty_rate, +		.penalty_burst = q->penalty_burst, +	}; + +	sch->qstats.backlog = q->qdisc->qstats.backlog; +	opts = nla_nest_start(skb, TCA_OPTIONS); +	if (opts == NULL) +		goto nla_put_failure; +	if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure; +	return nla_nest_end(skb, opts); + +nla_put_failure: +	nla_nest_cancel(skb, opts); +	return -EMSGSIZE; +} + +static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); +	struct tc_sfb_xstats st = { +		.earlydrop = q->stats.earlydrop, +		.penaltydrop = q->stats.penaltydrop, +		.bucketdrop = q->stats.bucketdrop, +		.queuedrop = q->stats.queuedrop, +		.childdrop = q->stats.childdrop, +		.marked = q->stats.marked, +	}; + +	st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); + +	return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, +			  struct sk_buff *skb, struct tcmsg *tcm) +{ +	return -ENOSYS; +} + +static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, +		     struct Qdisc **old) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); + +	if (new == NULL) +		new = &noop_qdisc; + +	sch_tree_lock(sch); +	*old = q->qdisc; +	q->qdisc = new; +	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); +	qdisc_reset(*old); +	sch_tree_unlock(sch); +	return 0; +} + +static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); + +	return q->qdisc; +} + +static unsigned long sfb_get(struct Qdisc *sch, u32 classid) +{ +	return 1; +} + +static void sfb_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, +			    struct nlattr **tca, unsigned long *arg) +{ +	return -ENOSYS; +} + +static int sfb_delete(struct Qdisc *sch, unsigned long cl) +{ +	return -ENOSYS; +} + +static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ +	if (!walker->stop) { +		if (walker->count >= walker->skip) +			if (walker->fn(sch, 1, walker) < 0) { +				walker->stop = 1; +				return; +			} +		walker->count++; +	} +} + +static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl) +{ +	struct sfb_sched_data *q = qdisc_priv(sch); + +	if (cl) +		return NULL; +	return &q->filter_list; +} + +static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, +			      u32 classid) +{ +	return 0; +} + + +static const struct Qdisc_class_ops sfb_class_ops = { +	.graft		=	sfb_graft, +	.leaf		=	sfb_leaf, +	.get		=	sfb_get, +	.put		=	sfb_put, +	.change		=	sfb_change_class, +	.delete		=	sfb_delete, +	.walk		=	sfb_walk, +	.tcf_chain	=	sfb_find_tcf, +	.bind_tcf	=	sfb_bind, +	.unbind_tcf	=	sfb_put, +	.dump		=	sfb_dump_class, +}; + +static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { +	.id		=	"sfb", +	.priv_size	=	sizeof(struct sfb_sched_data), +	.cl_ops		=	&sfb_class_ops, +	.enqueue	=	sfb_enqueue, +	.dequeue	=	sfb_dequeue, +	.peek		=	sfb_peek, +	.init		=	sfb_init, +	
.reset		=	sfb_reset, +	.destroy	=	sfb_destroy, +	.change		=	sfb_change, +	.dump		=	sfb_dump, +	.dump_stats	=	sfb_dump_stats, +	.owner		=	THIS_MODULE, +}; + +static int __init sfb_module_init(void) +{ +	return register_qdisc(&sfb_qdisc_ops); +} + +static void __exit sfb_module_exit(void) +{ +	unregister_qdisc(&sfb_qdisc_ops); +} + +module_init(sfb_module_init) +module_exit(sfb_module_exit) + +MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); +MODULE_AUTHOR("Juliusz Chroboczek"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 3cf478d012d..1af2f73906d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -17,13 +17,14 @@  #include <linux/in.h>  #include <linux/errno.h>  #include <linux/init.h> -#include <linux/ipv6.h>  #include <linux/skbuff.h>  #include <linux/jhash.h>  #include <linux/slab.h> -#include <net/ip.h> +#include <linux/vmalloc.h>  #include <net/netlink.h>  #include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/red.h>  /*	Stochastic Fairness Queuing algorithm. @@ -66,105 +67,119 @@  	SFQ is superior for this purpose.  	IMPLEMENTATION: -	This implementation limits maximal queue length to 128; -	maximal mtu to 2^15-1; number of hash buckets to 1024. -	The only goal of this restrictions was that all data -	fit into one 4K page :-). Struct sfq_sched_data is -	organized in anti-cache manner: all the data for a bucket -	are scattered over different locations. This is not good, -	but it allowed me to put it into 4K. +	This implementation limits : +	- maximal queue length per flow to 127 packets. +	- max mtu to 2^18-1; +	- max 65408 flows, +	- number of hash buckets to 65536.  	It is easy to increase these values, but not in flight.  */ -#define SFQ_DEPTH		128 -#define SFQ_HASH_DIVISOR	1024 +#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */ +#define SFQ_DEFAULT_FLOWS	128 +#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ +#define SFQ_EMPTY_SLOT		0xffff +#define SFQ_DEFAULT_HASH_DIVISOR 1024 -/* This type should contain at least SFQ_DEPTH*2 values */ -typedef unsigned char sfq_index; +/* We use 16 bits to store allot, and want to handle packets up to 64K + * Scale allot by 8 (1<<3) so that no overflow occurs. + */ +#define SFQ_ALLOT_SHIFT		3 +#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) -struct sfq_head -{ +/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ +typedef u16 sfq_index; + +/* + * We dont use pointers to save space. + * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array + * while following values [SFQ_MAX_FLOWS ... 
SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] + * are 'pointers' to dep[] array + */ +struct sfq_head {  	sfq_index	next;  	sfq_index	prev;  }; -struct sfq_sched_data -{ -/* Parameters */ -	int		perturb_period; -	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */ -	int		limit; +struct sfq_slot { +	struct sk_buff	*skblist_next; +	struct sk_buff	*skblist_prev; +	sfq_index	qlen; /* number of skbs in skblist */ +	sfq_index	next; /* next slot in sfq RR chain */ +	struct sfq_head dep; /* anchor in dep[] chains */ +	unsigned short	hash; /* hash value (index in ht[]) */ +	short		allot; /* credit for this slot */ + +	unsigned int    backlog; +	struct red_vars vars; +}; -/* Variables */ +struct sfq_sched_data { +/* frequently used fields */ +	int		limit;		/* limit of total number of packets in this qdisc */ +	unsigned int	divisor;	/* number of slots in hash table */ +	u8		headdrop; +	u8		maxdepth;	/* limit of packets per flow */ + +	u32		perturbation; +	u8		cur_depth;	/* depth of longest slot */ +	u8		flags; +	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */  	struct tcf_proto *filter_list; +	sfq_index	*ht;		/* Hash table ('divisor' slots) */ +	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */ + +	struct red_parms *red_parms; +	struct tc_sfqred_stats stats; +	struct sfq_slot *tail;		/* current slot in round */ + +	struct sfq_head	dep[SFQ_MAX_DEPTH + 1]; +					/* Linked lists of slots, indexed by depth +					 * dep[0] : list of unused flows +					 * dep[1] : list of flows with 1 packet +					 * dep[X] : list of flows with X packets +					 */ + +	unsigned int	maxflows;	/* number of flows in flows array */ +	int		perturb_period; +	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */  	struct timer_list perturb_timer; -	u32		perturbation; -	sfq_index	tail;		/* Index of current slot in round */ -	sfq_index	max_depth;	/* Maximal depth */ - -	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */ -	sfq_index	next[SFQ_DEPTH];	/* Active slots link */ -	short		allot[SFQ_DEPTH];	/* Current allotment per slot */ -	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */ -	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */ -	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */  }; -static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +/* + * sfq_head are either in a sfq_slot or in dep[] array + */ +static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)  { -	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1); +	if (val < SFQ_MAX_FLOWS) +		return &q->slots[val].dep; +	return &q->dep[val - SFQ_MAX_FLOWS];  } -static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) -{ -	u32 h, h2; +/* + * In order to be able to quickly rehash our queue when timer changes + * q->perturbation, we store flow_keys in skb->cb[] + */ +struct sfq_skb_cb { +       struct flow_keys        keys; +}; -	switch (skb->protocol) { -	case htons(ETH_P_IP): -	{ -		const struct iphdr *iph; -		int poff; +static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) +{ +	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); +	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; +} -		if (!pskb_network_may_pull(skb, sizeof(*iph))) -			goto err; -		iph = ip_hdr(skb); -		h = (__force u32)iph->daddr; -		h2 = (__force u32)iph->saddr ^ iph->protocol; -		if (iph->frag_off & htons(IP_MF|IP_OFFSET)) -			break; -		poff = proto_ports_offset(iph->protocol); -		if (poff >= 0 && -		    
pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { -			iph = ip_hdr(skb); -			h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff); -		} -		break; -	} -	case htons(ETH_P_IPV6): -	{ -		struct ipv6hdr *iph; -		int poff; - -		if (!pskb_network_may_pull(skb, sizeof(*iph))) -			goto err; -		iph = ipv6_hdr(skb); -		h = (__force u32)iph->daddr.s6_addr32[3]; -		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; -		poff = proto_ports_offset(iph->nexthdr); -		if (poff >= 0 && -		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { -			iph = ipv6_hdr(skb); -			h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff); -		} -		break; -	} -	default: -err: -		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol; -		h2 = (unsigned long)skb->sk; -	} +static unsigned int sfq_hash(const struct sfq_sched_data *q, +			     const struct sk_buff *skb) +{ +	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; +	unsigned int hash; -	return sfq_fold_hash(q, h, h2); +	hash = jhash_3words((__force u32)keys->dst, +			    (__force u32)keys->src ^ keys->ip_proto, +			    (__force u32)keys->ports, q->perturbation); +	return hash & (q->divisor - 1);  }  static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -176,11 +191,13 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,  	if (TC_H_MAJ(skb->priority) == sch->handle &&  	    TC_H_MIN(skb->priority) > 0 && -	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR) +	    TC_H_MIN(skb->priority) <= q->divisor)  		return TC_H_MIN(skb->priority); -	if (!q->filter_list) +	if (!q->filter_list) { +		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);  		return sfq_hash(q, skb) + 1; +	}  	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;  	result = tc_classify(skb, q->filter_list, &res); @@ -194,36 +211,50 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,  			return 0;  		}  #endif -		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR) +		if (TC_H_MIN(res.classid) <= q->divisor)  			return TC_H_MIN(res.classid);  	}  	return 0;  } +/* + * x : slot number [0 .. 
SFQ_MAX_FLOWS - 1] + */  static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)  {  	sfq_index p, n; -	int d = q->qs[x].qlen + SFQ_DEPTH; +	struct sfq_slot *slot = &q->slots[x]; +	int qlen = slot->qlen; + +	p = qlen + SFQ_MAX_FLOWS; +	n = q->dep[qlen].next; -	p = d; -	n = q->dep[d].next; -	q->dep[x].next = n; -	q->dep[x].prev = p; -	q->dep[p].next = q->dep[n].prev = x; +	slot->dep.next = n; +	slot->dep.prev = p; + +	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */ +	sfq_dep_head(q, n)->prev = x;  } +#define sfq_unlink(q, x, n, p)			\ +	do {					\ +		n = q->slots[x].dep.next;	\ +		p = q->slots[x].dep.prev;	\ +		sfq_dep_head(q, p)->next = n;	\ +		sfq_dep_head(q, n)->prev = p;	\ +	} while (0) + +  static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)  {  	sfq_index p, n; +	int d; -	n = q->dep[x].next; -	p = q->dep[x].prev; -	q->dep[p].next = n; -	q->dep[n].prev = p; - -	if (n == p && q->max_depth == q->qs[x].qlen + 1) -		q->max_depth--; +	sfq_unlink(q, x, n, p); +	d = q->slots[x].qlen--; +	if (n == p && q->cur_depth == d) +		q->cur_depth--;  	sfq_link(q, x);  } @@ -232,34 +263,76 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)  	sfq_index p, n;  	int d; -	n = q->dep[x].next; -	p = q->dep[x].prev; -	q->dep[p].next = n; -	q->dep[n].prev = p; -	d = q->qs[x].qlen; -	if (q->max_depth < d) -		q->max_depth = d; +	sfq_unlink(q, x, n, p); +	d = ++q->slots[x].qlen; +	if (q->cur_depth < d) +		q->cur_depth = d;  	sfq_link(q, x);  } +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from tail of slot queue */ +static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) +{ +	struct sk_buff *skb = slot->skblist_prev; + +	slot->skblist_prev = skb->prev; +	skb->prev->next = (struct sk_buff *)slot; +	skb->next = skb->prev = NULL; +	return skb; +} + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) +{ +	struct sk_buff *skb = slot->skblist_next; + +	slot->skblist_next = skb->next; +	skb->next->prev = (struct sk_buff *)slot; +	skb->next = skb->prev = NULL; +	return skb; +} + +static inline void slot_queue_init(struct sfq_slot *slot) +{ +	memset(slot, 0, sizeof(*slot)); +	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; +} + +/* add skb to slot queue (tail add) */ +static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) +{ +	skb->prev = slot->skblist_prev; +	skb->next = (struct sk_buff *)slot; +	slot->skblist_prev->next = skb; +	slot->skblist_prev = skb; +} + +#define	slot_queue_walk(slot, skb)		\ +	for (skb = slot->skblist_next;		\ +	     skb != (struct sk_buff *)slot;	\ +	     skb = skb->next) +  static unsigned int sfq_drop(struct Qdisc *sch)  {  	struct sfq_sched_data *q = qdisc_priv(sch); -	sfq_index d = q->max_depth; +	sfq_index x, d = q->cur_depth;  	struct sk_buff *skb;  	unsigned int len; +	struct sfq_slot *slot; -	/* Queue is full! Find the longest slot and -	   drop a packet from it */ - +	/* Queue is full! Find the longest slot and drop tail packet from it */  	if (d > 1) { -		sfq_index x = q->dep[d + SFQ_DEPTH].next; -		skb = q->qs[x].prev; +		x = q->dep[d].next; +		slot = &q->slots[x]; +drop: +		skb = q->headdrop ? 
slot_dequeue_head(slot) : slot_dequeue_tail(slot);  		len = qdisc_pkt_len(skb); -		__skb_unlink(skb, &q->qs[x]); -		kfree_skb(skb); +		slot->backlog -= len;  		sfq_dec(q, x); +		kfree_skb(skb);  		sch->q.qlen--;  		sch->qstats.drops++;  		sch->qstats.backlog -= len; @@ -268,31 +341,43 @@ static unsigned int sfq_drop(struct Qdisc *sch)  	if (d == 1) {  		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ -		d = q->next[q->tail]; -		q->next[q->tail] = q->next[d]; -		q->allot[q->next[d]] += q->quantum; -		skb = q->qs[d].prev; -		len = qdisc_pkt_len(skb); -		__skb_unlink(skb, &q->qs[d]); -		kfree_skb(skb); -		sfq_dec(q, d); -		sch->q.qlen--; -		q->ht[q->hash[d]] = SFQ_DEPTH; -		sch->qstats.drops++; -		sch->qstats.backlog -= len; -		return len; +		x = q->tail->next; +		slot = &q->slots[x]; +		q->tail->next = slot->next; +		q->ht[slot->hash] = SFQ_EMPTY_SLOT; +		goto drop;  	}  	return 0;  } +/* Is ECN parameter configured */ +static int sfq_prob_mark(const struct sfq_sched_data *q) +{ +	return q->flags & TC_RED_ECN; +} + +/* Should packets over max threshold just be marked */ +static int sfq_hard_mark(const struct sfq_sched_data *q) +{ +	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; +} + +static int sfq_headdrop(const struct sfq_sched_data *q) +{ +	return q->headdrop; +} +  static int  sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)  {  	struct sfq_sched_data *q = qdisc_priv(sch);  	unsigned int hash; -	sfq_index x; +	sfq_index x, qlen; +	struct sfq_slot *slot;  	int uninitialized_var(ret); +	struct sk_buff *head; +	int delta;  	hash = sfq_classify(skb, sch, &ret);  	if (hash == 0) { @@ -304,54 +389,114 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)  	hash--;  	x = q->ht[hash]; -	if (x == SFQ_DEPTH) { -		q->ht[hash] = x = q->dep[SFQ_DEPTH].next; -		q->hash[x] = hash; +	slot = &q->slots[x]; +	if (x == SFQ_EMPTY_SLOT) { +		x = q->dep[0].next; /* get a free slot */ +		if (x >= SFQ_MAX_FLOWS) +			return qdisc_drop(skb, sch); +		q->ht[hash] = x; +		slot = &q->slots[x]; +		slot->hash = hash; +		slot->backlog = 0; /* should already be 0 anyway... */ +		red_set_vars(&slot->vars); +		goto enqueue;  	} +	if (q->red_parms) { +		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, +							&slot->vars, +							slot->backlog); +		switch (red_action(q->red_parms, +				   &slot->vars, +				   slot->vars.qavg)) { +		case RED_DONT_MARK: +			break; -	/* If selected queue has length q->limit, this means that -	 * all another queues are empty and that we do simple tail drop, -	 * i.e. drop _this_ packet. 
-	 */ -	if (q->qs[x].qlen >= q->limit) -		return qdisc_drop(skb, sch); +		case RED_PROB_MARK: +			sch->qstats.overlimits++; +			if (sfq_prob_mark(q)) { +				/* We know we have at least one packet in queue */ +				if (sfq_headdrop(q) && +				    INET_ECN_set_ce(slot->skblist_next)) { +					q->stats.prob_mark_head++; +					break; +				} +				if (INET_ECN_set_ce(skb)) { +					q->stats.prob_mark++; +					break; +				} +			} +			q->stats.prob_drop++; +			goto congestion_drop; + +		case RED_HARD_MARK: +			sch->qstats.overlimits++; +			if (sfq_hard_mark(q)) { +				/* We know we have at least one packet in queue */ +				if (sfq_headdrop(q) && +				    INET_ECN_set_ce(slot->skblist_next)) { +					q->stats.forced_mark_head++; +					break; +				} +				if (INET_ECN_set_ce(skb)) { +					q->stats.forced_mark++; +					break; +				} +			} +			q->stats.forced_drop++; +			goto congestion_drop; +		} +	} +	if (slot->qlen >= q->maxdepth) { +congestion_drop: +		if (!sfq_headdrop(q)) +			return qdisc_drop(skb, sch); + +		/* We know we have at least one packet in queue */ +		head = slot_dequeue_head(slot); +		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); +		sch->qstats.backlog -= delta; +		slot->backlog -= delta; +		qdisc_drop(head, sch); + +		slot_queue_add(slot, skb); +		return NET_XMIT_CN; +	} + +enqueue:  	sch->qstats.backlog += qdisc_pkt_len(skb); -	__skb_queue_tail(&q->qs[x], skb); +	slot->backlog += qdisc_pkt_len(skb); +	slot_queue_add(slot, skb);  	sfq_inc(q, x); -	if (q->qs[x].qlen == 1) {		/* The flow is new */ -		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */ -			q->tail = x; -			q->next[x] = x; -			q->allot[x] = q->quantum; +	if (slot->qlen == 1) {		/* The flow is new */ +		if (q->tail == NULL) {	/* It is the first flow */ +			slot->next = x;  		} else { -			q->next[x] = q->next[q->tail]; -			q->next[q->tail] = x; -			q->tail = x; +			slot->next = q->tail->next; +			q->tail->next = x;  		} +		/* We put this flow at the end of our flow list. +		 * This might sound unfair for a new flow to wait after old ones, +		 * but we could endup servicing new flows only, and freeze old ones. +		 */ +		q->tail = slot; +		/* We could use a bigger initial quantum for new flows */ +		slot->allot = q->scaled_quantum;  	} -	if (++sch->q.qlen <= q->limit) { -		sch->bstats.bytes += qdisc_pkt_len(skb); -		sch->bstats.packets++; +	if (++sch->q.qlen <= q->limit)  		return NET_XMIT_SUCCESS; -	} +	qlen = slot->qlen;  	sfq_drop(sch); -	return NET_XMIT_CN; -} - -static struct sk_buff * -sfq_peek(struct Qdisc *sch) -{ -	struct sfq_sched_data *q = qdisc_priv(sch); -	sfq_index a; - -	/* No active slots */ -	if (q->tail == SFQ_DEPTH) -		return NULL; +	/* Return Congestion Notification only if we dropped a packet +	 * from this flow. 
+	 */ +	if (qlen != slot->qlen) +		return NET_XMIT_CN; -	a = q->next[q->tail]; -	return skb_peek(&q->qs[a]); +	/* As we dropped a packet, better let upper stack know this */ +	qdisc_tree_decrease_qlen(sch, 1); +	return NET_XMIT_SUCCESS;  }  static struct sk_buff * @@ -359,34 +504,38 @@ sfq_dequeue(struct Qdisc *sch)  {  	struct sfq_sched_data *q = qdisc_priv(sch);  	struct sk_buff *skb; -	sfq_index a, old_a; +	sfq_index a, next_a; +	struct sfq_slot *slot;  	/* No active slots */ -	if (q->tail == SFQ_DEPTH) +	if (q->tail == NULL)  		return NULL; -	a = old_a = q->next[q->tail]; - -	/* Grab packet */ -	skb = __skb_dequeue(&q->qs[a]); +next_slot: +	a = q->tail->next; +	slot = &q->slots[a]; +	if (slot->allot <= 0) { +		q->tail = slot; +		slot->allot += q->scaled_quantum; +		goto next_slot; +	} +	skb = slot_dequeue_head(slot);  	sfq_dec(q, a); +	qdisc_bstats_update(sch, skb);  	sch->q.qlen--;  	sch->qstats.backlog -= qdisc_pkt_len(skb); - +	slot->backlog -= qdisc_pkt_len(skb);  	/* Is the slot empty? */ -	if (q->qs[a].qlen == 0) { -		q->ht[q->hash[a]] = SFQ_DEPTH; -		a = q->next[a]; -		if (a == old_a) { -			q->tail = SFQ_DEPTH; +	if (slot->qlen == 0) { +		q->ht[slot->hash] = SFQ_EMPTY_SLOT; +		next_a = slot->next; +		if (a == next_a) { +			q->tail = NULL; /* no more active slots */  			return skb;  		} -		q->next[q->tail] = a; -		q->allot[a] += q->quantum; -	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) { -		q->tail = a; -		a = q->next[a]; -		q->allot[a] += q->quantum; +		q->tail->next = next_a; +	} else { +		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));  	}  	return skb;  } @@ -400,12 +549,90 @@ sfq_reset(struct Qdisc *sch)  		kfree_skb(skb);  } +/* + * When q->perturbation is changed, we rehash all queued skbs + * to avoid OOO (Out Of Order) effects. + * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change + * counters. 
+ */ +static void sfq_rehash(struct Qdisc *sch) +{ +	struct sfq_sched_data *q = qdisc_priv(sch); +	struct sk_buff *skb; +	int i; +	struct sfq_slot *slot; +	struct sk_buff_head list; +	int dropped = 0; + +	__skb_queue_head_init(&list); + +	for (i = 0; i < q->maxflows; i++) { +		slot = &q->slots[i]; +		if (!slot->qlen) +			continue; +		while (slot->qlen) { +			skb = slot_dequeue_head(slot); +			sfq_dec(q, i); +			__skb_queue_tail(&list, skb); +		} +		slot->backlog = 0; +		red_set_vars(&slot->vars); +		q->ht[slot->hash] = SFQ_EMPTY_SLOT; +	} +	q->tail = NULL; + +	while ((skb = __skb_dequeue(&list)) != NULL) { +		unsigned int hash = sfq_hash(q, skb); +		sfq_index x = q->ht[hash]; + +		slot = &q->slots[x]; +		if (x == SFQ_EMPTY_SLOT) { +			x = q->dep[0].next; /* get a free slot */ +			if (x >= SFQ_MAX_FLOWS) { +drop:				sch->qstats.backlog -= qdisc_pkt_len(skb); +				kfree_skb(skb); +				dropped++; +				continue; +			} +			q->ht[hash] = x; +			slot = &q->slots[x]; +			slot->hash = hash; +		} +		if (slot->qlen >= q->maxdepth) +			goto drop; +		slot_queue_add(slot, skb); +		if (q->red_parms) +			slot->vars.qavg = red_calc_qavg(q->red_parms, +							&slot->vars, +							slot->backlog); +		slot->backlog += qdisc_pkt_len(skb); +		sfq_inc(q, x); +		if (slot->qlen == 1) {		/* The flow is new */ +			if (q->tail == NULL) {	/* It is the first flow */ +				slot->next = x; +			} else { +				slot->next = q->tail->next; +				q->tail->next = x; +			} +			q->tail = slot; +			slot->allot = q->scaled_quantum; +		} +	} +	sch->q.qlen -= dropped; +	qdisc_tree_decrease_qlen(sch, dropped); +} +  static void sfq_perturbation(unsigned long arg)  {  	struct Qdisc *sch = (struct Qdisc *)arg;  	struct sfq_sched_data *q = qdisc_priv(sch); +	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); -	q->perturbation = net_random(); +	spin_lock(root_lock); +	q->perturbation = prandom_u32(); +	if (!q->filter_list && q->tail) +		sfq_rehash(sch); +	spin_unlock(root_lock);  	if (q->perturb_period)  		mod_timer(&q->perturb_timer, jiffies + q->perturb_period); @@ -415,16 +642,53 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)  {  	struct sfq_sched_data *q = qdisc_priv(sch);  	struct tc_sfq_qopt *ctl = nla_data(opt); +	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;  	unsigned int qlen; +	struct red_parms *p = NULL;  	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))  		return -EINVAL; - +	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) +		ctl_v1 = nla_data(opt); +	if (ctl->divisor && +	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) +		return -EINVAL; +	if (ctl_v1 && ctl_v1->qth_min) { +		p = kmalloc(sizeof(*p), GFP_KERNEL); +		if (!p) +			return -ENOMEM; +	}  	sch_tree_lock(sch); -	q->quantum = ctl->quantum ? 
: psched_mtu(qdisc_dev(sch)); +	if (ctl->quantum) { +		q->quantum = ctl->quantum; +		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); +	}  	q->perturb_period = ctl->perturb_period * HZ; -	if (ctl->limit) -		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1); +	if (ctl->flows) +		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); +	if (ctl->divisor) { +		q->divisor = ctl->divisor; +		q->maxflows = min_t(u32, q->maxflows, q->divisor); +	} +	if (ctl_v1) { +		if (ctl_v1->depth) +			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); +		if (p) { +			swap(q->red_parms, p); +			red_set_parms(q->red_parms, +				      ctl_v1->qth_min, ctl_v1->qth_max, +				      ctl_v1->Wlog, +				      ctl_v1->Plog, ctl_v1->Scell_log, +				      NULL, +				      ctl_v1->max_P); +		} +		q->flags = ctl_v1->flags; +		q->headdrop = ctl_v1->headdrop; +	} +	if (ctl->limit) { +		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); +		q->maxflows = min_t(u32, q->maxflows, q->limit); +	}  	qlen = sch->q.qlen;  	while (sch->q.qlen > q->limit) @@ -434,12 +698,39 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)  	del_timer(&q->perturb_timer);  	if (q->perturb_period) {  		mod_timer(&q->perturb_timer, jiffies + q->perturb_period); -		q->perturbation = net_random(); +		q->perturbation = prandom_u32();  	}  	sch_tree_unlock(sch); +	kfree(p);  	return 0;  } +static void *sfq_alloc(size_t sz) +{ +	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); + +	if (!ptr) +		ptr = vmalloc(sz); +	return ptr; +} + +static void sfq_free(void *addr) +{ +	kvfree(addr); +} + +static void sfq_destroy(struct Qdisc *sch) +{ +	struct sfq_sched_data *q = qdisc_priv(sch); + +	tcf_destroy_chain(&q->filter_list); +	q->perturb_period = 0; +	del_timer_sync(&q->perturb_timer); +	sfq_free(q->ht); +	sfq_free(q->slots); +	kfree(q->red_parms); +} +  static int sfq_init(struct Qdisc *sch, struct nlattr *opt)  {  	struct sfq_sched_data *q = qdisc_priv(sch); @@ -449,56 +740,77 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)  	q->perturb_timer.data = (unsigned long)sch;  	init_timer_deferrable(&q->perturb_timer); -	for (i = 0; i < SFQ_HASH_DIVISOR; i++) -		q->ht[i] = SFQ_DEPTH; - -	for (i = 0; i < SFQ_DEPTH; i++) { -		skb_queue_head_init(&q->qs[i]); -		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH; -		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH; +	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { +		q->dep[i].next = i + SFQ_MAX_FLOWS; +		q->dep[i].prev = i + SFQ_MAX_FLOWS;  	} -	q->limit = SFQ_DEPTH - 1; -	q->max_depth = 0; -	q->tail = SFQ_DEPTH; -	if (opt == NULL) { -		q->quantum = psched_mtu(qdisc_dev(sch)); -		q->perturb_period = 0; -		q->perturbation = net_random(); -	} else { +	q->limit = SFQ_MAX_DEPTH; +	q->maxdepth = SFQ_MAX_DEPTH; +	q->cur_depth = 0; +	q->tail = NULL; +	q->divisor = SFQ_DEFAULT_HASH_DIVISOR; +	q->maxflows = SFQ_DEFAULT_FLOWS; +	q->quantum = psched_mtu(qdisc_dev(sch)); +	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); +	q->perturb_period = 0; +	q->perturbation = prandom_u32(); + +	if (opt) {  		int err = sfq_change(sch, opt);  		if (err)  			return err;  	} -	for (i = 0; i < SFQ_DEPTH; i++) +	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); +	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); +	if (!q->ht || !q->slots) { +		sfq_destroy(sch); +		return -ENOMEM; +	} +	for (i = 0; i < q->divisor; i++) +		q->ht[i] = SFQ_EMPTY_SLOT; + +	for (i = 0; i < q->maxflows; i++) { +		slot_queue_init(&q->slots[i]);  		sfq_link(q, i); +	} +	if (q->limit >= 1) +		sch->flags |= TCQ_F_CAN_BYPASS; +	else +		sch->flags &= 
~TCQ_F_CAN_BYPASS;  	return 0;  } -static void sfq_destroy(struct Qdisc *sch) -{ -	struct sfq_sched_data *q = qdisc_priv(sch); - -	tcf_destroy_chain(&q->filter_list); -	q->perturb_period = 0; -	del_timer_sync(&q->perturb_timer); -} -  static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)  {  	struct sfq_sched_data *q = qdisc_priv(sch);  	unsigned char *b = skb_tail_pointer(skb); -	struct tc_sfq_qopt opt; - -	opt.quantum = q->quantum; -	opt.perturb_period = q->perturb_period / HZ; - -	opt.limit = q->limit; -	opt.divisor = SFQ_HASH_DIVISOR; -	opt.flows = q->limit; +	struct tc_sfq_qopt_v1 opt; +	struct red_parms *p = q->red_parms; + +	memset(&opt, 0, sizeof(opt)); +	opt.v0.quantum	= q->quantum; +	opt.v0.perturb_period = q->perturb_period / HZ; +	opt.v0.limit	= q->limit; +	opt.v0.divisor	= q->divisor; +	opt.v0.flows	= q->maxflows; +	opt.depth	= q->maxdepth; +	opt.headdrop	= q->headdrop; + +	if (p) { +		opt.qth_min	= p->qth_min >> p->Wlog; +		opt.qth_max	= p->qth_max >> p->Wlog; +		opt.Wlog	= p->Wlog; +		opt.Plog	= p->Plog; +		opt.Scell_log	= p->Scell_log; +		opt.max_P	= p->max_P; +	} +	memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); +	opt.flags	= q->flags; -	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); +	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) +		goto nla_put_failure;  	return skb->len; @@ -520,6 +832,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)  static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,  			      u32 classid)  { +	/* we cannot bypass queue discipline anymore */ +	sch->flags &= ~TCQ_F_CAN_BYPASS;  	return 0;  } @@ -547,10 +861,17 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,  				struct gnet_dump *d)  {  	struct sfq_sched_data *q = qdisc_priv(sch); -	sfq_index idx = q->ht[cl-1]; -	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen }; -	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] }; +	sfq_index idx = q->ht[cl - 1]; +	struct gnet_stats_queue qs = { 0 }; +	struct tc_sfq_xstats xstats = { 0 }; +	if (idx != SFQ_EMPTY_SLOT) { +		const struct sfq_slot *slot = &q->slots[idx]; + +		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; +		qs.qlen = slot->qlen; +		qs.backlog = slot->backlog; +	}  	if (gnet_stats_copy_queue(d, &qs) < 0)  		return -1;  	return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); @@ -564,8 +885,8 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)  	if (arg->stop)  		return; -	for (i = 0; i < SFQ_HASH_DIVISOR; i++) { -		if (q->ht[i] == SFQ_DEPTH || +	for (i = 0; i < q->divisor; i++) { +		if (q->ht[i] == SFQ_EMPTY_SLOT ||  		    arg->count < arg->skip) {  			arg->count++;  			continue; @@ -596,7 +917,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {  	.priv_size	=	sizeof(struct sfq_sched_data),  	.enqueue	=	sfq_enqueue,  	.dequeue	=	sfq_dequeue, -	.peek		=	sfq_peek, +	.peek		=	qdisc_peek_dequeued,  	.drop		=	sfq_drop,  	.init		=	sfq_init,  	.reset		=	sfq_reset, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 641a30d6463..18ff6343370 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -19,6 +19,7 @@  #include <linux/errno.h>  #include <linux/skbuff.h>  #include <net/netlink.h> +#include <net/sch_generic.h>  #include <net/pkt_sched.h> @@ -97,35 +98,106 @@  	changed the limit is not effective anymore.  
*/ -struct tbf_sched_data -{ +struct tbf_sched_data {  /* Parameters */  	u32		limit;		/* Maximal length of backlog: bytes */ -	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */ -	u32		mtu;  	u32		max_size; -	struct qdisc_rate_table	*R_tab; -	struct qdisc_rate_table	*P_tab; +	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */ +	s64		mtu; +	struct psched_ratecfg rate; +	struct psched_ratecfg peak;  /* Variables */ -	long	tokens;			/* Current number of B tokens */ -	long	ptokens;		/* Current number of P tokens */ -	psched_time_t	t_c;		/* Time check-point */ +	s64	tokens;			/* Current number of B tokens */ +	s64	ptokens;		/* Current number of P tokens */ +	s64	t_c;			/* Time check-point */  	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */  	struct qdisc_watchdog watchdog;	/* Watchdog timer */  }; -#define L2T(q,L)   qdisc_l2t((q)->R_tab,L) -#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L) -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +/* Time to Length, convert time in ns to length in bytes + * to determinate how many bytes can be sent in given time. + */ +static u64 psched_ns_t2l(const struct psched_ratecfg *r, +			 u64 time_in_ns) +{ +	/* The formula is : +	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC +	 */ +	u64 len = time_in_ns * r->rate_bytes_ps; + +	do_div(len, NSEC_PER_SEC); + +	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { +		do_div(len, 53); +		len = len * 48; +	} + +	if (len > r->overhead) +		len -= r->overhead; +	else +		len = 0; + +	return len; +} + +/* + * Return length of individual segments of a gso packet, + * including all headers (MAC, IP, TCP/UDP) + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ +	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); +	return hdr_len + skb_gso_transport_seglen(skb); +} + +/* GSO packet is too big, segment it so that tbf can transmit + * each segment in time + */ +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)  {  	struct tbf_sched_data *q = qdisc_priv(sch); -	int ret; +	struct sk_buff *segs, *nskb; +	netdev_features_t features = netif_skb_features(skb); +	int ret, nb; + +	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); -	if (qdisc_pkt_len(skb) > q->max_size) +	if (IS_ERR_OR_NULL(segs))  		return qdisc_reshape_fail(skb, sch); +	nb = 0; +	while (segs) { +		nskb = segs->next; +		segs->next = NULL; +		qdisc_skb_cb(segs)->pkt_len = segs->len; +		ret = qdisc_enqueue(segs, q->qdisc); +		if (ret != NET_XMIT_SUCCESS) { +			if (net_xmit_drop_count(ret)) +				sch->qstats.drops++; +		} else { +			nb++; +		} +		segs = nskb; +	} +	sch->q.qlen += nb; +	if (nb > 1) +		qdisc_tree_decrease_qlen(sch, 1 - nb); +	consume_skb(skb); +	return nb > 0 ? 
NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ +	struct tbf_sched_data *q = qdisc_priv(sch); +	int ret; + +	if (qdisc_pkt_len(skb) > q->max_size) { +		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) +			return tbf_segment(skb, sch); +		return qdisc_reshape_fail(skb, sch); +	}  	ret = qdisc_enqueue(skb, q->qdisc);  	if (ret != NET_XMIT_SUCCESS) {  		if (net_xmit_drop_count(ret)) @@ -134,12 +206,10 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)  	}  	sch->q.qlen++; -	sch->bstats.bytes += qdisc_pkt_len(skb); -	sch->bstats.packets++;  	return NET_XMIT_SUCCESS;  } -static unsigned int tbf_drop(struct Qdisc* sch) +static unsigned int tbf_drop(struct Qdisc *sch)  {  	struct tbf_sched_data *q = qdisc_priv(sch);  	unsigned int len = 0; @@ -151,7 +221,12 @@ static unsigned int tbf_drop(struct Qdisc* sch)  	return len;  } -static struct sk_buff *tbf_dequeue(struct Qdisc* sch) +static bool tbf_peak_present(const struct tbf_sched_data *q) +{ +	return q->peak.rate_bytes_ps; +} + +static struct sk_buff *tbf_dequeue(struct Qdisc *sch)  {  	struct tbf_sched_data *q = qdisc_priv(sch);  	struct sk_buff *skb; @@ -159,24 +234,24 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)  	skb = q->qdisc->ops->peek(q->qdisc);  	if (skb) { -		psched_time_t now; -		long toks; -		long ptoks = 0; +		s64 now; +		s64 toks; +		s64 ptoks = 0;  		unsigned int len = qdisc_pkt_len(skb); -		now = psched_get_time(); -		toks = psched_tdiff_bounded(now, q->t_c, q->buffer); +		now = ktime_to_ns(ktime_get()); +		toks = min_t(s64, now - q->t_c, q->buffer); -		if (q->P_tab) { +		if (tbf_peak_present(q)) {  			ptoks = toks + q->ptokens; -			if (ptoks > (long)q->mtu) +			if (ptoks > q->mtu)  				ptoks = q->mtu; -			ptoks -= L2T_P(q, len); +			ptoks -= (s64) psched_l2t_ns(&q->peak, len);  		}  		toks += q->tokens; -		if (toks > (long)q->buffer) +		if (toks > q->buffer)  			toks = q->buffer; -		toks -= L2T(q, len); +		toks -= (s64) psched_l2t_ns(&q->rate, len);  		if ((toks|ptoks) >= 0) {  			skb = qdisc_dequeue_peeked(q->qdisc); @@ -187,12 +262,13 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)  			q->tokens = toks;  			q->ptokens = ptoks;  			sch->q.qlen--; -			sch->flags &= ~TCQ_F_THROTTLED; +			qdisc_unthrottled(sch); +			qdisc_bstats_update(sch, skb);  			return skb;  		} -		qdisc_watchdog_schedule(&q->watchdog, -					now + max_t(long, -toks, -ptoks)); +		qdisc_watchdog_schedule_ns(&q->watchdog, +					   now + max_t(long, -toks, -ptoks));  		/* Maybe we have a shorter packet in the queue,  		   which can be sent now. 
It sounds cool, @@ -210,13 +286,13 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)  	return NULL;  } -static void tbf_reset(struct Qdisc* sch) +static void tbf_reset(struct Qdisc *sch)  {  	struct tbf_sched_data *q = qdisc_priv(sch);  	qdisc_reset(q->qdisc);  	sch->q.qlen = 0; -	q->t_c = psched_get_time(); +	q->t_c = ktime_to_ns(ktime_get());  	q->tokens = q->buffer;  	q->ptokens = q->mtu;  	qdisc_watchdog_cancel(&q->watchdog); @@ -226,20 +302,26 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {  	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },  	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },  	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, +	[TCA_TBF_RATE64]	= { .type = NLA_U64 }, +	[TCA_TBF_PRATE64]	= { .type = NLA_U64 }, +	[TCA_TBF_BURST] = { .type = NLA_U32 }, +	[TCA_TBF_PBURST] = { .type = NLA_U32 },  }; -static int tbf_change(struct Qdisc* sch, struct nlattr *opt) +static int tbf_change(struct Qdisc *sch, struct nlattr *opt)  {  	int err;  	struct tbf_sched_data *q = qdisc_priv(sch); -	struct nlattr *tb[TCA_TBF_PTAB + 1]; +	struct nlattr *tb[TCA_TBF_MAX + 1];  	struct tc_tbf_qopt *qopt; -	struct qdisc_rate_table *rtab = NULL; -	struct qdisc_rate_table *ptab = NULL;  	struct Qdisc *child = NULL; -	int max_size,n; +	struct psched_ratecfg rate; +	struct psched_ratecfg peak; +	u64 max_size; +	s64 buffer, mtu; +	u64 rate64 = 0, prate64 = 0; -	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy); +	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);  	if (err < 0)  		return err; @@ -248,30 +330,59 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)  		goto done;  	qopt = nla_data(tb[TCA_TBF_PARMS]); -	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]); -	if (rtab == NULL) -		goto done; +	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) +		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, +					      tb[TCA_TBF_RTAB])); + +	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) +			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, +						      tb[TCA_TBF_PTAB])); + +	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); +	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); + +	if (tb[TCA_TBF_RATE64]) +		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); +	psched_ratecfg_precompute(&rate, &qopt->rate, rate64); + +	if (tb[TCA_TBF_BURST]) { +		max_size = nla_get_u32(tb[TCA_TBF_BURST]); +		buffer = psched_l2t_ns(&rate, max_size); +	} else { +		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); +	}  	if (qopt->peakrate.rate) { -		if (qopt->peakrate.rate > qopt->rate.rate) -			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]); -		if (ptab == NULL) +		if (tb[TCA_TBF_PRATE64]) +			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); +		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); +		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { +			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", +					peak.rate_bytes_ps, rate.rate_bytes_ps); +			err = -EINVAL;  			goto done; +		} + +		if (tb[TCA_TBF_PBURST]) { +			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); +			max_size = min_t(u32, max_size, pburst); +			mtu = psched_l2t_ns(&peak, pburst); +		} else { +			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); +		} +	} else { +		memset(&peak, 0, sizeof(peak));  	} -	for (n = 0; n < 256; n++) -		if (rtab->data[n] > qopt->buffer) break; -	max_size = (n << qopt->rate.cell_log)-1; -	if (ptab) { -		int size; +	if (max_size < psched_mtu(qdisc_dev(sch))) +		
pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", +				    max_size, qdisc_dev(sch)->name, +				    psched_mtu(qdisc_dev(sch))); -		for (n = 0; n < 256; n++) -			if (ptab->data[n] > qopt->mtu) break; -		size = (n << qopt->peakrate.cell_log)-1; -		if (size < max_size) max_size = size; -	} -	if (max_size < 0) +	if (!max_size) { +		err = -EINVAL;  		goto done; +	}  	if (q->qdisc != &noop_qdisc) {  		err = fifo_set_limit(q->qdisc, qopt->limit); @@ -292,33 +403,35 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)  		q->qdisc = child;  	}  	q->limit = qopt->limit; -	q->mtu = qopt->mtu; +	if (tb[TCA_TBF_PBURST]) +		q->mtu = mtu; +	else +		q->mtu = PSCHED_TICKS2NS(qopt->mtu);  	q->max_size = max_size; -	q->buffer = qopt->buffer; +	if (tb[TCA_TBF_BURST]) +		q->buffer = buffer; +	else +		q->buffer = PSCHED_TICKS2NS(qopt->buffer);  	q->tokens = q->buffer;  	q->ptokens = q->mtu; -	swap(q->R_tab, rtab); -	swap(q->P_tab, ptab); +	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); +	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));  	sch_tree_unlock(sch);  	err = 0;  done: -	if (rtab) -		qdisc_put_rtab(rtab); -	if (ptab) -		qdisc_put_rtab(ptab);  	return err;  } -static int tbf_init(struct Qdisc* sch, struct nlattr *opt) +static int tbf_init(struct Qdisc *sch, struct nlattr *opt)  {  	struct tbf_sched_data *q = qdisc_priv(sch);  	if (opt == NULL)  		return -EINVAL; -	q->t_c = psched_get_time(); +	q->t_c = ktime_to_ns(ktime_get());  	qdisc_watchdog_init(&q->watchdog, sch);  	q->qdisc = &noop_qdisc; @@ -330,12 +443,6 @@ static void tbf_destroy(struct Qdisc *sch)  	struct tbf_sched_data *q = qdisc_priv(sch);  	qdisc_watchdog_cancel(&q->watchdog); - -	if (q->P_tab) -		qdisc_put_rtab(q->P_tab); -	if (q->R_tab) -		qdisc_put_rtab(q->R_tab); -  	qdisc_destroy(q->qdisc);  } @@ -345,22 +452,30 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)  	struct nlattr *nest;  	struct tc_tbf_qopt opt; +	sch->qstats.backlog = q->qdisc->qstats.backlog;  	nest = nla_nest_start(skb, TCA_OPTIONS);  	if (nest == NULL)  		goto nla_put_failure;  	opt.limit = q->limit; -	opt.rate = q->R_tab->rate; -	if (q->P_tab) -		opt.peakrate = q->P_tab->rate; +	psched_ratecfg_getrate(&opt.rate, &q->rate); +	if (tbf_peak_present(q)) +		psched_ratecfg_getrate(&opt.peakrate, &q->peak);  	else  		memset(&opt.peakrate, 0, sizeof(opt.peakrate)); -	opt.mtu = q->mtu; -	opt.buffer = q->buffer; -	NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); +	opt.mtu = PSCHED_NS2TICKS(q->mtu); +	opt.buffer = PSCHED_NS2TICKS(q->buffer); +	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) +		goto nla_put_failure; +	if (q->rate.rate_bytes_ps >= (1ULL << 32) && +	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) +		goto nla_put_failure; +	if (tbf_peak_present(q) && +	    q->peak.rate_bytes_ps >= (1ULL << 32) && +	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) +		goto nla_put_failure; -	nla_nest_end(skb, nest); -	return skb->len; +	return nla_nest_end(skb, nest);  nla_put_failure:  	nla_nest_cancel(skb, nest); @@ -423,8 +538,7 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)  	}  } -static const struct Qdisc_class_ops tbf_class_ops = -{ +static const struct Qdisc_class_ops tbf_class_ops = {  	.graft		=	tbf_graft,  	.leaf		=	tbf_leaf,  	.get		=	tbf_get, diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 106479a7c94..47416716294 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -53,48 +53,45 @@        which will not break load 
balancing, though native slave        traffic will have the highest priority.  */ -struct teql_master -{ +struct teql_master {  	struct Qdisc_ops qops;  	struct net_device *dev;  	struct Qdisc *slaves;  	struct list_head master_list; +	unsigned long	tx_bytes; +	unsigned long	tx_packets; +	unsigned long	tx_errors; +	unsigned long	tx_dropped;  }; -struct teql_sched_data -{ +struct teql_sched_data {  	struct Qdisc *next;  	struct teql_master *m; -	struct neighbour *ncache;  	struct sk_buff_head q;  }; -#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next) +#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next) -#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT) +#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)  /* "teql*" qdisc routines */  static int -teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)  {  	struct net_device *dev = qdisc_dev(sch);  	struct teql_sched_data *q = qdisc_priv(sch);  	if (q->q.qlen < dev->tx_queue_len) {  		__skb_queue_tail(&q->q, skb); -		sch->bstats.bytes += qdisc_pkt_len(skb); -		sch->bstats.packets++;  		return NET_XMIT_SUCCESS;  	} -	kfree_skb(skb); -	sch->qstats.drops++; -	return NET_XMIT_DROP; +	return qdisc_drop(skb, sch);  }  static struct sk_buff * -teql_dequeue(struct Qdisc* sch) +teql_dequeue(struct Qdisc *sch)  {  	struct teql_sched_data *dat = qdisc_priv(sch);  	struct netdev_queue *dat_queue; @@ -108,19 +105,21 @@ teql_dequeue(struct Qdisc* sch)  			dat->m->slaves = sch;  			netif_wake_queue(m);  		} +	} else { +		qdisc_bstats_update(sch, skb);  	}  	sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;  	return skb;  }  static struct sk_buff * -teql_peek(struct Qdisc* sch) +teql_peek(struct Qdisc *sch)  {  	/* teql is meant to be used as root qdisc */  	return NULL;  } -static __inline__ void +static inline void  teql_neigh_release(struct neighbour *n)  {  	if (n) @@ -128,23 +127,23 @@ teql_neigh_release(struct neighbour *n)  }  static void -teql_reset(struct Qdisc* sch) +teql_reset(struct Qdisc *sch)  {  	struct teql_sched_data *dat = qdisc_priv(sch);  	skb_queue_purge(&dat->q);  	sch->q.qlen = 0; -	teql_neigh_release(xchg(&dat->ncache, NULL));  }  static void -teql_destroy(struct Qdisc* sch) +teql_destroy(struct Qdisc *sch)  {  	struct Qdisc *q, *prev;  	struct teql_sched_data *dat = qdisc_priv(sch);  	struct teql_master *master = dat->m; -	if ((prev = master->slaves) != NULL) { +	prev = master->slaves; +	if (prev) {  		do {  			q = NEXT_SLAVE(prev);  			if (q == sch) { @@ -165,7 +164,6 @@ teql_destroy(struct Qdisc* sch)  					}  				}  				skb_queue_purge(&dat->q); -				teql_neigh_release(xchg(&dat->ncache, NULL));  				break;  			} @@ -176,7 +174,7 @@ teql_destroy(struct Qdisc* sch)  static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)  {  	struct net_device *dev = qdisc_dev(sch); -	struct teql_master *m = (struct teql_master*)sch->ops; +	struct teql_master *m = (struct teql_master *)sch->ops;  	struct teql_sched_data *q = qdisc_priv(sch);  	if (dev->hard_header_len > m->dev->hard_header_len) @@ -222,23 +220,27 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)  static int -__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, +	       struct net_device *dev, struct netdev_queue *txq, +	       struct dst_entry *dst)  { -	struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0); -	struct teql_sched_data *q = qdisc_priv(dev_queue->qdisc); 
-	struct neighbour *mn = skb_dst(skb)->neighbour; -	struct neighbour *n = q->ncache; +	struct neighbour *n; +	int err = 0; -	if (mn->tbl == NULL) -		return -EINVAL; -	if (n && n->tbl == mn->tbl && -	    memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { -		atomic_inc(&n->refcnt); -	} else { -		n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); -		if (IS_ERR(n)) -			return PTR_ERR(n); +	n = dst_neigh_lookup_skb(dst, skb); +	if (!n) +		return -ENOENT; + +	if (dst->dev != dev) { +		struct neighbour *mn; + +		mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev); +		neigh_release(n); +		if (IS_ERR(mn)) +			return PTR_ERR(mn); +		n = mn;  	} +  	if (neigh_event_send(n, skb_res) == 0) {  		int err;  		char haddr[MAX_ADDR_LEN]; @@ -247,35 +249,39 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *  		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,  				      NULL, skb->len); -		if (err < 0) { -			neigh_release(n); -			return -EINVAL; -		} -		teql_neigh_release(xchg(&q->ncache, n)); -		return 0; +		if (err < 0) +			err = -EINVAL; +	} else { +		err = (skb_res == NULL) ? -EAGAIN : 1;  	}  	neigh_release(n); -	return (skb_res == NULL) ? -EAGAIN : 1; +	return err;  }  static inline int teql_resolve(struct sk_buff *skb, -			       struct sk_buff *skb_res, struct net_device *dev) +			       struct sk_buff *skb_res, +			       struct net_device *dev, +			       struct netdev_queue *txq)  { -	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); +	struct dst_entry *dst = skb_dst(skb); +	int res; +  	if (txq->qdisc == &noop_qdisc)  		return -ENODEV; -	if (dev->header_ops == NULL || -	    skb_dst(skb) == NULL || -	    skb_dst(skb)->neighbour == NULL) +	if (!dev->header_ops || !dst)  		return 0; -	return __teql_resolve(skb, skb_res, dev); + +	rcu_read_lock(); +	res = __teql_resolve(skb, skb_res, dev, txq, dst); +	rcu_read_unlock(); + +	return res;  }  static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)  {  	struct teql_master *master = netdev_priv(dev); -	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);  	struct Qdisc *start, *q;  	int busy;  	int nores; @@ -288,7 +294,8 @@ restart:  	nores = 0;  	busy = 0; -	if ((q = start) == NULL) +	q = start; +	if (!q)  		goto drop;  	do { @@ -298,30 +305,30 @@ restart:  		if (slave_txq->qdisc_sleeping != q)  			continue; -		if (__netif_subqueue_stopped(slave, subq) || +		if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||  		    !netif_running(slave)) {  			busy = 1;  			continue;  		} -		switch (teql_resolve(skb, skb_res, slave)) { +		switch (teql_resolve(skb, skb_res, slave, slave_txq)) {  		case 0:  			if (__netif_tx_trylock(slave_txq)) {  				unsigned int length = qdisc_pkt_len(skb); -				if (!netif_tx_queue_frozen_or_stopped(slave_txq) && +				if (!netif_xmit_frozen_or_stopped(slave_txq) &&  				    slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {  					txq_trans_update(slave_txq);  					__netif_tx_unlock(slave_txq);  					master->slaves = NEXT_SLAVE(q);  					netif_wake_queue(dev); -					txq->tx_packets++; -					txq->tx_bytes += length; +					master->tx_packets++; +					master->tx_bytes += length;  					return NETDEV_TX_OK;  				}  				__netif_tx_unlock(slave_txq);  			} -			if (netif_queue_stopped(dev)) +			if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))  				busy = 1;  			break;  		case 1: @@ -343,20 +350,20 @@ restart:  		netif_stop_queue(dev);  		return NETDEV_TX_BUSY;  	} -	dev->stats.tx_errors++; +	master->tx_errors++;  drop: -	
txq->tx_dropped++; +	master->tx_dropped++;  	dev_kfree_skb(skb);  	return NETDEV_TX_OK;  }  static int teql_master_open(struct net_device *dev)  { -	struct Qdisc * q; +	struct Qdisc *q;  	struct teql_master *m = netdev_priv(dev);  	int mtu = 0xFFFE; -	unsigned flags = IFF_NOARP|IFF_MULTICAST; +	unsigned int flags = IFF_NOARP | IFF_MULTICAST;  	if (m->slaves == NULL)  		return -EUNATCH; @@ -399,6 +406,18 @@ static int teql_master_close(struct net_device *dev)  	return 0;  } +static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, +						     struct rtnl_link_stats64 *stats) +{ +	struct teql_master *m = netdev_priv(dev); + +	stats->tx_packets	= m->tx_packets; +	stats->tx_bytes		= m->tx_bytes; +	stats->tx_errors	= m->tx_errors; +	stats->tx_dropped	= m->tx_dropped; +	return stats; +} +  static int teql_master_mtu(struct net_device *dev, int new_mtu)  {  	struct teql_master *m = netdev_priv(dev); @@ -412,7 +431,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)  		do {  			if (new_mtu > qdisc_dev(q)->mtu)  				return -EINVAL; -		} while ((q=NEXT_SLAVE(q)) != m->slaves); +		} while ((q = NEXT_SLAVE(q)) != m->slaves);  	}  	dev->mtu = new_mtu; @@ -423,6 +442,7 @@ static const struct net_device_ops teql_netdev_ops = {  	.ndo_open	= teql_master_open,  	.ndo_stop	= teql_master_close,  	.ndo_start_xmit	= teql_master_xmit, +	.ndo_get_stats64 = teql_master_stats64,  	.ndo_change_mtu	= teql_master_mtu,  };  | 
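
For readers following the sch_sfq.c rework above: the new sfq_dequeue() walks a circular list of active slots anchored at q->tail, serving the current slot until its scaled "allot" credit is spent, then topping the credit up with q->scaled_quantum and moving on to the next slot. The user-space sketch below reproduces only that credit accounting; the slot layout, the ALLOT_SIZE scaling macro and the demo traffic are illustrative stand-ins rather than the kernel definitions (SFQ_ALLOT_SIZE and SFQ_ALLOT_SHIFT live elsewhere in sch_sfq.c), and removal of emptied slots from the ring is omitted.

#include <stdio.h>

/* Illustrative stand-ins: the kernel uses struct sfq_slot, sfq_index and
 * SFQ_ALLOT_SIZE()/SFQ_ALLOT_SHIFT, none of which are reproduced here. */
#define NSLOTS 3
#define ALLOT_SHIFT 3                              /* assumed scaling factor */
#define ALLOT_SIZE(len) (((len) + (1 << ALLOT_SHIFT) - 1) >> ALLOT_SHIFT)

struct demo_slot {
	int next;      /* next slot in the circular RR list */
	int allot;     /* scaled credit for this slot */
	int qlen;      /* packets queued in this slot */
	int pkt_len;   /* fixed packet size used for the demo */
};

static struct demo_slot slots[NSLOTS];
static int tail;                                   /* current slot in round */
static int scaled_quantum;

/* Mirrors the dequeue loop in the patch: skip slots whose credit is spent
 * (refreshing it by one scaled quantum), then charge the dequeued packet. */
static int demo_dequeue(void)
{
	int a;

next_slot:
	a = slots[tail].next;
	if (slots[a].allot <= 0) {
		tail = a;
		slots[a].allot += scaled_quantum;
		goto next_slot;
	}
	slots[a].qlen--;
	slots[a].allot -= ALLOT_SIZE(slots[a].pkt_len);
	return a;                                  /* index of the flow served */
}

int main(void)
{
	int i;

	scaled_quantum = ALLOT_SIZE(1514);         /* quantum of one MTU-sized packet */
	for (i = 0; i < NSLOTS; i++) {
		slots[i].next = (i + 1) % NSLOTS;
		slots[i].allot = scaled_quantum;
		slots[i].qlen = 10;
		slots[i].pkt_len = 500 + 500 * i;  /* flows with unequal packet sizes */
	}
	tail = NSLOTS - 1;

	for (i = 0; i < 12; i++)
		printf("dequeue %2d -> flow %d\n", i, demo_dequeue());
	return 0;
}

With enough queued packets per flow, the output shows flows sending larger packets being served fewer times per round, which is the byte-level fairness the scaled quantum is meant to provide.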

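
The tbf_change() rework derives max_size from the configured burst time through psched_ns_t2l(), whose comment spells out the conversion: len = time_in_ns * rate_bytes_ps / NSEC_PER_SEC, reduced by the 48/53 ATM cell payload ratio when the link layer is ATM and by the configured per-packet overhead. The sketch below evaluates the same arithmetic in user space for two example rate configurations; struct ratecfg and its fields are local stand-ins for the psched_ratecfg members referenced in the patch, and the sample values are arbitrary.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Local stand-in for the psched_ratecfg fields used by the patch */
struct ratecfg {
	uint64_t rate_bytes_ps;   /* bytes per second */
	unsigned int overhead;    /* per-packet overhead, bytes */
	int atm;                  /* non-zero for an ATM link layer */
};

/* Same arithmetic as psched_ns_t2l() in the hunk above:
 * len = time_in_ns * rate_bytes_ps / NSEC_PER_SEC, then the 48/53 ATM
 * payload ratio and the configured overhead are taken off. */
static uint64_t ns_to_len(const struct ratecfg *r, uint64_t time_in_ns)
{
	uint64_t len = time_in_ns * r->rate_bytes_ps / NSEC_PER_SEC;

	if (r->atm)
		len = (len / 53) * 48;

	return len > r->overhead ? len - r->overhead : 0;
}

int main(void)
{
	/* 10 Mbit/s = 1,250,000 bytes/s; ask how many bytes fit in 10 ms */
	struct ratecfg eth = { .rate_bytes_ps = 1250000, .overhead = 0, .atm = 0 };
	struct ratecfg atm = { .rate_bytes_ps = 1250000, .overhead = 8, .atm = 1 };
	uint64_t burst_ns = 10 * 1000 * 1000ULL;

	printf("ethernet: %llu bytes in 10 ms\n",
	       (unsigned long long)ns_to_len(&eth, burst_ns));
	printf("atm:      %llu bytes in 10 ms\n",
	       (unsigned long long)ns_to_len(&atm, burst_ns));
	return 0;
}

This is the quantity the patch compares against the device MTU before warning that the configured burst is too small.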