Merge master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6

author: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-07 08:05:11 -0800
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-07 08:05:11 -0800
commit: 8e33ba49765484bc6de3a2f8143733713fa93bc1 (patch)
tree: 2ea080e478e4ee86a893b75db2d5c81ce14cbf10 /net
parent: 8cde0776ec1e86c270f65bf482f96288e6bf0023 (diff)
parent: 2d43f1128a4282fbe8442f40b4cbbac05d8f10aa (diff)
19 files changed, 663 insertions, 846 deletions
diff --git a/net/core/stream.c b/net/core/stream.c
index ac9edfdf874..15bfd03e802 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 {
 	struct task_struct *tsk = current;
 	DEFINE_WAIT(wait);
+	int done;
 
-	while (1) {
+	do {
 		if (sk->sk_err)
 			return sock_error(sk);
 		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 
 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 		sk->sk_write_pending++;
-		if (sk_wait_event(sk, timeo_p,
-				  !((1 << sk->sk_state) & 
-				    ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))))
-			break;
+		done = sk_wait_event(sk, timeo_p,
+				     !((1 << sk->sk_state) & 
+				       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
 		finish_wait(sk->sk_sleep, &wait);
 		sk->sk_write_pending--;
-	}
+	} while (!done);
 	return 0;
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6298cf58ff9..4b9bc81ae1a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
 	.lhash_users	= ATOMIC_INIT(0),
 	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
-	.port_rover	= 1024 - 1,
 };
 
 EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk)
 	int ret;
 
  	if (snum == 0) {
- 		int rover;
  		int low = sysctl_local_port_range[0];
  		int high = sysctl_local_port_range[1];
  		int remaining = (high - low) + 1;
+ 		int rover = net_random() % (high - low) + low;
 		struct hlist_node *node;
  		struct inet_timewait_sock *tw = NULL;
 
  		local_bh_disable();
-
- 		/* TODO. Actually it is not so bad idea to remove
- 		 * dccp_hashinfo.portalloc_lock before next submission to
-		 * Linus.
- 		 * As soon as we touch this place at all it is time to think.
- 		 *
- 		 * Now it protects single _advisory_ variable
-		 * dccp_hashinfo.port_rover, hence it is mostly useless.
- 		 * Code will work nicely if we just delete it, but
- 		 * I am afraid in contented case it will work not better or
- 		 * even worse: another cpu just will hit the same bucket
- 		 * and spin there.
- 		 * So some cpu salt could remove both contention and
- 		 * memory pingpong. Any ideas how to do this in a nice way?
- 		 */
- 		spin_lock(&dccp_hashinfo.portalloc_lock);
- 		rover = dccp_hashinfo.port_rover;
-
  		do {
- 			rover++;
- 			if ((rover < low) || (rover > high))
- 				rover = low;
  			head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
 						    dccp_hashinfo.bhash_size)];
  			spin_lock(&head->lock);
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk)
 
  		next_port:
  			spin_unlock(&head->lock);
+ 			if (++rover > high)
+ 				rover = low;
  		} while (--remaining > 0);
- 		dccp_hashinfo.port_rover = rover;
- 		spin_unlock(&dccp_hashinfo.portalloc_lock);
 
  		local_bh_enable();
 
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk)
 
 ok:
  		/* All locks still held and bhs disabled */
- 		dccp_hashinfo.port_rover = rover;
- 		spin_unlock(&dccp_hashinfo.portalloc_lock);
-
  		inet_bind_hash(sk, tb, rover);
 		if (sk_unhashed(sk)) {
  			inet_sk(sk)->sport = htons(rover);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 94468a76c5b..3fe021f1a56 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
-		int rover;
+		int rover = net_random() % (high - low) + low;
 
-		spin_lock(&hashinfo->portalloc_lock);
-		if (hashinfo->port_rover < low)
-			rover = low;
-		else
-			rover = hashinfo->port_rover;
 		do {
-			rover++;
-			if (rover > high)
-				rover = low;
 			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
 			break;
 		next:
 			spin_unlock(&head->lock);
+			if (++rover > high)
+				rover = low;
 		} while (--remaining > 0);
-		hashinfo->port_rover = rover;
-		spin_unlock(&hashinfo->portalloc_lock);
 
 		/* Exhausted local port range during search?  It is not
 		 * possible for us to be holding one of the bind hash
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 926a6684643..4108a5e12b3 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master,
 	exp_orig->expectfn = pptp_expectfn;
 	exp_orig->flags = 0;
 
-	exp_orig->dir = IP_CT_DIR_ORIGINAL;
-
 	/* both expectations are identical apart from tuple */
 	memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
 	memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple));
 
-	exp_reply->dir = !exp_orig->dir;
-
 	if (ip_nat_pptp_hook_exp_gre)
 		ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply);
 	else {
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 166e6069f12..82a65043a8e 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -815,7 +815,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 				  IPCTNL_MSG_CT_NEW, 1, ct);
 	ip_conntrack_put(ct);
 	if (err <= 0)
-		goto out;
+		goto free;
 
 	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
 	if (err < 0)
@@ -824,9 +824,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	DEBUGP("leaving\n");
 	return 0;
 
+free:
+	kfree_skb(skb2);
 out:
-	if (skb2)
-		kfree_skb(skb2);
 	return -1;
 }
 
@@ -1322,21 +1322,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
 				      1, exp);
 	if (err <= 0)
-		goto out;
+		goto free;
 
 	ip_conntrack_expect_put(exp);
 
-	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
-	if (err < 0)
-		goto free;
-
-	return err;
+	return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
 
+free:
+	kfree_skb(skb2);
 out:
 	ip_conntrack_expect_put(exp);
-free:
-	if (skb2)
-		kfree_skb(skb2);
 	return err;
 }
 
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index c5e3abd2467..762f4d93936 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum)
 	 * removed until we've grabbed the reference */
 	preempt_disable();
 	p = __ip_nat_proto_find(protonum);
-	if (p) {
-		if (!try_module_get(p->me))
-			p = &ip_nat_unknown_protocol;
-	}
+	if (!try_module_get(p->me))
+		p = &ip_nat_unknown_protocol;
 	preempt_enable();
 
 	return p;
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
index 3cdd0684d30..ee6ab74ad3a 100644
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -216,6 +216,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
 	expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id);
 	expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
 	expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+	expect_orig->dir = IP_CT_DIR_ORIGINAL;
 	inv_t.src.ip = reply_t->src.ip;
 	inv_t.dst.ip = reply_t->dst.ip;
 	inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
@@ -233,6 +234,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
 	expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id);
 	expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
 	expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+	expect_reply->dir = IP_CT_DIR_REPLY;
 	inv_t.src.ip = orig_t->src.ip;
 	inv_t.dst.ip = orig_t->dst.ip;
 	inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index 7c128540167..f7cad7cf1ae 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb,
 			break;
 		case GRE_VERSION_PPTP:
 			DEBUGP("call_id -> 0x%04x\n", 
-				ntohl(tuple->dst.u.gre.key));
-			pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key));
+				ntohs(tuple->dst.u.gre.key));
+			pgreh->call_id = tuple->dst.u.gre.key;
 			break;
 		default:
 			DEBUGP("can't nat unknown GRE version\n");
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index 99bbef56f84..f0099a646a0 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
 
 struct ip_nat_protocol ip_nat_unknown_protocol = {
 	.name			= "unknown",
-	.me			= THIS_MODULE,
+	/* .me isn't set: getting a ref to this cannot fail. */
 	.manip_pkt		= unknown_manip_pkt,
 	.in_range		= unknown_in_range,
 	.unique_tuple		= unknown_unique_tuple,
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 13463802133..05d66ab5942 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = {
 
 static int __init init(void)
 {
+	need_ip_conntrack();
 	return ipt_register_target(&ipt_connmark_reg);
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f3f0013a958..72b7c22e1ea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2112,7 +2112,6 @@ void __init tcp_init(void)
 		sysctl_tcp_max_orphans >>= (3 - order);
 		sysctl_max_syn_backlog = 128;
 	}
-	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
 
 	sysctl_tcp_mem[0] =  768 << order;
 	sysctl_tcp_mem[1] = 1024 << order;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c85819d8474..49d67cd75ed 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
 	.lhash_users	= ATOMIC_INIT(0),
 	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
-	.port_rover	= 1024 - 1,
 };
 
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d693cb988b7..d746d3b27ef 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
-		int rover;
+		int rover = net_random() % (high - low) + low;
 
-		spin_lock(&tcp_hashinfo.portalloc_lock);
-		if (tcp_hashinfo.port_rover < low)
-			rover = low;
-		else
-			rover = tcp_hashinfo.port_rover;
-		do {	rover++;
-			if (rover > high)
-				rover = low;
+		do {
 			head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 			break;
 		next:
 			spin_unlock(&head->lock);
+			if (++rover > high)
+				rover = low;
 		} while (--remaining > 0);
-		tcp_hashinfo.port_rover = rover;
-		spin_unlock(&tcp_hashinfo.portalloc_lock);
 
 		/* Exhausted local port range during search?  It is not
 		 * possible for us to be holding one of the bind hash
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index d10d552d9c4..d3a4f30a7f2 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb,
 
 	/* QUEUE == DROP if noone is waiting, to be safe. */
 	read_lock(&queue_handler_lock);
-	if (!queue_handler[pf]->outfn) {
+	if (!queue_handler[pf] || !queue_handler[pf]->outfn) {
 		read_unlock(&queue_handler_lock);
 		kfree_skb(*skb);
 		return 1;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index efcd10f996b..d194676f365 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid)
 		goto out_unlock;
 	}
 
-	inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
 	if (!inst)
 		goto out_unlock;
 
-	memset(inst, 0, sizeof(*inst));
 	INIT_HLIST_NODE(&inst->hlist);
 	inst->lock = SPIN_LOCK_UNLOCKED;
 	/* needs to be two, since we _put() after creation */
@@ -962,10 +961,9 @@ static int nful_open(struct inode *inode, struct file *file)
 	struct iter_state *is;
 	int ret;
 
-	is = kmalloc(sizeof(*is), GFP_KERNEL);
+	is = kzalloc(sizeof(*is), GFP_KERNEL);
 	if (!is)
 		return -ENOMEM;
-	memset(is, 0, sizeof(*is));
 	ret = seq_open(file, &nful_seq_ops);
 	if (ret < 0)
 		goto out_free;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index eaa44c49567..f065a6c9495 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid)
 		goto out_unlock;
 	}
 
-	inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
 	if (!inst)
 		goto out_unlock;
 
-	memset(inst, 0, sizeof(*inst));
 	inst->queue_num = queue_num;
 	inst->peer_pid = pid;
 	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
@@ -1036,10 +1035,9 @@ static int nfqnl_open(struct inode *inode, struct file *file)
 	struct iter_state *is;
 	int ret;
 
-	is = kmalloc(sizeof(*is), GFP_KERNEL);
+	is = kzalloc(sizeof(*is), GFP_KERNEL);
 	if (!is)
 		return -ENOMEM;
-	memset(is, 0, sizeof(*is));
 	ret = seq_open(file, &nfqnl_seq_ops);
 	if (ret < 0)
 		goto out_free;
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 25c171c3271..29a2dd9f302 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -15,247 +15,281 @@
  *		         from Ren Liu
  *		       - More error checks
  *
- *
- *
- *  For all the glorious comments look at Alexey's sch_red.c
+ *  For all the glorious comments look at include/net/red.h
  */
 
 #include <linux/config.h>
 #include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
 #include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/notifier.h>
-#include <net/ip.h>
-#include <net/route.h>
 #include <linux/skbuff.h>
-#include <net/sock.h>
 #include <net/pkt_sched.h>
+#include <net/red.h>
 
-#if 1 /* control */
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
-#if 0 /* data */
-#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define D2PRINTK(format,args...)
-#endif
+#define GRED_DEF_PRIO (MAX_DPs / 2)
+#define GRED_VQ_MASK (MAX_DPs - 1)
 
 struct gred_sched_data;
 struct gred_sched;
 
 struct gred_sched_data
 {
-/* Parameters */
 	u32		limit;		/* HARD maximal queue length	*/
-	u32		qth_min;	/* Min average length threshold: A scaled */
-	u32		qth_max;	/* Max average length threshold: A scaled */
 	u32      	DP;		/* the drop pramaters */
-	char		Wlog;		/* log(W)		*/
-	char		Plog;		/* random number bits	*/
-	u32		Scell_max;
-	u32		Rmask;
 	u32		bytesin;	/* bytes seen on virtualQ so far*/
 	u32		packetsin;	/* packets seen on virtualQ so far*/
 	u32		backlog;	/* bytes on the virtualQ */
-	u32		forced;	/* packets dropped for exceeding limits */
-	u32		early;	/* packets dropped as a warning */
-	u32		other;	/* packets dropped by invoking drop() */
-	u32		pdrop;	/* packets dropped because we exceeded physical queue limits */
-	char		Scell_log;
-	u8		Stab[256];
-	u8              prio;        /* the prio of this vq */
-
-/* Variables */
-	unsigned long	qave;		/* Average queue length: A scaled */
-	int		qcount;		/* Packets since last random number generation */
-	u32		qR;		/* Cached random number */
-
-	psched_time_t	qidlestart;	/* Start of idle period	*/
+	u8		prio;		/* the prio of this vq */
+
+	struct red_parms parms;
+	struct red_stats stats;
+};
+
+enum {
+	GRED_WRED_MODE = 1,
+	GRED_RIO_MODE,
 };
 
 struct gred_sched
 {
 	struct gred_sched_data *tab[MAX_DPs];
-	u32 		DPs;   
-	u32 		def; 
-	u8 		initd; 
-	u8 		grio; 
-	u8 		eqp; 
+	unsigned long	flags;
+	u32		red_flags;
+	u32 		DPs;
+	u32 		def;
+	struct red_parms wred_set;
 };
 
-static int
-gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static inline int gred_wred_mode(struct gred_sched *table)
 {
-	psched_time_t now;
-	struct gred_sched_data *q=NULL;
-	struct gred_sched *t= qdisc_priv(sch);
-	unsigned long	qave=0;	
-	int i=0;
+	return test_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_enable_wred_mode(struct gred_sched *table)
+{
+	__set_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_disable_wred_mode(struct gred_sched *table)
+{
+	__clear_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline int gred_rio_mode(struct gred_sched *table)
+{
+	return test_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_enable_rio_mode(struct gred_sched *table)
+{
+	__set_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_disable_rio_mode(struct gred_sched *table)
+{
+	__clear_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline int gred_wred_mode_check(struct Qdisc *sch)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	int i;
 
-	if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
-		D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n");
-		goto do_enqueue;
+	/* Really ugly O(n^2), but this check shouldn't be needed too frequently. */
+	for (i = 0; i < table->DPs; i++) {
+		struct gred_sched_data *q = table->tab[i];
+		int n;
+
+		if (q == NULL)
+			continue;
+
+		for (n = 0; n < table->DPs; n++)
+			if (table->tab[n] && table->tab[n] != q &&
+			    table->tab[n]->prio == q->prio)
+				return 1;
 	}
 
+	return 0;
+}
+
+static inline unsigned int gred_backlog(struct gred_sched *table,
+					struct gred_sched_data *q,
+					struct Qdisc *sch)
+{
+	if (gred_wred_mode(table))
+		return sch->qstats.backlog;
+	else
+		return q->backlog;
+}
+
+static inline u16 tc_index_to_dp(struct sk_buff *skb)
+{
+	return skb->tc_index & GRED_VQ_MASK;
+}
+
+static inline void gred_load_wred_set(struct gred_sched *table,
+				      struct gred_sched_data *q)
+{
+	q->parms.qavg = table->wred_set.qavg;
+	q->parms.qidlestart = table->wred_set.qidlestart;
+}
+
+static inline void gred_store_wred_set(struct gred_sched *table,
+				       struct gred_sched_data *q)
+{
+	table->wred_set.qavg = q->parms.qavg;
+}
+
+static inline int gred_use_ecn(struct gred_sched *t)
+{
+	return t->red_flags & TC_RED_ECN;
+}
 
-	if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) {
-		printk("GRED: setting to default (%d)\n ",t->def);
-		if (!(q=t->tab[t->def])) {
-			DPRINTK("GRED: setting to default FAILED! dropping!! "
-			    "(%d)\n ", t->def);
-			goto drop;
+static inline int gred_use_harddrop(struct gred_sched *t)
+{
+	return t->red_flags & TC_RED_HARDDROP;
+}
+
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+	struct gred_sched_data *q=NULL;
+	struct gred_sched *t= qdisc_priv(sch);
+	unsigned long qavg = 0;
+	u16 dp = tc_index_to_dp(skb);
+
+	if (dp >= t->DPs  || (q = t->tab[dp]) == NULL) {
+		dp = t->def;
+
+		if ((q = t->tab[dp]) == NULL) {
+			/* Pass through packets not assigned to a DP
+			 * if no default DP has been configured. This
+			 * allows for DP flows to be left untouched.
+			 */
+			if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
+				return qdisc_enqueue_tail(skb, sch);
+			else
+				goto drop;
 		}
+
 		/* fix tc_index? --could be controvesial but needed for
 		   requeueing */
-		skb->tc_index=(skb->tc_index&0xfffffff0) | t->def;
+		skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
 	}
 
-	D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
-	    "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog,
-	    sch->qstats.backlog);
-	/* sum up all the qaves of prios <= to ours to get the new qave*/
-	if (!t->eqp && t->grio) {
-		for (i=0;i<t->DPs;i++) {
-			if ((!t->tab[i]) || (i==q->DP))	
-				continue; 
-				
-			if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
-				qave +=t->tab[i]->qave;
+	/* sum up the qavgs of all non-idle VQs with prio < ours to get the new qavg */
+	if (!gred_wred_mode(t) && gred_rio_mode(t)) {
+		int i;
+
+		for (i = 0; i < t->DPs; i++) {
+			if (t->tab[i] && t->tab[i]->prio < q->prio &&
+			    !red_is_idling(&t->tab[i]->parms))
+				qavg +=t->tab[i]->parms.qavg;
 		}
-			
+
 	}
 
 	q->packetsin++;
-	q->bytesin+=skb->len;
+	q->bytesin += skb->len;
 
-	if (t->eqp && t->grio) {
-		qave=0;
-		q->qave=t->tab[t->def]->qave;
-		q->qidlestart=t->tab[t->def]->qidlestart;
-	}
+	if (gred_wred_mode(t))
+		gred_load_wred_set(t, q);
 
-	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
-		long us_idle;
-		PSCHED_GET_TIME(now);
-		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
-		PSCHED_SET_PASTPERFECT(q->qidlestart);
+	q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
 
-		q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF];
-	} else {
-		if (t->eqp) {
-			q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
-		} else {
-			q->qave += q->backlog - (q->qave >> q->Wlog);
-		}
+	if (red_is_idling(&q->parms))
+		red_end_of_idle_period(&q->parms);
 
-	}
-	
-
-	if (t->eqp && t->grio) 
-		t->tab[t->def]->qave=q->qave;
-
-	if ((q->qave+qave) < q->qth_min) {
-		q->qcount = -1;
-enqueue:
-		if (q->backlog + skb->len <= q->limit) {
-			q->backlog += skb->len;
-do_enqueue:
-			__skb_queue_tail(&sch->q, skb);
-			sch->qstats.backlog += skb->len;
-			sch->bstats.bytes += skb->len;
-			sch->bstats.packets++;
-			return 0;
-		} else {
-			q->pdrop++;
-		}
+	if (gred_wred_mode(t))
+		gred_store_wred_set(t, q);
 
-drop:
-		kfree_skb(skb);
-		sch->qstats.drops++;
-		return NET_XMIT_DROP;
-	}
-	if ((q->qave+qave) >= q->qth_max) {
-		q->qcount = -1;
-		sch->qstats.overlimits++;
-		q->forced++;
-		goto drop;
+	switch (red_action(&q->parms, q->parms.qavg + qavg)) {
+		case RED_DONT_MARK:
+			break;
+
+		case RED_PROB_MARK:
+			sch->qstats.overlimits++;
+			if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+				q->stats.prob_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.prob_mark++;
+			break;
+
+		case RED_HARD_MARK:
+			sch->qstats.overlimits++;
+			if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+			q->stats.forced_mark++;
+			break;
 	}
-	if (++q->qcount) {
-		if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
-			goto enqueue;
-		q->qcount = 0;
-		q->qR = net_random()&q->Rmask;
-		sch->qstats.overlimits++;
-		q->early++;
-		goto drop;
+
+	if (q->backlog + skb->len <= q->limit) {
+		q->backlog += skb->len;
+		return qdisc_enqueue_tail(skb, sch);
 	}
-	q->qR = net_random()&q->Rmask;
-	goto enqueue;
+
+	q->stats.pdrop++;
+drop:
+	return qdisc_drop(skb, sch);
+
+congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
 }
 
-static int
-gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
+static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
 {
+	struct gred_sched *t = qdisc_priv(sch);
 	struct gred_sched_data *q;
-	struct gred_sched *t= qdisc_priv(sch);
-	q= t->tab[(skb->tc_index&0xf)];
-/* error checking here -- probably unnecessary */
-	PSCHED_SET_PASTPERFECT(q->qidlestart);
-
-	__skb_queue_head(&sch->q, skb);
-	sch->qstats.backlog += skb->len;
-	sch->qstats.requeues++;
-	q->backlog += skb->len;
-	return 0;
+	u16 dp = tc_index_to_dp(skb);
+
+	if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x "
+			       "for requeue, screwing up backlog.\n",
+			       tc_index_to_dp(skb));
+	} else {
+		if (red_is_idling(&q->parms))
+			red_end_of_idle_period(&q->parms);
+		q->backlog += skb->len;
+	}
+
+	return qdisc_requeue(skb, sch);
 }
 
-static struct sk_buff *
-gred_dequeue(struct Qdisc* sch)
+static struct sk_buff *gred_dequeue(struct Qdisc* sch)
 {
 	struct sk_buff *skb;
-	struct gred_sched_data *q;
-	struct gred_sched *t= qdisc_priv(sch);
+	struct gred_sched *t = qdisc_priv(sch);
+
+	skb = qdisc_dequeue_head(sch);
 
-	skb = __skb_dequeue(&sch->q);
 	if (skb) {
-		sch->qstats.backlog -= skb->len;
-		q= t->tab[(skb->tc_index&0xf)];
-		if (q) {
-			q->backlog -= skb->len;
-			if (!q->backlog && !t->eqp)
-				PSCHED_GET_TIME(q->qidlestart);
+		struct gred_sched_data *q;
+		u16 dp = tc_index_to_dp(skb);
+
+		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+			if (net_ratelimit())
+				printk(KERN_WARNING "GRED: Unable to relocate "
+				       "VQ 0x%x after dequeue, screwing up "
+				       "backlog.\n", tc_index_to_dp(skb));
 		} else {
-			D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); 
+			q->backlog -= skb->len;
+
+			if (!q->backlog && !gred_wred_mode(t))
+				red_start_of_idle_period(&q->parms);
 		}
+
 		return skb;
 	}
 
-	if (t->eqp) {
-			q= t->tab[t->def];
-			if (!q)	
-				D2PRINTK("no default VQ set: Results will be "
-				       "screwed up\n");
-			else
-				PSCHED_GET_TIME(q->qidlestart);
-	}
+	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
+		red_start_of_idle_period(&t->wred_set);
 
 	return NULL;
 }
@@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch)
 static unsigned int gred_drop(struct Qdisc* sch)
 {
 	struct sk_buff *skb;
+	struct gred_sched *t = qdisc_priv(sch);
 
-	struct gred_sched_data *q;
-	struct gred_sched *t= qdisc_priv(sch);
-
-	skb = __skb_dequeue_tail(&sch->q);
+	skb = qdisc_dequeue_tail(sch);
 	if (skb) {
 		unsigned int len = skb->len;
-		sch->qstats.backlog -= len;
-		sch->qstats.drops++;
-		q= t->tab[(skb->tc_index&0xf)];
-		if (q) {
-			q->backlog -= len;
-			q->other++;
-			if (!q->backlog && !t->eqp)
-				PSCHED_GET_TIME(q->qidlestart);
+		struct gred_sched_data *q;
+		u16 dp = tc_index_to_dp(skb);
+
+		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+			if (net_ratelimit())
+				printk(KERN_WARNING "GRED: Unable to relocate "
+				       "VQ
author	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-07 08:05:11 -0800
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-07 08:05:11 -0800
commit	8e33ba49765484bc6de3a2f8143733713fa93bc1 (patch)
tree	2ea080e478e4ee86a893b75db2d5c81ce14cbf10 /net
parent	8cde0776ec1e86c270f65bf482f96288e6bf0023 (diff)
parent	2d43f1128a4282fbe8442f40b4cbbac05d8f10aa (diff)