Diffstat (limited to 'net/ipv4/ip_fragment.c')
-rw-r--r--	net/ipv4/ip_fragment.c	240
1 file changed, 165 insertions, 75 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e6215bdd96c..ed32313e307 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -20,6 +20,8 @@
  *		Patrick McHardy :	LRU queue of frag heads for evictor.
  */
 
+#define pr_fmt(fmt) "IPv4: " fmt
+
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -45,6 +47,7 @@
 #include <linux/udp.h>
 #include <linux/inet.h>
 #include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
 
 /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
  * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -70,11 +73,17 @@ struct ipq {
 	__be32		daddr;
 	__be16		id;
 	u8		protocol;
+	u8		ecn; /* RFC3168 support */
 	int             iif;
 	unsigned int    rid;
 	struct inet_peer *peer;
 };
 
+static inline u8 ip4_frag_ecn(u8 tos)
+{
+	return 1 << (tos & INET_ECN_MASK);
+}
+
 static struct inet_frags ip4_frags;
 
 int ip_frag_nqueues(struct net *net)
@@ -84,7 +93,7 @@ int ip_frag_nqueues(struct net *net)
 
 int ip_frag_mem(struct net *net)
 {
-	return atomic_read(&net->ipv4.frags.mem);
+	return sum_frag_mem_limit(&net->ipv4.frags);
 }
 
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
@@ -97,6 +106,7 @@ struct ip4_create_arg {
 
 static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 {
+	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
 	return jhash_3words((__force u32)id << 16 | prot,
 			    (__force u32)saddr, (__force u32)daddr,
 			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
@@ -110,38 +120,36 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
 	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
 }
 
-static int ip4_frag_match(struct inet_frag_queue *q, void *a)
+static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
 {
 	struct ipq *qp;
 	struct ip4_create_arg *arg = a;
 
 	qp = container_of(q, struct ipq, q);
 	return	qp->id == arg->iph->id &&
-			qp->saddr == arg->iph->saddr &&
-			qp->daddr == arg->iph->daddr &&
-			qp->protocol == arg->iph->protocol &&
-			qp->user == arg->user;
-}
-
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
-{
-	atomic_sub(skb->truesize, &nf->mem);
-	kfree_skb(skb);
+		qp->saddr == arg->iph->saddr &&
+		qp->daddr == arg->iph->daddr &&
+		qp->protocol == arg->iph->protocol &&
+		qp->user == arg->user;
 }
 
 static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
+	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+					       frags);
+	struct net *net = container_of(ipv4, struct net, ipv4);
+
 	struct ip4_create_arg *arg = a;
 
 	qp->protocol = arg->iph->protocol;
 	qp->id = arg->iph->id;
+	qp->ecn = ip4_frag_ecn(arg->iph->tos);
 	qp->saddr = arg->iph->saddr;
 	qp->daddr = arg->iph->daddr;
 	qp->user = arg->user;
 	qp->peer = sysctl_ipfrag_max_dist ?
-		inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
+		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
 }
 
 static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -176,7 +184,7 @@ static void ip_evictor(struct net *net)
 {
 	int evicted;
 
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
+	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
 	if (evicted)
 		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
 }
@@ -204,31 +212,31 @@ static void ip_expire(unsigned long arg)
 
 	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
 		struct sk_buff *head = qp->q.fragments;
+		const struct iphdr *iph;
+		int err;
 
 		rcu_read_lock();
 		head->dev = dev_get_by_index_rcu(net, qp->iif);
 		if (!head->dev)
 			goto out_rcu_unlock;
 
+		/* skb has no dst, perform route lookup again */
+		iph = ip_hdr(head);
+		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+					   iph->tos, head->dev);
+		if (err)
+			goto out_rcu_unlock;
+
 		/*
-		 * Only search router table for the head fragment,
-		 * when defraging timeout at PRE_ROUTING HOOK.
+		 * Only an end host needs to send an ICMP
+		 * "Fragment Reassembly Timeout" message, per RFC792.
 		 */
-		if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
-			const struct iphdr *iph = ip_hdr(head);
-			int err = ip_route_input(head, iph->daddr, iph->saddr,
-						 iph->tos, head->dev);
-			if (unlikely(err))
-				goto out_rcu_unlock;
-
-			/*
-			 * Only an end host needs to send an ICMP
-			 * "Fragment Reassembly Timeout" message, per RFC792.
-			 */
-			if (skb_rtable(head)->rt_type != RTN_LOCAL)
-				goto out_rcu_unlock;
+		if (qp->user == IP_DEFRAG_AF_PACKET ||
+		    ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
+		     (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
+		     (skb_rtable(head)->rt_type != RTN_LOCAL)))
+			goto out_rcu_unlock;
 
-		}
 
 		/* Send an ICMP "Fragment Reassembly Timeout" message. */
 		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
@@ -256,14 +264,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
 	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 
 	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
-	if (q == NULL)
-		goto out_nomem;
-
+	if (IS_ERR_OR_NULL(q)) {
+		inet_frag_maybe_warn_overflow(q, pr_fmt());
+		return NULL;
+	}
 	return container_of(q, struct ipq, q);
-
-out_nomem:
-	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
-	return NULL;
 }
 
 /* Is the fragment too far ahead to be part of ipq? */
@@ -297,6 +302,7 @@ static inline int ip_frag_too_far(struct ipq *qp)
 static int ip_frag_reinit(struct ipq *qp)
 {
 	struct sk_buff *fp;
+	unsigned int sum_truesize = 0;
 
 	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
 		atomic_inc(&qp->q.refcnt);
@@ -306,9 +312,12 @@ static int ip_frag_reinit(struct ipq *qp)
 	fp = qp->q.fragments;
 	do {
 		struct sk_buff *xp = fp->next;
-		frag_kfree_skb(qp->q.net, fp);
+
+		sum_truesize += fp->truesize;
+		kfree_skb(fp);
 		fp = xp;
 	} while (fp);
+	sub_frag_mem_limit(&qp->q, sum_truesize);
 
 	qp->q.last_in = 0;
 	qp->q.len = 0;
@@ -316,6 +325,7 @@ static int ip_frag_reinit(struct ipq *qp)
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;
 	qp->iif = 0;
+	qp->ecn = 0;
 
 	return 0;
 }
@@ -328,6 +338,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	int flags, offset;
 	int ihl, end;
 	int err = -ENOENT;
+	u8 ecn;
 
 	if (qp->q.last_in & INET_FRAG_COMPLETE)
 		goto err;
@@ -339,6 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		goto err;
 	}
 
+	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
 	offset = ntohs(ip_hdr(skb)->frag_off);
 	flags = offset & ~IP_OFFSET;
 	offset &= IP_OFFSET;
@@ -352,7 +364,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	/* Is this the final fragment? */
 	if ((flags & IP_MF) == 0) {
 		/* If we already have some bits beyond end
-		 * or have different end, the segment is corrrupted.
+		 * or have different end, the segment is corrupted.
 		 */
 		if (end < qp->q.len ||
 		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
@@ -450,7 +462,8 @@ found:
 				qp->q.fragments = next;
 
 			qp->q.meat -= free_it->len;
-			frag_kfree_skb(qp->q.net, free_it);
+			sub_frag_mem_limit(&qp->q, free_it->truesize);
+			kfree_skb(free_it);
 		}
 	}
 
@@ -472,17 +485,27 @@ found:
 	}
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
-	atomic_add(skb->truesize, &qp->q.net->mem);
+	qp->ecn |= ecn;
+	add_frag_mem_limit(&qp->q, skb->truesize);
 	if (offset == 0)
 		qp->q.last_in |= INET_FRAG_FIRST_IN;
 
+	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+	    skb->len + ihl > qp->q.max_size)
+		qp->q.max_size = skb->len + ihl;
+
 	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
-	    qp->q.meat == qp->q.len)
-		return ip_frag_reasm(qp, prev, dev);
+	    qp->q.meat == qp->q.len) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		skb->_skb_refdst = 0UL;
+		err = ip_frag_reasm(qp, prev, dev);
+		skb->_skb_refdst = orefdst;
+		return err;
+	}
 
-	write_lock(&ip4_frags.lock);
-	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
-	write_unlock(&ip4_frags.lock);
+	skb_dst_drop(skb);
+	inet_frag_lru_move(&qp->q);
 	return -EINPROGRESS;
 
 err:
@@ -502,9 +525,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	int len;
 	int ihlen;
 	int err;
+	int sum_truesize;
+	u8 ecn;
 
 	ipq_kill(qp);
 
+	ecn = ip_frag_ecn_table[qp->ecn];
+	if (unlikely(ecn == 0xff)) {
+		err = -EINVAL;
+		goto out_fail;
+	}
 	/* Make the one we just received the head. */
 	if (prev) {
 		head = prev->next;
@@ -520,7 +550,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 
 		skb_morph(head, qp->q.fragments);
 		head->next = qp->q.fragments->next;
 
-		kfree_skb(qp->q.fragments);
+		consume_skb(qp->q.fragments);
 		qp->q.fragments = head;
 	}
 
@@ -536,7 +566,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		goto out_oversize;
 
 	/* Head of list must not be cloned. */
-	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+	if (skb_unclone(head, GFP_ATOMIC))
 		goto out_nomem;
 
 	/* If the first fragment is fragmented itself, we split
@@ -552,51 +582,65 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		head->next = clone;
 		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
 		skb_frag_list_init(head);
-		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
-			plen += skb_shinfo(head)->frags[i].size;
+		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 		clone->len = clone->data_len = head->data_len - plen;
 		head->data_len -= clone->len;
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		atomic_add(clone->truesize, &qp->q.net->mem);
+		add_frag_mem_limit(&qp->q, clone->truesize);
 	}
 
-	skb_shinfo(head)->frag_list = head->next;
 	skb_push(head, head->data - skb_network_header(head));
 
-	for (fp=head->next; fp; fp = fp->next) {
-		head->data_len += fp->len;
-		head->len += fp->len;
+	sum_truesize = head->truesize;
+	for (fp = head->next; fp;) {
+		bool headstolen;
+		int delta;
+		struct sk_buff *next = fp->next;
+
+		sum_truesize += fp->truesize;
 		if (head->ip_summed != fp->ip_summed)
 			head->ip_summed = CHECKSUM_NONE;
 		else if (head->ip_summed == CHECKSUM_COMPLETE)
 			head->csum = csum_add(head->csum, fp->csum);
-		head->truesize += fp->truesize;
+
+		if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+			kfree_skb_partial(fp, headstolen);
+		} else {
+			if (!skb_shinfo(head)->frag_list)
+				skb_shinfo(head)->frag_list = fp;
+			head->data_len += fp->len;
+			head->len += fp->len;
+			head->truesize += fp->truesize;
+		}
+		fp = next;
 	}
-	atomic_sub(head->truesize, &qp->q.net->mem);
+	sub_frag_mem_limit(&qp->q, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
+	IPCB(head)->frag_max_size = qp->q.max_size;
 
 	iph = ip_hdr(head);
-	iph->frag_off = 0;
+	/* max_size != 0 implies at least one fragment had IP_DF set */
+	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
+	iph->tos |= ecn;
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;
 	return 0;
 
 out_nomem:
-	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
-			      "queue %p\n", qp);
+	LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
+		       qp);
 	err = -ENOMEM;
 	goto out_fail;
 out_oversize:
-	if (net_ratelimit())
-		printk(KERN_INFO "Oversized IP packet from %pI4.\n",
-			&qp->saddr);
+	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
 out_fail:
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 	return err;
@@ -612,8 +656,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 
 	/* Start by cleaning up the memory. */
-	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
-		ip_evictor(net);
+	ip_evictor(net);
 
 	/* Lookup (or create) queue header */
 	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
@@ -634,6 +677,41 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 }
 EXPORT_SYMBOL(ip_defrag);
 
+struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+{
+	struct iphdr iph;
+	u32 len;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return skb;
+
+	if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0)
+		return skb;
+
+	if (iph.ihl < 5 || iph.version != 4)
+		return skb;
+
+	len = ntohs(iph.tot_len);
+	if (skb->len < len || len < (iph.ihl * 4))
+		return skb;
+
+	if (ip_is_fragment(&iph)) {
+		skb = skb_share_check(skb, GFP_ATOMIC);
+		if (skb) {
+			if (!pskb_may_pull(skb, iph.ihl*4))
+				return skb;
+			if (pskb_trim_rcsum(skb, len))
+				return skb;
+			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+			if (ip_defrag(skb, user))
+				return NULL;
+			skb_clear_hash(skb);
+		}
+	}
+	return skb;
+}
+EXPORT_SYMBOL(ip_check_defrag);
+
 #ifdef CONFIG_SYSCTL
 
 static int zero;
@@ -695,9 +773,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 		table[0].data = &net->ipv4.frags.high_thresh;
 		table[1].data = &net->ipv4.frags.low_thresh;
 		table[2].data = &net->ipv4.frags.timeout;
+
+		/* Don't export sysctls to unprivileged users */
+		if (net->user_ns != &init_user_ns)
+			table[0].procname = NULL;
 	}
 
-	hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
+	hdr = register_net_sysctl(net, "net/ipv4", table);
 	if (hdr == NULL)
 		goto err_reg;
 
@@ -722,7 +804,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
 
 static void ip4_frags_ctl_register(void)
 {
-	register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
+	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
 }
 #else
 static inline int ip4_frags_ns_ctl_register(struct net *net)
@@ -741,14 +823,22 @@ static inline void ip4_frags_ctl_register(void)
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
-	/*
-	 * Fragment cache limits. We will commit 256K at one time. Should we
-	 * cross that limit we will prune down to 192K. This should cope with
-	 * even the most extreme cases without allowing an attacker to
-	 * measurably harm machine performance.
+	/* Fragment cache limits.
+	 *
+	 * The fragment memory accounting code, (tries to) account for
+	 * the real memory usage, by measuring both the size of frag
+	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+	 * and the SKB's truesize.
+	 *
+	 * A 64K fragment consumes 129736 bytes (44*2944)+200
+	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+	 *
+	 * We will commit 4MB at one time. Should we cross that limit
+	 * we will prune down to 3MB, making room for approx 8 big 64K
+	 * fragments 8x128k.
 	 */
-	net->ipv4.frags.high_thresh = 256 * 1024;
-	net->ipv4.frags.low_thresh = 192 * 1024;
+	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
+	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;
 	/*
 	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
 	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
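
The ECN hunks above are easiest to follow side by side: ip4_frag_ecn() turns the two RFC 3168 ECN bits of each fragment's TOS into a one-hot flag (Not-ECT, ECT(1), ECT(0) and CE map to 1, 2, 4 and 8), ip_frag_queue() ORs those flags into qp->ecn, and ip_frag_reasm() indexes ip_frag_ecn_table[] with the accumulated set, dropping the datagram when the table yields 0xff. The table itself is defined outside the hunks shown, so the standalone C sketch below uses a simplified table of its own that follows the same RFC 3168 rules (any CE mark propagates to the reassembled header; mixing Not-ECT fragments with ECN-capable ones is invalid), rather than copying the kernel's exact definition:

#include <stdio.h>
#include <stdint.h>

#define INET_ECN_MASK	3	/* low two TOS bits: 0=Not-ECT, 1=ECT(1), 2=ECT(0), 3=CE */
#define INET_ECN_CE	3

/* Same mapping as the patch's ip4_frag_ecn(): one-hot flag per codepoint. */
static uint8_t ip4_frag_ecn(uint8_t tos)
{
	return 1 << (tos & INET_ECN_MASK);	/* 1, 2, 4 or 8 */
}

/* Illustrative verdict table indexed by the OR of all fragments' flags:
 * 0xff = drop, otherwise a value to OR into the reassembled iph->tos.
 * (The kernel's ip_frag_ecn_table encodes the same RFC 3168 rules.)
 */
static const uint8_t ecn_table[16] = {
	[1] = 0,			/* all fragments Not-ECT */
	[2] = 0, [4] = 0, [4 | 2] = 0,	/* only ECT(0)/ECT(1) seen */
	[8] = 0,			/* all CE: head already carries CE */
	[8 | 2] = INET_ECN_CE,		/* some CE, some ECT: propagate CE */
	[8 | 4] = INET_ECN_CE,
	[8 | 4 | 2] = INET_ECN_CE,
	/* Not-ECT mixed with any ECN-capable fragment: invalid, drop */
	[1 | 2] = 0xff, [1 | 4] = 0xff, [1 | 8] = 0xff,
	[1 | 4 | 2] = 0xff, [1 | 8 | 2] = 0xff, [1 | 8 | 4] = 0xff,
	[1 | 8 | 4 | 2] = 0xff,
};

int main(void)
{
	uint8_t acc = 0;

	acc |= ip4_frag_ecn(0x02);	/* fragment 1: ECT(0) */
	acc |= ip4_frag_ecn(0x03);	/* fragment 2: CE-marked in transit */

	if (ecn_table[acc] == 0xff)
		puts("invalid ECN mix: drop the reassembled datagram");
	else
		printf("OR 0x%02x into the reassembled TOS\n", ecn_table[acc]);
	return 0;
}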
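The newly exported ip_check_defrag() serves callers that see packets outside the normal receive path; the matching IP_DEFRAG_AF_PACKET case added to ip_expire() hints at its intended user, the AF_PACKET fanout code. A hedged sketch of the calling convention follows; only ip_check_defrag() and IP_DEFRAG_AF_PACKET come from the diff, the surrounding function is hypothetical and not the actual fanout code:

#include <linux/skbuff.h>
#include <net/ip.h>

/* Hypothetical wrapper illustrating the three possible outcomes. */
static struct sk_buff *demux_maybe_defrag(struct sk_buff *skb)
{
	skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
	if (!skb)
		return NULL;	/* fragment queued; ip_defrag() owns it now */

	/* Either the skb never was an IPv4 fragment, or the final
	 * fragment just completed reassembly and skb is the whole
	 * datagram (with its flow hash cleared, per the diff).
	 */
	return skb;
}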
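Finally, the replacement comment in ipv4_frags_init_net() justifies raising the thresholds from 256K/192K to 4MB/3MB with arithmetic that checks out: 44 fragments of truesize 2944 plus a 200-byte queue struct is 129736 bytes per worst-case 64K datagram, so the 1MB gap between the two thresholds is indeed room for roughly eight of them. A trivial standalone verification, reusing the commit's own estimates for the constants:

#include <stdio.h>

int main(void)
{
	const int frag_truesize = 2944;	/* skb truesize of one 1500-byte fragment */
	const int ipq_size = 200;	/* sizeof(struct ipq), per the comment */
	const int nfrags = 44;		/* ~64K of payload in 1500-byte MTU pieces */
	const int high = 4 * 1024 * 1024, low = 3 * 1024 * 1024;

	int per_datagram = nfrags * frag_truesize + ipq_size;

	printf("one 64K datagram accounts for %d bytes\n", per_datagram);  /* 129736 */
	printf("high threshold fits ~%d datagrams\n", high / per_datagram); /* ~32 */
	printf("pruning to low frees room for ~%d more\n",
	       (high - low) / per_datagram);                                /* ~8 */
	return 0;
}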
