diff options
Diffstat (limited to 'net/ipv4/ip_fragment.c')
| -rw-r--r-- | net/ipv4/ip_fragment.c | 271 |
1 files changed, 197 insertions, 74 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 86964b353c3..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -20,6 +20,8 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#define pr_fmt(fmt) "IPv4: " fmt + #include <linux/compiler.h> #include <linux/module.h> #include <linux/types.h> @@ -32,6 +34,9 @@ #include <linux/netdevice.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/slab.h> +#include <net/route.h> +#include <net/dst.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -42,6 +47,7 @@ #include <linux/udp.h> #include <linux/inet.h> #include <linux/netfilter_ipv4.h> +#include <net/inet_ecn.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -67,11 +73,17 @@ struct ipq { __be32 daddr; __be16 id; u8 protocol; + u8 ecn; /* RFC3168 support */ int iif; unsigned int rid; struct inet_peer *peer; }; +static inline u8 ip4_frag_ecn(u8 tos) +{ + return 1 << (tos & INET_ECN_MASK); +} + static struct inet_frags ip4_frags; int ip_frag_nqueues(struct net *net) @@ -81,7 +93,7 @@ int ip_frag_nqueues(struct net *net) int ip_frag_mem(struct net *net) { - return atomic_read(&net->ipv4.frags.mem); + return sum_frag_mem_limit(&net->ipv4.frags); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -94,6 +106,7 @@ struct ip4_create_arg { static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) { + net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); @@ -107,41 +120,36 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q) return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static int ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(struct inet_frag_queue *q, void *a) { struct ipq *qp; struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); - return (qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user); -} - -/* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct netns_frags *nf, - struct sk_buff *skb, int *work) -{ - if (work) - *work -= skb->truesize; - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); + return qp->id == arg->iph->id && + qp->saddr == arg->iph->saddr && + qp->daddr == arg->iph->daddr && + qp->protocol == arg->iph->protocol && + qp->user == arg->user; } static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); + struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, + frags); + struct net *net = container_of(ipv4, struct net, ipv4); + struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; qp->id = arg->iph->id; + qp->ecn = ip4_frag_ecn(arg->iph->tos); qp->saddr = arg->iph->saddr; qp->daddr = arg->iph->daddr; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer(arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } static __inline__ void ip4_frag_free(struct inet_frag_queue *q) @@ -176,7 +184,7 @@ static void ip_evictor(struct net *net) { int evicted; - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); if (evicted) IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); } @@ -204,12 +212,35 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; + const struct iphdr *iph; + int err; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); - if (head->dev) - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + if (!head->dev) + goto out_rcu_unlock; + + /* skb has no dst, perform route lookup again */ + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (err) + goto out_rcu_unlock; + + /* + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (qp->user == IP_DEFRAG_AF_PACKET || + ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && + (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && + (skb_rtable(head)->rt_type != RTN_LOCAL))) + goto out_rcu_unlock; + + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); +out_rcu_unlock: rcu_read_unlock(); } out: @@ -233,14 +264,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (q == NULL) - goto out_nomem; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct ipq, q); - -out_nomem: - LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); - return NULL; } /* Is the fragment too far ahead to be part of ipq? */ @@ -274,6 +302,7 @@ static inline int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; + unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); @@ -283,15 +312,20 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp, NULL); + + sum_truesize += fp->truesize; + kfree_skb(fp); fp = xp; } while (fp); + sub_frag_mem_limit(&qp->q, sum_truesize); qp->q.last_in = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; + qp->q.fragments_tail = NULL; qp->iif = 0; + qp->ecn = 0; return 0; } @@ -304,6 +338,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) int flags, offset; int ihl, end; int err = -ENOENT; + u8 ecn; if (qp->q.last_in & INET_FRAG_COMPLETE) goto err; @@ -315,6 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) goto err; } + ecn = ip4_frag_ecn(ip_hdr(skb)->tos); offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; @@ -328,7 +364,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* Is this the final fragment? */ if ((flags & IP_MF) == 0) { /* If we already have some bits beyond end - * or have different end, the segment is corrrupted. + * or have different end, the segment is corrupted. */ if (end < qp->q.len || ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) @@ -363,6 +399,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * in the chain of fragments so far. We must know where to put * this fragment, right? */ + prev = qp->q.fragments_tail; + if (!prev || FRAG_CB(prev)->offset < offset) { + next = NULL; + goto found; + } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { if (FRAG_CB(next)->offset >= offset) @@ -370,6 +411,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) prev = next; } +found: /* We found where to put this one. Check for overlap with * preceding fragment, and, if needed, align things so that * any overlaps are eliminated. @@ -420,7 +462,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it, NULL); + sub_frag_mem_limit(&qp->q, free_it->truesize); + kfree_skb(free_it); } } @@ -428,6 +471,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* Insert this fragment in the chain of fragments. */ skb->next = next; + if (!next) + qp->q.fragments_tail = skb; if (prev) prev->next = skb; else @@ -440,17 +485,27 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; - atomic_add(skb->truesize, &qp->q.net->mem); + qp->ecn |= ecn; + add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len + ihl > qp->q.max_size) + qp->q.max_size = skb->len + ihl; + if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - qp->q.meat == qp->q.len) - return ip_frag_reasm(qp, prev, dev); + qp->q.meat == qp->q.len) { + unsigned long orefdst = skb->_skb_refdst; - write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); - write_unlock(&ip4_frags.lock); + skb->_skb_refdst = 0UL; + err = ip_frag_reasm(qp, prev, dev); + skb->_skb_refdst = orefdst; + return err; + } + + skb_dst_drop(skb); + inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -470,9 +525,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; + int sum_truesize; + u8 ecn; ipq_kill(qp); + ecn = ip_frag_ecn_table[qp->ecn]; + if (unlikely(ecn == 0xff)) { + err = -EINVAL; + goto out_fail; + } /* Make the one we just received the head. */ if (prev) { head = prev->next; @@ -481,12 +543,14 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_nomem; fp->next = head->next; + if (!fp->next) + qp->q.fragments_tail = fp; prev->next = fp; skb_morph(head, qp->q.fragments); head->next = qp->q.fragments->next; - kfree_skb(qp->q.fragments); + consume_skb(qp->q.fragments); qp->q.fragments = head; } @@ -502,13 +566,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; @@ -518,51 +582,65 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); - for (i=0; i<skb_shinfo(head)->nr_frags; i++) - plen += skb_shinfo(head)->frags[i].size; + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, clone->truesize); } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &qp->q.net->mem); - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; + sum_truesize = head->truesize; + for (fp = head->next; fp;) { + bool headstolen; + int delta; + struct sk_buff *next = fp->next; + + sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - atomic_sub(fp->truesize, &qp->q.net->mem); + + if (skb_try_coalesce(head, fp, &headstolen, &delta)) { + kfree_skb_partial(fp, headstolen); + } else { + if (!skb_shinfo(head)->frag_list) + skb_shinfo(head)->frag_list = fp; + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + } + fp = next; } + sub_frag_mem_limit(&qp->q, sum_truesize); head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; + IPCB(head)->frag_max_size = qp->q.max_size; iph = ip_hdr(head); - iph->frag_off = 0; + /* max_size != 0 implies at least one fragment had IP_DF set */ + iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); + iph->tos |= ecn; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; + qp->q.fragments_tail = NULL; return 0; out_nomem: - LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " - "queue %p\n", qp); + LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), + qp); err = -ENOMEM; goto out_fail; out_oversize: - if (net_ratelimit()) - printk(KERN_INFO "Oversized IP packet from %pI4.\n", - &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); out_fail: IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); return err; @@ -578,8 +656,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. */ - if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) - ip_evictor(net); + ip_evictor(net); /* Lookup (or create) queue header */ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { @@ -598,6 +675,42 @@ int ip_defrag(struct sk_buff *skb, u32 user) kfree_skb(skb); return -ENOMEM; } +EXPORT_SYMBOL(ip_defrag); + +struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +{ + struct iphdr iph; + u32 len; + + if (skb->protocol != htons(ETH_P_IP)) + return skb; + + if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) + return skb; + + if (iph.ihl < 5 || iph.version != 4) + return skb; + + len = ntohs(iph.tot_len); + if (skb->len < len || len < (iph.ihl * 4)) + return skb; + + if (ip_is_fragment(&iph)) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb) { + if (!pskb_may_pull(skb, iph.ihl*4)) + return skb; + if (pskb_trim_rcsum(skb, len)) + return skb; + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + if (ip_defrag(skb, user)) + return NULL; + skb_clear_hash(skb); + } + } + return skb; +} +EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL static int zero; @@ -646,7 +759,7 @@ static struct ctl_table ip4_frags_ctl_table[] = { { } }; -static int ip4_frags_ns_ctl_register(struct net *net) +static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -660,9 +773,13 @@ static int ip4_frags_ns_ctl_register(struct net *net) table[0].data = &net->ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[2].data = &net->ipv4.frags.timeout; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; } - hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + hdr = register_net_sysctl(net, "net/ipv4", table); if (hdr == NULL) goto err_reg; @@ -676,7 +793,7 @@ err_alloc: return -ENOMEM; } -static void ip4_frags_ns_ctl_unregister(struct net *net) +static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; @@ -687,7 +804,7 @@ static void ip4_frags_ns_ctl_unregister(struct net *net) static void ip4_frags_ctl_register(void) { - register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); + register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else static inline int ip4_frags_ns_ctl_register(struct net *net) @@ -704,16 +821,24 @@ static inline void ip4_frags_ctl_register(void) } #endif -static int ipv4_frags_init_net(struct net *net) +static int __net_init ipv4_frags_init_net(struct net *net) { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. + /* Fragment cache limits. + * + * The fragment memory accounting code, (tries to) account for + * the real memory usage, by measuring both the size of frag + * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) + * and the SKB's truesize. + * + * A 64K fragment consumes 129736 bytes (44*2944)+200 + * (1500 truesize == 2944, sizeof(struct ipq) == 200) + * + * We will commit 4MB at one time. Should we cross that limit + * we will prune down to 3MB, making room for approx 8 big 64K + * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 256 * 1024; - net->ipv4.frags.low_thresh = 192 * 1024; + net->ipv4.frags.high_thresh = 4 * 1024 * 1024; + net->ipv4.frags.low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival @@ -726,7 +851,7 @@ static int ipv4_frags_init_net(struct net *net) return ip4_frags_ns_ctl_register(net); } -static void ipv4_frags_exit_net(struct net *net) +static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); @@ -751,5 +876,3 @@ void __init ipfrag_init(void) ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); } - -EXPORT_SYMBOL(ip_defrag); |
