diff options
Diffstat (limited to 'net/ipv4/ip_fragment.c')
| -rw-r--r-- | net/ipv4/ip_fragment.c | 252 |
1 files changed, 154 insertions, 98 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a1151b8adf3..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -20,6 +20,8 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#define pr_fmt(fmt) "IPv4: " fmt + #include <linux/compiler.h> #include <linux/module.h> #include <linux/types.h> @@ -77,20 +79,9 @@ struct ipq { struct inet_peer *peer; }; -#define IPFRAG_ECN_CLEAR 0x01 /* one frag had INET_ECN_NOT_ECT */ -#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */ - static inline u8 ip4_frag_ecn(u8 tos) { - tos = (tos & INET_ECN_MASK) + 1; - /* - * After the last operation we have (in binary): - * INET_ECN_NOT_ECT => 001 - * INET_ECN_ECT_1 => 010 - * INET_ECN_ECT_0 => 011 - * INET_ECN_CE => 100 - */ - return (tos & 2) ? 0 : tos; + return 1 << (tos & INET_ECN_MASK); } static struct inet_frags ip4_frags; @@ -102,7 +93,7 @@ int ip_frag_nqueues(struct net *net) int ip_frag_mem(struct net *net) { - return atomic_read(&net->ipv4.frags.mem); + return sum_frag_mem_limit(&net->ipv4.frags); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -115,6 +106,7 @@ struct ip4_create_arg { static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) { + net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); return jhash_3words((__force u32)id << 16 | prot, (__force u32)saddr, (__force u32)daddr, ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); @@ -128,29 +120,26 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q) return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static int ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(struct inet_frag_queue *q, void *a) { struct ipq *qp; struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user; -} - -/* Memory Tracking Functions. */ -static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) -{ - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); + qp->saddr == arg->iph->saddr && + qp->daddr == arg->iph->daddr && + qp->protocol == arg->iph->protocol && + qp->user == arg->user; } static void ip4_frag_init(struct inet_frag_queue *q, void *a) { struct ipq *qp = container_of(q, struct ipq, q); + struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, + frags); + struct net *net = container_of(ipv4, struct net, ipv4); + struct ip4_create_arg *arg = a; qp->protocol = arg->iph->protocol; @@ -160,7 +149,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) qp->daddr = arg->iph->daddr; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } static __inline__ void ip4_frag_free(struct inet_frag_queue *q) @@ -195,7 +184,7 @@ static void ip_evictor(struct net *net) { int evicted; - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); if (evicted) IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); } @@ -223,31 +212,31 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; + const struct iphdr *iph; + int err; rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out_rcu_unlock; + /* skb has no dst, perform route lookup again */ + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (err) + goto out_rcu_unlock; + /* - * Only search router table for the head fragment, - * when defraging timeout at PRE_ROUTING HOOK. + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { - const struct iphdr *iph = ip_hdr(head); - int err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); - if (unlikely(err)) - goto out_rcu_unlock; - - /* - * Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (skb_rtable(head)->rt_type != RTN_LOCAL) - goto out_rcu_unlock; + if (qp->user == IP_DEFRAG_AF_PACKET || + ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && + (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && + (skb_rtable(head)->rt_type != RTN_LOCAL))) + goto out_rcu_unlock; - } /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); @@ -275,14 +264,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (q == NULL) - goto out_nomem; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct ipq, q); - -out_nomem: - LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); - return NULL; } /* Is the fragment too far ahead to be part of ipq? */ @@ -316,6 +302,7 @@ static inline int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; + unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); @@ -325,9 +312,12 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp); + + sum_truesize += fp->truesize; + kfree_skb(fp); fp = xp; } while (fp); + sub_frag_mem_limit(&qp->q, sum_truesize); qp->q.last_in = 0; qp->q.len = 0; @@ -374,7 +364,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* Is this the final fragment? */ if ((flags & IP_MF) == 0) { /* If we already have some bits beyond end - * or have different end, the segment is corrrupted. + * or have different end, the segment is corrupted. */ if (end < qp->q.len || ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) @@ -472,7 +462,8 @@ found: qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it); + sub_frag_mem_limit(&qp->q, free_it->truesize); + kfree_skb(free_it); } } @@ -495,17 +486,26 @@ found: qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; - atomic_add(skb->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, skb->truesize); if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; + if (ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len + ihl > qp->q.max_size) + qp->q.max_size = skb->len + ihl; + if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - qp->q.meat == qp->q.len) - return ip_frag_reasm(qp, prev, dev); + qp->q.meat == qp->q.len) { + unsigned long orefdst = skb->_skb_refdst; - write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); - write_unlock(&ip4_frags.lock); + skb->_skb_refdst = 0UL; + err = ip_frag_reasm(qp, prev, dev); + skb->_skb_refdst = orefdst; + return err; + } + + skb_dst_drop(skb); + inet_frag_lru_move(&qp->q); return -EINPROGRESS; err: @@ -525,9 +525,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; + int sum_truesize; + u8 ecn; ipq_kill(qp); + ecn = ip_frag_ecn_table[qp->ecn]; + if (unlikely(ecn == 0xff)) { + err = -EINVAL; + goto out_fail; + } /* Make the one we just received the head. */ if (prev) { head = prev->next; @@ -543,7 +550,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, skb_morph(head, qp->q.fragments); head->next = qp->q.fragments->next; - kfree_skb(qp->q.fragments); + consume_skb(qp->q.fragments); qp->q.fragments = head; } @@ -559,7 +566,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split @@ -575,62 +582,65 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); - for (i=0; i<skb_shinfo(head)->nr_frags; i++) - plen += skb_shinfo(head)->frags[i].size; + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &qp->q.net->mem); + add_frag_mem_limit(&qp->q, clone->truesize); } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; + sum_truesize = head->truesize; + for (fp = head->next; fp;) { + bool headstolen; + int delta; + struct sk_buff *next = fp->next; + + sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; + + if (skb_try_coalesce(head, fp, &headstolen, &delta)) { + kfree_skb_partial(fp, headstolen); + } else { + if (!skb_shinfo(head)->frag_list) + skb_shinfo(head)->frag_list = fp; + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + } + fp = next; } - atomic_sub(head->truesize, &qp->q.net->mem); + sub_frag_mem_limit(&qp->q, sum_truesize); head->next = NULL; head->dev = dev; head->tstamp = qp->q.stamp; + IPCB(head)->frag_max_size = qp->q.max_size; iph = ip_hdr(head); - iph->frag_off = 0; + /* max_size != 0 implies at least one fragment had IP_DF set */ + iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0; iph->tot_len = htons(len); - /* RFC3168 5.3 Fragmentation support - * If one fragment had INET_ECN_NOT_ECT, - * reassembled frame also has INET_ECN_NOT_ECT - * Elif one fragment had INET_ECN_CE - * reassembled frame also has INET_ECN_CE - */ - if (qp->ecn & IPFRAG_ECN_CLEAR) - iph->tos &= ~INET_ECN_MASK; - else if (qp->ecn & IPFRAG_ECN_SET_CE) - iph->tos |= INET_ECN_CE; - + iph->tos |= ecn; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; return 0; out_nomem: - LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " - "queue %p\n", qp); + LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), + qp); err = -ENOMEM; goto out_fail; out_oversize: - if (net_ratelimit()) - printk(KERN_INFO "Oversized IP packet from %pI4.\n", - &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); out_fail: IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); return err; @@ -646,8 +656,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. */ - if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) - ip_evictor(net); + ip_evictor(net); /* Lookup (or create) queue header */ if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { @@ -668,6 +677,41 @@ int ip_defrag(struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_defrag); +struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +{ + struct iphdr iph; + u32 len; + + if (skb->protocol != htons(ETH_P_IP)) + return skb; + + if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) + return skb; + + if (iph.ihl < 5 || iph.version != 4) + return skb; + + len = ntohs(iph.tot_len); + if (skb->len < len || len < (iph.ihl * 4)) + return skb; + + if (ip_is_fragment(&iph)) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb) { + if (!pskb_may_pull(skb, iph.ihl*4)) + return skb; + if (pskb_trim_rcsum(skb, len)) + return skb; + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + if (ip_defrag(skb, user)) + return NULL; + skb_clear_hash(skb); + } + } + return skb; +} +EXPORT_SYMBOL(ip_check_defrag); + #ifdef CONFIG_SYSCTL static int zero; @@ -729,9 +773,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[0].data = &net->ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[2].data = &net->ipv4.frags.timeout; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; } - hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + hdr = register_net_sysctl(net, "net/ipv4", table); if (hdr == NULL) goto err_reg; @@ -756,7 +804,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) static void ip4_frags_ctl_register(void) { - register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); + register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else static inline int ip4_frags_ns_ctl_register(struct net *net) @@ -775,14 +823,22 @@ static inline void ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. + /* Fragment cache limits. + * + * The fragment memory accounting code, (tries to) account for + * the real memory usage, by measuring both the size of frag + * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) + * and the SKB's truesize. + * + * A 64K fragment consumes 129736 bytes (44*2944)+200 + * (1500 truesize == 2944, sizeof(struct ipq) == 200) + * + * We will commit 4MB at one time. Should we cross that limit + * we will prune down to 3MB, making room for approx 8 big 64K + * fragments 8x128k. */ - net->ipv4.frags.high_thresh = 256 * 1024; - net->ipv4.frags.low_thresh = 192 * 1024; + net->ipv4.frags.high_thresh = 4 * 1024 * 1024; + net->ipv4.frags.low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival |
