aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4/ip_fragment.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/ip_fragment.c')
-rw-r--r--net/ipv4/ip_fragment.c208
1 files changed, 107 insertions, 101 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1f23a57aa9e..ed32313e307 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -20,6 +20,8 @@
* Patrick McHardy : LRU queue of frag heads for evictor.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -77,40 +79,11 @@ struct ipq {
struct inet_peer *peer;
};
-/* RFC 3168 support :
- * We want to check ECN values of all fragments, do detect invalid combinations.
- * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
- */
-#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */
-#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */
-#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */
-#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */
-
static inline u8 ip4_frag_ecn(u8 tos)
{
return 1 << (tos & INET_ECN_MASK);
}
-/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
- * Value : 0xff if frame should be dropped.
- * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
- */
-static const u8 ip4_frag_ecn_table[16] = {
- /* at least one fragment had CE, and others ECT_0 or ECT_1 */
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
- [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
-
- /* invalid combinations : drop frame */
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
- [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
-};
-
static struct inet_frags ip4_frags;
int ip_frag_nqueues(struct net *net)
@@ -120,7 +93,7 @@ int ip_frag_nqueues(struct net *net)
int ip_frag_mem(struct net *net)
{
- return atomic_read(&net->ipv4.frags.mem);
+ return sum_frag_mem_limit(&net->ipv4.frags);
}
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
@@ -133,6 +106,7 @@ struct ip4_create_arg {
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
+ net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
@@ -146,29 +120,26 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
-static int ip4_frag_match(struct inet_frag_queue *q, void *a)
+static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
{
struct ipq *qp;
struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
- qp->saddr == arg->iph->saddr &&
- qp->daddr == arg->iph->daddr &&
- qp->protocol == arg->iph->protocol &&
- qp->user == arg->user;
-}
-
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
-{
- atomic_sub(skb->truesize, &nf->mem);
- kfree_skb(skb);
+ qp->saddr == arg->iph->saddr &&
+ qp->daddr == arg->iph->daddr &&
+ qp->protocol == arg->iph->protocol &&
+ qp->user == arg->user;
}
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
+ struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+ frags);
+ struct net *net = container_of(ipv4, struct net, ipv4);
+
struct ip4_create_arg *arg = a;
qp->protocol = arg->iph->protocol;
@@ -178,7 +149,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
qp->daddr = arg->iph->daddr;
qp->user = arg->user;
qp->peer = sysctl_ipfrag_max_dist ?
- inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
+ inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
}
static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -213,7 +184,7 @@ static void ip_evictor(struct net *net)
{
int evicted;
- evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
+ evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
if (evicted)
IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
}
@@ -249,8 +220,7 @@ static void ip_expire(unsigned long arg)
if (!head->dev)
goto out_rcu_unlock;
- /* skb dst is stale, drop it, and perform route lookup again */
- skb_dst_drop(head);
+ /* skb has no dst, perform route lookup again */
iph = ip_hdr(head);
err = ip_route_input_noref(head, iph->daddr, iph->saddr,
iph->tos, head->dev);
@@ -262,8 +232,9 @@ static void ip_expire(unsigned long arg)
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (qp->user == IP_DEFRAG_AF_PACKET ||
- (qp->user == IP_DEFRAG_CONNTRACK_IN &&
- skb_rtable(head)->rt_type != RTN_LOCAL))
+ ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
+ (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL)))
goto out_rcu_unlock;
@@ -293,14 +264,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
- if (q == NULL)
- goto out_nomem;
-
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
return container_of(q, struct ipq, q);
-
-out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
- return NULL;
}
/* Is the fragment too far ahead to be part of ipq? */
@@ -334,6 +302,7 @@ static inline int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
atomic_inc(&qp->q.refcnt);
@@ -343,9 +312,12 @@ static int ip_frag_reinit(struct ipq *qp)
fp = qp->q.fragments;
do {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(qp->q.net, fp);
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
fp = xp;
} while (fp);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
qp->q.last_in = 0;
qp->q.len = 0;
@@ -490,7 +462,8 @@ found:
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- frag_kfree_skb(qp->q.net, free_it);
+ sub_frag_mem_limit(&qp->q, free_it->truesize);
+ kfree_skb(free_it);
}
}
@@ -513,17 +486,26 @@ found:
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- atomic_add(skb->truesize, &qp->q.net->mem);
+ add_frag_mem_limit(&qp->q, skb->truesize);
if (offset == 0)
qp->q.last_in |= INET_FRAG_FIRST_IN;
+ if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+ skb->len + ihl > qp->q.max_size)
+ qp->q.max_size = skb->len + ihl;
+
if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
- qp->q.meat == qp->q.len)
- return ip_frag_reasm(qp, prev, dev);
+ qp->q.meat == qp->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
- write_lock(&ip4_frags.lock);
- list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
- write_unlock(&ip4_frags.lock);
+ skb->_skb_refdst = 0UL;
+ err = ip_frag_reasm(qp, prev, dev);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
+
+ skb_dst_drop(skb);
+ inet_frag_lru_move(&qp->q);
return -EINPROGRESS;
err:
@@ -543,11 +525,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
+ int sum_truesize;
u8 ecn;
ipq_kill(qp);
- ecn = ip4_frag_ecn_table[qp->ecn];
+ ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
err = -EINVAL;
goto out_fail;
@@ -567,7 +550,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
skb_morph(head, qp->q.fragments);
head->next = qp->q.fragments->next;
- kfree_skb(qp->q.fragments);
+ consume_skb(qp->q.fragments);
qp->q.fragments = head;
}
@@ -583,7 +566,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_oversize;
/* Head of list must not be cloned. */
- if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem;
/* If the first fragment is fragmented itself, we split
@@ -606,29 +589,44 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- atomic_add(clone->truesize, &qp->q.net->mem);
+ add_frag_mem_limit(&qp->q, clone->truesize);
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
+
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
}
- atomic_sub(head->truesize, &qp->q.net->mem);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
+ IPCB(head)->frag_max_size = qp->q.max_size;
iph = ip_hdr(head);
- iph->frag_off = 0;
+ /* max_size != 0 implies at least one fragment had IP_DF set */
+ iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
iph->tos |= ecn;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -637,14 +635,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
return 0;
out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
- "queue %p\n", qp);
+ LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
+ qp);
err = -ENOMEM;
goto out_fail;
out_oversize:
- if (net_ratelimit())
- printk(KERN_INFO "Oversized IP packet from %pI4.\n",
- &qp->saddr);
+ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
return err;
@@ -660,8 +656,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
/* Start by cleaning up the memory. */
- if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
- ip_evictor(net);
+ ip_evictor(net);
/* Lookup (or create) queue header */
if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
@@ -684,34 +679,33 @@ EXPORT_SYMBOL(ip_defrag);
struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
{
- const struct iphdr *iph;
+ struct iphdr iph;
u32 len;
if (skb->protocol != htons(ETH_P_IP))
return skb;
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ if (!skb_copy_bits(skb, 0, &iph, sizeof(iph)))
return skb;
- iph = ip_hdr(skb);
- if (iph->ihl < 5 || iph->version != 4)
+ if (iph.ihl < 5 || iph.version != 4)
return skb;
- if (!pskb_may_pull(skb, iph->ihl*4))
- return skb;
- iph = ip_hdr(skb);
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < (iph->ihl * 4))
+
+ len = ntohs(iph.tot_len);
+ if (skb->len < len || len < (iph.ihl * 4))
return skb;
- if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_is_fragment(&iph)) {
skb = skb_share_check(skb, GFP_ATOMIC);
if (skb) {
+ if (!pskb_may_pull(skb, iph.ihl*4))
+ return skb;
if (pskb_trim_rcsum(skb, len))
return skb;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
if (ip_defrag(skb, user))
return NULL;
- skb->rxhash = 0;
+ skb_clear_hash(skb);
}
}
return skb;
@@ -779,9 +773,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[0].data = &net->ipv4.frags.high_thresh;
table[1].data = &net->ipv4.frags.low_thresh;
table[2].data = &net->ipv4.frags.timeout;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
}
- hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
+ hdr = register_net_sysctl(net, "net/ipv4", table);
if (hdr == NULL)
goto err_reg;
@@ -806,7 +804,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
static void ip4_frags_ctl_register(void)
{
- register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
+ register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
#else
static inline int ip4_frags_ns_ctl_register(struct net *net)
@@ -825,14 +823,22 @@ static inline void ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
- /*
- * Fragment cache limits. We will commit 256K at one time. Should we
- * cross that limit we will prune down to 192K. This should cope with
- * even the most extreme cases without allowing an attacker to
- * measurably harm machine performance.
+ /* Fragment cache limits.
+ *
+ * The fragment memory accounting code, (tries to) account for
+ * the real memory usage, by measuring both the size of frag
+ * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+ * and the SKB's truesize.
+ *
+ * A 64K fragment consumes 129736 bytes (44*2944)+200
+ * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+ *
+ * We will commit 4MB at one time. Should we cross that limit
+ * we will prune down to 3MB, making room for approx 8 big 64K
+ * fragments 8x128k.
*/
- net->ipv4.frags.high_thresh = 256 * 1024;
- net->ipv4.frags.low_thresh = 192 * 1024;
+ net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
+ net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival