diff options
Diffstat (limited to 'net/ipv4/ip_output.c')
| -rw-r--r-- | net/ipv4/ip_output.c | 1189 |
1 files changed, 691 insertions, 498 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3324fbfe528..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> @@ -22,7 +20,7 @@ * Fixes: * Alan Cox : Missing nonblock feature in ip_build_xmit. * Mike Kilburn : htons() missing in ip_build_xmit. - * Bradford Johnson: Fix faulty handling of some frames when + * Bradford Johnson: Fix faulty handling of some frames when * no route is found. * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit * (in case if packet not accepted by @@ -33,9 +31,9 @@ * some redundant tests. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Andi Kleen : Replace ip_reply with ip_send_reply. - * Andi Kleen : Split fast and slow ip_build_xmit path - * for decreased register pressure on x86 - * and more readibility. + * Andi Kleen : Split fast and slow ip_build_xmit path + * for decreased register pressure on x86 + * and more readibility. * Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. @@ -45,15 +43,14 @@ */ #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/errno.h> -#include <linux/config.h> +#include <linux/highmem.h> +#include <linux/slab.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -76,7 +73,6 @@ #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> -#include <net/checksum.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> @@ -84,94 +80,105 @@ #include <linux/netlink.h> #include <linux/tcp.h> -int sysctl_ip_default_ttl = IPDEFTTL; - -static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)); +int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; +EXPORT_SYMBOL(sysctl_ip_default_ttl); /* Generate a checksum for an outgoing IP datagram. */ -__inline__ void ip_send_check(struct iphdr *iph) +void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +EXPORT_SYMBOL(ip_send_check); -/* dev_loopback_xmit for use with netfilter. */ -static int ip_dev_loopback_xmit(struct sk_buff *newskb) +int __ip_local_out(struct sk_buff *skb) { - newskb->mac.raw = newskb->data; - __skb_pull(newskb, newskb->nh.raw - newskb->data); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_TRAP(newskb->dst); - netif_rx(newskb); - return 0; + struct iphdr *iph = ip_hdr(skb); + + iph->tot_len = htons(skb->len); + ip_send_check(iph); + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); } +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) +{ + int err; + + err = __ip_local_out(skb); + if (likely(err == 1)) + err = dst_output_sk(sk, skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip_local_out_sk); + static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { int ttl = inet->uc_ttl; if (ttl < 0) - ttl = dst_metric(dst, RTAX_HOPLIMIT); + ttl = ip4_dst_hoplimit(dst); return ttl; } -/* +/* * Add an ip header to a skbuff and send it out. * */ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - u32 saddr, u32 daddr, struct ip_options *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)skb->dst; + struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ - if (opt) - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); - else - iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); - + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->u.dst)) + if (ip_dont_fragment(sk, &rt->dst)) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->ttl = ip_select_ttl(inet, &rt->dst); + iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); + iph->saddr = saddr; iph->protocol = sk->sk_protocol; - iph->tot_len = htons(skb->len); - ip_select_ident(iph, &rt->u.dst, sk); - skb->nh.iph = iph; + ip_select_ident(skb, sk); - if (opt && opt->optlen) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, daddr, rt, 0); + if (opt && opt->opt.optlen) { + iph->ihl += opt->opt.optlen>>2; + ip_options_build(skb, &opt->opt, daddr, rt, 0); } - ip_send_check(iph); skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; /* Send it out. */ - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); } - EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; - struct hh_cache *hh = dst->hh; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; - int hh_len = LL_RESERVED_SPACE(dev); + unsigned int hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + u32 nexthop; + + if (rt->rt_type == RTN_MULTICAST) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); + } else if (rt->rt_type == RTN_BROADCAST) + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ - if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); @@ -181,52 +188,98 @@ static inline int ip_finish_output2(struct sk_buff *skb) } if (skb->sk) skb_set_owner_w(skb2, skb->sk); - kfree_skb(skb); + consume_skb(skb); skb = skb2; } - if (hh) { - int hh_alen; + rcu_read_lock_bh(); + nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); + neigh = __ipv4_neigh_lookup_noref(dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + if (!IS_ERR(neigh)) { + int res = dst_neigh_output(dst, neigh, skb); - read_lock_bh(&hh->hh_lock); - hh_alen = HH_DATA_ALIGN(hh->hh_len); - memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); - read_unlock_bh(&hh->hh_lock); - skb_push(skb, hh->hh_len); - return hh->hh_output(skb); - } else if (dst->neighbour) - return dst->neighbour->output(skb); + rcu_read_unlock_bh(); + return res; + } + rcu_read_unlock_bh(); - if (net_ratelimit()) - printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); + net_dbg_ratelimited("%s: No header cache and no neighbour!\n", + __func__); kfree_skb(skb); return -EINVAL; } -static inline int ip_finish_output(struct sk_buff *skb) +static int ip_finish_output_gso(struct sk_buff *skb) +{ + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + /* common case: locally created skb or seglen is <= mtu */ + if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + return ip_finish_output2(skb); + + /* Slowpath - GSO segment length is exceeding the dst MTU. + * + * This can happen in two cases: + * 1) TCP GRO packet, DF bit not set + * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly + * from host network stack. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = ip_fragment(segs, ip_finish_output2); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + +static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ - if (skb->dst->xfrm != NULL) - return xfrm4_output_finish(skb); + if (skb_dst(skb)->xfrm != NULL) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } #endif - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + if (skb_is_gso(skb)) + return ip_finish_output_gso(skb); + + if (skb->len > ip_skb_dst_mtu(skb)) return ip_fragment(skb, ip_finish_output2); - else - return ip_finish_output2(skb); + + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; - struct net_device *dev = rt->u.dst.dev; + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -236,7 +289,7 @@ int ip_mc_output(struct sk_buff *skb) */ if (rt->rt_flags&RTCF_MULTICAST) { - if ((!sk || inet_sk(sk)->mc_loop) + if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped @@ -246,19 +299,21 @@ int ip_mc_output(struct sk_buff *skb) This check is duplicated in ip_mr_input at the moment. */ - && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) + && + ((rt->rt_flags & RTCF_LOCAL) || + !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif - ) { + ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, - newskb->dev, - ip_dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, + dev_loopback_xmit); } /* Multicasts with ttl 0 must not go beyond the host */ - if (skb->nh.iph->ttl == 0) { + if (ip_hdr(skb)->ttl == 0) { kfree_skb(skb); return 0; } @@ -267,114 +322,131 @@ int ip_mc_output(struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, - newskb->dev, ip_dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, dev_loopback_xmit); } - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, - ip_finish_output); + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, + skb->dev, ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; + struct net_device *dev = skb_dst(skb)->dev; - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output); + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_queue_xmit(struct sk_buff *skb, int ipfragok) +/* + * copy saddr and daddr, possibly using 64bit load/stores + * Equivalent to : + * iph->saddr = fl4->saddr; + * iph->daddr = fl4->daddr; + */ +static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) +{ + BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != + offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); + memcpy(&iph->saddr, &fl4->saddr, + sizeof(fl4->saddr) + sizeof(fl4->daddr)); +} + +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); - struct ip_options *opt = inet->opt; + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; + int res; /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ - rt = (struct rtable *) skb->dst; + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + fl4 = &fl->u.ip4; + rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); if (rt == NULL) { - u32 daddr; + __be32 daddr; /* Use correct destination address if we have options. */ - daddr = inet->daddr; - if(opt && opt->srr) - daddr = opt->faddr; - - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = inet->saddr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = sk->sk_protocol, - .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; - - /* If this fails, retransmit mechanism of transport layer will - * keep trying until route appears or the connection times - * itself out. - */ - if (ip_route_output_flow(&rt, &fl, sk, 0)) - goto no_route; - } - sk_setup_caps(sk, &rt->u.dst); + daddr = inet->inet_daddr; + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + rt = ip_route_output_ports(sock_net(sk), fl4, sk, + daddr, inet->inet_saddr, + inet->inet_dport, + inet->inet_sport, + sk->sk_protocol, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (IS_ERR(rt)) + goto no_route; + sk_setup_caps(sk, &rt->dst); } - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ - iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); - *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - iph->tot_len = htons(skb->len); - if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) + skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; - skb->nh.iph = iph; + ip_copy_addrs(iph, fl4); + /* Transport layer set skb->h.foo itself. */ - if (opt && opt->optlen) { - iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, inet->daddr, rt, 0); + if (inet_opt && inet_opt->opt.optlen) { + iph->ihl += inet_opt->opt.optlen >> 2; + ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, - (skb_shinfo(skb)->tso_segs ?: 1) - 1); - - /* Add an IP checksum. */ - ip_send_check(iph); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); + /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + res = ip_local_out(skb); + rcu_read_unlock(); + return res; no_route: - IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + rcu_read_unlock(); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } +EXPORT_SYMBOL(ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) @@ -382,9 +454,10 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - dst_release(to->dst); - to->dst = dst_clone(from->dst); + skb_dst_drop(to); + skb_dst_copy(to, from); to->dev = from->dev; + to->mark = from->mark; /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; @@ -392,22 +465,11 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif -#ifdef CONFIG_NETFILTER - to->nfmark = from->nfmark; - /* Connection association is same as pre-frag packet */ - nf_conntrack_put(to->nfct); - to->nfct = from->nfct; - nf_conntrack_get(to->nfct); - to->nfctinfo = from->nfctinfo; + nf_copy(to, from); #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(to->nf_bridge); - to->nf_bridge = from->nf_bridge; - nf_bridge_get(to->nf_bridge); -#endif -#endif + skb_copy_secmark(to, from); } /* @@ -417,30 +479,33 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) * single device frame, and queue such a frame for sending. */ -static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct iphdr *iph; - int raw = 0; int ptr; struct net_device *dev; struct sk_buff *skb2; unsigned int mtu, hlen, left, len, ll_rs; int offset; __be16 not_last_frag; - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); int err = 0; - dev = rt->u.dst.dev; + dev = rt->dst.dev; /* * Point into the IP datagram header. */ - iph = skb->nh.iph; + iph = ip_hdr(skb); - if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + mtu = ip_skb_dst_mtu(skb); + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->u.dst))); + htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } @@ -450,7 +515,11 @@ static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = mtu - hlen; /* Size of data space */ +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) + mtu -= nf_bridge_mtu_reduction(skb); +#endif IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: @@ -460,34 +529,33 @@ static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *frag; + if (skb_has_frag_list(skb)) { + struct sk_buff *frag, *frag2; int first_len = skb_pagelen(skb); if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - (iph->frag_off & htons(IP_MF|IP_OFFSET)) || + ip_is_fragment(iph) || skb_cloned(skb)) goto slow_path; - for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen) - goto slow_path; + goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) - goto slow_path; + goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { - sock_hold(skb->sk); frag->sk = skb->sk; frag->destructor = sock_wfree; - skb->truesize -= frag->truesize; } + skb->truesize -= frag->truesize; } /* Everything is OK. Generate! */ @@ -495,7 +563,7 @@ static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = 0; offset = 0; frag = skb_shinfo(skb)->frag_list; - skb_shinfo(skb)->frag_list = NULL; + skb_frag_list_init(skb); skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); @@ -507,10 +575,11 @@ static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * before previous one went down. */ if (frag) { frag->ip_summed = CHECKSUM_NONE; - frag->h.raw = frag->data; - frag->nh.raw = __skb_push(frag, hlen); - memcpy(frag->nh.raw, iph, hlen); - iph = frag->nh.iph; + skb_reset_transport_header(frag); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iph, hlen); + iph = ip_hdr(frag); iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); if (offset == 0) @@ -525,6 +594,8 @@ static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); + if (!err) + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -534,7 +605,7 @@ static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } if (err == 0) { - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } @@ -543,22 +614,33 @@ static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) kfree_skb(frag); frag = skb; } - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } } slow_path: + /* for offloaded checksums cleanup checksum before fragmentation */ + if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) + goto fail; + iph = ip_hdr(skb); + left = skb->len - hlen; /* Space per frame */ - ptr = raw + hlen; /* Where to start from */ + ptr = hlen; /* Where to start from */ -#ifdef CONFIG_BRIDGE_NETFILTER /* for bridged IP traffic encapsulated inside f.e. a vlan header, - * we need to make room for the encapsulating header */ - ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb)); - mtu -= nf_bridge_pad(skb); -#else - ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev); -#endif + * we need to make room for the encapsulating header + */ + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); + /* * Fragment the datagram. */ @@ -570,12 +652,12 @@ slow_path: * Keep copying data until we run out. */ - while(left > 0) { + while (left > 0) { len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) len = mtu; - /* IF: we are not sending upto and including the packet end + /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < left) { len &= ~7; @@ -597,8 +679,8 @@ slow_path: ip_copy_metadata(skb2, skb); skb_reserve(skb2, ll_rs); skb_put(skb2, len + hlen); - skb2->nh.raw = skb2->data; - skb2->h.raw = skb2->data + hlen; + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + hlen; /* * Charge the memory for the fragment to any owner @@ -612,19 +694,19 @@ slow_path: * Copy the packet header into the new buffer. */ - memcpy(skb2->nh.raw, skb->data, hlen); + skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); /* * Copy a block of the IP datagram. */ - if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) + if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) BUG(); left -= len; /* * Fill in the new header fields. */ - iph = skb2->nh.iph; + iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); /* ANK: dirty, but effective trick. Upgrade options only if @@ -648,9 +730,6 @@ slow_path: /* * Put this fragment into the sending queue. */ - - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); - iph->tot_len = htons(len + hlen); ip_send_check(iph); @@ -658,39 +737,43 @@ slow_path: err = output(skb2); if (err) goto fail; + + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } - kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + consume_skb(skb); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: - kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } +EXPORT_SYMBOL(ip_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct iovec *iov = from; - if (skb->ip_summed == CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { if (memcpy_fromiovecend(to, iov, offset, len) < 0) return -EFAULT; } else { - unsigned int csum = 0; + __wsum csum = 0; if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } return 0; } +EXPORT_SYMBOL(ip_generic_getfrag); -static inline unsigned int +static inline __wsum csum_page(struct page *page, int offset, int copy) { char *kaddr; - unsigned int csum; + __wsum csum; kaddr = kmap(page); csum = csum_partial(kaddr + offset, copy, 0); kunmap(page); @@ -698,10 +781,11 @@ csum_page(struct page *page, int offset, int copy) } static inline int ip_ufo_append_data(struct sock *sk, + struct sk_buff_head *queue, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags) + int transhdrlen, int maxfraglen, unsigned int flags) { struct sk_buff *skb; int err; @@ -710,7 +794,7 @@ static inline int ip_ufo_append_data(struct sock *sk, * device, so create one single skb packet containing complete * udp datagram */ - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + if ((skb = skb_peek_tail(queue)) == NULL) { skb = sock_alloc_send_skb(sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); @@ -722,110 +806,70 @@ static inline int ip_ufo_append_data(struct sock *sk, skb_reserve(skb, hh_len); /* create space for UDP/IP header */ - skb_put(skb,fragheaderlen + transhdrlen); + skb_put(skb, fragheaderlen + transhdrlen); /* initialize network header pointer */ - skb->nh.raw = skb->data; + skb_reset_network_header(skb); /* initialize protocol header pointer */ - skb->h.raw = skb->data + fragheaderlen; + skb->transport_header = skb->network_header + fragheaderlen; - skb->ip_summed = CHECKSUM_HW; skb->csum = 0; - sk->sk_sndmsg_off = 0; - } - err = skb_append_datato_frags(sk,skb, getfrag, from, - (length - transhdrlen)); - if (!err) { - /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); - __skb_queue_tail(&sk->sk_write_queue, skb); - return 0; + __skb_queue_tail(queue, skb); + } else if (skb_is_gso(skb)) { + goto append; } - /* There is not enough support do UFO , - * so follow normal path - */ - kfree_skb(skb); - return err; + + skb->ip_summed = CHECKSUM_PARTIAL; + /* specify the length of each IP datagram fragment */ + skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + +append: + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); } -/* - * ip_append_data() and ip_append_page() can make one large IP datagram - * from many pieces of data. Each pieces will be holded on the socket - * until ip_push_pending_frames() is called. Each piece can be a page - * or non-page data. - * - * Not only UDP, other transport protocols - e.g. raw sockets - can use - * this interface potentially. - * - * LATER: length must be adjusted by pad at tail, when it is required. - */ -int ip_append_data(struct sock *sk, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable *rt, - unsigned int flags) +static int __ip_append_data(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, + struct inet_cork *cork, + struct page_frag *pfrag, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - struct ip_options *opt = NULL; + struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; int copy; int err; int offset = 0; - unsigned int maxfraglen, fragheaderlen; + unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; + struct rtable *rt = (struct rtable *)cork->dst; - if (flags&MSG_PROBE) - return 0; + skb = skb_peek_tail(queue); - if (skb_queue_empty(&sk->sk_write_queue)) { - /* - * setup for corking. - */ - opt = ipc->opt; - if (opt) { - if (inet->cork.opt == NULL) { - inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); - if (unlikely(inet->cork.opt == NULL)) - return -ENOBUFS; - } - memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); - inet->cork.flags |= IPCORK_OPT; - inet->cork.addr = ipc->addr; - } - dst_hold(&rt->u.dst); - inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); - inet->cork.rt = rt; - inet->cork.length = 0; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->u.dst.header_len) != 0) { - length += exthdrlen; - transhdrlen += exthdrlen; - } - } else { - rt = inet->cork.rt; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + exthdrlen = !skb ? rt->dst.header_len : 0; + mtu = cork->fragsize; - transhdrlen = 0; - exthdrlen = 0; - mtu = inet->cork.fragsize; - } - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; - if (inet->cork.length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); + if (cork->length + length > maxnonfragsize - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -835,18 +879,19 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + rt->dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen) - csummode = CHECKSUM_HW; - - inet->cork.length += length; - if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { - - if(ip_ufo_append_data(sk, getfrag, from, length, hh_len, - fragheaderlen, transhdrlen, mtu, flags)) + csummode = CHECKSUM_PARTIAL; + + cork->length += length; + if (((length > mtu) || (skb && skb_is_gso(skb))) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { + err = ip_ufo_append_data(sk, queue, getfrag, from, length, + hh_len, fragheaderlen, transhdrlen, + maxfraglen, flags); + if (err) goto error; - return 0; } @@ -857,7 +902,7 @@ int ip_append_data(struct sock *sk, * adding appropriate IP header. */ - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + if (!skb) goto alloc_new_skb; while (length > 0) { @@ -888,33 +933,39 @@ alloc_new_skb: datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; - if ((flags & MSG_MORE) && - !(rt->u.dst.dev->features&NETIF_F_SG)) + if ((flags & MSG_MORE) && + !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else - alloclen = datalen + fragheaderlen; + alloclen = fraglen; + + alloclen += exthdrlen; /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ - if (datalen == length) - alloclen += rt->u.dst.trailer_len; + if (datalen == length + fraggap) + alloclen += rt->dst.trailer_len; if (transhdrlen) { - skb = sock_alloc_send_skb(sk, + skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (atomic_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf) - skb = sock_wmalloc(sk, + skb = sock_wmalloc(sk, alloclen + hh_len + 15, 1, sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; + else + /* only the initial fragment is + time stamped */ + cork->tx_flags = 0; } if (skb == NULL) goto error; @@ -925,14 +976,16 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); + skb_shinfo(skb)->tx_flags = cork->tx_flags; /* * Find where to start putting bytes. */ - data = skb_put(skb, fraglen); - skb->nh.raw = data + exthdrlen; - data += fragheaderlen; - skb->h.raw = data + exthdrlen; + data = skb_put(skb, fraglen + exthdrlen); + skb_set_network_header(skb, exthdrlen); + skb->transport_header = (skb->network_header + + fragheaderlen); + data += fragheaderlen + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -941,7 +994,7 @@ alloc_new_skb: skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; - skb_trim(skb_prev, maxfraglen); + pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; @@ -960,18 +1013,18 @@ alloc_new_skb: /* * Put the packet on the pending queue. */ - __skb_queue_tail(&sk->sk_write_queue, skb); + __skb_queue_tail(queue, skb); continue; } if (copy > length) copy = length; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), + if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; @@ -979,50 +1032,34 @@ alloc_new_skb: } } else { int i = skb_shinfo(skb)->nr_frags; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = sk->sk_sndmsg_page; - int off = sk->sk_sndmsg_off; - unsigned int left; - - if (page && (left = PAGE_SIZE - off) > 0) { - if (copy >= left) - copy = left; - if (page != frag->page) { - if (i == MAX_SKB_FRAGS) { - err = -EMSGSIZE; - goto error; - } - get_page(page); - skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); - frag = &skb_shinfo(skb)->frags[i]; - } - } else if (i < MAX_SKB_FRAGS) { - if (copy > PAGE_SIZE) - copy = PAGE_SIZE; - page = alloc_pages(sk->sk_allocation, 0); - if (page == NULL) { - err = -ENOMEM; - goto error; - } - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; - - skb_fill_page_desc(skb, i, page, 0, 0); - frag = &skb_shinfo(skb)->frags[i]; - skb->truesize += PAGE_SIZE; - atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); - } else { - err = -EMSGSIZE; - goto error; - } - if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { - err = -EFAULT; + + err = -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) goto error; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + err = -EMSGSIZE; + if (i == MAX_SKB_FRAGS) + goto error; + + __skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, 0); + skb_shinfo(skb)->nr_frags = ++i; + get_page(pfrag->page); } - sk->sk_sndmsg_off += copy; - frag->size += copy; + copy = min_t(int, copy, pfrag->size - pfrag->offset); + if (getfrag(from, + page_address(pfrag->page) + pfrag->offset, + offset, copy, skb->len, skb) < 0) + goto error_efault; + + pfrag->offset += copy; + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); } offset += copy; length -= copy; @@ -1030,24 +1067,104 @@ alloc_new_skb: return 0; +error_efault: + err = -EFAULT; error: - inet->cork.length -= length; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); - return err; + cork->length -= length; + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); + return err; } -ssize_t ip_append_page(struct sock *sk, struct page *page, +static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, + struct ipcm_cookie *ipc, struct rtable **rtp) +{ + struct ip_options_rcu *opt; + struct rtable *rt; + + /* + * setup for corking. + */ + opt = ipc->opt; + if (opt) { + if (cork->opt == NULL) { + cork->opt = kmalloc(sizeof(struct ip_options) + 40, + sk->sk_allocation); + if (unlikely(cork->opt == NULL)) + return -ENOBUFS; + } + memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); + cork->flags |= IPCORK_OPT; + cork->addr = ipc->addr; + } + rt = *rtp; + if (unlikely(!rt)) + return -EFAULT; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; + cork->fragsize = ip_sk_use_pmtu(sk) ? + dst_mtu(&rt->dst) : rt->dst.dev->mtu; + cork->dst = &rt->dst; + cork->length = 0; + cork->ttl = ipc->ttl; + cork->tos = ipc->tos; + cork->priority = ipc->priority; + cork->tx_flags = ipc->tx_flags; + + return 0; +} + +/* + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each pieces will be holded on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. + * + * Not only UDP, other transport protocols - e.g. raw sockets - can use + * this interface potentially. + * + * LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, struct flowi4 *fl4, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) { + err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); + if (err) + return err; + } else { + transhdrlen = 0; + } + + return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, + sk_page_frag(sk), getfrag, + from, length, transhdrlen, flags); +} + +ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, int offset, size_t size, int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct rtable *rt; struct ip_options *opt = NULL; + struct inet_cork *cork; int hh_len; int mtu; int len; int err; - unsigned int maxfraglen, fragheaderlen, fraggap; + unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize; if (inet->hdrincl) return -EPERM; @@ -1058,37 +1175,43 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = inet->cork.rt; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + cork = &inet->cork.base; + rt = (struct rtable *)cork->dst; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) + if (!(rt->dst.dev->features&NETIF_F_SG)) return -EOPNOTSUPP; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); - mtu = inet->cork.fragsize; + hh_len = LL_RESERVED_SPACE(rt->dst.dev); + mtu = cork->fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; - if (inet->cork.length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); + if (cork->length + size > maxnonfragsize - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) return -EINVAL; - inet->cork.length += size; - if ((sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + cork->length += size; + if ((size + skb->len > mtu) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + } while (size > 0) { int i; - if (skb_shinfo(skb)->ufo_size) + if (skb_is_gso(skb)) len = size; else { @@ -1099,8 +1222,6 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, } if (len <= 0) { struct sk_buff *skb_prev; - char *data; - struct iphdr *iph; int alloclen; skb_prev = skb; @@ -1123,18 +1244,18 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, /* * Find where to start putting bytes. */ - data = skb_put(skb, fragheaderlen + fraggap); - skb->nh.iph = iph = (struct iphdr *)data; - data += fragheaderlen; - skb->h.raw = data; - + skb_put(skb, fragheaderlen + fraggap); + skb_reset_network_header(skb); + skb->transport_header = (skb->network_header + + fragheaderlen); if (fraggap) { - skb->csum = skb_copy_and_csum_bits( - skb_prev, maxfraglen, - data, fraggap, 0); + skb->csum = skb_copy_and_csum_bits(skb_prev, + maxfraglen, + skb_transport_header(skb), + fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); - skb_trim(skb_prev, maxfraglen); + pskb_trim_unique(skb_prev, maxfraglen); } /* @@ -1148,7 +1269,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (len > size) len = size; if (skb_can_coalesce(skb, i, page, offset)) { - skb_shinfo(skb)->frags[i-1].size += len; + skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); } else if (i < MAX_SKB_FRAGS) { get_page(page); skb_fill_page_desc(skb, i, page, offset, len); @@ -1158,55 +1279,68 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, } if (skb->ip_summed == CHECKSUM_NONE) { - unsigned int csum; + __wsum csum; csum = csum_page(page, offset, len); skb->csum = csum_block_add(skb->csum, csum, skb->len); } skb->len += len; skb->data_len += len; + skb->truesize += len; + atomic_add(len, &sk->sk_wmem_alloc); offset += len; size -= len; } return 0; error: - inet->cork.length -= size; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + cork->length -= size; + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } +static void ip_cork_release(struct inet_cork *cork) +{ + cork->flags &= ~IPCORK_OPT; + kfree(cork->opt); + cork->opt = NULL; + dst_release(cork->dst); + cork->dst = NULL; +} + /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ -int ip_push_pending_frames(struct sock *sk) +struct sk_buff *__ip_make_skb(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, + struct inet_cork *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = inet->cork.rt; + struct rtable *rt = (struct rtable *)cork->dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; - if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + if ((skb = __skb_dequeue(queue)) == NULL) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ - if (skb->data < skb->nh.raw) - __skb_pull(skb, skb->nh.raw - skb->data); - while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + if (skb->data < skb_network_header(skb)) + __skb_pull(skb, skb_network_offset(skb)); + while ((tmp_skb = __skb_dequeue(queue)) != NULL) { + __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; - __sock_put(tmp_skb->sk); tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } @@ -1215,176 +1349,239 @@ int ip_push_pending_frames(struct sock *sk) * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc != IP_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. - * If local_df is set too, we still allow to fragment this frame + * If ignore_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc == IP_PMTUDISC_DO || - (skb->len <= dst_mtu(&rt->u.dst) && - ip_dont_fragment(sk, &rt->u.dst))) + inet->pmtudisc == IP_PMTUDISC_PROBE || + (skb->len <= dst_mtu(&rt->dst) && + ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; - if (rt->rt_type == RTN_MULTICAST) + if (cork->ttl != 0) + ttl = cork->ttl; + else if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else - ttl = ip_select_ttl(inet, &rt->u.dst); + ttl = ip_select_ttl(inet, &rt->dst); - iph = (struct iphdr *)skb->data; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - if (opt) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, inet->cork.addr, rt, 0); - } - iph->tos = inet->tos; - iph->tot_len = htons(skb->len); + iph->tos = (cork->tos != -1) ? cork->tos : inet->tos; iph->frag_off = df; - if (!df) { - __ip_select_ident(iph, &rt->u.dst, 0); - } else { - iph->id = htons(inet->id++); - } iph->ttl = ttl; iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; - ip_send_check(iph); + ip_copy_addrs(iph, fl4); + ip_select_ident(skb, sk); - skb->priority = sk->sk_priority; - skb->dst = dst_clone(&rt->u.dst); + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, cork->addr, rt, 0); + } - /* Netfilter gets whole the not fragmented skb. */ - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; + skb->mark = sk->sk_mark; + /* + * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec + * on dst refcount + */ + cork->dst = NULL; + skb_dst_set(skb, &rt->dst); + + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); + + ip_cork_release(cork); +out: + return skb; +} + +int ip_send_skb(struct net *net, struct sk_buff *skb) +{ + int err; + + err = ip_local_out(skb); if (err) { if (err > 0) - err = inet->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) - goto error; + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); } -out: - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } return err; +} -error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); - goto out; +int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) +{ + struct sk_buff *skb; + + skb = ip_finish_skb(sk, fl4); + if (!skb) + return 0; + + /* Netfilter gets whole the not fragmented skb. */ + return ip_send_skb(sock_net(sk), skb); } /* * Throw away all pending data on the socket. */ -void ip_flush_pending_frames(struct sock *sk) +static void __ip_flush_pending_frames(struct sock *sk, + struct sk_buff_head *queue, + struct inet_cork *cork) { - struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(queue)) != NULL) kfree_skb(skb); - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(cork); } +void ip_flush_pending_frames(struct sock *sk) +{ + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); +} + +struct sk_buff *ip_make_skb(struct sock *sk, + struct flowi4 *fl4, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_cork cork; + struct sk_buff_head queue; + int err; + + if (flags & MSG_PROBE) + return NULL; + + __skb_queue_head_init(&queue); + + cork.flags = 0; + cork.addr = 0; + cork.opt = NULL; + err = ip_setup_cork(sk, &cork, ipc, rtp); + if (err) + return ERR_PTR(err); + + err = __ip_append_data(sk, fl4, &queue, &cork, + ¤t->task_frag, getfrag, + from, length, transhdrlen, flags); + if (err) { + __ip_flush_pending_frames(sk, &queue, &cork); + return ERR_PTR(err); + } + + return __ip_make_skb(sk, fl4, &queue, &cork); +} /* * Fetch data from kernel space and fill in checksum if needed. */ -static int ip_reply_glue_bits(void *dptr, char *to, int offset, +static int ip_reply_glue_bits(void *dptr, char *to, int offset, int len, int odd, struct sk_buff *skb) { - unsigned int csum; + __wsum csum; csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); skb->csum = csum_block_add(skb->csum, csum, odd); - return 0; + return 0; } -/* +/* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. ICMP should use this function too. + * Used to send some TCP resets/acks so far. * - * Should run single threaded per socket because it uses the sock - * structure to pass arguments. - * - * LATER: switch from ip_build_xmit to ip_append_* + * Use a fake percpu inet socket to avoid false sharing and contention. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, - unsigned int len) +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { + .sk = { + .__sk_common = { + .skc_refcnt = ATOMIC_INIT(1), + }, + .sk_wmem_alloc = ATOMIC_INIT(1), + .sk_allocation = GFP_ATOMIC, + .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), + }, + .pmtudisc = IP_PMTUDISC_WANT, + .uc_ttl = -1, +}; + +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, + __be32 saddr, const struct ip_reply_arg *arg, + unsigned int len) { - struct inet_sock *inet = inet_sk(sk); - struct { - struct ip_options opt; - char data[40]; - } replyopts; + struct ip_options_data replyopts; struct ipcm_cookie ipc; - u32 daddr; - struct rtable *rt = (struct rtable*)skb->dst; + struct flowi4 fl4; + struct rtable *rt = skb_rtable(skb); + struct sk_buff *nskb; + struct sock *sk; + struct inet_sock *inet; - if (ip_options_echo(&replyopts.opt, skb)) + if (ip_options_echo(&replyopts.opt.opt, skb)) return; - daddr = ipc.addr = rt->rt_src; + ipc.addr = daddr; ipc.opt = NULL; + ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; - if (replyopts.opt.optlen) { + if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; - if (ipc.opt->srr) - daddr = replyopts.opt.faddr; + if (replyopts.opt.opt.srr) + daddr = replyopts.opt.opt.faddr; } - { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = rt->rt_spec_dst, - .tos = RT_TOS(skb->nh.iph->tos) } }, - /* Not quite clean, but right. */ - .uli_u = { .ports = - { .sport = skb->h.th->dest, - .dport = skb->h.th->source } }, - .proto = sk->sk_protocol }; - if (ip_route_output_key(&rt, &fl)) - return; - } + flowi4_init_output(&fl4, arg->bound_dev_if, + IP4_REPLY_MARK(net, skb->mark), + RT_TOS(arg->tos), + RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, + ip_reply_arg_flowi_flags(arg), + daddr, saddr, + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return; - /* And let IP do all the hard work. + inet = &get_cpu_var(unicast_sock); - This chunk is not reenterable, hence spinlock. - Note that it uses the fact, that this function is called - with locally disabled BH and that sk cannot be already spinlocked. - */ - bh_lock_sock(sk); - inet->tos = skb->nh.iph->tos; + inet->tos = arg->tos; + sk = &inet->sk; sk->sk_priority = skb->priority; - sk->sk_protocol = skb->nh.iph->protocol; - ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, - &ipc, rt, MSG_DONTWAIT); - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + sk->sk_protocol = ip_hdr(skb)->protocol; + sk->sk_bound_dev_if = arg->bound_dev_if; + sock_net_set(sk, net); + __skb_queue_head_init(&sk->sk_write_queue); + sk->sk_sndbuf = sysctl_wmem_default; + ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + &ipc, &rt, MSG_DONTWAIT); + nskb = skb_peek(&sk->sk_write_queue); + if (nskb) { if (arg->csumoffset >= 0) - *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); - skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(sk); + *((__sum16 *)skb_transport_header(nskb) + + arg->csumoffset) = csum_fold(csum_add(nskb->csum, + arg->csum)); + nskb->ip_summed = CHECKSUM_NONE; + skb_orphan(nskb); + skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); + ip_push_pending_frames(sk, &fl4); } - bh_unlock_sock(sk); + put_cpu_var(unicast_sock); ip_rt_put(rt); } @@ -1394,11 +1591,7 @@ void __init ip_init(void) ip_rt_init(); inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) - igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) + igmp_mc_init(); #endif } - -EXPORT_SYMBOL(ip_generic_getfrag); -EXPORT_SYMBOL(ip_queue_xmit); -EXPORT_SYMBOL(ip_send_check); |
