diff options
Diffstat (limited to 'net/ipv4/ip_output.c')
| -rw-r--r-- | net/ipv4/ip_output.c | 513 |
1 files changed, 291 insertions, 222 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 459c011b1d4..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -43,7 +43,6 @@ */ #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -85,7 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); /* Generate a checksum for an outgoing IP datagram. */ -__inline__ void ip_send_check(struct iphdr *iph) +void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); @@ -102,29 +101,17 @@ int __ip_local_out(struct sk_buff *skb) skb_dst(skb)->dev, dst_output); } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out); - -/* dev_loopback_xmit for use with netfilter. */ -static int ip_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(newskb)); - netif_rx_ni(newskb); - return 0; -} +EXPORT_SYMBOL_GPL(ip_local_out_sk); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -140,14 +127,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - __be32 saddr, __be32 daddr, struct ip_options *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ - skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; @@ -158,14 +145,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); + iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(iph, &rt->dst, sk); + ip_select_ident(skb, sk); - if (opt && opt->optlen) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, daddr, rt, 0); + if (opt && opt->opt.optlen) { + iph->ihl += opt->opt.optlen>>2; + ip_options_build(skb, &opt->opt, daddr, rt, 0); } skb->priority = sk->sk_priority; @@ -182,6 +169,8 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -199,27 +188,69 @@ static inline int ip_finish_output2(struct sk_buff *skb) } if (skb->sk) skb_set_owner_w(skb2, skb->sk); - kfree_skb(skb); + consume_skb(skb); skb = skb2; } - if (dst->hh) - return neigh_hh_output(dst->hh, skb); - else if (dst->neighbour) - return dst->neighbour->output(skb); + rcu_read_lock_bh(); + nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); + neigh = __ipv4_neigh_lookup_noref(dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + if (!IS_ERR(neigh)) { + int res = dst_neigh_output(dst, neigh, skb); + + rcu_read_unlock_bh(); + return res; + } + rcu_read_unlock_bh(); - if (net_ratelimit()) - printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); + net_dbg_ratelimited("%s: No header cache and no neighbour!\n", + __func__); kfree_skb(skb); return -EINVAL; } -static inline int ip_skb_dst_mtu(struct sk_buff *skb) +static int ip_finish_output_gso(struct sk_buff *skb) { - struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL; + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + /* common case: locally created skb or seglen is <= mtu */ + if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || + skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) + return ip_finish_output2(skb); + + /* Slowpath - GSO segment length is exceeding the dst MTU. + * + * This can happen in two cases: + * 1) TCP GRO packet, DF bit not set + * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly + * from host network stack. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = ip_fragment(segs, ip_finish_output2); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); - return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); + return ret; } static int ip_finish_output(struct sk_buff *skb) @@ -231,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) + if (skb_is_gso(skb)) + return ip_finish_output_gso(skb); + + if (skb->len > ip_skb_dst_mtu(skb)) return ip_fragment(skb, ip_finish_output2); - else - return ip_finish_output2(skb); + + return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -275,7 +308,7 @@ int ip_mc_output(struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, - ip_dev_loopback_xmit); + dev_loopback_xmit); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -290,7 +323,7 @@ int ip_mc_output(struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, ip_dev_loopback_xmit); + NULL, newskb->dev, dev_loopback_xmit); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, @@ -298,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -312,11 +345,26 @@ int ip_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_queue_xmit(struct sk_buff *skb) +/* + * copy saddr and daddr, possibly using 64bit load/stores + * Equivalent to : + * iph->saddr = fl4->saddr; + * iph->daddr = fl4->daddr; + */ +static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) +{ + BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != + offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); + memcpy(&iph->saddr, &fl4->saddr, + sizeof(fl4->saddr) + sizeof(fl4->daddr)); +} + +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); - struct ip_options *opt = inet->opt; + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; @@ -325,6 +373,8 @@ int ip_queue_xmit(struct sk_buff *skb) * f.e. by something like SCTP. */ rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + fl4 = &fl->u.ip4; rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -336,14 +386,14 @@ int ip_queue_xmit(struct sk_buff *skb) /* Use correct destination address if we have options. */ daddr = inet->inet_daddr; - if(opt && opt->srr) - daddr = opt->faddr; + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(sock_net(sk), sk, + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, @@ -357,32 +407,32 @@ int ip_queue_xmit(struct sk_buff *skb) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ - skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; + ip_copy_addrs(iph, fl4); + /* Transport layer set skb->h.foo itself. */ - if (opt && opt->optlen) { - iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, inet->inet_daddr, rt, 0); + if (inet_opt && inet_opt->opt.optlen) { + iph->ihl += inet_opt->opt.optlen >> 2; + ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(iph, &rt->dst, sk, - (skb_shinfo(skb)->gso_segs ?: 1) - 1); + ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); + /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -416,10 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) - to->nf_trace = from->nf_trace; -#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) to->ipvs_property = from->ipvs_property; #endif @@ -453,10 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); - if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + mtu = ip_skb_dst_mtu(skb); + if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || + (IPCB(skb)->frag_max_size && + IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(ip_skb_dst_mtu(skb))); + htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } @@ -466,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ + mtu = mtu - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); @@ -486,7 +535,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - (iph->frag_off & htons(IP_MF|IP_OFFSET)) || + ip_is_fragment(iph) || skb_cloned(skb)) goto slow_path; @@ -579,6 +628,11 @@ slow_path_clean: } slow_path: + /* for offloaded checksums cleanup checksum before fragmentation */ + if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) + goto fail; + iph = ip_hdr(skb); + left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ @@ -686,7 +740,7 @@ slow_path: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } - kfree_skb(skb); + consume_skb(skb); IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; @@ -731,7 +785,7 @@ static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu, unsigned int flags) + int transhdrlen, int maxfraglen, unsigned int flags) { struct sk_buff *skb; int err; @@ -760,21 +814,29 @@ static inline int ip_ufo_append_data(struct sock *sk, /* initialize protocol header pointer */ skb->transport_header = skb->network_header + fragheaderlen; - skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - /* specify the length of each IP datagram fragment */ - skb_shinfo(skb)->gso_size = mtu - fragheaderlen; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + __skb_queue_tail(queue, skb); + } else if (skb_is_gso(skb)) { + goto append; } + skb->ip_summed = CHECKSUM_PARTIAL; + /* specify the length of each IP datagram fragment */ + skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + +append: return skb_append_datato_frags(sk, skb, getfrag, from, (length - transhdrlen)); } -static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, +static int __ip_append_data(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, struct inet_cork *cork, + struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, @@ -790,23 +852,24 @@ static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, int copy; int err; int offset = 0; - unsigned int maxfraglen, fragheaderlen; + unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; - exthdrlen = transhdrlen ? rt->dst.header_len : 0; - length += exthdrlen; - transhdrlen += exthdrlen; + skb = skb_peek_tail(queue); + + exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->fragsize; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; - if (cork->length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, - mtu-exthdrlen); + if (cork->length + length > maxnonfragsize - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } @@ -820,15 +883,13 @@ static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, !exthdrlen) csummode = CHECKSUM_PARTIAL; - skb = skb_peek_tail(queue); - cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, - mtu, flags); + maxfraglen, flags); if (err) goto error; return 0; @@ -878,17 +939,16 @@ alloc_new_skb: else alloclen = fraglen; + alloclen += exthdrlen; + /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ - if (datalen == length + fraggap) { + if (datalen == length + fraggap) alloclen += rt->dst.trailer_len; - /* make sure mtu is not reached */ - if (datalen > mtu - fragheaderlen - rt->dst.trailer_len) - datalen -= ALIGN(rt->dst.trailer_len, 8); - } + if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, @@ -921,11 +981,11 @@ alloc_new_skb: /* * Find where to start putting bytes. */ - data = skb_put(skb, fraglen); + data = skb_put(skb, fraglen + exthdrlen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); - data += fragheaderlen; + data += fragheaderlen + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -972,46 +1032,30 @@ alloc_new_skb: } } else { int i = skb_shinfo(skb)->nr_frags; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = cork->page; - int off = cork->off; - unsigned int left; - - if (page && (left = PAGE_SIZE - off) > 0) { - if (copy >= left) - copy = left; - if (page != frag->page) { - if (i == MAX_SKB_FRAGS) { - err = -EMSGSIZE; - goto error; - } - get_page(page); - skb_fill_page_desc(skb, i, page, off, 0); - frag = &skb_shinfo(skb)->frags[i]; - } - } else if (i < MAX_SKB_FRAGS) { - if (copy > PAGE_SIZE) - copy = PAGE_SIZE; - page = alloc_pages(sk->sk_allocation, 0); - if (page == NULL) { - err = -ENOMEM; - goto error; - } - cork->page = page; - cork->off = 0; - skb_fill_page_desc(skb, i, page, 0, 0); - frag = &skb_shinfo(skb)->frags[i]; - } else { - err = -EMSGSIZE; - goto error; - } - if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { - err = -EFAULT; + err = -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) goto error; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + err = -EMSGSIZE; + if (i == MAX_SKB_FRAGS) + goto error; + + __skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, 0); + skb_shinfo(skb)->nr_frags = ++i; + get_page(pfrag->page); } - cork->off += copy; - frag->size += copy; + copy = min_t(int, copy, pfrag->size - pfrag->offset); + if (getfrag(from, + page_address(pfrag->page) + pfrag->offset, + offset, copy, skb->len, skb) < 0) + goto error_efault; + + pfrag->offset += copy; + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; @@ -1023,6 +1067,8 @@ alloc_new_skb: return 0; +error_efault: + err = -EFAULT; error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); @@ -1032,8 +1078,7 @@ error: static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { - struct inet_sock *inet = inet_sk(sk); - struct ip_options *opt; + struct ip_options_rcu *opt; struct rtable *rt; /* @@ -1047,7 +1092,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, if (unlikely(cork->opt == NULL)) return -ENOBUFS; } - memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen); + memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); cork->flags |= IPCORK_OPT; cork->addr = ipc->addr; } @@ -1058,13 +1103,14 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, * We steal reference to this route, caller should not release it */ *rtp = NULL; - cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + cork->fragsize = ip_sk_use_pmtu(sk) ? + dst_mtu(&rt->dst) : rt->dst.dev->mtu; cork->dst = &rt->dst; cork->length = 0; + cork->ttl = ipc->ttl; + cork->tos = ipc->tos; + cork->priority = ipc->priority; cork->tx_flags = ipc->tx_flags; - cork->page = NULL; - cork->off = 0; return 0; } @@ -1080,7 +1126,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, * * LATER: length must be adjusted by pad at tail, when it is required. */ -int ip_append_data(struct sock *sk, +int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, @@ -1094,29 +1140,31 @@ int ip_append_data(struct sock *sk, return 0; if (skb_queue_empty(&sk->sk_write_queue)) { - err = ip_setup_cork(sk, &inet->cork, ipc, rtp); + err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); if (err) return err; } else { transhdrlen = 0; } - return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag, + return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, + sk_page_frag(sk), getfrag, from, length, transhdrlen, flags); } -ssize_t ip_append_page(struct sock *sk, struct page *page, +ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, int offset, size_t size, int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct rtable *rt; struct ip_options *opt = NULL; + struct inet_cork *cork; int hh_len; int mtu; int len; int err; - unsigned int maxfraglen, fragheaderlen, fraggap; + unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize; if (inet->hdrincl) return -EPERM; @@ -1127,28 +1175,31 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = (struct rtable *)inet->cork.dst; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + cork = &inet->cork.base; + rt = (struct rtable *)cork->dst; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; if (!(rt->dst.dev->features&NETIF_F_SG)) return -EOPNOTSUPP; hh_len = LL_RESERVED_SPACE(rt->dst.dev); - mtu = inet->cork.fragsize; + mtu = cork->fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; - if (inet->cork.length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); + if (cork->length + size > maxnonfragsize - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) return -EINVAL; - inet->cork.length += size; + cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { @@ -1218,7 +1269,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (len > size) len = size; if (skb_can_coalesce(skb, i, page, offset)) { - skb_shinfo(skb)->frags[i-1].size += len; + skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); } else if (i < MAX_SKB_FRAGS) { get_page(page); skb_fill_page_desc(skb, i, page, offset, len); @@ -1243,7 +1294,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return 0; error: - inet->cork.length -= size; + cork->length -= size; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1262,6 +1313,7 @@ static void ip_cork_release(struct inet_cork *cork) * and push them out. */ struct sk_buff *__ip_make_skb(struct sock *sk, + struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork) { @@ -1297,13 +1349,13 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ - if (inet->pmtudisc < IP_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. - * If local_df is set too, we still allow to fragment this frame + * If ignore_df is set too, we still allow to fragment this frame * locally. */ - if (inet->pmtudisc >= IP_PMTUDISC_DO || + if (inet->pmtudisc == IP_PMTUDISC_DO || + inet->pmtudisc == IP_PMTUDISC_PROBE || (skb->len <= dst_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); @@ -1311,27 +1363,29 @@ struct sk_buff *__ip_make_skb(struct sock *sk, if (cork->flags & IPCORK_OPT) opt = cork->opt; - if (rt->rt_type == RTN_MULTICAST) + if (cork->ttl != 0) + ttl = cork->ttl; + else if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else ttl = ip_select_ttl(inet, &rt->dst); - iph = (struct iphdr *)skb->data; + iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; + iph->tos = (cork->tos != -1) ? cork->tos : inet->tos; + iph->frag_off = df; + iph->ttl = ttl; + iph->protocol = sk->sk_protocol; + ip_copy_addrs(iph, fl4); + ip_select_ident(skb, sk); + if (opt) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, cork->addr, rt, 0); } - iph->tos = inet->tos; - iph->frag_off = df; - ip_select_ident(iph, &rt->dst, sk); - iph->ttl = ttl; - iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; - skb->priority = sk->sk_priority; + skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec @@ -1349,9 +1403,8 @@ out: return skb; } -int ip_send_skb(struct sk_buff *skb) +int ip_send_skb(struct net *net, struct sk_buff *skb) { - struct net *net = sock_net(skb->sk); int err; err = ip_local_out(skb); @@ -1365,16 +1418,16 @@ int ip_send_skb(struct sk_buff *skb) return err; } -int ip_push_pending_frames(struct sock *sk) +int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) { struct sk_buff *skb; - skb = ip_finish_skb(sk); + skb = ip_finish_skb(sk, fl4); if (!skb) return 0; /* Netfilter gets whole the not fragmented skb. */ - return ip_send_skb(skb); + return ip_send_skb(sock_net(sk), skb); } /* @@ -1394,17 +1447,18 @@ static void __ip_flush_pending_frames(struct sock *sk, void ip_flush_pending_frames(struct sock *sk) { - __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork); + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } struct sk_buff *ip_make_skb(struct sock *sk, + struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { - struct inet_cork cork = {}; + struct inet_cork cork; struct sk_buff_head queue; int err; @@ -1413,18 +1467,22 @@ struct sk_buff *ip_make_skb(struct sock *sk, __skb_queue_head_init(&queue); + cork.flags = 0; + cork.addr = 0; + cork.opt = NULL; err = ip_setup_cork(sk, &cork, ipc, rtp); if (err) return ERR_PTR(err); - err = __ip_append_data(sk, &queue, &cork, getfrag, + err = __ip_append_data(sk, fl4, &queue, &cork, + ¤t->task_frag, getfrag, from, length, transhdrlen, flags); if (err) { __ip_flush_pending_frames(sk, &queue, &cork); return ERR_PTR(err); } - return __ip_make_skb(sk, &queue, &cork); + return __ip_make_skb(sk, fl4, &queue, &cork); } /* @@ -1442,77 +1500,88 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. ICMP should use this function too. + * Used to send some TCP resets/acks so far. * - * Should run single threaded per socket because it uses the sock - * structure to pass arguments. + * Use a fake percpu inet socket to avoid false sharing and contention. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, - unsigned int len) +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { + .sk = { + .__sk_common = { + .skc_refcnt = ATOMIC_INIT(1), + }, + .sk_wmem_alloc = ATOMIC_INIT(1), + .sk_allocation = GFP_ATOMIC, + .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), + }, + .pmtudisc = IP_PMTUDISC_WANT, + .uc_ttl = -1, +}; + +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, + __be32 saddr, const struct ip_reply_arg *arg, + unsigned int len) { - struct inet_sock *inet = inet_sk(sk); - struct { - struct ip_options opt; - char data[40]; - } replyopts; + struct ip_options_data replyopts; struct ipcm_cookie ipc; - __be32 daddr; + struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct sk_buff *nskb; + struct sock *sk; + struct inet_sock *inet; - if (ip_options_echo(&replyopts.opt, skb)) + if (ip_options_echo(&replyopts.opt.opt, skb)) return; - daddr = ipc.addr = rt->rt_src; + ipc.addr = daddr; ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; - if (replyopts.opt.optlen) { + if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; - if (ipc.opt->srr) - daddr = replyopts.opt.faddr; + if (replyopts.opt.opt.srr) + daddr = replyopts.opt.opt.faddr; } - { - struct flowi4 fl4 = { - .flowi4_oif = arg->bound_dev_if, - .daddr = daddr, - .saddr = rt->rt_spec_dst, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), - .fl4_sport = tcp_hdr(skb)->dest, - .fl4_dport = tcp_hdr(skb)->source, - .flowi4_proto = sk->sk_protocol, - .flowi4_flags = ip_reply_arg_flowi_flags(arg), - }; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); - if (IS_ERR(rt)) - return; - } + flowi4_init_output(&fl4, arg->bound_dev_if, + IP4_REPLY_MARK(net, skb->mark), + RT_TOS(arg->tos), + RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, + ip_reply_arg_flowi_flags(arg), + daddr, saddr, + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return; - /* And let IP do all the hard work. + inet = &get_cpu_var(unicast_sock); - This chunk is not reenterable, hence spinlock. - Note that it uses the fact, that this function is called - with locally disabled BH and that sk cannot be already spinlocked. - */ - bh_lock_sock(sk); - inet->tos = ip_hdr(skb)->tos; + inet->tos = arg->tos; + sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + sock_net_set(sk, net); + __skb_queue_head_init(&sk->sk_write_queue); + sk->sk_sndbuf = sysctl_wmem_default; + ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + nskb = skb_peek(&sk->sk_write_queue); + if (nskb) { if (arg->csumoffset >= 0) - *((__sum16 *)skb_transport_header(skb) + - arg->csumoffset) = csum_fold(csum_add(skb->csum, + *((__sum16 *)skb_transport_header(nskb) + + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); - skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(sk); + nskb->ip_summed = CHECKSUM_NONE; + skb_orphan(nskb); + skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); + ip_push_pending_frames(sk, &fl4); } - bh_unlock_sock(sk); + put_cpu_var(unicast_sock); ip_rt_put(rt); } @@ -1522,7 +1591,7 @@ void __init ip_init(void) ip_rt_init(); inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) - igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) + igmp_mc_init(); #endif } |
