Diffstat (limited to 'net/ipv4/ip_output.c')
-rw-r--r--	net/ipv4/ip_output.c	771
1 file changed, 458 insertions(+), 313 deletions(-)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5090c7ff525..8d3b6b0e985 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -43,7 +43,6 @@
  */
 
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -82,9 +81,10 @@
 #include <linux/tcp.h>
 
 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
 
 /* Generate a checksum for an outgoing IP datagram. */
-__inline__ void ip_send_check(struct iphdr *iph)
+void ip_send_check(struct iphdr *iph)
 {
 	iph->check = 0;
 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
@@ -101,36 +101,24 @@ int __ip_local_out(struct sk_buff *skb)
 		       skb_dst(skb)->dev, dst_output);
 }
 
-int ip_local_out(struct sk_buff *skb)
+int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
 {
 	int err;
 
 	err = __ip_local_out(skb);
 	if (likely(err == 1))
-		err = dst_output(skb);
+		err = dst_output_sk(sk, skb);
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(ip_local_out);
-
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
-{
-	skb_reset_mac_header(newskb);
-	__skb_pull(newskb, skb_network_offset(newskb));
-	newskb->pkt_type = PACKET_LOOPBACK;
-	newskb->ip_summed = CHECKSUM_UNNECESSARY;
-	WARN_ON(!skb_dst(newskb));
-	netif_rx_ni(newskb);
-	return 0;
-}
+EXPORT_SYMBOL_GPL(ip_local_out_sk);
 
 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 {
 	int ttl = inet->uc_ttl;
 
 	if (ttl < 0)
-		ttl = dst_metric(dst, RTAX_HOPLIMIT);
+		ttl = ip4_dst_hoplimit(dst);
 
 	return ttl;
 }
@@ -139,14 +127,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
  *
  */
 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
-			  __be32 saddr, __be32 daddr, struct ip_options *opt)
+			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 
 	/* Build the IP header. */
-	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
 	iph->version  = 4;
@@ -157,14 +145,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 	else
 		iph->frag_off = 0;
 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
-	iph->daddr    = rt->rt_dst;
-	iph->saddr    = rt->rt_src;
+	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
+	iph->saddr    = saddr;
 	iph->protocol = sk->sk_protocol;
-	ip_select_ident(iph, &rt->dst, sk);
+	ip_select_ident(skb, sk);
 
-	if (opt && opt->optlen) {
-		iph->ihl += opt->optlen>>2;
-		ip_options_build(skb, opt, daddr, rt, 0);
+	if (opt && opt->opt.optlen) {
+		iph->ihl += opt->opt.optlen>>2;
+		ip_options_build(skb, &opt->opt, daddr, rt, 0);
 	}
 
 	skb->priority = sk->sk_priority;
@@ -181,6 +169,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	struct rtable *rt = (struct rtable *)dst;
 	struct net_device *dev = dst->dev;
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
+	struct neighbour *neigh;
+	u32 nexthop;
 
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -198,27 +188,69 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		}
 		if (skb->sk)
 			skb_set_owner_w(skb2, skb->sk);
-		kfree_skb(skb);
+		consume_skb(skb);
 		skb = skb2;
 	}
 
-	if (dst->hh)
-		return neigh_hh_output(dst->hh, skb);
-	else if (dst->neighbour)
-		return dst->neighbour->output(skb);
+	rcu_read_lock_bh();
+	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
+	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+	if (!IS_ERR(neigh)) {
+		int res = dst_neigh_output(dst, neigh, skb);
 
-	if (net_ratelimit())
-		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+		rcu_read_unlock_bh();
+		return res;
+	}
+	rcu_read_unlock_bh();
+
+	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+			    __func__);
 	kfree_skb(skb);
 	return -EINVAL;
 }
 
-static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+static int ip_finish_output_gso(struct sk_buff *skb)
 {
-	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+	netdev_features_t features;
+	struct sk_buff *segs;
+	int ret = 0;
+
+	/* common case: locally created skb or seglen is <= mtu */
+	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
+	      skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+		return ip_finish_output2(skb);
 
-	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
-	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+	/* Slowpath -  GSO segment length is exceeding the dst MTU.
+	 *
+	 * This can happen in two cases:
+	 * 1) TCP GRO packet, DF bit not set
+	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
+	 * from host network stack.
+	 */
+	features = netif_skb_features(skb);
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+	if (IS_ERR(segs)) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+
+	consume_skb(skb);
+
+	do {
+		struct sk_buff *nskb = segs->next;
+		int err;
+
+		segs->next = NULL;
+		err = ip_fragment(segs, ip_finish_output2);
+
+		if (err && ret == 0)
+			ret = err;
+		segs = nskb;
+	} while (segs);
+
+	return ret;
 }
 
 static int ip_finish_output(struct sk_buff *skb)
@@ -230,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb)
 		return dst_output(skb);
 	}
 #endif
-	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
+	if (skb_is_gso(skb))
+		return ip_finish_output_gso(skb);
+
+	if (skb->len > ip_skb_dst_mtu(skb))
 		return ip_fragment(skb, ip_finish_output2);
-	else
-		return ip_finish_output2(skb);
+
+	return ip_finish_output2(skb);
 }
 
-int ip_mc_output(struct sk_buff *skb)
+int ip_mc_output(struct sock *sk, struct sk_buff *skb)
 {
-	struct sock *sk = skb->sk;
 	struct rtable *rt = skb_rtable(skb);
 	struct net_device *dev = rt->dst.dev;
 
@@ -274,7 +308,7 @@ int ip_mc_output(struct sk_buff *skb)
 			if (newskb)
 				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 					newskb, NULL, newskb->dev,
-					ip_dev_loopback_xmit);
+					dev_loopback_xmit);
 		}
 
 		/* Multicasts with ttl 0 must not go beyond the host */
@@ -289,7 +323,7 @@ int ip_mc_output(struct sk_buff *skb)
 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 		if (newskb)
 			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
-				NULL, newskb->dev, ip_dev_loopback_xmit);
+				NULL, newskb->dev, dev_loopback_xmit);
 	}
 
 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -297,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb)
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
 
-int ip_output(struct sk_buff *skb)
+int ip_output(struct sock *sk, struct sk_buff *skb)
 {
 	struct net_device *dev = skb_dst(skb)->dev;
 
@@ -311,11 +345,26 @@ int ip_output(struct sk_buff *skb)
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
 
-int ip_queue_xmit(struct sk_buff *skb)
+/*
+ * copy saddr and daddr, possibly using 64bit load/stores
+ * Equivalent to :
+ *   iph->saddr = fl4->saddr;
+ *   iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+	memcpy(&iph->saddr, &fl4->saddr,
+	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
+}
+
+/* Note: skb->sk can be different from sk, in case of tunnels */
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 {
-	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
-	struct ip_options *opt = inet->opt;
+	struct ip_options_rcu *inet_opt;
+	struct flowi4 *fl4;
 	struct rtable *rt;
 	struct iphdr *iph;
 	int res;
@@ -324,6 +373,8 @@ int ip_queue_xmit(struct sk_buff *skb)
 	 * f.e. by something like SCTP.
 	 */
 	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	fl4 = &fl->u.ip4;
 	rt = skb_rtable(skb);
 	if (rt != NULL)
 		goto packet_routed;
@@ -335,59 +386,53 @@ int ip_queue_xmit(struct sk_buff *skb)
 		__be32 daddr;
 
 		/* Use correct destination address if we have options. */
 		daddr = inet->inet_daddr;
-		if(opt && opt->srr)
-			daddr = opt->faddr;
-
-		{
-			struct flowi fl = { .oif = sk->sk_bound_dev_if,
-					    .mark = sk->sk_mark,
-					    .fl4_dst = daddr,
-					    .fl4_src = inet->inet_saddr,
-					    .fl4_tos = RT_CONN_FLAGS(sk),
-					    .proto = sk->sk_protocol,
-					    .flags = inet_sk_flowi_flags(sk),
-					    .fl_ip_sport = inet->inet_sport,
-					    .fl_ip_dport = inet->inet_dport };
-
-			/* If this fails, retransmit mechanism of transport layer will
-			 * keep trying until route appears or the connection times
-			 * itself out.
-			 */
-			security_sk_classify_flow(sk, &fl);
-			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
-				goto no_route;
-		}
+		if (inet_opt && inet_opt->opt.srr)
+			daddr = inet_opt->opt.faddr;
+
+		/* If this fails, retransmit mechanism of transport layer will
+		 * keep trying until route appears or the connection times
+		 * itself out.
+		 */
+		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+					   daddr, inet->inet_saddr,
+					   inet->inet_dport,
+					   inet->inet_sport,
+					   sk->sk_protocol,
+					   RT_CONN_FLAGS(sk),
+					   sk->sk_bound_dev_if);
+		if (IS_ERR(rt))
+			goto no_route;
 		sk_setup_caps(sk, &rt->dst);
 	}
 	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto no_route;
 
 	/* OK, we know where to send it, allocate and build IP header. */
-	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
-	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
+	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
 		iph->frag_off = htons(IP_DF);
 	else
 		iph->frag_off = 0;
 	iph->ttl      = ip_select_ttl(inet, &rt->dst);
 	iph->protocol = sk->sk_protocol;
-	iph->saddr    = rt->rt_src;
-	iph->daddr    = rt->rt_dst;
+	ip_copy_addrs(iph, fl4);
+
 	/* Transport layer set skb->h.foo itself. */
 
-	if (opt && opt->optlen) {
-		iph->ihl += opt->optlen >> 2;
-		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
+	if (inet_opt && inet_opt->opt.optlen) {
+		iph->ihl += inet_opt->opt.optlen >> 2;
+		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
 	}
 
-	ip_select_ident_more(iph, &rt->dst, sk,
-			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+	ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
 
+	/* TODO : should we use skb->sk here instead of sk ? */
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
@@ -421,10 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
-    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
-	to->nf_trace = from->nf_trace;
-#endif
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	to->ipvs_property = from->ipvs_property;
 #endif
@@ -458,10 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 	iph = ip_hdr(skb);
 
-	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+	mtu = ip_skb_dst_mtu(skb);
+	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+		     (IPCB(skb)->frag_max_size &&
+		      IPCB(skb)->frag_max_size > mtu))) {
 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(ip_skb_dst_mtu(skb)));
+			  htonl(mtu));
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -471,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	 */
 
 	hlen = iph->ihl * 4;
-	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
+	mtu = mtu - hlen;	/* Size of data space */
 #ifdef CONFIG_BRIDGE_NETFILTER
 	if (skb->nf_bridge)
 		mtu -= nf_bridge_mtu_reduction(skb);
@@ -491,7 +535,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
-		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+		    ip_is_fragment(iph) ||
 		    skb_cloned(skb))
 			goto slow_path;
 
@@ -584,6 +628,11 @@ slow_path_clean:
 	}
 
 slow_path:
+	/* for offloaded checksums cleanup checksum before fragmentation */
+	if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
+		goto fail;
+	iph = ip_hdr(skb);
+
 	left = skb->len - hlen;		/* Space per frame */
 	ptr = hlen;		/* Where to start from */
 
@@ -608,7 +657,7 @@ slow_path:
 		/* IF: it doesn't fit, use 'mtu' - the data space left */
 		if (len > mtu)
 			len = mtu;
-		/* IF: we are not sending upto and including the packet end
+		/* IF: we are not sending up to and including the packet end
 		   then align the next start on an eight byte boundary */
 		if (len < left)	{
 			len &= ~7;
@@ -691,7 +740,7 @@ slow_path:
 
 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
 	}
-	kfree_skb(skb);
+	consume_skb(skb);
 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
 	return err;
 
@@ -732,10 +781,11 @@ csum_page(struct page *page, int offset, int copy)
 }
 
 static inline int ip_ufo_append_data(struct sock *sk,
+			struct sk_buff_head *queue,
 			int getfrag(void *from, char *to, int offset, int len,
 			       int odd, struct sk_buff *skb),
 			void *from, int length, int hh_len, int fragheaderlen,
-			int transhdrlen, int mtu, unsigned int flags)
+			int transhdrlen, int maxfraglen, unsigned int flags)
 {
 	struct sk_buff *skb;
 	int err;
@@ -744,7 +794,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
 	 * device, so create one single skb packet containing complete
 	 * udp datagram
 	 */
-	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+	if ((skb = skb_peek_tail(queue)) == NULL) {
 		skb = sock_alloc_send_skb(sk,
 			hh_len + fragheaderlen + transhdrlen + 20,
 			(flags & MSG_DONTWAIT), &err);
@@ -764,104 +814,62 @@ static inline int ip_ufo_append_data(struct sock *sk,
 		/* initialize protocol header pointer */
 		skb->transport_header = skb->network_header + fragheaderlen;
 
-		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum = 0;
-		sk->sk_sndmsg_off = 0;
 
-		/* specify the length of each IP datagram fragment */
-		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
-		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
-		__skb_queue_tail(&sk->sk_write_queue, skb);
+
+		__skb_queue_tail(queue, skb);
+	} else if (skb_is_gso(skb)) {
+		goto append;
 	}
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	/* specify the length of each IP datagram fragment */
+	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
+	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+append:
 	return skb_append_datato_frags(sk, skb, getfrag, from,
 				       (length - transhdrlen));
 }
 
-/*
- *	ip_append_data() and ip_append_page() can make one large IP datagram
- *	from many pieces of data. Each pieces will be holded on the socket
- *	until ip_push_pending_frames() is called. Each piece can be a page
- *	or non-page data.
- *
- *	Not only UDP, other transport protocols - e.g. raw sockets - can use
- *	this interface potentially.
- *
- *	LATER: length must be adjusted by pad at tail, when it is required.
- */
-int ip_append_data(struct sock *sk,
-		   int getfrag(void *from, char *to, int offset, int len,
-			       int odd, struct sk_buff *skb),
-		   void *from, int length, int transhdrlen,
-		   struct ipcm_cookie *ipc, struct rtable **rtp,
-		   unsigned int flags)
+static int __ip_append_data(struct sock *sk,
+			    struct flowi4 *fl4,
+			    struct sk_buff_head *queue,
+			    struct inet_cork *cork,
+			    struct page_frag *pfrag,
+			    int getfrag(void *from, char *to, int offset,
+					int len, int odd, struct sk_buff *skb),
+			    void *from, int length, int transhdrlen,
+			    unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sk_buff *skb;
-	struct ip_options *opt = NULL;
+	struct ip_options *opt = cork->opt;
 	int hh_len;
 	int exthdrlen;
 	int mtu;
 	int copy;
 	int err;
 	int offset = 0;
-	unsigned int maxfraglen, fragheaderlen;
+	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
 	int csummode = CHECKSUM_NONE;
-	struct rtable *rt;
+	struct rtable *rt = (struct rtable *)cork->dst;
 
-	if (flags&MSG_PROBE)
-		return 0;
+	skb = skb_peek_tail(queue);
 
-	if (skb_queue_empty(&sk->sk_write_queue)) {
-		/*
-		 * setup for corking.
-		 */
-		opt = ipc->opt;
-		if (opt) {
-			if (inet->cork.opt == NULL) {
-				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
-				if (unlikely(inet->cork.opt == NULL))
-					return -ENOBUFS;
-			}
-			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
-			inet->cork.flags |= IPCORK_OPT;
-			inet->cork.addr = ipc->addr;
-		}
-		rt = *rtp;
-		if (unlikely(!rt))
-			return -EFAULT;
-		/*
-		 * We steal reference to this route, caller should not release it
-		 */
-		*rtp = NULL;
-		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-					    rt->dst.dev->mtu :
-					    dst_mtu(rt->dst.path);
-		inet->cork.dst = &rt->dst;
-		inet->cork.length = 0;
-		sk->sk_sndmsg_page = NULL;
-		sk->sk_sndmsg_off = 0;
-		exthdrlen = rt->dst.header_len;
-		length += exthdrlen;
-		transhdrlen += exthdrlen;
-	} else {
-		rt = (struct rtable *)inet->cork.dst;
-		if (inet->cork.flags & IPCORK_OPT)
-			opt = inet->cork.opt;
+	exthdrlen = !skb ? rt->dst.header_len : 0;
+	mtu = cork->fragsize;
 
-		transhdrlen = 0;
-		exthdrlen = 0;
-		mtu = inet->cork.fragsize;
-	}
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
 
-	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
-			       mtu-exthdrlen);
+	if (cork->length + length > maxnonfragsize - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+			       mtu - (opt ? opt->optlen : 0));
 		return -EMSGSIZE;
 	}
@@ -875,15 +883,13 @@ int ip_append_data(struct sock *sk,
 	    !exthdrlen)
 		csummode = CHECKSUM_PARTIAL;
 
-	skb = skb_peek_tail(&sk->sk_write_queue);
-
-	inet->cork.length += length;
+	cork->length += length;
 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
-	    (rt->dst.dev->features & NETIF_F_UFO)) {
-		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
-					 fragheaderlen, transhdrlen, mtu,
-					 flags);
+	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
+		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
+					 hh_len, fragheaderlen, transhdrlen,
+					 maxfraglen, flags);
 		if (err)
 			goto error;
 		return 0;
@@ -933,17 +939,16 @@ alloc_new_skb:
 			else
 				alloclen = fraglen;
 
+			alloclen += exthdrlen;
+
 			/* The last fragment gets additional space at tail.
 			 * Note, with MSG_MORE we overallocate on fragments,
 			 * because we have no idea what fragment will be
 			 * the last.
 			 */
-			if (datalen == length + fraggap) {
+			if (datalen == length + fraggap)
 				alloclen += rt->dst.trailer_len;
-				/* make sure mtu is not reached */
-				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
-					datalen -= ALIGN(rt->dst.trailer_len, 8);
-			}
+
 			if (transhdrlen) {
 				skb = sock_alloc_send_skb(sk,
 						alloclen + hh_len + 15,
@@ -960,7 +965,7 @@ alloc_new_skb:
 				else
 					/* only the initial fragment is
 					   time stamped */
-					ipc->tx_flags = 0;
+					cork->tx_flags = 0;
 			}
 			if (skb == NULL)
 				goto error;
@@ -971,16 +976,16 @@ alloc_new_skb:
 			skb->ip_summed = csummode;
 			skb->csum = 0;
 			skb_reserve(skb, hh_len);
-			skb_shinfo(skb)->tx_flags = ipc->tx_flags;
+			skb_shinfo(skb)->tx_flags = cork->tx_flags;
 
 			/*
 			 *	Find where to start putting bytes.
 			 */
-			data = skb_put(skb, fraglen);
+			data = skb_put(skb, fraglen + exthdrlen);
 			skb_set_network_header(skb, exthdrlen);
 			skb->transport_header = (skb->network_header +
 						 fragheaderlen);
-			data += fragheaderlen;
+			data += fragheaderlen + exthdrlen;
 
 			if (fraggap) {
 				skb->csum = skb_copy_and_csum_bits(
@@ -1008,7 +1013,7 @@ alloc_new_skb:
 			/*
 			 * Put the packet on the pending queue.
 			 */
-			__skb_queue_tail(&sk->sk_write_queue, skb);
+			__skb_queue_tail(queue, skb);
 			continue;
 		}
 
@@ -1027,46 +1032,30 @@ alloc_new_skb:
 			}
 		} else {
 			int i = skb_shinfo(skb)->nr_frags;
-			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-			struct page *page = sk->sk_sndmsg_page;
-			int off = sk->sk_sndmsg_off;
-			unsigned int left;
-
-			if (page && (left = PAGE_SIZE - off) > 0) {
-				if (copy >= left)
-					copy = left;
-				if (page != frag->page) {
-					if (i == MAX_SKB_FRAGS) {
-						err = -EMSGSIZE;
-						goto error;
-					}
-					get_page(page);
-					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
-					frag = &skb_shinfo(skb)->frags[i];
-				}
-			} else if (i < MAX_SKB_FRAGS) {
-				if (copy > PAGE_SIZE)
-					copy = PAGE_SIZE;
-				page = alloc_pages(sk->sk_allocation, 0);
-				if (page == NULL)  {
-					err = -ENOMEM;
-					goto error;
-				}
-				sk->sk_sndmsg_page = page;
-				sk->sk_sndmsg_off = 0;
-				skb_fill_page_desc(skb, i, page, 0, 0);
-				frag = &skb_shinfo(skb)->frags[i];
-			} else {
-				err = -EMSGSIZE;
-				goto error;
-			}
-			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
-				err = -EFAULT;
+			err = -ENOMEM;
+			if (!sk_page_frag_refill(sk, pfrag))
 				goto error;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				err = -EMSGSIZE;
+				if (i == MAX_SKB_FRAGS)
+					goto error;
+
+				__skb_fill_page_desc(skb, i, pfrag->page,
+						     pfrag->offset, 0);
+				skb_shinfo(skb)->nr_frags = ++i;
+				get_page(pfrag->page);
 			}
-			sk->sk_sndmsg_off += copy;
-			frag->size += copy;
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
+			if (getfrag(from,
+				    page_address(pfrag->page) + pfrag->offset,
+				    offset, copy, skb->len, skb) < 0)
+				goto error_efault;
+
+			pfrag->offset += copy;
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 			skb->len += copy;
 			skb->data_len += copy;
 			skb->truesize += copy;
@@ -1078,24 +1067,104 @@ alloc_new_skb:
 
 	return 0;
 
+error_efault:
+	err = -EFAULT;
 error:
-	inet->cork.length -= length;
+	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 	return err;
 }
 
-ssize_t	ip_append_page(struct sock *sk, struct page *page,
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+			 struct ipcm_cookie *ipc, struct rtable **rtp)
+{
+	struct ip_options_rcu *opt;
+	struct rtable *rt;
+
+	/*
+	 * setup for corking.
+	 */
+	opt = ipc->opt;
+	if (opt) {
+		if (cork->opt == NULL) {
+			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+					    sk->sk_allocation);
+			if (unlikely(cork->opt == NULL))
+				return -ENOBUFS;
+		}
+		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
+		cork->flags |= IPCORK_OPT;
+		cork->addr = ipc->addr;
+	}
+	rt = *rtp;
+	if (unlikely(!rt))
+		return -EFAULT;
+	/*
+	 * We steal reference to this route, caller should not release it
+	 */
+	*rtp = NULL;
+	cork->fragsize = ip_sk_use_pmtu(sk) ?
+			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+	cork->dst = &rt->dst;
+	cork->length = 0;
+	cork->ttl = ipc->ttl;
+	cork->tos = ipc->tos;
+	cork->priority = ipc->priority;
+	cork->tx_flags = ipc->tx_flags;
+
+	return 0;
+}
+
+/*
+ *	ip_append_data() and ip_append_page() can make one large IP datagram
+ *	from many pieces of data. Each pieces will be holded on the socket
+ *	until ip_push_pending_frames() is called. Each piece can be a page
+ *	or non-page data.
+ *
+ *	Not only UDP, other transport protocols - e.g. raw sockets - can use
+ *	this interface potentially.
+ *
+ *	LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk, struct flowi4 *fl4,
+		   int getfrag(void *from, char *to, int offset, int len,
+			       int odd, struct sk_buff *skb),
+		   void *from, int length, int transhdrlen,
+		   struct ipcm_cookie *ipc, struct rtable **rtp,
+		   unsigned int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int err;
+
+	if (flags&MSG_PROBE)
+		return 0;
+
+	if (skb_queue_empty(&sk->sk_write_queue)) {
+		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
+		if (err)
+			return err;
+	} else {
+		transhdrlen = 0;
+	}
+
+	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+				sk_page_frag(sk), getfrag,
+				from, length, transhdrlen, flags);
+}
+
+ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 		       int offset, size_t size, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sk_buff *skb;
 	struct rtable *rt;
 	struct ip_options *opt = NULL;
+	struct inet_cork *cork;
 	int hh_len;
 	int mtu;
 	int len;
 	int err;
-	unsigned int maxfraglen, fragheaderlen, fraggap;
+	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
 
 	if (inet->hdrincl)
 		return -EPERM;
@@ -1106,28 +1175,31 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 	if (skb_queue_empty(&sk->sk_write_queue))
 		return -EINVAL;
 
-	rt = (struct rtable *)inet->cork.dst;
-	if (inet->cork.flags & IPCORK_OPT)
-		opt = inet->cork.opt;
+	cork = &inet->cork.base;
+	rt = (struct rtable *)cork->dst;
+	if (cork->flags & IPCORK_OPT)
+		opt = cork->opt;
 
 	if (!(rt->dst.dev->features&NETIF_F_SG))
 		return -EOPNOTSUPP;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
-	mtu = inet->cork.fragsize;
+	mtu = cork->fragsize;
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
 
-	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
+	if (cork->length + size > maxnonfragsize - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+			       mtu - (opt ? opt->optlen : 0));
 		return -EMSGSIZE;
 	}
 
 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 		return -EINVAL;
 
-	inet->cork.length += size;
+	cork->length += size;
 	if ((size + skb->len > mtu) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
 	    (rt->dst.dev->features & NETIF_F_UFO)) {
@@ -1197,7 +1269,7 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 		if (len > size)
 			len = size;
 		if (skb_can_coalesce(skb, i, page, offset)) {
-			skb_shinfo(skb)->frags[i-1].size += len;
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
 		} else if (i < MAX_SKB_FRAGS) {
 			get_page(page);
 			skb_fill_page_desc(skb, i, page, offset, len);
@@ -1222,45 +1294,47 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 	return 0;
 
 error:
-	inet->cork.length -= size;
+	cork->length -= size;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
 	return err;
 }
 
-static void ip_cork_release(struct inet_sock *inet)
+static void ip_cork_release(struct inet_cork *cork)
 {
-	inet->cork.flags &= ~IPCORK_OPT;
-	kfree(inet->cork.opt);
-	inet->cork.opt = NULL;
-	dst_release(inet->cork.dst);
-	inet->cork.dst = NULL;
+	cork->flags &= ~IPCORK_OPT;
+	kfree(cork->opt);
+	cork->opt = NULL;
+	dst_release(cork->dst);
+	cork->dst = NULL;
}
 
 /*
  *	Combined all pending IP fragments on the socket as one IP datagram
  *	and push them out.
  */
-int ip_push_pending_frames(struct sock *sk)
+struct sk_buff *__ip_make_skb(struct sock *sk,
+			      struct flowi4 *fl4,
+			      struct sk_buff_head *queue,
+			      struct inet_cork *cork)
 {
 	struct sk_buff *skb, *tmp_skb;
 	struct sk_buff **tail_skb;
 	struct inet_sock *inet = inet_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ip_options *opt = NULL;
-	struct rtable *rt = (struct rtable *)inet->cork.dst;
+	struct rtable *rt = (struct rtable *)cork->dst;
 	struct iphdr *iph;
 	__be16 df = 0;
 	__u8 ttl;
-	int err = 0;
 
-	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+	if ((skb = __skb_dequeue(queue)) == NULL)
 		goto out;
 	tail_skb = &(skb_shinfo(skb)->frag_list);
 
 	/* move skb->data to ip header from ext header */
 	if (skb->data < skb_network_header(skb))
 		__skb_pull(skb, skb_network_offset(skb));
-	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
 		__skb_pull(tmp_skb, skb_network_header_len(skb));
 		*tail_skb = tmp_skb;
 		tail_skb = &(tmp_skb->next);
@@ -1275,84 +1349,141 @@ int ip_push_pending_frames(struct sock *sk)
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc < IP_PMTUDISC_DO)
-		skb->local_df = 1;
+	skb->ignore_df = ip_sk_ignore_df(sk);
 
 	/* DF bit is set when we want to see DF on outgoing frames.
-	 * If local_df is set too, we still allow to fragment this frame
+	 * If ignore_df is set too, we still allow to fragment this frame
 	 * locally. */
-	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
 	    (skb->len <= dst_mtu(&rt->dst) &&
 	     ip_dont_fragment(sk, &rt->dst)))
 		df = htons(IP_DF);
 
-	if (inet->cork.flags & IPCORK_OPT)
-		opt = inet->cork.opt;
+	if (cork->flags & IPCORK_OPT)
+		opt = cork->opt;
 
-	if (rt->rt_type == RTN_MULTICAST)
+	if (cork->ttl != 0)
+		ttl = cork->ttl;
+	else if (rt->rt_type == RTN_MULTICAST)
 		ttl = inet->mc_ttl;
 	else
 		ttl = ip_select_ttl(inet, &rt->dst);
 
-	iph = (struct iphdr *)skb->data;
+	iph = ip_hdr(skb);
 	iph->version = 4;
 	iph->ihl = 5;
-	if (opt) {
-		iph->ihl += opt->optlen>>2;
-		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
-	}
-	iph->tos = inet->tos;
+	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
 	iph->frag_off = df;
-	ip_select_ident(iph, &rt->dst, sk);
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
-	iph->saddr = rt->rt_src;
-	iph->daddr = rt->rt_dst;
+	ip_copy_addrs(iph, fl4);
+	ip_select_ident(skb, sk);
 
-	skb->priority = sk->sk_priority;
+	if (opt) {
+		iph->ihl += opt->optlen>>2;
+		ip_options_build(skb, opt, cork->addr, rt, 0);
+	}
+
+	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
 	/*
 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
 	 * on dst refcount
 	 */
-	inet->cork.dst = NULL;
+	cork->dst = NULL;
 	skb_dst_set(skb, &rt->dst);
 
 	if (iph->protocol == IPPROTO_ICMP)
 		icmp_out_count(net, ((struct icmphdr *)
 			skb_transport_header(skb))->type);
 
-	/* Netfilter gets whole the not fragmented skb. */
+	ip_cork_release(cork);
+out:
+	return skb;
+}
+
+int ip_send_skb(struct net *net, struct sk_buff *skb)
+{
+	int err;
+
 	err = ip_local_out(skb);
 	if (err) {
 		if (err > 0)
 			err = net_xmit_errno(err);
 		if (err)
-			goto error;
+			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
 	}
 
-out:
-	ip_cork_release(inet);
 	return err;
+}
 
-error:
-	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
-	goto out;
+int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
+{
+	struct sk_buff *skb;
+
+	skb = ip_finish_skb(sk, fl4);
+	if (!skb)
+		return 0;
+
+	/* Netfilter gets whole the not fragmented skb. */
+	return ip_send_skb(sock_net(sk), skb);
 }
 
 /*
  *	Throw away all pending data on the socket.
  */
-void ip_flush_pending_frames(struct sock *sk)
+static void __ip_flush_pending_frames(struct sock *sk,
+				      struct sk_buff_head *queue,
+				      struct inet_cork *cork)
 {
 	struct sk_buff *skb;
 
-	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+	while ((skb = __skb_dequeue_tail(queue)) != NULL)
 		kfree_skb(skb);
 
-	ip_cork_release(inet_sk(sk));
+	ip_cork_release(cork);
 }
 
+void ip_flush_pending_frames(struct sock *sk)
+{
+	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
+}
+
+struct sk_buff *ip_make_skb(struct sock *sk,
+			    struct flowi4 *fl4,
+			    int getfrag(void *from, char *to, int offset,
+					int len, int odd, struct sk_buff *skb),
+			    void *from, int length, int transhdrlen,
+			    struct ipcm_cookie *ipc, struct rtable **rtp,
+			    unsigned int flags)
+{
+	struct inet_cork cork;
+	struct sk_buff_head queue;
+	int err;
+
+	if (flags & MSG_PROBE)
+		return NULL;
+
+	__skb_queue_head_init(&queue);
+
+	cork.flags = 0;
+	cork.addr = 0;
+	cork.opt = NULL;
+	err = ip_setup_cork(sk, &cork, ipc, rtp);
+	if (err)
+		return ERR_PTR(err);
+
+	err = __ip_append_data(sk, fl4, &queue, &cork,
+			       &current->task_frag, getfrag,
+			       from, length, transhdrlen, flags);
+	if (err) {
+		__ip_flush_pending_frames(sk, &queue, &cork);
+		return ERR_PTR(err);
+	}
+
+	return __ip_make_skb(sk, fl4, &queue, &cork);
+}
 
 /*
  *	Fetch data from kernel space and fill in checksum if needed.
@@ -1369,74 +1500,88 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 
 /*
  *	Generic function to send a packet as reply to another packet.
- *	Used to send TCP resets so far. ICMP should use this function too.
+ *	Used to send some TCP resets/acks so far.
  *
- *	Should run single threaded per socket because it uses the sock
- *     	structure to pass arguments.
+ *	Use a fake percpu inet socket to avoid false sharing and contention.
 */
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
-		   unsigned int len)
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+	.sk = {
+		.__sk_common = {
+			.skc_refcnt = ATOMIC_INIT(1),
+		},
+		.sk_wmem_alloc	= ATOMIC_INIT(1),
+		.sk_allocation	= GFP_ATOMIC,
+		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
+	},
+	.pmtudisc	= IP_PMTUDISC_WANT,
+	.uc_ttl		= -1,
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+			   __be32 saddr, const struct ip_reply_arg *arg,
+			   unsigned int len)
 {
-	struct inet_sock *inet = inet_sk(sk);
-	struct {
-		struct ip_options	opt;
-		char			data[40];
-	} replyopts;
+	struct ip_options_data replyopts;
 	struct ipcm_cookie ipc;
-	__be32 daddr;
+	struct flowi4 fl4;
 	struct rtable *rt = skb_rtable(skb);
+	struct sk_buff *nskb;
+	struct sock *sk;
+	struct inet_sock *inet;
 
-	if (ip_options_echo(&replyopts.opt, skb))
+	if (ip_options_echo(&replyopts.opt.opt, skb))
 		return;
 
-	daddr = ipc.addr = rt->rt_src;
+	ipc.addr = daddr;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
 
-	if (replyopts.opt.optlen) {
+	if (replyopts.opt.opt.optlen) {
 		ipc.opt = &replyopts.opt;
 
-		if (ipc.opt->srr)
-			daddr = replyopts.opt.faddr;
+		if (replyopts.opt.opt.srr)
+			daddr = replyopts.opt.opt.faddr;
 	}
 
-	{
-		struct flowi fl = { .oif = arg->bound_dev_if,
-				    .fl4_dst = daddr,
-				    .fl4_src = rt->rt_spec_dst,
-				    .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
-				    .fl_ip_sport = tcp_hdr(skb)->dest,
-				    .fl_ip_dport = tcp_hdr(skb)->source,
-				    .proto = sk->sk_protocol,
-				    .flags = ip_reply_arg_flowi_flags(arg) };
-		security_skb_classify_flow(skb, &fl);
-		if (ip_route_output_key(sock_net(sk), &rt, &fl))
-			return;
-	}
+	flowi4_init_output(&fl4, arg->bound_dev_if,
+			   IP4_REPLY_MARK(net, skb->mark),
+			   RT_TOS(arg->tos),
+			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
+			   ip_reply_arg_flowi_flags(arg),
+			   daddr, saddr,
+			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		return;
 
-	/* And let IP do all the hard work.
+	inet = &get_cpu_var(unicast_sock);
 
-	   This chunk is not reenterable, hence spinlock.
-	   Note that it uses the fact, that this function is called
-	   with locally disabled BH and that sk cannot be already spinlocked.
-	 */
-	bh_lock_sock(sk);
-	inet->tos = ip_hdr(skb)->tos;
+	inet->tos = arg->tos;
+	sk = &inet->sk;
 	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
-	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+	sock_net_set(sk, net);
+	__skb_queue_head_init(&sk->sk_write_queue);
+	sk->sk_sndbuf = sysctl_wmem_default;
+	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
 		       &ipc, &rt, MSG_DONTWAIT);
-	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+	nskb = skb_peek(&sk->sk_write_queue);
+	if (nskb) {
 		if (arg->csumoffset >= 0)
-			*((__sum16 *)skb_transport_header(skb) +
-			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
+			*((__sum16 *)skb_transport_header(nskb) +
+			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
								arg->csum));
-		skb->ip_summed = CHECKSUM_NONE;
-		ip_push_pending_frames(sk);
+		nskb->ip_summed = CHECKSUM_NONE;
+		skb_orphan(nskb);
+		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
+		ip_push_pending_frames(sk, &fl4);
 	}
-	bh_unlock_sock(sk);
+	put_cpu_var(unicast_sock);
 
 	ip_rt_put(rt);
 }
@@ -1446,7 +1591,7 @@ void __init ip_init(void)
 	ip_rt_init();
 	inet_initpeers();
 
-#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
-	igmp_mc_proc_init();
+#if defined(CONFIG_IP_MULTICAST)
+	igmp_mc_init();
 #endif
 }
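A note on the un-inlined ip_send_check(): it zeroes iph->check and recomputes it with ip_fast_csum(), which is an arch-optimized ones'-complement Internet checksum over the header. A minimal userspace sketch of the same computation (portable byte-pair summing stands in for ip_fast_csum(); the helper name and the sample header bytes are only for illustration):

#include <stddef.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Ones'-complement sum over an IPv4 header (even length assumed).
 * This loop is a portable illustration of what ip_fast_csum() does
 * with arch-specific assembly. */
static uint16_t ip_hdr_csum(const uint8_t *h, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < len; i += 2)
		sum += (uint32_t)h[i] << 8 | h[i + 1];
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;		/* host order; store via htons() */
}

int main(void)
{
	/* Sample 20-byte header with the check field (bytes 10-11) zeroed. */
	const uint8_t h[20] = {
		0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
		0xc0, 0xa8, 0x00, 0xc7,
	};

	/* Prints 0xb861 for this header. */
	printf("checksum = 0x%04" PRIx16 "\n", ip_hdr_csum(h, sizeof(h)));
	return 0;
}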
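The new ip_finish_output_gso() slow path segments an oversized GSO skb and pushes each resulting segment through ip_fragment(), remembering the first error while still consuming the whole chain. That loop shape (detach before output, first error wins, nothing leaked) is easy to get wrong, so here is a self-contained sketch of just the control flow, with invented seg/output_one stand-ins for the skb chain and output function:

#include <stdio.h>

struct seg {
	struct seg *next;
	int id;
};

/* Stand-in for ip_fragment()/ip_finish_output2(): fails for segment 2. */
static int output_one(struct seg *s)
{
	printf("sending segment %d\n", s->id);
	return s->id == 2 ? -1 : 0;
}

/* Same shape as the slow-path loop in the diff: detach each segment
 * before handing it on, keep the first error, but still walk the whole
 * chain so every segment is consumed. */
static int output_all(struct seg *segs)
{
	int ret = 0;

	do {
		struct seg *nskb = segs->next;
		int err;

		segs->next = NULL;
		err = output_one(segs);
		if (err && ret == 0)
			ret = err;
		segs = nskb;
	} while (segs);

	return ret;
}

int main(void)
{
	struct seg c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };

	printf("first error: %d\n", output_all(&a));	/* prints -1 */
	return 0;
}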
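ip_copy_addrs() relies on saddr and daddr being adjacent in struct flowi4, so one memcpy (which compilers typically lower to a single 64-bit store) fills both header fields; BUILD_BUG_ON enforces the layout at compile time. The same pattern in portable userspace C, with hypothetical struct and function names:

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the relevant iphdr/flowi4 fields. */
struct hdr  { uint32_t saddr, daddr; };
struct flow { uint32_t saddr, daddr; };

/* Compile-time layout check (the userspace cousin of BUILD_BUG_ON),
 * then one 8-byte copy covering both addresses. */
static void copy_addrs(struct hdr *h, const struct flow *f)
{
	_Static_assert(offsetof(struct flow, daddr) ==
		       offsetof(struct flow, saddr) + sizeof(uint32_t),
		       "saddr/daddr must be adjacent");
	memcpy(&h->saddr, &f->saddr, 2 * sizeof(uint32_t));
}

int main(void)
{
	struct flow f = { 0x7f000001, 0xc0a80101 };
	struct hdr h;

	copy_addrs(&h, &f);
	printf("saddr=%08" PRIx32 " daddr=%08" PRIx32 "\n", h.saddr, h.daddr);
	return 0;
}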
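Both append paths compute maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen because the IP fragment offset field counts 8-byte units, so every fragment except the last must carry a multiple of 8 payload bytes. A small sketch of that arithmetic (the MTU and payload sizes are arbitrary sample values):

#include <stdio.h>

int main(void)
{
	unsigned int mtu = 1006, hlen = 20;	/* sample MTU, no IP options */
	/* Round the per-fragment payload down to a multiple of 8,
	 * exactly as the diff does. */
	unsigned int maxfraglen = ((mtu - hlen) & ~7u) + hlen;
	unsigned int step = maxfraglen - hlen;
	unsigned int payload = 4000, off = 0;

	printf("maxfraglen=%u, payload per fragment=%u\n", maxfraglen, step);
	while (payload > 0) {
		unsigned int chunk = payload > step ? step : payload;

		printf("fragment: offset=%u bytes (field=%u), len=%u, MF=%d\n",
		       off, off >> 3, chunk, payload > chunk);
		off += chunk;
		payload -= chunk;
	}
	return 0;
}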
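__ip_append_data() now copies user data into a shared page fragment (sk_page_frag_refill()) and, via skb_can_coalesce(), extends the previous fragment descriptor when the new bytes land immediately after it, replacing the old sk_sndmsg_page/sk_sndmsg_off bookkeeping. A userspace analogue of the refill-or-coalesce idea (all names here are invented for illustration):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ   4096
#define MAX_FRAGS 4

/* A chunk descriptor and a moving write cursor over one shared buffer. */
struct frag  { char *page; size_t off, size; };
struct pfrag { char *page; size_t offset; };

static int refill(struct pfrag *p)
{
	if (!p->page) {
		p->page = malloc(PAGE_SZ);
		p->offset = 0;
	}
	return p->page != NULL && p->offset < PAGE_SZ;
}

/* New data can extend the last descriptor if it lands right after it. */
static int can_coalesce(const struct frag *f, int n, const struct pfrag *p)
{
	return n > 0 && f[n - 1].page == p->page &&
	       f[n - 1].off + f[n - 1].size == p->offset;
}

int main(void)
{
	struct pfrag p = { NULL, 0 };
	struct frag frags[MAX_FRAGS];
	int nfrags = 0;
	const char *msgs[] = { "hello ", "world", "!" };

	for (int i = 0; i < 3; i++) {
		size_t len = strlen(msgs[i]);

		if (!refill(&p))
			return 1;
		if (!can_coalesce(frags, nfrags, &p)) {
			if (nfrags == MAX_FRAGS)
				return 1;
			frags[nfrags++] = (struct frag){ p.page, p.offset, 0 };
		}
		memcpy(p.page + p.offset, msgs[i], len);
		p.offset += len;
		frags[nfrags - 1].size += len;	/* coalesced growth */
	}
	/* Three appends, one descriptor: "hello world!" */
	printf("%d descriptor(s): \"%.*s\"\n", nfrags,
	       (int)frags[0].size, frags[0].page + frags[0].off);
	free(p.page);
	return 0;
}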
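Finally, ip_send_unicast_reply() replaces the caller's socket plus bh_lock_sock() with a preinitialized DEFINE_PER_CPU fake socket, so concurrent CPUs never contend on a single sock while building resets/acks. A rough userspace analogue of "one scratch object per execution context" uses C11 thread-local storage; threads stand in for CPUs, and every name below is invented (requires <threads.h>, e.g. glibc 2.28+):

#include <stdio.h>
#include <threads.h>

/* Invented stand-in for the per-cpu scratch socket. */
struct scratch_sock {
	int tos;
	int priority;
};

/* One private, preinitialized instance per thread, so the reply path
 * takes no lock, the same idea as DEFINE_PER_CPU in the diff. */
static thread_local struct scratch_sock unicast_sock = { .tos = -1 };

static int reply_worker(void *arg)
{
	unicast_sock.tos = *(int *)arg;	/* no race: thread-private */
	printf("worker sees tos=%d\n", unicast_sock.tos);
	return 0;
}

int main(void)
{
	thrd_t t1, t2;
	int a = 1, b = 2;

	thrd_create(&t1, reply_worker, &a);
	thrd_create(&t2, reply_worker, &b);
	thrd_join(t1, NULL);
	thrd_join(t2, NULL);
	return 0;
}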
