diff options
Diffstat (limited to 'net/ipv4/ip_output.c')
| -rw-r--r-- | net/ipv4/ip_output.c | 149 | 
1 files changed, 103 insertions, 46 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a04d872c54f..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb)  		       skb_dst(skb)->dev, dst_output);  } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)  {  	int err;  	err = __ip_local_out(skb);  	if (likely(err == 1)) -		err = dst_output(skb); +		err = dst_output_sk(sk, skb);  	return err;  } -EXPORT_SYMBOL_GPL(ip_local_out); +EXPORT_SYMBOL_GPL(ip_local_out_sk);  static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)  { @@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,  	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);  	iph->saddr    = saddr;  	iph->protocol = sk->sk_protocol; -	ip_select_ident(skb, &rt->dst, sk); +	ip_select_ident(skb, sk);  	if (opt && opt->opt.optlen) {  		iph->ihl += opt->opt.optlen>>2; @@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb)  	return -EINVAL;  } +static int ip_finish_output_gso(struct sk_buff *skb) +{ +	netdev_features_t features; +	struct sk_buff *segs; +	int ret = 0; + +	/* common case: locally created skb or seglen is <= mtu */ +	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || +	      skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) +		return ip_finish_output2(skb); + +	/* Slowpath -  GSO segment length is exceeding the dst MTU. +	 * +	 * This can happen in two cases: +	 * 1) TCP GRO packet, DF bit not set +	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly +	 * from host network stack. +	 */ +	features = netif_skb_features(skb); +	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); +	if (IS_ERR(segs)) { +		kfree_skb(skb); +		return -ENOMEM; +	} + +	consume_skb(skb); + +	do { +		struct sk_buff *nskb = segs->next; +		int err; + +		segs->next = NULL; +		err = ip_fragment(segs, ip_finish_output2); + +		if (err && ret == 0) +			ret = err; +		segs = nskb; +	} while (segs); + +	return ret; +} +  static int ip_finish_output(struct sk_buff *skb)  {  #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -220,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb)  		return dst_output(skb);  	}  #endif -	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) +	if (skb_is_gso(skb)) +		return ip_finish_output_gso(skb); + +	if (skb->len > ip_skb_dst_mtu(skb))  		return ip_fragment(skb, ip_finish_output2); -	else -		return ip_finish_output2(skb); + +	return ip_finish_output2(skb);  } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb)  { -	struct sock *sk = skb->sk;  	struct rtable *rt = skb_rtable(skb);  	struct net_device *dev = rt->dst.dev; @@ -287,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb)  			    !(IPCB(skb)->flags & IPSKB_REROUTED));  } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb)  {  	struct net_device *dev = skb_dst(skb)->dev; @@ -315,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)  	       sizeof(fl4->saddr) + sizeof(fl4->daddr));  } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)  { -	struct sock *sk = skb->sk;  	struct inet_sock *inet = inet_sk(sk);  	struct ip_options_rcu *inet_opt;  	struct flowi4 *fl4; @@ -371,7 +415,7 @@ packet_routed:  	skb_reset_network_header(skb);  	iph = ip_hdr(skb);  	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); -	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) +	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)  		iph->frag_off = htons(IP_DF);  	else  		iph->frag_off = 0; @@ -386,9 +430,9 @@ packet_routed:  		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);  	} -	ip_select_ident_more(skb, &rt->dst, sk, -			     (skb_shinfo(skb)->gso_segs ?: 1) - 1); +	ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); +	/* TODO : should we use skb->sk here instead of sk ? */  	skb->priority = sk->sk_priority;  	skb->mark = sk->sk_mark; @@ -422,9 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)  	to->tc_index = from->tc_index;  #endif  	nf_copy(to, from); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) -	to->nf_trace = from->nf_trace; -#endif  #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)  	to->ipvs_property = from->ipvs_property;  #endif @@ -458,12 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))  	iph = ip_hdr(skb); -	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || +	mtu = ip_skb_dst_mtu(skb); +	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||  		     (IPCB(skb)->frag_max_size && -		      IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { +		      IPCB(skb)->frag_max_size > mtu))) {  		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);  		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, -			  htonl(ip_skb_dst_mtu(skb))); +			  htonl(mtu));  		kfree_skb(skb);  		return -EMSGSIZE;  	} @@ -473,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))  	 */  	hlen = iph->ihl * 4; -	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */ +	mtu = mtu - hlen;	/* Size of data space */  #ifdef CONFIG_BRIDGE_NETFILTER  	if (skb->nf_bridge)  		mtu -= nf_bridge_mtu_reduction(skb); @@ -772,15 +814,20 @@ static inline int ip_ufo_append_data(struct sock *sk,  		/* initialize protocol header pointer */  		skb->transport_header = skb->network_header + fragheaderlen; -		skb->ip_summed = CHECKSUM_PARTIAL;  		skb->csum = 0; -		/* specify the length of each IP datagram fragment */ -		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; -		skb_shinfo(skb)->gso_type = SKB_GSO_UDP; +  		__skb_queue_tail(queue, skb); +	} else if (skb_is_gso(skb)) { +		goto append;  	} +	skb->ip_summed = CHECKSUM_PARTIAL; +	/* specify the length of each IP datagram fragment */ +	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; +	skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + +append:  	return skb_append_datato_frags(sk, skb, getfrag, from,  				       (length - transhdrlen));  } @@ -805,7 +852,7 @@ static int __ip_append_data(struct sock *sk,  	int copy;  	int err;  	int offset = 0; -	unsigned int maxfraglen, fragheaderlen; +	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;  	int csummode = CHECKSUM_NONE;  	struct rtable *rt = (struct rtable *)cork->dst; @@ -818,10 +865,11 @@ static int __ip_append_data(struct sock *sk,  	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; +	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; -	if (cork->length + length > 0xFFFF - fragheaderlen) { +	if (cork->length + length > maxnonfragsize - fragheaderlen) {  		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, -			       mtu-exthdrlen); +			       mtu - (opt ? opt->optlen : 0));  		return -EMSGSIZE;  	} @@ -1030,7 +1078,6 @@ error:  static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,  			 struct ipcm_cookie *ipc, struct rtable **rtp)  { -	struct inet_sock *inet = inet_sk(sk);  	struct ip_options_rcu *opt;  	struct rtable *rt; @@ -1056,10 +1103,13 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,  	 * We steal reference to this route, caller should not release it  	 */  	*rtp = NULL; -	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? -			 rt->dst.dev->mtu : dst_mtu(&rt->dst); +	cork->fragsize = ip_sk_use_pmtu(sk) ? +			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;  	cork->dst = &rt->dst;  	cork->length = 0; +	cork->ttl = ipc->ttl; +	cork->tos = ipc->tos; +	cork->priority = ipc->priority;  	cork->tx_flags = ipc->tx_flags;  	return 0; @@ -1114,7 +1164,7 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,  	int mtu;  	int len;  	int err; -	unsigned int maxfraglen, fragheaderlen, fraggap; +	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;  	if (inet->hdrincl)  		return -EPERM; @@ -1138,9 +1188,11 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,  	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; +	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; -	if (cork->length + size > 0xFFFF - fragheaderlen) { -		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); +	if (cork->length + size > maxnonfragsize - fragheaderlen) { +		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, +			       mtu - (opt ? opt->optlen : 0));  		return -EMSGSIZE;  	} @@ -1297,13 +1349,13 @@ struct sk_buff *__ip_make_skb(struct sock *sk,  	 * to fragment the frame generated here. No matter, what transforms  	 * how transforms change size of the packet, it will come out.  	 */ -	if (inet->pmtudisc < IP_PMTUDISC_DO) -		skb->local_df = 1; +	skb->ignore_df = ip_sk_ignore_df(sk);  	/* DF bit is set when we want to see DF on outgoing frames. -	 * If local_df is set too, we still allow to fragment this frame +	 * If ignore_df is set too, we still allow to fragment this frame  	 * locally. */ -	if (inet->pmtudisc >= IP_PMTUDISC_DO || +	if (inet->pmtudisc == IP_PMTUDISC_DO || +	    inet->pmtudisc == IP_PMTUDISC_PROBE ||  	    (skb->len <= dst_mtu(&rt->dst) &&  	     ip_dont_fragment(sk, &rt->dst)))  		df = htons(IP_DF); @@ -1311,7 +1363,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,  	if (cork->flags & IPCORK_OPT)  		opt = cork->opt; -	if (rt->rt_type == RTN_MULTICAST) +	if (cork->ttl != 0) +		ttl = cork->ttl; +	else if (rt->rt_type == RTN_MULTICAST)  		ttl = inet->mc_ttl;  	else  		ttl = ip_select_ttl(inet, &rt->dst); @@ -1319,19 +1373,19 @@ struct sk_buff *__ip_make_skb(struct sock *sk,  	iph = ip_hdr(skb);  	iph->version = 4;  	iph->ihl = 5; -	iph->tos = inet->tos; +	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;  	iph->frag_off = df;  	iph->ttl = ttl;  	iph->protocol = sk->sk_protocol;  	ip_copy_addrs(iph, fl4); -	ip_select_ident(skb, &rt->dst, sk); +	ip_select_ident(skb, sk);  	if (opt) {  		iph->ihl += opt->optlen>>2;  		ip_options_build(skb, opt, cork->addr, rt, 0);  	} -	skb->priority = sk->sk_priority; +	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;  	skb->mark = sk->sk_mark;  	/*  	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec @@ -1481,6 +1535,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,  	ipc.addr = daddr;  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1;  	if (replyopts.opt.opt.optlen) {  		ipc.opt = &replyopts.opt; @@ -1489,7 +1545,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,  			daddr = replyopts.opt.opt.faddr;  	} -	flowi4_init_output(&fl4, arg->bound_dev_if, 0, +	flowi4_init_output(&fl4, arg->bound_dev_if, +			   IP4_REPLY_MARK(net, skb->mark),  			   RT_TOS(arg->tos),  			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,  			   ip_reply_arg_flowi_flags(arg), @@ -1534,7 +1591,7 @@ void __init ip_init(void)  	ip_rt_init();  	inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) -	igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) +	igmp_mc_init();  #endif  }  | 
