Diffstat (limited to 'net/ipv4')
106 files changed, 4944 insertions, 3269 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4b81e91c80f..f032688d20d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y     := route.o inetpeer.o protocol.o \  	     tcp_offload.o datagram.o raw.o udp.o udplite.o \  	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \  	     fib_frontend.o fib_semantics.o fib_trie.o \ -	     inet_fragment.o ping.o ip_tunnel_core.o +	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o  obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o  obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o @@ -19,7 +19,7 @@ obj-$(CONFIG_PROC_FS) += proc.o  obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o  obj-$(CONFIG_IP_MROUTE) += ipmr.o  obj-$(CONFIG_NET_IPIP) += ipip.o -gre-y := gre_demux.o gre_offload.o +gre-y := gre_demux.o  obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o  obj-$(CONFIG_NET_IPGRE) += ip_gre.o  obj-$(CONFIG_NET_IPVTI) += ip_vti.o @@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o  obj-$(CONFIG_NETLABEL) += cipso_ipv4.o  obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ -		      xfrm4_output.o +		      xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cfeb85cff4f..d156b3c5f36 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -126,9 +126,6 @@  static struct list_head inetsw[SOCK_MAX];  static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); -  /* New destruction routine */  void inet_sock_destruct(struct sock *sk) @@ -245,29 +242,6 @@ out:  }  EXPORT_SYMBOL(inet_listen); -u32 inet_ehash_secret __read_mostly; -EXPORT_SYMBOL(inet_ehash_secret); - -u32 ipv6_hash_secret __read_mostly; -EXPORT_SYMBOL(ipv6_hash_secret); - -/* - * inet_ehash_secret must be set exactly once, and to a non nul value - * ipv6_hash_secret must be set exactly once. - */ -void build_ehash_secret(void) -{ -	u32 rnd; - -	do { -		get_random_bytes(&rnd, sizeof(rnd)); -	} while (rnd == 0); - -	if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) -		get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); -} -EXPORT_SYMBOL(build_ehash_secret); -  /*   *	Create an inet socket.   */ @@ -280,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,  	struct inet_sock *inet;  	struct proto *answer_prot;  	unsigned char answer_flags; -	char answer_no_check;  	int try_loading_module = 0;  	int err; -	if (unlikely(!inet_ehash_secret)) -		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) -			build_ehash_secret(); -  	sock->state = SS_UNCONNECTED;  	/* Look for the requested type/protocol pair. 
*/ @@ -342,7 +311,6 @@ lookup_protocol:  	sock->ops = answer->ops;  	answer_prot = answer->prot; -	answer_no_check = answer->no_check;  	answer_flags = answer->flags;  	rcu_read_unlock(); @@ -354,7 +322,6 @@ lookup_protocol:  		goto out;  	err = 0; -	sk->sk_no_check = answer_no_check;  	if (INET_PROTOSW_REUSE & answer_flags)  		sk->sk_reuse = SK_CAN_REUSE; @@ -369,7 +336,7 @@ lookup_protocol:  			inet->hdrincl = 1;  	} -	if (ipv4_config.no_pmtu_disc) +	if (net->ipv4.sysctl_ip_no_pmtu_disc)  		inet->pmtudisc = IP_PMTUDISC_DONT;  	else  		inet->pmtudisc = IP_PMTUDISC_WANT; @@ -1032,7 +999,6 @@ static struct inet_protosw inetsw_array[] =  		.protocol =   IPPROTO_TCP,  		.prot =       &tcp_prot,  		.ops =        &inet_stream_ops, -		.no_check =   0,  		.flags =      INET_PROTOSW_PERMANENT |  			      INET_PROTOSW_ICSK,  	}, @@ -1042,7 +1008,6 @@ static struct inet_protosw inetsw_array[] =  		.protocol =   IPPROTO_UDP,  		.prot =       &udp_prot,  		.ops =        &inet_dgram_ops, -		.no_check =   UDP_CSUM_DEFAULT,  		.flags =      INET_PROTOSW_PERMANENT,         }, @@ -1051,7 +1016,6 @@ static struct inet_protosw inetsw_array[] =  		.protocol =   IPPROTO_ICMP,  		.prot =       &ping_prot,  		.ops =        &inet_dgram_ops, -		.no_check =   UDP_CSUM_DEFAULT,  		.flags =      INET_PROTOSW_REUSE,         }, @@ -1060,7 +1024,6 @@ static struct inet_protosw inetsw_array[] =  	       .protocol =   IPPROTO_IP,	/* wild card */  	       .prot =       &raw_prot,  	       .ops =        &inet_sockraw_ops, -	       .no_check =   UDP_CSUM_DEFAULT,  	       .flags =      INET_PROTOSW_REUSE,         }  }; @@ -1160,7 +1123,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)  	fl4 = &inet->cork.fl.u.ip4;  	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),  			      sk->sk_bound_dev_if, sk->sk_protocol, -			      inet->inet_sport, inet->inet_dport, sk, false); +			      inet->inet_sport, inet->inet_dport, sk);  	if (IS_ERR(rt))  		return PTR_ERR(rt); @@ -1254,36 +1217,36 @@ static int inet_gso_send_check(struct sk_buff *skb)  	if (ihl < sizeof(*iph))  		goto out; +	proto = iph->protocol; + +	/* Warning: after this point, iph might be no longer valid */  	if (unlikely(!pskb_may_pull(skb, ihl)))  		goto out; -  	__skb_pull(skb, ihl); +  	skb_reset_transport_header(skb); -	iph = ip_hdr(skb); -	proto = iph->protocol;  	err = -EPROTONOSUPPORT; -	rcu_read_lock();  	ops = rcu_dereference(inet_offloads[proto]);  	if (likely(ops && ops->callbacks.gso_send_check))  		err = ops->callbacks.gso_send_check(skb); -	rcu_read_unlock();  out:  	return err;  }  static struct sk_buff *inet_gso_segment(struct sk_buff *skb, -	netdev_features_t features) +					netdev_features_t features)  {  	struct sk_buff *segs = ERR_PTR(-EINVAL);  	const struct net_offload *ops; +	unsigned int offset = 0; +	bool udpfrag, encap;  	struct iphdr *iph;  	int proto; +	int nhoff;  	int ihl;  	int id; -	unsigned int offset = 0; -	bool tunnel;  	if (unlikely(skb_shinfo(skb)->gso_type &  		     ~(SKB_GSO_TCPV4 | @@ -1291,12 +1254,18 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,  		       SKB_GSO_DODGY |  		       SKB_GSO_TCP_ECN |  		       SKB_GSO_GRE | +		       SKB_GSO_GRE_CSUM | +		       SKB_GSO_IPIP | +		       SKB_GSO_SIT |  		       SKB_GSO_TCPV6 |  		       SKB_GSO_UDP_TUNNEL | +		       SKB_GSO_UDP_TUNNEL_CSUM |  		       SKB_GSO_MPLS |  		       0)))  		goto out; +	skb_reset_network_header(skb); +	nhoff = skb_network_header(skb) - skb_mac_header(skb);  	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))  		goto out; @@ 
-1305,42 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,  	if (ihl < sizeof(*iph))  		goto out; +	id = ntohs(iph->id); +	proto = iph->protocol; + +	/* Warning: after this point, iph might be no longer valid */  	if (unlikely(!pskb_may_pull(skb, ihl)))  		goto out; +	__skb_pull(skb, ihl); -	tunnel = !!skb->encapsulation; +	encap = SKB_GSO_CB(skb)->encap_level > 0; +	if (encap) +		features = skb->dev->hw_enc_features & netif_skb_features(skb); +	SKB_GSO_CB(skb)->encap_level += ihl; -	__skb_pull(skb, ihl);  	skb_reset_transport_header(skb); -	iph = ip_hdr(skb); -	id = ntohs(iph->id); -	proto = iph->protocol; +  	segs = ERR_PTR(-EPROTONOSUPPORT); -	rcu_read_lock(); +	if (skb->encapsulation && +	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) +		udpfrag = proto == IPPROTO_UDP && encap; +	else +		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; +  	ops = rcu_dereference(inet_offloads[proto]);  	if (likely(ops && ops->callbacks.gso_segment))  		segs = ops->callbacks.gso_segment(skb, features); -	rcu_read_unlock();  	if (IS_ERR_OR_NULL(segs))  		goto out;  	skb = segs;  	do { -		iph = ip_hdr(skb); -		if (!tunnel && proto == IPPROTO_UDP) { +		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); +		if (udpfrag) {  			iph->id = htons(id);  			iph->frag_off = htons(offset >> 3);  			if (skb->next != NULL)  				iph->frag_off |= htons(IP_MF); -			offset += (skb->len - skb->mac_len - iph->ihl * 4); -		} else  { +			offset += skb->len - nhoff - ihl; +		} else {  			iph->id = htons(id++);  		} -		iph->tot_len = htons(skb->len - skb->mac_len); -		iph->check = 0; -		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); +		iph->tot_len = htons(skb->len - nhoff); +		ip_send_check(iph); +		if (encap) +			skb_reset_inner_headers(skb); +		skb->network_header = (u8 *)iph - skb->head;  	} while ((skb = skb->next));  out: @@ -1392,8 +1372,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  		if (!NAPI_GRO_CB(p)->same_flow)  			continue; -		iph2 = ip_hdr(p); - +		iph2 = (struct iphdr *)(p->data + off); +		/* The above works because, with the exception of the top +		 * (inner most) layer, we only aggregate pkts with the same +		 * hdr length so all the hdrs we'll need to verify will start +		 * at the same offset. +		 */  		if ((iph->protocol ^ iph2->protocol) |  		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |  		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { @@ -1405,13 +1389,24 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  		NAPI_GRO_CB(p)->flush |=  			(iph->ttl ^ iph2->ttl) |  			(iph->tos ^ iph2->tos) | -			(__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) | -			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); +			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); +		/* Save the IP ID check to be included later when we get to +		 * the transport layer so only the inner most IP ID is checked. +		 * This is because some GSO/TSO implementations do not +		 * correctly increment the IP ID for the outer hdrs. +		 */ +		NAPI_GRO_CB(p)->flush_id = +			    ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);  		NAPI_GRO_CB(p)->flush |= flush;  	}  	NAPI_GRO_CB(skb)->flush |= flush; +	skb_set_network_header(skb, off); +	/* The above will be needed by the transport layer if there is one +	 * immediately following this IP hdr. 
+	 */ +  	skb_gro_pull(skb, sizeof(*iph));  	skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -1426,14 +1421,17 @@ out:  	return pp;  } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff)  { -	__be16 newlen = htons(skb->len - skb_network_offset(skb)); -	struct iphdr *iph = ip_hdr(skb); +	__be16 newlen = htons(skb->len - nhoff); +	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);  	const struct net_offload *ops;  	int proto = iph->protocol;  	int err = -ENOSYS; +	if (skb->encapsulation) +		skb_set_inner_network_header(skb, nhoff); +  	csum_replace2(&iph->check, iph->tot_len, newlen);  	iph->tot_len = newlen; @@ -1442,7 +1440,11 @@ static int inet_gro_complete(struct sk_buff *skb)  	if (WARN_ON(!ops || !ops->callbacks.gro_complete))  		goto out_unlock; -	err = ops->callbacks.gro_complete(skb); +	/* Only need to add sizeof(*iph) to get to the next hdr below +	 * because any hdr with option will have been flushed in +	 * inet_gro_receive(). +	 */ +	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));  out_unlock:  	rcu_read_unlock(); @@ -1472,22 +1474,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,  }  EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void __percpu *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib, int offt)  {  	unsigned long res = 0; -	int i, j; +	int i; -	for_each_possible_cpu(i) { -		for (j = 0; j < SNMP_ARRAY_SZ; j++) -			res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt); -	} +	for_each_possible_cpu(i) +		res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);  	return res;  }  EXPORT_SYMBOL_GPL(snmp_fold_field);  #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)  {  	u64 res = 0;  	int cpu; @@ -1498,12 +1498,12 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)  		u64 v;  		unsigned int start; -		bhptr = per_cpu_ptr(mib[0], cpu); +		bhptr = per_cpu_ptr(mib, cpu);  		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);  		do { -			start = u64_stats_fetch_begin_bh(syncp); +			start = u64_stats_fetch_begin_irq(syncp);  			v = *(((u64 *) bhptr) + offt); -		} while (u64_stats_fetch_retry_bh(syncp, start)); +		} while (u64_stats_fetch_retry_irq(syncp, start));  		res += v;  	} @@ -1512,24 +1512,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)  EXPORT_SYMBOL_GPL(snmp_fold_field64);  #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) -{ -	BUG_ON(ptr == NULL); -	ptr[0] = __alloc_percpu(mibsize, align); -	if (!ptr[0]) -		return -ENOMEM; -#if SNMP_ARRAY_SZ == 2 -	ptr[1] = __alloc_percpu(mibsize, align); -	if (!ptr[1]) { -		free_percpu(ptr[0]); -		ptr[0] = NULL; -		return -ENOMEM; -	} -#endif -	return 0; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); -  #ifdef CONFIG_IP_MULTICAST  static const struct net_protocol igmp_protocol = {  	.handler =	igmp_rcv, @@ -1543,9 +1525,11 @@ static const struct net_protocol tcp_protocol = {  	.err_handler	=	tcp_v4_err,  	.no_policy	=	1,  	.netns_ok	=	1, +	.icmp_strict_tag_validation = 1,  };  static const struct net_protocol udp_protocol = { +	.early_demux =	udp_v4_early_demux,  	.handler =	udp_rcv,  	.err_handler =	udp_err,  	.no_policy =	1, @@ -1561,29 +1545,32 @@ static const struct net_protocol icmp_protocol = {  static __net_init int ipv4_mib_init_net(struct net *net)  { -	if 
(snmp_mib_init((void __percpu **)net->mib.tcp_statistics, -			  sizeof(struct tcp_mib), -			  __alignof__(struct tcp_mib)) < 0) +	int i; + +	net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); +	if (!net->mib.tcp_statistics)  		goto err_tcp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, -			  sizeof(struct ipstats_mib), -			  __alignof__(struct ipstats_mib)) < 0) +	net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); +	if (!net->mib.ip_statistics)  		goto err_ip_mib; -	if (snmp_mib_init((void __percpu **)net->mib.net_statistics, -			  sizeof(struct linux_mib), -			  __alignof__(struct linux_mib)) < 0) + +	for_each_possible_cpu(i) { +		struct ipstats_mib *af_inet_stats; +		af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); +		u64_stats_init(&af_inet_stats->syncp); +	} + +	net->mib.net_statistics = alloc_percpu(struct linux_mib); +	if (!net->mib.net_statistics)  		goto err_net_mib; -	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, -			  sizeof(struct udp_mib), -			  __alignof__(struct udp_mib)) < 0) +	net->mib.udp_statistics = alloc_percpu(struct udp_mib); +	if (!net->mib.udp_statistics)  		goto err_udp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, -			  sizeof(struct udp_mib), -			  __alignof__(struct udp_mib)) < 0) +	net->mib.udplite_statistics = alloc_percpu(struct udp_mib); +	if (!net->mib.udplite_statistics)  		goto err_udplite_mib; -	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, -			  sizeof(struct icmp_mib), -			  __alignof__(struct icmp_mib)) < 0) +	net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); +	if (!net->mib.icmp_statistics)  		goto err_icmp_mib;  	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),  					      GFP_KERNEL); @@ -1594,17 +1581,17 @@ static __net_init int ipv4_mib_init_net(struct net *net)  	return 0;  err_icmpmsg_mib: -	snmp_mib_free((void __percpu **)net->mib.icmp_statistics); +	free_percpu(net->mib.icmp_statistics);  err_icmp_mib: -	snmp_mib_free((void __percpu **)net->mib.udplite_statistics); +	free_percpu(net->mib.udplite_statistics);  err_udplite_mib: -	snmp_mib_free((void __percpu **)net->mib.udp_statistics); +	free_percpu(net->mib.udp_statistics);  err_udp_mib: -	snmp_mib_free((void __percpu **)net->mib.net_statistics); +	free_percpu(net->mib.net_statistics);  err_net_mib: -	snmp_mib_free((void __percpu **)net->mib.ip_statistics); +	free_percpu(net->mib.ip_statistics);  err_ip_mib: -	snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +	free_percpu(net->mib.tcp_statistics);  err_tcp_mib:  	return -ENOMEM;  } @@ -1612,12 +1599,12 @@ err_tcp_mib:  static __net_exit void ipv4_mib_exit_net(struct net *net)  {  	kfree(net->mib.icmpmsg_statistics); -	snmp_mib_free((void __percpu **)net->mib.icmp_statistics); -	snmp_mib_free((void __percpu **)net->mib.udplite_statistics); -	snmp_mib_free((void __percpu **)net->mib.udp_statistics); -	snmp_mib_free((void __percpu **)net->mib.net_statistics); -	snmp_mib_free((void __percpu **)net->mib.ip_statistics); -	snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +	free_percpu(net->mib.icmp_statistics); +	free_percpu(net->mib.udplite_statistics); +	free_percpu(net->mib.udp_statistics); +	free_percpu(net->mib.net_statistics); +	free_percpu(net->mib.ip_statistics); +	free_percpu(net->mib.tcp_statistics);  }  static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1630,6 +1617,39 @@ static int __init init_ipv4_mibs(void)  	return register_pernet_subsys(&ipv4_mib_ops);  } +static __net_init int 
inet_init_net(struct net *net) +{ +	/* +	 * Set defaults for local port range +	 */ +	seqlock_init(&net->ipv4.ip_local_ports.lock); +	net->ipv4.ip_local_ports.range[0] =  32768; +	net->ipv4.ip_local_ports.range[1] =  61000; + +	seqlock_init(&net->ipv4.ping_group_range.lock); +	/* +	 * Sane defaults - nobody may create ping sockets. +	 * Boot scripts should set this to distro-specific group. +	 */ +	net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); +	net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); +	return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { +	.init = inet_init_net, +	.exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ +	return register_pernet_subsys(&af_inet_ops); +} +  static int ipv4_proc_init(void);  /* @@ -1646,6 +1666,13 @@ static struct packet_offload ip_packet_offload __read_mostly = {  	},  }; +static const struct net_offload ipip_offload = { +	.callbacks = { +		.gso_send_check = inet_gso_send_check, +		.gso_segment	= inet_gso_segment, +	}, +}; +  static int __init ipv4_offload_init(void)  {  	/* @@ -1657,6 +1684,7 @@ static int __init ipv4_offload_init(void)  		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);  	dev_add_offload(&ip_packet_offload); +	inet_add_offload(&ipip_offload, IPPROTO_IPIP);  	return 0;  } @@ -1675,13 +1703,9 @@ static int __init inet_init(void)  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); -	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); -	if (!sysctl_local_reserved_ports) -		goto out; -  	rc = proto_register(&tcp_prot, 1);  	if (rc) -		goto out_free_reserved_ports; +		goto out;  	rc = proto_register(&udp_prot, 1);  	if (rc) @@ -1705,8 +1729,6 @@ static int __init inet_init(void)  	ip_static_sysctl_init();  #endif -	tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; -  	/*  	 *	Add all the base protocols.  	 
*/ @@ -1768,6 +1790,9 @@ static int __init inet_init(void)  	if (ip_mr_init())  		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);  #endif + +	if (init_inet_pernet_ops()) +		pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);  	/*  	 *	Initialise per-cpu ipv4 mibs  	 */ @@ -1790,8 +1815,6 @@ out_unregister_udp_proto:  	proto_unregister(&udp_prot);  out_unregister_tcp_proto:  	proto_unregister(&tcp_prot); -out_free_reserved_ports: -	kfree(sysctl_local_reserved_ports);  	goto out;  } diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 717902669d2..a2afa89513a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)  	struct iphdr *iph, *top_iph;  	struct ip_auth_hdr *ah;  	struct ah_data *ahp; +	int seqhi_len = 0; +	__be32 *seqhi; +	int sglists = 0; +	struct scatterlist *seqhisg;  	ahp = x->data;  	ahash = ahp->ahash; @@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)  	ah = ip_auth_hdr(skb);  	ihl = ip_hdrlen(skb); +	if (x->props.flags & XFRM_STATE_ESN) { +		sglists = 1; +		seqhi_len = sizeof(*seqhi); +	}  	err = -ENOMEM; -	iph = ah_alloc_tmp(ahash, nfrags, ihl); +	iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);  	if (!iph)  		goto out; - -	icv = ah_tmp_icv(ahash, iph, ihl); +	seqhi = (__be32 *)((char *)iph + ihl); +	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);  	req = ah_tmp_req(ahash, icv);  	sg = ah_req_sg(ahash, req); +	seqhisg = sg + nfrags;  	memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)  	ah->spi = x->id.spi;  	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); -	sg_init_table(sg, nfrags); -	skb_to_sgvec(skb, sg, 0, skb->len); +	sg_init_table(sg, nfrags + sglists); +	skb_to_sgvec_nomark(skb, sg, 0, skb->len); -	ahash_request_set_crypt(req, sg, icv, skb->len); +	if (x->props.flags & XFRM_STATE_ESN) { +		/* Attach seqhi sg right after packet payload */ +		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); +		sg_set_buf(seqhisg, seqhi, seqhi_len); +	} +	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);  	ahash_request_set_callback(req, 0, ah_output_done, skb);  	AH_SKB_CB(skb)->tmp = iph; @@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  	struct ip_auth_hdr *ah;  	struct ah_data *ahp;  	int err = -ENOMEM; +	int seqhi_len = 0; +	__be32 *seqhi; +	int sglists = 0; +	struct scatterlist *seqhisg;  	if (!pskb_may_pull(skb, sizeof(*ah)))  		goto out; @@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  	iph = ip_hdr(skb);  	ihl = ip_hdrlen(skb); -	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); +	if (x->props.flags & XFRM_STATE_ESN) { +		sglists = 1; +		seqhi_len = sizeof(*seqhi); +	} + +	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + +				ahp->icv_trunc_len + seqhi_len);  	if (!work_iph)  		goto out; -	auth_data = ah_tmp_auth(work_iph, ihl); +	seqhi = (__be32 *)((char *)work_iph + ihl); +	auth_data = ah_tmp_auth(seqhi, seqhi_len);  	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);  	req = ah_tmp_req(ahash, icv);  	sg = ah_req_sg(ahash, req); +	seqhisg = sg + nfrags;  	memcpy(work_iph, iph, ihl);  	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  	skb_push(skb, ihl); -	sg_init_table(sg, nfrags); -	skb_to_sgvec(skb, sg, 0, skb->len); +	sg_init_table(sg, nfrags + sglists); +	
skb_to_sgvec_nomark(skb, sg, 0, skb->len); -	ahash_request_set_crypt(req, sg, icv, skb->len); +	if (x->props.flags & XFRM_STATE_ESN) { +		/* Attach seqhi sg right after packet payload */ +		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi; +		sg_set_buf(seqhisg, seqhi, seqhi_len); +	} +	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);  	ahash_request_set_callback(req, 0, ah_input_done, skb);  	AH_SKB_CB(skb)->tmp = work_iph; @@ -397,7 +428,7 @@ out:  	return err;  } -static void ah4_err(struct sk_buff *skb, u32 info) +static int ah4_err(struct sk_buff *skb, u32 info)  {  	struct net *net = dev_net(skb->dev);  	const struct iphdr *iph = (const struct iphdr *)skb->data; @@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)  	switch (icmp_hdr(skb)->type) {  	case ICMP_DEST_UNREACH:  		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) -			return; +			return 0;  	case ICMP_REDIRECT:  		break;  	default: -		return; +		return 0;  	}  	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,  			      ah->spi, IPPROTO_AH, AF_INET);  	if (!x) -		return; +		return 0;  	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)  		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);  	else  		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);  	xfrm_state_put(x); + +	return 0;  }  static int ah_init_state(struct xfrm_state *x) @@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x)  	kfree(ahp);  } +static int ah4_rcv_cb(struct sk_buff *skb, int err) +{ +	return 0; +}  static const struct xfrm_type ah_type =  { @@ -518,11 +555,12 @@ static const struct xfrm_type ah_type =  	.output		= ah_output  }; -static const struct net_protocol ah4_protocol = { +static struct xfrm4_protocol ah4_protocol = {  	.handler	=	xfrm4_rcv, +	.input_handler	=	xfrm_input, +	.cb_handler	=	ah4_rcv_cb,  	.err_handler	=	ah4_err, -	.no_policy	=	1, -	.netns_ok	=	1, +	.priority	=	0,  };  static int __init ah4_init(void) @@ -531,7 +569,7 @@ static int __init ah4_init(void)  		pr_info("%s: can't add xfrm type\n", __func__);  		return -EAGAIN;  	} -	if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { +	if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {  		pr_info("%s: can't add protocol\n", __func__);  		xfrm_unregister_type(&ah_type, AF_INET);  		return -EAGAIN; @@ -541,7 +579,7 @@ static int __init ah4_init(void)  static void __exit ah4_fini(void)  { -	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) +	if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)  		pr_info("%s: can't remove protocol\n", __func__);  	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)  		pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7808093cede..1a9b99e0446 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -166,18 +166,20 @@ struct neigh_table arp_tbl = {  	.id		= "arp_cache",  	.parms		= {  		.tbl			= &arp_tbl, -		.base_reachable_time	= 30 * HZ, -		.retrans_time		= 1 * HZ, -		.gc_staletime		= 60 * HZ,  		.reachable_time		= 30 * HZ, -		.delay_probe_time	= 5 * HZ, -		.queue_len_bytes	= 64*1024, -		.ucast_probes		= 3, -		.mcast_probes		= 3, -		.anycast_delay		= 1 * HZ, -		.proxy_delay		= (8 * HZ) / 10, -		.proxy_qlen		= 64, -		.locktime		= 1 * HZ, +		.data	= { +			[NEIGH_VAR_MCAST_PROBES] = 3, +			[NEIGH_VAR_UCAST_PROBES] = 3, +			[NEIGH_VAR_RETRANS_TIME] = 1 * HZ, +			[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, +			[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, +			[NEIGH_VAR_GC_STALETIME] = 60 * HZ, +			[NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, +			
[NEIGH_VAR_PROXY_QLEN] = 64, +			[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, +			[NEIGH_VAR_PROXY_DELAY]	= (8 * HZ) / 10, +			[NEIGH_VAR_LOCKTIME] = 1 * HZ, +		},  	},  	.gc_interval	= 30 * HZ,  	.gc_thresh1	= 128, @@ -359,14 +361,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)  	if (!saddr)  		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); -	probes -= neigh->parms->ucast_probes; +	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);  	if (probes < 0) {  		if (!(neigh->nud_state & NUD_VALID))  			pr_debug("trying to ucast probe in NUD_INVALID\n");  		neigh_ha_snapshot(dst_ha, neigh, dev);  		dst_hw = dst_ha;  	} else { -		probes -= neigh->parms->app_probes; +		probes -= NEIGH_VAR(neigh->parms, APP_PROBES);  		if (probes < 0) {  			neigh_app_ns(neigh);  			return; @@ -379,6 +381,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)  static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)  { +	struct net *net = dev_net(in_dev->dev);  	int scope;  	switch (IN_DEV_ARP_IGNORE(in_dev)) { @@ -397,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)  	case 3:	/* Do not reply for scope host addresses */  		sip = 0;  		scope = RT_SCOPE_LINK; +		in_dev = NULL;  		break;  	case 4:	/* Reserved */  	case 5: @@ -408,7 +412,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)  	default:  		return 0;  	} -	return !inet_confirm_addr(in_dev, sip, tip, scope); +	return !inet_confirm_addr(net, in_dev, sip, tip, scope);  }  static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) @@ -728,6 +732,7 @@ static int arp_process(struct sk_buff *skb)  	int addr_type;  	struct neighbour *n;  	struct net *net = dev_net(dev); +	bool is_garp = false;  	/* arp_rcv below verifies the ARP header and verifies the device  	 * is ARP'able. @@ -871,7 +876,7 @@ static int arp_process(struct sk_buff *skb)  				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||  				    skb->pkt_type == PACKET_HOST || -				    in_dev->arp_parms->proxy_delay == 0) { +				    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {  					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,  						 dev, tip, sha, dev->dev_addr,  						 sha); @@ -894,10 +899,12 @@ static int arp_process(struct sk_buff *skb)  		   It is possible, that this option should be enabled for some  		   devices (strip is candidate)  		 */ +		is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && +			  inet_addr_type(net, sip) == RTN_UNICAST; +  		if (n == NULL && -		    (arp->ar_op == htons(ARPOP_REPLY) || -		     (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && -		    inet_addr_type(net, sip) == RTN_UNICAST) +		    ((arp->ar_op == htons(ARPOP_REPLY)  && +		      inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))  			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);  	} @@ -910,7 +917,10 @@ static int arp_process(struct sk_buff *skb)  		   agents are active. Taking the first reply prevents  		   arp trashing and chooses the fastest router.  		 */ -		override = time_after(jiffies, n->updated + n->parms->locktime); +		override = time_after(jiffies, +				      n->updated + +				      NEIGH_VAR(n->parms, LOCKTIME)) || +			   is_garp;  		/* Broadcast replies and request packets  		   do not assert neighbour reachability. 
@@ -1107,7 +1117,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)  	return err;  } -int arp_invalidate(struct net_device *dev, __be32 ip) +static int arp_invalidate(struct net_device *dev, __be32 ip)  {  	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);  	int err = -ENXIO; @@ -1122,7 +1132,6 @@ int arp_invalidate(struct net_device *dev, __be32 ip)  	return err;  } -EXPORT_SYMBOL(arp_invalidate);  static int arp_req_delete_public(struct net *net, struct arpreq *r,  		struct net_device *dev) @@ -1284,7 +1293,7 @@ void __init arp_init(void)  	dev_add_pack(&arp_packet_type);  	arp_proc_init();  #ifdef CONFIG_SYSCTL -	neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); +	neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);  #endif  	register_netdevice_notifier(&arp_netdev_notifier);  } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 667c1d4ca98..69e77c8ff28 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -31,8 +31,7 @@   * the GNU General Public License for more details.   *   * You should have received a copy of the GNU General Public License - * along with this program;  if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program;  if not, see <http://www.gnu.org/licenses/>.   *   */ @@ -1336,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,  	secattr->flags |= NETLBL_SECATTR_MLS_LVL;  	if (tag_len > 4) { -		secattr->attr.mls.cat = -		                       netlbl_secattr_catmap_alloc(GFP_ATOMIC); +		secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);  		if (secattr->attr.mls.cat == NULL)  			return -ENOMEM; @@ -1432,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,  	secattr->flags |= NETLBL_SECATTR_MLS_LVL;  	if (tag_len > 4) { -		secattr->attr.mls.cat = -			               netlbl_secattr_catmap_alloc(GFP_ATOMIC); +		secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);  		if (secattr->attr.mls.cat == NULL)  			return -ENOMEM; @@ -1527,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,  	secattr->flags |= NETLBL_SECATTR_MLS_LVL;  	if (tag_len > 4) { -		secattr->attr.mls.cat = -			               netlbl_secattr_catmap_alloc(GFP_ATOMIC); +		secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);  		if (secattr->attr.mls.cat == NULL)  			return -ENOMEM; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index b28e863fe0a..a3095fdefbe 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -53,11 +53,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,  			      RT_CONN_FLAGS(sk), oif,  			      sk->sk_protocol, -			      inet->inet_sport, usin->sin_port, sk, true); +			      inet->inet_sport, usin->sin_port, sk);  	if (IS_ERR(rt)) {  		err = PTR_ERR(rt);  		if (err == -ENETUNREACH) -			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); +			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);  		goto out;  	} @@ -86,18 +86,26 @@ out:  }  EXPORT_SYMBOL(ip4_datagram_connect); +/* Because UDP xmit path can manipulate sk_dst_cache without holding + * socket lock, we need to use sk_dst_set() here, + * even if we own the socket lock. 
+ */  void ip4_datagram_release_cb(struct sock *sk)  {  	const struct inet_sock *inet = inet_sk(sk);  	const struct ip_options_rcu *inet_opt;  	__be32 daddr = inet->inet_daddr; +	struct dst_entry *dst;  	struct flowi4 fl4;  	struct rtable *rt; -	if (! __sk_dst_get(sk) || __sk_dst_check(sk, 0)) -		return; -  	rcu_read_lock(); + +	dst = __sk_dst_get(sk); +	if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { +		rcu_read_unlock(); +		return; +	}  	inet_opt = rcu_dereference(inet->inet_opt);  	if (inet_opt && inet_opt->opt.srr)  		daddr = inet_opt->opt.faddr; @@ -105,8 +113,10 @@ void ip4_datagram_release_cb(struct sock *sk)  				   inet->inet_saddr, inet->inet_dport,  				   inet->inet_sport, sk->sk_protocol,  				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); -	if (!IS_ERR(rt)) -		__sk_dst_set(sk, &rt->dst); + +	dst = !IS_ERR(rt) ? &rt->dst : NULL; +	sk_dst_set(sk, dst); +  	rcu_read_unlock();  }  EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a1b5bcbd04a..e9449376b58 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -99,13 +99,13 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {  	[IFA_BROADCAST] 	= { .type = NLA_U32 },  	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },  	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) }, +	[IFA_FLAGS]		= { .type = NLA_U32 },  };  #define IN4_ADDR_HSIZE_SHIFT	8  #define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)  static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static DEFINE_SPINLOCK(inet_addr_hash_lock);  static u32 inet_addr_hash(struct net *net, __be32 addr)  { @@ -118,16 +118,14 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)  {  	u32 hash = inet_addr_hash(net, ifa->ifa_local); -	spin_lock(&inet_addr_hash_lock); +	ASSERT_RTNL();  	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); -	spin_unlock(&inet_addr_hash_lock);  }  static void inet_hash_remove(struct in_ifaddr *ifa)  { -	spin_lock(&inet_addr_hash_lock); +	ASSERT_RTNL();  	hlist_del_init_rcu(&ifa->hash); -	spin_unlock(&inet_addr_hash_lock);  }  /** @@ -463,7 +461,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,  	}  	if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { -		net_srandom(ifa->ifa_local); +		prandom_seed((__force u32) ifa->ifa_local);  		ifap = last_primary;  	} @@ -473,7 +471,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,  	inet_hash_insert(dev_net(in_dev->dev), ifa);  	cancel_delayed_work(&check_lifetime_work); -	schedule_delayed_work(&check_lifetime_work, 0); +	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);  	/* Send message first, then call notifier.  	   
Notifier will trigger FIB update, so that @@ -500,6 +498,7 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)  		return -ENOBUFS;  	}  	ipv4_devconf_setall(in_dev); +	neigh_parms_data_state_setall(in_dev->arp_parms);  	if (ifa->ifa_dev != in_dev) {  		WARN_ON(ifa->ifa_dev);  		in_dev_hold(in_dev); @@ -682,7 +681,8 @@ static void check_lifetime(struct work_struct *work)  	if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))  		next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; -	schedule_delayed_work(&check_lifetime_work, next_sched - now); +	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, +			next_sched - now);  }  static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, @@ -747,6 +747,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,  		goto errout;  	ipv4_devconf_setall(in_dev); +	neigh_parms_data_state_setall(in_dev->arp_parms);  	in_dev_hold(in_dev);  	if (tb[IFA_ADDRESS] == NULL) @@ -755,7 +756,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,  	INIT_HLIST_NODE(&ifa->hash);  	ifa->ifa_prefixlen = ifm->ifa_prefixlen;  	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); -	ifa->ifa_flags = ifm->ifa_flags; +	ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : +					 ifm->ifa_flags;  	ifa->ifa_scope = ifm->ifa_scope;  	ifa->ifa_dev = in_dev; @@ -825,7 +827,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)  	ifa_existing = find_matching_ifa(ifa);  	if (!ifa_existing) {  		/* It would be best to check for !NLM_F_CREATE here but -		 * userspace alreay relies on not having to provide this. +		 * userspace already relies on not having to provide this.  		 */  		set_ifa_lifetime(ifa, valid_lft, prefered_lft);  		return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); @@ -838,7 +840,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)  		ifa = ifa_existing;  		set_ifa_lifetime(ifa, valid_lft, prefered_lft);  		cancel_delayed_work(&check_lifetime_work); -		schedule_delayed_work(&check_lifetime_work, 0); +		queue_delayed_work(system_power_efficient_wq, +				&check_lifetime_work, 0);  		rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);  		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);  	} @@ -1236,22 +1239,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,  /*   * Confirm that local IP address exists using wildcards: - * - in_dev: only on this interface, 0=any interface + * - net: netns to check, cannot be NULL + * - in_dev: only on this interface, NULL=any interface   * - dst: only in the same subnet as dst, 0=any dst   * - local: address, 0=autoselect the local address   * - scope: maximum allowed scope value for the local address   */ -__be32 inet_confirm_addr(struct in_device *in_dev, +__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,  			 __be32 dst, __be32 local, int scope)  {  	__be32 addr = 0;  	struct net_device *dev; -	struct net *net; -	if (scope != RT_SCOPE_LINK) +	if (in_dev != NULL)  		return confirm_addr_indev(in_dev, dst, local, scope); -	net = dev_net(in_dev->dev);  	rcu_read_lock();  	for_each_netdev_rcu(net, dev) {  		in_dev = __in_dev_get_rcu(dev); @@ -1382,6 +1384,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,  				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);  				set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,  						 INFINITY_LIFE_TIME); +				ipv4_devconf_setall(in_dev); +				
neigh_parms_data_state_setall(in_dev->arp_parms);  				inet_insert_ifa(ifa);  			}  		} @@ -1435,7 +1439,9 @@ static size_t inet_nlmsg_size(void)  	       + nla_total_size(4) /* IFA_ADDRESS */  	       + nla_total_size(4) /* IFA_LOCAL */  	       + nla_total_size(4) /* IFA_BROADCAST */ -	       + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ +	       + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ +	       + nla_total_size(4)  /* IFA_FLAGS */ +	       + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */  }  static inline u32 cstamp_delta(unsigned long cstamp) @@ -1503,6 +1509,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,  	     nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||  	    (ifa->ifa_label[0] &&  	     nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || +	    nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||  	    put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,  			  preferred, valid))  		goto nla_put_failure; @@ -1691,6 +1698,8 @@ static int inet_netconf_msgsize_devconf(int type)  		size += nla_total_size(4);  	if (type == -1 || type == NETCONFA_MC_FORWARDING)  		size += nla_total_size(4); +	if (type == -1 || type == NETCONFA_PROXY_NEIGH) +		size += nla_total_size(4);  	return size;  } @@ -1727,6 +1736,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,  	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,  			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)  		goto nla_put_failure; +	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && +	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, +			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) +		goto nla_put_failure;  	return nlmsg_end(skb, nlh); @@ -1764,6 +1777,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {  	[NETCONFA_IFINDEX]	= { .len = sizeof(int) },  	[NETCONFA_FORWARDING]	= { .len = sizeof(int) },  	[NETCONFA_RP_FILTER]	= { .len = sizeof(int) }, +	[NETCONFA_PROXY_NEIGH]	= { .len = sizeof(int) },  };  static int inet_netconf_get_devconf(struct sk_buff *in_skb, @@ -1945,6 +1959,19 @@ static void inet_forward_change(struct net *net)  	}  } +static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) +{ +	if (cnf == net->ipv4.devconf_dflt) +		return NETCONFA_IFINDEX_DEFAULT; +	else if (cnf == net->ipv4.devconf_all) +		return NETCONFA_IFINDEX_ALL; +	else { +		struct in_device *idev +			= container_of(cnf, struct in_device, cnf); +		return idev->dev->ifindex; +	} +} +  static int devinet_conf_proc(struct ctl_table *ctl, int write,  			     void __user *buffer,  			     size_t *lenp, loff_t *ppos) @@ -1957,6 +1984,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,  		struct ipv4_devconf *cnf = ctl->extra1;  		struct net *net = ctl->extra2;  		int i = (int *)ctl->data - cnf->data; +		int ifindex;  		set_bit(i, cnf->state); @@ -1966,23 +1994,19 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,  		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)  			if ((new_value == 0) && (old_value != 0))  				rt_cache_flush(net); +  		if (i == IPV4_DEVCONF_RP_FILTER - 1 &&  		    new_value != old_value) { -			int ifindex; - -			if (cnf == net->ipv4.devconf_dflt) -				ifindex = NETCONFA_IFINDEX_DEFAULT; -			else if (cnf == net->ipv4.devconf_all) -				ifindex = NETCONFA_IFINDEX_ALL; -			else { -				struct in_device *idev = -					container_of(cnf, struct in_device, -						     cnf); -				ifindex = idev->dev->ifindex; -			} +			ifindex = devinet_conf_ifindex(net, cnf);  			inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,  						    ifindex, 
cnf);  		} +		if (i == IPV4_DEVCONF_PROXY_ARP - 1 && +		    new_value != old_value) { +			ifindex = devinet_conf_ifindex(net, cnf); +			inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, +						    ifindex, cnf); +		}  	}  	return ret; @@ -2160,7 +2184,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)  static void devinet_sysctl_register(struct in_device *idev)  { -	neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL); +	neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);  	__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,  					&idev->cnf);  } @@ -2298,7 +2322,7 @@ void __init devinet_init(void)  	register_gifconf(PF_INET, inet_gifconf);  	register_netdevice_notifier(&ip_netdev_notifier); -	schedule_delayed_work(&check_lifetime_work, 0); +	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);  	rtnl_af_register(&inet_af_ops); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 109ee89f123..360b565918c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)  	struct aead_givcrypt_request *req;  	struct scatterlist *sg;  	struct scatterlist *asg; -	struct esp_data *esp;  	struct sk_buff *trailer;  	void *tmp;  	u8 *iv; @@ -139,8 +138,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)  	/* skb is pure payload to encrypt */ -	esp = x->data; -	aead = esp->aead; +	aead = x->data;  	alen = crypto_aead_authsize(aead);  	tfclen = 0; @@ -154,8 +152,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)  	}  	blksize = ALIGN(crypto_aead_blocksize(aead), 4);  	clen = ALIGN(skb->len + 2 + tfclen, blksize); -	if (esp->padlen) -		clen = ALIGN(clen, esp->padlen);  	plen = clen - skb->len - tfclen;  	err = skb_cow_data(skb, tfclen + plen + alen, &trailer); @@ -280,8 +276,7 @@ static int esp_input_done2(struct sk_buff *skb, int err)  {  	const struct iphdr *iph;  	struct xfrm_state *x = xfrm_input_state(skb); -	struct esp_data *esp = x->data; -	struct crypto_aead *aead = esp->aead; +	struct crypto_aead *aead = x->data;  	int alen = crypto_aead_authsize(aead);  	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);  	int elen = skb->len - hlen; @@ -376,8 +371,7 @@ static void esp_input_done(struct crypto_async_request *base, int err)  static int esp_input(struct xfrm_state *x, struct sk_buff *skb)  {  	struct ip_esp_hdr *esph; -	struct esp_data *esp = x->data; -	struct crypto_aead *aead = esp->aead; +	struct crypto_aead *aead = x->data;  	struct aead_request *req;  	struct sk_buff *trailer;  	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); @@ -459,9 +453,8 @@ out:  static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)  { -	struct esp_data *esp = x->data; -	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); -	u32 align = max_t(u32, blksize, esp->padlen); +	struct crypto_aead *aead = x->data; +	u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);  	unsigned int net_adj;  	switch (x->props.mode) { @@ -476,11 +469,11 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)  		BUG();  	} -	return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - -		 net_adj) & ~(align - 1)) + net_adj - 2; +	return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - +		 net_adj) & ~(blksize - 1)) + net_adj - 2;  } -static void esp4_err(struct sk_buff *skb, u32 info) +static int esp4_err(struct sk_buff *skb, u32 info)  {  	struct net *net = dev_net(skb->dev);  	const struct iphdr *iph = 
(const struct iphdr *)skb->data; @@ -490,39 +483,39 @@ static void esp4_err(struct sk_buff *skb, u32 info)  	switch (icmp_hdr(skb)->type) {  	case ICMP_DEST_UNREACH:  		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) -			return; +			return 0;  	case ICMP_REDIRECT:  		break;  	default: -		return; +		return 0;  	}  	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,  			      esph->spi, IPPROTO_ESP, AF_INET);  	if (!x) -		return; +		return 0;  	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)  		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);  	else  		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);  	xfrm_state_put(x); + +	return 0;  }  static void esp_destroy(struct xfrm_state *x)  { -	struct esp_data *esp = x->data; +	struct crypto_aead *aead = x->data; -	if (!esp) +	if (!aead)  		return; -	crypto_free_aead(esp->aead); -	kfree(esp); +	crypto_free_aead(aead);  }  static int esp_init_aead(struct xfrm_state *x)  { -	struct esp_data *esp = x->data;  	struct crypto_aead *aead;  	int err; @@ -531,7 +524,7 @@ static int esp_init_aead(struct xfrm_state *x)  	if (IS_ERR(aead))  		goto error; -	esp->aead = aead; +	x->data = aead;  	err = crypto_aead_setkey(aead, x->aead->alg_key,  				 (x->aead->alg_key_len + 7) / 8); @@ -548,7 +541,6 @@ error:  static int esp_init_authenc(struct xfrm_state *x)  { -	struct esp_data *esp = x->data;  	struct crypto_aead *aead;  	struct crypto_authenc_key_param *param;  	struct rtattr *rta; @@ -583,7 +575,7 @@ static int esp_init_authenc(struct xfrm_state *x)  	if (IS_ERR(aead))  		goto error; -	esp->aead = aead; +	x->data = aead;  	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +  		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); @@ -638,16 +630,11 @@ error:  static int esp_init_state(struct xfrm_state *x)  { -	struct esp_data *esp;  	struct crypto_aead *aead;  	u32 align;  	int err; -	esp = kzalloc(sizeof(*esp), GFP_KERNEL); -	if (esp == NULL) -		return -ENOMEM; - -	x->data = esp; +	x->data = NULL;  	if (x->aead)  		err = esp_init_aead(x); @@ -657,9 +644,7 @@ static int esp_init_state(struct xfrm_state *x)  	if (err)  		goto error; -	aead = esp->aead; - -	esp->padlen = 0; +	aead = x->data;  	x->props.header_len = sizeof(struct ip_esp_hdr) +  			      crypto_aead_ivsize(aead); @@ -683,14 +668,17 @@ static int esp_init_state(struct xfrm_state *x)  	}  	align = ALIGN(crypto_aead_blocksize(aead), 4); -	if (esp->padlen) -		align = max_t(u32, align, esp->padlen); -	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); +	x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);  error:  	return err;  } +static int esp4_rcv_cb(struct sk_buff *skb, int err) +{ +	return 0; +} +  static const struct xfrm_type esp_type =  {  	.description	= "ESP4", @@ -704,11 +692,12 @@ static const struct xfrm_type esp_type =  	.output		= esp_output  }; -static const struct net_protocol esp4_protocol = { +static struct xfrm4_protocol esp4_protocol = {  	.handler	=	xfrm4_rcv, +	.input_handler	=	xfrm_input, +	.cb_handler	=	esp4_rcv_cb,  	.err_handler	=	esp4_err, -	.no_policy	=	1, -	.netns_ok	=	1, +	.priority	=	0,  };  static int __init esp4_init(void) @@ -717,7 +706,7 @@ static int __init esp4_init(void)  		pr_info("%s: can't add xfrm type\n", __func__);  		return -EAGAIN;  	} -	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { +	if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {  		pr_info("%s: can't add protocol\n", __func__);  		xfrm_unregister_type(&esp_type, AF_INET);  		return -EAGAIN; @@ -727,7 +716,7 
@@ static int __init esp4_init(void)  static void __exit esp4_fini(void)  { -	if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) +	if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)  		pr_info("%s: can't remove protocol\n", __func__);  	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)  		pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b3f627ac4ed..255aa9946fe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -250,7 +250,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,  	bool dev_match;  	fl4.flowi4_oif = 0; -	fl4.flowi4_iif = oif; +	fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;  	fl4.daddr = src;  	fl4.saddr = dst;  	fl4.flowi4_tos = tos; @@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)  	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&  	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) -		return ip_rt_dump(skb, cb); +		return skb->len;  	s_h = cb->args[0];  	s_e = cb->args[1]; @@ -933,7 +933,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)  		local_bh_disable();  		frn->tb_id = tb->tb_id; -		rcu_read_lock();  		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);  		if (!frn->err) { @@ -942,7 +941,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)  			frn->type = res.type;  			frn->scope = res.scope;  		} -		rcu_read_unlock();  		local_bh_enable();  	}  } @@ -1049,6 +1047,8 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo  	}  	in_dev = __in_dev_get_rtnl(dev); +	if (!in_dev) +		return NOTIFY_DONE;  	switch (event) {  	case NETDEV_UP: diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index af0f14aba16..1e4f6600b31 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -24,21 +24,15 @@ static inline void fib_alias_accessed(struct fib_alias *fa)  }  /* Exported by fib_semantics.c */ -extern void fib_release_info(struct fib_info *); -extern struct fib_info *fib_create_info(struct fib_config *cfg); -extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); -extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, -			 u32 tb_id, u8 type, __be32 dst, -			 int dst_len, u8 tos, struct fib_info *fi, -			 unsigned int); -extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, -		      int dst_len, u32 tb_id, struct nl_info *info, -		      unsigned int nlm_flags); -extern struct fib_alias *fib_find_alias(struct list_head *fah, -					u8 tos, u32 prio); -extern int fib_detect_death(struct fib_info *fi, int order, -			    struct fib_info **last_resort, -			    int *last_idx, int dflt); +void fib_release_info(struct fib_info *); +struct fib_info *fib_create_info(struct fib_config *cfg); +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, +		  u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, +		  unsigned int); +void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, +	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); +struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);  static inline void fib_result_assign(struct fib_result *res,  				     struct fib_info *fi) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 523be38e37d..f2e15738534 100644 --- 
a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -104,7 +104,10 @@ errout:  static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)  {  	struct fib_result *result = (struct fib_result *) arg->result; -	struct net_device *dev = result->fi->fib_dev; +	struct net_device *dev = NULL; + +	if (result->fi) +		dev = result->fi->fib_dev;  	/* do not accept result if the route does  	 * not meet the required prefix length diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d5dbca5ecf6..b10cd43a472 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -380,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)  }  void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, -	       int dst_len, u32 tb_id, struct nl_info *info, +	       int dst_len, u32 tb_id, const struct nl_info *info,  	       unsigned int nlm_flags)  {  	struct sk_buff *skb; @@ -426,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)  	return NULL;  } -int fib_detect_death(struct fib_info *fi, int order, -		     struct fib_info **last_resort, int *last_idx, int dflt) +static int fib_detect_death(struct fib_info *fi, int order, +			    struct fib_info **last_resort, int *last_idx, +			    int dflt)  {  	struct neighbour *n;  	int state = NUD_NONE; @@ -630,6 +631,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,  				.daddr = nh->nh_gw,  				.flowi4_scope = cfg->fc_scope + 1,  				.flowi4_oif = nh->nh_oif, +				.flowi4_iif = LOOPBACK_IFINDEX,  			};  			/* It is not necessary, but requires a bit of thinking */ @@ -819,13 +821,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);  	if (fi == NULL)  		goto failure; +	fib_info_cnt++;  	if (cfg->fc_mx) {  		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);  		if (!fi->fib_metrics)  			goto failure;  	} else  		fi->fib_metrics = (u32 *) dst_default_metrics; -	fib_info_cnt++;  	fi->fib_net = hold_net(net);  	fi->fib_protocol = cfg->fc_protocol; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3df6d3edb2a..5afeb5aa4c7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -762,12 +762,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)  		if (IS_LEAF(node) || ((struct tnode *) node)->pos >  		   tn->pos + tn->bits - 1) { -			if (tkey_extract_bits(node->key, -					      oldtnode->pos + oldtnode->bits, -					      1) == 0) -				put_child(tn, 2*i, node); -			else -				put_child(tn, 2*i+1, node); +			put_child(tn, +				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1), +				node);  			continue;  		} @@ -1120,12 +1117,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)  		 *  first tnode need some special handling  		 */ -		if (tp) -			pos = tp->pos+tp->bits; -		else -			pos = 0; -  		if (n) { +			pos = tp ? 
tp->pos+tp->bits : 0;  			newpos = tkey_mismatch(key, pos, n->key);  			tn = tnode_new(n->key, newpos, 1);  		} else { @@ -2530,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)  		list_for_each_entry_rcu(fa, &li->falh, fa_list) {  			const struct fib_info *fi = fa->fa_info;  			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); -			int len;  			if (fa->fa_type == RTN_BROADCAST  			    || fa->fa_type == RTN_MULTICAST)  				continue; +			seq_setwidth(seq, 127); +  			if (fi)  				seq_printf(seq,  					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t" -					 "%d\t%08X\t%d\t%u\t%u%n", +					 "%d\t%08X\t%d\t%u\t%u",  					 fi->fib_dev ? fi->fib_dev->name : "*",  					 prefix,  					 fi->fib_nh->nh_gw, flags, 0, 0, @@ -2548,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)  					 (fi->fib_advmss ?  					  fi->fib_advmss + 40 : 0),  					 fi->fib_window, -					 fi->fib_rtt >> 3, &len); +					 fi->fib_rtt >> 3);  			else  				seq_printf(seq,  					 "*\t%08X\t%08X\t%04X\t%d\t%u\t" -					 "%d\t%08X\t%d\t%u\t%u%n", +					 "%d\t%08X\t%d\t%u\t%u",  					 prefix, 0, flags, 0, 0, 0, -					 mask, 0, 0, 0, &len); +					 mask, 0, 0, 0); -			seq_printf(seq, "%*s\n", 127 - len, ""); +			seq_pad(seq, '\n');  		}  	} diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 736c9fc3ef9..0485bf7f8f0 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -68,6 +68,7 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,  	skb_push(skb, hdr_len); +	skb_reset_transport_header(skb);  	greh = (struct gre_base_hdr *)skb->data;  	greh->flags = tnl_flags_to_gre_flags(tpi->flags);  	greh->protocol = tpi->proto; @@ -84,7 +85,8 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,  			ptr--;  		}  		if (tpi->flags&TUNNEL_CSUM && -		    !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) { +		    !(skb_shinfo(skb)->gso_type & +		      (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {  			*ptr = 0;  			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,  								 skb->len, 0)); @@ -93,57 +95,6 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,  }  EXPORT_SYMBOL_GPL(gre_build_header); -struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum) -{ -	int err; - -	if (likely(!skb->encapsulation)) { -		skb_reset_inner_headers(skb); -		skb->encapsulation = 1; -	} - -	if (skb_is_gso(skb)) { -		err = skb_unclone(skb, GFP_ATOMIC); -		if (unlikely(err)) -			goto error; -		skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; -		return skb; -	} else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) { -		err = skb_checksum_help(skb); -		if (unlikely(err)) -			goto error; -	} else if (skb->ip_summed != CHECKSUM_PARTIAL) -		skb->ip_summed = CHECKSUM_NONE; - -	return skb; -error: -	kfree_skb(skb); -	return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(gre_handle_offloads); - -static __sum16 check_checksum(struct sk_buff *skb) -{ -	__sum16 csum = 0; - -	switch (skb->ip_summed) { -	case CHECKSUM_COMPLETE: -		csum = csum_fold(skb->csum); - -		if (!csum) -			break; -		/* Fall through. 
*/ - -	case CHECKSUM_NONE: -		skb->csum = 0; -		csum = __skb_checksum_complete(skb); -		skb->ip_summed = CHECKSUM_COMPLETE; -		break; -	} - -	return csum; -} -  static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,  			    bool *csum_err)  { @@ -170,7 +121,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,  	options = (__be32 *)(greh + 1);  	if (greh->flags & GRE_CSUM) { -		if (check_checksum(skb)) { +		if (skb_checksum_simple_validate(skb)) {  			*csum_err = true;  			return -EINVAL;  		} @@ -211,6 +162,14 @@ static int gre_cisco_rcv(struct sk_buff *skb)  	int i;  	bool csum_err = false; +#ifdef CONFIG_NET_IPGRE_BROADCAST +	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { +		/* Looped back packet, drop it! */ +		if (rt_is_output_route(skb_rtable(skb))) +			goto drop; +	} +#endif +  	if (parse_gre_header(skb, &tpi, &csum_err) < 0)  		goto drop; @@ -384,14 +343,7 @@ static int __init gre_init(void)  		goto err_gre;  	} -	if (gre_offload_init()) { -		pr_err("can't add protocol offload\n"); -		goto err_gso; -	} -  	return 0; -err_gso: -	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);  err_gre:  	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);  err: @@ -400,8 +352,6 @@ err:  static void __exit gre_exit(void)  { -	gre_offload_exit(); -  	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);  	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);  } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 55e6bfb3a28..f0bdd47bbbc 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -11,6 +11,7 @@   */  #include <linux/skbuff.h> +#include <linux/init.h>  #include <net/protocol.h>  #include <net/gre.h> @@ -26,8 +27,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  {  	struct sk_buff *segs = ERR_PTR(-EINVAL);  	netdev_features_t enc_features; -	int ghl = GRE_HEADER_SECTION; +	int ghl;  	struct gre_base_hdr *greh; +	u16 mac_offset = skb->mac_header;  	int mac_len = skb->mac_len;  	__be16 protocol = skb->protocol;  	int tnl_hlen; @@ -39,7 +41,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  				  SKB_GSO_UDP |  				  SKB_GSO_DODGY |  				  SKB_GSO_TCP_ECN | -				  SKB_GSO_GRE))) +				  SKB_GSO_GRE | +				  SKB_GSO_GRE_CSUM | +				  SKB_GSO_IPIP)))  		goto out;  	if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) @@ -47,23 +51,21 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  	greh = (struct gre_base_hdr *)skb_transport_header(skb); -	if (greh->flags & GRE_KEY) -		ghl += GRE_HEADER_SECTION; -	if (greh->flags & GRE_SEQ) -		ghl += GRE_HEADER_SECTION; -	if (greh->flags & GRE_CSUM) { -		ghl += GRE_HEADER_SECTION; -		csum = true; -	} else -		csum = false; +	ghl = skb_inner_network_header(skb) - skb_transport_header(skb); +	if (unlikely(ghl < sizeof(*greh))) +		goto out; -	/* setup inner skb. */ -	skb->protocol = greh->protocol; -	skb->encapsulation = 0; +	csum = !!(greh->flags & GRE_CSUM); +	if (csum) +		skb->encap_hdr_csum = 1;  	if (unlikely(!pskb_may_pull(skb, ghl)))  		goto out; +	/* setup inner skb. */ +	skb->protocol = greh->protocol; +	skb->encapsulation = 0; +  	__skb_pull(skb, ghl);  	skb_reset_mac_header(skb);  	skb_set_network_header(skb, skb_inner_network_offset(skb)); @@ -72,8 +74,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  	/* segment inner packet. 
*/  	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);  	segs = skb_mac_gso_segment(skb, enc_features); -	if (!segs || IS_ERR(segs)) +	if (!segs || IS_ERR(segs)) { +		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);  		goto out; +	}  	skb = segs;  	tnl_hlen = skb_tnl_header_len(skb); @@ -93,10 +97,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  				}  			} -			greh = (struct gre_base_hdr *)(skb->data); +			skb_reset_transport_header(skb); + +			greh = (struct gre_base_hdr *) +			    skb_transport_header(skb);  			pcsum = (__be32 *)(greh + 1);  			*pcsum = 0; -			*(__sum16 *)pcsum = csum_fold(skb_checksum(skb, 0, skb->len, 0)); +			*(__sum16 *)pcsum = gso_make_checksum(skb, 0);  		}  		__skb_push(skb, tnl_hlen - ghl); @@ -112,19 +119,180 @@ out:  	return segs;  } +/* Compute the whole skb csum in s/w and store it, then verify GRO csum + * starting from gro_offset. + */ +static __sum16 gro_skb_checksum(struct sk_buff *skb) +{ +	__sum16 sum; + +	skb->csum = skb_checksum(skb, 0, skb->len, 0); +	NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, +		csum_partial(skb->data, skb_gro_offset(skb), 0)); +	sum = csum_fold(NAPI_GRO_CB(skb)->csum); +	if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { +		if (unlikely(!sum) && !skb->csum_complete_sw) +			netdev_rx_csum_fault(skb->dev); +	} else { +		skb->ip_summed = CHECKSUM_COMPLETE; +		skb->csum_complete_sw = 1; +	} + +	return sum; +} + +static struct sk_buff **gre_gro_receive(struct sk_buff **head, +					struct sk_buff *skb) +{ +	struct sk_buff **pp = NULL; +	struct sk_buff *p; +	const struct gre_base_hdr *greh; +	unsigned int hlen, grehlen; +	unsigned int off; +	int flush = 1; +	struct packet_offload *ptype; +	__be16 type; + +	off = skb_gro_offset(skb); +	hlen = off + sizeof(*greh); +	greh = skb_gro_header_fast(skb, off); +	if (skb_gro_header_hard(skb, hlen)) { +		greh = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!greh)) +			goto out; +	} + +	/* Only support version 0 and K (key), C (csum) flags. Note that +	 * although the support for the S (seq#) flag can be added easily +	 * for GRO, this is problematic for GSO hence can not be enabled +	 * here because a GRO pkt may end up in the forwarding path, thus +	 * requiring GSO support to break it up correctly. +	 */ +	if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) +		goto out; + +	type = greh->protocol; + +	rcu_read_lock(); +	ptype = gro_find_receive_by_type(type); +	if (ptype == NULL) +		goto out_unlock; + +	grehlen = GRE_HEADER_SECTION; + +	if (greh->flags & GRE_KEY) +		grehlen += GRE_HEADER_SECTION; + +	if (greh->flags & GRE_CSUM) +		grehlen += GRE_HEADER_SECTION; + +	hlen = off + grehlen; +	if (skb_gro_header_hard(skb, hlen)) { +		greh = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!greh)) +			goto out_unlock; +	} +	if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ +		__sum16 csum = 0; + +		if (skb->ip_summed == CHECKSUM_COMPLETE) +			csum = csum_fold(NAPI_GRO_CB(skb)->csum); +		/* Don't trust csum error calculated/reported by h/w */ +		if (skb->ip_summed == CHECKSUM_NONE || csum != 0) +			csum = gro_skb_checksum(skb); + +		/* GRE CSUM is the 1's complement of the 1's complement sum +		 * of the GRE hdr plus payload so it should add up to 0xffff +		 * (and 0 after csum_fold()) just like the IPv4 hdr csum. 
+		 */ +		if (csum) +			goto out_unlock; +	} +	flush = 0; + +	for (p = *head; p; p = p->next) { +		const struct gre_base_hdr *greh2; + +		if (!NAPI_GRO_CB(p)->same_flow) +			continue; + +		/* The following checks are needed to ensure only pkts +		 * from the same tunnel are considered for aggregation. +		 * The criteria for "the same tunnel" includes: +		 * 1) same version (we only support version 0 here) +		 * 2) same protocol (we only support ETH_P_IP for now) +		 * 3) same set of flags +		 * 4) same key if the key field is present. +		 */ +		greh2 = (struct gre_base_hdr *)(p->data + off); + +		if (greh2->flags != greh->flags || +		    greh2->protocol != greh->protocol) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} +		if (greh->flags & GRE_KEY) { +			/* compare keys */ +			if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { +				NAPI_GRO_CB(p)->same_flow = 0; +				continue; +			} +		} +	} + +	skb_gro_pull(skb, grehlen); + +	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ +	skb_gro_postpull_rcsum(skb, greh, grehlen); + +	pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: +	rcu_read_unlock(); +out: +	NAPI_GRO_CB(skb)->flush |= flush; + +	return pp; +} + +static int gre_gro_complete(struct sk_buff *skb, int nhoff) +{ +	struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); +	struct packet_offload *ptype; +	unsigned int grehlen = sizeof(*greh); +	int err = -ENOENT; +	__be16 type; + +	skb->encapsulation = 1; +	skb_shinfo(skb)->gso_type = SKB_GSO_GRE; + +	type = greh->protocol; +	if (greh->flags & GRE_KEY) +		grehlen += GRE_HEADER_SECTION; + +	if (greh->flags & GRE_CSUM) +		grehlen += GRE_HEADER_SECTION; + +	rcu_read_lock(); +	ptype = gro_find_complete_by_type(type); +	if (ptype != NULL) +		err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); + +	rcu_read_unlock(); +	return err; +} +  static const struct net_offload gre_offload = {  	.callbacks = {  		.gso_send_check = gre_gso_send_check,  		.gso_segment = gre_gso_segment, +		.gro_receive = gre_gro_receive, +		.gro_complete = gre_gro_complete,  	},  }; -int __init gre_offload_init(void) +static int __init gre_offload_init(void)  {  	return inet_add_offload(&gre_offload, IPPROTO_GRE);  } - -void __exit gre_offload_exit(void) -{ -	inet_del_offload(&gre_offload, IPPROTO_GRE); -} +device_initcall(gre_offload_init); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5f7d11a4587..42b7bcf8045 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -337,6 +337,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)  	struct sock *sk;  	struct inet_sock *inet;  	__be32 daddr, saddr; +	u32 mark = IP4_REPLY_MARK(net, skb->mark);  	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))  		return; @@ -349,10 +350,14 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)  	icmp_param->data.icmph.checksum = 0;  	inet->tos = ip_hdr(skb)->tos; +	sk->sk_mark = mark;  	daddr = ipc.addr = ip_hdr(skb)->saddr;  	saddr = fib_compute_spec_dst(skb);  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1; +  	if (icmp_param->replyopts.opt.opt.optlen) {  		ipc.opt = &icmp_param->replyopts.opt;  		if (ipc.opt->opt.srr) @@ -361,6 +366,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)  	memset(&fl4, 0, sizeof(fl4));  	fl4.daddr = daddr;  	fl4.saddr = saddr; +	fl4.flowi4_mark = mark;  	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);  	fl4.flowi4_proto = IPPROTO_ICMP;  	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); @@ -379,7 +385,7 
@@ static struct rtable *icmp_route_lookup(struct net *net,  					struct flowi4 *fl4,  					struct sk_buff *skb_in,  					const struct iphdr *iph, -					__be32 saddr, u8 tos, +					__be32 saddr, u8 tos, u32 mark,  					int type, int code,  					struct icmp_bxm *param)  { @@ -391,6 +397,7 @@ static struct rtable *icmp_route_lookup(struct net *net,  	fl4->daddr = (param->replyopts.opt.opt.srr ?  		      param->replyopts.opt.opt.faddr : iph->saddr);  	fl4->saddr = saddr; +	fl4->flowi4_mark = mark;  	fl4->flowi4_tos = RT_TOS(tos);  	fl4->flowi4_proto = IPPROTO_ICMP;  	fl4->fl4_icmp_type = type; @@ -488,6 +495,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  	struct flowi4 fl4;  	__be32 saddr;  	u8  tos; +	u32 mark;  	struct net *net;  	struct sock *sk; @@ -589,6 +597,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |  					   IPTOS_PREC_INTERNETCONTROL) :  					  iph->tos; +	mark = IP4_REPLY_MARK(net, skb_in->mark);  	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))  		goto out_unlock; @@ -605,11 +614,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  	icmp_param->skb	  = skb_in;  	icmp_param->offset = skb_network_offset(skb_in);  	inet_sk(sk)->tos = tos; +	sk->sk_mark = mark;  	ipc.addr = iph->saddr;  	ipc.opt = &icmp_param->replyopts.opt;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1; -	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, +	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,  			       type, code, icmp_param);  	if (IS_ERR(rt))  		goto out_unlock; @@ -663,6 +675,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)  	rcu_read_unlock();  } +static bool icmp_tag_validation(int proto) +{ +	bool ok; + +	rcu_read_lock(); +	ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; +	rcu_read_unlock(); +	return ok; +} +  /*   *	Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and   *	ICMP_PARAMETERPROB. 
@@ -700,13 +722,23 @@ static void icmp_unreach(struct sk_buff *skb)  		case ICMP_PORT_UNREACH:  			break;  		case ICMP_FRAG_NEEDED: -			if (ipv4_config.no_pmtu_disc) { +			/* for documentation of the ip_no_pmtu_disc +			 * values please see +			 * Documentation/networking/ip-sysctl.txt +			 */ +			switch (net->ipv4.sysctl_ip_no_pmtu_disc) { +			default:  				LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),  					       &iph->daddr); -			} else { -				info = ntohs(icmph->un.frag.mtu); -				if (!info) +				break; +			case 2: +				goto out; +			case 3: +				if (!icmp_tag_validation(iph->protocol))  					goto out; +				/* fall through */ +			case 0: +				info = ntohs(icmph->un.frag.mtu);  			}  			break;  		case ICMP_SR_FAILED: @@ -881,16 +913,8 @@ int icmp_rcv(struct sk_buff *skb)  	ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); -	switch (skb->ip_summed) { -	case CHECKSUM_COMPLETE: -		if (!csum_fold(skb->csum)) -			break; -		/* fall through */ -	case CHECKSUM_NONE: -		skb->csum = 0; -		if (__skb_checksum_complete(skb)) -			goto csum_error; -	} +	if (skb_checksum_simple_validate(skb)) +		goto csum_error;  	if (!pskb_pull(skb, sizeof(*icmph)))  		goto error; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7defdc9ba16..db710b059ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -211,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im)  /* It must be called with locked im->lock */  static void igmp_start_timer(struct ip_mc_list *im, int max_delay)  { -	int tv = net_random() % max_delay; +	int tv = prandom_u32() % max_delay;  	im->tm_running = 1;  	if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -220,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)  static void igmp_gq_start_timer(struct in_device *in_dev)  { -	int tv = net_random() % in_dev->mr_maxdelay; +	int tv = prandom_u32() % in_dev->mr_maxdelay;  	in_dev->mr_gq_running = 1;  	if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) @@ -229,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev)  static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)  { -	int tv = net_random() % delay; +	int tv = prandom_u32() % delay;  	if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))  		in_dev_hold(in_dev); @@ -310,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)  	struct ip_sf_list *psf;  	int scount = 0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (!is_in(pmc, psf, type, gdeleted, sdeleted))  			continue;  		scount++; @@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)  	pip->saddr    = fl4.saddr;  	pip->protocol = IPPROTO_IGMP;  	pip->tot_len  = 0;	/* filled in later */ -	ip_select_ident(skb, &rt->dst, NULL); +	ip_select_ident(skb, NULL);  	((u8 *)&pip[1])[0] = IPOPT_RA;  	((u8 *)&pip[1])[1] = 4;  	((u8 *)&pip[1])[2] = 0; @@ -463,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,  	}  	first = 1;  	psf_prev = NULL; -	for (psf=*psf_list; psf; psf=psf_next) { +	for (psf = *psf_list; psf; psf = psf_next) {  		__be32 *psrc;  		psf_next = psf->sf_next; @@ -520,7 +520,7 @@ empty_source:  			return skb;  		if (pmc->crcount || isquery) {  			/* make sure we have room for group header */ -			if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) { +			if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {  				igmpv3_sendpack(skb);  				skb = NULL; /* add_grhead will get a new one */  			
} @@ -576,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)  	struct ip_sf_list *psf_prev, *psf_next, *psf;  	psf_prev = NULL; -	for (psf=*ppsf; psf; psf = psf_next) { +	for (psf = *ppsf; psf; psf = psf_next) {  		psf_next = psf->sf_next;  		if (psf->sf_crcount == 0) {  			if (psf_prev) @@ -600,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)  	/* deleted MCA's */  	pmc_prev = NULL; -	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { +	for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {  		pmc_next = pmc->next;  		if (pmc->sfmode == MCAST_INCLUDE) {  			type = IGMPV3_BLOCK_OLD_SOURCES; @@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,  	iph->daddr    = dst;  	iph->saddr    = fl4.saddr;  	iph->protocol = IPPROTO_IGMP; -	ip_select_ident(skb, &rt->dst, NULL); +	ip_select_ident(skb, NULL);  	((u8 *)&iph[1])[0] = IPOPT_RA;  	((u8 *)&iph[1])[1] = 4;  	((u8 *)&iph[1])[2] = 0; @@ -764,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev)  static void igmp_timer_expire(unsigned long data)  { -	struct ip_mc_list *im=(struct ip_mc_list *)data; +	struct ip_mc_list *im = (struct ip_mc_list *)data;  	struct in_device *in_dev = im->interface;  	spin_lock(&im->lock); @@ -794,10 +794,10 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)  	int i, scount;  	scount = 0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (scount == nsrcs)  			break; -		for (i=0; i<nsrcs; i++) { +		for (i = 0; i < nsrcs; i++) {  			/* skip inactive filters */  			if (psf->sf_count[MCAST_INCLUDE] ||  			    pmc->sfcount[MCAST_EXCLUDE] != @@ -825,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)  	/* mark INCLUDE-mode sources */  	scount = 0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (scount == nsrcs)  			break; -		for (i=0; i<nsrcs; i++) +		for (i = 0; i < nsrcs; i++)  			if (srcs[i] == psf->sf_inaddr) {  				psf->sf_gsresp = 1;  				scount++; @@ -988,16 +988,8 @@ int igmp_rcv(struct sk_buff *skb)  	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))  		goto drop; -	switch (skb->ip_summed) { -	case CHECKSUM_COMPLETE: -		if (!csum_fold(skb->csum)) -			break; -		/* fall through */ -	case CHECKSUM_NONE: -		skb->csum = 0; -		if (__skb_checksum_complete(skb)) -			goto drop; -	} +	if (skb_checksum_simple_validate(skb)) +		goto drop;  	ih = igmp_hdr(skb);  	switch (ih->type) { @@ -1103,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)  		pmc->tomb = im->tomb;  		pmc->sources = im->sources;  		im->tomb = im->sources = NULL; -		for (psf=pmc->sources; psf; psf=psf->sf_next) +		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = pmc->crcount;  	}  	spin_unlock_bh(&im->lock); @@ -1121,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)  	spin_lock_bh(&in_dev->mc_tomb_lock);  	pmc_prev = NULL; -	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { +	for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {  		if (pmc->multiaddr == multiaddr)  			break;  		pmc_prev = pmc; @@ -1134,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)  	}  	spin_unlock_bh(&in_dev->mc_tomb_lock);  	if (pmc) { -		for (psf=pmc->tomb; psf; psf=psf_next) { +		for (psf = pmc->tomb; psf; psf = psf_next) {  			psf_next = psf->sf_next;  			kfree(psf);  		} @@ -1167,7 
+1159,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)  		psf = pmc->tomb;  		pmc->tomb = NULL;  		spin_unlock_bh(&pmc->lock); -		for (; psf; psf=psf_next) { +		for (; psf; psf = psf_next) {  			psf_next = psf->sf_next;  			kfree(psf);  		} @@ -1557,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,  	int rv = 0;  	psf_prev = NULL; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (psf->sf_inaddr == *psfsrc)  			break;  		psf_prev = psf; @@ -1630,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  		pmc->sfcount[sfmode]--;  	}  	err = 0; -	for (i=0; i<sfcount; i++) { +	for (i = 0; i < sfcount; i++) {  		int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);  		changerec |= rv > 0; @@ -1650,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  		pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :  			IGMP_Unsolicited_Report_Count;  		in_dev->mr_ifc_count = pmc->crcount; -		for (psf=pmc->sources; psf; psf = psf->sf_next) +		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = 0;  		igmp_ifc_event(pmc->interface);  	} else if (sf_setstate(pmc) || changerec) { @@ -1671,7 +1663,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,  	struct ip_sf_list *psf, *psf_prev;  	psf_prev = NULL; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (psf->sf_inaddr == *psfsrc)  			break;  		psf_prev = psf; @@ -1699,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc)  	struct ip_sf_list *psf;  	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; -	for (psf=pmc->sources; psf; psf=psf->sf_next) +	for (psf = pmc->sources; psf; psf = psf->sf_next)  		if (pmc->sfcount[MCAST_EXCLUDE]) {  			psf->sf_oldin = mca_xcount ==  				psf->sf_count[MCAST_EXCLUDE] && @@ -1716,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc)  	int new_in, rv;  	rv = 0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (pmc->sfcount[MCAST_EXCLUDE]) {  			new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&  				!psf->sf_count[MCAST_INCLUDE]; @@ -1726,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc)  			if (!psf->sf_oldin) {  				struct ip_sf_list *prev = NULL; -				for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) { +				for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {  					if (dpsf->sf_inaddr == psf->sf_inaddr)  						break;  					prev = dpsf; @@ -1748,7 +1740,7 @@ static int sf_setstate(struct ip_mc_list *pmc)  			 * add or update "delete" records if an active filter  			 * is now inactive  			 */ -			for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) +			for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)  				if (dpsf->sf_inaddr == psf->sf_inaddr)  					break;  			if (!dpsf) { @@ -1800,7 +1792,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  	if (!delta)  		pmc->sfcount[sfmode]++;  	err = 0; -	for (i=0; i<sfcount; i++) { +	for (i = 0; i < sfcount; i++) {  		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);  		if (err)  			break; @@ -1810,7 +1802,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  		if (!delta)  			pmc->sfcount[sfmode]--; -		for (j=0; j<i; j++) +		for (j = 0; j < i; j++)  			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);  	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {  #ifdef CONFIG_IP_MULTICAST @@ -1829,7 
+1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  		pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :  			IGMP_Unsolicited_Report_Count;  		in_dev->mr_ifc_count = pmc->crcount; -		for (psf=pmc->sources; psf; psf = psf->sf_next) +		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = 0;  		igmp_ifc_event(in_dev);  	} else if (sf_setstate(pmc)) { @@ -1844,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)  {  	struct ip_sf_list *psf, *nextpsf; -	for (psf=pmc->tomb; psf; psf=nextpsf) { +	for (psf = pmc->tomb; psf; psf = nextpsf) {  		nextpsf = psf->sf_next;  		kfree(psf);  	}  	pmc->tomb = NULL; -	for (psf=pmc->sources; psf; psf=nextpsf) { +	for (psf = pmc->sources; psf; psf = nextpsf) {  		nextpsf = psf->sf_next;  		kfree(psf);  	} @@ -1952,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)  	rtnl_lock();  	in_dev = ip_mc_find_dev(net, imr); +	if (!in_dev) { +		ret = -ENODEV; +		goto out; +	}  	ifindex = imr->imr_ifindex;  	for (imlp = &inet->mc_list;  	     (iml = rtnl_dereference(*imlp)) != NULL; @@ -1969,16 +1965,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)  		*imlp = iml->next_rcu; -		if (in_dev) -			ip_mc_dec_group(in_dev, group); +		ip_mc_dec_group(in_dev, group);  		rtnl_unlock();  		/* decrease mem now to avoid the memleak warning */  		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);  		kfree_rcu(iml, rcu);  		return 0;  	} -	if (!in_dev) -		ret = -ENODEV; +out:  	rtnl_unlock();  	return ret;  } @@ -2043,7 +2037,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  		if (!psl)  			goto done;	/* err = -EADDRNOTAVAIL */  		rv = !0; -		for (i=0; i<psl->sl_count; i++) { +		for (i = 0; i < psl->sl_count; i++) {  			rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,  				sizeof(__be32));  			if (rv == 0) @@ -2062,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,  			&mreqs->imr_sourceaddr, 1); -		for (j=i+1; j<psl->sl_count; j++) +		for (j = i+1; j < psl->sl_count; j++)  			psl->sl_addr[j-1] = psl->sl_addr[j];  		psl->sl_count--;  		err = 0; @@ -2088,7 +2082,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  		newpsl->sl_max = count;  		newpsl->sl_count = count - IP_SFBLOCK;  		if (psl) { -			for (i=0; i<psl->sl_count; i++) +			for (i = 0; i < psl->sl_count; i++)  				newpsl->sl_addr[i] = psl->sl_addr[i];  			/* decrease mem now to avoid the memleak warning */  			atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); @@ -2098,7 +2092,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  		psl = newpsl;  	}  	rv = 1;	/* > 0 for insert logic below if sl_count is 0 */ -	for (i=0; i<psl->sl_count; i++) { +	for (i = 0; i < psl->sl_count; i++) {  		rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,  			sizeof(__be32));  		if (rv == 0) @@ -2106,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  	}  	if (rv == 0)		/* address already there is an error */  		goto done; -	for (j=psl->sl_count-1; j>=i; j--) +	for (j = psl->sl_count-1; j >= i; j--)  		psl->sl_addr[j+1] = psl->sl_addr[j];  	psl->sl_addr[i] = mreqs->imr_sourceaddr;  	psl->sl_count++; @@ -2305,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,  	    copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {  		return -EFAULT;  	} -	for (i=0; i<copycount; i++) { +	for (i = 0; i < copycount; i++) {  		struct sockaddr_storage ss;  		psin = (struct sockaddr_in 
*)&ss; @@ -2350,7 +2344,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)  	if (!psl)  		goto unlock; -	for (i=0; i<psl->sl_count; i++) { +	for (i = 0; i < psl->sl_count; i++) {  		if (psl->sl_addr[i] == rmt_addr)  			break;  	} @@ -2423,7 +2417,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u  		rv = 1;  	} else if (im) {  		if (src_addr) { -			for (psf=im->sources; psf; psf=psf->sf_next) { +			for (psf = im->sources; psf; psf = psf->sf_next) {  				if (psf->sf_inaddr == src_addr)  					break;  			} @@ -2762,6 +2756,7 @@ static struct pernet_operations igmp_net_ops = {  	.init = igmp_net_init,  	.exit = igmp_net_exit,  }; +#endif  static int igmp_netdev_event(struct notifier_block *this,  			     unsigned long event, void *ptr) @@ -2785,8 +2780,9 @@ static struct notifier_block igmp_notifier = {  	.notifier_call = igmp_netdev_event,  }; -int __init igmp_mc_proc_init(void) +int __init igmp_mc_init(void)  { +#if defined(CONFIG_PROC_FS)  	int err;  	err = register_pernet_subsys(&igmp_net_ops); @@ -2800,5 +2796,7 @@ int __init igmp_mc_proc_init(void)  reg_notif_fail:  	unregister_pernet_subsys(&igmp_net_ops);  	return err; -} +#else +	return register_netdevice_notifier(&igmp_notifier);  #endif +} diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6acb541c909..14d02ea905b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,27 +29,16 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";  EXPORT_SYMBOL(inet_csk_timer_bug_msg);  #endif -/* - * This struct holds the first and last local port number. - */ -struct local_ports sysctl_local_ports __read_mostly = { -	.lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), -	.range = { 32768, 61000 }, -}; - -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - -void inet_get_local_port_range(int *low, int *high) +void inet_get_local_port_range(struct net *net, int *low, int *high)  {  	unsigned int seq;  	do { -		seq = read_seqbegin(&sysctl_local_ports.lock); +		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); -		*low = sysctl_local_ports.range[0]; -		*high = sysctl_local_ports.range[1]; -	} while (read_seqretry(&sysctl_local_ports.lock, seq)); +		*low = net->ipv4.ip_local_ports.range[0]; +		*high = net->ipv4.ip_local_ports.range[1]; +	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));  }  EXPORT_SYMBOL(inet_get_local_port_range); @@ -79,17 +68,16 @@ int inet_csk_bind_conflict(const struct sock *sk,  			    (!reuseport || !sk2->sk_reuseport ||  			    (sk2->sk_state != TCP_TIME_WAIT &&  			     !uid_eq(uid, sock_i_uid(sk2))))) { -				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); -				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || -				    sk2_rcv_saddr == sk_rcv_saddr(sk)) + +				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || +				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)  					break;  			}  			if (!relax && reuse && sk2->sk_reuse &&  			    sk2->sk_state != TCP_LISTEN) { -				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); -				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || -				    sk2_rcv_saddr == sk_rcv_saddr(sk)) +				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || +				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)  					break;  			}  		} @@ -116,13 +104,13 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)  		int remaining, rover, low, high;  again: -		inet_get_local_port_range(&low, &high); +		inet_get_local_port_range(net, 
&low, &high);  		remaining = (high - low) + 1; -		smallest_rover = rover = net_random() % remaining + low; +		smallest_rover = rover = prandom_u32() % remaining + low;  		smallest_size = -1;  		do { -			if (inet_is_reserved_local_port(rover)) +			if (inet_is_local_reserved_port(net, rover))  				goto next_nolock;  			head = &hashinfo->bhash[inet_bhashfn(net, rover,  					hashinfo->bhash_size)]; @@ -417,12 +405,12 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,  	struct net *net = sock_net(sk);  	int flags = inet_sk_flowi_flags(sk); -	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, +	flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,  			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,  			   sk->sk_protocol,  			   flags, -			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, -			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); +			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, +			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);  	security_req_classify_flow(req, flowi4_to_flowi(fl4));  	rt = ip_route_output_flow(net, fl4, sk);  	if (IS_ERR(rt)) @@ -454,11 +442,11 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,  	rcu_read_lock();  	opt = rcu_dereference(newinet->inet_opt); -	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, +	flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,  			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,  			   sk->sk_protocol, inet_sk_flowi_flags(sk), -			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, -			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); +			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, +			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);  	security_req_classify_flow(req, flowi4_to_flowi(fl4));  	rt = ip_route_output_flow(net, fl4, sk);  	if (IS_ERR(rt)) @@ -504,9 +492,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,  	     prev = &req->dl_next) {  		const struct inet_request_sock *ireq = inet_rsk(req); -		if (ireq->rmt_port == rport && -		    ireq->rmt_addr == raddr && -		    ireq->loc_addr == laddr && +		if (ireq->ir_rmt_port == rport && +		    ireq->ir_rmt_addr == raddr && +		    ireq->ir_loc_addr == laddr &&  		    AF_INET_FAMILY(req->rsk_ops->family)) {  			WARN_ON(req->sk);  			*prevp = prev; @@ -523,7 +511,8 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; -	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, +	const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, +				     inet_rsk(req)->ir_rmt_port,  				     lopt->hash_rnd, lopt->nr_table_entries);  	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); @@ -683,11 +672,13 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,  		newsk->sk_state = TCP_SYN_RECV;  		newicsk->icsk_bind_hash = NULL; -		inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; -		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); -		inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; +		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; +		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; +		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);  		newsk->sk_write_space = sk_stream_write_space; +		newsk->sk_mark = inet_rsk(req)->ir_mark; +  		newicsk->icsk_retransmits = 0;  		newicsk->icsk_backoff	  = 0;  		
newicsk->icsk_probes_out  = 0; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5f648751fce..e34dccbc4d7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -106,6 +106,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,  	r->id.idiag_sport = inet->inet_sport;  	r->id.idiag_dport = inet->inet_dport; + +	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); +	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); +  	r->id.idiag_src[0] = inet->inet_rcv_saddr;  	r->id.idiag_dst[0] = inet->inet_daddr; @@ -121,13 +125,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,  #if IS_ENABLED(CONFIG_IPV6)  	if (r->idiag_family == AF_INET6) { -		const struct ipv6_pinfo *np = inet6_sk(sk); -		*(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; -		*(struct in6_addr *)r->id.idiag_dst = np->daddr; +		*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; +		*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;  		if (ext & (1 << (INET_DIAG_TCLASS - 1))) -			if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0) +			if (nla_put_u8(skb, INET_DIAG_TCLASS, +				       inet6_sk(sk)->tclass) < 0)  				goto errout;  	}  #endif @@ -222,7 +226,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,  			       u32 portid, u32 seq, u16 nlmsg_flags,  			       const struct nlmsghdr *unlh)  { -	long tmo; +	s32 tmo;  	struct inet_diag_msg *r;  	struct nlmsghdr *nlh; @@ -234,32 +238,36 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,  	r = nlmsg_data(nlh);  	BUG_ON(tw->tw_state != TCP_TIME_WAIT); -	tmo = tw->tw_ttd - jiffies; +	tmo = tw->tw_ttd - inet_tw_time_stamp();  	if (tmo < 0)  		tmo = 0;  	r->idiag_family	      = tw->tw_family;  	r->idiag_retrans      = 0; +  	r->id.idiag_if	      = tw->tw_bound_dev_if;  	sock_diag_save_cookie(tw, r->id.idiag_cookie); +  	r->id.idiag_sport     = tw->tw_sport;  	r->id.idiag_dport     = tw->tw_dport; + +	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); +	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); +  	r->id.idiag_src[0]    = tw->tw_rcv_saddr;  	r->id.idiag_dst[0]    = tw->tw_daddr; +  	r->idiag_state	      = tw->tw_substate;  	r->idiag_timer	      = 3; -	r->idiag_expires      = DIV_ROUND_UP(tmo * 1000, HZ); +	r->idiag_expires      = jiffies_to_msecs(tmo);  	r->idiag_rqueue	      = 0;  	r->idiag_wqueue	      = 0;  	r->idiag_uid	      = 0;  	r->idiag_inode	      = 0;  #if IS_ENABLED(CONFIG_IPV6)  	if (tw->tw_family == AF_INET6) { -		const struct inet6_timewait_sock *tw6 = -						inet6_twsk((struct sock *)tw); - -		*(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr; -		*(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; +		*(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr; +		*(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;  	}  #endif @@ -273,10 +281,11 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,  			const struct nlmsghdr *unlh)  {  	if (sk->sk_state == TCP_TIME_WAIT) -		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, -					   skb, r, portid, seq, nlmsg_flags, -					   unlh); -	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); +		return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, +					   nlmsg_flags, unlh); + +	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, +				  nlmsg_flags, unlh);  }  int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, @@ -338,12 +347,9 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo 
*hashinfo, struct sk_buff *in_s  		err = 0;  out: -	if (sk) { -		if (sk->sk_state == TCP_TIME_WAIT) -			inet_twsk_put((struct inet_timewait_sock *)sk); -		else -			sock_put(sk); -	} +	if (sk) +		sock_gen_put(sk); +  out_nosk:  	return err;  } @@ -489,10 +495,9 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)  	entry.family = sk->sk_family;  #if IS_ENABLED(CONFIG_IPV6)  	if (entry.family == AF_INET6) { -		struct ipv6_pinfo *np = inet6_sk(sk); -		entry.saddr = np->rcv_saddr.s6_addr32; -		entry.daddr = np->daddr.s6_addr32; +		entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32; +		entry.daddr = sk->sk_v6_daddr.s6_addr32;  	} else  #endif  	{ @@ -635,22 +640,22 @@ static int inet_csk_diag_dump(struct sock *sk,  				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);  } -static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, +static int inet_twsk_diag_dump(struct sock *sk,  			       struct sk_buff *skb,  			       struct netlink_callback *cb,  			       struct inet_diag_req_v2 *r,  			       const struct nlattr *bc)  { +	struct inet_timewait_sock *tw = inet_twsk(sk); +  	if (bc != NULL) {  		struct inet_diag_entry entry;  		entry.family = tw->tw_family;  #if IS_ENABLED(CONFIG_IPV6)  		if (tw->tw_family == AF_INET6) { -			struct inet6_timewait_sock *tw6 = -						inet6_twsk((struct sock *)tw); -			entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32; -			entry.daddr = tw6->tw_v6_daddr.s6_addr32; +			entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32; +			entry.daddr = tw->tw_v6_daddr.s6_addr32;  		} else  #endif  		{ @@ -682,12 +687,12 @@ static inline void inet_diag_req_addrs(const struct sock *sk,  #if IS_ENABLED(CONFIG_IPV6)  	if (sk->sk_family == AF_INET6) {  		if (req->rsk_ops->family == AF_INET6) { -			entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32; -			entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32; +			entry->saddr = ireq->ir_v6_loc_addr.s6_addr32; +			entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;  		} else if (req->rsk_ops->family == AF_INET) { -			ipv6_addr_set_v4mapped(ireq->loc_addr, +			ipv6_addr_set_v4mapped(ireq->ir_loc_addr,  					       &entry->saddr_storage); -			ipv6_addr_set_v4mapped(ireq->rmt_addr, +			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,  					       &entry->daddr_storage);  			entry->saddr = entry->saddr_storage.s6_addr32;  			entry->daddr = entry->daddr_storage.s6_addr32; @@ -695,8 +700,8 @@ static inline void inet_diag_req_addrs(const struct sock *sk,  	} else  #endif  	{ -		entry->saddr = &ireq->loc_addr; -		entry->daddr = &ireq->rmt_addr; +		entry->saddr = &ireq->ir_loc_addr; +		entry->daddr = &ireq->ir_rmt_addr;  	}  } @@ -731,9 +736,14 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,  		tmo = 0;  	r->id.idiag_sport = inet->inet_sport; -	r->id.idiag_dport = ireq->rmt_port; -	r->id.idiag_src[0] = ireq->loc_addr; -	r->id.idiag_dst[0] = ireq->rmt_addr; +	r->id.idiag_dport = ireq->ir_rmt_port; + +	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); +	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + +	r->id.idiag_src[0] = ireq->ir_loc_addr; +	r->id.idiag_dst[0] = ireq->ir_rmt_addr; +  	r->idiag_expires = jiffies_to_msecs(tmo);  	r->idiag_rqueue = 0;  	r->idiag_wqueue = 0; @@ -792,13 +802,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,  			if (reqnum < s_reqnum)  				continue; -			if (r->id.idiag_dport != ireq->rmt_port && +			if (r->id.idiag_dport != ireq->ir_rmt_port &&  			    r->id.idiag_dport)  				continue;  			if (bc) {  				inet_diag_req_addrs(sk, req, &entry); -				entry.dport = ntohs(ireq->rmt_port); +	
			entry.dport = ntohs(ireq->ir_rmt_port);  				if (!inet_diag_bc_run(bc, &entry))  					continue; @@ -911,8 +921,7 @@ skip_listen_ht:  		num = 0; -		if (hlist_nulls_empty(&head->chain) && -			hlist_nulls_empty(&head->twchain)) +		if (hlist_nulls_empty(&head->chain))  			continue;  		if (i > s_i) @@ -920,24 +929,31 @@ skip_listen_ht:  		spin_lock_bh(lock);  		sk_nulls_for_each(sk, node, &head->chain) { -			struct inet_sock *inet = inet_sk(sk); +			int res; +			int state;  			if (!net_eq(sock_net(sk), net))  				continue;  			if (num < s_num)  				goto next_normal; -			if (!(r->idiag_states & (1 << sk->sk_state))) +			state = (sk->sk_state == TCP_TIME_WAIT) ? +				inet_twsk(sk)->tw_substate : sk->sk_state; +			if (!(r->idiag_states & (1 << state)))  				goto next_normal;  			if (r->sdiag_family != AF_UNSPEC && -					sk->sk_family != r->sdiag_family) +			    sk->sk_family != r->sdiag_family)  				goto next_normal; -			if (r->id.idiag_sport != inet->inet_sport && +			if (r->id.idiag_sport != htons(sk->sk_num) &&  			    r->id.idiag_sport)  				goto next_normal; -			if (r->id.idiag_dport != inet->inet_dport && +			if (r->id.idiag_dport != sk->sk_dport &&  			    r->id.idiag_dport)  				goto next_normal; -			if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { +			if (sk->sk_state == TCP_TIME_WAIT) +				res = inet_twsk_diag_dump(sk, skb, cb, r, bc); +			else +				res = inet_csk_diag_dump(sk, skb, cb, r, bc); +			if (res < 0) {  				spin_unlock_bh(lock);  				goto done;  			} @@ -945,33 +961,6 @@ next_normal:  			++num;  		} -		if (r->idiag_states & TCPF_TIME_WAIT) { -			struct inet_timewait_sock *tw; - -			inet_twsk_for_each(tw, node, -				    &head->twchain) { -				if (!net_eq(twsk_net(tw), net)) -					continue; - -				if (num < s_num) -					goto next_dying; -				if (r->sdiag_family != AF_UNSPEC && -						tw->tw_family != r->sdiag_family) -					goto next_dying; -				if (r->id.idiag_sport != tw->tw_sport && -				    r->id.idiag_sport) -					goto next_dying; -				if (r->id.idiag_dport != tw->tw_dport && -				    r->id.idiag_dport) -					goto next_dying; -				if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) { -					spin_unlock_bh(lock); -					goto done; -				} -next_dying: -				++num; -			} -		}  		spin_unlock_bh(lock);  	} diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c5313a9c019..3b01959bf4b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -93,9 +93,6 @@ void inet_frags_init(struct inet_frags *f)  	}  	rwlock_init(&f->lock); -	f->rnd = (u32) ((totalram_pages ^ (totalram_pages >> 7)) ^ -				   (jiffies ^ (jiffies >> 6))); -  	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,  			(unsigned long)f);  	f->secret_timer.expires = jiffies + f->secret_interval; @@ -211,7 +208,7 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)  	}  	work = frag_mem_limit(nf) - nf->low_thresh; -	while (work > 0) { +	while (work > 0 || force) {  		spin_lock(&nf->lru_lock);  		if (list_empty(&nf->lru_list)) { @@ -281,9 +278,10 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,  	atomic_inc(&qp->refcnt);  	hlist_add_head(&qp->list, &hb->chain); +	inet_frag_lru_add(nf, qp);  	spin_unlock(&hb->chain_lock);  	read_unlock(&f->lock); -	inet_frag_lru_add(nf, qp); +  	return qp;  } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 96da9c77dec..43116e8c8e1 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,6 +24,31 @@  #include <net/secure_seq.h>  #include <net/ip.h> 
+static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, +				 const __u16 lport, const __be32 faddr, +				 const __be16 fport) +{ +	static u32 inet_ehash_secret __read_mostly; + +	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); + +	return __inet_ehashfn(laddr, lport, faddr, fport, +			      inet_ehash_secret + net_hash_mix(net)); +} + + +static unsigned int inet_sk_ehashfn(const struct sock *sk) +{ +	const struct inet_sock *inet = inet_sk(sk); +	const __be32 laddr = inet->inet_rcv_saddr; +	const __u16 lport = inet->inet_num; +	const __be32 faddr = inet->inet_daddr; +	const __be16 fport = inet->inet_dport; +	struct net *net = sock_net(sk); + +	return inet_ehashfn(net, laddr, lport, faddr, fport); +} +  /*   * Allocate and initialize a new local port bind bucket.   * The bindhash mutex for snum's hash chain must be held here. @@ -230,13 +255,26 @@ begin:  }  EXPORT_SYMBOL_GPL(__inet_lookup_listener); +/* All sockets share common refcount, but have different destructors */ +void sock_gen_put(struct sock *sk) +{ +	if (!atomic_dec_and_test(&sk->sk_refcnt)) +		return; + +	if (sk->sk_state == TCP_TIME_WAIT) +		inet_twsk_free(inet_twsk(sk)); +	else +		sk_free(sk); +} +EXPORT_SYMBOL_GPL(sock_gen_put); +  struct sock *__inet_lookup_established(struct net *net,  				  struct inet_hashinfo *hashinfo,  				  const __be32 saddr, const __be16 sport,  				  const __be32 daddr, const u16 hnum,  				  const int dif)  { -	INET_ADDR_COOKIE(acookie, saddr, daddr) +	INET_ADDR_COOKIE(acookie, saddr, daddr);  	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);  	struct sock *sk;  	const struct hlist_nulls_node *node; @@ -255,13 +293,13 @@ begin:  		if (likely(INET_MATCH(sk, net, acookie,  				      saddr, daddr, ports, dif))) {  			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) -				goto begintw; +				goto out;  			if (unlikely(!INET_MATCH(sk, net, acookie,  						 saddr, daddr, ports, dif))) { -				sock_put(sk); +				sock_gen_put(sk);  				goto begin;  			} -			goto out; +			goto found;  		}  	}  	/* @@ -271,37 +309,9 @@ begin:  	 */  	if (get_nulls_value(node) != slot)  		goto begin; - -begintw: -	/* Must check for a TIME_WAIT'er before going to listener hash. */ -	sk_nulls_for_each_rcu(sk, node, &head->twchain) { -		if (sk->sk_hash != hash) -			continue; -		if (likely(INET_TW_MATCH(sk, net, acookie, -					 saddr, daddr, ports, -					 dif))) { -			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { -				sk = NULL; -				goto out; -			} -			if (unlikely(!INET_TW_MATCH(sk, net, acookie, -						    saddr, daddr, ports, -						    dif))) { -				inet_twsk_put(inet_twsk(sk)); -				goto begintw; -			} -			goto out; -		} -	} -	/* -	 * if the nulls value we got at the end of this lookup is -	 * not the expected one, we must restart lookup. -	 * We probably met an item that was moved to another chain. 
-	 */ -	if (get_nulls_value(node) != slot) -		goto begintw; -	sk = NULL;  out: +	sk = NULL; +found:  	rcu_read_unlock();  	return sk;  } @@ -317,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,  	__be32 daddr = inet->inet_rcv_saddr;  	__be32 saddr = inet->inet_daddr;  	int dif = sk->sk_bound_dev_if; -	INET_ADDR_COOKIE(acookie, saddr, daddr) +	INET_ADDR_COOKIE(acookie, saddr, daddr);  	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);  	struct net *net = sock_net(sk);  	unsigned int hash = inet_ehashfn(net, daddr, lport, @@ -326,39 +336,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,  	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);  	struct sock *sk2;  	const struct hlist_nulls_node *node; -	struct inet_timewait_sock *tw; +	struct inet_timewait_sock *tw = NULL;  	int twrefcnt = 0;  	spin_lock(lock); -	/* Check TIME-WAIT sockets first. */ -	sk_nulls_for_each(sk2, node, &head->twchain) { -		if (sk2->sk_hash != hash) -			continue; - -		if (likely(INET_TW_MATCH(sk2, net, acookie, -					 saddr, daddr, ports, dif))) { -			tw = inet_twsk(sk2); -			if (twsk_unique(sk, sk2, twp)) -				goto unique; -			else -				goto not_unique; -		} -	} -	tw = NULL; - -	/* And established part... */  	sk_nulls_for_each(sk2, node, &head->chain) {  		if (sk2->sk_hash != hash)  			continue; +  		if (likely(INET_MATCH(sk2, net, acookie, -				      saddr, daddr, ports, dif))) +					 saddr, daddr, ports, dif))) { +			if (sk2->sk_state == TCP_TIME_WAIT) { +				tw = inet_twsk(sk2); +				if (twsk_unique(sk, sk2, twp)) +					break; +			}  			goto not_unique; +		}  	} -unique:  	/* Must record num and sport now. Otherwise we will see -	 * in hash table socket with a funny identity. */ +	 * in hash table socket with a funny identity. 
+	 */  	inet->inet_num = lport;  	inet->inet_sport = htons(lport);  	sk->sk_hash = hash; @@ -494,13 +494,13 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,  		u32 offset = hint + port_offset;  		struct inet_timewait_sock *tw = NULL; -		inet_get_local_port_range(&low, &high); +		inet_get_local_port_range(net, &low, &high);  		remaining = (high - low) + 1;  		local_bh_disable();  		for (i = 1; i <= remaining; i++) {  			port = low + (i + offset) % remaining; -			if (inet_is_reserved_local_port(port)) +			if (inet_is_local_reserved_port(net, port))  				continue;  			head = &hinfo->bhash[inet_bhashfn(net, port,  					hinfo->bhash_size)]; diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 1975f52933c..f17ea49b28f 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -230,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,  	lro_desc->last_skb = skb;  } -static void lro_add_frags(struct net_lro_desc *lro_desc, -			  int len, int hlen, int truesize, -			  struct skb_frag_struct *skb_frags, -			  struct iphdr *iph, struct tcphdr *tcph) -{ -	struct sk_buff *skb = lro_desc->parent; -	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - -	lro_add_common(lro_desc, iph, tcph, tcp_data_len); - -	skb->truesize += truesize; - -	skb_frags[0].page_offset += hlen; -	skb_frag_size_sub(&skb_frags[0], hlen); - -	while (tcp_data_len > 0) { -		*(lro_desc->next_frag) = *skb_frags; -		tcp_data_len -= skb_frag_size(skb_frags); -		lro_desc->next_frag++; -		skb_frags++; -		skb_shinfo(skb)->nr_frags++; -	} -}  static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,  			      struct iphdr *iph, @@ -371,128 +348,6 @@ out:  	return 1;  } - -static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, -				   struct skb_frag_struct *frags, -				   int len, int true_size, -				   void *mac_hdr, -				   int hlen, __wsum sum, -				   u32 ip_summed) -{ -	struct sk_buff *skb; -	struct skb_frag_struct *skb_frags; -	int data_len = len; -	int hdr_len = min(len, hlen); - -	skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); -	if (!skb) -		return NULL; - -	skb_reserve(skb, lro_mgr->frag_align_pad); -	skb->len = len; -	skb->data_len = len - hdr_len; -	skb->truesize += true_size; -	skb->tail += hdr_len; - -	memcpy(skb->data, mac_hdr, hdr_len); - -	skb_frags = skb_shinfo(skb)->frags; -	while (data_len > 0) { -		*skb_frags = *frags; -		data_len -= skb_frag_size(frags); -		skb_frags++; -		frags++; -		skb_shinfo(skb)->nr_frags++; -	} - -	skb_shinfo(skb)->frags[0].page_offset += hdr_len; -	skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); - -	skb->ip_summed = ip_summed; -	skb->csum = sum; -	skb->protocol = eth_type_trans(skb, lro_mgr->dev); -	return skb; -} - -static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, -					  struct skb_frag_struct *frags, -					  int len, int true_size, -					  void *priv, __wsum sum) -{ -	struct net_lro_desc *lro_desc; -	struct iphdr *iph; -	struct tcphdr *tcph; -	struct sk_buff *skb; -	u64 flags; -	void *mac_hdr; -	int mac_hdr_len; -	int hdr_len = LRO_MAX_PG_HLEN; -	int vlan_hdr_len = 0; - -	if (!lro_mgr->get_frag_header || -	    lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, -				     (void *)&tcph, &flags, priv)) { -		mac_hdr = skb_frag_address(frags); -		goto out1; -	} - -	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) -		goto out1; - -	hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); -	mac_hdr_len = (int)((void *)(iph) - mac_hdr); - -	lro_desc = 
lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); -	if (!lro_desc) -		goto out1; - -	if (!lro_desc->active) { /* start new lro session */ -		if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) -			goto out1; - -		skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, -				  hdr_len, 0, lro_mgr->ip_summed_aggr); -		if (!skb) -			goto out; - -		if ((skb->protocol == htons(ETH_P_8021Q)) && -		    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) -			vlan_hdr_len = VLAN_HLEN; - -		iph = (void *)(skb->data + vlan_hdr_len); -		tcph = (void *)((u8 *)skb->data + vlan_hdr_len -				+ IP_HDR_LEN(iph)); - -		lro_init_desc(lro_desc, skb, iph, tcph); -		LRO_INC_STATS(lro_mgr, aggregated); -		return NULL; -	} - -	if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) -		goto out2; - -	if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) -		goto out2; - -	lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); -	LRO_INC_STATS(lro_mgr, aggregated); - -	if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || -	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) -		lro_flush(lro_mgr, lro_desc); - -	return NULL; - -out2: /* send aggregated packets to the stack */ -	lro_flush(lro_mgr, lro_desc); - -out1:  /* Original packet has to be posted to the stack */ -	skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, -			  hdr_len, sum, lro_mgr->ip_summed); -out: -	return skb; -} -  void lro_receive_skb(struct net_lro_mgr *lro_mgr,  		     struct sk_buff *skb,  		     void *priv) @@ -506,23 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,  }  EXPORT_SYMBOL(lro_receive_skb); -void lro_receive_frags(struct net_lro_mgr *lro_mgr, -		       struct skb_frag_struct *frags, -		       int len, int true_size, void *priv, __wsum sum) -{ -	struct sk_buff *skb; - -	skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum); -	if (!skb) -		return; - -	if (lro_mgr->features & LRO_F_NAPI) -		netif_receive_skb(skb); -	else -		netif_rx(skb); -} -EXPORT_SYMBOL(lro_receive_frags); -  void lro_flush_all(struct net_lro_mgr *lro_mgr)  {  	int i; @@ -534,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr)  	}  }  EXPORT_SYMBOL(lro_flush_all); - -void lro_flush_pkt(struct net_lro_mgr *lro_mgr, -		  struct iphdr *iph, struct tcphdr *tcph) -{ -	struct net_lro_desc *lro_desc; - -	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); -	if (lro_desc->active) -		lro_flush(lro_mgr, lro_desc); -} -EXPORT_SYMBOL(lro_flush_pkt); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f27c9f4afd..6d592f8555f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -87,19 +87,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,  	refcnt += inet_twsk_bind_unhash(tw, hashinfo);  	spin_unlock(&bhead->lock); -#ifdef SOCK_REFCNT_DEBUG -	if (atomic_read(&tw->tw_refcnt) != 1) { -		pr_debug("%s timewait_sock %p refcnt=%d\n", -			 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); -	} -#endif -	while (refcnt) { -		inet_twsk_put(tw); -		refcnt--; -	} +	BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); +	atomic_sub(refcnt, &tw->tw_refcnt);  } -static noinline void inet_twsk_free(struct inet_timewait_sock *tw) +void inet_twsk_free(struct inet_timewait_sock *tw)  {  	struct module *owner = tw->tw_prot->owner;  	twsk_destructor((struct sock *)tw); @@ -118,6 +110,18 @@ void inet_twsk_put(struct inet_timewait_sock *tw)  }  EXPORT_SYMBOL_GPL(inet_twsk_put); +static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, +	
			   struct hlist_nulls_head *list) +{ +	hlist_nulls_add_head_rcu(&tw->tw_node, list); +} + +static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, +				    struct hlist_head *list) +{ +	hlist_add_head(&tw->tw_bind_node, list); +} +  /*   * Enter the time wait state. This is called with locally disabled BH.   * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -146,26 +150,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,  	spin_lock(lock);  	/* -	 * Step 2: Hash TW into TIMEWAIT chain. -	 * Should be done before removing sk from established chain -	 * because readers are lockless and search established first. +	 * Step 2: Hash TW into tcp ehash chain. +	 * Notes : +	 * - tw_refcnt is set to 3 because : +	 * - We have one reference from bhash chain. +	 * - We have one reference from ehash chain. +	 * We can use atomic_set() because prior spin_lock()/spin_unlock() +	 * committed into memory all tw fields.  	 */ -	inet_twsk_add_node_rcu(tw, &ehead->twchain); +	atomic_set(&tw->tw_refcnt, 1 + 1 + 1); +	inet_twsk_add_node_rcu(tw, &ehead->chain); -	/* Step 3: Remove SK from established hash. */ +	/* Step 3: Remove SK from hash chain */  	if (__sk_nulls_del_node_init_rcu(sk))  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -	/* -	 * Notes : -	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc() -	 * - We add one reference for the bhash link -	 * - We add one reference for the ehash link -	 * - We want this refcnt update done before allowing other -	 *   threads to find this tw in ehash chain. -	 */ -	atomic_add(1 + 1 + 1, &tw->tw_refcnt); -  	spin_unlock(lock);  }  EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); @@ -387,11 +386,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,  			if (slot >= INET_TWDR_TWKILL_SLOTS)  				slot = INET_TWDR_TWKILL_SLOTS - 1;  		} -		tw->tw_ttd = jiffies + timeo; +		tw->tw_ttd = inet_tw_time_stamp() + timeo;  		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);  		list = &twdr->cells[slot];  	} else { -		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); +		tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);  		if (twdr->twcal_hand < 0) {  			twdr->twcal_hand = 0; @@ -490,7 +489,9 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,  restart_rcu:  		rcu_read_lock();  restart: -		sk_nulls_for_each_rcu(sk, node, &head->twchain) { +		sk_nulls_for_each_rcu(sk, node, &head->chain) { +			if (sk->sk_state != TCP_TIME_WAIT) +				continue;  			tw = inet_twsk(sk);  			if ((tw->tw_family != family) ||  				atomic_read(&twsk_net(tw)->count)) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 33d5537881e..bd5f5928167 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -26,20 +26,7 @@   *  Theory of operations.   *  We keep one entry for each peer IP address.  The nodes contains long-living   *  information about the peer which doesn't depend on routes. - *  At this moment this information consists only of ID field for the next - *  outgoing IP packet.  This field is incremented with each packet as encoded - *  in inet_getid() function (include/net/inetpeer.h). - *  At the moment of writing this notes identifier of IP packets is generated - *  to be unpredictable using this code only for packets subjected - *  (actually or potentially) to defragmentation.  I.e. DF packets less than - *  PMTU in size when local fragmentation is disabled use a constant ID and do - *  not use this code (see ip_select_ident() in include/net/ip.h).   
* - *  Route cache entries hold references to our nodes. - *  New cache entries get references via lookup by destination IP address in - *  the avl tree.  The reference is grabbed only when it's needed i.e. only - *  when we try to output IP packet which needs an unpredictable ID (see - *  __ip_select_ident() in net/ipv4/route.c).   *  Nodes are removed only when reference counter goes to 0.   *  When it's happened the node may be removed when a sufficient amount of   *  time has been passed since its last use.  The less-recently-used entry can @@ -62,7 +49,6 @@   *		refcnt: atomically against modifications on other CPU;   *		   usually under some other lock to prevent node disappearing   *		daddr: unchangeable - *		ip_id_count: atomic value (no lock needed)   */  static struct kmem_cache *peer_cachep __read_mostly; @@ -109,13 +95,6 @@ static inline void flush_check(struct inet_peer_base *base, int family)  	}  } -void inetpeer_invalidate_family(int family) -{ -	atomic_t *fp = inetpeer_seq_ptr(family); - -	atomic_inc(fp); -} -  #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */  /* Exported for sysctl_net_ipv4.  */ @@ -127,7 +106,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min  static void inetpeer_gc_worker(struct work_struct *work)  {  	struct inet_peer *p, *n, *c; -	LIST_HEAD(list); +	struct list_head list;  	spin_lock_bh(&gc_lock);  	list_replace_init(&gc_list, &list); @@ -227,7 +206,7 @@ static int addr_compare(const struct inetpeer_addr *a,  	stackptr = _stack;					\  	*stackptr++ = &_base->root;				\  	for (u = rcu_deref_locked(_base->root, _base);		\ -	     u != peer_avl_empty; ) {				\ +	     u != peer_avl_empty;) {				\  		int cmp = addr_compare(_daddr, &u->daddr);	\  		if (cmp == 0)					\  			break;					\ @@ -282,7 +261,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,  	*stackptr++ = &start->avl_left;				\  	v = &start->avl_left;					\  	for (u = rcu_deref_locked(*v, base);			\ -	     u->avl_right != peer_avl_empty_rcu; ) {		\ +	     u->avl_right != peer_avl_empty_rcu;) {		\  		v = &u->avl_right;				\  		*stackptr++ = v;				\  		u = rcu_deref_locked(*v, base);			\ @@ -504,10 +483,6 @@ relookup:  		p->daddr = *daddr;  		atomic_set(&p->refcnt, 1);  		atomic_set(&p->rid, 0); -		atomic_set(&p->ip_id_count, -				(daddr->family == AF_INET) ? 
-					secure_ip_id(daddr->addr.a4) : -					secure_ipv6_id(daddr->addr.a6));  		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;  		p->rate_tokens = 0;  		/* 60*HZ is arbitrary, but chosen enough high so that the first @@ -529,7 +504,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer);  void inet_putpeer(struct inet_peer *p)  {  	p->dtime = (__u32)jiffies; -	smp_mb__before_atomic_dec(); +	smp_mb__before_atomic();  	atomic_dec(&p->refcnt);  }  EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 694de3b7aeb..3a83ce5efa8 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,24 @@  #include <net/route.h>  #include <net/xfrm.h> +static bool ip_may_fragment(const struct sk_buff *skb) +{ +	return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || +		skb->ignore_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ +	if (skb->len <= mtu) +		return false; + +	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) +		return false; + +	return true; +} + +  static int ip_forward_finish(struct sk_buff *skb)  {  	struct ip_options *opt	= &(IPCB(skb)->opt); @@ -54,10 +72,15 @@ static int ip_forward_finish(struct sk_buff *skb)  int ip_forward(struct sk_buff *skb)  { +	u32 mtu;  	struct iphdr *iph;	/* Our header */  	struct rtable *rt;	/* Route we use */  	struct ip_options *opt	= &(IPCB(skb)->opt); +	/* that should never happen */ +	if (skb->pkt_type != PACKET_HOST) +		goto drop; +  	if (skb_warn_if_lro(skb))  		goto drop; @@ -67,9 +90,6 @@ int ip_forward(struct sk_buff *skb)  	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))  		return NET_RX_SUCCESS; -	if (skb->pkt_type != PACKET_HOST) -		goto drop; -  	skb_forward_csum(skb);  	/* @@ -88,11 +108,12 @@ int ip_forward(struct sk_buff *skb)  	if (opt->is_strictroute && rt->rt_uses_gateway)  		goto sr_failed; -	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && -		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { +	IPCB(skb)->flags |= IPSKB_FORWARDED; +	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); +	if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {  		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);  		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, -			  htonl(dst_mtu(&rt->dst))); +			  htonl(mtu));  		goto drop;  	} diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b66910aaef4..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -106,6 +106,7 @@ struct ip4_create_arg {  static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)  { +	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));  	return jhash_3words((__force u32)id << 16 | prot,  			    (__force u32)saddr, (__force u32)daddr,  			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); @@ -231,8 +232,9 @@ static void ip_expire(unsigned long arg)  		 * "Fragment Reassembly Timeout" message, per RFC792.  		 
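Editor's note (not part of the diff): ip_forward() above now marks the skb IPSKB_FORWARDED, computes the forwarding MTU with ip_dst_mtu_maybe_forward() and splits the old oversize test into ip_may_fragment()/ip_exceeds_mtu(), so a GSO packet whose individual segments fit the outgoing MTU is no longer bounced with ICMP_FRAG_NEEDED. A self-contained sketch of the same decision follows; struct pkt and its fields are invented stand-ins for the sk_buff state.

/* Illustrative sketch (not kernel code): the forwarding MTU decision. */
#include <stdbool.h>
#include <stdio.h>

struct pkt {
	unsigned int len;		/* total packet length */
	bool df;			/* IP_DF set in the header */
	bool ignore_df;			/* local override (was local_df) */
	bool gso;			/* GSO/TSO packet */
	unsigned int gso_seglen;	/* network-layer length of one segment */
};

static bool pkt_may_fragment(const struct pkt *p)
{
	return !p->df || p->ignore_df;
}

static bool pkt_exceeds_mtu(const struct pkt *p, unsigned int mtu)
{
	if (p->len <= mtu)
		return false;
	/* A GSO packet is resegmented before transmission, so only the
	 * per-segment length has to fit the outgoing MTU. */
	if (p->gso && p->gso_seglen <= mtu)
		return false;
	return true;
}

int main(void)
{
	struct pkt gro = { .len = 4000, .df = true, .gso = true, .gso_seglen = 1460 };
	unsigned int mtu = 1500;

	if (!pkt_may_fragment(&gro) && pkt_exceeds_mtu(&gro, mtu))
		puts("would send ICMP_FRAG_NEEDED");
	else
		puts("forwarded (segments fit the MTU)");
	return 0;
}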
*/  		if (qp->user == IP_DEFRAG_AF_PACKET || -		    (qp->user == IP_DEFRAG_CONNTRACK_IN && -		     skb_rtable(head)->rt_type != RTN_LOCAL)) +		    ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && +		     (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && +		     (skb_rtable(head)->rt_type != RTN_LOCAL)))  			goto out_rcu_unlock; @@ -703,7 +705,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)  			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));  			if (ip_defrag(skb, user))  				return NULL; -			skb->rxhash = 0; +			skb_clear_hash(skb);  		}  	}  	return skb; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d7aea4c5b94..9b842544aea 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -178,7 +178,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info,  	else  		itn = net_generic(net, ipgre_net_id); -	iph = (const struct iphdr *)skb->data; +	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);  	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,  			     iph->daddr, iph->saddr, tpi->key); @@ -217,6 +217,7 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)  				  iph->saddr, iph->daddr, tpi->key);  	if (tunnel) { +		skb_pop_mac_header(skb);  		ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);  		return PACKET_RCVD;  	} @@ -277,7 +278,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,  	return NETDEV_TX_OK;  free_skb: -	dev_kfree_skb(skb); +	kfree_skb(skb);  out:  	dev->stats.tx_dropped++;  	return NETDEV_TX_OK; @@ -300,7 +301,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,  	return NETDEV_TX_OK;  free_skb: -	dev_kfree_skb(skb); +	kfree_skb(skb);  out:  	dev->stats.tx_dropped++;  	return NETDEV_TX_OK; @@ -409,7 +410,7 @@ static int ipgre_open(struct net_device *dev)  		struct flowi4 fl4;  		struct rtable *rt; -		rt = ip_route_output_gre(dev_net(dev), &fl4, +		rt = ip_route_output_gre(t->net, &fl4,  					 t->parms.iph.daddr,  					 t->parms.iph.saddr,  					 t->parms.o_key, @@ -433,7 +434,7 @@ static int ipgre_close(struct net_device *dev)  	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {  		struct in_device *in_dev; -		in_dev = inetdev_by_index(dev_net(dev), t->mlink); +		in_dev = inetdev_by_index(t->net, t->mlink);  		if (in_dev)  			ip_mc_dec_group(in_dev, t->parms.iph.daddr);  	} @@ -462,6 +463,7 @@ static const struct net_device_ops ipgre_netdev_ops = {  static void ipgre_tunnel_setup(struct net_device *dev)  {  	dev->netdev_ops		= &ipgre_netdev_ops; +	dev->type		= ARPHRD_IPGRE;  	ip_tunnel_setup(dev, ipgre_net_id);  } @@ -476,7 +478,7 @@ static void __gre_tunnel_init(struct net_device *dev)  	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;  	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4; -	dev->features		|= NETIF_F_NETNS_LOCAL | GRE_FEATURES; +	dev->features		|= GRE_FEATURES;  	dev->hw_features	|= GRE_FEATURES;  	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { @@ -500,7 +502,6 @@ static int ipgre_tunnel_init(struct net_device *dev)  	memcpy(dev->dev_addr, &iph->saddr, 4);  	memcpy(dev->broadcast, &iph->daddr, 4); -	dev->type		= ARPHRD_IPGRE;  	dev->flags		= IFF_NOARP;  	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;  	dev->addr_len		= 4; @@ -648,6 +649,7 @@ static void ipgre_tap_setup(struct net_device *dev)  {  	ether_setup(dev);  	dev->netdev_ops		= &gre_tap_netdev_ops; +	dev->priv_flags 	|= IFF_LIVE_ADDR_CHANGE;  	ip_tunnel_setup(dev, gre_tap_net_id);  } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 054a3e97d82..3d4da2c16b6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ 
-314,7 +314,7 @@ static int ip_rcv_finish(struct sk_buff *skb)  	const struct iphdr *iph = ip_hdr(skb);  	struct rtable *rt; -	if (sysctl_ip_early_demux && !skb_dst(skb)) { +	if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {  		const struct net_protocol *ipprot;  		int protocol = iph->protocol; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec7264514a8..ad382499bac 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -167,7 +167,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)  		soffset -= 4;  		if (soffset > 3) {  			memcpy(&faddr, &start[soffset-1], 4); -			for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) +			for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)  				memcpy(&dptr[doffset-1], &start[soffset-1], 4);  			/*  			 * RFC1812 requires to fix illegal source routes. @@ -227,7 +227,7 @@ void ip_options_fragment(struct sk_buff *skb)  			continue;  		}  		optlen = optptr[1]; -		if (optlen<2 || optlen>l) +		if (optlen < 2 || optlen > l)  		  return;  		if (!IPOPT_COPIED(*optptr))  			memset(optptr, IPOPT_NOOP, optlen); @@ -275,27 +275,31 @@ int ip_options_compile(struct net *net,  	for (l = opt->optlen; l > 0; ) {  		switch (*optptr) { -		      case IPOPT_END: -			for (optptr++, l--; l>0; optptr++, l--) { +		case IPOPT_END: +			for (optptr++, l--; l > 0; optptr++, l--) {  				if (*optptr != IPOPT_END) {  					*optptr = IPOPT_END;  					opt->is_changed = 1;  				}  			}  			goto eol; -		      case IPOPT_NOOP: +		case IPOPT_NOOP:  			l--;  			optptr++;  			continue;  		} +		if (unlikely(l < 2)) { +			pp_ptr = optptr; +			goto error; +		}  		optlen = optptr[1]; -		if (optlen<2 || optlen>l) { +		if (optlen < 2 || optlen > l) {  			pp_ptr = optptr;  			goto error;  		}  		switch (*optptr) { -		      case IPOPT_SSRR: -		      case IPOPT_LSRR: +		case IPOPT_SSRR: +		case IPOPT_LSRR:  			if (optlen < 3) {  				pp_ptr = optptr + 1;  				goto error; @@ -321,7 +325,7 @@ int ip_options_compile(struct net *net,  			opt->is_strictroute = (optptr[0] == IPOPT_SSRR);  			opt->srr = optptr - iph;  			break; -		      case IPOPT_RR: +		case IPOPT_RR:  			if (opt->rr) {  				pp_ptr = optptr;  				goto error; @@ -349,7 +353,7 @@ int ip_options_compile(struct net *net,  			}  			opt->rr = optptr - iph;  			break; -		      case IPOPT_TIMESTAMP: +		case IPOPT_TIMESTAMP:  			if (opt->ts) {  				pp_ptr = optptr;  				goto error; @@ -364,19 +368,19 @@ int ip_options_compile(struct net *net,  			}  			if (optptr[2] <= optlen) {  				unsigned char *timeptr = NULL; -				if (optptr[2]+3 > optptr[1]) { +				if (optptr[2]+3 > optlen) {  					pp_ptr = optptr + 2;  					goto error;  				}  				switch (optptr[3]&0xF) { -				      case IPOPT_TS_TSONLY: +				case IPOPT_TS_TSONLY:  					if (skb)  						timeptr = &optptr[optptr[2]-1];  					opt->ts_needtime = 1;  					optptr[2] += 4;  					break; -				      case IPOPT_TS_TSANDADDR: -					if (optptr[2]+7 > optptr[1]) { +				case IPOPT_TS_TSANDADDR: +					if (optptr[2]+7 > optlen) {  						pp_ptr = optptr + 2;  						goto error;  					} @@ -389,8 +393,8 @@ int ip_options_compile(struct net *net,  					opt->ts_needtime = 1;  					optptr[2] += 8;  					break; -				      case IPOPT_TS_PRESPEC: -					if (optptr[2]+7 > optptr[1]) { +				case IPOPT_TS_PRESPEC: +					if (optptr[2]+7 > optlen) {  						pp_ptr = optptr + 2;  						goto error;  					} @@ -405,7 +409,7 @@ int ip_options_compile(struct net *net,  					opt->ts_needtime = 1;  					optptr[2] += 8;  					break; -				      
default: +				default:  					if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {  						pp_ptr = optptr + 3;  						goto error; @@ -433,7 +437,7 @@ int ip_options_compile(struct net *net,  			}  			opt->ts = optptr - iph;  			break; -		      case IPOPT_RA: +		case IPOPT_RA:  			if (optlen < 4) {  				pp_ptr = optptr + 1;  				goto error; @@ -441,7 +445,7 @@ int ip_options_compile(struct net *net,  			if (optptr[2] == 0 && optptr[3] == 0)  				opt->router_alert = optptr - iph;  			break; -		      case IPOPT_CIPSO: +		case IPOPT_CIPSO:  			if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {  				pp_ptr = optptr;  				goto error; @@ -452,9 +456,9 @@ int ip_options_compile(struct net *net,  				goto error;  			}  			break; -		      case IPOPT_SEC: -		      case IPOPT_SID: -		      default: +		case IPOPT_SEC: +		case IPOPT_SID: +		default:  			if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {  				pp_ptr = optptr;  				goto error; @@ -572,7 +576,7 @@ void ip_forward_options(struct sk_buff *skb)  		optptr = raw + opt->srr; -		for ( srrptr=optptr[2], srrspace = optptr[1]; +		for ( srrptr = optptr[2], srrspace = optptr[1];  		     srrptr <= srrspace;  		     srrptr += 4  		     ) { @@ -628,7 +632,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)  	if (rt->rt_type != RTN_LOCAL)  		return -EINVAL; -	for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { +	for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {  		if (srrptr + 3 > srrspace) {  			icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));  			return -EINVAL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a04d872c54f..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb)  		       skb_dst(skb)->dev, dst_output);  } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)  {  	int err;  	err = __ip_local_out(skb);  	if (likely(err == 1)) -		err = dst_output(skb); +		err = dst_output_sk(sk, skb);  	return err;  } -EXPORT_SYMBOL_GPL(ip_local_out); +EXPORT_SYMBOL_GPL(ip_local_out_sk);  static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)  { @@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,  	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);  	iph->saddr    = saddr;  	iph->protocol = sk->sk_protocol; -	ip_select_ident(skb, &rt->dst, sk); +	ip_select_ident(skb, sk);  	if (opt && opt->opt.optlen) {  		iph->ihl += opt->opt.optlen>>2; @@ -211,6 +211,48 @@ static inline int ip_finish_output2(struct sk_buff *skb)  	return -EINVAL;  } +static int ip_finish_output_gso(struct sk_buff *skb) +{ +	netdev_features_t features; +	struct sk_buff *segs; +	int ret = 0; + +	/* common case: locally created skb or seglen is <= mtu */ +	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || +	      skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) +		return ip_finish_output2(skb); + +	/* Slowpath -  GSO segment length is exceeding the dst MTU. +	 * +	 * This can happen in two cases: +	 * 1) TCP GRO packet, DF bit not set +	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly +	 * from host network stack. 
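Editor's note (not part of the diff): the ip_options_compile() hunks above add a guard so the option-length byte is never read when fewer than two bytes of options remain, and check the timestamp offsets against the validated optlen. A minimal sketch of that defensive walk over an IPv4 options block follows; the constants and the walk_options() helper are invented for the example.

/* Illustrative sketch (not kernel code): walking IPv4 options with the
 * guards used above - never read the length byte unless two bytes
 * remain, and reject lengths below 2 or past the remaining space. */
#include <stdio.h>

#define OPT_END		0
#define OPT_NOOP	1

static int walk_options(const unsigned char *opt, int l)
{
	while (l > 0) {
		int optlen;

		if (*opt == OPT_END)
			return 0;
		if (*opt == OPT_NOOP) {
			opt++;
			l--;
			continue;
		}
		if (l < 2)		/* no room for the length byte */
			return -1;
		optlen = opt[1];
		if (optlen < 2 || optlen > l)
			return -1;	/* malformed option */
		printf("option type %u, length %d\n", *opt, optlen);
		opt += optlen;
		l -= optlen;
	}
	return 0;
}

int main(void)
{
	/* NOOP, a record-route option (type 7) of length 7, then END. */
	unsigned char good[] = { 1, 7, 7, 4, 0, 0, 0, 0, 0 };
	unsigned char bad[]  = { 1, 68 };	/* type byte, no length byte */

	printf("good: %d\n", walk_options(good, (int)sizeof(good)));
	printf("bad:  %d\n", walk_options(bad, (int)sizeof(bad)));
	return 0;
}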
+	 */ +	features = netif_skb_features(skb); +	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); +	if (IS_ERR(segs)) { +		kfree_skb(skb); +		return -ENOMEM; +	} + +	consume_skb(skb); + +	do { +		struct sk_buff *nskb = segs->next; +		int err; + +		segs->next = NULL; +		err = ip_fragment(segs, ip_finish_output2); + +		if (err && ret == 0) +			ret = err; +		segs = nskb; +	} while (segs); + +	return ret; +} +  static int ip_finish_output(struct sk_buff *skb)  {  #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) @@ -220,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb)  		return dst_output(skb);  	}  #endif -	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) +	if (skb_is_gso(skb)) +		return ip_finish_output_gso(skb); + +	if (skb->len > ip_skb_dst_mtu(skb))  		return ip_fragment(skb, ip_finish_output2); -	else -		return ip_finish_output2(skb); + +	return ip_finish_output2(skb);  } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb)  { -	struct sock *sk = skb->sk;  	struct rtable *rt = skb_rtable(skb);  	struct net_device *dev = rt->dst.dev; @@ -287,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb)  			    !(IPCB(skb)->flags & IPSKB_REROUTED));  } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb)  {  	struct net_device *dev = skb_dst(skb)->dev; @@ -315,9 +359,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)  	       sizeof(fl4->saddr) + sizeof(fl4->daddr));  } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)  { -	struct sock *sk = skb->sk;  	struct inet_sock *inet = inet_sk(sk);  	struct ip_options_rcu *inet_opt;  	struct flowi4 *fl4; @@ -371,7 +415,7 @@ packet_routed:  	skb_reset_network_header(skb);  	iph = ip_hdr(skb);  	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); -	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) +	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)  		iph->frag_off = htons(IP_DF);  	else  		iph->frag_off = 0; @@ -386,9 +430,9 @@ packet_routed:  		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);  	} -	ip_select_ident_more(skb, &rt->dst, sk, -			     (skb_shinfo(skb)->gso_segs ?: 1) - 1); +	ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); +	/* TODO : should we use skb->sk here instead of sk ? 
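Editor's note (not part of the diff): in ip_queue_xmit() above, ip_select_ident_more(..., gso_segs - 1) becomes ip_select_ident_segs(..., gso_segs), i.e. the IP identification source is told how many on-the-wire packets the GSO skb will become so it can reserve that many IDs (iptunnel_xmit() later in the diff gets the same treatment). A sketch of the reserve-a-block idea with a bare atomic counter follows; the real helper derives the counter from per-destination state, which is omitted, and ip_id_next/ip_select_ident_block are invented names.

/* Illustrative sketch (not kernel code): reserving a contiguous block
 * of IP IDs for a GSO packet that will turn into 'segs' wire packets. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint ip_id_next;

/* Returns the ID for the first segment; the GSO layer will use
 * id, id+1, ..., id+segs-1 for the generated segments. */
static uint16_t ip_select_ident_block(unsigned int segs)
{
	return (uint16_t)atomic_fetch_add(&ip_id_next, segs);
}

int main(void)
{
	uint16_t first = ip_select_ident_block(3);	/* 3-segment GSO skb */
	uint16_t next  = ip_select_ident_block(1);	/* following packet */

	printf("GSO skb uses IDs %u..%u, next packet gets %u\n",
	       (unsigned)first, (unsigned)(first + 2), (unsigned)next);
	return 0;
}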
*/  	skb->priority = sk->sk_priority;  	skb->mark = sk->sk_mark; @@ -422,9 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)  	to->tc_index = from->tc_index;  #endif  	nf_copy(to, from); -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) -	to->nf_trace = from->nf_trace; -#endif  #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)  	to->ipvs_property = from->ipvs_property;  #endif @@ -458,12 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))  	iph = ip_hdr(skb); -	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || +	mtu = ip_skb_dst_mtu(skb); +	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||  		     (IPCB(skb)->frag_max_size && -		      IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { +		      IPCB(skb)->frag_max_size > mtu))) {  		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);  		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, -			  htonl(ip_skb_dst_mtu(skb))); +			  htonl(mtu));  		kfree_skb(skb);  		return -EMSGSIZE;  	} @@ -473,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))  	 */  	hlen = iph->ihl * 4; -	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */ +	mtu = mtu - hlen;	/* Size of data space */  #ifdef CONFIG_BRIDGE_NETFILTER  	if (skb->nf_bridge)  		mtu -= nf_bridge_mtu_reduction(skb); @@ -772,15 +814,20 @@ static inline int ip_ufo_append_data(struct sock *sk,  		/* initialize protocol header pointer */  		skb->transport_header = skb->network_header + fragheaderlen; -		skb->ip_summed = CHECKSUM_PARTIAL;  		skb->csum = 0; -		/* specify the length of each IP datagram fragment */ -		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; -		skb_shinfo(skb)->gso_type = SKB_GSO_UDP; +  		__skb_queue_tail(queue, skb); +	} else if (skb_is_gso(skb)) { +		goto append;  	} +	skb->ip_summed = CHECKSUM_PARTIAL; +	/* specify the length of each IP datagram fragment */ +	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; +	skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + +append:  	return skb_append_datato_frags(sk, skb, getfrag, from,  				       (length - transhdrlen));  } @@ -805,7 +852,7 @@ static int __ip_append_data(struct sock *sk,  	int copy;  	int err;  	int offset = 0; -	unsigned int maxfraglen, fragheaderlen; +	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;  	int csummode = CHECKSUM_NONE;  	struct rtable *rt = (struct rtable *)cork->dst; @@ -818,10 +865,11 @@ static int __ip_append_data(struct sock *sk,  	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; +	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; -	if (cork->length + length > 0xFFFF - fragheaderlen) { +	if (cork->length + length > maxnonfragsize - fragheaderlen) {  		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, -			       mtu-exthdrlen); +			       mtu - (opt ? opt->optlen : 0));  		return -EMSGSIZE;  	} @@ -1030,7 +1078,6 @@ error:  static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,  			 struct ipcm_cookie *ipc, struct rtable **rtp)  { -	struct inet_sock *inet = inet_sk(sk);  	struct ip_options_rcu *opt;  	struct rtable *rt; @@ -1056,10 +1103,13 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,  	 * We steal reference to this route, caller should not release it  	 */  	*rtp = NULL; -	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? -			 rt->dst.dev->mtu : dst_mtu(&rt->dst); +	cork->fragsize = ip_sk_use_pmtu(sk) ? 
+			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;  	cork->dst = &rt->dst;  	cork->length = 0; +	cork->ttl = ipc->ttl; +	cork->tos = ipc->tos; +	cork->priority = ipc->priority;  	cork->tx_flags = ipc->tx_flags;  	return 0; @@ -1114,7 +1164,7 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,  	int mtu;  	int len;  	int err; -	unsigned int maxfraglen, fragheaderlen, fraggap; +	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;  	if (inet->hdrincl)  		return -EPERM; @@ -1138,9 +1188,11 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,  	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; +	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; -	if (cork->length + size > 0xFFFF - fragheaderlen) { -		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); +	if (cork->length + size > maxnonfragsize - fragheaderlen) { +		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, +			       mtu - (opt ? opt->optlen : 0));  		return -EMSGSIZE;  	} @@ -1297,13 +1349,13 @@ struct sk_buff *__ip_make_skb(struct sock *sk,  	 * to fragment the frame generated here. No matter, what transforms  	 * how transforms change size of the packet, it will come out.  	 */ -	if (inet->pmtudisc < IP_PMTUDISC_DO) -		skb->local_df = 1; +	skb->ignore_df = ip_sk_ignore_df(sk);  	/* DF bit is set when we want to see DF on outgoing frames. -	 * If local_df is set too, we still allow to fragment this frame +	 * If ignore_df is set too, we still allow to fragment this frame  	 * locally. */ -	if (inet->pmtudisc >= IP_PMTUDISC_DO || +	if (inet->pmtudisc == IP_PMTUDISC_DO || +	    inet->pmtudisc == IP_PMTUDISC_PROBE ||  	    (skb->len <= dst_mtu(&rt->dst) &&  	     ip_dont_fragment(sk, &rt->dst)))  		df = htons(IP_DF); @@ -1311,7 +1363,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,  	if (cork->flags & IPCORK_OPT)  		opt = cork->opt; -	if (rt->rt_type == RTN_MULTICAST) +	if (cork->ttl != 0) +		ttl = cork->ttl; +	else if (rt->rt_type == RTN_MULTICAST)  		ttl = inet->mc_ttl;  	else  		ttl = ip_select_ttl(inet, &rt->dst); @@ -1319,19 +1373,19 @@ struct sk_buff *__ip_make_skb(struct sock *sk,  	iph = ip_hdr(skb);  	iph->version = 4;  	iph->ihl = 5; -	iph->tos = inet->tos; +	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;  	iph->frag_off = df;  	iph->ttl = ttl;  	iph->protocol = sk->sk_protocol;  	ip_copy_addrs(iph, fl4); -	ip_select_ident(skb, &rt->dst, sk); +	ip_select_ident(skb, sk);  	if (opt) {  		iph->ihl += opt->optlen>>2;  		ip_options_build(skb, opt, cork->addr, rt, 0);  	} -	skb->priority = sk->sk_priority; +	skb->priority = (cork->tos != -1) ? 
cork->priority: sk->sk_priority;  	skb->mark = sk->sk_mark;  	/*  	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec @@ -1481,6 +1535,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,  	ipc.addr = daddr;  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1;  	if (replyopts.opt.opt.optlen) {  		ipc.opt = &replyopts.opt; @@ -1489,7 +1545,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,  			daddr = replyopts.opt.opt.faddr;  	} -	flowi4_init_output(&fl4, arg->bound_dev_if, 0, +	flowi4_init_output(&fl4, arg->bound_dev_if, +			   IP4_REPLY_MARK(net, skb->mark),  			   RT_TOS(arg->tos),  			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,  			   ip_reply_arg_flowi_flags(arg), @@ -1534,7 +1591,7 @@ void __init ip_init(void)  	ip_rt_init();  	inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) -	igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) +	igmp_mc_init();  #endif  } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d9c4f113d70..64741b93863 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -56,7 +56,6 @@  /*   *	SOL_IP control messages.   */ -#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb))  static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)  { @@ -187,14 +186,31 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)  }  EXPORT_SYMBOL(ip_cmsg_recv); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, +		 bool allow_ipv6)  { -	int err; +	int err, val;  	struct cmsghdr *cmsg;  	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {  		if (!CMSG_OK(msg, cmsg))  			return -EINVAL; +#if defined(CONFIG_IPV6) +		if (allow_ipv6 && +		    cmsg->cmsg_level == SOL_IPV6 && +		    cmsg->cmsg_type == IPV6_PKTINFO) { +			struct in6_pktinfo *src_info; + +			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) +				return -EINVAL; +			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); +			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) +				return -EINVAL; +			ipc->oif = src_info->ipi6_ifindex; +			ipc->addr = src_info->ipi6_addr.s6_addr32[3]; +			continue; +		} +#endif  		if (cmsg->cmsg_level != SOL_IP)  			continue;  		switch (cmsg->cmsg_type) { @@ -215,6 +231,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)  			ipc->addr = info->ipi_spec_dst.s_addr;  			break;  		} +		case IP_TTL: +			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) +				return -EINVAL; +			val = *(int *)CMSG_DATA(cmsg); +			if (val < 1 || val > 255) +				return -EINVAL; +			ipc->ttl = val; +			break; +		case IP_TOS: +			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) +				return -EINVAL; +			val = *(int *)CMSG_DATA(cmsg); +			if (val < 0 || val > 255) +				return -EINVAL; +			ipc->tos = val; +			ipc->priority = rt_tos2priority(ipc->tos); +			break; +  		default:  			return -EINVAL;  		} @@ -368,11 +402,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf  /*   *	Handle MSG_ERRQUEUE   */ -int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)  {  	struct sock_exterr_skb *serr;  	struct sk_buff *skb, *skb2; -	struct sockaddr_in *sin; +	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);  	struct {  		struct sock_extended_err ee;  		struct sockaddr_in	 
offender; @@ -398,13 +432,13 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)  	serr = SKB_EXT_ERR(skb); -	sin = (struct sockaddr_in *)msg->msg_name;  	if (sin) {  		sin->sin_family = AF_INET;  		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +  						   serr->addr_offset);  		sin->sin_port = serr->port;  		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); +		*addr_len = sizeof(*sin);  	}  	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); @@ -609,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,  		inet->nodefrag = val ? 1 : 0;  		break;  	case IP_MTU_DISCOVER: -		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) +		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)  			goto e_inval;  		inet->pmtudisc = val;  		break; @@ -1032,13 +1066,15 @@ e_inval:   *   * To support IP_CMSG_PKTINFO option, we store rt_iif and specific   * destination in skb->cb[] before dst drop. - * This way, receiver doesnt make cache line misses to read rtable. + * This way, receiver doesn't make cache line misses to read rtable.   */ -void ipv4_pktinfo_prepare(struct sk_buff *skb) +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)  {  	struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); +	bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || +		       ipv6_sk_rxinfo(sk); -	if (skb_rtable(skb)) { +	if (prepare && skb_rtable(skb)) {  		pktinfo->ipi_ifindex = inet_iif(skb);  		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);  	} else { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 63a6d6d6b87..6f9de61dce5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@  #include <linux/if_ether.h>  #include <linux/if_vlan.h>  #include <linux/rculist.h> +#include <linux/err.h>  #include <net/sock.h>  #include <net/ip.h> @@ -61,57 +62,59 @@  #include <net/ip6_route.h>  #endif -static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, -				   __be32 key, __be32 remote) +static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)  {  	return hash_32((__force u32)key ^ (__force u32)remote,  			 IP_TNL_HASH_BITS);  } -/* Often modified stats are per cpu, other are shared (netdev->stats) */ -struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, -						struct rtnl_link_stats64 *tot) +static void __tunnel_dst_set(struct ip_tunnel_dst *idst, +			     struct dst_entry *dst)  { -	int i; - -	for_each_possible_cpu(i) { -		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); -		u64 rx_packets, rx_bytes, tx_packets, tx_bytes; -		unsigned int start; +	struct dst_entry *old_dst; -		do { -			start = u64_stats_fetch_begin_bh(&tstats->syncp); -			rx_packets = tstats->rx_packets; -			tx_packets = tstats->tx_packets; -			rx_bytes = tstats->rx_bytes; -			tx_bytes = tstats->tx_bytes; -		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start)); - -		tot->rx_packets += rx_packets; -		tot->tx_packets += tx_packets; -		tot->rx_bytes   += rx_bytes; -		tot->tx_bytes   += tx_bytes; -	} +	dst_clone(dst); +	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); +	dst_release(old_dst); +} -	tot->multicast = dev->stats.multicast; +static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +{ +	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); +} -	tot->rx_crc_errors = dev->stats.rx_crc_errors; -	tot->rx_fifo_errors = dev->stats.rx_fifo_errors; -	tot->rx_length_errors = dev->stats.rx_length_errors; -	tot->rx_frame_errors = dev->stats.rx_frame_errors; -	
tot->rx_errors = dev->stats.rx_errors; +static void tunnel_dst_reset(struct ip_tunnel *t) +{ +	tunnel_dst_set(t, NULL); +} -	tot->tx_fifo_errors = dev->stats.tx_fifo_errors; -	tot->tx_carrier_errors = dev->stats.tx_carrier_errors; -	tot->tx_dropped = dev->stats.tx_dropped; -	tot->tx_aborted_errors = dev->stats.tx_aborted_errors; -	tot->tx_errors = dev->stats.tx_errors; +void ip_tunnel_dst_reset_all(struct ip_tunnel *t) +{ +	int i; -	tot->collisions  = dev->stats.collisions; +	for_each_possible_cpu(i) +		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); +} +EXPORT_SYMBOL(ip_tunnel_dst_reset_all); -	return tot; +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) +{ +	struct dst_entry *dst; + +	rcu_read_lock(); +	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); +	if (dst && !atomic_inc_not_zero(&dst->__refcnt)) +		dst = NULL; +	if (dst) { +		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { +			tunnel_dst_reset(t); +			dst_release(dst); +			dst = NULL; +		} +	} +	rcu_read_unlock(); +	return (struct rtable *)dst;  } -EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);  static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,  				__be16 flags, __be32 key) @@ -146,7 +149,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,  	struct ip_tunnel *t, *cand = NULL;  	struct hlist_head *head; -	hash = ip_tunnel_hash(itn, key, remote); +	hash = ip_tunnel_hash(key, remote);  	head = &itn->tunnels[hash];  	hlist_for_each_entry_rcu(t, head, hash_node) { @@ -166,6 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,  	hlist_for_each_entry_rcu(t, head, hash_node) {  		if (remote != t->parms.iph.daddr || +		    t->parms.iph.saddr != 0 ||  		    !(t->dev->flags & IFF_UP))  			continue; @@ -178,14 +182,15 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,  			cand = t;  	} -	hash = ip_tunnel_hash(itn, key, 0); +	hash = ip_tunnel_hash(key, 0);  	head = &itn->tunnels[hash];  	hlist_for_each_entry_rcu(t, head, hash_node) { -		if ((local != t->parms.iph.saddr && -		     (local != t->parms.iph.daddr || -		      !ipv4_is_multicast(local))) || -		    !(t->dev->flags & IFF_UP)) +		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && +		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) +			continue; + +		if (!(t->dev->flags & IFF_UP))  			continue;  		if (!ip_tunnel_key_match(&t->parms, flags, key)) @@ -202,6 +207,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,  	hlist_for_each_entry_rcu(t, head, hash_node) {  		if (t->parms.i_key != key || +		    t->parms.iph.saddr != 0 || +		    t->parms.iph.daddr != 0 ||  		    !(t->dev->flags & IFF_UP))  			continue; @@ -228,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,  {  	unsigned int h;  	__be32 remote; +	__be32 i_key = parms->i_key;  	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))  		remote = parms->iph.daddr;  	else  		remote = 0; -	h = ip_tunnel_hash(itn, parms->i_key, remote); +	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) +		i_key = 0; + +	h = ip_tunnel_hash(i_key, remote);  	return &itn->tunnels[h];  } @@ -257,6 +268,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,  	__be32 remote = parms->iph.daddr;  	__be32 local = parms->iph.saddr;  	__be32 key = parms->i_key; +	__be16 flags = parms->i_flags;  	int link = parms->link;  	struct ip_tunnel *t = NULL;  	struct hlist_head *head = ip_bucket(itn, parms); @@ -264,9 +276,9 @@ static struct ip_tunnel 
*ip_tunnel_find(struct ip_tunnel_net *itn,  	hlist_for_each_entry_rcu(t, head, hash_node) {  		if (local == t->parms.iph.saddr &&  		    remote == t->parms.iph.daddr && -		    key == t->parms.i_key &&  		    link == t->parms.link && -		    type == t->dev->type) +		    type == t->dev->type && +		    ip_tunnel_key_match(&t->parms, flags, key))  			break;  	}  	return t; @@ -318,11 +330,10 @@ failed:  	return ERR_PTR(err);  } -static inline struct rtable *ip_route_output_tunnel(struct net *net, -						    struct flowi4 *fl4, -						    int proto, -						    __be32 daddr, __be32 saddr, -						    __be32 key, __u8 tos, int oif) +static inline void init_tunnel_flow(struct flowi4 *fl4, +				    int proto, +				    __be32 daddr, __be32 saddr, +				    __be32 key, __u8 tos, int oif)  {  	memset(fl4, 0, sizeof(*fl4));  	fl4->flowi4_oif = oif; @@ -331,7 +342,6 @@ static inline struct rtable *ip_route_output_tunnel(struct net *net,  	fl4->flowi4_tos = tos;  	fl4->flowi4_proto = proto;  	fl4->fl4_gre_key = key; -	return ip_route_output_key(net, fl4);  }  static int ip_tunnel_bind_dev(struct net_device *dev) @@ -350,14 +360,14 @@ static int ip_tunnel_bind_dev(struct net_device *dev)  		struct flowi4 fl4;  		struct rtable *rt; -		rt = ip_route_output_tunnel(tunnel->net, &fl4, -					    tunnel->parms.iph.protocol, -					    iph->daddr, iph->saddr, -					    tunnel->parms.o_key, -					    RT_TOS(iph->tos), -					    tunnel->parms.link); +		init_tunnel_flow(&fl4, iph->protocol, iph->daddr, +				 iph->saddr, tunnel->parms.o_key, +				 RT_TOS(iph->tos), tunnel->parms.link); +		rt = ip_route_output_key(tunnel->net, &fl4); +  		if (!IS_ERR(rt)) {  			tdev = rt->dst.dev; +			tunnel_dst_set(tunnel, &rt->dst);  			ip_rt_put(rt);  		}  		if (dev->type != ARPHRD_ETHER) @@ -386,14 +396,13 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,  					  struct ip_tunnel_net *itn,  					  struct ip_tunnel_parm *parms)  { -	struct ip_tunnel *nt, *fbt; +	struct ip_tunnel *nt;  	struct net_device *dev;  	BUG_ON(!itn->fb_tunnel_dev); -	fbt = netdev_priv(itn->fb_tunnel_dev);  	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);  	if (IS_ERR(dev)) -		return NULL; +		return ERR_CAST(dev);  	dev->mtu = ip_tunnel_bind_dev(dev); @@ -405,15 +414,12 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,  int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,  		  const struct tnl_ptk_info *tpi, bool log_ecn_error)  { -	struct pcpu_tstats *tstats; +	struct pcpu_sw_netstats *tstats;  	const struct iphdr *iph = ip_hdr(skb);  	int err;  #ifdef CONFIG_NET_IPGRE_BROADCAST  	if (ipv4_is_multicast(iph->daddr)) { -		/* Looped back packet, drop it! 
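Editor's note (not part of the diff): ip_tunnel.c above grows a per-cpu destination cache - __tunnel_dst_set() publishes a new dst with xchg() and releases the previous one, ip_tunnel_dst_reset_all() clears every per-cpu slot, and tunnel_rtable_get() hands the cached entry back only if it can still take a reference (atomic_inc_not_zero) and the dst has not gone obsolete. A single-threaded userspace sketch of that publish/acquire pattern follows; entry, cache_set and cache_get are invented names, and the RCU protection that makes the kernel's load-then-increment safe is left out.

/* Illustrative single-threaded sketch (not kernel code): publishing a
 * cached, refcounted entry with an atomic exchange and taking it back
 * only when a reference can still be obtained. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	atomic_int refcnt;
	int value;
};

static _Atomic(struct entry *) cache;

static void entry_put(struct entry *e)
{
	if (e && atomic_fetch_sub(&e->refcnt, 1) == 1)
		free(e);
}

static void cache_set(struct entry *e)
{
	if (e)
		atomic_fetch_add(&e->refcnt, 1);	/* cache's reference */
	entry_put(atomic_exchange(&cache, e));		/* drop the old one */
}

static struct entry *cache_get(void)
{
	struct entry *e = atomic_load(&cache);
	int old;

	if (!e)
		return NULL;
	/* inc-not-zero: refuse to resurrect an entry already going away */
	do {
		old = atomic_load(&e->refcnt);
		if (old == 0)
			return NULL;
	} while (!atomic_compare_exchange_weak(&e->refcnt, &old, old + 1));
	return e;
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e)), *hit;

	atomic_init(&e->refcnt, 1);
	e->value = 42;
	cache_set(e);			/* cache now holds its own ref */
	entry_put(e);			/* drop the creator's ref */

	hit = cache_get();
	if (hit)
		printf("cache hit: %d\n", hit->value);
	entry_put(hit);
	cache_set(NULL);		/* reset the cache, frees the entry */
	return 0;
}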
*/ -		if (rt_is_output_route(skb_rtable(skb))) -			goto drop;  		tunnel->dev->stats.multicast++;  		skb->pkt_type = PACKET_BROADCAST;  	} @@ -436,6 +442,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,  		tunnel->i_seqno = ntohl(tpi->seq) + 1;  	} +	skb_reset_network_header(skb); +  	err = IP_ECN_decapsulate(iph, skb);  	if (unlikely(err)) {  		if (log_ecn_error) @@ -454,6 +462,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,  	tstats->rx_bytes += skb->len;  	u64_stats_update_end(&tstats->syncp); +	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); +  	if (tunnel->dev->type == ARPHRD_ETHER) {  		skb->protocol = eth_type_trans(skb, tunnel->dev);  		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); @@ -461,8 +471,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,  		skb->dev = tunnel->dev;  	} -	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); -  	gro_cells_receive(&tunnel->gro_cells, skb);  	return 0; @@ -532,8 +540,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,  	unsigned int max_headroom;	/* The extra header space needed */  	__be32 dst;  	int err; +	bool connected;  	inner_iph = (const struct iphdr *)skb_inner_network_header(skb); +	connected = (tunnel->parms.iph.daddr != 0);  	dst = tnl_params->daddr;  	if (dst == 0) { @@ -581,27 +591,38 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,  #endif  		else  			goto tx_error; + +		connected = false;  	}  	tos = tnl_params->tos;  	if (tos & 0x1) {  		tos &= ~0x1; -		if (skb->protocol == htons(ETH_P_IP)) +		if (skb->protocol == htons(ETH_P_IP)) {  			tos = inner_iph->tos; -		else if (skb->protocol == htons(ETH_P_IPV6)) +			connected = false; +		} else if (skb->protocol == htons(ETH_P_IPV6)) {  			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); +			connected = false; +		}  	} -	rt = ip_route_output_tunnel(tunnel->net, &fl4, -				    protocol, -				    dst, tnl_params->saddr, -				    tunnel->parms.o_key, -				    RT_TOS(tos), -				    tunnel->parms.link); -	if (IS_ERR(rt)) { -		dev->stats.tx_carrier_errors++; -		goto tx_error; +	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, +			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + +	rt = connected ? 
tunnel_rtable_get(tunnel, 0) : NULL; + +	if (!rt) { +		rt = ip_route_output_key(tunnel->net, &fl4); + +		if (IS_ERR(rt)) { +			dev->stats.tx_carrier_errors++; +			goto tx_error; +		} +		if (connected) +			tunnel_dst_set(tunnel, &rt->dst);  	} +  	if (rt->dst.dev == dev) {  		ip_rt_put(rt);  		dev->stats.collisions++; @@ -618,6 +639,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,  				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {  			tunnel->err_count--; +			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));  			dst_link_failure(skb);  		} else  			tunnel->err_count = 0; @@ -646,12 +668,13 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,  		dev->needed_headroom = max_headroom;  	if (skb_cow_head(skb, dev->needed_headroom)) { +		ip_rt_put(rt);  		dev->stats.tx_dropped++; -		dev_kfree_skb(skb); +		kfree_skb(skb);  		return;  	} -	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol, +	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,  			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));  	iptunnel_xmit_stats(err, &dev->stats, dev->tstats); @@ -663,7 +686,7 @@ tx_error_icmp:  #endif  tx_error:  	dev->stats.tx_errors++; -	dev_kfree_skb(skb); +	kfree_skb(skb);  }  EXPORT_SYMBOL_GPL(ip_tunnel_xmit); @@ -696,25 +719,25 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,  		if (set_mtu)  			dev->mtu = mtu;  	} +	ip_tunnel_dst_reset_all(t);  	netdev_state_change(dev);  }  int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)  {  	int err = 0; -	struct ip_tunnel *t; -	struct net *net = dev_net(dev); -	struct ip_tunnel *tunnel = netdev_priv(dev); -	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); +	struct ip_tunnel *t = netdev_priv(dev); +	struct net *net = t->net; +	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);  	BUG_ON(!itn->fb_tunnel_dev);  	switch (cmd) {  	case SIOCGETTUNNEL: -		t = NULL; -		if (dev == itn->fb_tunnel_dev) +		if (dev == itn->fb_tunnel_dev) {  			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); -		if (t == NULL) -			t = netdev_priv(dev); +			if (t == NULL) +				t = netdev_priv(dev); +		}  		memcpy(p, &t->parms, sizeof(*p));  		break; @@ -725,16 +748,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)  			goto done;  		if (p->iph.ttl)  			p->iph.frag_off |= htons(IP_DF); -		if (!(p->i_flags&TUNNEL_KEY)) -			p->i_key = 0; -		if (!(p->o_flags&TUNNEL_KEY)) -			p->o_key = 0; +		if (!(p->i_flags & VTI_ISVTI)) { +			if (!(p->i_flags & TUNNEL_KEY)) +				p->i_key = 0; +			if (!(p->o_flags & TUNNEL_KEY)) +				p->o_key = 0; +		}  		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); -		if (!t && (cmd == SIOCADDTUNNEL)) +		if (!t && (cmd == SIOCADDTUNNEL)) {  			t = ip_tunnel_create(net, itn, p); - +			err = PTR_ERR_OR_ZERO(t); +			break; +		}  		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {  			if (t != NULL) {  				if (t->dev != dev) { @@ -761,8 +788,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)  		if (t) {  			err = 0;  			ip_tunnel_update(itn, t, dev, p, true); -		} else -			err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); +		} else { +			err = -ENOENT; +		}  		break;  	case SIOCDELTUNNEL: @@ -811,6 +839,7 @@ static void ip_tunnel_dev_free(struct net_device *dev)  	struct ip_tunnel *tunnel = netdev_priv(dev);  	gro_cells_destroy(&tunnel->gro_cells); +	free_percpu(tunnel->dst_cache);  	free_percpu(dev->tstats);  	free_netdev(dev);  } @@ -855,11 +884,12 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,  	 */  	if (!IS_ERR(itn->fb_tunnel_dev)) {  		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; +		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);  		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));  	}  	rtnl_unlock(); -	return PTR_RET(itn->fb_tunnel_dev); +	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);  }  EXPORT_SYMBOL_GPL(ip_tunnel_init_net); @@ -979,12 +1009,19 @@ int ip_tunnel_init(struct net_device *dev)  	int err;  	dev->destructor	= ip_tunnel_dev_free; -	dev->tstats = alloc_percpu(struct pcpu_tstats); +	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);  	if (!dev->tstats)  		return -ENOMEM; +	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); +	if (!tunnel->dst_cache) { +		free_percpu(dev->tstats); +		return -ENOMEM; +	} +  	err = gro_cells_init(&tunnel->gro_cells, dev);  	if (err) { +		free_percpu(tunnel->dst_cache);  		free_percpu(dev->tstats);  		return err;  	} @@ -1009,6 +1046,8 @@ void ip_tunnel_uninit(struct net_device *dev)  	/* fb_tunnel_dev will be unregisted in net-exit call. */  	if (itn->fb_tunnel_dev != dev)  		ip_tunnel_del(netdev_priv(dev)); + +	ip_tunnel_dst_reset_all(tunnel);  }  EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c31e3ad98ef..f4c987bb7e9 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,7 +46,7 @@  #include <net/netns/generic.h>  #include <net/rtnetlink.h> -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,  		  __be32 src, __be32 dst, __u8 proto,  		  __u8 tos, __u8 ttl, __be16 df, bool xnet)  { @@ -56,7 +56,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,  	skb_scrub_packet(skb, xnet); -	skb->rxhash = 0; +	skb_clear_hash(skb);  	skb_dst_set(skb, &rt->dst);  	memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -74,9 +74,9 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb,  	iph->daddr	=	dst;  	iph->saddr	=	src;  	iph->ttl	=	ttl; -	__ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); +	__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); -	err = ip_local_out(skb); +	err = ip_local_out_sk(sk, skb);  	if (unlikely(net_xmit_eval(err)))  		pkt_len = 0;  	return pkt_len; @@ -107,8 +107,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)  	nf_reset(skb);  	secpath_reset(skb); -	if (!skb->l4_rxhash) -		skb->rxhash = 0; +	skb_clear_hash_if_not_l4(skb);  	skb_dst_drop(skb);  	skb->vlan_tci = 0;  	skb_set_queue_mapping(skb, 0); @@ -116,3 +115,90 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)  	return 0;  }  EXPORT_SYMBOL_GPL(iptunnel_pull_header); + +struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, +					 bool csum_help, +					 int gso_type_mask) +{ +	int err; + +	if (likely(!skb->encapsulation)) { +		skb_reset_inner_headers(skb); +		skb->encapsulation = 1; +	} + +	if (skb_is_gso(skb)) { +		err = skb_unclone(skb, GFP_ATOMIC); +		if (unlikely(err)) +			goto error; +		skb_shinfo(skb)->gso_type |= gso_type_mask; +		return 
skb; +	} + +	/* If packet is not gso and we are resolving any partial checksum, +	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL +	 * on the outer header without confusing devices that implement +	 * NETIF_F_IP_CSUM with encapsulation. +	 */ +	if (csum_help) +		skb->encapsulation = 0; + +	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { +		err = skb_checksum_help(skb); +		if (unlikely(err)) +			goto error; +	} else if (skb->ip_summed != CHECKSUM_PARTIAL) +		skb->ip_summed = CHECKSUM_NONE; + +	return skb; +error: +	kfree_skb(skb); +	return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); + +/* Often modified stats are per cpu, other are shared (netdev->stats) */ +struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, +						struct rtnl_link_stats64 *tot) +{ +	int i; + +	for_each_possible_cpu(i) { +		const struct pcpu_sw_netstats *tstats = +						   per_cpu_ptr(dev->tstats, i); +		u64 rx_packets, rx_bytes, tx_packets, tx_bytes; +		unsigned int start; + +		do { +			start = u64_stats_fetch_begin_irq(&tstats->syncp); +			rx_packets = tstats->rx_packets; +			tx_packets = tstats->tx_packets; +			rx_bytes = tstats->rx_bytes; +			tx_bytes = tstats->tx_bytes; +		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + +		tot->rx_packets += rx_packets; +		tot->tx_packets += tx_packets; +		tot->rx_bytes   += rx_bytes; +		tot->tx_bytes   += tx_bytes; +	} + +	tot->multicast = dev->stats.multicast; + +	tot->rx_crc_errors = dev->stats.rx_crc_errors; +	tot->rx_fifo_errors = dev->stats.rx_fifo_errors; +	tot->rx_length_errors = dev->stats.rx_length_errors; +	tot->rx_frame_errors = dev->stats.rx_frame_errors; +	tot->rx_errors = dev->stats.rx_errors; + +	tot->tx_fifo_errors = dev->stats.tx_fifo_errors; +	tot->tx_carrier_errors = dev->stats.tx_carrier_errors; +	tot->tx_dropped = dev->stats.tx_dropped; +	tot->tx_aborted_errors = dev->stats.tx_aborted_errors; +	tot->tx_errors = dev->stats.tx_errors; + +	tot->collisions  = dev->stats.collisions; + +	return tot; +} +EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e805e7b3030..b8960f3527f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -34,6 +34,7 @@  #include <linux/init.h>  #include <linux/netfilter_ipv4.h>  #include <linux/if_ether.h> +#include <linux/icmpv6.h>  #include <net/sock.h>  #include <net/ip.h> @@ -49,145 +50,131 @@ static struct rtnl_link_ops vti_link_ops __read_mostly;  static int vti_net_id __read_mostly;  static int vti_tunnel_init(struct net_device *dev); -static int vti_err(struct sk_buff *skb, u32 info) +static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, +		     int encap_type)  { - -	/* All the routers (except for Linux) return only -	 * 8 bytes of packet payload. It means, that precise relaying of -	 * ICMP in the real Internet is absolutely infeasible. -	 */ +	struct ip_tunnel *tunnel; +	const struct iphdr *iph = ip_hdr(skb);  	struct net *net = dev_net(skb->dev);  	struct ip_tunnel_net *itn = net_generic(net, vti_net_id); -	struct iphdr *iph = (struct iphdr *)skb->data; -	const int type = icmp_hdr(skb)->type; -	const int code = icmp_hdr(skb)->code; -	struct ip_tunnel *t; -	int err; - -	switch (type) { -	default: -	case ICMP_PARAMETERPROB: -		return 0; - -	case ICMP_DEST_UNREACH: -		switch (code) { -		case ICMP_SR_FAILED: -		case ICMP_PORT_UNREACH: -			/* Impossible event. */ -			return 0; -		default: -			/* All others are translated to HOST_UNREACH. 
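Editor's note (not part of the diff): ip_tunnel_get_stats64(), moved into ip_tunnel_core.c above, folds the per-cpu counters inside a u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() loop so 64-bit counters are never read half-updated. A sketch of the underlying sequence-counter retry protocol follows; it demonstrates the control flow only - the barriers and per-cpu/irq handling of the real helpers are omitted, so it is not a drop-in concurrent implementation.

/* Illustrative sketch (not kernel code): writers make the sequence odd
 * while updating; readers retry if the sequence was odd or changed. */
#include <stdint.h>
#include <stdio.h>

struct stats {
	unsigned int seq;
	uint64_t rx_packets;
	uint64_t rx_bytes;
};

static void stats_update(struct stats *s, uint64_t pkts, uint64_t bytes)
{
	s->seq++;			/* odd: update in progress */
	s->rx_packets += pkts;
	s->rx_bytes += bytes;
	s->seq++;			/* even: snapshot is stable */
}

static void stats_read(const struct stats *s, uint64_t *pkts, uint64_t *bytes)
{
	unsigned int start;

	do {
		do {
			start = s->seq;
		} while (start & 1);	/* writer active, try again */
		*pkts = s->rx_packets;
		*bytes = s->rx_bytes;
	} while (s->seq != start);	/* changed underneath us: retry */
}

int main(void)
{
	struct stats s = { 0, 0, 0 };
	uint64_t pkts, bytes;

	stats_update(&s, 3, 4500);
	stats_read(&s, &pkts, &bytes);
	printf("%llu packets, %llu bytes\n",
	       (unsigned long long)pkts, (unsigned long long)bytes);
	return 0;
}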
*/ -			break; -		} -		break; -	case ICMP_TIME_EXCEEDED: -		if (code != ICMP_EXC_TTL) -			return 0; -		break; -	} -	err = -ENOENT; +	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, +				  iph->saddr, iph->daddr, 0); +	if (tunnel != NULL) { +		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) +			goto drop; -	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, -			     iph->daddr, iph->saddr, 0); -	if (t == NULL) -		goto out; +		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; +		skb->mark = be32_to_cpu(tunnel->parms.i_key); -	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { -		ipv4_update_pmtu(skb, dev_net(skb->dev), info, -				 t->parms.link, 0, IPPROTO_IPIP, 0); -		err = 0; -		goto out; +		return xfrm_input(skb, nexthdr, spi, encap_type);  	} -	err = 0; -	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) -		goto out; - -	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) -		t->err_count++; -	else -		t->err_count = 1; -	t->err_time = jiffies; -out: -	return err; +	return -EINVAL; +drop: +	kfree_skb(skb); +	return 0;  } -/* We dont digest the packet therefore let the packet pass */  static int vti_rcv(struct sk_buff *skb)  { -	struct ip_tunnel *tunnel; -	const struct iphdr *iph = ip_hdr(skb); -	struct net *net = dev_net(skb->dev); -	struct ip_tunnel_net *itn = net_generic(net, vti_net_id); +	XFRM_SPI_SKB_CB(skb)->family = AF_INET; +	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); -	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, -				  iph->saddr, iph->daddr, 0); -	if (tunnel != NULL) { -		struct pcpu_tstats *tstats; - -		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) -			return -1; +	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); +} -		tstats = this_cpu_ptr(tunnel->dev->tstats); -		u64_stats_update_begin(&tstats->syncp); -		tstats->rx_packets++; -		tstats->rx_bytes += skb->len; -		u64_stats_update_end(&tstats->syncp); +static int vti_rcv_cb(struct sk_buff *skb, int err) +{ +	unsigned short family; +	struct net_device *dev; +	struct pcpu_sw_netstats *tstats; +	struct xfrm_state *x; +	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; -		skb->mark = 0; -		secpath_reset(skb); -		skb->dev = tunnel->dev; +	if (!tunnel)  		return 1; + +	dev = tunnel->dev; + +	if (err) { +		dev->stats.rx_errors++; +		dev->stats.rx_dropped++; + +		return 0;  	} -	return -1; +	x = xfrm_input_state(skb); +	family = x->inner_mode->afinfo->family; + +	if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) +		return -EPERM; + +	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); +	skb->dev = dev; + +	tstats = this_cpu_ptr(dev->tstats); + +	u64_stats_update_begin(&tstats->syncp); +	tstats->rx_packets++; +	tstats->rx_bytes += skb->len; +	u64_stats_update_end(&tstats->syncp); + +	return 0;  } -/* This function assumes it is being called from dev_queue_xmit() - * and that skb is filled properly by that function. - */ +static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) +{ +	xfrm_address_t *daddr = (xfrm_address_t *)&dst; +	xfrm_address_t *saddr = (xfrm_address_t *)&src; -static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +	/* if there is no transform then this tunnel is not functional. +	 * Or if the xfrm is not mode tunnel. 
+	 */ +	if (!x || x->props.mode != XFRM_MODE_TUNNEL || +	    x->props.family != AF_INET) +		return false; + +	if (!dst) +		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); + +	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) +		return false; + +	return true; +} + +static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, +			    struct flowi *fl)  {  	struct ip_tunnel *tunnel = netdev_priv(dev); -	struct iphdr  *tiph = &tunnel->parms.iph; -	u8     tos; -	struct rtable *rt;		/* Route to the other host */ +	struct ip_tunnel_parm *parms = &tunnel->parms; +	struct dst_entry *dst = skb_dst(skb);  	struct net_device *tdev;	/* Device to other host */ -	struct iphdr  *old_iph = ip_hdr(skb); -	__be32 dst = tiph->daddr; -	struct flowi4 fl4;  	int err; -	if (skb->protocol != htons(ETH_P_IP)) -		goto tx_error; - -	tos = old_iph->tos; +	if (!dst) { +		dev->stats.tx_carrier_errors++; +		goto tx_error_icmp; +	} -	memset(&fl4, 0, sizeof(fl4)); -	flowi4_init_output(&fl4, tunnel->parms.link, -			   be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos), -			   RT_SCOPE_UNIVERSE, -			   IPPROTO_IPIP, 0, -			   dst, tiph->saddr, 0, 0); -	rt = ip_route_output_key(dev_net(dev), &fl4); -	if (IS_ERR(rt)) { +	dst_hold(dst); +	dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); +	if (IS_ERR(dst)) {  		dev->stats.tx_carrier_errors++;  		goto tx_error_icmp;  	} -	/* if there is no transform then this tunnel is not functional. -	 * Or if the xfrm is not mode tunnel. -	 */ -	if (!rt->dst.xfrm || -	    rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) { + +	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {  		dev->stats.tx_carrier_errors++; +		dst_release(dst);  		goto tx_error_icmp;  	} -	tdev = rt->dst.dev; + +	tdev = dst->dev;  	if (tdev == dev) { -		ip_rt_put(rt); +		dst_release(dst);  		dev->stats.collisions++;  		goto tx_error;  	} @@ -201,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)  			tunnel->err_count = 0;  	} -	memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); -	nf_reset(skb); +	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); +	skb_dst_set(skb, dst);  	skb->dev = skb_dst(skb)->dev;  	err = dst_output(skb); @@ -217,10 +202,102 @@ tx_error_icmp:  	dst_link_failure(skb);  tx_error:  	dev->stats.tx_errors++; -	dev_kfree_skb(skb); +	kfree_skb(skb);  	return NETDEV_TX_OK;  } +/* This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
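Editor's note (not part of the diff): vti_xmit() above refuses to use the xfrm bundle unless vti_state_check() is satisfied - there must be a transform, it must be an AF_INET tunnel-mode state, and its endpoints must match the tunnel's, with a wildcard (zero) remote address matching on the local address alone. A compact sketch of that predicate follows; struct xfrm_like and the helper name are invented stand-ins for xfrm_state and its address checks.

/* Illustrative sketch (not kernel code): the sanity check applied
 * before the VTI device trusts the looked-up transform. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct xfrm_like {
	bool present;		/* a transform is attached at all */
	bool mode_tunnel;	/* tunnel mode, not transport */
	bool family_inet;	/* AF_INET state */
	uint32_t saddr, daddr;	/* state's tunnel endpoints */
};

static bool vti_state_ok(const struct xfrm_like *x, uint32_t dst, uint32_t src)
{
	/* No transform, wrong mode or wrong family: tunnel not functional. */
	if (!x || !x->present || !x->mode_tunnel || !x->family_inet)
		return false;

	/* Wildcard remote endpoint: only the local address must match. */
	if (!dst)
		return src == x->saddr;

	return dst == x->daddr && src == x->saddr;
}

int main(void)
{
	struct xfrm_like x = { true, true, true, 0x0a000001, 0x0a000002 };

	printf("configured pair: %s\n",
	       vti_state_ok(&x, 0x0a000002, 0x0a000001) ? "ok" : "rejected");
	printf("wildcard daddr:  %s\n",
	       vti_state_ok(&x, 0, 0x0a000001) ? "ok" : "rejected");
	return 0;
}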
+ */ +static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct flowi fl; + +	memset(&fl, 0, sizeof(fl)); + +	skb->mark = be32_to_cpu(tunnel->parms.o_key); + +	switch (skb->protocol) { +	case htons(ETH_P_IP): +		xfrm_decode_session(skb, &fl, AF_INET); +		memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +		break; +	case htons(ETH_P_IPV6): +		xfrm_decode_session(skb, &fl, AF_INET6); +		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); +		break; +	default: +		dev->stats.tx_errors++; +		dev_kfree_skb(skb); +		return NETDEV_TX_OK; +	} + +	return vti_xmit(skb, dev, &fl); +} + +static int vti4_err(struct sk_buff *skb, u32 info) +{ +	__be32 spi; +	__u32 mark; +	struct xfrm_state *x; +	struct ip_tunnel *tunnel; +	struct ip_esp_hdr *esph; +	struct ip_auth_hdr *ah ; +	struct ip_comp_hdr *ipch; +	struct net *net = dev_net(skb->dev); +	const struct iphdr *iph = (const struct iphdr *)skb->data; +	int protocol = iph->protocol; +	struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + +	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, +				  iph->daddr, iph->saddr, 0); +	if (!tunnel) +		return -1; + +	mark = be32_to_cpu(tunnel->parms.o_key); + +	switch (protocol) { +	case IPPROTO_ESP: +		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); +		spi = esph->spi; +		break; +	case IPPROTO_AH: +		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); +		spi = ah->spi; +		break; +	case IPPROTO_COMP: +		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); +		spi = htonl(ntohs(ipch->cpi)); +		break; +	default: +		return 0; +	} + +	switch (icmp_hdr(skb)->type) { +	case ICMP_DEST_UNREACH: +		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) +			return 0; +	case ICMP_REDIRECT: +		break; +	default: +		return 0; +	} + +	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, +			      spi, protocol, AF_INET); +	if (!x) +		return 0; + +	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) +		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); +	else +		ipv4_redirect(skb, net, 0, 0, protocol, 0); +	xfrm_state_put(x); + +	return 0; +} +  static int  vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)  { @@ -236,12 +313,19 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)  			return -EINVAL;  	} +	if (!(p.i_flags & GRE_KEY)) +		p.i_key = 0; +	if (!(p.o_flags & GRE_KEY)) +		p.o_key = 0; + +	p.i_flags = VTI_ISVTI; +  	err = ip_tunnel_ioctl(dev, &p, cmd);  	if (err)  		return err;  	if (cmd != SIOCDELTUNNEL) { -		p.i_flags |= GRE_KEY | VTI_ISVTI; +		p.i_flags |= GRE_KEY;  		p.o_flags |= GRE_KEY;  	} @@ -262,6 +346,7 @@ static const struct net_device_ops vti_netdev_ops = {  static void vti_tunnel_setup(struct net_device *dev)  {  	dev->netdev_ops		= &vti_netdev_ops; +	dev->type		= ARPHRD_TUNNEL;  	ip_tunnel_setup(dev, vti_net_id);  } @@ -273,13 +358,11 @@ static int vti_tunnel_init(struct net_device *dev)  	memcpy(dev->dev_addr, &iph->saddr, 4);  	memcpy(dev->broadcast, &iph->daddr, 4); -	dev->type		= ARPHRD_TUNNEL;  	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);  	dev->mtu		= ETH_DATA_LEN;  	dev->flags		= IFF_NOARP;  	dev->iflink		= 0;  	dev->addr_len		= 4; -	dev->features		|= NETIF_F_NETNS_LOCAL;  	dev->features		|= NETIF_F_LLTX;  	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE; @@ -296,10 +379,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)  	iph->ihl		= 5;  } -static struct xfrm_tunnel vti_handler __read_mostly = { +static struct xfrm4_protocol 
vti_esp4_protocol __read_mostly = {  	.handler	=	vti_rcv, -	.err_handler	=	vti_err, -	.priority	=	1, +	.input_handler	=	vti_input, +	.cb_handler	=	vti_rcv_cb, +	.err_handler	=	vti4_err, +	.priority	=	100, +}; + +static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { +	.handler	=	vti_rcv, +	.input_handler	=	vti_input, +	.cb_handler	=	vti_rcv_cb, +	.err_handler	=	vti4_err, +	.priority	=	100, +}; + +static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { +	.handler	=	vti_rcv, +	.input_handler	=	vti_input, +	.cb_handler	=	vti_rcv_cb, +	.err_handler	=	vti4_err, +	.priority	=	100,  };  static int __net_init vti_init_net(struct net *net) @@ -343,6 +444,8 @@ static void vti_netlink_parms(struct nlattr *data[],  	if (!data)  		return; +	parms->i_flags = VTI_ISVTI; +  	if (data[IFLA_VTI_LINK])  		parms->link = nla_get_u32(data[IFLA_VTI_LINK]); @@ -438,10 +541,31 @@ static int __init vti_init(void)  	err = register_pernet_device(&vti_net_ops);  	if (err < 0)  		return err; -	err = xfrm4_mode_tunnel_input_register(&vti_handler); +	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);  	if (err < 0) {  		unregister_pernet_device(&vti_net_ops);  		pr_info("vti init: can't register tunnel\n"); + +		return err; +	} + +	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); +	if (err < 0) { +		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); +		unregister_pernet_device(&vti_net_ops); +		pr_info("vti init: can't register tunnel\n"); + +		return err; +	} + +	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); +	if (err < 0) { +		xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); +		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); +		unregister_pernet_device(&vti_net_ops); +		pr_info("vti init: can't register tunnel\n"); + +		return err;  	}  	err = rtnl_link_register(&vti_link_ops); @@ -451,7 +575,9 @@ static int __init vti_init(void)  	return err;  rtnl_link_failed: -	xfrm4_mode_tunnel_input_deregister(&vti_handler); +	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); +	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); +	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);  	unregister_pernet_device(&vti_net_ops);  	return err;  } @@ -459,8 +585,13 @@ rtnl_link_failed:  static void __exit vti_fini(void)  {  	rtnl_link_unregister(&vti_link_ops); -	if (xfrm4_mode_tunnel_input_deregister(&vti_handler)) +	if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) +		pr_info("vti close: can't deregister tunnel\n"); +	if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))  		pr_info("vti close: can't deregister tunnel\n"); +	if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) +		pr_info("vti close: can't deregister tunnel\n"); +  	unregister_pernet_device(&vti_net_ops);  } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 826be4cb482..c0855d50a3f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -23,7 +23,7 @@  #include <net/protocol.h>  #include <net/sock.h> -static void ipcomp4_err(struct sk_buff *skb, u32 info) +static int ipcomp4_err(struct sk_buff *skb, u32 info)  {  	struct net *net = dev_net(skb->dev);  	__be32 spi; @@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)  	switch (icmp_hdr(skb)->type) {  	case ICMP_DEST_UNREACH:  		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) -			return; +			return 0;  	case ICMP_REDIRECT:  		break;  	default: -		return; +		return 0;  	}  	spi = htonl(ntohs(ipch->cpi));  	x = xfrm_state_lookup(net, 
skb->mark, (const xfrm_address_t *)&iph->daddr,  			      spi, IPPROTO_COMP, AF_INET);  	if (!x) -		return; +		return 0;  	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)  		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);  	else  		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);  	xfrm_state_put(x); + +	return 0;  }  /* We always hold one tunnel user reference to indicate a tunnel */ @@ -147,6 +149,11 @@ out:  	return err;  } +static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) +{ +	return 0; +} +  static const struct xfrm_type ipcomp_type = {  	.description	= "IPCOMP4",  	.owner		= THIS_MODULE, @@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = {  	.output		= ipcomp_output  }; -static const struct net_protocol ipcomp4_protocol = { +static struct xfrm4_protocol ipcomp4_protocol = {  	.handler	=	xfrm4_rcv, +	.input_handler	=	xfrm_input, +	.cb_handler	=	ipcomp4_rcv_cb,  	.err_handler	=	ipcomp4_err, -	.no_policy	=	1, -	.netns_ok	=	1, +	.priority	=	0,  };  static int __init ipcomp4_init(void) @@ -170,7 +178,7 @@ static int __init ipcomp4_init(void)  		pr_info("%s: can't add xfrm type\n", __func__);  		return -EAGAIN;  	} -	if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { +	if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {  		pr_info("%s: can't add protocol\n", __func__);  		xfrm_unregister_type(&ipcomp_type, AF_INET);  		return -EAGAIN; @@ -180,7 +188,7 @@ static int __init ipcomp4_init(void)  static void __exit ipcomp4_fini(void)  { -	if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) +	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)  		pr_info("%s: can't remove protocol\n", __func__);  	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)  		pr_info("%s: can't remove xfrm type\n", __func__); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index efa1138fa52..b3e86ea7b71 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -273,7 +273,7 @@ static int __init ic_open_devs(void)  		msleep(1); -		if time_before(jiffies, next_msg) +		if (time_before(jiffies, next_msg))  			continue;  		elapsed = jiffies_to_msecs(jiffies - start); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 7f80fb4b82d..62eaa005e14 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -149,13 +149,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)  	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {  		ipv4_update_pmtu(skb, dev_net(skb->dev), info, -				 t->dev->ifindex, 0, IPPROTO_IPIP, 0); +				 t->parms.link, 0, IPPROTO_IPIP, 0);  		err = 0;  		goto out;  	}  	if (type == ICMP_REDIRECT) { -		ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0, +		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,  			      IPPROTO_IPIP, 0);  		err = 0;  		goto out; @@ -220,17 +220,17 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)  	if (unlikely(skb->protocol != htons(ETH_P_IP)))  		goto tx_error; -	if (likely(!skb->encapsulation)) { -		skb_reset_inner_headers(skb); -		skb->encapsulation = 1; -	} +	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); +	if (IS_ERR(skb)) +		goto out;  	ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);  	return NETDEV_TX_OK;  tx_error: +	kfree_skb(skb); +out:  	dev->stats.tx_errors++; -	dev_kfree_skb(skb);  	return NETDEV_TX_OK;  } @@ -275,6 +275,7 @@ static const struct net_device_ops ipip_netdev_ops = {  #define IPIP_FEATURES (NETIF_F_SG |		\  		       NETIF_F_FRAGLIST |	\  		       NETIF_F_HIGHDMA |	\ +		       NETIF_F_GSO_SOFTWARE |	\  		  
     NETIF_F_HW_CSUM)  static void ipip_tunnel_setup(struct net_device *dev) @@ -485,4 +486,5 @@ static void __exit ipip_fini(void)  module_init(ipip_init);  module_exit(ipip_fini);  MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("ipip");  MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 62212c772a4..65bcaa78904 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)  static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,  			   struct mr_table **mrt)  { -	struct ipmr_result res; -	struct fib_lookup_arg arg = { .result = &res, };  	int err; +	struct ipmr_result res; +	struct fib_lookup_arg arg = { +		.result = &res, +		.flags = FIB_LOOKUP_NOREF, +	};  	err = fib_rules_lookup(net->ipv4.mr_rules_ops,  			       flowi4_to_flowi(flp4), 0, &arg); @@ -425,6 +428,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)  				goto failure;  			ipv4_devconf_setall(in_dev); +			neigh_parms_data_state_setall(in_dev->arp_parms);  			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;  			if (dev_open(dev)) @@ -451,7 +455,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)  	struct mr_table *mrt;  	struct flowi4 fl4 = {  		.flowi4_oif	= dev->ifindex, -		.flowi4_iif	= skb->skb_iif, +		.flowi4_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,  		.flowi4_mark	= skb->mark,  	};  	int err; @@ -480,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev)  	dev->type		= ARPHRD_PIMREG;  	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;  	dev->flags		= IFF_NOARP; -	dev->netdev_ops		= &reg_vif_netdev_ops, +	dev->netdev_ops		= &reg_vif_netdev_ops;  	dev->destructor		= free_netdev;  	dev->features		|= NETIF_F_NETNS_LOCAL;  } @@ -517,6 +521,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)  	}  	ipv4_devconf_setall(in_dev); +	neigh_parms_data_state_setall(in_dev->arp_parms);  	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;  	rcu_read_unlock(); @@ -1658,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)  	iph->protocol	=	IPPROTO_IPIP;  	iph->ihl	=	5;  	iph->tot_len	=	htons(skb->len); -	ip_select_ident(skb, skb_dst(skb), NULL); +	ip_select_ident(skb, NULL);  	ip_send_check(iph);  	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -2250,13 +2255,14 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,  }  static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, -			    u32 portid, u32 seq, struct mfc_cache *c, int cmd) +			    u32 portid, u32 seq, struct mfc_cache *c, int cmd, +			    int flags)  {  	struct nlmsghdr *nlh;  	struct rtmsg *rtm;  	int err; -	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI); +	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);  	if (nlh == NULL)  		return -EMSGSIZE; @@ -2324,7 +2330,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,  	if (skb == NULL)  		goto errout; -	err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd); +	err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);  	if (err < 0)  		goto errout; @@ -2363,7 +2369,8 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)  				if (ipmr_fill_mroute(mrt, skb,  						     NETLINK_CB(cb->skb).portid,  						     cb->nlh->nlmsg_seq, -						     mfc, RTM_NEWROUTE) < 0) +						     mfc, RTM_NEWROUTE, +						     NLM_F_MULTI) < 0) 					goto done;  next_entry:  				e++; @@ -2377,7 +2384,8 @@ next_entry:  			if 
(ipmr_fill_mroute(mrt, skb,  					     NETLINK_CB(cb->skb).portid,  					     cb->nlh->nlmsg_seq, -					     mfc, RTM_NEWROUTE) < 0) { +					     mfc, RTM_NEWROUTE, +					     NLM_F_MULTI) < 0) {  				spin_unlock_bh(&mfc_unres_lock);  				goto done;  			} diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c3e0adea9c2..7ebd6e37875 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)  		skb_dst_set(skb, NULL);  		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);  		if (IS_ERR(dst)) -			return PTR_ERR(dst);; +			return PTR_ERR(dst);  		skb_dst_set(skb, dst);  	}  #endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1657e39b291..a26ce035e3f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,6 +36,42 @@ config NF_CONNTRACK_PROC_COMPAT  	  If unsure, say Y. +config NF_TABLES_IPV4 +	depends on NF_TABLES +	tristate "IPv4 nf_tables support" +	help +	  This option enables the IPv4 support for nf_tables. + +config NFT_CHAIN_ROUTE_IPV4 +	depends on NF_TABLES_IPV4 +	tristate "IPv4 nf_tables route chain support" +	help +	  This option enables the "route" chain for IPv4 in nf_tables. This +	  chain type is used to force packet re-routing after mangling header +	  fields such as the source, destination, type of service and +	  the packet mark. + +config NFT_CHAIN_NAT_IPV4 +	depends on NF_TABLES_IPV4 +	depends on NF_NAT_IPV4 && NFT_NAT +	tristate "IPv4 nf_tables nat chain support" +	help +	  This option enables the "nat" chain for IPv4 in nf_tables. This +	  chain type is used to perform Network Address Translation (NAT) +	  packet transformations such as the source, destination address and +	  source and destination ports. + +config NFT_REJECT_IPV4 +	depends on NF_TABLES_IPV4 +	default NFT_REJECT +	tristate + +config NF_TABLES_ARP +	depends on NF_TABLES +	tristate "ARP nf_tables support" +	help +	  This option enables the ARP support for nf_tables. +  config IP_NF_IPTABLES  	tristate "IP tables support (required for filtering/masq/NAT)"  	default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 3622b248b6d..90b82405331 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,6 +27,12 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o  # NAT protocols (nf_nat)  obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o +obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o +obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o +obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o +  # generic IP tables   obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 85a4f21aac1..f95b6f93814 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -271,6 +271,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,  	local_bh_disable();  	addend = xt_write_recseq_begin();  	private = table->private; +	/* +	 * Ensure we load private-> members after we've fetched the base +	 * pointer. 
+	 */ +	smp_read_barrier_depends();  	table_base = private->entries[smp_processor_id()];  	e = get_entry(table_base, private->hook_entry[hook]); @@ -1039,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,  	xt_free_table_info(oldinfo);  	if (copy_to_user(counters_ptr, counters, -			 sizeof(struct xt_counters) * num_counters) != 0) -		ret = -EFAULT; +			 sizeof(struct xt_counters) * num_counters) != 0) { +		/* Silent error, can't fail, new table is already in place */ +		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); +	}  	vfree(counters);  	xt_table_unlock(t);  	return ret; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index a865f6f9401..802ddecb30b 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,14 @@ static const struct xt_table packet_filter = {  /* The work comes in here from netfilter.c */  static unsigned int -arptable_filter_hook(unsigned int hook, struct sk_buff *skb, +arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		     const struct net_device *in, const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  {  	const struct net *net = dev_net((in != NULL) ? in : out); -	return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); +	return arpt_do_table(skb, ops->hooknum, in, out, +			     net->ipv4.arptable_filter);  }  static struct nf_hook_ops *arpfilter_ops __read_mostly; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d23118d95ff..99e810f8467 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -327,6 +327,11 @@ ipt_do_table(struct sk_buff *skb,  	addend = xt_write_recseq_begin();  	private = table->private;  	cpu        = smp_processor_id(); +	/* +	 * Ensure we load private-> members after we've fetched the base +	 * pointer. 
+	 */ +	smp_read_barrier_depends();  	table_base = private->entries[cpu];  	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];  	stackptr   = per_cpu_ptr(private->stackptr, cpu); @@ -1226,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,  	xt_free_table_info(oldinfo);  	if (copy_to_user(counters_ptr, counters, -			 sizeof(struct xt_counters) * num_counters) != 0) -		ret = -EFAULT; +			 sizeof(struct xt_counters) * num_counters) != 0) { +		/* Silent error, can't fail, new table is already in place */ +		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); +	}  	vfree(counters);  	xt_table_unlock(t);  	return ret; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 0b732efd32e..2510c02c2d2 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -28,6 +28,7 @@  #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>  #include <net/netfilter/nf_conntrack.h>  #include <net/net_namespace.h> +#include <net/netns/generic.h>  #include <net/checksum.h>  #include <net/ip.h> @@ -57,15 +58,21 @@ struct clusterip_config {  	struct rcu_head rcu;  }; -static LIST_HEAD(clusterip_configs); +#ifdef CONFIG_PROC_FS +static const struct file_operations clusterip_proc_fops; +#endif -/* clusterip_lock protects the clusterip_configs list */ -static DEFINE_SPINLOCK(clusterip_lock); +static int clusterip_net_id __read_mostly; + +struct clusterip_net { +	struct list_head configs; +	/* lock protects the configs list */ +	spinlock_t lock;  #ifdef CONFIG_PROC_FS -static const struct file_operations clusterip_proc_fops; -static struct proc_dir_entry *clusterip_procdir; +	struct proc_dir_entry *procdir;  #endif +};  static inline void  clusterip_config_get(struct clusterip_config *c) @@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c)  static inline void  clusterip_config_entry_put(struct clusterip_config *c)  { +	struct net *net = dev_net(c->dev); +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +  	local_bh_disable(); -	if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) { +	if (atomic_dec_and_lock(&c->entries, &cn->lock)) {  		list_del_rcu(&c->list); -		spin_unlock(&clusterip_lock); +		spin_unlock(&cn->lock);  		local_bh_enable();  		dev_mc_del(c->dev, c->clustermac); @@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c)  }  static struct clusterip_config * -__clusterip_config_find(__be32 clusterip) +__clusterip_config_find(struct net *net, __be32 clusterip)  {  	struct clusterip_config *c; +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); -	list_for_each_entry_rcu(c, &clusterip_configs, list) { +	list_for_each_entry_rcu(c, &cn->configs, list) {  		if (c->clusterip == clusterip)  			return c;  	} @@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip)  }  static inline struct clusterip_config * -clusterip_config_find_get(__be32 clusterip, int entry) +clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)  {  	struct clusterip_config *c;  	rcu_read_lock_bh(); -	c = __clusterip_config_find(clusterip); +	c = __clusterip_config_find(net, clusterip);  	if (c) {  		if (unlikely(!atomic_inc_not_zero(&c->refcount)))  			c = NULL; @@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,  			struct net_device *dev)  {  	struct clusterip_config *c; +	struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);  	c = kzalloc(sizeof(*c), 
GFP_ATOMIC);  	if (!c) @@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,  		/* create proc dir entry */  		sprintf(buffer, "%pI4", &ip);  		c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, -					  clusterip_procdir, +					  cn->procdir,  					  &clusterip_proc_fops, c);  		if (!c->pde) {  			kfree(c); @@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,  	}  #endif -	spin_lock_bh(&clusterip_lock); -	list_add_rcu(&c->list, &clusterip_configs); -	spin_unlock_bh(&clusterip_lock); +	spin_lock_bh(&cn->lock); +	list_add_rcu(&c->list, &cn->configs); +	spin_unlock_bh(&cn->lock);  	return c;  } @@ -370,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)  	/* FIXME: further sanity checks */ -	config = clusterip_config_find_get(e->ip.dst.s_addr, 1); +	config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);  	if (!config) {  		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {  			pr_info("no config found for %pI4, need 'new'\n", @@ -384,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)  				return -EINVAL;  			} -			dev = dev_get_by_name(&init_net, e->ip.iniface); +			dev = dev_get_by_name(par->net, e->ip.iniface);  			if (!dev) {  				pr_info("no such interface %s\n",  					e->ip.iniface); @@ -483,7 +495,7 @@ static void arp_print(struct arp_payload *payload)  #endif  static unsigned int -arp_mangle(unsigned int hook, +arp_mangle(const struct nf_hook_ops *ops,  	   struct sk_buff *skb,  	   const struct net_device *in,  	   const struct net_device *out, @@ -492,6 +504,7 @@ arp_mangle(unsigned int hook,  	struct arphdr *arp = arp_hdr(skb);  	struct arp_payload *payload;  	struct clusterip_config *c; +	struct net *net = dev_net(in ? 
in : out);  	/* we don't care about non-ethernet and non-ipv4 ARP */  	if (arp->ar_hrd != htons(ARPHRD_ETHER) || @@ -508,7 +521,7 @@ arp_mangle(unsigned int hook,  	/* if there is no clusterip configuration for the arp reply's  	 * source ip, we don't want to mangle it */ -	c = clusterip_config_find_get(payload->src_ip, 0); +	c = clusterip_config_find_get(net, payload->src_ip, 0);  	if (!c)  		return NF_ACCEPT; @@ -698,48 +711,75 @@ static const struct file_operations clusterip_proc_fops = {  #endif /* CONFIG_PROC_FS */ +static int clusterip_net_init(struct net *net) +{ +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); + +	INIT_LIST_HEAD(&cn->configs); + +	spin_lock_init(&cn->lock); + +#ifdef CONFIG_PROC_FS +	cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); +	if (!cn->procdir) { +		pr_err("Unable to proc dir entry\n"); +		return -ENOMEM; +	} +#endif /* CONFIG_PROC_FS */ + +	return 0; +} + +static void clusterip_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +	proc_remove(cn->procdir); +#endif +} + +static struct pernet_operations clusterip_net_ops = { +	.init = clusterip_net_init, +	.exit = clusterip_net_exit, +	.id   = &clusterip_net_id, +	.size = sizeof(struct clusterip_net), +}; +  static int __init clusterip_tg_init(void)  {  	int ret; -	ret = xt_register_target(&clusterip_tg_reg); +	ret = register_pernet_subsys(&clusterip_net_ops);  	if (ret < 0)  		return ret; +	ret = xt_register_target(&clusterip_tg_reg); +	if (ret < 0) +		goto cleanup_subsys; +  	ret = nf_register_hook(&cip_arp_ops);  	if (ret < 0)  		goto cleanup_target; -#ifdef CONFIG_PROC_FS -	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); -	if (!clusterip_procdir) { -		pr_err("Unable to proc dir entry\n"); -		ret = -ENOMEM; -		goto cleanup_hook; -	} -#endif /* CONFIG_PROC_FS */ -  	pr_info("ClusterIP Version %s loaded successfully\n",  		CLUSTERIP_VERSION); +  	return 0; -#ifdef CONFIG_PROC_FS -cleanup_hook: -	nf_unregister_hook(&cip_arp_ops); -#endif /* CONFIG_PROC_FS */  cleanup_target:  	xt_unregister_target(&clusterip_tg_reg); +cleanup_subsys: +	unregister_pernet_subsys(&clusterip_net_ops);  	return ret;  }  static void __exit clusterip_tg_exit(void)  {  	pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); -#ifdef CONFIG_PROC_FS -	proc_remove(clusterip_procdir); -#endif +  	nf_unregister_hook(&cip_arp_ops);  	xt_unregister_target(&clusterip_tg_reg); +	unregister_pernet_subsys(&clusterip_net_ops);  	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */  	rcu_barrier_bh(); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b969131ad1c..5b6e0df4ccf 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -17,10 +17,6 @@  #include <linux/udp.h>  #include <linux/icmp.h>  #include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> -#include <net/route.h> -#include <net/dst.h>  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter_ipv4/ip_tables.h>  #include <linux/netfilter_ipv4/ipt_REJECT.h> @@ -28,128 +24,12 @@  #include <linux/netfilter_bridge.h>  #endif +#include <net/netfilter/ipv4/nf_reject.h> +  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");  MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); -/* Send RST reply */ -static void send_reset(struct sk_buff *oldskb, int hook) -{ -	struct sk_buff *nskb; -	const struct iphdr *oiph; -	struct iphdr 
*niph; -	const struct tcphdr *oth; -	struct tcphdr _otcph, *tcph; - -	/* IP header checks: fragment. */ -	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) -		return; - -	oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), -				 sizeof(_otcph), &_otcph); -	if (oth == NULL) -		return; - -	/* No RST for RST. */ -	if (oth->rst) -		return; - -	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) -		return; - -	/* Check checksum */ -	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) -		return; -	oiph = ip_hdr(oldskb); - -	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + -			 LL_MAX_HEADER, GFP_ATOMIC); -	if (!nskb) -		return; - -	skb_reserve(nskb, LL_MAX_HEADER); - -	skb_reset_network_header(nskb); -	niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); -	niph->version	= 4; -	niph->ihl	= sizeof(struct iphdr) / 4; -	niph->tos	= 0; -	niph->id	= 0; -	niph->frag_off	= htons(IP_DF); -	niph->protocol	= IPPROTO_TCP; -	niph->check	= 0; -	niph->saddr	= oiph->daddr; -	niph->daddr	= oiph->saddr; - -	skb_reset_transport_header(nskb); -	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); -	memset(tcph, 0, sizeof(*tcph)); -	tcph->source	= oth->dest; -	tcph->dest	= oth->source; -	tcph->doff	= sizeof(struct tcphdr) / 4; - -	if (oth->ack) -		tcph->seq = oth->ack_seq; -	else { -		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + -				      oldskb->len - ip_hdrlen(oldskb) - -				      (oth->doff << 2)); -		tcph->ack = 1; -	} - -	tcph->rst	= 1; -	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, -				    niph->daddr, 0); -	nskb->ip_summed = CHECKSUM_PARTIAL; -	nskb->csum_start = (unsigned char *)tcph - nskb->head; -	nskb->csum_offset = offsetof(struct tcphdr, check); - -	/* ip_route_me_harder expects skb->dst to be set */ -	skb_dst_set_noref(nskb, skb_dst(oldskb)); - -	nskb->protocol = htons(ETH_P_IP); -	if (ip_route_me_harder(nskb, RTN_UNSPEC)) -		goto free_nskb; - -	niph->ttl	= ip4_dst_hoplimit(skb_dst(nskb)); - -	/* "Never happens" */ -	if (nskb->len > dst_mtu(skb_dst(nskb))) -		goto free_nskb; - -	nf_ct_attach(nskb, oldskb); - -#ifdef CONFIG_BRIDGE_NETFILTER -	/* If we use ip_local_out for bridged traffic, the MAC source on -	 * the RST will be ours, instead of the destination's.  This confuses -	 * some routers/firewalls, and they drop the packet.  So we need to -	 * build the eth header using the original destination's MAC as the -	 * source, and send the RST packet directly. 
-	 */ -	if (oldskb->nf_bridge) { -		struct ethhdr *oeth = eth_hdr(oldskb); -		nskb->dev = oldskb->nf_bridge->physindev; -		niph->tot_len = htons(nskb->len); -		ip_send_check(niph); -		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), -				    oeth->h_source, oeth->h_dest, nskb->len) < 0) -			goto free_nskb; -		dev_queue_xmit(nskb); -	} else -#endif -		ip_local_out(nskb); - -	return; - - free_nskb: -	kfree_skb(nskb); -} - -static inline void send_unreach(struct sk_buff *skb_in, int code) -{ -	icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); -} -  static unsigned int  reject_tg(struct sk_buff *skb, const struct xt_action_param *par)  { @@ -157,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)  	switch (reject->with) {  	case IPT_ICMP_NET_UNREACHABLE: -		send_unreach(skb, ICMP_NET_UNREACH); +		nf_send_unreach(skb, ICMP_NET_UNREACH);  		break;  	case IPT_ICMP_HOST_UNREACHABLE: -		send_unreach(skb, ICMP_HOST_UNREACH); +		nf_send_unreach(skb, ICMP_HOST_UNREACH);  		break;  	case IPT_ICMP_PROT_UNREACHABLE: -		send_unreach(skb, ICMP_PROT_UNREACH); +		nf_send_unreach(skb, ICMP_PROT_UNREACH);  		break;  	case IPT_ICMP_PORT_UNREACHABLE: -		send_unreach(skb, ICMP_PORT_UNREACH); +		nf_send_unreach(skb, ICMP_PORT_UNREACH);  		break;  	case IPT_ICMP_NET_PROHIBITED: -		send_unreach(skb, ICMP_NET_ANO); +		nf_send_unreach(skb, ICMP_NET_ANO);  		break;  	case IPT_ICMP_HOST_PROHIBITED: -		send_unreach(skb, ICMP_HOST_ANO); +		nf_send_unreach(skb, ICMP_HOST_ANO);  		break;  	case IPT_ICMP_ADMIN_PROHIBITED: -		send_unreach(skb, ICMP_PKT_FILTERED); +		nf_send_unreach(skb, ICMP_PKT_FILTERED);  		break;  	case IPT_TCP_RESET: -		send_reset(skb, par->hooknum); +		nf_send_reset(skb, par->hooknum);  	case IPT_ICMP_ECHOREPLY:  		/* Doesn't happen. 
*/  		break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index b6346bf2fde..a313c3fbeb4 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -244,6 +244,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,  	this_cpu_inc(snet->stats->cookie_valid);  	opts->mss = mss; +	opts->options |= XT_SYNPROXY_OPT_MSS;  	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)  		synproxy_check_timestamp_cookie(opts); @@ -297,7 +298,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)  	return XT_CONTINUE;  } -static unsigned int ipv4_synproxy_hook(unsigned int hooknum, +static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,  				       struct sk_buff *skb,  				       const struct net_device *in,  				       const struct net_device *out, @@ -422,6 +423,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)  static struct xt_target synproxy_tg4_reg __read_mostly = {  	.name		= "SYNPROXY",  	.family		= NFPROTO_IPV4, +	.hooks		= (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),  	.target		= synproxy_tg4,  	.targetsize	= sizeof(struct xt_synproxy_info),  	.checkentry	= synproxy_tg4_check, diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index cbc22158af4..9cb993cd224 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -220,6 +220,7 @@ static void ipt_ulog_packet(struct net *net,  	ub->qlen++;  	pm = nlmsg_data(nlh); +	memset(pm, 0, sizeof(*pm));  	/* We might not have a timestamp, get one */  	if (skb->tstamp.tv64 == 0) @@ -238,8 +239,6 @@ static void ipt_ulog_packet(struct net *net,  	}  	else if (loginfo->prefix[0] != '\0')  		strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); -	else -		*(pm->prefix) = '\0';  	if (in && in->hard_header_len > 0 &&  	    skb->mac_header != skb->network_header && @@ -251,13 +250,9 @@ static void ipt_ulog_packet(struct net *net,  	if (in)  		strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); -	else -		pm->indev_name[0] = '\0';  	if (out)  		strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); -	else -		pm->outdev_name[0] = '\0';  	/* copy_len <= skb->len, so can't fail. 
*/  	if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c49dcd0284a..4bfaedf9b34 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -89,11 +89,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)  	if (ipv4_is_multicast(iph->daddr)) {  		if (ipv4_is_zeronet(iph->saddr))  			return ipv4_is_local_multicast(iph->daddr) ^ invert; -		flow.flowi4_iif = 0; -	} else { -		flow.flowi4_iif = LOOPBACK_IFINDEX;  	} - +	flow.flowi4_iif = LOOPBACK_IFINDEX;  	flow.daddr = iph->saddr;  	flow.saddr = rpfilter_get_saddr(iph->daddr);  	flow.flowi4_oif = 0; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 50af5b45c05..e08a74a243a 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,20 +33,21 @@ static const struct xt_table packet_filter = {  };  static unsigned int -iptable_filter_hook(unsigned int hook, struct sk_buff *skb, +iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		    const struct net_device *in, const struct net_device *out,  		    int (*okfn)(struct sk_buff *))  {  	const struct net *net; -	if (hook == NF_INET_LOCAL_OUT && +	if (ops->hooknum == NF_INET_LOCAL_OUT &&  	    (skb->len < sizeof(struct iphdr) ||  	     ip_hdrlen(skb) < sizeof(struct iphdr)))  		/* root is playing with raw sockets. */  		return NF_ACCEPT;  	net = dev_net((in != NULL) ? in : out); -	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); +	return ipt_do_table(skb, ops->hooknum, in, out, +			    net->ipv4.iptable_filter);  }  static struct nf_hook_ops *filter_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 0d8cd82e0fa..6a5079c34bb 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -79,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)  /* The work comes in here from netfilter.c. */  static unsigned int -iptable_mangle_hook(unsigned int hook, +iptable_mangle_hook(const struct nf_hook_ops *ops,  		     struct sk_buff *skb,  		     const struct net_device *in,  		     const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  { -	if (hook == NF_INET_LOCAL_OUT) +	if (ops->hooknum == NF_INET_LOCAL_OUT)  		return ipt_mangle_out(skb, out); -	if (hook == NF_INET_POST_ROUTING) -		return ipt_do_table(skb, hook, in, out, +	if (ops->hooknum == NF_INET_POST_ROUTING) +		return ipt_do_table(skb, ops->hooknum, in, out,  				    dev_net(out)->ipv4.iptable_mangle);  	/* PREROUTING/INPUT/FORWARD: */ -	return ipt_do_table(skb, hook, in, out, +	return ipt_do_table(skb, ops->hooknum, in, out,  			    dev_net(in)->ipv4.iptable_mangle);  } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 683bfaffed6..f1787c04a4d 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -61,7 +61,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,  }  static unsigned int -nf_nat_ipv4_fn(unsigned int hooknum, +nf_nat_ipv4_fn(const struct nf_hook_ops *ops,  	       struct sk_buff *skb,  	       const struct net_device *in,  	       const struct net_device *out, @@ -71,7 +71,7 @@ nf_nat_ipv4_fn(unsigned int hooknum,  	enum ip_conntrack_info ctinfo;  	struct nf_conn_nat *nat;  	/* maniptype == SRC for postrouting. 
*/ -	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); +	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);  	/* We never see fragments: conntrack defrags on pre-routing  	 * and local-out, and nf_nat_out protects post-routing. @@ -91,24 +91,16 @@ nf_nat_ipv4_fn(unsigned int hooknum,  	if (nf_ct_is_untracked(ct))  		return NF_ACCEPT; -	nat = nfct_nat(ct); -	if (!nat) { -		/* NAT module was loaded late. */ -		if (nf_ct_is_confirmed(ct)) -			return NF_ACCEPT; -		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); -		if (nat == NULL) { -			pr_debug("failed to add NAT extension\n"); -			return NF_ACCEPT; -		} -	} +	nat = nf_ct_nat_ext_add(ct); +	if (nat == NULL) +		return NF_ACCEPT;  	switch (ctinfo) {  	case IP_CT_RELATED:  	case IP_CT_RELATED_REPLY:  		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {  			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, -							   hooknum)) +							   ops->hooknum))  				return NF_DROP;  			else  				return NF_ACCEPT; @@ -121,14 +113,14 @@ nf_nat_ipv4_fn(unsigned int hooknum,  		if (!nf_nat_initialized(ct, maniptype)) {  			unsigned int ret; -			ret = nf_nat_rule_find(skb, hooknum, in, out, ct); +			ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);  			if (ret != NF_ACCEPT)  				return ret;  		} else {  			pr_debug("Already setup manip %s for ct %p\n",  				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",  				 ct); -			if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) +			if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))  				goto oif_changed;  		}  		break; @@ -137,11 +129,11 @@ nf_nat_ipv4_fn(unsigned int hooknum,  		/* ESTABLISHED */  		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||  			     ctinfo == IP_CT_ESTABLISHED_REPLY); -		if (nf_nat_oif_changed(hooknum, ctinfo, nat, out)) +		if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))  			goto oif_changed;  	} -	return nf_nat_packet(ct, ctinfo, hooknum, skb); +	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);  oif_changed:  	nf_ct_kill_acct(ct, ctinfo, skb); @@ -149,7 +141,7 @@ oif_changed:  }  static unsigned int -nf_nat_ipv4_in(unsigned int hooknum, +nf_nat_ipv4_in(const struct nf_hook_ops *ops,  	       struct sk_buff *skb,  	       const struct net_device *in,  	       const struct net_device *out, @@ -158,7 +150,7 @@ nf_nat_ipv4_in(unsigned int hooknum,  	unsigned int ret;  	__be32 daddr = ip_hdr(skb)->daddr; -	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); +	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);  	if (ret != NF_DROP && ret != NF_STOLEN &&  	    daddr != ip_hdr(skb)->daddr)  		skb_dst_drop(skb); @@ -167,7 +159,7 @@ nf_nat_ipv4_in(unsigned int hooknum,  }  static unsigned int -nf_nat_ipv4_out(unsigned int hooknum, +nf_nat_ipv4_out(const struct nf_hook_ops *ops,  		struct sk_buff *skb,  		const struct net_device *in,  		const struct net_device *out, @@ -185,7 +177,7 @@ nf_nat_ipv4_out(unsigned int hooknum,  	    ip_hdrlen(skb) < sizeof(struct iphdr))  		return NF_ACCEPT; -	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); +	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);  #ifdef CONFIG_XFRM  	if (ret != NF_DROP && ret != NF_STOLEN &&  	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && @@ -207,7 +199,7 @@ nf_nat_ipv4_out(unsigned int hooknum,  }  static unsigned int -nf_nat_ipv4_local_fn(unsigned int hooknum, +nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,  		     struct sk_buff *skb,  		     const struct net_device *in,  		     const struct net_device *out, @@ -223,7 +215,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum,  	    ip_hdrlen(skb) < 
sizeof(struct iphdr))  		return NF_ACCEPT; -	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn); +	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);  	if (ret != NF_DROP && ret != NF_STOLEN &&  	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {  		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 1f82aea11df..b2f7e8f9831 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,20 +20,20 @@ static const struct xt_table packet_raw = {  /* The work comes in here from netfilter.c. */  static unsigned int -iptable_raw_hook(unsigned int hook, struct sk_buff *skb, +iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		 const struct net_device *in, const struct net_device *out,  		 int (*okfn)(struct sk_buff *))  {  	const struct net *net; -	if (hook == NF_INET_LOCAL_OUT &&  +	if (ops->hooknum == NF_INET_LOCAL_OUT &&  	    (skb->len < sizeof(struct iphdr) ||  	     ip_hdrlen(skb) < sizeof(struct iphdr)))  		/* root is playing with raw sockets. */  		return NF_ACCEPT;  	net = dev_net((in != NULL) ? in : out); -	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); +	return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);  }  static struct nf_hook_ops *rawtable_ops __read_mostly; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index f867a8d38bf..c86647ed207 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,21 +37,22 @@ static const struct xt_table security_table = {  };  static unsigned int -iptable_security_hook(unsigned int hook, struct sk_buff *skb, +iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in,  		      const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  {  	const struct net *net; -	if (hook == NF_INET_LOCAL_OUT && +	if (ops->hooknum == NF_INET_LOCAL_OUT &&  	    (skb->len < sizeof(struct iphdr) ||  	     ip_hdrlen(skb) < sizeof(struct iphdr)))  		/* Somebody is playing with raw sockets. */  		return NF_ACCEPT;  	net = dev_net((in != NULL) ? 
in : out); -	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); +	return ipt_do_table(skb, ops->hooknum, in, out, +			    net->ipv4.iptable_security);  }  static struct nf_hook_ops *sectbl_ops __read_mostly; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 86f5b34a4ed..8127dc80286 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,  	return NF_ACCEPT;  } -static unsigned int ipv4_helper(unsigned int hooknum, +static unsigned int ipv4_helper(const struct nf_hook_ops *ops,  				struct sk_buff *skb,  				const struct net_device *in,  				const struct net_device *out, @@ -121,7 +121,7 @@ static unsigned int ipv4_helper(unsigned int hooknum,  			    ct, ctinfo);  } -static unsigned int ipv4_confirm(unsigned int hooknum, +static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,  				 struct sk_buff *skb,  				 const struct net_device *in,  				 const struct net_device *out, @@ -147,16 +147,16 @@ out:  	return nf_conntrack_confirm(skb);  } -static unsigned int ipv4_conntrack_in(unsigned int hooknum, +static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,  				      struct sk_buff *skb,  				      const struct net_device *in,  				      const struct net_device *out,  				      int (*okfn)(struct sk_buff *))  { -	return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); +	return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);  } -static unsigned int ipv4_conntrack_local(unsigned int hooknum, +static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,  					 struct sk_buff *skb,  					 const struct net_device *in,  					 const struct net_device *out, @@ -166,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,  	if (skb->len < sizeof(struct iphdr) ||  	    ip_hdrlen(skb) < sizeof(struct iphdr))  		return NF_ACCEPT; -	return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); +	return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);  }  /* Connection tracking may drop packets, but never alters them, so @@ -548,9 +548,3 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)  module_init(nf_conntrack_l3proto_ipv4_init);  module_exit(nf_conntrack_l3proto_ipv4_fini); - -void need_ipv4_conntrack(void) -{ -	return; -} -EXPORT_SYMBOL_GPL(need_ipv4_conntrack); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 742815518b0..b8f6381c7d0 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,7 +22,6 @@  #endif  #include <net/netfilter/nf_conntrack_zones.h> -/* Returns new sk_buff, or NULL */  static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)  {  	int err; @@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)  	err = ip_defrag(skb, user);  	local_bh_enable(); -	if (!err) +	if (!err) {  		ip_send_check(ip_hdr(skb)); +		skb->ignore_df = 1; +	}  	return err;  } @@ -60,7 +61,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,  		return IP_DEFRAG_CONNTRACK_OUT + zone;  } -static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, +static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,  					  struct sk_buff *skb,  					  const struct net_device *in,  					  const struct net_device *out, @@ -83,7 +84,9 @@ static 
unsigned int ipv4_conntrack_defrag(unsigned int hooknum,  #endif  	/* Gather fragments. */  	if (ip_is_fragment(ip_hdr(skb))) { -		enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); +		enum ip_defrag_users user = +			nf_ct_defrag_user(ops->hooknum, skb); +  		if (nf_ct_ipv4_gather_frags(skb, user))  			return NF_STOLEN;  	} diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9eea059dd62..574f7ebba0b 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -229,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  			ret = nf_ct_expect_related(rtcp_exp);  			if (ret == 0)  				break; -			else if (ret != -EBUSY) { +			else if (ret == -EBUSY) { +				nf_ct_unexpect_related(rtp_exp); +				continue; +			} else if (ret < 0) {  				nf_ct_unexpect_related(rtp_exp);  				nated_port = 0;  				break; diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 5f011cc89cd..7c676671329 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -34,8 +34,7 @@   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   * GNU General Public License for more details.   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   *   * Author: James Morris <jmorris@intercode.com.au>   * @@ -462,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,  	}  	if (subid < 40) { -		optr [0] = 0; -		optr [1] = subid; +		optr[0] = 0; +		optr[1] = subid;  	} else if (subid < 80) { -		optr [0] = 1; -		optr [1] = subid - 40; +		optr[0] = 1; +		optr[1] = subid - 40;  	} else { -		optr [0] = 2; -		optr [1] = subid - 80; +		optr[0] = 2; +		optr[1] = subid - 80;  	}  	*len = 2; @@ -1199,8 +1198,8 @@ static int snmp_translate(struct nf_conn *ct,  		map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);  	} else {  		/* DNAT replies */ -		map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); -		map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); +		map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); +		map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip);  	}  	if (map.from == map.to) diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c new file mode 100644 index 00000000000..19412a4063f --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netfilter_arp.h> +#include <net/netfilter/nf_tables.h> + +static unsigned int +nft_do_chain_arp(const struct nf_hook_ops *ops, +		  struct sk_buff *skb, +		  const struct net_device *in, +		  const struct net_device *out, +		  int (*okfn)(struct sk_buff *)) +{ +	struct nft_pktinfo pkt; + +	nft_set_pktinfo(&pkt, ops, skb, in, out); + +	return nft_do_chain(&pkt, ops); +} + +static struct nft_af_info nft_af_arp __read_mostly = { +	.family		= NFPROTO_ARP, +	.nhooks		= NF_ARP_NUMHOOKS, +	.owner		= THIS_MODULE, +	.nops		= 1, +	.hooks		= { +		[NF_ARP_IN]		= nft_do_chain_arp, +		[NF_ARP_OUT]		= nft_do_chain_arp, +		[NF_ARP_FORWARD]	= nft_do_chain_arp, +	}, +}; + +static int nf_tables_arp_init_net(struct net *net) +{ +	net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); +	if (net->nft.arp== NULL) +		return -ENOMEM; + +	memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); + +	if (nft_register_afinfo(net, net->nft.arp) < 0) +		goto err; + +	return 0; +err: +	kfree(net->nft.arp); +	return -ENOMEM; +} + +static void nf_tables_arp_exit_net(struct net *net) +{ +	nft_unregister_afinfo(net->nft.arp); +	kfree(net->nft.arp); +} + +static struct pernet_operations nf_tables_arp_net_ops = { +	.init   = nf_tables_arp_init_net, +	.exit   = nf_tables_arp_exit_net, +}; + +static const struct nf_chain_type filter_arp = { +	.name		= "filter", +	.type		= NFT_CHAIN_T_DEFAULT, +	.family		= NFPROTO_ARP, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_ARP_IN) | +			  (1 << NF_ARP_OUT) | +			  (1 << NF_ARP_FORWARD), +}; + +static int __init nf_tables_arp_init(void) +{ +	int ret; + +	nft_register_chain_type(&filter_arp); +	ret = register_pernet_subsys(&nf_tables_arp_net_ops); +	if (ret < 0) +		nft_unregister_chain_type(&filter_arp); + +	return ret; +} + +static void __exit nf_tables_arp_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_arp_net_ops); +	nft_unregister_chain_type(&filter_arp); +} + +module_init(nf_tables_arp_init); +module_exit(nf_tables_arp_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c new file mode 100644 index 00000000000..6820c8c4084 --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_tables.h> +#include <net/net_namespace.h> +#include <net/ip.h> +#include <net/netfilter/nf_tables_ipv4.h> + +static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, +				      struct sk_buff *skb, +				      const struct net_device *in, +				      const struct net_device *out, +				      int (*okfn)(struct sk_buff *)) +{ +	struct nft_pktinfo pkt; + +	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + +	return nft_do_chain(&pkt, ops); +} + +static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, +				    struct sk_buff *skb, +				    const struct net_device *in, +				    const struct net_device *out, +				    int (*okfn)(struct sk_buff *)) +{ +	if (unlikely(skb->len < sizeof(struct iphdr) || +		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { +		if (net_ratelimit()) +			pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " +				"packet\n"); +		return NF_ACCEPT; +	} + +	return nft_do_chain_ipv4(ops, skb, in, out, okfn); +} + +struct nft_af_info nft_af_ipv4 __read_mostly = { +	.family		= NFPROTO_IPV4, +	.nhooks		= NF_INET_NUMHOOKS, +	.owner		= THIS_MODULE, +	.nops		= 1, +	.hooks		= { +		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4, +		[NF_INET_LOCAL_OUT]	= nft_ipv4_output, +		[NF_INET_FORWARD]	= nft_do_chain_ipv4, +		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4, +		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4, +	}, +}; +EXPORT_SYMBOL_GPL(nft_af_ipv4); + +static int nf_tables_ipv4_init_net(struct net *net) +{ +	net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); +	if (net->nft.ipv4 == NULL) +		return -ENOMEM; + +	memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); + +	if (nft_register_afinfo(net, net->nft.ipv4) < 0) +		goto err; + +	return 0; +err: +	kfree(net->nft.ipv4); +	return -ENOMEM; +} + +static void nf_tables_ipv4_exit_net(struct net *net) +{ +	nft_unregister_afinfo(net->nft.ipv4); +	kfree(net->nft.ipv4); +} + +static struct pernet_operations nf_tables_ipv4_net_ops = { +	.init	= nf_tables_ipv4_init_net, +	.exit	= nf_tables_ipv4_exit_net, +}; + +static const struct nf_chain_type filter_ipv4 = { +	.name		= "filter", +	.type		= NFT_CHAIN_T_DEFAULT, +	.family		= NFPROTO_IPV4, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_LOCAL_IN) | +			  (1 << NF_INET_LOCAL_OUT) | +			  (1 << NF_INET_FORWARD) | +			  (1 << NF_INET_PRE_ROUTING) | +			  (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_ipv4_init(void) +{ +	int ret; + +	nft_register_chain_type(&filter_ipv4); +	ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); +	if (ret < 0) +		nft_unregister_chain_type(&filter_ipv4); + +	return ret; +} + +static void __exit nf_tables_ipv4_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_ipv4_net_ops); +	nft_unregister_chain_type(&filter_ipv4); +} + +module_init(nf_tables_ipv4_init); +module_exit(nf_tables_ipv4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(AF_INET); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c new file mode 100644 index 00000000000..3964157d826 --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +/* + * NAT chains + */ + +static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, +			      struct sk_buff *skb, +			      const struct net_device *in, +			      const struct net_device *out, +			      int (*okfn)(struct sk_buff *)) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	struct nf_conn_nat *nat; +	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); +	struct nft_pktinfo pkt; +	unsigned int ret; + +	if (ct == NULL || nf_ct_is_untracked(ct)) +		return NF_ACCEPT; + +	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); + +	nat = nf_ct_nat_ext_add(ct); +	if (nat == NULL) +		return NF_ACCEPT; + +	switch (ctinfo) { +	case IP_CT_RELATED: +	case IP_CT_RELATED + IP_CT_IS_REPLY: +		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { +			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, +							   ops->hooknum)) +				return NF_DROP; +			else +				return NF_ACCEPT; +		} +		/* Fall through */ +	case IP_CT_NEW: +		if (nf_nat_initialized(ct, maniptype)) +			break; + +		nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + +		ret = nft_do_chain(&pkt, ops); +		if (ret != NF_ACCEPT) +			return ret; +		if (!nf_nat_initialized(ct, maniptype)) { +			ret = nf_nat_alloc_null_binding(ct, ops->hooknum); +			if (ret != NF_ACCEPT) +				return ret; +		} +	default: +		break; +	} + +	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); +} + +static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, +				      struct sk_buff *skb, +				      const struct net_device *in, +				      const struct net_device *out, +				      int (*okfn)(struct sk_buff *)) +{ +	__be32 daddr = ip_hdr(skb)->daddr; +	unsigned int ret; + +	ret = nf_nat_fn(ops, skb, in, out, okfn); +	if (ret != NF_DROP && ret != NF_STOLEN && +	    ip_hdr(skb)->daddr != daddr) { +		skb_dst_drop(skb); +	} +	return ret; +} + +static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, +				       struct sk_buff *skb, +				       const struct net_device *in, +				       const struct net_device *out, +				       int (*okfn)(struct sk_buff *)) +{ +	enum ip_conntrack_info ctinfo __maybe_unused; +	const struct nf_conn *ct __maybe_unused; +	unsigned int ret; + +	ret = nf_nat_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM +	if (ret != NF_DROP && ret != NF_STOLEN && +	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { +		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + +		if (ct->tuplehash[dir].tuple.src.u3.ip != +		    ct->tuplehash[!dir].tuple.dst.u3.ip || +		    ct->tuplehash[dir].tuple.src.u.all != +		    ct->tuplehash[!dir].tuple.dst.u.all) +			return nf_xfrm_me_harder(skb, AF_INET) == 0 ? 
+								ret : NF_DROP; +	} +#endif +	return ret; +} + +static unsigned int nf_nat_output(const struct nf_hook_ops *ops, +				  struct sk_buff *skb, +				  const struct net_device *in, +				  const struct net_device *out, +				  int (*okfn)(struct sk_buff *)) +{ +	enum ip_conntrack_info ctinfo; +	const struct nf_conn *ct; +	unsigned int ret; + +	ret = nf_nat_fn(ops, skb, in, out, okfn); +	if (ret != NF_DROP && ret != NF_STOLEN && +	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { +		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + +		if (ct->tuplehash[dir].tuple.dst.u3.ip != +		    ct->tuplehash[!dir].tuple.src.u3.ip) { +			if (ip_route_me_harder(skb, RTN_UNSPEC)) +				ret = NF_DROP; +		} +#ifdef CONFIG_XFRM +		else if (ct->tuplehash[dir].tuple.dst.u.all != +			 ct->tuplehash[!dir].tuple.src.u.all) +			if (nf_xfrm_me_harder(skb, AF_INET)) +				ret = NF_DROP; +#endif +	} +	return ret; +} + +static const struct nf_chain_type nft_chain_nat_ipv4 = { +	.name		= "nat", +	.type		= NFT_CHAIN_T_NAT, +	.family		= NFPROTO_IPV4, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_PRE_ROUTING) | +			  (1 << NF_INET_POST_ROUTING) | +			  (1 << NF_INET_LOCAL_OUT) | +			  (1 << NF_INET_LOCAL_IN), +	.hooks		= { +		[NF_INET_PRE_ROUTING]	= nf_nat_prerouting, +		[NF_INET_POST_ROUTING]	= nf_nat_postrouting, +		[NF_INET_LOCAL_OUT]	= nf_nat_output, +		[NF_INET_LOCAL_IN]	= nf_nat_fn, +	}, +}; + +static int __init nft_chain_nat_init(void) +{ +	int err; + +	err = nft_register_chain_type(&nft_chain_nat_ipv4); +	if (err < 0) +		return err; + +	return 0; +} + +static void __exit nft_chain_nat_exit(void) +{ +	nft_unregister_chain_type(&nft_chain_nat_ipv4); +} + +module_init(nft_chain_nat_init); +module_exit(nft_chain_nat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c new file mode 100644 index 00000000000..125b66766c0 --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/route.h> +#include <net/ip.h> + +static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +					struct sk_buff *skb, +					const struct net_device *in, +					const struct net_device *out, +					int (*okfn)(struct sk_buff *)) +{ +	unsigned int ret; +	struct nft_pktinfo pkt; +	u32 mark; +	__be32 saddr, daddr; +	u_int8_t tos; +	const struct iphdr *iph; + +	/* root is playing with raw sockets. 
*/ +	if (skb->len < sizeof(struct iphdr) || +	    ip_hdrlen(skb) < sizeof(struct iphdr)) +		return NF_ACCEPT; + +	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + +	mark = skb->mark; +	iph = ip_hdr(skb); +	saddr = iph->saddr; +	daddr = iph->daddr; +	tos = iph->tos; + +	ret = nft_do_chain(&pkt, ops); +	if (ret != NF_DROP && ret != NF_QUEUE) { +		iph = ip_hdr(skb); + +		if (iph->saddr != saddr || +		    iph->daddr != daddr || +		    skb->mark != mark || +		    iph->tos != tos) +			if (ip_route_me_harder(skb, RTN_UNSPEC)) +				ret = NF_DROP; +	} +	return ret; +} + +static const struct nf_chain_type nft_chain_route_ipv4 = { +	.name		= "route", +	.type		= NFT_CHAIN_T_ROUTE, +	.family		= NFPROTO_IPV4, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_LOCAL_OUT), +	.hooks		= { +		[NF_INET_LOCAL_OUT]	= nf_route_table_hook, +	}, +}; + +static int __init nft_chain_route_init(void) +{ +	return nft_register_chain_type(&nft_chain_route_ipv4); +} + +static void __exit nft_chain_route_exit(void) +{ +	nft_unregister_chain_type(&nft_chain_route_ipv4); +} + +module_init(nft_chain_route_init); +module_exit(nft_chain_route_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c new file mode 100644 index 00000000000..e79718a382f --- /dev/null +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/icmp.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> + +void nft_reject_ipv4_eval(const struct nft_expr *expr, +			  struct nft_data data[NFT_REG_MAX + 1], +			  const struct nft_pktinfo *pkt) +{ +	struct nft_reject *priv = nft_expr_priv(expr); + +	switch (priv->type) { +	case NFT_REJECT_ICMP_UNREACH: +		nf_send_unreach(pkt->skb, priv->icmp_code); +		break; +	case NFT_REJECT_TCP_RST: +		nf_send_reset(pkt->skb, pkt->ops->hooknum); +		break; +	} + +	data[NFT_REG_VERDICT].verdict = NF_DROP; +} +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); + +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { +	.type		= &nft_reject_ipv4_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)), +	.eval		= nft_reject_ipv4_eval, +	.init		= nft_reject_init, +	.dump		= nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { +	.family		= NFPROTO_IPV4, +	.name		= "reject", +	.ops		= &nft_reject_ipv4_ops, +	.policy		= nft_reject_policy, +	.maxattr	= NFTA_REJECT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_reject_ipv4_module_init(void) +{ +	return nft_register_expr(&nft_reject_ipv4_type); +} + +static void __exit nft_reject_ipv4_module_exit(void) +{ +	nft_unregister_expr(&nft_reject_ipv4_type); +} + +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d7d9882d4ca..044a0ddf6a7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -53,8 +53,12 @@  #include <net/transp_v6.h>  #endif +struct ping_table { +	struct hlist_nulls_head	hash[PING_HTABLE_SIZE]; +	rwlock_t		lock; +}; -struct ping_table ping_table; +static struct ping_table ping_table;  struct pingv6_ops pingv6_ops;  EXPORT_SYMBOL_GPL(pingv6_ops); @@ -202,15 +206,14 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)  #if IS_ENABLED(CONFIG_IPV6)  		} else if (skb->protocol == htons(ETH_P_IPV6) &&  			   sk->sk_family == AF_INET6) { -			struct ipv6_pinfo *np = inet6_sk(sk);  			pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,  				 (int) isk->inet_num, -				 &inet6_sk(sk)->rcv_saddr, +				 &sk->sk_v6_rcv_saddr,  				 sk->sk_bound_dev_if); -			if (!ipv6_addr_any(&np->rcv_saddr) && -			    !ipv6_addr_equal(&np->rcv_saddr, +			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && +			    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,  					     &ipv6_hdr(skb)->daddr))  				continue;  #endif @@ -233,15 +236,15 @@ exit:  static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,  					  kgid_t *high)  { -	kgid_t *data = net->ipv4.sysctl_ping_group_range; +	kgid_t *data = net->ipv4.ping_group_range.range;  	unsigned int seq;  	do { -		seq = read_seqbegin(&sysctl_local_ports.lock); +		seq = read_seqbegin(&net->ipv4.ping_group_range.lock);  		*low = data[0];  		*high = data[1]; -	} while (read_seqretry(&sysctl_local_ports.lock, seq)); +	} while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));  } @@ -249,26 +252,33 @@ int ping_init_sock(struct sock *sk)  {  	struct net *net = 
sock_net(sk);  	kgid_t group = current_egid(); -	struct group_info *group_info = get_current_groups(); -	int i, j, count = group_info->ngroups; +	struct group_info *group_info; +	int i, j, count;  	kgid_t low, high; +	int ret = 0;  	inet_get_ping_group_range_net(net, &low, &high);  	if (gid_lte(low, group) && gid_lte(group, high))  		return 0; +	group_info = get_current_groups(); +	count = group_info->ngroups;  	for (i = 0; i < group_info->nblocks; i++) {  		int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);  		for (j = 0; j < cp_count; j++) {  			kgid_t gid = group_info->blocks[i][j];  			if (gid_lte(low, gid) && gid_lte(gid, high)) -				return 0; +				goto out_release_group;  		}  		count -= cp_count;  	} -	return -EACCES; +	ret = -EACCES; + +out_release_group: +	put_group_info(group_info); +	return ret;  }  EXPORT_SYMBOL_GPL(ping_init_sock); @@ -317,6 +327,9 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,  		if (addr_len < sizeof(*addr))  			return -EINVAL; +		if (addr->sin6_family != AF_INET6) +			return -EINVAL; +  		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",  			 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); @@ -362,7 +375,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)  	} else if (saddr->sa_family == AF_INET6) {  		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;  		struct ipv6_pinfo *np = inet6_sk(sk); -		np->rcv_saddr = np->saddr = addr->sin6_addr; +		sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;  #endif  	}  } @@ -376,7 +389,7 @@ static void ping_clear_saddr(struct sock *sk, int dif)  #if IS_ENABLED(CONFIG_IPV6)  	} else if (sk->sk_family == AF_INET6) {  		struct ipv6_pinfo *np = inet6_sk(sk); -		memset(&np->rcv_saddr, 0, sizeof(np->rcv_saddr)); +		memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));  		memset(&np->saddr, 0, sizeof(np->saddr));  #endif  	} @@ -416,10 +429,12 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)  		 (int)sk->sk_bound_dev_if);  	err = 0; -	if ((sk->sk_family == AF_INET && isk->inet_rcv_saddr) || -	    (sk->sk_family == AF_INET6 && -	     !ipv6_addr_any(&inet6_sk(sk)->rcv_saddr))) +	if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) +		sk->sk_userlocks |= SOCK_BINDADDR_LOCK; +#if IS_ENABLED(CONFIG_IPV6) +	if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))  		sk->sk_userlocks |= SOCK_BINDADDR_LOCK; +#endif  	if (snum)  		sk->sk_userlocks |= SOCK_BINDPORT_LOCK; @@ -429,7 +444,7 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)  #if IS_ENABLED(CONFIG_IPV6)  	if (sk->sk_family == AF_INET6) -		memset(&inet6_sk(sk)->daddr, 0, sizeof(inet6_sk(sk)->daddr)); +		memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));  #endif  	sk_dst_reset(sk); @@ -667,8 +682,8 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,  }  EXPORT_SYMBOL_GPL(ping_common_sendmsg); -int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, -		    size_t len) +static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +			   size_t len)  {  	struct net *net = sock_net(sk);  	struct flowi4 fl4; @@ -695,7 +710,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	 */  	if (msg->msg_name) { -		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; +		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);  		if (msg->msg_namelen < sizeof(*usin))  			return -EINVAL;  		if (usin->sin_family != AF_INET) @@ -713,11 +728,13 @@ 
int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	ipc.opt = NULL;  	ipc.oif = sk->sk_bound_dev_if;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1;  	sock_tx_timestamp(sk, &ipc.tx_flags);  	if (msg->msg_controllen) { -		err = ip_cmsg_send(sock_net(sk), msg, &ipc); +		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);  		if (err)  			return err;  		if (ipc.opt) @@ -744,7 +761,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  			return -EINVAL;  		faddr = ipc.opt->opt.faddr;  	} -	tos = RT_TOS(inet->tos); +	tos = get_rttos(&ipc, inet);  	if (sock_flag(sk, SOCK_LOCALROUTE) ||  	    (msg->msg_flags & MSG_DONTROUTE) ||  	    (ipc.opt && ipc.opt->opt.is_strictroute)) { @@ -769,7 +786,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		err = PTR_ERR(rt);  		rt = NULL;  		if (err == -ENETUNREACH) -			IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); +			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);  		goto out;  	} @@ -827,8 +844,6 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  {  	struct inet_sock *isk = inet_sk(sk);  	int family = sk->sk_family; -	struct sockaddr_in *sin; -	struct sockaddr_in6 *sin6;  	struct sk_buff *skb;  	int copied, err; @@ -838,19 +853,13 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	if (flags & MSG_OOB)  		goto out; -	if (addr_len) { -		if (family == AF_INET) -			*addr_len = sizeof(*sin); -		else if (family == AF_INET6 && addr_len) -			*addr_len = sizeof(*sin6); -	} -  	if (flags & MSG_ERRQUEUE) {  		if (family == AF_INET) { -			return ip_recv_error(sk, msg, len); +			return ip_recv_error(sk, msg, len, addr_len);  #if IS_ENABLED(CONFIG_IPV6)  		} else if (family == AF_INET6) { -			return pingv6_ops.ipv6_recv_error(sk, msg, len); +			return pingv6_ops.ipv6_recv_error(sk, msg, len, +							  addr_len);  #endif  		}  	} @@ -874,11 +883,15 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	/* Copy the address and add cmsg data. 
*/  	if (family == AF_INET) { -		sin = (struct sockaddr_in *) msg->msg_name; -		sin->sin_family = AF_INET; -		sin->sin_port = 0 /* skb->h.uh->source */; -		sin->sin_addr.s_addr = ip_hdr(skb)->saddr; -		memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +		DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); + +		if (sin) { +			sin->sin_family = AF_INET; +			sin->sin_port = 0 /* skb->h.uh->source */; +			sin->sin_addr.s_addr = ip_hdr(skb)->saddr; +			memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +			*addr_len = sizeof(*sin); +		}  		if (isk->cmsg_flags)  			ip_cmsg_recv(msg, skb); @@ -887,20 +900,28 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	} else if (family == AF_INET6) {  		struct ipv6_pinfo *np = inet6_sk(sk);  		struct ipv6hdr *ip6 = ipv6_hdr(skb); -		sin6 = (struct sockaddr_in6 *) msg->msg_name; -		sin6->sin6_family = AF_INET6; -		sin6->sin6_port = 0; -		sin6->sin6_addr = ip6->saddr; - -		sin6->sin6_flowinfo = 0; -		if (np->sndflow) -			sin6->sin6_flowinfo = ip6_flowinfo(ip6); - -		sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, -							  IP6CB(skb)->iif); +		DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); + +		if (sin6) { +			sin6->sin6_family = AF_INET6; +			sin6->sin6_port = 0; +			sin6->sin6_addr = ip6->saddr; +			sin6->sin6_flowinfo = 0; +			if (np->sndflow) +				sin6->sin6_flowinfo = ip6_flowinfo(ip6); +			sin6->sin6_scope_id = +				ipv6_iface_scope_id(&sin6->sin6_addr, +						    IP6CB(skb)->iif); +			*addr_len = sizeof(*sin6); +		}  		if (inet6_sk(sk)->rxopt.all) -			pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb); +			pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); +		if (skb->protocol == htons(ETH_P_IPV6) && +		    inet6_sk(sk)->rxopt.all) +			pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); +		else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) +			ip_cmsg_recv(msg, skb);  #endif  	} else {  		BUG(); @@ -1073,7 +1094,7 @@ void ping_seq_stop(struct seq_file *seq, void *v)  EXPORT_SYMBOL_GPL(ping_seq_stop);  static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, -		int bucket, int *len) +		int bucket)  {  	struct inet_sock *inet = inet_sk(sp);  	__be32 dest = inet->inet_daddr; @@ -1082,7 +1103,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,  	__u16 srcp = ntohs(inet->inet_sport);  	seq_printf(f, "%5d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", +		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",  		bucket, src, srcp, dest, destp, sp->sk_state,  		sk_wmem_alloc_get(sp),  		sk_rmem_alloc_get(sp), @@ -1090,23 +1111,22 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,  		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),  		0, sock_i_ino(sp),  		atomic_read(&sp->sk_refcnt), sp, -		atomic_read(&sp->sk_drops), len); +		atomic_read(&sp->sk_drops));  }  static int ping_v4_seq_show(struct seq_file *seq, void *v)  { +	seq_setwidth(seq, 127);  	if (v == SEQ_START_TOKEN) -		seq_printf(seq, "%-127s\n", -			   "  sl  local_address rem_address   st tx_queue " +		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "  			   "rx_queue tr tm->when retrnsmt   uid  timeout "  			   "inode ref pointer drops");  	else {  		struct ping_iter_state *state = seq->private; -		int len; -		ping_v4_format_sock(v, seq, state->bucket, &len); -		seq_printf(seq, "%*s\n", 127 - len, ""); +		ping_v4_format_sock(v, seq, state->bucket);  	} +	seq_pad(seq, '\n');  	return 0;  } diff --git a/net/ipv4/proc.c 
b/net/ipv4/proc.c index 4a0335854b8..ae0af9386f7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,12 +273,19 @@ static const struct snmp_mib snmp4_net_list[] = {  	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),  	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),  	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), +	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),  	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),  	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),  	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),  	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),  	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),  	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), +	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), +	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), +	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), +	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), +	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), +	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),  	SNMP_MIB_SENTINEL  }; @@ -332,22 +339,22 @@ static void icmp_put(struct seq_file *seq)  	atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;  	seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); -	for (i=0; icmpmibmap[i].name != NULL; i++) +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		seq_printf(seq, " In%s", icmpmibmap[i].name);  	seq_printf(seq, " OutMsgs OutErrors"); -	for (i=0; icmpmibmap[i].name != NULL; i++) +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		seq_printf(seq, " Out%s", icmpmibmap[i].name);  	seq_printf(seq, "\nIcmp: %lu %lu %lu", -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS), -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); -	for (i=0; icmpmibmap[i].name != NULL; i++) +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		seq_printf(seq, " %lu",  			   atomic_long_read(ptr + icmpmibmap[i].index));  	seq_printf(seq, " %lu %lu", -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); -	for (i=0; icmpmibmap[i].name != NULL; i++) +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		seq_printf(seq, " %lu",  			   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));  } @@ -372,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);  	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)  		seq_printf(seq, " %llu", -			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics, +			   snmp_fold_field64(net->mib.ip_statistics,  					     snmp4_ipstats_list[i].entry,  					     offsetof(struct ipstats_mib, syncp))); @@ -388,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  		/* MaxConn field is signed, RFC 2012 */  		if 
(snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)  			seq_printf(seq, " %ld", -				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics, +				   snmp_fold_field(net->mib.tcp_statistics,  						   snmp4_tcp_list[i].entry));  		else  			seq_printf(seq, " %lu", -				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics, +				   snmp_fold_field(net->mib.tcp_statistics,  						   snmp4_tcp_list[i].entry));  	} @@ -403,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nUdp:");  	for (i = 0; snmp4_udp_list[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			   snmp_fold_field((void __percpu **)net->mib.udp_statistics, +			   snmp_fold_field(net->mib.udp_statistics,  					   snmp4_udp_list[i].entry));  	/* the UDP and UDP-Lite MIBs are the same */ @@ -414,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nUdpLite:");  	for (i = 0; snmp4_udp_list[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			   snmp_fold_field((void __percpu **)net->mib.udplite_statistics, +			   snmp_fold_field(net->mib.udplite_statistics,  					   snmp4_udp_list[i].entry));  	seq_putc(seq, '\n'); @@ -451,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nTcpExt:");  	for (i = 0; snmp4_net_list[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			   snmp_fold_field((void __percpu **)net->mib.net_statistics, +			   snmp_fold_field(net->mib.net_statistics,  					   snmp4_net_list[i].entry));  	seq_puts(seq, "\nIpExt:"); @@ -461,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nIpExt:");  	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)  		seq_printf(seq, " %llu", -			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics, +			   snmp_fold_field64(net->mib.ip_statistics,  					     snmp4_ipextstats_list[i].entry,  					     offsetof(struct ipstats_mib, syncp))); diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index ce848461acb..46d6a1c923a 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -31,10 +31,6 @@  const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;  const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; -/* - *	Add a protocol handler to the hash tables - */ -  int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)  {  	if (!prot->netns_ok) { @@ -55,10 +51,6 @@ int inet_add_offload(const struct net_offload *prot, unsigned char protocol)  }  EXPORT_SYMBOL(inet_add_offload); -/* - *	Remove a protocol from the hash tables. - */ -  int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)  {  	int ret; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 193db03540a..2c65160565e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -299,7 +299,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)  {  	/* Charge it to the socket. 
*/ -	ipv4_pktinfo_prepare(skb); +	ipv4_pktinfo_prepare(sk, skb);  	if (sock_queue_rcv_skb(sk, skb) < 0) {  		kfree_skb(skb);  		return NET_RX_DROP; @@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,  		iph->check   = 0;  		iph->tot_len = htons(length);  		if (!iph->id) -			ip_select_ident(skb, &rt->dst, NULL); +			ip_select_ident(skb, NULL);  		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);  	} @@ -493,7 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	 */  	if (msg->msg_namelen) { -		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; +		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);  		err = -EINVAL;  		if (msg->msg_namelen < sizeof(*usin))  			goto out; @@ -519,10 +519,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	ipc.addr = inet->inet_saddr;  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1;  	ipc.oif = sk->sk_bound_dev_if;  	if (msg->msg_controllen) { -		err = ip_cmsg_send(sock_net(sk), msg, &ipc); +		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);  		if (err)  			goto out;  		if (ipc.opt) @@ -558,7 +560,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  			daddr = ipc.opt->opt.faddr;  		}  	} -	tos = RT_CONN_FLAGS(sk); +	tos = get_rtconn_flags(&ipc, sk);  	if (msg->msg_flags & MSG_DONTROUTE)  		tos |= RTO_ONLINK; @@ -573,7 +575,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,  			   RT_SCOPE_UNIVERSE,  			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, -			   inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | +			   inet_sk_flowi_flags(sk) |  			    (inet->hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0),  			   daddr, saddr, 0, 0); @@ -688,17 +690,14 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	struct inet_sock *inet = inet_sk(sk);  	size_t copied = 0;  	int err = -EOPNOTSUPP; -	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; +	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);  	struct sk_buff *skb;  	if (flags & MSG_OOB)  		goto out; -	if (addr_len) -		*addr_len = sizeof(*sin); -  	if (flags & MSG_ERRQUEUE) { -		err = ip_recv_error(sk, msg, len); +		err = ip_recv_error(sk, msg, len, addr_len);  		goto out;  	} @@ -724,6 +723,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;  		sin->sin_port = 0;  		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); +		*addr_len = sizeof(*sin);  	}  	if (inet->cmsg_flags)  		ip_cmsg_recv(msg, skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6011615e810..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -89,6 +89,7 @@  #include <linux/rcupdate.h>  #include <linux/times.h>  #include <linux/slab.h> +#include <linux/jhash.h>  #include <net/dst.h>  #include <net/net_namespace.h>  #include <net/protocol.h> @@ -112,9 +113,6 @@  #define RT_FL_TOS(oldflp4) \  	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) -/* IPv4 datagram length is stored into 16bit field (tot_len) */ -#define IP_MAX_MTU	0xFFFF -  #define RT_GC_TIMEOUT (300*HZ)  static int ip_rt_max_size; @@ -142,11 +140,6 @@ static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,  					struct sk_buff *skb);  static void		ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, -			    int how) -{ -} -  static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)  {  	WARN_ON(1); @@ -165,7 +158,6 @@ static struct dst_ops ipv4_dst_ops = {  	.mtu =			ipv4_mtu,  	.cow_metrics =		ipv4_cow_metrics,  	.destroy =		ipv4_dst_destroy, -	.ifdown =		ipv4_dst_ifdown,  	.negative_advice =	ipv4_negative_advice,  	.link_failure =		ipv4_link_failure,  	.update_pmtu =		ip_rt_update_pmtu, @@ -197,7 +189,7 @@ const __u8 ip_tos2prio[16] = {  EXPORT_SYMBOL(ip_tos2prio);  static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)  #ifdef CONFIG_PROC_FS  static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -295,7 +287,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)  	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "  		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",  		   dst_entries_get_slow(&ipv4_dst_ops), -		   st->in_hit, +		   0, /* st->in_hit */  		   st->in_slow_tot,  		   st->in_slow_mc,  		   st->in_no_route, @@ -303,16 +295,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)  		   st->in_martian_dst,  		   st->in_martian_src, -		   st->out_hit, +		   0, /* st->out_hit */  		   st->out_slow_tot,  		   st->out_slow_mc, -		   st->gc_total, -		   st->gc_ignored, -		   st->gc_goal_miss, -		   st->gc_dst_overflow, -		   st->in_hlist_search, -		   st->out_hlist_search +		   0, /* st->gc_total */ +		   0, /* st->gc_ignored */ +		   0, /* st->gc_goal_miss */ +		   0, /* st->gc_dst_overflow */ +		   0, /* st->in_hlist_search */ +		   0  /* st->out_hlist_search */  		);  	return 0;  } @@ -465,39 +457,45 @@ static struct neighbour *ipv4_neigh_lookup(const 
struct dst_entry *dst,  	return neigh_create(&arp_tbl, pkey, dev);  } -/* - * Peer allocation may fail only in serious out-of-memory conditions.  However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { +	atomic_t	id; +	u32		stamp32; +}; + +static struct ip_ident_bucket *ip_idents __read_mostly; + +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time.   */ -static void ip_select_fb_ident(struct iphdr *iph) +u32 ip_idents_reserve(u32 hash, int segs)  { -	static DEFINE_SPINLOCK(ip_fb_id_lock); -	static u32 ip_fallback_id; -	u32 salt; +	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; +	u32 old = ACCESS_ONCE(bucket->stamp32); +	u32 now = (u32)jiffies; +	u32 delta = 0; -	spin_lock_bh(&ip_fb_id_lock); -	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); -	iph->id = htons(salt & 0xFFFF); -	ip_fallback_id = salt; -	spin_unlock_bh(&ip_fb_id_lock); +	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) +		delta = prandom_u32_max(now - old); + +	return atomic_add_return(segs + delta, &bucket->id) - segs;  } +EXPORT_SYMBOL(ip_idents_reserve); -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +void __ip_select_ident(struct iphdr *iph, int segs)  { -	struct net *net = dev_net(dst->dev); -	struct inet_peer *peer; +	static u32 ip_idents_hashrnd __read_mostly; +	u32 hash, id; -	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); -	if (peer) { -		iph->id = htons(inet_getid(peer, more)); -		inet_putpeer(peer); -		return; -	} +	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); -	ip_select_fb_ident(iph); +	hash = jhash_3words((__force u32)iph->daddr, +			    (__force u32)iph->saddr, +			    iph->protocol, +			    ip_idents_hashrnd); +	id = ip_idents_reserve(hash, segs); +	iph->id = htons(id);  }  EXPORT_SYMBOL(__ip_select_ident); @@ -700,7 +698,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,  out_unlock:  	spin_unlock_bh(&fnhe_lock); -	return;  }  static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, @@ -1003,6 +1000,9 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,  	struct flowi4 fl4;  	struct rtable *rt; +	if (!mark) +		mark = IP4_REPLY_MARK(net, skb->mark); +  	__build_flow_key(&fl4, NULL, iph, oif,  			 RT_TOS(iph->tos), protocol, mark, flow_flags);  	rt = __ip_route_output_key(net, &fl4); @@ -1020,6 +1020,10 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)  	struct rtable *rt;  	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + +	if (!fl4.flowi4_mark) +		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); +  	rt = __ip_route_output_key(sock_net(sk), &fl4);  	if (!IS_ERR(rt)) {  		__ip_rt_update_pmtu(rt, &fl4, mtu); @@ -1032,20 +1036,25 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)  	const struct iphdr *iph = (const struct iphdr *) skb->data;  	struct flowi4 fl4;  	struct rtable *rt; -	struct dst_entry *dst; +	struct dst_entry *odst = NULL;  	bool new = false;  	bh_lock_sock(sk); -	rt = (struct rtable *) __sk_dst_get(sk); -	if (sock_owned_by_user(sk) || !rt) { +	
if (!ip_sk_accept_pmtu(sk)) +		goto out; + +	odst = sk_dst_get(sk); + +	if (sock_owned_by_user(sk) || !odst) {  		__ipv4_sk_update_pmtu(skb, sk, mtu);  		goto out;  	}  	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); -	if (!__sk_dst_check(sk, 0)) { +	rt = (struct rtable *)odst; +	if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {  		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);  		if (IS_ERR(rt))  			goto out; @@ -1055,8 +1064,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)  	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); -	dst = dst_check(&rt->dst, 0); -	if (!dst) { +	if (!dst_check(&rt->dst, 0)) {  		if (new)  			dst_release(&rt->dst); @@ -1068,10 +1076,11 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)  	}  	if (new) -		__sk_dst_set(sk, &rt->dst); +		sk_dst_set(sk, &rt->dst);  out:  	bh_unlock_sock(sk); +	dst_release(odst);  }  EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); @@ -1135,7 +1144,7 @@ static void ipv4_link_failure(struct sk_buff *skb)  		dst_set_expires(&rt->dst, 0);  } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)  {  	pr_debug("%s: %pI4 -> %pI4, %s\n",  		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1525,7 +1534,7 @@ static int __mkroute_input(struct sk_buff *skb,  	struct in_device *out_dev;  	unsigned int flags = 0;  	bool do_cache; -	u32 itag; +	u32 itag = 0;  	/* get a working reference to the output device */  	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); @@ -1596,6 +1605,7 @@ static int __mkroute_input(struct sk_buff *skb,  	rth->rt_gateway	= 0;  	rth->rt_uses_gateway = 0;  	INIT_LIST_HEAD(&rth->rt_uncached); +	RT_CACHE_STAT_INC(in_slow_tot);  	rth->dst.input = ip_forward;  	rth->dst.output = ip_output; @@ -1694,25 +1704,27 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	fl4.daddr = daddr;  	fl4.saddr = saddr;  	err = fib_lookup(net, &fl4, &res); -	if (err != 0) +	if (err != 0) { +		if (!IN_DEV_FORWARD(in_dev)) +			err = -EHOSTUNREACH;  		goto no_route; - -	RT_CACHE_STAT_INC(in_slow_tot); +	}  	if (res.type == RTN_BROADCAST)  		goto brd_input;  	if (res.type == RTN_LOCAL) {  		err = fib_validate_source(skb, saddr, daddr, tos, -					  LOOPBACK_IFINDEX, -					  dev, in_dev, &itag); +					  0, dev, in_dev, &itag);  		if (err < 0)  			goto martian_source_keep_err;  		goto local_input;  	} -	if (!IN_DEV_FORWARD(in_dev)) +	if (!IN_DEV_FORWARD(in_dev)) { +		err = -EHOSTUNREACH;  		goto no_route; +	}  	if (res.type != RTN_UNICAST)  		goto martian_destination; @@ -1767,13 +1779,18 @@ local_input:  	rth->rt_gateway	= 0;  	rth->rt_uses_gateway = 0;  	INIT_LIST_HEAD(&rth->rt_uncached); +	RT_CACHE_STAT_INC(in_slow_tot);  	if (res.type == RTN_UNREACHABLE) {  		rth->dst.input= ip_error;  		rth->dst.error= -err;  		rth->rt_flags 	&= ~RTCF_LOCAL;  	} -	if (do_cache) -		rt_cache_route(&FIB_RES_NH(res), rth); +	if (do_cache) { +		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { +			rth->dst.flags |= DST_NOCACHE; +			rt_add_uncached_list(rth); +		} +	}  	skb_dst_set(skb, &rth->dst);  	err = 0;  	goto out; @@ -2215,7 +2232,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or  		new->__use = 1;  		new->input = dst_discard; -		new->output = dst_discard; +		new->output = dst_discard_sk;  		new->dev = ort->dst.dev;  		if (new->dev) @@ -2354,7 +2371,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,  			}  		} else  #endif -			if (nla_put_u32(skb, 
RTA_IIF, rt->rt_iif)) +			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))  				goto nla_put_failure;  	} @@ -2465,11 +2482,6 @@ errout_free:  	goto errout;  } -int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb) -{ -	return skb->len; -} -  void ip_rt_multicast_event(struct in_device *in_dev)  {  	rt_cache_flush(dev_net(in_dev->dev)); @@ -2707,6 +2719,12 @@ int __init ip_rt_init(void)  {  	int rc = 0; +	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); +	if (!ip_idents) +		panic("IP: failed to allocate ip_idents\n"); + +	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); +  #ifdef CONFIG_IP_ROUTE_CLASSID  	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));  	if (!ip_rt_acct) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 14a15c49129..c86624b36a6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -25,15 +25,7 @@  extern int sysctl_tcp_syncookies; -__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; -EXPORT_SYMBOL(syncookie_secret); - -static __init int init_syncookies(void) -{ -	get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); -	return 0; -} -__initcall(init_syncookies); +static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];  #define COOKIEBITS 24	/* Upper bits store count */  #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -44,8 +36,11 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],  static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,  		       u32 count, int c)  { -	__u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); +	__u32 *tmp; + +	net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); +	tmp  = __get_cpu_var(ipv4_cookie_scratch);  	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));  	tmp[0] = (__force u32)saddr;  	tmp[1] = (__force u32)daddr; @@ -89,8 +84,7 @@ __u32 cookie_init_timestamp(struct request_sock *req)  static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, -				   __be16 dport, __u32 sseq, __u32 count, -				   __u32 data) +				   __be16 dport, __u32 sseq, __u32 data)  {  	/*  	 * Compute the secure sequence number. @@ -102,7 +96,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,  	 * As an extra hack, we add a small "data" value that encodes the  	 * MSS into the second hash value.  	 */ - +	u32 count = tcp_cookie_time();  	return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +  		sseq + (count << COOKIEBITS) +  		((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) @@ -114,22 +108,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,   * If the syncookie is bad, the data returned will be out of   * range.  This must be checked by the caller.   * - * The count value used to generate the cookie must be within - * "maxdiff" if the current (passed-in) "count".  The return value - * is (__u32)-1 if this test fails. + * The count value used to generate the cookie must be less than + * MAX_SYNCOOKIE_AGE minutes in the past. + * The return value (__u32)-1 if this test fails.   
*/  static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, -				  __be16 sport, __be16 dport, __u32 sseq, -				  __u32 count, __u32 maxdiff) +				  __be16 sport, __be16 dport, __u32 sseq)  { -	__u32 diff; +	u32 diff, count = tcp_cookie_time();  	/* Strip away the layers from the cookie */  	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;  	/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ -	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); -	if (diff >= maxdiff) +	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); +	if (diff >= MAX_SYNCOOKIE_AGE)  		return (__u32)-1;  	return (cookie - @@ -138,22 +131,22 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,  }  /* - * MSS Values are taken from the 2009 paper - * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: - *  - values 1440 to 1460 accounted for 80% of observed mss values - *  - values outside the 536-1460 range are rare (<0.2%). + * MSS Values are chosen based on the 2011 paper + * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. + * Values .. + *  .. lower than 536 are rare (< 0.2%) + *  .. between 537 and 1299 account for less than < 1.5% of observed values + *  .. in the 1300-1349 range account for about 15 to 20% of observed mss values + *  .. exceeding 1460 are very rare (< 0.04%)   * - * Table must be sorted. + *  1460 is the single most frequently announced mss value (30 to 46% depending + *  on monitor location).  Table must be sorted.   */  static __u16 const msstab[] = { -	64, -	512,  	536, -	1024, -	1440, +	1300, +	1440,	/* 1440, 1452: PPPoE */  	1460, -	4312, -	8960,  };  /* @@ -173,7 +166,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,  	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,  				     th->source, th->dest, ntohl(th->seq), -				     jiffies / (HZ * 60), mssind); +				     mssind);  }  EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); @@ -189,13 +182,6 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)  }  /* - * This (misnamed) value is the age of syncookie which is permitted. - * Its ideal value should be dependent on TCP_TIMEOUT_INIT and - * sysctl_tcp_retries1. It's a rather complicated formula (exponential - * backoff) to compute at runtime so it's currently hardcoded here. - */ -#define COUNTER_TRIES 4 -/*   * Check if a ack sequence number is a valid syncookie.   * Return the decoded mss if it is, or 0 if not.   */ @@ -204,9 +190,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,  {  	__u32 seq = ntohl(th->seq) - 1;  	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, -					    th->source, th->dest, seq, -					    jiffies / (HZ * 60), -					    COUNTER_TRIES); +					    th->source, th->dest, seq);  	return mssind < ARRAY_SIZE(msstab) ? 
msstab[mssind] : 0;  } @@ -315,10 +299,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	treq->rcv_isn		= ntohl(th->seq) - 1;  	treq->snt_isn		= cookie;  	req->mss		= mss; -	ireq->loc_port		= th->dest; -	ireq->rmt_port		= th->source; -	ireq->loc_addr		= ip_hdr(skb)->daddr; -	ireq->rmt_addr		= ip_hdr(skb)->saddr; +	ireq->ir_num		= ntohs(th->dest); +	ireq->ir_rmt_port	= th->source; +	ireq->ir_loc_addr	= ip_hdr(skb)->daddr; +	ireq->ir_rmt_addr	= ip_hdr(skb)->saddr; +	ireq->ir_mark		= inet_request_mark(sk, skb);  	ireq->ecn_ok		= ecn_ok;  	ireq->snd_wscale	= tcp_opt.snd_wscale;  	ireq->sack_ok		= tcp_opt.sack_ok; @@ -355,11 +340,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	 * hasn't changed since we received the original syn, but I see  	 * no easy way to do this.  	 */ -	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, +	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,  			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,  			   inet_sk_flowi_flags(sk), -			   (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, -			   ireq->loc_addr, th->source, th->dest); +			   (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, +			   ireq->ir_loc_addr, th->source, th->dest);  	security_req_classify_flow(req, flowi4_to_flowi(&fl4));  	rt = ip_route_output_key(sock_net(sk), &fl4);  	if (IS_ERR(rt)) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 540279f4c53..79a007c5255 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -43,12 +43,12 @@ static int ip_ping_group_range_min[] = { 0, 0 };  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };  /* Update system visible IP port range */ -static void set_local_port_range(int range[2]) +static void set_local_port_range(struct net *net, int range[2])  { -	write_seqlock(&sysctl_local_ports.lock); -	sysctl_local_ports.range[0] = range[0]; -	sysctl_local_ports.range[1] = range[1]; -	write_sequnlock(&sysctl_local_ports.lock); +	write_seqlock(&net->ipv4.ip_local_ports.lock); +	net->ipv4.ip_local_ports.range[0] = range[0]; +	net->ipv4.ip_local_ports.range[1] = range[1]; +	write_sequnlock(&net->ipv4.ip_local_ports.lock);  }  /* Validate changes from /proc interface. 
*/ @@ -56,6 +56,8 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,  				 void __user *buffer,  				 size_t *lenp, loff_t *ppos)  { +	struct net *net = +		container_of(table->data, struct net, ipv4.ip_local_ports.range);  	int ret;  	int range[2];  	struct ctl_table tmp = { @@ -66,14 +68,15 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,  		.extra2 = &ip_local_port_range_max,  	}; -	inet_get_local_port_range(range, range + 1); +	inet_get_local_port_range(net, &range[0], &range[1]); +  	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);  	if (write && ret == 0) {  		if (range[1] < range[0])  			ret = -EINVAL;  		else -			set_local_port_range(range); +			set_local_port_range(net, range);  	}  	return ret; @@ -83,23 +86,27 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,  static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)  {  	kgid_t *data = table->data; +	struct net *net = +		container_of(table->data, struct net, ipv4.ping_group_range.range);  	unsigned int seq;  	do { -		seq = read_seqbegin(&sysctl_local_ports.lock); +		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);  		*low = data[0];  		*high = data[1]; -	} while (read_seqretry(&sysctl_local_ports.lock, seq)); +	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));  }  /* Update system visible IP port range */  static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)  {  	kgid_t *data = table->data; -	write_seqlock(&sysctl_local_ports.lock); +	struct net *net = +		container_of(table->data, struct net, ipv4.ping_group_range.range); +	write_seqlock(&net->ipv4.ip_local_ports.lock);  	data[0] = low;  	data[1] = high; -	write_sequnlock(&sysctl_local_ports.lock); +	write_sequnlock(&net->ipv4.ip_local_ports.lock);  }  /* Validate changes from /proc interface. */ @@ -193,49 +200,6 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,  	return ret;  } -static int ipv4_tcp_mem(struct ctl_table *ctl, int write, -			   void __user *buffer, size_t *lenp, -			   loff_t *ppos) -{ -	int ret; -	unsigned long vec[3]; -	struct net *net = current->nsproxy->net_ns; -#ifdef CONFIG_MEMCG_KMEM -	struct mem_cgroup *memcg; -#endif - -	struct ctl_table tmp = { -		.data = &vec, -		.maxlen = sizeof(vec), -		.mode = ctl->mode, -	}; - -	if (!write) { -		ctl->data = &net->ipv4.sysctl_tcp_mem; -		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos); -	} - -	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); -	if (ret) -		return ret; - -#ifdef CONFIG_MEMCG_KMEM -	rcu_read_lock(); -	memcg = mem_cgroup_from_task(current); - -	tcp_prot_mem(memcg, vec[0], 0); -	tcp_prot_mem(memcg, vec[1], 1); -	tcp_prot_mem(memcg, vec[2], 2); -	rcu_read_unlock(); -#endif - -	net->ipv4.sysctl_tcp_mem[0] = vec[0]; -	net->ipv4.sysctl_tcp_mem[1] = vec[1]; -	net->ipv4.sysctl_tcp_mem[2] = vec[2]; - -	return 0; -} -  static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,  				 void __user *buffer, size_t *lenp,  				 loff_t *ppos) @@ -267,6 +231,11 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,  			ret = -EINVAL;  			goto bad_key;  		} +		/* Generate a dummy secret but don't publish it. 
This +		 * is needed so we don't regenerate a new key on the +		 * first invocation of tcp_fastopen_cookie_gen +		 */ +		tcp_fastopen_init_key_once(false);  		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);  	} @@ -317,13 +286,6 @@ static struct ctl_table ipv4_table[] = {  		.extra2		= &ip_ttl_max,  	},  	{ -		.procname	= "ip_no_pmtu_disc", -		.data		= &ipv4_config.no_pmtu_disc, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{  		.procname	= "ip_nonlocal_bind",  		.data		= &sysctl_ip_nonlocal_bind,  		.maxlen		= sizeof(int), @@ -475,20 +437,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "ip_local_port_range", -		.data		= &sysctl_local_ports.range, -		.maxlen		= sizeof(sysctl_local_ports.range), -		.mode		= 0644, -		.proc_handler	= ipv4_local_port_range, -	}, -	{ -		.procname	= "ip_local_reserved_ports", -		.data		= NULL, /* initialized in sysctl_ipv4_init */ -		.maxlen		= 65536, -		.mode		= 0644, -		.proc_handler	= proc_do_large_bitmap, -	}, -	{  		.procname	= "igmp_max_memberships",  		.data		= &sysctl_igmp_max_memberships,  		.maxlen		= sizeof(int), @@ -552,6 +500,13 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ +		.procname	= "tcp_mem", +		.maxlen		= sizeof(sysctl_tcp_mem), +		.data		= &sysctl_tcp_mem, +		.mode		= 0644, +		.proc_handler	= proc_doulongvec_minmax, +	}, +	{  		.procname	= "tcp_wmem",  		.data		= &sysctl_tcp_wmem,  		.maxlen		= sizeof(sysctl_tcp_wmem), @@ -732,20 +687,13 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler   = proc_allowed_congestion_control,  	},  	{ -		.procname	= "tcp_max_ssthresh", -		.data		= &sysctl_tcp_max_ssthresh, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{  		.procname       = "tcp_thin_linear_timeouts",  		.data           = &sysctl_tcp_thin_linear_timeouts,  		.maxlen         = sizeof(int),  		.mode           = 0644,  		.proc_handler   = proc_dointvec  	}, -        { +	{  		.procname       = "tcp_thin_dupack",  		.data           = &sysctl_tcp_thin_dupack,  		.maxlen         = sizeof(int), @@ -771,6 +719,15 @@ static struct ctl_table ipv4_table[] = {  		.extra2		= &gso_max_segs,  	},  	{ +		.procname	= "tcp_autocorking", +		.data		= &sysctl_tcp_autocorking, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +	{  		.procname	= "udp_mem",  		.data		= &sysctl_udp_mem,  		.maxlen		= sizeof(sysctl_udp_mem), @@ -841,7 +798,7 @@ static struct ctl_table ipv4_net_table[] = {  	},  	{  		.procname	= "ping_group_range", -		.data		= &init_net.ipv4.sysctl_ping_group_range, +		.data		= &init_net.ipv4.ping_group_range.range,  		.maxlen		= sizeof(gid_t)*2,  		.mode		= 0644,  		.proc_handler	= ipv4_ping_group_range, @@ -854,10 +811,46 @@ static struct ctl_table ipv4_net_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "tcp_mem", -		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem), +		.procname	= "ip_local_port_range", +		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range), +		.data		= &init_net.ipv4.ip_local_ports.range, +		.mode		= 0644, +		.proc_handler	= ipv4_local_port_range, +	}, +	{ +		.procname	= "ip_local_reserved_ports", +		.data		= &init_net.ipv4.sysctl_local_reserved_ports, +		.maxlen		= 65536, +		.mode		= 0644, +		.proc_handler	= proc_do_large_bitmap, +	}, +	{ +		.procname	= "ip_no_pmtu_disc", +		.data		= &init_net.ipv4.sysctl_ip_no_pmtu_disc, +		.maxlen		
= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "ip_forward_use_pmtu", +		.data		= &init_net.ipv4.sysctl_ip_fwd_use_pmtu, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "fwmark_reflect", +		.data		= &init_net.ipv4.sysctl_fwmark_reflect, +		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= ipv4_tcp_mem, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "tcp_fwmark_accept", +		.data		= &init_net.ipv4.sysctl_tcp_fwmark_accept, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec,  	},  	{ }  }; @@ -868,47 +861,29 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)  	table = ipv4_net_table;  	if (!net_eq(net, &init_net)) { +		int i; +  		table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);  		if (table == NULL)  			goto err_alloc; -		table[0].data = -			&net->ipv4.sysctl_icmp_echo_ignore_all; -		table[1].data = -			&net->ipv4.sysctl_icmp_echo_ignore_broadcasts; -		table[2].data = -			&net->ipv4.sysctl_icmp_ignore_bogus_error_responses; -		table[3].data = -			&net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr; -		table[4].data = -			&net->ipv4.sysctl_icmp_ratelimit; -		table[5].data = -			&net->ipv4.sysctl_icmp_ratemask; -		table[6].data = -			&net->ipv4.sysctl_ping_group_range; -		table[7].data = -			&net->ipv4.sysctl_tcp_ecn; - -		/* Don't export sysctls to unprivileged users */ -		if (net->user_ns != &init_user_ns) -			table[0].procname = NULL; +		/* Update the variables to point into the current struct net */ +		for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) +			table[i].data += (void *)net - (void *)&init_net;  	} -	/* -	 * Sane defaults - nobody may create ping sockets. -	 * Boot scripts should set this to distro-specific group. 
-	 */ -	net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); -	net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); - -	tcp_init_mem(net); -  	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);  	if (net->ipv4.ipv4_hdr == NULL)  		goto err_reg; +	net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); +	if (!net->ipv4.sysctl_local_reserved_ports) +		goto err_ports; +  	return 0; +err_ports: +	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);  err_reg:  	if (!net_eq(net, &init_net))  		kfree(table); @@ -920,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)  {  	struct ctl_table *table; +	kfree(net->ipv4.sysctl_local_reserved_ports);  	table = net->ipv4.ipv4_hdr->ctl_table_arg;  	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);  	kfree(table); @@ -933,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {  static __init int sysctl_ipv4_init(void)  {  	struct ctl_table_header *hdr; -	struct ctl_table *i; - -	for (i = ipv4_table; i->procname; i++) { -		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { -			i->data = sysctl_local_reserved_ports; -			break; -		} -	} -	if (!i->procname) -		return -EINVAL;  	hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);  	if (hdr == NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6e5617b9f9d..9d2118e5fbc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,12 +285,16 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;  int sysctl_tcp_min_tso_segs __read_mostly = 2; +int sysctl_tcp_autocorking __read_mostly = 1; +  struct percpu_counter tcp_orphan_count;  EXPORT_SYMBOL_GPL(tcp_orphan_count); +long sysctl_tcp_mem[3] __read_mostly;  int sysctl_tcp_wmem[3] __read_mostly;  int sysctl_tcp_rmem[3] __read_mostly; +EXPORT_SYMBOL(sysctl_tcp_mem);  EXPORT_SYMBOL(sysctl_tcp_rmem);  EXPORT_SYMBOL(sysctl_tcp_wmem); @@ -377,13 +381,13 @@ void tcp_init_sock(struct sock *sk)  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); -	skb_queue_head_init(&tp->out_of_order_queue); +	__skb_queue_head_init(&tp->out_of_order_queue);  	tcp_init_xmit_timers(sk);  	tcp_prequeue_init(tp);  	INIT_LIST_HEAD(&tp->tsq_node);  	icsk->icsk_rto = TCP_TIMEOUT_INIT; -	tp->mdev = TCP_TIMEOUT_INIT; +	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);  	/* So many TCP implementations out there (incorrectly) count the  	 * initial SYN frame in their delayed-ACK and congestion control @@ -617,19 +621,58 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)  		tp->snd_up = tp->write_seq;  } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, -			    int nonagle) +/* If a not yet filled skb is pushed, do not send it if + * we have data packets in Qdisc or NIC queues : + * Because TX completion will happen shortly, it gives a chance + * to coalesce future sendmsg() payload into this skb, without + * need for a timer, and with no latency trade off. + * As packets containing data payload have a bigger truesize + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. 
+ */ +static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, +				int size_goal)  { -	if (tcp_send_head(sk)) { -		struct tcp_sock *tp = tcp_sk(sk); +	return skb->len < size_goal && +	       sysctl_tcp_autocorking && +	       skb != tcp_write_queue_head(sk) && +	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize; +} + +static void tcp_push(struct sock *sk, int flags, int mss_now, +		     int nonagle, int size_goal) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *skb; + +	if (!tcp_send_head(sk)) +		return; + +	skb = tcp_write_queue_tail(sk); +	if (!(flags & MSG_MORE) || forced_push(tp)) +		tcp_mark_push(tp, skb); -		if (!(flags & MSG_MORE) || forced_push(tp)) -			tcp_mark_push(tp, tcp_write_queue_tail(sk)); +	tcp_mark_urg(tp, flags); -		tcp_mark_urg(tp, flags); -		__tcp_push_pending_frames(sk, mss_now, -					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); +	if (tcp_should_autocork(sk, skb, size_goal)) { + +		/* avoid atomic op if TSQ_THROTTLED bit is already set */ +		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { +			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); +			set_bit(TSQ_THROTTLED, &tp->tsq_flags); +		} +		/* It is possible TX completion already happened +		 * before we set TSQ_THROTTLED. +		 */ +		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) +			return;  	} + +	if (flags & MSG_MORE) +		nonagle = TCP_NAGLE_CORK; + +	__tcp_push_pending_frames(sk, mss_now, nonagle);  }  static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, @@ -806,12 +849,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,  		xmit_size_goal = min_t(u32, gso_size,  				       sk->sk_gso_max_size - 1 - hlen); -		/* TSQ : try to have at least two segments in flight -		 * (one in NIC TX ring, another in Qdisc) -		 */ -		xmit_size_goal = min_t(u32, xmit_size_goal, -				       sysctl_tcp_limit_output_bytes >> 1); -  		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);  		/* We try hard to avoid divides here */ @@ -938,7 +975,8 @@ new_segment:  wait_for_sndbuf:  		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  wait_for_memory: -		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); +		tcp_push(sk, flags & ~MSG_MORE, mss_now, +			 TCP_NAGLE_PUSH, size_goal);  		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)  			goto do_error; @@ -948,7 +986,7 @@ wait_for_memory:  out:  	if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) -		tcp_push(sk, flags, mss_now, tp->nonagle); +		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);  	return copied;  do_error: @@ -1006,7 +1044,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp)  	}  } -static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size) +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, +				int *copied, size_t size)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int err, flags; @@ -1021,11 +1060,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)  	if (unlikely(tp->fastopen_req == NULL))  		return -ENOBUFS;  	tp->fastopen_req->data = msg; +	tp->fastopen_req->size = size;  	flags = (msg->msg_flags & MSG_DONTWAIT) ? 
O_NONBLOCK : 0;  	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,  				    msg->msg_namelen, flags); -	*size = tp->fastopen_req->copied; +	*copied = tp->fastopen_req->copied;  	tcp_free_fastopen_req(tp);  	return err;  } @@ -1045,7 +1085,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	flags = msg->msg_flags;  	if (flags & MSG_FASTOPEN) { -		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn); +		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);  		if (err == -EINPROGRESS && copied_syn > 0)  			goto out;  		else if (err) @@ -1068,7 +1108,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	if (unlikely(tp->repair)) {  		if (tp->repair_queue == TCP_RECV_QUEUE) {  			copied = tcp_send_rcvq(sk, msg, size); -			goto out; +			goto out_nopush;  		}  		err = -EINVAL; @@ -1229,7 +1269,8 @@ wait_for_sndbuf:  			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  wait_for_memory:  			if (copied) -				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); +				tcp_push(sk, flags & ~MSG_MORE, mss_now, +					 TCP_NAGLE_PUSH, size_goal);  			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)  				goto do_error; @@ -1240,7 +1281,8 @@ wait_for_memory:  out:  	if (copied) -		tcp_push(sk, flags, mss_now, tp->nonagle); +		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush:  	release_sock(sk);  	return copied + copied_syn; @@ -1429,7 +1471,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  	do {  		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,  					      last_issued, &done, -					      &used) == DMA_SUCCESS) { +					      &used) == DMA_COMPLETE) {  			/* Safe to free early-copied skbs now */  			__skb_queue_purge(&sk->sk_async_wait_queue);  			break; @@ -1437,7 +1479,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  			struct sk_buff *skb;  			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&  			       (dma_async_is_complete(skb->dma_cookie, done, -						      used) == DMA_SUCCESS)) { +						      used) == DMA_COMPLETE)) {  				__skb_dequeue(&sk->sk_async_wait_queue);  				kfree_skb(skb);  			} @@ -1627,11 +1669,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&  		    !sysctl_tcp_low_latency &&  		    net_dma_find_channel()) { -			preempt_enable_no_resched(); +			preempt_enable();  			tp->ucopy.pinned_list =  					dma_pin_iovec_pages(msg->msg_iov, len);  		} else { -			preempt_enable_no_resched(); +			preempt_enable();  		}  	}  #endif @@ -2190,7 +2232,7 @@ adjudge_to_death:  	/*	This is a (useful) BSD violating of the RFC. There is a  	 *	problem with TCP as specified in that the other end could  	 *	keep a socket open forever with no application left this end. -	 *	We use a 3 minute timeout (about the same as BSD) then kill +	 *	We use a 1 minute timeout (about the same as BSD) then kill  	 *	our end. If they send after that then tough - BUT: long enough  	 *	that we won't make the old 4*rto = almost no time - whoops  	 *	reset mistake. 
@@ -2300,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags)  	sk->sk_shutdown = 0;  	sock_reset_flag(sk, SOCK_DONE); -	tp->srtt = 0; +	tp->srtt_us = 0;  	if ((tp->write_seq += tp->max_window + 2) == 0)  		tp->write_seq = 1;  	icsk->icsk_backoff = 0; @@ -2744,8 +2786,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)  	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;  	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; -	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; -	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; +	info->tcpi_rtt = tp->srtt_us >> 3; +	info->tcpi_rttvar = tp->mdev_us >> 2;  	info->tcpi_snd_ssthresh = tp->snd_ssthresh;  	info->tcpi_snd_cwnd = tp->snd_cwnd;  	info->tcpi_advmss = tp->advmss; @@ -2755,6 +2797,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)  	info->tcpi_rcv_space = tp->rcvq_space.space;  	info->tcpi_total_retrans = tp->total_retrans; + +	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? +					sk->sk_pacing_rate : ~0ULL; +	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? +					sk->sk_max_pacing_rate : ~0ULL;  }  EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2870,6 +2917,14 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  	case TCP_USER_TIMEOUT:  		val = jiffies_to_msecs(icsk->icsk_user_timeout);  		break; + +	case TCP_FASTOPEN: +		if (icsk->icsk_accept_queue.fastopenq != NULL) +			val = icsk->icsk_accept_queue.fastopenq->max_qlen; +		else +			val = 0; +		break; +  	case TCP_TIMESTAMP:  		val = tcp_time_stamp + tp->tsoffset;  		break; @@ -3097,13 +3152,13 @@ static int __init set_thash_entries(char *str)  }  __setup("thash_entries=", set_thash_entries); -void tcp_init_mem(struct net *net) +static void tcp_init_mem(void)  {  	unsigned long limit = nr_free_buffer_pages() / 8;  	limit = max(limit, 128UL); -	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; -	net->ipv4.sysctl_tcp_mem[1] = limit; -	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; +	sysctl_tcp_mem[0] = limit / 4 * 3; +	sysctl_tcp_mem[1] = limit; +	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;  }  void __init tcp_init(void) @@ -3137,10 +3192,9 @@ void __init tcp_init(void)  					&tcp_hashinfo.ehash_mask,  					0,  					thash_entries ? 
0 : 512 * 1024); -	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { +	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)  		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); -		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); -	} +  	if (inet_ehash_locks_alloc(&tcp_hashinfo))  		panic("TCP: failed to alloc ehash_locks");  	tcp_hashinfo.bhash = @@ -3166,7 +3220,7 @@ void __init tcp_init(void)  	sysctl_tcp_max_orphans = cnt / 2;  	sysctl_max_syn_backlog = max(128, cnt / 256); -	tcp_init_mem(&init_net); +	tcp_init_mem();  	/* Set per-socket limits to no more than 1/128 the pressure threshold */  	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);  	max_wshare = min(4UL*1024*1024, limit); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index f45e1c24244..d5de69bc04f 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -140,16 +140,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  		ca->cnt = 1;  } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct bictcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		bictcp_update(ca, tp->snd_cwnd);  		tcp_cong_avoid_ai(tp, ca->cnt); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 019c2389a34..7b09d8b49fa 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -15,8 +15,6 @@  #include <linux/gfp.h>  #include <net/tcp.h> -int sysctl_tcp_max_ssthresh = 0; -  static DEFINE_SPINLOCK(tcp_cong_list_lock);  static LIST_HEAD(tcp_cong_list); @@ -278,56 +276,24 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)  	return err;  } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer - */ -bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) -{ -	const struct tcp_sock *tp = tcp_sk(sk); -	u32 left; - -	if (in_flight >= tp->snd_cwnd) -		return true; - -	left = tp->snd_cwnd - in_flight; -	if (sk_can_gso(sk) && -	    left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && -	    left * tp->mss_cache < sk->sk_gso_max_size && -	    left < sk->sk_gso_max_segs) -		return true; -	return left <= tcp_max_tso_deferred_mss(tp); -} -EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); - -/* - * Slow start is used when congestion window is less than slow start - * threshold. This version implements the basic RFC2581 version - * and optionally supports: - * 	RFC3742 Limited Slow Start  	  - growth limited to max_ssthresh - *	RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged +/* Slow start is used when congestion window is no greater than the slow start + * threshold. We base on RFC2581 and also handle stretch ACKs properly. + * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but + * something better;) a packet is only considered (s)acked in its entirety to + * defend the ACK attacks described in the RFC. Slow start processes a stretch + * ACK of degree N as if N acks of degree 1 are received back to back except + * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and + * returns the leftover acks to adjust cwnd in congestion avoidance mode.   
*/ -void tcp_slow_start(struct tcp_sock *tp) +int tcp_slow_start(struct tcp_sock *tp, u32 acked)  { -	int cnt; /* increase in packets */ -	unsigned int delta = 0; -	u32 snd_cwnd = tp->snd_cwnd; - -	if (unlikely(!snd_cwnd)) { -		pr_err_once("snd_cwnd is nul, please report this bug.\n"); -		snd_cwnd = 1U; -	} - -	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) -		cnt = sysctl_tcp_max_ssthresh >> 1;	/* limited slow start */ -	else -		cnt = snd_cwnd;				/* exponential increase */ +	u32 cwnd = tp->snd_cwnd + acked; -	tp->snd_cwnd_cnt += cnt; -	while (tp->snd_cwnd_cnt >= snd_cwnd) { -		tp->snd_cwnd_cnt -= snd_cwnd; -		delta++; -	} -	tp->snd_cwnd = min(snd_cwnd + delta, tp->snd_cwnd_clamp); +	if (cwnd > tp->snd_ssthresh) +		cwnd = tp->snd_ssthresh + 1; +	acked -= cwnd - tp->snd_cwnd; +	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); +	return acked;  }  EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -351,16 +317,16 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);  /* This is Jacobson's slow start and congestion avoidance.   * SIGCOMM '88, p. 328.   */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	/* In "safe" area, increase. */  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	/* In dangerous area, increase slowly. */  	else  		tcp_cong_avoid_ai(tp, tp->snd_cwnd); @@ -375,21 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk)  }  EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window with halving. */ -u32 tcp_reno_min_cwnd(const struct sock *sk) -{ -	const struct tcp_sock *tp = tcp_sk(sk); -	return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); -  struct tcp_congestion_ops tcp_reno = {  	.flags		= TCP_CONG_NON_RESTRICTED,  	.name		= "reno",  	.owner		= THIS_MODULE,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_reno_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  };  /* Initial congestion control used (until SYN) @@ -401,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops  = {  	.owner		= THIS_MODULE,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_reno_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  };  EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index b6ae92a51f5..a9bd8a4828a 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -304,18 +304,18 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  		ca->cnt = 1;  } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct bictcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) {  		if (hystart && after(ack, ca->end_seq))  			bictcp_hystart_reset(sk); -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	} else {  		bictcp_update(ca, tp->snd_cwnd);  		tcp_cong_avoid_ai(tp, ca->cnt); @@ -408,7 +408,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)  		ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;  		ratio += cnt; -		ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); +		ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);  	}  	/* Some calls are for duplicates without timetamps */ @@ -475,10 +475,6 @@ static int __init 
cubictcp_register(void)  	/* divide by bic_scale and by constant Srtt (100ms) */  	do_div(cube_factor, bic_scale * 10); -	/* hystart needs ms clock resolution */ -	if (hystart && HZ < 1000) -		cubictcp.flags |= TCP_CONG_RTT_STAMP; -  	return tcp_register_congestion_control(&cubictcp);  } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index ab7bd35bb31..9771563ab56 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -8,12 +8,26 @@  #include <net/inetpeer.h>  #include <net/tcp.h> -int sysctl_tcp_fastopen __read_mostly; +int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;  struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;  static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); +void tcp_fastopen_init_key_once(bool publish) +{ +	static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + +	/* tcp_fastopen_reset_cipher publishes the new context +	 * atomically, so we allow this race happening here. +	 * +	 * All call sites of tcp_fastopen_cookie_gen also check +	 * for a valid cookie, so this is an acceptable risk. +	 */ +	if (net_get_random_once(key, sizeof(key)) && publish) +		tcp_fastopen_reset_cipher(key, sizeof(key)); +} +  static void tcp_fastopen_ctx_free(struct rcu_head *head)  {  	struct tcp_fastopen_context *ctx = @@ -58,34 +72,224 @@ error:		kfree(ctx);  	return err;  } -/* Computes the fastopen cookie for the IP path. - * The path is a 128 bits long (pad with zeros for IPv4). - * - * The caller must check foc->len to determine if a valid cookie - * has been generated successfully. -*/ -void tcp_fastopen_cookie_gen(__be32 src, __be32 dst, -			     struct tcp_fastopen_cookie *foc) +static bool __tcp_fastopen_cookie_gen(const void *path, +				      struct tcp_fastopen_cookie *foc)  { -	__be32 path[4] = { src, dst, 0, 0 };  	struct tcp_fastopen_context *ctx; +	bool ok = false; + +	tcp_fastopen_init_key_once(true);  	rcu_read_lock();  	ctx = rcu_dereference(tcp_fastopen_ctx);  	if (ctx) { -		crypto_cipher_encrypt_one(ctx->tfm, foc->val, (__u8 *)path); +		crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);  		foc->len = TCP_FASTOPEN_COOKIE_SIZE; +		ok = true;  	}  	rcu_read_unlock(); +	return ok; +} + +/* Generate the fastopen cookie by doing aes128 encryption on both + * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 + * addresses. For the longer IPv6 addresses use CBC-MAC. + * + * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
+ */ +static bool tcp_fastopen_cookie_gen(struct request_sock *req, +				    struct sk_buff *syn, +				    struct tcp_fastopen_cookie *foc) +{ +	if (req->rsk_ops->family == AF_INET) { +		const struct iphdr *iph = ip_hdr(syn); + +		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; +		return __tcp_fastopen_cookie_gen(path, foc); +	} + +#if IS_ENABLED(CONFIG_IPV6) +	if (req->rsk_ops->family == AF_INET6) { +		const struct ipv6hdr *ip6h = ipv6_hdr(syn); +		struct tcp_fastopen_cookie tmp; + +		if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { +			struct in6_addr *buf = (struct in6_addr *) tmp.val; +			int i; + +			for (i = 0; i < 4; i++) +				buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; +			return __tcp_fastopen_cookie_gen(buf, foc); +		} +	} +#endif +	return false; +} + +static bool tcp_fastopen_create_child(struct sock *sk, +				      struct sk_buff *skb, +				      struct dst_entry *dst, +				      struct request_sock *req) +{ +	struct tcp_sock *tp; +	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; +	struct sock *child; + +	req->num_retrans = 0; +	req->num_timeout = 0; +	req->sk = NULL; + +	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); +	if (child == NULL) +		return false; + +	spin_lock(&queue->fastopenq->lock); +	queue->fastopenq->qlen++; +	spin_unlock(&queue->fastopenq->lock); + +	/* Initialize the child socket. Have to fix some values to take +	 * into account the child is a Fast Open socket and is created +	 * only out of the bits carried in the SYN packet. +	 */ +	tp = tcp_sk(child); + +	tp->fastopen_rsk = req; +	/* Do a hold on the listener sk so that if the listener is being +	 * closed, the child that has been accepted can live on and still +	 * access listen_lock. +	 */ +	sock_hold(sk); +	tcp_rsk(req)->listener = sk; + +	/* RFC1323: The window in SYN & SYN/ACK segments is never +	 * scaled. So correct it appropriately. +	 */ +	tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + +	/* Activate the retrans timer so that SYNACK can be retransmitted. +	 * The request socket is not added to the SYN table of the parent +	 * because it's been added to the accept queue directly. +	 */ +	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, +				  TCP_TIMEOUT_INIT, TCP_RTO_MAX); + +	/* Add the child socket directly into the accept queue */ +	inet_csk_reqsk_queue_add(sk, req, child); + +	/* Now finish processing the fastopen child socket. */ +	inet_csk(child)->icsk_af_ops->rebuild_header(child); +	tcp_init_congestion_control(child); +	tcp_mtup_init(child); +	tcp_init_metrics(child); +	tcp_init_buffer_space(child); + +	/* Queue the data carried in the SYN packet. We need to first +	 * bump skb's refcnt because the caller will attempt to free it. +	 * +	 * XXX (TFO) - we honor a zero-payload TFO request for now, +	 * (any reason not to?) but no need to queue the skb since +	 * there is no data. How about SYN+FIN?
+	 */ +	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { +		skb = skb_get(skb); +		skb_dst_drop(skb); +		__skb_pull(skb, tcp_hdr(skb)->doff * 4); +		skb_set_owner_r(skb, child); +		__skb_queue_tail(&child->sk_receive_queue, skb); +		tp->syn_data_acked = 1; +	} +	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; +	sk->sk_data_ready(sk); +	bh_unlock_sock(child); +	sock_put(child); +	WARN_ON(req->sk == NULL); +	return true;  } +EXPORT_SYMBOL(tcp_fastopen_create_child); -static int __init tcp_fastopen_init(void) +static bool tcp_fastopen_queue_check(struct sock *sk)  { -	__u8 key[TCP_FASTOPEN_KEY_LENGTH]; +	struct fastopen_queue *fastopenq; -	get_random_bytes(key, sizeof(key)); -	tcp_fastopen_reset_cipher(key, sizeof(key)); -	return 0; +	/* Make sure the listener has enabled fastopen, and we don't +	 * exceed the max # of pending TFO requests allowed before trying +	 * to validate the cookie in order to avoid burning CPU cycles +	 * unnecessarily. +	 * +	 * XXX (TFO) - The implication of checking the max_qlen before +	 * processing a cookie request is that clients can't differentiate +	 * between qlen overflow causing Fast Open to be disabled +	 * temporarily vs a server not supporting Fast Open at all. +	 */ +	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; +	if (fastopenq == NULL || fastopenq->max_qlen == 0) +		return false; + +	if (fastopenq->qlen >= fastopenq->max_qlen) { +		struct request_sock *req1; +		spin_lock(&fastopenq->lock); +		req1 = fastopenq->rskq_rst_head; +		if ((req1 == NULL) || time_after(req1->expires, jiffies)) { +			spin_unlock(&fastopenq->lock); +			NET_INC_STATS_BH(sock_net(sk), +					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); +			return false; +		} +		fastopenq->rskq_rst_head = req1->dl_next; +		fastopenq->qlen--; +		spin_unlock(&fastopenq->lock); +		reqsk_free(req1); +	} +	return true;  } -late_initcall(tcp_fastopen_init); +/* Returns true if we should perform Fast Open on the SYN. The cookie (foc) + * may be updated and returned to the client in the SYN-ACK later. E.g., Fast Open + * cookie request (foc->len == 0). + */ +bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, +		      struct request_sock *req, +		      struct tcp_fastopen_cookie *foc, +		      struct dst_entry *dst) +{ +	struct tcp_fastopen_cookie valid_foc = { .len = -1 }; +	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + +	if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && +	      (syn_data || foc->len >= 0) && +	      tcp_fastopen_queue_check(sk))) { +		foc->len = -1; +		return false; +	} + +	if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) +		goto fastopen; + +	if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && +	    foc->len == TCP_FASTOPEN_COOKIE_SIZE && +	    foc->len == valid_foc.len && +	    !memcmp(foc->val, valid_foc.val, foc->len)) { +		/* Cookie is valid. Create a (full) child socket to accept +		 * the data in SYN before returning a SYN-ACK to ack the +		 * data. If we fail to create the socket, fall back and +		 * ack the ISN only but include the same cookie. +		 * +		 * Note: Data-less SYN with valid cookie is allowed to send +		 * data in SYN_RECV state. +		 */ +fastopen: +		if (tcp_fastopen_create_child(sk, skb, dst, req)) { +			foc->len = -1; +			NET_INC_STATS_BH(sock_net(sk), +					 LINUX_MIB_TCPFASTOPENPASSIVE); +			return true; +		} +	} + +	NET_INC_STATS_BH(sock_net(sk), foc->len ?
+			 LINUX_MIB_TCPFASTOPENPASSIVEFAIL : +			 LINUX_MIB_TCPFASTOPENCOOKIEREQD); +	*foc = valid_foc; +	return false; +} +EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 30f27f6b365..1c4908280d9 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,16 +109,16 @@ static void hstcp_init(struct sock *sk)  	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);  } -static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight) +static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct hstcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		/* Update AIMD parameters.  		 * @@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = {  	.init		= hstcp_init,  	.ssthresh	= hstcp_ssthresh,  	.cong_avoid	= hstcp_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.owner		= THIS_MODULE,  	.name		= "highspeed" diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index c1a8175361e..031361311a8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -227,16 +227,16 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)  	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);  } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct htcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		/* In dangerous area, increase slowly.  		 
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 57bdd17dff4..d8f8f05a495 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -21,7 +21,7 @@ struct hybla {  	u32   rho2;	      /* Rho * Rho, integer part */  	u32   rho_3ls;	      /* Rho parameter, <<3 */  	u32   rho2_7ls;	      /* Rho^2, <<7	*/ -	u32   minrtt;	      /* Minimum smoothed round trip time value seen */ +	u32   minrtt_us;      /* Minimum smoothed round trip time value seen */  };  /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ @@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk)  {  	struct hybla *ca = inet_csk_ca(sk); -	ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); +	ca->rho_3ls = max_t(u32, +			    tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), +			    8U);  	ca->rho = ca->rho_3ls >> 3;  	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;  	ca->rho2 = ca->rho2_7ls >> 7; @@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk)  	hybla_recalc_param(sk);  	/* set minimum rtt as this is the 1st ever seen */ -	ca->minrtt = tp->srtt; +	ca->minrtt_us = tp->srtt_us;  	tp->snd_cwnd = ca->rho;  } @@ -85,7 +87,7 @@ static inline u32 hybla_fraction(u32 odds)   *     o Give cwnd a new value based on the model proposed   *     o remember increments <1   */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct hybla *ca = inet_csk_ca(sk); @@ -93,16 +95,16 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  	int is_slowstart = 0;  	/*  Recalculate rho only if this srtt is the lowest */ -	if (tp->srtt < ca->minrtt){ +	if (tp->srtt_us < ca->minrtt_us) {  		hybla_recalc_param(sk); -		ca->minrtt = tp->srtt; +		ca->minrtt_us = tp->srtt_us;  	} -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (!ca->hybla_en) { -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  		return;  	} @@ -165,7 +167,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  static struct tcp_congestion_ops tcp_hybla __read_mostly = {  	.init		= hybla_init,  	.ssthresh	= tcp_reno_ssthresh, -	.min_cwnd	= tcp_reno_min_cwnd,  	.cong_avoid	= hybla_cong_avoid,  	.set_state	= hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 834857f3c87..5999b3972e6 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -23,7 +23,6 @@  #define ALPHA_MIN	((3*ALPHA_SCALE)/10)	/* ~0.3 */  #define ALPHA_MAX	(10*ALPHA_SCALE)	/* 10.0 */  #define ALPHA_BASE	ALPHA_SCALE		/* 1.0 */ -#define U32_MAX		((u32)~0U)  #define RTT_MAX		(U32_MAX / ALPHA_MAX)	/* 3.3 secs */  #define BETA_SHIFT	6 @@ -256,7 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)  /*   * Increase window in response to successful acknowledgment.   
*/ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct illinois *ca = inet_csk_ca(sk); @@ -265,12 +264,12 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  		update_params(sk);  	/* RFC2861 only increase cwnd if fully utilized */ -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	/* In slow start */  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		u32 delta; @@ -325,10 +324,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,  }  static struct tcp_congestion_ops tcp_illinois __read_mostly = { -	.flags		= TCP_CONG_RTT_STAMP,  	.init		= tcp_illinois_init,  	.ssthresh	= tcp_illinois_ssthresh, -	.min_cwnd	= tcp_reno_min_cwnd,  	.cong_avoid	= tcp_illinois_cong_avoid,  	.set_state	= tcp_illinois_state,  	.get_info	= tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 113dc5f17d4..40639c288dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -267,11 +267,31 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr   * 1. Tuning sk->sk_sndbuf, when connection enters established state.   */ -static void tcp_fixup_sndbuf(struct sock *sk) +static void tcp_sndbuf_expand(struct sock *sk)  { -	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); +	const struct tcp_sock *tp = tcp_sk(sk); +	int sndmem, per_mss; +	u32 nr_segs; + +	/* Worst case is non GSO/TSO : each frame consumes one skb +	 * and skb->head is kmalloced using power of two area of memory +	 */ +	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + +		  MAX_TCP_HEADER + +		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + +	per_mss = roundup_pow_of_two(per_mss) + +		  SKB_DATA_ALIGN(sizeof(struct sk_buff)); + +	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); +	nr_segs = max_t(u32, nr_segs, tp->reordering + 1); + +	/* Fast Recovery (RFC 5681 3.2) : +	 * Cubic needs 1.7 factor, rounded to 2 to include +	 * extra cushion (application might react slowly to POLLOUT) +	 */ +	sndmem = 2 * nr_segs * per_mss; -	sndmem *= TCP_INIT_CWND;  	if (sk->sk_sndbuf < sndmem)  		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);  } @@ -355,6 +375,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)  	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *  		 tcp_default_init_rwnd(mss); +	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency +	 * Allow enough cushion so that sender is not limited by our window +	 */ +	if (sysctl_tcp_moderate_rcvbuf) +		rcvmem <<= 2; +  	if (sk->sk_rcvbuf < rcvmem)  		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);  } @@ -370,9 +396,11 @@ void tcp_init_buffer_space(struct sock *sk)  	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))  		tcp_fixup_rcvbuf(sk);  	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) -		tcp_fixup_sndbuf(sk); +		tcp_sndbuf_expand(sk);  	tp->rcvq_space.space = tp->rcv_wnd; +	tp->rcvq_space.time = tcp_time_stamp; +	tp->rcvq_space.seq = tp->copied_seq;  	maxwin = tcp_full_space(sk); @@ -512,48 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int time; -	int space; - -	if (tp->rcvq_space.time == 0) -		goto new_measure; +	int copied;  	time = tcp_time_stamp - tp->rcvq_space.time;  	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)  		return; -	space = 2 * (tp->copied_seq - tp->rcvq_space.seq); 
+	/* Number of bytes copied to user in last RTT */ +	copied = tp->copied_seq - tp->rcvq_space.seq; +	if (copied <= tp->rcvq_space.space) +		goto new_measure; -	space = max(tp->rcvq_space.space, space); +	/* A bit of theory : +	 * copied = bytes received in previous RTT, our base window +	 * To cope with packet losses, we need a 2x factor +	 * To cope with slow start, and sender growing its cwin by 100 % +	 * every RTT, we need a 4x factor, because the ACK we are sending +	 * now is for the next RTT, not the current one : +	 * <prev RTT . ><current RTT .. ><next RTT .... > +	 */ -	if (tp->rcvq_space.space != space) { -		int rcvmem; +	if (sysctl_tcp_moderate_rcvbuf && +	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { +		int rcvwin, rcvmem, rcvbuf; + +		/* minimal window to cope with packet losses, assuming +		 * steady state. Add some cushion because of small variations. +		 */ +		rcvwin = (copied << 1) + 16 * tp->advmss; + +		/* If rate increased by 25%, +		 *	assume slow start, rcvwin = 3 * copied +		 * If rate increased by 50%, +		 *	assume sender can use 2x growth, rcvwin = 4 * copied +		 */ +		if (copied >= +		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { +			if (copied >= +			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) +				rcvwin <<= 1; +			else +				rcvwin += (rcvwin >> 1); +		} -		tp->rcvq_space.space = space; +		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); +		while (tcp_win_from_space(rcvmem) < tp->advmss) +			rcvmem += 128; -		if (sysctl_tcp_moderate_rcvbuf && -		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { -			int new_clamp = space; +		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); +		if (rcvbuf > sk->sk_rcvbuf) { +			sk->sk_rcvbuf = rcvbuf; -			/* Receive space grows, normalize in order to -			 * take into account packet headers and sk_buff -			 * structure overhead. -			 */ -			space /= tp->advmss; -			if (!space) -				space = 1; -			rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); -			while (tcp_win_from_space(rcvmem) < tp->advmss) -				rcvmem += 128; -			space *= rcvmem; -			space = min(space, sysctl_tcp_rmem[2]); -			if (space > sk->sk_rcvbuf) { -				sk->sk_rcvbuf = space; - -				/* Make the window clamp follow along.  */ -				tp->window_clamp = new_clamp; -			} +			/* Make the window clamp follow along.  */ +			tp->window_clamp = rcvwin;  		}  	} +	tp->rcvq_space.space = copied;  new_measure:  	tp->rcvq_space.seq = tp->copied_seq; @@ -625,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)   * To save cycles in the RFC 1323 implementation it was better to break   * it up into three procedures. -- erics   */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)  {  	struct tcp_sock *tp = tcp_sk(sk); -	long m = mrtt; /* RTT */ +	long m = mrtt_us; /* RTT */ +	u32 srtt = tp->srtt_us;  	/*	The following amusing code comes from Jacobson's  	 *	article in SIGCOMM '88.  Note that rtt and mdev @@ -646,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)  	 * does not matter how to _calculate_ it. Seems, it was trap  	 * that VJ failed to avoid. 
8)  	 */ -	if (m == 0) -		m = 1; -	if (tp->srtt != 0) { -		m -= (tp->srtt >> 3);	/* m is now error in rtt est */ -		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */ +	if (srtt != 0) { +		m -= (srtt >> 3);	/* m is now error in rtt est */ +		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */  		if (m < 0) {  			m = -m;		/* m is now abs(error) */ -			m -= (tp->mdev >> 2);   /* similar update on mdev */ +			m -= (tp->mdev_us >> 2);   /* similar update on mdev */  			/* This is similar to one of Eifel findings.  			 * Eifel blocks mdev updates when rtt decreases.  			 * This solution is a bit different: we use finer gain @@ -665,27 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)  			if (m > 0)  				m >>= 3;  		} else { -			m -= (tp->mdev >> 2);   /* similar update on mdev */ +			m -= (tp->mdev_us >> 2);   /* similar update on mdev */  		} -		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */ -		if (tp->mdev > tp->mdev_max) { -			tp->mdev_max = tp->mdev; -			if (tp->mdev_max > tp->rttvar) -				tp->rttvar = tp->mdev_max; +		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */ +		if (tp->mdev_us > tp->mdev_max_us) { +			tp->mdev_max_us = tp->mdev_us; +			if (tp->mdev_max_us > tp->rttvar_us) +				tp->rttvar_us = tp->mdev_max_us;  		}  		if (after(tp->snd_una, tp->rtt_seq)) { -			if (tp->mdev_max < tp->rttvar) -				tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; +			if (tp->mdev_max_us < tp->rttvar_us) +				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;  			tp->rtt_seq = tp->snd_nxt; -			tp->mdev_max = tcp_rto_min(sk); +			tp->mdev_max_us = tcp_rto_min_us(sk);  		}  	} else {  		/* no previous measure. */ -		tp->srtt = m << 3;	/* take the measured time to be rtt */ -		tp->mdev = m << 1;	/* make sure rto = 3*rtt */ -		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); +		srtt = m << 3;		/* take the measured time to be rtt */ +		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */ +		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); +		tp->mdev_max_us = tp->rttvar_us;  		tp->rtt_seq = tp->snd_nxt;  	} +	tp->srtt_us = max(1U, srtt);  }  /* Set the sk_pacing_rate to allow proper sizing of TSO packets. @@ -700,26 +743,25 @@ static void tcp_update_pacing_rate(struct sock *sk)  	u64 rate;  	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ -	rate = (u64)tp->mss_cache * 2 * (HZ << 3); +	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);  	rate *= max(tp->snd_cwnd, tp->packets_out); -	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), -	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) -	 * We probably need usec resolution in the future. -	 * Note: This also takes care of possible srtt=0 case, -	 * when tcp_rtt_estimator() was not yet called. -	 */ -	if (tp->srtt > 8 + 2) -		do_div(rate, tp->srtt); +	if (likely(tp->srtt_us)) +		do_div(rate, tp->srtt_us); -	sk->sk_pacing_rate = min_t(u64, rate, ~0U); +	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate +	 * without any lock. We want to make sure compiler wont store +	 * intermediate values in this location. +	 */ +	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, +						sk->sk_max_pacing_rate);  }  /* Calculate rto without backoff.  This is the second half of Van Jacobson's   * routine referred to above.   */ -void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk)  {  	const struct tcp_sock *tp = tcp_sk(sk);  	/* Old crap is replaced with new one. 
8) @@ -1064,7 +1106,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,  	}  	/* D-SACK for already forgotten data... Do dumb counting. */ -	if (dup_sack && tp->undo_marker && tp->undo_retrans && +	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&  	    !after(end_seq_0, prior_snd_una) &&  	    after(end_seq_0, tp->undo_marker))  		tp->undo_retrans--; @@ -1073,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,  }  struct tcp_sacktag_state { -	int reord; -	int fack_count; -	int flag; -	s32 rtt; /* RTT measured by SACKing never-retransmitted data */ +	int	reord; +	int	fack_count; +	long	rtt_us; /* RTT measured by SACKing never-retransmitted data */ +	int	flag;  };  /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1120,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,  			unsigned int new_len = (pkt_len / mss) * mss;  			if (!in_sack && new_len < pkt_len) {  				new_len += mss; -				if (new_len > skb->len) +				if (new_len >= skb->len)  					return 0;  			}  			pkt_len = new_len;  		} -		err = tcp_fragment(sk, skb, pkt_len, mss); +		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);  		if (err < 0)  			return err;  	} @@ -1137,14 +1179,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,  static u8 tcp_sacktag_one(struct sock *sk,  			  struct tcp_sacktag_state *state, u8 sacked,  			  u32 start_seq, u32 end_seq, -			  int dup_sack, int pcount, u32 xmit_time) +			  int dup_sack, int pcount, +			  const struct skb_mstamp *xmit_time)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int fack_count = state->fack_count;  	/* Account D-SACK for retransmitted packet. */  	if (dup_sack && (sacked & TCPCB_RETRANS)) { -		if (tp->undo_marker && tp->undo_retrans && +		if (tp->undo_marker && tp->undo_retrans > 0 &&  		    after(end_seq, tp->undo_marker))  			tp->undo_retrans--;  		if (sacked & TCPCB_SACKED_ACKED) @@ -1178,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk,  				if (!after(end_seq, tp->high_seq))  					state->flag |= FLAG_ORIG_SACK_ACKED;  				/* Pick the earliest sequence sacked for RTT */ -				if (state->rtt < 0) -					state->rtt = tcp_time_stamp - xmit_time; +				if (state->rtt_us < 0) { +					struct skb_mstamp now; + +					skb_mstamp_get(&now); +					state->rtt_us = skb_mstamp_us_delta(&now, +								xmit_time); +				}  			}  			if (sacked & TCPCB_LOST) { @@ -1238,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,  	 */  	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,  			start_seq, end_seq, dup_sack, pcount, -			TCP_SKB_CB(skb)->when); +			&skb->skb_mstamp);  	if (skb == tp->lost_skb_hint)  		tp->lost_cnt_hint += pcount; @@ -1516,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,  						TCP_SKB_CB(skb)->end_seq,  						dup_sack,  						tcp_skb_pcount(skb), -						TCP_SKB_CB(skb)->when); +						&skb->skb_mstamp);  			if (!before(TCP_SKB_CB(skb)->seq,  				    tcp_highest_sack_seq(tp))) @@ -1573,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl  static int  tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, -			u32 prior_snd_una, s32 *sack_rtt) +			u32 prior_snd_una, long *sack_rtt_us)  {  	struct tcp_sock *tp = tcp_sk(sk);  	const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1591,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,  	
state.flag = 0;  	state.reord = tp->packets_out; -	state.rtt = -1; +	state.rtt_us = -1L;  	if (!tp->sacked_out) {  		if (WARN_ON(tp->fackets_out)) @@ -1775,7 +1823,7 @@ out:  	WARN_ON((int)tp->retrans_out < 0);  	WARN_ON((int)tcp_packets_in_flight(tp) < 0);  #endif -	*sack_rtt = state.rtt; +	*sack_rtt_us = state.rtt_us;  	return state.flag;  } @@ -1845,7 +1893,7 @@ static void tcp_clear_retrans_partial(struct tcp_sock *tp)  	tp->lost_out = 0;  	tp->undo_marker = 0; -	tp->undo_retrans = 0; +	tp->undo_retrans = -1;  }  void tcp_clear_retrans(struct tcp_sock *tp) @@ -1896,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how)  		if (skb == tcp_send_head(sk))  			break; -		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) +		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)  			tp->undo_marker = 0; +  		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;  		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {  			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -1985,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)  	 * available, or RTO is scheduled to fire first.  	 */  	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || -	    (flag & FLAG_ECE) || !tp->srtt) +	    (flag & FLAG_ECE) || !tp->srtt_us)  		return false; -	delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); +	delay = max(usecs_to_jiffies(tp->srtt_us >> 5), +		    msecs_to_jiffies(2)); +  	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))  		return false; @@ -2190,7 +2241,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)  				break;  			mss = skb_shinfo(skb)->gso_size; -			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); +			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, +					   mss, GFP_ATOMIC);  			if (err < 0)  				break;  			cnt = packets; @@ -2613,7 +2665,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)  	tp->prior_ssthresh = 0;  	tp->undo_marker = tp->snd_una; -	tp->undo_retrans = tp->retrans_out; +	tp->undo_retrans = tp->retrans_out ? : -1;  	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {  		if (!ece_ack) @@ -2633,13 +2685,12 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)  	bool recovered = !before(tp->snd_una, tp->high_seq);  	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ -		if (flag & FLAG_ORIG_SACK_ACKED) { -			/* Step 3.b. A timeout is spurious if not all data are -			 * lost, i.e., never-retransmitted data are (s)acked. -			 */ -			tcp_try_undo_loss(sk, true); +		/* Step 3.b. A timeout is spurious if not all data are +		 * lost, i.e., never-retransmitted data are (s)acked. +		 */ +		if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))  			return; -		} +  		if (after(tp->snd_nxt, tp->high_seq) &&  		    (flag & FLAG_DATA_SACKED || is_dupack)) {  			tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ @@ -2835,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,  }  static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, -				      s32 seq_rtt, s32 sack_rtt) +				      long seq_rtt_us, long sack_rtt_us)  {  	const struct tcp_sock *tp = tcp_sk(sk); @@ -2845,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,  	 * is acked (RFC6298).  	 
*/  	if (flag & FLAG_RETRANS_DATA_ACKED) -		seq_rtt = -1; +		seq_rtt_us = -1L; -	if (seq_rtt < 0) -		seq_rtt = sack_rtt; +	if (seq_rtt_us < 0) +		seq_rtt_us = sack_rtt_us;  	/* RTTM Rule: A TSecr value received in a segment is used to  	 * update the averaged RTT measurement only if the segment @@ -2856,13 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,  	 * left edge of the send window.  	 * See draft-ietf-tcplw-high-performance-00, section 3.3.  	 */ -	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) -		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; +	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && +	    flag & FLAG_ACKED) +		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); -	if (seq_rtt < 0) +	if (seq_rtt_us < 0)  		return false; -	tcp_rtt_estimator(sk, seq_rtt); +	tcp_rtt_estimator(sk, seq_rtt_us);  	tcp_set_rto(sk);  	/* RFC6298: only reset backoff on valid RTT measurement. */ @@ -2871,20 +2923,26 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,  }  /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ -static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) +static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)  {  	struct tcp_sock *tp = tcp_sk(sk); -	s32 seq_rtt = -1; +	long seq_rtt_us = -1L; + +	if (synack_stamp && !tp->total_retrans) +		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); -	if (tp->lsndtime && !tp->total_retrans) -		seq_rtt = tcp_time_stamp - tp->lsndtime; -	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); +	/* If the ACK acks both the SYNACK and the (Fast Open'd) data packets +	 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() +	 */ +	if (!tp->srtt_us) +		tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);  } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	const struct inet_connection_sock *icsk = inet_csk(sk); -	icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); + +	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);  	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;  } @@ -2967,25 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)   * arrived at the other end.   
*/  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, -			       u32 prior_snd_una, s32 sack_rtt) +			       u32 prior_snd_una, long sack_rtt_us)  { -	struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk); +	struct skb_mstamp first_ackt, last_ackt, now; +	struct tcp_sock *tp = tcp_sk(sk); +	u32 prior_sacked = tp->sacked_out; +	u32 reord = tp->packets_out; +	bool fully_acked = true; +	long ca_seq_rtt_us = -1L; +	long seq_rtt_us = -1L;  	struct sk_buff *skb; -	u32 now = tcp_time_stamp; -	int fully_acked = true; -	int flag = 0;  	u32 pkts_acked = 0; -	u32 reord = tp->packets_out; -	u32 prior_sacked = tp->sacked_out; -	s32 seq_rtt = -1; -	s32 ca_seq_rtt = -1; -	ktime_t last_ackt = net_invalid_timestamp(); +	bool rtt_update; +	int flag = 0; + +	first_ackt.v64 = 0;  	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {  		struct tcp_skb_cb *scb = TCP_SKB_CB(skb); -		u32 acked_pcount;  		u8 sacked = scb->sacked; +		u32 acked_pcount;  		/* Determine how many packets and what bytes were acked, tso and else */  		if (after(scb->end_seq, tp->snd_una)) { @@ -3007,11 +3067,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  				tp->retrans_out -= acked_pcount;  			flag |= FLAG_RETRANS_DATA_ACKED;  		} else { -			ca_seq_rtt = now - scb->when; -			last_ackt = skb->tstamp; -			if (seq_rtt < 0) { -				seq_rtt = ca_seq_rtt; -			} +			last_ackt = skb->skb_mstamp; +			WARN_ON_ONCE(last_ackt.v64 == 0); +			if (!first_ackt.v64) +				first_ackt = last_ackt; +  			if (!(sacked & TCPCB_SACKED_ACKED))  				reord = min(pkts_acked, reord);  			if (!after(scb->end_seq, tp->high_seq)) @@ -3057,14 +3117,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))  		flag |= FLAG_SACK_RENEGING; -	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) || -	    (flag & FLAG_ACKED)) -		tcp_rearm_rto(sk); +	skb_mstamp_get(&now); +	if (first_ackt.v64) { +		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); +		ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); +	} + +	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);  	if (flag & FLAG_ACKED) {  		const struct tcp_congestion_ops *ca_ops  			= inet_csk(sk)->icsk_ca_ops; +		tcp_rearm_rto(sk);  		if (unlikely(icsk->icsk_mtup.probe_size &&  			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {  			tcp_mtup_probe_success(sk); @@ -3086,23 +3151,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  		tp->fackets_out -= min(pkts_acked, tp->fackets_out); -		if (ca_ops->pkts_acked) { -			s32 rtt_us = -1; - -			/* Is the ACK triggering packet unambiguous? */ -			if (!(flag & FLAG_RETRANS_DATA_ACKED)) { -				/* High resolution needed and available? */ -				if (ca_ops->flags & TCP_CONG_RTT_STAMP && -				    !ktime_equal(last_ackt, -						 net_invalid_timestamp())) -					rtt_us = ktime_us_delta(ktime_get_real(), -								last_ackt); -				else if (ca_seq_rtt >= 0) -					rtt_us = jiffies_to_usecs(ca_seq_rtt); -			} +		if (ca_ops->pkts_acked) +			ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); -			ca_ops->pkts_acked(sk, pkts_acked, rtt_us); -		} +	} else if (skb && rtt_update && sack_rtt_us >= 0 && +		   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { +		/* Do not re-arm RTO if the sack RTT is measured from data sent +		 * after when the head was last (re)transmitted. Otherwise the +		 * timeout may continue to extend in loss recovery. 
+		 */ +		tcp_rearm_rto(sk);  	}  #if FASTRETRANS_DEBUG > 0 @@ -3291,7 +3349,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)  			tcp_init_cwnd_reduction(sk, true);  			tcp_set_ca_state(sk, TCP_CA_CWR);  			tcp_end_cwnd_reduction(sk); -			tcp_set_ca_state(sk, TCP_CA_Open); +			tcp_try_keep_open(sk);  			NET_INC_STATS_BH(sock_net(sk),  					 LINUX_MIB_TCPLOSSPROBERECOVERY);  		} @@ -3307,12 +3365,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  	u32 ack_seq = TCP_SKB_CB(skb)->seq;  	u32 ack = TCP_SKB_CB(skb)->ack_seq;  	bool is_dupack = false; -	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;  	u32 prior_fackets;  	int prior_packets = tp->packets_out;  	const int prior_unsacked = tp->packets_out - tp->sacked_out;  	int acked = 0; /* Number of packets newly acked */ -	s32 sack_rtt = -1; +	long sack_rtt_us = -1L;  	/* If the ack is older than previous acks  	 * then we can probably ignore it. @@ -3340,7 +3397,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  		flag |= FLAG_SND_UNA_ADVANCED;  	prior_fackets = tp->fackets_out; -	prior_in_flight = tcp_packets_in_flight(tp);  	/* ts_recent update must be made after we are sure that the packet  	 * is in window. @@ -3370,7 +3426,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  		if (TCP_SKB_CB(skb)->sacked)  			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, -							&sack_rtt); +							&sack_rtt_us);  		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))  			flag |= FLAG_ECE; @@ -3389,12 +3445,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  	/* See if we can take anything off of the retransmit queue. */  	acked = tp->packets_out; -	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); +	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, +				    sack_rtt_us);  	acked -= tp->packets_out;  	/* Advance cwnd if state allows */  	if (tcp_may_raise_cwnd(sk, flag)) -		tcp_cong_avoid(sk, ack, prior_in_flight); +		tcp_cong_avoid(sk, ack, acked);  	if (tcp_ack_is_dubious(sk, flag)) {  		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3412,8 +3469,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  	if (icsk->icsk_pending == ICSK_TIME_RETRANS)  		tcp_schedule_loss_probe(sk); -	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) -		tcp_update_pacing_rate(sk); +	tcp_update_pacing_rate(sk);  	return 1;  no_queue: @@ -3442,7 +3498,7 @@ old_ack:  	 */  	if (TCP_SKB_CB(skb)->sacked) {  		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, -						&sack_rtt); +						&sack_rtt_us);  		tcp_fastretrans_alert(sk, acked, prior_unsacked,  				      is_dupack, flag);  	} @@ -3626,7 +3682,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)  		int opcode = *ptr++;  		int opsize; -		switch(opcode) { +		switch (opcode) {  		case TCPOPT_EOL:  			return NULL;  		case TCPOPT_NOP: @@ -3986,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)  			WARN_ON(before(tp->rcv_nxt, sp->end_seq));  			/* Zap this SACK, by moving forward any other SACKS. 
*/ -			for (i=this_sack+1; i < num_sacks; i++) +			for (i = this_sack+1; i < num_sacks; i++)  				tp->selective_acks[i-1] = tp->selective_acks[i];  			num_sacks--;  			continue; @@ -4356,7 +4412,7 @@ queue_and_out:  		if (eaten > 0)  			kfree_skb_partial(skb, fragstolen);  		if (!sock_flag(sk, SOCK_DEAD)) -			sk->sk_data_ready(sk, 0); +			sk->sk_data_ready(sk);  		return;  	} @@ -4646,28 +4702,6 @@ static int tcp_prune_queue(struct sock *sk)  	return -1;  } -/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) -{ -	struct tcp_sock *tp = tcp_sk(sk); - -	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && -	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { -		/* Limited by application or receiver window. */ -		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); -		u32 win_used = max(tp->snd_cwnd_used, init_win); -		if (win_used < tp->snd_cwnd) { -			tp->snd_ssthresh = tcp_current_ssthresh(sk); -			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; -		} -		tp->snd_cwnd_used = 0; -	} -	tp->snd_cwnd_stamp = tcp_time_stamp; -} -  static bool tcp_should_expand_sndbuf(const struct sock *sk)  {  	const struct tcp_sock *tp = tcp_sk(sk); @@ -4704,15 +4738,7 @@ static void tcp_new_space(struct sock *sk)  	struct tcp_sock *tp = tcp_sk(sk);  	if (tcp_should_expand_sndbuf(sk)) { -		int sndmem = SKB_TRUESIZE(max_t(u32, -						tp->rx_opt.mss_clamp, -						tp->mss_cache) + -					  MAX_TCP_HEADER); -		int demanded = max_t(unsigned int, tp->snd_cwnd, -				     tp->reordering + 1); -		sndmem *= 2 * demanded; -		if (sndmem > sk->sk_sndbuf) -			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); +		tcp_sndbuf_expand(sk);  		tp->snd_cwnd_stamp = tcp_time_stamp;  	} @@ -4865,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t  				BUG();  			tp->urg_data = TCP_URG_VALID | tmp;  			if (!sock_flag(sk, SOCK_DEAD)) -				sk->sk_data_ready(sk, 0); +				sk->sk_data_ready(sk);  		}  	}  } @@ -4951,11 +4977,11 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,  		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||  		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {  			tp->ucopy.wakeup = 1; -			sk->sk_data_ready(sk, 0); +			sk->sk_data_ready(sk);  		}  	} else if (chunk > 0) {  		tp->ucopy.wakeup = 1; -		sk->sk_data_ready(sk, 0); +		sk->sk_data_ready(sk);  	}  out:  	return copied_early; @@ -5226,7 +5252,7 @@ no_ack:  #endif  			if (eaten)  				kfree_skb_partial(skb, fragstolen); -			sk->sk_data_ready(sk, 0); +			sk->sk_data_ready(sk);  			return;  		}  	} @@ -5346,9 +5372,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,  				break;  		}  		tcp_rearm_rto(sk); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);  		return true;  	}  	tp->syn_data_acked = tp->syn_data; +	if (tp->syn_data_acked) +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);  	return false;  } @@ -5587,6 +5616,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,  	struct request_sock *req;  	int queued = 0;  	bool acceptable; +	u32 synack_stamp;  	tp->rx_opt.saw_tstamp = 0; @@ -5669,16 +5699,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,  		 * so release it.  		 
*/  		if (req) { +			synack_stamp = tcp_rsk(req)->snt_synack;  			tp->total_retrans = req->num_retrans;  			reqsk_fastopen_remove(sk, req, false);  		} else { +			synack_stamp = tp->lsndtime;  			/* Make sure socket is routed, for correct metrics. */  			icsk->icsk_af_ops->rebuild_header(sk);  			tcp_init_congestion_control(sk);  			tcp_mtup_init(sk); -			tcp_init_buffer_space(sk);  			tp->copied_seq = tp->rcv_nxt; +			tcp_init_buffer_space(sk);  		}  		smp_mb();  		tcp_set_state(sk, TCP_ESTABLISHED); @@ -5694,7 +5726,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,  		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;  		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;  		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); -		tcp_synack_rtt_meas(sk, req); +		tcp_synack_rtt_meas(sk, synack_stamp);  		if (tp->rx_opt.tstamp_ok)  			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -5712,6 +5744,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,  		} else  			tcp_init_metrics(sk); +		tcp_update_pacing_rate(sk); +  		/* Prevent spurious tcp_cwnd_restart() on first data packet */  		tp->lsndtime = tcp_time_stamp; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b14266bb91e..77cccda1ad0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -173,11 +173,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,  			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,  			      IPPROTO_TCP, -			      orig_sport, orig_dport, sk, true); +			      orig_sport, orig_dport, sk);  	if (IS_ERR(rt)) {  		err = PTR_ERR(rt);  		if (err == -ENETUNREACH) -			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); +			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);  		return err;  	} @@ -288,6 +288,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)  	mtu = dst_mtu(dst);  	if (inet->pmtudisc != IP_PMTUDISC_DONT && +	    ip_sk_accept_pmtu(sk) &&  	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {  		tcp_sync_mss(sk, mtu); @@ -335,8 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  	const int code = icmp_hdr(icmp_skb)->code;  	struct sock *sk;  	struct sk_buff *skb; -	struct request_sock *req; -	__u32 seq; +	struct request_sock *fastopen; +	__u32 seq, snd_una;  	__u32 remaining;  	int err;  	struct net *net = dev_net(icmp_skb->dev); @@ -377,12 +378,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  	icsk = inet_csk(sk);  	tp = tcp_sk(sk); -	req = tp->fastopen_rsk;  	seq = ntohl(th->seq); +	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ +	fastopen = tp->fastopen_rsk; +	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;  	if (sk->sk_state != TCP_LISTEN && -	    !between(seq, tp->snd_una, tp->snd_nxt) && -	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) { -		/* For a Fast Open socket, allow seq to be snt_isn. */ +	    !between(seq, snd_una, tp->snd_nxt)) {  		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);  		goto out;  	} @@ -425,16 +426,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)  			break;  		if (seq != tp->snd_una  || !icsk->icsk_retransmits || -		    !icsk->icsk_backoff) +		    !icsk->icsk_backoff || fastopen)  			break; -		/* XXX (TFO) - revisit the following logic for TFO */ -  		if (sock_owned_by_user(sk))  			break;  		icsk->icsk_backoff--; -		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : +		inet_csk(sk)->icsk_rto = (tp->srtt_us ? 
__tcp_set_rto(tp) :  			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;  	tcp_bound_rto(sk); @@ -461,14 +460,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  		goto out;  	} -	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather -	 * than following the TCP_SYN_RECV case and closing the socket, -	 * we ignore the ICMP error and keep trying like a fully established -	 * socket. Is this the right thing to do? -	 */ -	if (req && req->sk == NULL) -		goto out; -  	switch (sk->sk_state) {  		struct request_sock *req, **prev;  	case TCP_LISTEN: @@ -501,10 +492,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  		goto out;  	case TCP_SYN_SENT: -	case TCP_SYN_RECV:  /* Cannot happen. -			       It can f.e. if SYNs crossed, -			       or Fast Open. -			     */ +	case TCP_SYN_RECV: +		/* Only in fast or simultaneous open. If a fast open socket is +		 * already accepted it is treated as a connected one below. +		 */ +		if (fastopen && fastopen->sk == NULL) +			break; +  		if (!sock_owned_by_user(sk)) {  			sk->sk_err = err; @@ -821,25 +815,26 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,   */  static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,  			      struct request_sock *req, -			      u16 queue_mapping) +			      u16 queue_mapping, +			      struct tcp_fastopen_cookie *foc)  {  	const struct inet_request_sock *ireq = inet_rsk(req);  	struct flowi4 fl4;  	int err = -1; -	struct sk_buff * skb; +	struct sk_buff *skb;  	/* First, grab a route. */  	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)  		return -1; -	skb = tcp_make_synack(sk, dst, req, NULL); +	skb = tcp_make_synack(sk, dst, req, foc);  	if (skb) { -		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); +		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);  		skb_set_queue_mapping(skb, queue_mapping); -		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, -					    ireq->rmt_addr, +		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, +					    ireq->ir_rmt_addr,  					    ireq->opt);  		err = net_xmit_eval(err);  		if (!tcp_rsk(req)->snt_synack && !err) @@ -851,10 +846,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,  static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)  { -	int res = tcp_v4_send_synack(sk, NULL, req, 0); +	int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); -	if (!res) +	if (!res) {  		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); +	}  	return res;  } @@ -877,8 +874,6 @@ bool tcp_syn_flood_action(struct sock *sk,  	bool want_cookie = false;  	struct listen_sock *lopt; - -  #ifdef CONFIG_SYN_COOKIES  	if (sysctl_tcp_syncookies) {  		msg = "Sending cookies"; @@ -972,7 +967,7 @@ static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,  {  	union tcp_md5_addr *addr; -	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr; +	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;  	return tcp_md5_do_lookup(sk, addr, AF_INET);  } @@ -1149,8 +1144,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,  		saddr = inet_sk(sk)->inet_saddr;  		daddr = inet_sk(sk)->inet_daddr;  	} else if (req) { -		saddr = inet_rsk(req)->loc_addr; -		daddr = inet_rsk(req)->rmt_addr; +		saddr = inet_rsk(req)->ir_loc_addr; +		daddr = inet_rsk(req)->ir_rmt_addr;  	} else {  		const struct iphdr *iph = ip_hdr(skb);  		saddr = iph->saddr; @@ -1259,187 +1254,6 @@ static const struct
tcp_request_sock_ops tcp_request_sock_ipv4_ops = {  };  #endif -static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, -			       struct request_sock *req, -			       struct tcp_fastopen_cookie *foc, -			       struct tcp_fastopen_cookie *valid_foc) -{ -	bool skip_cookie = false; -	struct fastopen_queue *fastopenq; - -	if (likely(!fastopen_cookie_present(foc))) { -		/* See include/net/tcp.h for the meaning of these knobs */ -		if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || -		    ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && -		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) -			skip_cookie = true; /* no cookie to validate */ -		else -			return false; -	} -	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; -	/* A FO option is present; bump the counter. */ -	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); - -	/* Make sure the listener has enabled fastopen, and we don't -	 * exceed the max # of pending TFO requests allowed before trying -	 * to validating the cookie in order to avoid burning CPU cycles -	 * unnecessarily. -	 * -	 * XXX (TFO) - The implication of checking the max_qlen before -	 * processing a cookie request is that clients can't differentiate -	 * between qlen overflow causing Fast Open to be disabled -	 * temporarily vs a server not supporting Fast Open at all. -	 */ -	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || -	    fastopenq == NULL || fastopenq->max_qlen == 0) -		return false; - -	if (fastopenq->qlen >= fastopenq->max_qlen) { -		struct request_sock *req1; -		spin_lock(&fastopenq->lock); -		req1 = fastopenq->rskq_rst_head; -		if ((req1 == NULL) || time_after(req1->expires, jiffies)) { -			spin_unlock(&fastopenq->lock); -			NET_INC_STATS_BH(sock_net(sk), -			    LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); -			/* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/ -			foc->len = -1; -			return false; -		} -		fastopenq->rskq_rst_head = req1->dl_next; -		fastopenq->qlen--; -		spin_unlock(&fastopenq->lock); -		reqsk_free(req1); -	} -	if (skip_cookie) { -		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -		return true; -	} - -	if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { -		if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { -			tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, -						ip_hdr(skb)->daddr, valid_foc); -			if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || -			    memcmp(&foc->val[0], &valid_foc->val[0], -			    TCP_FASTOPEN_COOKIE_SIZE) != 0) -				return false; -			valid_foc->len = -1; -		} -		/* Acknowledge the data received from the peer. */ -		tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -		return true; -	} else if (foc->len == 0) { /* Client requesting a cookie */ -		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, -					ip_hdr(skb)->daddr, valid_foc); -		NET_INC_STATS_BH(sock_net(sk), -		    LINUX_MIB_TCPFASTOPENCOOKIEREQD); -	} else { -		/* Client sent a cookie with wrong size. Treat it -		 * the same as invalid and return a valid one. 
-		 */ -		tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, -					ip_hdr(skb)->daddr, valid_foc); -	} -	return false; -} - -static int tcp_v4_conn_req_fastopen(struct sock *sk, -				    struct sk_buff *skb, -				    struct sk_buff *skb_synack, -				    struct request_sock *req) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; -	const struct inet_request_sock *ireq = inet_rsk(req); -	struct sock *child; -	int err; - -	req->num_retrans = 0; -	req->num_timeout = 0; -	req->sk = NULL; - -	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); -	if (child == NULL) { -		NET_INC_STATS_BH(sock_net(sk), -				 LINUX_MIB_TCPFASTOPENPASSIVEFAIL); -		kfree_skb(skb_synack); -		return -1; -	} -	err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, -				    ireq->rmt_addr, ireq->opt); -	err = net_xmit_eval(err); -	if (!err) -		tcp_rsk(req)->snt_synack = tcp_time_stamp; -	/* XXX (TFO) - is it ok to ignore error and continue? */ - -	spin_lock(&queue->fastopenq->lock); -	queue->fastopenq->qlen++; -	spin_unlock(&queue->fastopenq->lock); - -	/* Initialize the child socket. Have to fix some values to take -	 * into account the child is a Fast Open socket and is created -	 * only out of the bits carried in the SYN packet. -	 */ -	tp = tcp_sk(child); - -	tp->fastopen_rsk = req; -	/* Do a hold on the listner sk so that if the listener is being -	 * closed, the child that has been accepted can live on and still -	 * access listen_lock. -	 */ -	sock_hold(sk); -	tcp_rsk(req)->listener = sk; - -	/* RFC1323: The window in SYN & SYN/ACK segments is never -	 * scaled. So correct it appropriately. -	 */ -	tp->snd_wnd = ntohs(tcp_hdr(skb)->window); - -	/* Activate the retrans timer so that SYNACK can be retransmitted. -	 * The request socket is not added to the SYN table of the parent -	 * because it's been added to the accept queue directly. -	 */ -	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, -	    TCP_TIMEOUT_INIT, TCP_RTO_MAX); - -	/* Add the child socket directly into the accept queue */ -	inet_csk_reqsk_queue_add(sk, req, child); - -	/* Now finish processing the fastopen child socket. */ -	inet_csk(child)->icsk_af_ops->rebuild_header(child); -	tcp_init_congestion_control(child); -	tcp_mtup_init(child); -	tcp_init_buffer_space(child); -	tcp_init_metrics(child); - -	/* Queue the data carried in the SYN packet. We need to first -	 * bump skb's refcnt because the caller will attempt to free it. -	 * -	 * XXX (TFO) - we honor a zero-payload TFO request for now. -	 * (Any reason not to?) -	 */ -	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { -		/* Don't queue the skb if there is no payload in SYN. -		 * XXX (TFO) - How about SYN+FIN? 
-		 */ -		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -	} else { -		skb = skb_get(skb); -		skb_dst_drop(skb); -		__skb_pull(skb, tcp_hdr(skb)->doff * 4); -		skb_set_owner_r(skb, child); -		__skb_queue_tail(&child->sk_receive_queue, skb); -		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; -		tp->syn_data_acked = 1; -	} -	sk->sk_data_ready(sk, 0); -	bh_unlock_sock(child); -	sock_put(child); -	WARN_ON(req->sk == NULL); -	return 0; -} -  int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  {  	struct tcp_options_received tmp_opt; @@ -1450,12 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  	__be32 saddr = ip_hdr(skb)->saddr;  	__be32 daddr = ip_hdr(skb)->daddr;  	__u32 isn = TCP_SKB_CB(skb)->when; -	bool want_cookie = false; +	bool want_cookie = false, fastopen;  	struct flowi4 fl4;  	struct tcp_fastopen_cookie foc = { .len = -1 }; -	struct tcp_fastopen_cookie valid_foc = { .len = -1 }; -	struct sk_buff *skb_synack; -	int do_fastopen; +	int err;  	/* Never answer to SYNs send to broadcast or multicast */  	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1502,10 +1314,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  	tcp_openreq_init(req, &tmp_opt, skb);  	ireq = inet_rsk(req); -	ireq->loc_addr = daddr; -	ireq->rmt_addr = saddr; +	ireq->ir_loc_addr = daddr; +	ireq->ir_rmt_addr = saddr;  	ireq->no_srccheck = inet_sk(sk)->transparent;  	ireq->opt = tcp_v4_save_options(skb); +	ireq->ir_mark = inet_request_mark(sk, skb);  	if (security_inet_conn_request(sk, skb, req))  		goto drop_and_free; @@ -1554,52 +1367,24 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  		isn = tcp_v4_init_sequence(skb);  	} -	tcp_rsk(req)->snt_isn = isn; - -	if (dst == NULL) { -		dst = inet_csk_route_req(sk, &fl4, req); -		if (dst == NULL) -			goto drop_and_free; -	} -	do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); - -	/* We don't call tcp_v4_send_synack() directly because we need -	 * to make sure a child socket can be created successfully before -	 * sending back synack! -	 * -	 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack() -	 * (or better yet, call tcp_send_synack() in the child context -	 * directly, but will have to fix bunch of other code first) -	 * after syn_recv_sock() except one will need to first fix the -	 * latter to remove its dependency on the current implementation -	 * of tcp_v4_send_synack()->tcp_select_initial_window(). -	 */ -	skb_synack = tcp_make_synack(sk, dst, req, -	    fastopen_cookie_present(&valid_foc) ? 
&valid_foc : NULL); - -	if (skb_synack) { -		__tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr); -		skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); -	} else +	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)  		goto drop_and_free; -	if (likely(!do_fastopen)) { -		int err; -		err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, -		     ireq->rmt_addr, ireq->opt); -		err = net_xmit_eval(err); +	tcp_rsk(req)->snt_isn = isn; +	tcp_rsk(req)->snt_synack = tcp_time_stamp; +	tcp_openreq_init_rwin(req, sk, dst); +	fastopen = !want_cookie && +		   tcp_try_fastopen(sk, skb, req, &foc, dst); +	err = tcp_v4_send_synack(sk, dst, req, +				 skb_get_queue_mapping(skb), &foc); +	if (!fastopen) {  		if (err || want_cookie)  			goto drop_and_free;  		tcp_rsk(req)->snt_synack = tcp_time_stamp;  		tcp_rsk(req)->listener = NULL; -		/* Add the request_sock to the SYN table */  		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -		if (fastopen_cookie_present(&foc) && foc.len != 0) -			NET_INC_STATS_BH(sock_net(sk), -			    LINUX_MIB_TCPFASTOPENPASSIVEFAIL); -	} else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) -		goto drop_and_free; +	}  	return 0; @@ -1644,9 +1429,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,  	newtp		      = tcp_sk(newsk);  	newinet		      = inet_sk(newsk);  	ireq		      = inet_rsk(req); -	newinet->inet_daddr   = ireq->rmt_addr; -	newinet->inet_rcv_saddr = ireq->loc_addr; -	newinet->inet_saddr	      = ireq->loc_addr; +	newinet->inet_daddr   = ireq->ir_rmt_addr; +	newinet->inet_rcv_saddr = ireq->ir_loc_addr; +	newinet->inet_saddr	      = ireq->ir_loc_addr;  	inet_opt	      = ireq->opt;  	rcu_assign_pointer(newinet->inet_opt, inet_opt);  	ireq->opt	      = NULL; @@ -1667,7 +1452,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,  	}  	sk_setup_caps(newsk, dst); -	tcp_mtup_init(newsk);  	tcp_sync_mss(newsk, dst_mtu(dst));  	newtp->advmss = dst_metric_advmss(dst);  	if (tcp_sk(sk)->rx_opt.user_mss && @@ -1744,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)  	return sk;  } -static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) -{ -	const struct iphdr *iph = ip_hdr(skb); - -	if (skb->ip_summed == CHECKSUM_COMPLETE) { -		if (!tcp_v4_check(skb->len, iph->saddr, -				  iph->daddr, skb->csum)) { -			skb->ip_summed = CHECKSUM_UNNECESSARY; -			return 0; -		} -	} - -	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, -				       skb->len, IPPROTO_TCP, 0); - -	if (skb->len <= 76) { -		return __skb_checksum_complete(skb); -	} -	return 0; -} - -  /* The socket must have it's spinlock held when we get   * here.   * @@ -1960,7 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb)  	 * Packet length and doff are validated by header prediction,  	 * provided case of th->doff==0 is eliminated.  	 * So, we defer the checks. */ -	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) + +	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))  		goto csum_error;  	th = tcp_hdr(skb); @@ -2194,18 +1957,6 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);  #ifdef CONFIG_PROC_FS  /* Proc filesystem TCP sock list dumping. */ -static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) -{ -	return hlist_nulls_empty(head) ? NULL : -		list_entry(head->first, struct inet_timewait_sock, tw_node); -} - -static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) -{ -	return !is_a_nulls(tw->tw_node.next) ? 
-		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; -} -  /*   * Get next listener socket follow cur.  If cur is NULL, get first socket   * starting from bucket given in st->bucket; when st->bucket is zero the @@ -2309,10 +2060,9 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)  	return rc;  } -static inline bool empty_bucket(struct tcp_iter_state *st) +static inline bool empty_bucket(const struct tcp_iter_state *st)  { -	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && -		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); +	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);  }  /* @@ -2329,7 +2079,6 @@ static void *established_get_first(struct seq_file *seq)  	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {  		struct sock *sk;  		struct hlist_nulls_node *node; -		struct inet_timewait_sock *tw;  		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);  		/* Lockless fast path for the common case of empty buckets */ @@ -2345,18 +2094,7 @@ static void *established_get_first(struct seq_file *seq)  			rc = sk;  			goto out;  		} -		st->state = TCP_SEQ_STATE_TIME_WAIT; -		inet_twsk_for_each(tw, node, -				   &tcp_hashinfo.ehash[st->bucket].twchain) { -			if (tw->tw_family != st->family || -			    !net_eq(twsk_net(tw), net)) { -				continue; -			} -			rc = tw; -			goto out; -		}  		spin_unlock_bh(lock); -		st->state = TCP_SEQ_STATE_ESTABLISHED;  	}  out:  	return rc; @@ -2365,7 +2103,6 @@ out:  static void *established_get_next(struct seq_file *seq, void *cur)  {  	struct sock *sk = cur; -	struct inet_timewait_sock *tw;  	struct hlist_nulls_node *node;  	struct tcp_iter_state *st = seq->private;  	struct net *net = seq_file_net(seq); @@ -2373,45 +2110,16 @@ static void *established_get_next(struct seq_file *seq, void *cur)  	++st->num;  	++st->offset; -	if (st->state == TCP_SEQ_STATE_TIME_WAIT) { -		tw = cur; -		tw = tw_next(tw); -get_tw: -		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { -			tw = tw_next(tw); -		} -		if (tw) { -			cur = tw; -			goto out; -		} -		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); -		st->state = TCP_SEQ_STATE_ESTABLISHED; - -		/* Look for next non empty bucket */ -		st->offset = 0; -		while (++st->bucket <= tcp_hashinfo.ehash_mask && -				empty_bucket(st)) -			; -		if (st->bucket > tcp_hashinfo.ehash_mask) -			return NULL; - -		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); -		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); -	} else -		sk = sk_nulls_next(sk); +	sk = sk_nulls_next(sk);  	sk_nulls_for_each_from(sk, node) {  		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) -			goto found; +			return sk;  	} -	st->state = TCP_SEQ_STATE_TIME_WAIT; -	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); -	goto get_tw; -found: -	cur = sk; -out: -	return cur; +	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); +	++st->bucket; +	return established_get_first(seq);  }  static void *established_get_idx(struct seq_file *seq, loff_t pos) @@ -2464,10 +2172,9 @@ static void *tcp_seek_last_pos(struct seq_file *seq)  		if (rc)  			break;  		st->bucket = 0; +		st->state = TCP_SEQ_STATE_ESTABLISHED;  		/* Fallthrough */  	case TCP_SEQ_STATE_ESTABLISHED: -	case TCP_SEQ_STATE_TIME_WAIT: -		st->state = TCP_SEQ_STATE_ESTABLISHED;  		if (st->bucket > tcp_hashinfo.ehash_mask)  			break;  		rc = established_get_first(seq); @@ -2524,7 +2231,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, 
loff_t *pos)  		}  		break;  	case TCP_SEQ_STATE_ESTABLISHED: -	case TCP_SEQ_STATE_TIME_WAIT:  		rc = established_get_next(seq, v);  		break;  	} @@ -2548,7 +2254,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)  		if (v != SEQ_START_TOKEN)  			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);  		break; -	case TCP_SEQ_STATE_TIME_WAIT:  	case TCP_SEQ_STATE_ESTABLISHED:  		if (v)  			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); @@ -2598,18 +2303,18 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)  EXPORT_SYMBOL(tcp_proc_unregister);  static void get_openreq4(const struct sock *sk, const struct request_sock *req, -			 struct seq_file *f, int i, kuid_t uid, int *len) +			 struct seq_file *f, int i, kuid_t uid)  {  	const struct inet_request_sock *ireq = inet_rsk(req);  	long delta = req->expires - jiffies;  	seq_printf(f, "%4d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK%n", +		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",  		i, -		ireq->loc_addr, +		ireq->ir_loc_addr,  		ntohs(inet_sk(sk)->inet_sport), -		ireq->rmt_addr, -		ntohs(ireq->rmt_port), +		ireq->ir_rmt_addr, +		ntohs(ireq->ir_rmt_port),  		TCP_SYN_RECV,  		0, 0, /* could print option size, but that is af dependent. */  		1,    /* timers active (only the expire timer) */ @@ -2619,11 +2324,10 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,  		0,  /* non standard timer */  		0, /* open_requests have no inode */  		atomic_read(&sk->sk_refcnt), -		req, -		len); +		req);  } -static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) +static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)  {  	int timer_active;  	unsigned long timer_expires; @@ -2662,7 +2366,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)  		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);  	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " -			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d%n", +			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",  		i, src, srcp, dest, destp, sk->sk_state,  		tp->write_seq - tp->snd_una,  		rx_queue, @@ -2679,16 +2383,15 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)  		tp->snd_cwnd,  		sk->sk_state == TCP_LISTEN ?  		    (fastopenq ? fastopenq->max_qlen : 0) : -		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh), -		len); +		    (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh));  }  static void get_timewait4_sock(const struct inet_timewait_sock *tw, -			       struct seq_file *f, int i, int *len) +			       struct seq_file *f, int i)  {  	__be32 dest, src;  	__u16 destp, srcp; -	long delta = tw->tw_ttd - jiffies; +	s32 delta = tw->tw_ttd - inet_tw_time_stamp();  	dest  = tw->tw_daddr;  	src   = tw->tw_rcv_saddr; @@ -2696,10 +2399,10 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,  	srcp  = ntohs(tw->tw_sport);  	seq_printf(f, "%4d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", +		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",  		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,  		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, -		atomic_read(&tw->tw_refcnt), tw, len); +		atomic_read(&tw->tw_refcnt), tw);  }  #define TMPSZ 150 @@ -2707,11 +2410,11 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,  static int tcp4_seq_show(struct seq_file *seq, void *v)  {  	struct tcp_iter_state *st; -	int len; +	struct sock *sk = v; +	seq_setwidth(seq, TMPSZ - 1);  	if (v == SEQ_START_TOKEN) { -		seq_printf(seq, "%-*s\n", TMPSZ - 1, -			   "  sl  local_address rem_address   st tx_queue " +		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "  			   "rx_queue tr tm->when retrnsmt   uid  timeout "  			   "inode");  		goto out; @@ -2721,17 +2424,17 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)  	switch (st->state) {  	case TCP_SEQ_STATE_LISTENING:  	case TCP_SEQ_STATE_ESTABLISHED: -		get_tcp4_sock(v, seq, st->num, &len); +		if (sk->sk_state == TCP_TIME_WAIT) +			get_timewait4_sock(v, seq, st->num); +		else +			get_tcp4_sock(v, seq, st->num);  		break;  	case TCP_SEQ_STATE_OPENREQ: -		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); -		break; -	case TCP_SEQ_STATE_TIME_WAIT: -		get_timewait4_sock(v, seq, st->num, &len); +		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);  		break;  	} -	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");  out: +	seq_pad(seq, '\n');  	return 0;  } @@ -2806,6 +2509,7 @@ struct proto tcp_prot = {  	.orphan_count		= &tcp_orphan_count,  	.memory_allocated	= &tcp_memory_allocated,  	.memory_pressure	= &tcp_memory_pressure, +	.sysctl_mem		= sysctl_tcp_mem,  	.sysctl_wmem		= sysctl_tcp_wmem,  	.sysctl_rmem		= sysctl_tcp_rmem,  	.max_header		= MAX_TCP_HEADER, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 72f7218b03f..1e70fa8fa79 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk)   * Will only call newReno CA when away from inference.   * From TCP-LP's paper, this will be handled in additive increasement.   
*/ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct lp *lp = inet_csk_ca(sk);  	if (!(lp->flag & LP_WITHIN_INF)) -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  }  /** @@ -314,11 +314,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)  }  static struct tcp_congestion_ops tcp_lp __read_mostly = { -	.flags = TCP_CONG_RTT_STAMP,  	.init = tcp_lp_init,  	.ssthresh = tcp_reno_ssthresh,  	.cong_avoid = tcp_lp_cong_avoid, -	.min_cwnd = tcp_reno_min_cwnd,  	.pkts_acked = tcp_lp_pkts_acked,  	.owner = THIS_MODULE, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 559d4ae6ebf..f7a2ec3ac58 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,18 +6,6 @@  #include <linux/memcontrol.h>  #include <linux/module.h> -static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) -{ -	return container_of(cg_proto, struct tcp_memcontrol, cg_proto); -} - -static void memcg_tcp_enter_memory_pressure(struct sock *sk) -{ -	if (sk->sk_cgrp->memory_pressure) -		*sk->sk_cgrp->memory_pressure = 1; -} -EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure); -  int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)  {  	/* @@ -27,34 +15,24 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)  	 */  	struct res_counter *res_parent = NULL;  	struct cg_proto *cg_proto, *parent_cg; -	struct tcp_memcontrol *tcp;  	struct mem_cgroup *parent = parent_mem_cgroup(memcg); -	struct net *net = current->nsproxy->net_ns;  	cg_proto = tcp_prot.proto_cgroup(memcg);  	if (!cg_proto)  		return 0; -	tcp = tcp_from_cgproto(cg_proto); - -	tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; -	tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; -	tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; -	tcp->tcp_memory_pressure = 0; +	cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; +	cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; +	cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; +	cg_proto->memory_pressure = 0; +	cg_proto->memcg = memcg;  	parent_cg = tcp_prot.proto_cgroup(parent);  	if (parent_cg) -		res_parent = parent_cg->memory_allocated; +		res_parent = &parent_cg->memory_allocated; -	res_counter_init(&tcp->tcp_memory_allocated, res_parent); -	percpu_counter_init(&tcp->tcp_sockets_allocated, 0); - -	cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure; -	cg_proto->memory_pressure = &tcp->tcp_memory_pressure; -	cg_proto->sysctl_mem = tcp->tcp_prot_mem; -	cg_proto->memory_allocated = &tcp->tcp_memory_allocated; -	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; -	cg_proto->memcg = memcg; +	res_counter_init(&cg_proto->memory_allocated, res_parent); +	percpu_counter_init(&cg_proto->sockets_allocated, 0);  	return 0;  } @@ -63,23 +41,18 @@ EXPORT_SYMBOL(tcp_init_cgroup);  void tcp_destroy_cgroup(struct mem_cgroup *memcg)  {  	struct cg_proto *cg_proto; -	struct tcp_memcontrol *tcp;  	cg_proto = tcp_prot.proto_cgroup(memcg);  	if (!cg_proto)  		return; -	tcp = tcp_from_cgproto(cg_proto); -	percpu_counter_destroy(&tcp->tcp_sockets_allocated); +	percpu_counter_destroy(&cg_proto->sockets_allocated);  }  EXPORT_SYMBOL(tcp_destroy_cgroup);  static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)  { -	struct net *net = current->nsproxy->net_ns; -	struct tcp_memcontrol *tcp;  	struct cg_proto *cg_proto; -	u64 old_lim;  	int i;  	int ret; @@ -90,16 +63,13 @@ static int 
tcp_update_limit(struct mem_cgroup *memcg, u64 val)  	if (val > RES_COUNTER_MAX)  		val = RES_COUNTER_MAX; -	tcp = tcp_from_cgproto(cg_proto); - -	old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); -	ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); +	ret = res_counter_set_limit(&cg_proto->memory_allocated, val);  	if (ret)  		return ret;  	for (i = 0; i < 3; i++) -		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, -					     net->ipv4.sysctl_tcp_mem[i]); +		cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, +						sysctl_tcp_mem[i]);  	if (val == RES_COUNTER_MAX)  		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); @@ -132,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)  	return 0;  } -static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, -			    const char *buffer) +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, +				char *buf, size_t nbytes, loff_t off)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));  	unsigned long long val;  	int ret = 0; -	switch (cft->private) { +	buf = strstrip(buf); + +	switch (of_cft(of)->private) {  	case RES_LIMIT:  		/* see memcontrol.c */ -		ret = res_counter_memparse_write_strategy(buffer, &val); +		ret = res_counter_memparse_write_strategy(buf, &val);  		if (ret)  			break;  		ret = tcp_update_limit(memcg, val); @@ -151,33 +123,29 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  		ret = -EINVAL;  		break;  	} -	return ret; +	return ret ?: nbytes;  }  static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)  { -	struct tcp_memcontrol *tcp;  	struct cg_proto *cg_proto;  	cg_proto = tcp_prot.proto_cgroup(memcg);  	if (!cg_proto)  		return default_val; -	tcp = tcp_from_cgproto(cg_proto); -	return res_counter_read_u64(&tcp->tcp_memory_allocated, type); +	return res_counter_read_u64(&cg_proto->memory_allocated, type);  }  static u64 tcp_read_usage(struct mem_cgroup *memcg)  { -	struct tcp_memcontrol *tcp;  	struct cg_proto *cg_proto;  	cg_proto = tcp_prot.proto_cgroup(memcg);  	if (!cg_proto)  		return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; -	tcp = tcp_from_cgproto(cg_proto); -	return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); +	return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);  }  static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -202,61 +170,33 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)  	return val;  } -static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, +				char *buf, size_t nbytes, loff_t off)  {  	struct mem_cgroup *memcg; -	struct tcp_memcontrol *tcp;  	struct cg_proto *cg_proto; -	memcg = mem_cgroup_from_css(css); +	memcg = mem_cgroup_from_css(of_css(of));  	cg_proto = tcp_prot.proto_cgroup(memcg);  	if (!cg_proto) -		return 0; -	tcp = tcp_from_cgproto(cg_proto); +		return nbytes; -	switch (event) { +	switch (of_cft(of)->private) {  	case RES_MAX_USAGE: -		res_counter_reset_max(&tcp->tcp_memory_allocated); +		res_counter_reset_max(&cg_proto->memory_allocated);  		break;  	case RES_FAILCNT: -		res_counter_reset_failcnt(&tcp->tcp_memory_allocated); +		res_counter_reset_failcnt(&cg_proto->memory_allocated);  		break;  	} -	return 0; -} - -unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) -{ -	
struct tcp_memcontrol *tcp; -	struct cg_proto *cg_proto; - -	cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); -	if (!cg_proto) -		return 0; - -	tcp = tcp_from_cgproto(cg_proto); -	return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); -} - -void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) -{ -	struct tcp_memcontrol *tcp; -	struct cg_proto *cg_proto; - -	cg_proto = tcp_prot.proto_cgroup(memcg); -	if (!cg_proto) -		return; - -	tcp = tcp_from_cgproto(cg_proto); - -	tcp->tcp_prot_mem[idx] = val; +	return nbytes;  }  static struct cftype tcp_files[] = {  	{  		.name = "kmem.tcp.limit_in_bytes", -		.write_string = tcp_cgroup_write, +		.write = tcp_cgroup_write,  		.read_u64 = tcp_cgroup_read,  		.private = RES_LIMIT,  	}, @@ -268,13 +208,13 @@ static struct cftype tcp_files[] = {  	{  		.name = "kmem.tcp.failcnt",  		.private = RES_FAILCNT, -		.trigger = tcp_cgroup_reset, +		.write = tcp_cgroup_reset,  		.read_u64 = tcp_cgroup_read,  	},  	{  		.name = "kmem.tcp.max_usage_in_bytes",  		.private = RES_MAX_USAGE, -		.trigger = tcp_cgroup_reset, +		.write = tcp_cgroup_reset,  		.read_u64 = tcp_cgroup_read,  	},  	{ }	/* terminate */ @@ -282,7 +222,7 @@ static struct cftype tcp_files[] = {  static int __init tcp_memcontrol_init(void)  { -	WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); +	WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));  	return 0;  }  __initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 52f3c6b971d..4fe04180598 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -22,6 +22,10 @@  int sysctl_tcp_nometrics_save __read_mostly; +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, +						   const struct inetpeer_addr *daddr, +						   struct net *net, unsigned int hash); +  struct tcp_fastopen_metrics {  	u16	mss;  	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */ @@ -29,14 +33,20 @@ struct tcp_fastopen_metrics {  	struct	tcp_fastopen_cookie	cookie;  }; +/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility + * Kernel only stores RTT and RTTVAR in usec resolution + */ +#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) +  struct tcp_metrics_block {  	struct tcp_metrics_block __rcu	*tcpm_next; -	struct inetpeer_addr		tcpm_addr; +	struct inetpeer_addr		tcpm_saddr; +	struct inetpeer_addr		tcpm_daddr;  	unsigned long			tcpm_stamp;  	u32				tcpm_ts;  	u32				tcpm_ts_stamp;  	u32				tcpm_lock; -	u32				tcpm_vals[TCP_METRIC_MAX + 1]; +	u32				tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];  	struct tcp_fastopen_metrics	tcpm_fastopen;  	struct rcu_head			rcu_head; @@ -54,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm,  	return tm->tcpm_vals[idx];  } -static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, -				  enum tcp_metric_index idx) -{ -	return msecs_to_jiffies(tm->tcpm_vals[idx]); -} -  static void tcp_metric_set(struct tcp_metrics_block *tm,  			   enum tcp_metric_index idx,  			   u32 val) @@ -67,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,  	tm->tcpm_vals[idx] = val;  } -static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, -				 enum tcp_metric_index idx, -				 u32 val) -{ -	tm->tcpm_vals[idx] = jiffies_to_msecs(val); -} -  static bool addr_same(const struct inetpeer_addr *a,  		      const struct inetpeer_addr *b)  { @@ -96,9 +93,11 @@ struct tcpm_hash_bucket {  static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block 
*tm, struct dst_entry *dst, +static void tcpm_suck_dst(struct tcp_metrics_block *tm, +			  const struct dst_entry *dst,  			  bool fastopen_clear)  { +	u32 msval;  	u32 val;  	tm->tcpm_stamp = jiffies; @@ -116,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,  		val |= 1 << TCP_METRIC_REORDERING;  	tm->tcpm_lock = val; -	tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); -	tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); +	msval = dst_metric_raw(dst, RTAX_RTT); +	tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + +	msval = dst_metric_raw(dst, RTAX_RTTVAR); +	tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;  	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);  	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);  	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); @@ -130,16 +132,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,  	}  } +#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ +	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) +		tcpm_suck_dst(tm, dst, false); +} + +#define TCP_METRICS_RECLAIM_DEPTH	5 +#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL +  static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, -					  struct inetpeer_addr *addr, -					  unsigned int hash, -					  bool reclaim) +					  struct inetpeer_addr *saddr, +					  struct inetpeer_addr *daddr, +					  unsigned int hash)  {  	struct tcp_metrics_block *tm;  	struct net *net; +	bool reclaim = false;  	spin_lock_bh(&tcp_metrics_lock);  	net = dev_net(dst->dev); + +	/* While waiting for the spin-lock the cache might have been populated +	 * with this entry and so we have to check again. 
+	 */ +	tm = __tcp_get_metrics(saddr, daddr, net, hash); +	if (tm == TCP_METRICS_RECLAIM_PTR) { +		reclaim = true; +		tm = NULL; +	} +	if (tm) { +		tcpm_check_stamp(tm, dst); +		goto out_unlock; +	} +  	if (unlikely(reclaim)) {  		struct tcp_metrics_block *oldest; @@ -155,7 +183,8 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,  		if (!tm)  			goto out_unlock;  	} -	tm->tcpm_addr = *addr; +	tm->tcpm_saddr = *saddr; +	tm->tcpm_daddr = *daddr;  	tcpm_suck_dst(tm, dst, true); @@ -169,17 +198,6 @@ out_unlock:  	return tm;  } -#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ) - -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) -{ -	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) -		tcpm_suck_dst(tm, dst, false); -} - -#define TCP_METRICS_RECLAIM_DEPTH	5 -#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL -  static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)  {  	if (tm) @@ -189,7 +207,8 @@ static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, in  	return NULL;  } -static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, +						   const struct inetpeer_addr *daddr,  						   struct net *net, unsigned int hash)  {  	struct tcp_metrics_block *tm; @@ -197,7 +216,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *a  	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;  	     tm = rcu_dereference(tm->tcpm_next)) { -		if (addr_same(&tm->tcpm_addr, addr)) +		if (addr_same(&tm->tcpm_saddr, saddr) && +		    addr_same(&tm->tcpm_daddr, daddr))  			break;  		depth++;  	} @@ -208,20 +228,25 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,  						       struct dst_entry *dst)  {  	struct tcp_metrics_block *tm; -	struct inetpeer_addr addr; +	struct inetpeer_addr saddr, daddr;  	unsigned int hash;  	struct net *net; -	addr.family = req->rsk_ops->family; -	switch (addr.family) { +	saddr.family = req->rsk_ops->family; +	daddr.family = req->rsk_ops->family; +	switch (daddr.family) {  	case AF_INET: -		addr.addr.a4 = inet_rsk(req)->rmt_addr; -		hash = (__force unsigned int) addr.addr.a4; +		saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; +		daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; +		hash = (__force unsigned int) daddr.addr.a4;  		break; +#if IS_ENABLED(CONFIG_IPV6)  	case AF_INET6: -		*(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; -		hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr); +		*(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; +		*(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; +		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);  		break; +#endif  	default:  		return NULL;  	} @@ -231,7 +256,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,  	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;  	     tm = rcu_dereference(tm->tcpm_next)) { -		if (addr_same(&tm->tcpm_addr, &addr)) +		if (addr_same(&tm->tcpm_saddr, &saddr) && +		    addr_same(&tm->tcpm_daddr, &daddr))  			break;  	}  	tcpm_check_stamp(tm, dst); @@ -240,33 +266,45 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,  static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)  { -	struct inet6_timewait_sock *tw6;  	struct 
tcp_metrics_block *tm; -	struct inetpeer_addr addr; +	struct inetpeer_addr saddr, daddr;  	unsigned int hash;  	struct net *net; -	addr.family = tw->tw_family; -	switch (addr.family) { -	case AF_INET: -		addr.addr.a4 = tw->tw_daddr; -		hash = (__force unsigned int) addr.addr.a4; -		break; -	case AF_INET6: -		tw6 = inet6_twsk((struct sock *)tw); -		*(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; -		hash = ipv6_addr_hash(&tw6->tw_v6_daddr); -		break; -	default: -		return NULL; +	if (tw->tw_family == AF_INET) { +		saddr.family = AF_INET; +		saddr.addr.a4 = tw->tw_rcv_saddr; +		daddr.family = AF_INET; +		daddr.addr.a4 = tw->tw_daddr; +		hash = (__force unsigned int) daddr.addr.a4; +	} +#if IS_ENABLED(CONFIG_IPV6) +	else if (tw->tw_family == AF_INET6) { +		if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { +			saddr.family = AF_INET; +			saddr.addr.a4 = tw->tw_rcv_saddr; +			daddr.family = AF_INET; +			daddr.addr.a4 = tw->tw_daddr; +			hash = (__force unsigned int) daddr.addr.a4; +		} else { +			saddr.family = AF_INET6; +			*(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; +			daddr.family = AF_INET6; +			*(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; +			hash = ipv6_addr_hash(&tw->tw_v6_daddr); +		}  	} +#endif +	else +		return NULL;  	net = twsk_net(tw);  	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);  	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;  	     tm = rcu_dereference(tm->tcpm_next)) { -		if (addr_same(&tm->tcpm_addr, &addr)) +		if (addr_same(&tm->tcpm_saddr, &saddr) && +		    addr_same(&tm->tcpm_daddr, &daddr))  			break;  	}  	return tm; @@ -277,36 +315,45 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,  						 bool create)  {  	struct tcp_metrics_block *tm; -	struct inetpeer_addr addr; +	struct inetpeer_addr saddr, daddr;  	unsigned int hash;  	struct net *net; -	bool reclaim; -	addr.family = sk->sk_family; -	switch (addr.family) { -	case AF_INET: -		addr.addr.a4 = inet_sk(sk)->inet_daddr; -		hash = (__force unsigned int) addr.addr.a4; -		break; -	case AF_INET6: -		*(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; -		hash = ipv6_addr_hash(&inet6_sk(sk)->daddr); -		break; -	default: -		return NULL; +	if (sk->sk_family == AF_INET) { +		saddr.family = AF_INET; +		saddr.addr.a4 = inet_sk(sk)->inet_saddr; +		daddr.family = AF_INET; +		daddr.addr.a4 = inet_sk(sk)->inet_daddr; +		hash = (__force unsigned int) daddr.addr.a4; +	} +#if IS_ENABLED(CONFIG_IPV6) +	else if (sk->sk_family == AF_INET6) { +		if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { +			saddr.family = AF_INET; +			saddr.addr.a4 = inet_sk(sk)->inet_saddr; +			daddr.family = AF_INET; +			daddr.addr.a4 = inet_sk(sk)->inet_daddr; +			hash = (__force unsigned int) daddr.addr.a4; +		} else { +			saddr.family = AF_INET6; +			*(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; +			daddr.family = AF_INET6; +			*(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; +			hash = ipv6_addr_hash(&sk->sk_v6_daddr); +		}  	} +#endif +	else +		return NULL;  	net = dev_net(dst->dev);  	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); -	tm = __tcp_get_metrics(&addr, net, hash); -	reclaim = false; -	if (tm == TCP_METRICS_RECLAIM_PTR) { -		reclaim = true; +	tm = __tcp_get_metrics(&saddr, &daddr, net, hash); +	if (tm == TCP_METRICS_RECLAIM_PTR)  		tm = NULL; -	}  	if (!tm && create) -		tm = tcpm_new(dst, &addr, hash, reclaim); +		tm = tcpm_new(dst, &saddr, &daddr, hash);  	else  		tcpm_check_stamp(tm, dst); @@ -334,7 +381,7 @@ void tcp_update_metrics(struct sock *sk)  		
dst_confirm(dst);  	rcu_read_lock(); -	if (icsk->icsk_backoff || !tp->srtt) { +	if (icsk->icsk_backoff || !tp->srtt_us) {  		/* This session failed to estimate rtt. Why?  		 * Probably, no packets returned in time.  Reset our  		 * results. @@ -349,8 +396,8 @@ void tcp_update_metrics(struct sock *sk)  	if (!tm)  		goto out_unlock; -	rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); -	m = rtt - tp->srtt; +	rtt = tcp_metric_get(tm, TCP_METRIC_RTT); +	m = rtt - tp->srtt_us;  	/* If newly calculated rtt larger than stored one, store new  	 * one. Otherwise, use EWMA. Remember, rtt overestimation is @@ -358,10 +405,10 @@ void tcp_update_metrics(struct sock *sk)  	 */  	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {  		if (m <= 0) -			rtt = tp->srtt; +			rtt = tp->srtt_us;  		else  			rtt -= (m >> 3); -		tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); +		tcp_metric_set(tm, TCP_METRIC_RTT, rtt);  	}  	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { @@ -372,16 +419,16 @@ void tcp_update_metrics(struct sock *sk)  		/* Scale deviation to rttvar fixed point */  		m >>= 1; -		if (m < tp->mdev) -			m = tp->mdev; +		if (m < tp->mdev_us) +			m = tp->mdev_us; -		var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); +		var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);  		if (m >= var)  			var = m;  		else  			var -= (var - m) >> 2; -		tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); +		tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);  	}  	if (tcp_in_initial_slowstart(tp)) { @@ -478,7 +525,7 @@ void tcp_init_metrics(struct sock *sk)  		tp->reordering = val;  	} -	crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); +	crtt = tcp_metric_get(tm, TCP_METRIC_RTT);  	rcu_read_unlock();  reset:  	/* The initial RTT measurement from the SYN/SYN-ACK is not ideal @@ -501,18 +548,20 @@ reset:  	 * to low value, and then abruptly stops to do it and starts to delay  	 * ACKs, wait for troubles.  	 */ -	if (crtt > tp->srtt) { +	if (crtt > tp->srtt_us) {  		/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ -		crtt >>= 3; +		crtt /= 8 * USEC_PER_MSEC;  		inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); -	} else if (tp->srtt == 0) { +	} else if (tp->srtt_us == 0) {  		/* RFC6298: 5.7 We've failed to get a valid RTT sample from  		 * 3WHS. This is most likely due to retransmission,  		 * including spurious one. Reset the RTO back to 3secs  		 * from the more aggressive 1sec to avoid more spurious  		 * retransmission.  		 
*/ -		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; +		tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); +		tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; +  		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;  	}  	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been @@ -659,16 +708,20 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,  void tcp_fastopen_cache_set(struct sock *sk, u16 mss,  			    struct tcp_fastopen_cookie *cookie, bool syn_lost)  { +	struct dst_entry *dst = __sk_dst_get(sk);  	struct tcp_metrics_block *tm; +	if (!dst) +		return;  	rcu_read_lock(); -	tm = tcp_get_metrics(sk, __sk_dst_get(sk), true); +	tm = tcp_get_metrics(sk, dst, true);  	if (tm) {  		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;  		write_seqlock_bh(&fastopen_seqlock); -		tfom->mss = mss; -		if (cookie->len > 0) +		if (mss) +			tfom->mss = mss; +		if (cookie && cookie->len > 0)  			tfom->cookie = *cookie;  		if (syn_lost) {  			++tfom->syn_loss; @@ -716,15 +769,21 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,  	struct nlattr *nest;  	int i; -	switch (tm->tcpm_addr.family) { +	switch (tm->tcpm_daddr.family) {  	case AF_INET:  		if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, -				tm->tcpm_addr.addr.a4) < 0) +				tm->tcpm_daddr.addr.a4) < 0) +			goto nla_put_failure; +		if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, +				tm->tcpm_saddr.addr.a4) < 0)  			goto nla_put_failure;  		break;  	case AF_INET6:  		if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, -			    tm->tcpm_addr.addr.a6) < 0) +			    tm->tcpm_daddr.addr.a6) < 0) +			goto nla_put_failure; +		if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, +			    tm->tcpm_saddr.addr.a6) < 0)  			goto nla_put_failure;  		break;  	default: @@ -749,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,  		nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);  		if (!nest)  			goto nla_put_failure; -		for (i = 0; i < TCP_METRIC_MAX + 1; i++) { -			if (!tm->tcpm_vals[i]) +		for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { +			u32 val = tm->tcpm_vals[i]; + +			if (!val)  				continue; -			if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) +			if (i == TCP_METRIC_RTT) { +				if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, +						val) < 0) +					goto nla_put_failure; +				n++; +				val = max(val / 1000, 1U); +			} +			if (i == TCP_METRIC_RTTVAR) { +				if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, +						val) < 0) +					goto nla_put_failure; +				n++; +				val = max(val / 1000, 1U); +			} +			if (nla_put_u32(msg, i + 1, val) < 0)  				goto nla_put_failure;  			n++;  		} @@ -847,44 +922,66 @@ done:  	return skb->len;  } -static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, -			 unsigned int *hash, int optional) +static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, +			   unsigned int *hash, int optional, int v4, int v6)  {  	struct nlattr *a; -	a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4]; +	a = info->attrs[v4];  	if (a) {  		addr->family = AF_INET;  		addr->addr.a4 = nla_get_be32(a); -		*hash = (__force unsigned int) addr->addr.a4; +		if (hash) +			*hash = (__force unsigned int) addr->addr.a4;  		return 0;  	} -	a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; +	a = info->attrs[v6];  	if (a) {  		if (nla_len(a) != sizeof(struct in6_addr))  			return -EINVAL;  		addr->family = AF_INET6;  		memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); -		*hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); +		if (hash) +			*hash = ipv6_addr_hash((struct 
in6_addr *) addr->addr.a6);  		return 0;  	}  	return optional ? 1 : -EAFNOSUPPORT;  } +static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, +			 unsigned int *hash, int optional) +{ +	return __parse_nl_addr(info, addr, hash, optional, +			       TCP_METRICS_ATTR_ADDR_IPV4, +			       TCP_METRICS_ATTR_ADDR_IPV6); +} + +static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) +{ +	return __parse_nl_addr(info, addr, NULL, 0, +			       TCP_METRICS_ATTR_SADDR_IPV4, +			       TCP_METRICS_ATTR_SADDR_IPV6); +} +  static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)  {  	struct tcp_metrics_block *tm; -	struct inetpeer_addr addr; +	struct inetpeer_addr saddr, daddr;  	unsigned int hash;  	struct sk_buff *msg;  	struct net *net = genl_info_net(info);  	void *reply;  	int ret; +	bool src = true; -	ret = parse_nl_addr(info, &addr, &hash, 0); +	ret = parse_nl_addr(info, &daddr, &hash, 0);  	if (ret < 0)  		return ret; +	ret = parse_nl_saddr(info, &saddr); +	if (ret < 0) +		src = false; +  	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);  	if (!msg)  		return -ENOMEM; @@ -899,7 +996,8 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)  	rcu_read_lock();  	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;  	     tm = rcu_dereference(tm->tcpm_next)) { -		if (addr_same(&tm->tcpm_addr, &addr)) { +		if (addr_same(&tm->tcpm_daddr, &daddr) && +		    (!src || addr_same(&tm->tcpm_saddr, &saddr))) {  			ret = tcp_metrics_fill_info(msg, tm);  			break;  		} @@ -954,36 +1052,42 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)  	struct tcpm_hash_bucket *hb;  	struct tcp_metrics_block *tm;  	struct tcp_metrics_block __rcu **pp; -	struct inetpeer_addr addr; +	struct inetpeer_addr saddr, daddr;  	unsigned int hash;  	struct net *net = genl_info_net(info);  	int ret; +	bool src = true, found = false; -	ret = parse_nl_addr(info, &addr, &hash, 1); +	ret = parse_nl_addr(info, &daddr, &hash, 1);  	if (ret < 0)  		return ret;  	if (ret > 0)  		return tcp_metrics_flush_all(net); +	ret = parse_nl_saddr(info, &saddr); +	if (ret < 0) +		src = false;  	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);  	hb = net->ipv4.tcp_metrics_hash + hash;  	pp = &hb->chain;  	spin_lock_bh(&tcp_metrics_lock); -	for (tm = deref_locked_genl(*pp); tm; -	     pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { -		if (addr_same(&tm->tcpm_addr, &addr)) { +	for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { +		if (addr_same(&tm->tcpm_daddr, &daddr) && +		    (!src || addr_same(&tm->tcpm_saddr, &saddr))) {  			*pp = tm->tcpm_next; -			break; +			kfree_rcu(tm, rcu_head); +			found = true; +		} else { +			pp = &tm->tcpm_next;  		}  	}  	spin_unlock_bh(&tcp_metrics_lock); -	if (!tm) +	if (!found)  		return -ESRCH; -	kfree_rcu(tm, rcu_head);  	return 0;  } -static struct genl_ops tcp_metrics_nl_ops[] = { +static const struct genl_ops tcp_metrics_nl_ops[] = {  	{  		.cmd = TCP_METRICS_CMD_GET,  		.doit = tcp_metrics_nl_cmd_get, @@ -1055,10 +1159,7 @@ static void __net_exit tcp_net_metrics_exit(struct net *net)  			tm = next;  		}  	} -	if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) -		vfree(net->ipv4.tcp_metrics_hash); -	else -		kfree(net->ipv4.tcp_metrics_hash); +	kvfree(net->ipv4.tcp_metrics_hash);  }  static __net_initdata struct pernet_operations tcp_net_metrics_ops = { @@ -1074,8 +1175,7 @@ void __init tcp_metrics_init(void)  	if (ret < 0)  		goto cleanup;  	ret = 
genl_register_family_with_ops(&tcp_metrics_nl_family, -					    tcp_metrics_nl_ops, -					    ARRAY_SIZE(tcp_metrics_nl_ops)); +					    tcp_metrics_nl_ops);  	if (ret < 0)  		goto cleanup_subsys;  	return; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 58a3e69aef6..e68e0d4af6c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -293,13 +293,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)  #if IS_ENABLED(CONFIG_IPV6)  		if (tw->tw_family == PF_INET6) {  			struct ipv6_pinfo *np = inet6_sk(sk); -			struct inet6_timewait_sock *tw6; -			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); -			tw6 = inet6_twsk((struct sock *)tw); -			tw6->tw_v6_daddr = np->daddr; -			tw6->tw_v6_rcv_saddr = np->rcv_saddr; +			tw->tw_v6_daddr = sk->sk_v6_daddr; +			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;  			tw->tw_tclass = np->tclass; +			tw->tw_flowlabel = np->flow_label >> 12;  			tw->tw_ipv6only = np->ipv6only;  		}  #endif @@ -364,6 +362,37 @@ void tcp_twsk_destructor(struct sock *sk)  }  EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +void tcp_openreq_init_rwin(struct request_sock *req, +			   struct sock *sk, struct dst_entry *dst) +{ +	struct inet_request_sock *ireq = inet_rsk(req); +	struct tcp_sock *tp = tcp_sk(sk); +	__u8 rcv_wscale; +	int mss = dst_metric_advmss(dst); + +	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) +		mss = tp->rx_opt.user_mss; + +	/* Set this up on the first call only */ +	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + +	/* limit the window selection if the user enforce a smaller rx buffer */ +	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && +	    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) +		req->window_clamp = tcp_full_space(sk); + +	/* tcp_full_space because it is guaranteed to be the first packet */ +	tcp_select_initial_window(tcp_full_space(sk), +		mss - (ireq->tstamp_ok ? 
TCPOLEN_TSTAMP_ALIGNED : 0), +		&req->rcv_wnd, +		&req->window_clamp, +		ireq->wscale_ok, +		&rcv_wscale, +		dst_metric(dst, RTAX_INITRWND)); +	ireq->rcv_wscale = rcv_wscale; +} +EXPORT_SYMBOL(tcp_openreq_init_rwin); +  static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,  					 struct request_sock *req)  { @@ -400,8 +429,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,  		tcp_init_wl(newtp, treq->rcv_isn); -		newtp->srtt = 0; -		newtp->mdev = TCP_TIMEOUT_INIT; +		newtp->srtt_us = 0; +		newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);  		newicsk->icsk_rto = TCP_TIMEOUT_INIT;  		newtp->packets_out = 0; @@ -428,7 +457,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,  		tcp_set_ca_state(newsk, TCP_CA_Open);  		tcp_init_xmit_timers(newsk); -		skb_queue_head_init(&newtp->out_of_order_queue); +		__skb_queue_head_init(&newtp->out_of_order_queue);  		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;  		newtp->rx_opt.saw_tstamp = 0; @@ -747,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,  					    skb->len);  		/* Wakeup parent, send SIGIO */  		if (state == TCP_SYN_RECV && child->sk_state != state) -			parent->sk_data_ready(parent, 0); +			parent->sk_data_ready(parent);  	} else {  		/* Alas, it is possible again, because we do lookup  		 * in main socket hash table and lock on listening diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 3a7525e6c08..55046ecd083 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -14,10 +14,11 @@  #include <net/tcp.h>  #include <net/protocol.h> -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, +struct sk_buff *tcp_gso_segment(struct sk_buff *skb,  				netdev_features_t features)  {  	struct sk_buff *segs = ERR_PTR(-EINVAL); +	unsigned int sum_truesize = 0;  	struct tcphdr *th;  	unsigned int thlen;  	unsigned int seq; @@ -56,8 +57,12 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,  			       SKB_GSO_TCP_ECN |  			       SKB_GSO_TCPV6 |  			       SKB_GSO_GRE | +			       SKB_GSO_GRE_CSUM | +			       SKB_GSO_IPIP | +			       SKB_GSO_SIT |  			       SKB_GSO_MPLS |  			       SKB_GSO_UDP_TUNNEL | +			       SKB_GSO_UDP_TUNNEL_CSUM |  			       0) ||  			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))  			goto out; @@ -94,21 +99,13 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,  		th->check = newcheck;  		if (skb->ip_summed != CHECKSUM_PARTIAL) -			th->check = -			     csum_fold(csum_partial(skb_transport_header(skb), -						    thlen, skb->csum)); +			th->check = gso_make_checksum(skb, ~th->check);  		seq += mss;  		if (copy_destructor) {  			skb->destructor = gso_skb->destructor;  			skb->sk = gso_skb->sk; -			/* {tcp|sock}_wfree() use exact truesize accounting : -			 * sum(skb->truesize) MUST be exactly be gso_skb->truesize -			 * So we account mss bytes of 'true size' for each segment. -			 * The last segment will contain the remaining. 
-			 */ -			skb->truesize = mss; -			gso_skb->truesize -= mss; +			sum_truesize += skb->truesize;  		}  		skb = skb->next;  		th = tcp_hdr(skb); @@ -125,7 +122,9 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,  	if (copy_destructor) {  		swap(gso_skb->sk, skb->sk);  		swap(gso_skb->destructor, skb->destructor); -		swap(gso_skb->truesize, skb->truesize); +		sum_truesize += skb->truesize; +		atomic_add(sum_truesize - gso_skb->truesize, +			   &skb->sk->sk_wmem_alloc);  	}  	delta = htonl(oldlen + (skb_tail_pointer(skb) - @@ -134,12 +133,10 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,  	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +  				(__force u32)delta));  	if (skb->ip_summed != CHECKSUM_PARTIAL) -		th->check = csum_fold(csum_partial(skb_transport_header(skb), -						   thlen, skb->csum)); +		th->check = gso_make_checksum(skb, ~th->check);  out:  	return segs;  } -EXPORT_SYMBOL(tcp_tso_segment);  struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)  { @@ -198,7 +195,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)  	goto out_check_final;  found: -	flush = NAPI_GRO_CB(p)->flush; +	/* Include the IP ID check below from the inner most IP hdr */ +	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;  	flush |= (__force int)(flags & TCP_FLAG_CWR);  	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &  		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); @@ -231,17 +229,16 @@ out_check_final:  		pp = head;  out: -	NAPI_GRO_CB(skb)->flush |= flush; +	NAPI_GRO_CB(skb)->flush |= (flush != 0);  	return pp;  } -EXPORT_SYMBOL(tcp_gro_receive);  int tcp_gro_complete(struct sk_buff *skb)  {  	struct tcphdr *th = tcp_hdr(skb); -	skb->csum_start = skb_transport_header(skb) - skb->head; +	skb->csum_start = (unsigned char *)th - skb->head;  	skb->csum_offset = offsetof(struct tcphdr, check);  	skb->ip_summed = CHECKSUM_PARTIAL; @@ -273,46 +270,46 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb)  static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)  { +	/* Use the IP hdr immediately proceeding for this transport */  	const struct iphdr *iph = skb_gro_network_header(skb);  	__wsum wsum; -	__sum16 sum; + +	/* Don't bother verifying checksum if we're going to flush anyway. 
*/ +	if (NAPI_GRO_CB(skb)->flush) +		goto skip_csum; + +	wsum = NAPI_GRO_CB(skb)->csum;  	switch (skb->ip_summed) { +	case CHECKSUM_NONE: +		wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), +				    0); + +		/* fall through */ +  	case CHECKSUM_COMPLETE:  		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, -				  skb->csum)) { +				  wsum)) {  			skb->ip_summed = CHECKSUM_UNNECESSARY;  			break;  		} -flush: +  		NAPI_GRO_CB(skb)->flush = 1;  		return NULL; - -	case CHECKSUM_NONE: -		wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, -					  skb_gro_len(skb), IPPROTO_TCP, 0); -		sum = csum_fold(skb_checksum(skb, -					     skb_gro_offset(skb), -					     skb_gro_len(skb), -					     wsum)); -		if (sum) -			goto flush; - -		skb->ip_summed = CHECKSUM_UNNECESSARY; -		break;  	} +skip_csum:  	return tcp_gro_receive(head, skb);  } -static int tcp4_gro_complete(struct sk_buff *skb) +static int tcp4_gro_complete(struct sk_buff *skb, int thoff)  {  	const struct iphdr *iph = ip_hdr(skb);  	struct tcphdr *th = tcp_hdr(skb); -	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), -				  iph->saddr, iph->daddr, 0); -	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; +	th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, +				  iph->daddr, 0); +	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;  	return tcp_gro_complete(skb);  } @@ -320,7 +317,7 @@ static int tcp4_gro_complete(struct sk_buff *skb)  static const struct net_offload tcpv4_offload = {  	.callbacks = {  		.gso_send_check	=	tcp_v4_gso_send_check, -		.gso_segment	=	tcp_tso_segment, +		.gso_segment	=	tcp_gso_segment,  		.gro_receive	=	tcp4_gro_receive,  		.gro_complete	=	tcp4_gro_complete,  	}, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c6f01f2cdb3..179b51e6bda 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)  	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {  		tcp_rearm_rto(sk);  	} + +	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, +		      tcp_skb_pcount(skb));  }  /* SND.NXT, if window was not shrunk. @@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);  static u16 tcp_select_window(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); +	u32 old_win = tp->rcv_wnd;  	u32 cur_win = tcp_receive_window(tp);  	u32 new_win = __tcp_select_window(sk); @@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)  		 *  		 * Relax Will Robinson.  		 */ +		if (new_win == 0) +			NET_INC_STATS(sock_net(sk), +				      LINUX_MIB_TCPWANTZEROWINDOWADV);  		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);  	}  	tp->rcv_wnd = new_win; @@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk)  	new_win >>= tp->rx_opt.rcv_wscale;  	/* If we advertise zero window, disable fast path. 
*/ -	if (new_win == 0) +	if (new_win == 0) {  		tp->pred_flags = 0; +		if (old_win) +			NET_INC_STATS(sock_net(sk), +				      LINUX_MIB_TCPTOZEROWINDOWADV); +	} else if (old_win == 0) { +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); +	}  	return new_win;  } @@ -363,15 +376,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,   */  static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)  { +	struct skb_shared_info *shinfo = skb_shinfo(skb); +  	skb->ip_summed = CHECKSUM_PARTIAL;  	skb->csum = 0;  	TCP_SKB_CB(skb)->tcp_flags = flags;  	TCP_SKB_CB(skb)->sacked = 0; -	skb_shinfo(skb)->gso_segs = 1; -	skb_shinfo(skb)->gso_size = 0; -	skb_shinfo(skb)->gso_type = 0; +	shinfo->gso_segs = 1; +	shinfo->gso_size = 0; +	shinfo->gso_type = 0;  	TCP_SKB_CB(skb)->seq = seq;  	if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -406,7 +421,7 @@ struct tcp_out_options {   * Beware: Something in the Internet is very sensitive to the ordering of   * TCP options, we learned this through the hard way, so be careful here.   * Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with   * the ordering which we have been using if we want to keep working with   * those broken things (not that it currently hurts anybody as there isn't   * particular reason why the ordering would need to be changed). @@ -612,7 +627,7 @@ static unsigned int tcp_synack_options(struct sock *sk,  		if (unlikely(!ireq->tstamp_ok))  			remaining -= TCPOLEN_SACKPERM_ALIGNED;  	} -	if (foc != NULL) { +	if (foc != NULL && foc->len >= 0) {  		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;  		need = (need + 3) & ~3U;  /* Align to 32 bits */  		if (remaining >= need) { @@ -679,7 +694,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb   *   * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb   * needs to be reallocated in a driver. - * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc   *   * Since transmit from skb destructor is forbidden, we use a tasklet   * to process all sockets that eventually need to send more skbs. @@ -696,12 +711,13 @@ static void tcp_tsq_handler(struct sock *sk)  	if ((1 << sk->sk_state) &  	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |  	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) -		tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); +		tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, +			       0, GFP_ATOMIC);  }  /* - * One tasklest per cpu tries to send more skbs. + * One tasklet per cpu tries to send more skbs.   
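
The tcp_select_window() changes above add three MIB counters for zero-window behaviour: TCPWANTZEROWINDOWADV when the stack would like to shrink to zero but may not, TCPTOZEROWINDOWADV when a previously non-zero advertisement actually drops to zero, and TCPFROMZEROWINDOWADV when it reopens. A simplified model of those transitions (old_win is the last advertised value, cur_win the part of it still open, new_win the freshly computed offer; the kernel's ALIGN of cur_win is left out):

#include <stdint.h>

struct zwin_stats {
    unsigned long want_zero;   /* wanted to shrink to zero but could not */
    unsigned long to_zero;     /* advertised window actually hit zero    */
    unsigned long from_zero;   /* window reopened after being zero       */
};

static uint32_t select_window(uint32_t old_win, uint32_t cur_win,
                              uint32_t new_win, struct zwin_stats *st)
{
    if (new_win < cur_win) {
        /* Never shrink what the peer already believes is open. */
        if (new_win == 0)
            st->want_zero++;
        new_win = cur_win;
    }

    if (new_win == 0) {
        if (old_win)
            st->to_zero++;
    } else if (old_win == 0) {
        st->from_zero++;
    }
    return new_win;
}
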
* We run in tasklet context but need to disable irqs when - * transfering tsq->head because tcp_wfree() might + * transferring tsq->head because tcp_wfree() might   * interrupt us (non NAPI drivers)   */  static void tcp_tasklet_func(unsigned long data) @@ -764,6 +780,17 @@ void tcp_release_cb(struct sock *sk)  	if (flags & (1UL << TCP_TSQ_DEFERRED))  		tcp_tsq_handler(sk); +	/* Here begins the tricky part : +	 * We are called from release_sock() with : +	 * 1) BH disabled +	 * 2) sk_lock.slock spinlock held +	 * 3) socket owned by us (sk->sk_lock.owned == 1) +	 * +	 * But following code is meant to be called from BH handlers, +	 * so we should keep BH disabled, but early release socket ownership +	 */ +	sock_release_ownership(sk); +  	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {  		tcp_write_timer_handler(sk);  		__sock_put(sk); @@ -795,7 +822,7 @@ void __init tcp_tasklet_init(void)  /*   * Write buffer destructor automatically called from kfree_skb. - * We cant xmit new skbs from this context, as we might already + * We can't xmit new skbs from this context, as we might already   * hold qdisc lock.   */  void tcp_wfree(struct sk_buff *skb) @@ -850,19 +877,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	BUG_ON(!skb || !tcp_skb_pcount(skb)); -	/* If congestion control is doing timestamping, we must -	 * take such a timestamp before we potentially clone/copy. -	 */ -	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) -		__net_timestamp(skb); - -	if (likely(clone_it)) { -		const struct sk_buff *fclone = skb + 1; - -		if (unlikely(skb->fclone == SKB_FCLONE_ORIG && -			     fclone->fclone == SKB_FCLONE_CLONE)) -			NET_INC_STATS_BH(sock_net(sk), -					 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); +	if (clone_it) { +		skb_mstamp_get(&skb->skb_mstamp);  		if (unlikely(skb_cloned(skb)))  			skb = pskb_copy(skb, gfp_mask); @@ -870,6 +886,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  			skb = skb_clone(skb, gfp_mask);  		if (unlikely(!skb))  			return -ENOBUFS; +		/* Our usage of tstamp should remain private */ +		skb->tstamp.tv64 = 0;  	}  	inet = inet_sk(sk); @@ -956,7 +974,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,  			      tcp_skb_pcount(skb)); -	err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); +	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);  	if (likely(err <= 0))  		return err; @@ -986,18 +1004,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)  static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,  				 unsigned int mss_now)  { -	if (skb->len <= mss_now || !sk_can_gso(sk) || -	    skb->ip_summed == CHECKSUM_NONE) { +	struct skb_shared_info *shinfo = skb_shinfo(skb); + +	/* Make sure we own this skb before messing gso_size/gso_segs */ +	WARN_ON_ONCE(skb_cloned(skb)); + +	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {  		/* Avoid the costly divide in the normal  		 * non-TSO case.  		 
*/ -		skb_shinfo(skb)->gso_segs = 1; -		skb_shinfo(skb)->gso_size = 0; -		skb_shinfo(skb)->gso_type = 0; +		shinfo->gso_segs = 1; +		shinfo->gso_size = 0; +		shinfo->gso_type = 0;  	} else { -		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); -		skb_shinfo(skb)->gso_size = mss_now; -		skb_shinfo(skb)->gso_type = sk->sk_gso_type; +		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); +		shinfo->gso_size = mss_now; +		shinfo->gso_type = sk->sk_gso_type;  	}  } @@ -1052,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de   * Remember, these are still headerless SKBs at this point.   */  int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, -		 unsigned int mss_now) +		 unsigned int mss_now, gfp_t gfp)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *buff; @@ -1067,13 +1089,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,  	if (nsize < 0)  		nsize = 0; -	if (skb_cloned(skb) && -	    skb_is_nonlinear(skb) && -	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(skb, gfp))  		return -ENOMEM;  	/* Get a new skb... force flag on. */ -	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); +	buff = sk_stream_alloc_skb(sk, nsize, gfp);  	if (buff == NULL)  		return -ENOMEM; /* We'll just try again later. */ @@ -1146,6 +1166,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,   */  static void __pskb_trim_head(struct sk_buff *skb, int len)  { +	struct skb_shared_info *shinfo;  	int i, k, eat;  	eat = min_t(int, len, skb_headlen(skb)); @@ -1157,23 +1178,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)  	}  	eat = len;  	k = 0; -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { -		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); +	shinfo = skb_shinfo(skb); +	for (i = 0; i < shinfo->nr_frags; i++) { +		int size = skb_frag_size(&shinfo->frags[i]);  		if (size <= eat) {  			skb_frag_unref(skb, i);  			eat -= size;  		} else { -			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; +			shinfo->frags[k] = shinfo->frags[i];  			if (eat) { -				skb_shinfo(skb)->frags[k].page_offset += eat; -				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); +				shinfo->frags[k].page_offset += eat; +				skb_frag_size_sub(&shinfo->frags[k], eat);  				eat = 0;  			}  			k++;  		}  	} -	skb_shinfo(skb)->nr_frags = k; +	shinfo->nr_frags = k;  	skb_reset_tail_pointer(skb);  	skb->data_len -= len; @@ -1358,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)  	return mss_now;  } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); -	if (tp->packets_out >= tp->snd_cwnd) { +	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && +	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { +		/* Limited by application or receiver window. 
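
__pskb_trim_head() above only caches skb_shinfo(skb) in a local; the trimming algorithm itself is unchanged. For reference, its shape in plain C is: consume len bytes from the paged area, drop fully eaten fragments and compact the survivors toward slot 0 (fields are simplified and there is no page refcounting here):

struct frag { unsigned int off, size; };   /* page offset + length */

static unsigned int trim_head(struct frag *frags, unsigned int *nr_frags,
                              unsigned int len)
{
    unsigned int i, k = 0, eat = len;

    for (i = 0; i < *nr_frags; i++) {
        unsigned int size = frags[i].size;

        if (size <= eat) {
            eat -= size;               /* fragment fully consumed: drop it */
        } else {
            frags[k] = frags[i];       /* keep it, shifted down */
            if (eat) {
                frags[k].off  += eat;  /* partially consumed head */
                frags[k].size -= eat;
                eat = 0;
            }
            k++;
        }
    }
    *nr_frags = k;
    return len - eat;                  /* bytes actually trimmed */
}
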
*/ +		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); +		u32 win_used = max(tp->snd_cwnd_used, init_win); +		if (win_used < tp->snd_cwnd) { +			tp->snd_ssthresh = tcp_current_ssthresh(sk); +			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; +		} +		tp->snd_cwnd_used = 0; +	} +	tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	/* Track the maximum number of outstanding packets in each +	 * window, and remember whether we were cwnd-limited then. +	 */ +	if (!before(tp->snd_una, tp->max_packets_seq) || +	    tp->packets_out > tp->max_packets_out) { +		tp->max_packets_out = tp->packets_out; +		tp->max_packets_seq = tp->snd_nxt; +		tp->is_cwnd_limited = is_cwnd_limited; +	} + +	if (tcp_is_cwnd_limited(sk)) {  		/* Network is feed fully. */  		tp->snd_cwnd_used = 0;  		tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1378,23 +1431,51 @@ static void tcp_cwnd_validate(struct sock *sk)  	}  } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ +	return after(tp->snd_sml, tp->snd_una) && +		!after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + *  skb_pcount = skb->len / mss_now + */ +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, +				const struct sk_buff *skb) +{ +	if (skb->len < tcp_skb_pcount(skb) * mss_now) +		tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + *    With Minshall's modification: all sent small packets are ACKed.   
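
Two related pieces above: tcp_cwnd_validate() now records, per window, the largest number of packets in flight and whether cwnd was the limiting factor, and tcp_cwnd_application_limited() performs the RFC 2861 shrink when the window went unused for a whole RTO. A condensed sketch of both (plain fields stand in for tcp_sock; the real code saves ssthresh through tcp_current_ssthresh() and also keys the tracking on snd_una versus max_packets_seq):

struct mini_cc {
    unsigned int snd_cwnd, snd_ssthresh, snd_cwnd_used;
    unsigned int max_packets_out;
    int is_cwnd_limited;
};

static void track_cwnd_usage(struct mini_cc *cc, unsigned int packets_out,
                             int cwnd_limited_now)
{
    if (packets_out > cc->max_packets_out) {
        cc->max_packets_out = packets_out;
        cc->is_cwnd_limited = cwnd_limited_now;   /* remembered per window */
    }
}

static void cwnd_application_limited(struct mini_cc *cc, unsigned int init_cwnd)
{
    unsigned int win_used = cc->snd_cwnd_used > init_cwnd ?
                            cc->snd_cwnd_used : init_cwnd;

    if (win_used < cc->snd_cwnd) {
        cc->snd_ssthresh = cc->snd_cwnd;              /* simplified save   */
        cc->snd_cwnd = (cc->snd_cwnd + win_used) / 2; /* halfway to usage  */
    }
    cc->snd_cwnd_used = 0;
}
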
*/ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, -					unsigned int mss_now, unsigned int max_segs) +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, +			    int nonagle) +{ +	return partial && +		((nonagle & TCP_NAGLE_CORK) || +		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, +					const struct sk_buff *skb, +					unsigned int mss_now, +					unsigned int max_segs, +					int nonagle)  {  	const struct tcp_sock *tp = tcp_sk(sk); -	u32 needed, window, max_len; +	u32 partial, needed, window, max_len;  	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;  	max_len = mss_now * max_segs; @@ -1407,7 +1488,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b  	if (max_len <= needed)  		return max_len; -	return needed - needed % mss_now; +	partial = needed % mss_now; +	/* If last segment is not a full MSS, check if Nagle rules allow us +	 * to include this last segment in this skb. +	 * Otherwise, we'll split the skb at last MSS boundary +	 */ +	if (tcp_nagle_check(partial != 0, tp, nonagle)) +		return needed - partial; + +	return needed;  }  /* Can at least one segment of SKB be sent right now, according to the @@ -1447,28 +1536,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,  	return tso_segs;  } -/* Minshall's variant of the Nagle send check. */ -static inline bool tcp_minshall_check(const struct tcp_sock *tp) -{ -	return after(tp->snd_sml, tp->snd_una) && -		!after(tp->snd_sml, tp->snd_nxt); -} - -/* Return false, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - *    With Minshall's modification: all sent small packets are ACKed. - */ -static inline bool tcp_nagle_check(const struct tcp_sock *tp, -				  const struct sk_buff *skb, -				  unsigned int mss_now, int nonagle) -{ -	return skb->len < mss_now && -		((nonagle & TCP_NAGLE_CORK) || -		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -}  /* Return true if the Nagle test allows this packet to be   * sent now. @@ -1489,7 +1556,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf  	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))  		return true; -	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) +	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))  		return true;  	return false; @@ -1558,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,  	/* All of a TSO frame must be composed of paged data.  */  	if (skb->len != skb->data_len) -		return tcp_fragment(sk, skb, len, mss_now); +		return tcp_fragment(sk, skb, len, mss_now, gfp);  	buff = sk_stream_alloc_skb(sk, 0, gfp);  	if (unlikely(buff == NULL)) @@ -1601,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,   *   * This algorithm is from John Heffner.   
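
tcp_nagle_check() now receives a precomputed "partial" flag and tcp_mss_split_point() may let a sub-MSS tail go out when Nagle permits it, instead of always trimming to an MSS boundary. A condensed, self-contained sketch of the combined Minshall/Nagle decision and the split point (the sequence-space helper and the CORK flag value are simplified stand-ins):

#include <stdbool.h>
#include <stdint.h>

#define NAGLE_CORK 0x2                 /* stand-in for TCP_NAGLE_CORK */

struct mini_tp {
    uint32_t snd_una, snd_nxt, snd_sml;   /* TCP sequence numbers */
    unsigned int packets_out;
};

static bool seq_after(uint32_t a, uint32_t b) { return (int32_t)(a - b) > 0; }

/* A previously sent small (sub-MSS) segment is still unacknowledged. */
static bool minshall_blocked(const struct mini_tp *tp)
{
    return seq_after(tp->snd_sml, tp->snd_una) &&
           !seq_after(tp->snd_sml, tp->snd_nxt);
}

/* True if the sub-MSS tail has to wait (corked, or Minshall says so). */
static bool nagle_check(bool partial, const struct mini_tp *tp, int nonagle)
{
    return partial &&
           ((nonagle & NAGLE_CORK) ||
            (!nonagle && tp->packets_out && minshall_blocked(tp)));
}

/* How much of the skb may be sent right now. */
static unsigned int mss_split_point(const struct mini_tp *tp,
                                    unsigned int skb_len, unsigned int window,
                                    unsigned int mss, unsigned int max_segs,
                                    int nonagle)
{
    unsigned int max_len = mss * max_segs;
    unsigned int needed = skb_len < window ? skb_len : window;
    unsigned int partial;

    if (max_len <= needed)
        return max_len;                    /* limited by the GSO budget */

    partial = needed % mss;
    if (nagle_check(partial != 0, tp, nonagle))
        return needed - partial;           /* hold back the odd tail */
    return needed;
}
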
*/ -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, +				 bool *is_cwnd_limited)  {  	struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1665,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)  	if (!tp->tso_deferred)  		tp->tso_deferred = 1 | (jiffies << 1); +	if (cong_win < send_win && cong_win < skb->len) +		*is_cwnd_limited = true; +  	return true;  send_now: @@ -1825,6 +1896,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  	unsigned int tso_segs, sent_pkts;  	int cwnd_quota;  	int result; +	bool is_cwnd_limited = false;  	sent_pkts = 0; @@ -1849,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		cwnd_quota = tcp_cwnd_test(tp, skb);  		if (!cwnd_quota) { +			is_cwnd_limited = true;  			if (push_one == 2)  				/* Force out a loss probe pkt. */  				cwnd_quota = 1; @@ -1865,7 +1938,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  						      nonagle : TCP_NAGLE_PUSH))))  				break;  		} else { -			if (!push_one && tcp_tso_should_defer(sk, skb)) +			if (!push_one && +			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))  				break;  		} @@ -1875,12 +1949,22 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		 *  - better RTT estimation and ACK scheduling  		 *  - faster recovery  		 *  - high rates +		 * Alas, some drivers / subsystems require a fair amount +		 * of queued bytes to ensure line rate. +		 * One example is wifi aggregation (802.11 AMPDU)  		 */ -		limit = max(skb->truesize, sk->sk_pacing_rate >> 10); +		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, +			      sk->sk_pacing_rate >> 10);  		if (atomic_read(&sk->sk_wmem_alloc) > limit) {  			set_bit(TSQ_THROTTLED, &tp->tsq_flags); -			break; +			/* It is possible TX completion already happened +			 * before we set TSQ_THROTTLED, so we must +			 * test again the condition. +			 */ +			smp_mb__after_atomic(); +			if (atomic_read(&sk->sk_wmem_alloc) > limit) +				break;  		}  		limit = mss_now; @@ -1888,7 +1972,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  			limit = tcp_mss_split_point(sk, skb, mss_now,  						    min_t(unsigned int,  							  cwnd_quota, -							  sk->sk_gso_max_segs)); +							  sk->sk_gso_max_segs), +						    nonagle);  		if (skb->len > limit &&  		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -1919,7 +2004,7 @@ repair:  		/* Send one loss probe per tail loss episode. */  		if (push_one != 2)  			tcp_schedule_loss_probe(sk); -		tcp_cwnd_validate(sk); +		tcp_cwnd_validate(sk, is_cwnd_limited);  		return false;  	}  	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); @@ -1930,7 +2015,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	u32 timeout, tlp_time_stamp, rto_time_stamp; -	u32 rtt = tp->srtt >> 3; +	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);  	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))  		return false; @@ -1952,7 +2037,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)  	/* Schedule a loss probe in 2*RTT for SACK capable connections  	 * in Open state, that are either limited by cwnd or application.  	 
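
The TSQ hunk above closes a race: a TX completion can drain the queue between the first sk_wmem_alloc test and setting TSQ_THROTTLED, so the limit is re-checked after a full barrier before giving up. The same shape in user-space C11 atomics (these are stand-ins for set_bit() and smp_mb__after_atomic(), not equivalents):

#include <stdatomic.h>
#include <stdbool.h>

static bool should_stop_sending(atomic_long *bytes_queued,
                                atomic_int *throttled, long limit)
{
    if (atomic_load(bytes_queued) <= limit)
        return false;

    atomic_store(throttled, 1);     /* ask the completion path to reschedule us */

    /* Full barrier, then test again: completion may already have run and
     * missed the flag, so we must not stop on a stale reading. */
    atomic_thread_fence(memory_order_seq_cst);
    return atomic_load(bytes_queued) > limit;
}
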
*/ -	if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || +	if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||  	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)  		return false; @@ -1983,6 +2068,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)  	return true;  } +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. + */ +static bool skb_still_in_host_queue(const struct sock *sk, +				    const struct sk_buff *skb) +{ +	const struct sk_buff *fclone = skb + 1; + +	if (unlikely(skb->fclone == SKB_FCLONE_ORIG && +		     fclone->fclone == SKB_FCLONE_CLONE)) { +		NET_INC_STATS_BH(sock_net(sk), +				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); +		return true; +	} +	return false; +} +  /* When probe timeout (PTO) fires, send a new segment if one exists, else   * retransmit the last segment.   */ @@ -2008,12 +2112,16 @@ void tcp_send_loss_probe(struct sock *sk)  	if (WARN_ON(!skb))  		goto rearm_timer; +	if (skb_still_in_host_queue(sk, skb)) +		goto rearm_timer; +  	pcount = tcp_skb_pcount(skb);  	if (WARN_ON(!pcount))  		goto rearm_timer;  	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { -		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) +		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, +					  GFP_ATOMIC)))  			goto rearm_timer;  		skb = tcp_write_queue_tail(sk);  	} @@ -2021,9 +2129,7 @@ void tcp_send_loss_probe(struct sock *sk)  	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))  		goto rearm_timer; -	/* Probe with zero data doesn't trigger fast recovery. */ -	if (skb->len > 0) -		err = __tcp_retransmit_skb(sk, skb); +	err = __tcp_retransmit_skb(sk, skb);  	/* Record snd_nxt for loss detection. */  	if (likely(!err)) @@ -2037,7 +2143,6 @@ rearm_timer:  	if (likely(!err))  		NET_INC_STATS_BH(sock_net(sk),  				 LINUX_MIB_TCPLOSSPROBES); -	return;  }  /* Push out any pending frames which were held back due to @@ -2135,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk)  	 */  	int mss = icsk->icsk_ack.rcv_mss;  	int free_space = tcp_space(sk); -	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); +	int allowed_space = tcp_full_space(sk); +	int full_space = min_t(int, tp->window_clamp, allowed_space);  	int window;  	if (mss > full_space) @@ -2148,7 +2254,19 @@ u32 __tcp_select_window(struct sock *sk)  			tp->rcv_ssthresh = min(tp->rcv_ssthresh,  					       4U * tp->advmss); -		if (free_space < mss) +		/* free_space might become our new window, make sure we don't +		 * increase it due to wscale. +		 */ +		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + +		/* if free space is less than mss estimate, or is below 1/16th +		 * of the maximum allowed, try to move to zero-window, else +		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and +		 * new incoming data is dropped due to memory limits. +		 * With large window, mss test triggers way too late in order +		 * to announce zero window in time before rmem limit kicks in. 
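
The __tcp_select_window() comment above ends here; the matching code rounds free_space down to the receive window scale and jumps to a zero window once it is below the MSS estimate or 1/16th of the whole receive space. A reduced sketch of just that decision (the real function also honours window_clamp and rcv_ssthresh):

static unsigned int advertise_window(unsigned int free_space,
                                     unsigned int allowed_space,
                                     unsigned int mss,
                                     unsigned int rcv_wscale)
{
    /* The advertised value is shifted right by rcv_wscale on the wire, so
     * anything below that granularity would silently inflate the window. */
    free_space &= ~((1u << rcv_wscale) - 1);

    if (free_space < (allowed_space >> 4) || free_space < mss)
        return 0;    /* announce zero window before rmem limits drop data */

    return free_space;
}
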
+		 */ +		if (free_space < (allowed_space >> 4) || free_space < mss)  			return 0;  	} @@ -2303,6 +2421,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	struct tcp_sock *tp = tcp_sk(sk);  	struct inet_connection_sock *icsk = inet_csk(sk);  	unsigned int cur_mss; +	int err;  	/* Inconslusive MTU probe */  	if (icsk->icsk_mtup.probe_size) { @@ -2316,6 +2435,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))  		return -EAGAIN; +	if (skb_still_in_host_queue(sk, skb)) +		return -EBUSY; +  	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {  		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))  			BUG(); @@ -2338,12 +2460,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  		return -EAGAIN;  	if (skb->len > cur_mss) { -		if (tcp_fragment(sk, skb, cur_mss, cur_mss)) +		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))  			return -ENOMEM; /* We'll try again later. */  	} else {  		int oldpcount = tcp_skb_pcount(skb);  		if (unlikely(oldpcount > 1)) { +			if (skb_unclone(skb, GFP_ATOMIC)) +				return -ENOMEM;  			tcp_init_tso_segs(sk, skb, cur_mss);  			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));  		} @@ -2351,21 +2475,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	tcp_retrans_try_collapse(sk, skb, cur_mss); -	/* Some Solaris stacks overoptimize and ignore the FIN on a -	 * retransmit when old data is attached.  So strip it off -	 * since it is cheap to do so and saves bytes on the network. -	 */ -	if (skb->len > 0 && -	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && -	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { -		if (!pskb_trim(skb, 0)) { -			/* Reuse, even though it does some unnecessary work */ -			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, -					     TCP_SKB_CB(skb)->tcp_flags); -			skb->ip_summed = CHECKSUM_NONE; -		} -	} -  	/* Make a copy, if the first transmission SKB clone we made  	 * is still in somebody's hands, else make a clone.  	 */ @@ -2379,11 +2488,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  		     skb_headroom(skb) >= 0xFFFF)) {  		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,  						   GFP_ATOMIC); -		return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -			      -ENOBUFS; +		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : +			     -ENOBUFS;  	} else { -		return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); +		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); +	} + +	if (likely(!err)) { +		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; +		/* Update global TCP statistics. */ +		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); +		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) +			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); +		tp->total_retrans++;  	} +	return err;  }  int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) @@ -2392,11 +2511,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	int err = __tcp_retransmit_skb(sk, skb);  	if (err == 0) { -		/* Update global TCP statistics. 
*/ -		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - -		tp->total_retrans++; -  #if FASTRETRANS_DEBUG > 0  		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {  			net_dbg_ratelimited("retrans_out leaked\n"); @@ -2411,15 +2525,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  		if (!tp->retrans_stamp)  			tp->retrans_stamp = TCP_SKB_CB(skb)->when; -		tp->undo_retrans += tcp_skb_pcount(skb); -  		/* snd_nxt is stored to detect loss of retransmitted segment,  		 * see tcp_input.c tcp_sacktag_write_queue().  		 */  		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; -	} else { +	} else if (err != -EBUSY) {  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);  	} + +	if (tp->undo_retrans < 0) +		tp->undo_retrans = 0; +	tp->undo_retrans += tcp_skb_pcount(skb);  	return err;  } @@ -2680,7 +2796,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	int tcp_header_size;  	int mss; -	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); +	skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);  	if (unlikely(!skb)) {  		dst_release(dst);  		return NULL; @@ -2695,27 +2811,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)  		mss = tp->rx_opt.user_mss; -	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ -		__u8 rcv_wscale; -		/* Set this up on the first call only */ -		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); - -		/* limit the window selection if the user enforce a smaller rx buffer */ -		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && -		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) -			req->window_clamp = tcp_full_space(sk); - -		/* tcp_full_space because it is guaranteed to be the first packet */ -		tcp_select_initial_window(tcp_full_space(sk), -			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), -			&req->rcv_wnd, -			&req->window_clamp, -			ireq->wscale_ok, -			&rcv_wscale, -			dst_metric(dst, RTAX_INITRWND)); -		ireq->rcv_wscale = rcv_wscale; -	} -  	memset(&opts, 0, sizeof(opts));  #ifdef CONFIG_SYN_COOKIES  	if (unlikely(req->cookie_ts)) @@ -2734,8 +2829,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	th->syn = 1;  	th->ack = 1;  	TCP_ECN_make_synack(req, th); -	th->source = ireq->loc_port; -	th->dest = ireq->rmt_port; +	th->source = htons(ireq->ir_num); +	th->dest = ireq->ir_rmt_port;  	/* Setting of flags are superfluous here for callers (and ECE is  	 * not even correctly set)  	 */ @@ -2750,7 +2845,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	th->window = htons(min(req->rcv_wnd, 65535U));  	tcp_options_write((__be32 *)(th + 1), tp, &opts);  	th->doff = (tcp_header_size >> 2); -	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); +	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);  #ifdef CONFIG_TCP_MD5SIG  	/* Okay, we have all we need - do the md5 hash if needed */ @@ -2765,7 +2860,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  EXPORT_SYMBOL(tcp_make_synack);  /* Do all connect socket setups that can be done AF independent. 
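
After the hunks above, __tcp_retransmit_skb() does its own success accounting (EVER_RETRANS, RETRANSSEGS, TCPSYNRETRANS, total_retrans), a segment still sitting in a qdisc or driver queue returns -EBUSY and is not counted as a retransmit failure, and undo_retrans is clamped at zero before being bumped. Reduced to its control flow (names are illustrative, not kernel API):

#include <errno.h>

struct rtx_stats {
    unsigned long retrans_segs;
    unsigned long retrans_fail;
    long          undo_retrans;
};

static int retransmit_one(struct rtx_stats *st, int xmit_err, int pcount)
{
    if (xmit_err == 0)
        st->retrans_segs++;          /* counted only on a real (re)transmit  */
    else if (xmit_err != -EBUSY)
        st->retrans_fail++;          /* "still queued below" is not a failure */

    if (st->undo_retrans < 0)        /* see the undo handling in tcp_input.c  */
        st->undo_retrans = 0;
    st->undo_retrans += pcount;

    return xmit_err;
}
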
*/ -void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk)  {  	const struct dst_entry *dst = __sk_dst_get(sk);  	struct tcp_sock *tp = tcp_sk(sk); @@ -2887,7 +2982,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)  	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -  		MAX_TCP_OPTION_SPACE; -	syn_data = skb_copy_expand(syn, skb_headroom(syn), space, +	space = min_t(size_t, space, fo->size); + +	/* limit to order-0 allocations */ +	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + +	syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,  				   sk->sk_allocation);  	if (syn_data == NULL)  		goto fallback; @@ -2917,9 +3017,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)  	tcp_connect_queue_skb(sk, data);  	fo->copied = data->len; +	/* syn_data is about to be sent, we need to take current time stamps +	 * for the packets that are in write queue : SYN packet and DATA +	 */ +	skb_mstamp_get(&syn->skb_mstamp); +	data->skb_mstamp = syn->skb_mstamp; +  	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {  		tp->syn_data = (fo->copied > 0); -		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);  		goto done;  	}  	syn_data = NULL; @@ -3007,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk)  		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements  		 * directly.  		 */ -		if (tp->srtt) { -			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); +		if (tp->srtt_us) { +			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), +					TCP_DELACK_MIN);  			if (rtt < max_ato)  				max_ato = rtt; @@ -3106,7 +3213,6 @@ void tcp_send_window_probe(struct sock *sk)  {  	if (sk->sk_state == TCP_ESTABLISHED) {  		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; -		tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;  		tcp_xmit_probe_skb(sk, 0);  	}  } @@ -3137,7 +3243,7 @@ int tcp_write_wakeup(struct sock *sk)  		    skb->len > mss) {  			seg_size = min(seg_size, mss);  			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; -			if (tcp_fragment(sk, skb, seg_size, mss)) +			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))  				return -1;  		} else if (!tcp_skb_pcount(skb))  			tcp_set_skb_tso_segs(sk, skb, mss); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 611beab38a0..3b66610d415 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -38,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper");  MODULE_LICENSE("GPL");  MODULE_VERSION("1.1"); -static int port __read_mostly = 0; +static int port __read_mostly;  MODULE_PARM_DESC(port, "Port to match (0=all)");  module_param(port, int, 0); @@ -46,7 +46,7 @@ static unsigned int bufsize __read_mostly = 4096;  MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");  module_param(bufsize, uint, 0); -static unsigned int fwmark __read_mostly = 0; +static unsigned int fwmark __read_mostly;  MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");  module_param(fwmark, uint, 0); @@ -101,22 +101,6 @@ static inline int tcp_probe_avail(void)  		si4.sin_addr.s_addr = inet->inet_##mem##addr;	\  	} while (0)						\ -#if IS_ENABLED(CONFIG_IPV6) -#define tcp_probe_copy_fl_to_si6(inet, si6, mem)		\ -	do {							\ -		struct ipv6_pinfo *pi6 = inet->pinet6;		\ -		si6.sin6_family = AF_INET6;			\ -		si6.sin6_port = inet->inet_##mem##port;		\ -		si6.sin6_addr = pi6->mem##addr;			\ -		si6.sin6_flowinfo = 0; /* No need here. */	\ -		si6.sin6_scope_id = 0;	/* No need here. 
*/	\ -	} while (0) -#else -#define tcp_probe_copy_fl_to_si6(fl, si6, mem)			\ -	do {							\ -		memset(&si6, 0, sizeof(si6));			\ -	} while (0) -#endif  /*   * Hook inserted to be called before each receive packet. @@ -147,8 +131,17 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,  				tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);  				break;  			case AF_INET6: -				tcp_probe_copy_fl_to_si6(inet, p->src.v6, s); -				tcp_probe_copy_fl_to_si6(inet, p->dst.v6, d); +				memset(&p->src.v6, 0, sizeof(p->src.v6)); +				memset(&p->dst.v6, 0, sizeof(p->dst.v6)); +#if IS_ENABLED(CONFIG_IPV6) +				p->src.v6.sin6_family = AF_INET6; +				p->src.v6.sin6_port = inet->inet_sport; +				p->src.v6.sin6_addr = inet6_sk(sk)->saddr; + +				p->dst.v6.sin6_family = AF_INET6; +				p->dst.v6.sin6_port = inet->inet_dport; +				p->dst.v6.sin6_addr = sk->sk_v6_daddr; +#endif  				break;  			default:  				BUG(); @@ -161,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,  			p->snd_wnd = tp->snd_wnd;  			p->rcv_wnd = tp->rcv_wnd;  			p->ssthresh = tcp_current_ssthresh(sk); -			p->srtt = tp->srtt >> 3; +			p->srtt = tp->srtt_us >> 3;  			tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);  		} diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 8ce55b8aaec..8250949b885 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,15 +15,15 @@  #define TCP_SCALABLE_AI_CNT	50U  #define TCP_SCALABLE_MD_SCALE	3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else  		tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));  } @@ -38,7 +38,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)  static struct tcp_congestion_ops tcp_scalable __read_mostly = {  	.ssthresh	= tcp_scalable_ssthresh,  	.cong_avoid	= tcp_scalable_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.owner		= THIS_MODULE,  	.name		= "scalable", diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4b85e6f636c..286227abed1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -156,12 +156,19 @@ static bool retransmits_timed_out(struct sock *sk,  static int tcp_write_timeout(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk);  	int retry_until;  	bool do_reset, syn_set = false;  	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { -		if (icsk->icsk_retransmits) +		if (icsk->icsk_retransmits) {  			dst_negative_advice(sk); +			if (tp->syn_fastopen || tp->syn_data) +				tcp_fastopen_cache_set(sk, 0, NULL, true); +			if (tp->syn_data) +				NET_INC_STATS_BH(sock_net(sk), +						 LINUX_MIB_TCPFASTOPENACTIVEFAIL); +		}  		retry_until = icsk->icsk_syn_retries ? 
: sysctl_tcp_syn_retries;  		syn_set = true;  	} else { @@ -374,9 +381,8 @@ void tcp_retransmit_timer(struct sock *sk)  		}  #if IS_ENABLED(CONFIG_IPV6)  		else if (sk->sk_family == AF_INET6) { -			struct ipv6_pinfo *np = inet6_sk(sk);  			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), -				       &np->daddr, +				       &sk->sk_v6_daddr,  				       ntohs(inet->inet_dport), inet->inet_num,  				       tp->snd_una, tp->snd_nxt);  		} diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 80fa2bfd7ed..9a5e05f27f4 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -163,13 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)  	return  min(tp->snd_ssthresh, tp->snd_cwnd-1);  } -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct vegas *vegas = inet_csk_ca(sk);  	if (!vegas->doing_vegas_now) { -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  		return;  	} @@ -194,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  			/* We don't have enough RTT samples to do the Vegas  			 * calculation, so we'll behave like Reno.  			 */ -			tcp_reno_cong_avoid(sk, ack, in_flight); +			tcp_reno_cong_avoid(sk, ack, acked);  		} else {  			u32 rtt, diff;  			u64 target_cwnd; @@ -243,7 +243,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {  				/* Slow start.  */ -				tcp_slow_start(tp); +				tcp_slow_start(tp, acked);  			} else {  				/* Congestion avoidance. */ @@ -283,7 +283,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  	}  	/* Use normal slow start */  	else if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  } @@ -305,11 +305,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)  EXPORT_SYMBOL_GPL(tcp_vegas_get_info);  static struct tcp_congestion_ops tcp_vegas __read_mostly = { -	.flags		= TCP_CONG_RTT_STAMP,  	.init		= tcp_vegas_init,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_vegas_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.pkts_acked	= tcp_vegas_pkts_acked,  	.set_state	= tcp_vegas_state,  	.cwnd_event	= tcp_vegas_cwnd_event, diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 6c0eea2f824..0531b99d863 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -15,10 +15,10 @@ struct vegas {  	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */  }; -extern void tcp_vegas_init(struct sock *sk); -extern void tcp_vegas_state(struct sock *sk, u8 ca_state); -extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); -extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +void tcp_vegas_init(struct sock *sk); +void tcp_vegas_state(struct sock *sk, u8 ca_state); +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);  #endif	/* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index ac43cd747bc..27b9825753d 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,18 +114,18 @@ static void 
tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)  		tcp_veno_init(sk);  } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct veno *veno = inet_csk_ca(sk);  	if (!veno->doing_veno_now) { -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  		return;  	}  	/* limited by applications */ -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	/* We do the Veno calculations only if we got enough rtt samples */ @@ -133,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  		/* We don't have enough rtt samples to do the Veno  		 * calculation, so we'll behave like Reno.  		 */ -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  	} else {  		u64 target_cwnd;  		u32 rtt; @@ -152,7 +152,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  		if (tp->snd_cwnd <= tp->snd_ssthresh) {  			/* Slow start.  */ -			tcp_slow_start(tp); +			tcp_slow_start(tp, acked);  		} else {  			/* Congestion avoidance. */  			if (veno->diff < beta) { @@ -202,7 +202,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk)  }  static struct tcp_congestion_ops tcp_veno __read_mostly = { -	.flags		= TCP_CONG_RTT_STAMP,  	.init		= tcp_veno_init,  	.ssthresh	= tcp_veno_ssthresh,  	.cong_avoid	= tcp_veno_cong_avoid, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 76a1e23259e..b94a04ae2ed 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {  	.init		= tcp_westwood_init,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_reno_cong_avoid, -	.min_cwnd	= tcp_westwood_bw_rttmin,  	.cwnd_event	= tcp_westwood_event,  	.get_info	= tcp_westwood_info,  	.pkts_acked	= tcp_westwood_pkts_acked, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 05c3b6f0e8e..599b79b8eac 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -3,7 +3,7 @@   *   YeAH TCP   *   * For further details look at: - *    http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + *   https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf   *   */  #include <linux/mm.h> @@ -15,13 +15,13 @@  #include "tcp_vegas.h" -#define TCP_YEAH_ALPHA       80 //lin number of packets queued at the bottleneck -#define TCP_YEAH_GAMMA        1 //lin fraction of queue to be removed per rtt -#define TCP_YEAH_DELTA        3 //log minimum fraction of cwnd to be removed on loss -#define TCP_YEAH_EPSILON      1 //log maximum fraction to be removed on early decongestion -#define TCP_YEAH_PHY          8 //lin maximum delta from base -#define TCP_YEAH_RHO         16 //lin minimum number of consecutive rtt to consider competition on loss -#define TCP_YEAH_ZETA        50 //lin minimum number of state switchs to reset reno_count +#define TCP_YEAH_ALPHA       80 /* number of packets queued at the bottleneck */ +#define TCP_YEAH_GAMMA        1 /* fraction of queue to be removed per rtt */ +#define TCP_YEAH_DELTA        3 /* log minimum fraction of cwnd to be removed on loss */ +#define TCP_YEAH_EPSILON      1 /* log maximum fraction to be removed on early decongestion */ +#define TCP_YEAH_PHY          8 /* maximum delta from base */ +#define TCP_YEAH_RHO         16 /* minimum number of consecutive rtt to consider competition 
on loss */ +#define TCP_YEAH_ZETA        50 /* minimum number of state switches to reset reno_count */  #define TCP_SCALABLE_AI_CNT	 100U @@ -69,16 +69,16 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)  	tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);  } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct yeah *yeah = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else if (!yeah->doing_reno_now) {  		/* Scalable */ @@ -213,9 +213,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {  	if (yeah->doing_reno_now < TCP_YEAH_RHO) {  		reduction = yeah->lastQ; -		reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); +		reduction = min(reduction, max(tp->snd_cwnd>>1, 2U)); -		reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); +		reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);  	} else  		reduction = max(tp->snd_cwnd>>1, 2U); @@ -226,11 +226,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {  }  static struct tcp_congestion_ops tcp_yeah __read_mostly = { -	.flags		= TCP_CONG_RTT_STAMP,  	.init		= tcp_yeah_init,  	.ssthresh	= tcp_yeah_ssthresh,  	.cong_avoid	= tcp_yeah_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.set_state	= tcp_vegas_state,  	.cwnd_event	= tcp_vegas_cwnd_event,  	.get_info	= tcp_vegas_get_info, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0ca44df51ee..7d5a8661df7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -103,6 +103,7 @@  #include <linux/seq_file.h>  #include <net/net_namespace.h>  #include <net/icmp.h> +#include <net/inet_hashtables.h>  #include <net/route.h>  #include <net/checksum.h>  #include <net/xfrm.h> @@ -219,10 +220,10 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,  		unsigned short first, last;  		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); -		inet_get_local_port_range(&low, &high); +		inet_get_local_port_range(net, &low, &high);  		remaining = (high - low) + 1; -		rand = net_random(); +		rand = prandom_u32();  		first = (((u64)rand * remaining) >> 32) + low;  		/*  		 * force rand to be an odd multiple of UDP_HTABLE_SIZE @@ -245,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,  			do {  				if (low <= snum && snum <= high &&  				    !test_bit(snum >> udptable->log, bitmap) && -				    !inet_is_reserved_local_port(snum)) +				    !inet_is_local_reserved_port(net, snum))  					goto found;  				snum += rand;  			} while (snum != first); @@ -406,6 +407,18 @@ static inline int compute_score2(struct sock *sk, struct net *net,  	return score;  } +static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, +				 const __u16 lport, const __be32 faddr, +				 const __be16 fport) +{ +	static u32 udp_ehash_secret __read_mostly; + +	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); + +	return __inet_ehashfn(laddr, lport, faddr, fport, +			      udp_ehash_secret + net_hash_mix(net)); +} +  /* called with read_rcu_lock() */  static struct sock *udp4_lib_lookup2(struct net *net, @@ -429,8 +442,8 @@ begin:  			badness = score;  			reuseport = sk->sk_reuseport;  			if (reuseport) { -				hash = inet_ehashfn(net, daddr, hnum, -						    saddr, sport); +				hash = udp_ehashfn(net, daddr, hnum, +						   saddr, sport);  				matches = 1;  			}  		} else if (score == badness 
&& reuseport) { @@ -510,8 +523,8 @@ begin:  			badness = score;  			reuseport = sk->sk_reuseport;  			if (reuseport) { -				hash = inet_ehashfn(net, daddr, hnum, -						    saddr, sport); +				hash = udp_ehashfn(net, daddr, hnum, +						   saddr, sport);  				matches = 1;  			}  		} else if (score == badness && reuseport) { @@ -547,15 +560,11 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,  						 __be16 sport, __be16 dport,  						 struct udp_table *udptable)  { -	struct sock *sk;  	const struct iphdr *iph = ip_hdr(skb); -	if (unlikely(sk = skb_steal_sock(skb))) -		return sk; -	else -		return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, -					 iph->daddr, dport, inet_iif(skb), -					 udptable); +	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, +				 iph->daddr, dport, inet_iif(skb), +				 udptable);  }  struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, @@ -565,6 +574,26 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,  }  EXPORT_SYMBOL_GPL(udp4_lib_lookup); +static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, +				       __be16 loc_port, __be32 loc_addr, +				       __be16 rmt_port, __be32 rmt_addr, +				       int dif, unsigned short hnum) +{ +	struct inet_sock *inet = inet_sk(sk); + +	if (!net_eq(sock_net(sk), net) || +	    udp_sk(sk)->udp_port_hash != hnum || +	    (inet->inet_daddr && inet->inet_daddr != rmt_addr) || +	    (inet->inet_dport != rmt_port && inet->inet_dport) || +	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || +	    ipv6_only_sock(sk) || +	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) +		return false; +	if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) +		return false; +	return true; +} +  static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,  					     __be16 loc_port, __be32 loc_addr,  					     __be16 rmt_port, __be32 rmt_addr, @@ -575,20 +604,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,  	unsigned short hnum = ntohs(loc_port);  	sk_nulls_for_each_from(s, node) { -		struct inet_sock *inet = inet_sk(s); - -		if (!net_eq(sock_net(s), net) || -		    udp_sk(s)->udp_port_hash != hnum || -		    (inet->inet_daddr && inet->inet_daddr != rmt_addr) || -		    (inet->inet_dport != rmt_port && inet->inet_dport) || -		    (inet->inet_rcv_saddr && -		     inet->inet_rcv_saddr != loc_addr) || -		    ipv6_only_sock(s) || -		    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) -			continue; -		if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) -			continue; -		goto found; +		if (__udp_is_mcast_sock(net, s, +					loc_port, loc_addr, +					rmt_port, rmt_addr, +					dif, hnum)) +			goto found;  	}  	s = NULL;  found: @@ -707,13 +727,12 @@ EXPORT_SYMBOL(udp_flush_pending_frames);  void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)  {  	struct udphdr *uh = udp_hdr(skb); -	struct sk_buff *frags = skb_shinfo(skb)->frag_list;  	int offset = skb_transport_offset(skb);  	int len = skb->len - offset;  	int hlen = len;  	__wsum csum = 0; -	if (!frags) { +	if (!skb_has_frag_list(skb)) {  		/*  		 * Only one fragment on the socket.  		 
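
udp_ehashfn() above gives UDP its own hash secret, drawn lazily on first use via net_get_random_once() instead of sharing the TCP established-hash secret. The general pattern, with pthread_once() and a toy mixing step standing in for the kernel primitives (the real code hashes with jhash through __inet_ehashfn()):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

static uint32_t udp_ehash_secret;
static pthread_once_t secret_once = PTHREAD_ONCE_INIT;

static void seed_secret(void)
{
    udp_ehash_secret = (uint32_t)random();   /* kernel: get_random_bytes() */
}

static uint32_t udp_ehashfn(uint32_t laddr, uint16_t lport,
                            uint32_t faddr, uint16_t fport)
{
    pthread_once(&secret_once, seed_secret);

    /* Toy mix only; not the kernel's hash. */
    uint32_t h = laddr ^ faddr ^ ((uint32_t)lport << 16 | fport);
    h ^= udp_ehash_secret;
    h *= 0x9e3779b1u;                        /* golden-ratio multiply */
    return h ^ (h >> 16);
}
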
*/ @@ -722,15 +741,17 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)  		uh->check = ~csum_tcpudp_magic(src, dst, len,  					       IPPROTO_UDP, 0);  	} else { +		struct sk_buff *frags; +  		/*  		 * HW-checksum won't work as there are two or more  		 * fragments on the socket so that all csums of sk_buffs  		 * should be together  		 */ -		do { +		skb_walk_frags(skb, frags) {  			csum = csum_add(csum, frags->csum);  			hlen -= frags->len; -		} while ((frags = frags->next)); +		}  		csum = skb_checksum(skb, offset, hlen, csum);  		skb->ip_summed = CHECKSUM_NONE; @@ -742,6 +763,43 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)  }  EXPORT_SYMBOL_GPL(udp4_hwcsum); +/* Function to set UDP checksum for an IPv4 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp_set_csum(bool nocheck, struct sk_buff *skb, +		  __be32 saddr, __be32 daddr, int len) +{ +	struct udphdr *uh = udp_hdr(skb); + +	if (nocheck) +		uh->check = 0; +	else if (skb_is_gso(skb)) +		uh->check = ~udp_v4_check(len, saddr, daddr, 0); +	else if (skb_dst(skb) && skb_dst(skb)->dev && +		 (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + +		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + +		skb->ip_summed = CHECKSUM_PARTIAL; +		skb->csum_start = skb_transport_header(skb) - skb->head; +		skb->csum_offset = offsetof(struct udphdr, check); +		uh->check = ~udp_v4_check(len, saddr, daddr, 0); +	} else { +		__wsum csum; + +		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + +		uh->check = 0; +		csum = skb_checksum(skb, 0, len, 0); +		uh->check = udp_v4_check(len, saddr, daddr, csum); +		if (uh->check == 0) +			uh->check = CSUM_MANGLED_0; + +		skb->ip_summed = CHECKSUM_UNNECESSARY; +	} +} +EXPORT_SYMBOL(udp_set_csum); +  static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)  {  	struct sock *sk = skb->sk; @@ -765,7 +823,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)  	if (is_udplite)  				 /*     UDP-Lite      */  		csum = udplite_csum(skb); -	else if (sk->sk_no_check == UDP_CSUM_NOXMIT) {   /* UDP csum disabled */ +	else if (sk->sk_no_check_tx) {   /* UDP csum disabled */  		skb->ip_summed = CHECKSUM_NONE;  		goto send; @@ -855,6 +913,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1;  	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; @@ -880,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	 *	Get and verify the address.  	 
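
udp_set_csum() above picks between no checksum, GSO, hardware offload and a software sum; in the software case a result of 0 is mangled to 0xFFFF, because a zero UDP checksum means "none" (RFC 768). The software path boils down to the usual ones-complement sum over pseudo-header plus datagram, roughly as below (addresses passed as host-order 32-bit values, result still to be stored big-endian in the header):

#include <stdint.h>
#include <stddef.h>

static uint16_t udp4_checksum(uint32_t saddr, uint32_t daddr,
                              const uint8_t *udp, size_t len)
{
    uint64_t sum = 0;
    size_t i;

    /* Pseudo-header: source, destination, protocol 17, UDP length. */
    sum += (saddr >> 16) + (saddr & 0xffff);
    sum += (daddr >> 16) + (daddr & 0xffff);
    sum += 17;
    sum += len;

    /* UDP header (check field zeroed by the caller) and payload. */
    for (i = 0; i + 1 < len; i += 2)
        sum += (uint32_t)udp[i] << 8 | udp[i + 1];
    if (len & 1)
        sum += (uint32_t)udp[len - 1] << 8;

    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);   /* end-around carry fold */

    uint16_t check = (uint16_t)~sum;
    return check ? check : 0xffff;            /* CSUM_MANGLED_0 equivalent */
}
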
*/  	if (msg->msg_name) { -		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; +		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);  		if (msg->msg_namelen < sizeof(*usin))  			return -EINVAL;  		if (usin->sin_family != AF_INET) { @@ -909,7 +969,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	sock_tx_timestamp(sk, &ipc.tx_flags);  	if (msg->msg_controllen) { -		err = ip_cmsg_send(sock_net(sk), msg, &ipc); +		err = ip_cmsg_send(sock_net(sk), msg, &ipc, +				   sk->sk_family == AF_INET6);  		if (err)  			return err;  		if (ipc.opt) @@ -938,7 +999,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		faddr = ipc.opt->opt.faddr;  		connected = 0;  	} -	tos = RT_TOS(inet->tos); +	tos = get_rttos(&ipc, inet);  	if (sock_flag(sk, SOCK_LOCALROUTE) ||  	    (msg->msg_flags & MSG_DONTROUTE) ||  	    (ipc.opt && ipc.opt->opt.is_strictroute)) { @@ -964,7 +1025,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		fl4 = &fl4_stack;  		flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,  				   RT_SCOPE_UNIVERSE, sk->sk_protocol, -				   inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, +				   inet_sk_flowi_flags(sk),  				   faddr, saddr, dport, inet->inet_sport);  		security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); @@ -973,7 +1034,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  			err = PTR_ERR(rt);  			rt = NULL;  			if (err == -ENETUNREACH) -				IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); +				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);  			goto out;  		} @@ -1072,6 +1133,9 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,  	struct udp_sock *up = udp_sk(sk);  	int ret; +	if (flags & MSG_SENDPAGE_NOTLAST) +		flags |= MSG_MORE; +  	if (!up->pending) {  		struct msghdr msg = {	.msg_flags = flags|MSG_MORE }; @@ -1201,7 +1265,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		size_t len, int noblock, int flags, int *addr_len)  {  	struct inet_sock *inet = inet_sk(sk); -	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; +	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);  	struct sk_buff *skb;  	unsigned int ulen, copied;  	int peeked, off = 0; @@ -1209,14 +1273,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	int is_udplite = IS_UDPLITE(sk);  	bool slow; -	/* -	 *	Check any passed addresses -	 */ -	if (addr_len) -		*addr_len = sizeof(*sin); -  	if (flags & MSG_ERRQUEUE) -		return ip_recv_error(sk, msg, len); +		return ip_recv_error(sk, msg, len, addr_len);  try_again:  	skb = __skb_recv_datagram(sk, flags | (noblock ? 
MSG_DONTWAIT : 0), @@ -1276,6 +1334,7 @@ try_again:  		sin->sin_port = udp_hdr(skb)->source;  		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;  		memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +		*addr_len = sizeof(*sin);  	}  	if (inet->cmsg_flags)  		ip_cmsg_recv(msg, skb); @@ -1403,8 +1462,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  {  	int rc; -	if (inet_sk(sk)->inet_daddr) +	if (inet_sk(sk)->inet_daddr) {  		sock_rps_save_rxhash(sk, skb); +		sk_mark_napi_id(sk, skb); +	}  	rc = sock_queue_rcv_skb(sk, skb);  	if (rc < 0) { @@ -1472,6 +1533,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  		if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {  			int ret; +			/* Verify checksum before giving to encap */ +			if (udp_lib_checksum_complete(skb)) +				goto csum_error; +  			ret = encap_rcv(sk, skb);  			if (ret <= 0) {  				UDP_INC_STATS_BH(sock_net(sk), @@ -1523,12 +1588,15 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  		goto csum_error; -	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) +	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { +		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, +				 is_udplite);  		goto drop; +	}  	rc = 0; -	ipv4_pktinfo_prepare(skb); +	ipv4_pktinfo_prepare(sk, skb);  	bh_lock_sock(sk);  	if (!sock_owned_by_user(sk))  		rc = __udp_queue_rcv_skb(sk, skb); @@ -1577,6 +1645,18 @@ static void flush_stack(struct sock **stack, unsigned int count,  		kfree_skb(skb1);  } +/* For TCP sockets, sk_rx_dst is protected by socket lock + * For UDP, we use xchg() to guard against concurrent changes. + */ +static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ +	struct dst_entry *old; + +	dst_hold(dst); +	old = xchg(&sk->sk_rx_dst, dst); +	dst_release(old); +} +  /*   *	Multicasts and broadcasts go to each listener.   * @@ -1637,7 +1717,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,  static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,  				 int proto)  { -	const struct iphdr *iph;  	int err;  	UDP_SKB_CB(skb)->partial_cov = 0; @@ -1649,22 +1728,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,  			return err;  	} -	iph = ip_hdr(skb); -	if (uh->check == 0) { -		skb->ip_summed = CHECKSUM_UNNECESSARY; -	} else if (skb->ip_summed == CHECKSUM_COMPLETE) { -		if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, -				      proto, skb->csum)) -			skb->ip_summed = CHECKSUM_UNNECESSARY; -	} -	if (!skb_csum_unnecessary(skb)) -		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, -					       skb->len, proto, 0); -	/* Probably, we should checksum udp header (it should be in cache -	 * in any case) and data in tiny packets (< rx copybreak). 
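
udp_sk_rx_dst_set() above uses xchg() because, unlike TCP, nothing serialises writers of sk->sk_rx_dst on the UDP receive path: take a reference on the new entry, swap it in atomically, then release whatever came out. The same pattern with C11 atomics and a toy refcount:

#include <stdatomic.h>
#include <stdlib.h>

struct dst { atomic_int refcnt; };

static void dst_hold(struct dst *d)    { atomic_fetch_add(&d->refcnt, 1); }
static void dst_release(struct dst *d)
{
    if (d && atomic_fetch_sub(&d->refcnt, 1) == 1)
        free(d);
}

/* Safe against two CPUs caching a route on the same socket at once:
 * exactly one old pointer comes back out of the exchange and is released. */
static void sk_rx_dst_set(struct dst *_Atomic *slot, struct dst *dst)
{
    struct dst *old;

    dst_hold(dst);
    old = atomic_exchange(slot, dst);
    dst_release(old);
}
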
-	 */ - -	return 0; +	return skb_checksum_init_zero_check(skb, proto, uh->check, +					    inet_compute_pseudo);  }  /* @@ -1705,16 +1770,33 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,  	if (udp4_csum_init(skb, uh, proto))  		goto csum_error; -	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) -		return __udp4_lib_mcast_deliver(net, skb, uh, -				saddr, daddr, udptable); +	sk = skb_steal_sock(skb); +	if (sk) { +		struct dst_entry *dst = skb_dst(skb); +		int ret; -	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); +		if (unlikely(sk->sk_rx_dst != dst)) +			udp_sk_rx_dst_set(sk, dst); + +		ret = udp_queue_rcv_skb(sk, skb); +		sock_put(sk); +		/* a return value > 0 means to resubmit the input, but +		 * it wants the return to be -protocol, or 0 +		 */ +		if (ret > 0) +			return -ret; +		return 0; +	} else { +		if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) +			return __udp4_lib_mcast_deliver(net, skb, uh, +					saddr, daddr, udptable); + +		sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); +	}  	if (sk != NULL) {  		int ret; -		sk_mark_napi_id(sk, skb);  		ret = udp_queue_rcv_skb(sk, skb);  		sock_put(sk); @@ -1768,6 +1850,142 @@ drop:  	return 0;  } +/* We can only early demux multicast if there is a single matching socket. + * If more than one socket found returns NULL + */ +static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, +						  __be16 loc_port, __be32 loc_addr, +						  __be16 rmt_port, __be32 rmt_addr, +						  int dif) +{ +	struct sock *sk, *result; +	struct hlist_nulls_node *node; +	unsigned short hnum = ntohs(loc_port); +	unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); +	struct udp_hslot *hslot = &udp_table.hash[slot]; + +	/* Do not bother scanning a too big list */ +	if (hslot->count > 10) +		return NULL; + +	rcu_read_lock(); +begin: +	count = 0; +	result = NULL; +	sk_nulls_for_each_rcu(sk, node, &hslot->head) { +		if (__udp_is_mcast_sock(net, sk, +					loc_port, loc_addr, +					rmt_port, rmt_addr, +					dif, hnum)) { +			result = sk; +			++count; +		} +	} +	/* +	 * if the nulls value we got at the end of this lookup is +	 * not the expected one, we must restart lookup. +	 * We probably met an item that was moved to another chain. +	 */ +	if (get_nulls_value(node) != slot) +		goto begin; + +	if (result) { +		if (count != 1 || +		    unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) +			result = NULL; +		else if (unlikely(!__udp_is_mcast_sock(net, result, +						       loc_port, loc_addr, +						       rmt_port, rmt_addr, +						       dif, hnum))) { +			sock_put(result); +			result = NULL; +		} +	} +	rcu_read_unlock(); +	return result; +} + +/* For unicast we should only early demux connected sockets or we can + * break forwarding setups.  The chains here can be long so only check + * if the first socket is an exact match and if not move on. 
+ */ +static struct sock *__udp4_lib_demux_lookup(struct net *net, +					    __be16 loc_port, __be32 loc_addr, +					    __be16 rmt_port, __be32 rmt_addr, +					    int dif) +{ +	struct sock *sk, *result; +	struct hlist_nulls_node *node; +	unsigned short hnum = ntohs(loc_port); +	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); +	unsigned int slot2 = hash2 & udp_table.mask; +	struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; +	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); +	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + +	rcu_read_lock(); +	result = NULL; +	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { +		if (INET_MATCH(sk, net, acookie, +			       rmt_addr, loc_addr, ports, dif)) +			result = sk; +		/* Only check first socket in chain */ +		break; +	} + +	if (result) { +		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) +			result = NULL; +		else if (unlikely(!INET_MATCH(sk, net, acookie, +					      rmt_addr, loc_addr, +					      ports, dif))) { +			sock_put(result); +			result = NULL; +		} +	} +	rcu_read_unlock(); +	return result; +} + +void udp_v4_early_demux(struct sk_buff *skb) +{ +	struct net *net = dev_net(skb->dev); +	const struct iphdr *iph; +	const struct udphdr *uh; +	struct sock *sk; +	struct dst_entry *dst; +	int dif = skb->dev->ifindex; + +	/* validate the packet */ +	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) +		return; + +	iph = ip_hdr(skb); +	uh = udp_hdr(skb); + +	if (skb->pkt_type == PACKET_BROADCAST || +	    skb->pkt_type == PACKET_MULTICAST) +		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, +						   uh->source, iph->saddr, dif); +	else if (skb->pkt_type == PACKET_HOST) +		sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, +					     uh->source, iph->saddr, dif); +	else +		return; + +	if (!sk) +		return; + +	skb->sk = sk; +	skb->destructor = sock_edemux; +	dst = sk->sk_rx_dst; + +	if (dst) +		dst = dst_check(dst, 0); +	if (dst) +		skb_dst_set_noref(skb, dst); +} +  int udp_rcv(struct sk_buff *skb)  {  	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); @@ -1795,7 +2013,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  		       int (*push_pending_frames)(struct sock *))  {  	struct udp_sock *up = udp_sk(sk); -	int val; +	int val, valbool;  	int err = 0;  	int is_udplite = IS_UDPLITE(sk); @@ -1805,6 +2023,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  	if (get_user(val, (int __user *)optval))  		return -EFAULT; +	valbool = val ? 1 : 0; +  	switch (optname) {  	case UDP_CORK:  		if (val != 0) { @@ -1834,6 +2054,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  		}  		break; +	case UDP_NO_CHECK6_TX: +		up->no_check6_tx = valbool; +		break; + +	case UDP_NO_CHECK6_RX: +		up->no_check6_rx = valbool; +		break; +  	/*  	 * 	UDP-Lite's partial checksum coverage (RFC 3828).  	 */ @@ -1916,6 +2144,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,  		val = up->encap_type;  		break; +	case UDP_NO_CHECK6_TX: +		val = up->no_check6_tx; +		break; + +	case UDP_NO_CHECK6_RX: +		val = up->no_check6_rx; +		break; +  	/* The following two cannot be changed on UDP sockets, the return is  	 * always 0 (which corresponds to the full checksum coverage of UDP). 
*/  	case UDPLITE_SEND_CSCOV: @@ -2150,7 +2386,7 @@ EXPORT_SYMBOL(udp_proc_unregister);  /* ------------------------------------------------------------------------ */  static void udp4_format_sock(struct sock *sp, struct seq_file *f, -		int bucket, int *len) +		int bucket)  {  	struct inet_sock *inet = inet_sk(sp);  	__be32 dest = inet->inet_daddr; @@ -2159,7 +2395,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,  	__u16 srcp	  = ntohs(inet->inet_sport);  	seq_printf(f, "%5d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d%n", +		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",  		bucket, src, srcp, dest, destp, sp->sk_state,  		sk_wmem_alloc_get(sp),  		sk_rmem_alloc_get(sp), @@ -2167,23 +2403,22 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,  		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),  		0, sock_i_ino(sp),  		atomic_read(&sp->sk_refcnt), sp, -		atomic_read(&sp->sk_drops), len); +		atomic_read(&sp->sk_drops));  }  int udp4_seq_show(struct seq_file *seq, void *v)  { +	seq_setwidth(seq, 127);  	if (v == SEQ_START_TOKEN) -		seq_printf(seq, "%-127s\n", -			   "  sl  local_address rem_address   st tx_queue " +		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "  			   "rx_queue tr tm->when retrnsmt   uid  timeout "  			   "inode ref pointer drops");  	else {  		struct udp_iter_state *state = seq->private; -		int len; -		udp4_format_sock(v, seq, state->bucket, &len); -		seq_printf(seq, "%*s\n", 127 - len, ""); +		udp4_format_sock(v, seq, state->bucket);  	} +	seq_pad(seq, '\n');  	return 0;  } @@ -2296,11 +2531,16 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,  				       netdev_features_t features)  {  	struct sk_buff *segs = ERR_PTR(-EINVAL); +	u16 mac_offset = skb->mac_header;  	int mac_len = skb->mac_len;  	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);  	__be16 protocol = skb->protocol;  	netdev_features_t enc_features; -	int outer_hlen; +	int udp_offset, outer_hlen; +	unsigned int oldlen; +	bool need_csum; + +	oldlen = (u16)~skb->len;  	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))  		goto out; @@ -2312,17 +2552,25 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,  	skb->mac_len = skb_inner_network_offset(skb);  	skb->protocol = htons(ETH_P_TEB); +	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); +	if (need_csum) +		skb->encap_hdr_csum = 1; +  	/* segment inner packet. */  	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);  	segs = skb_mac_gso_segment(skb, enc_features); -	if (!segs || IS_ERR(segs)) +	if (!segs || IS_ERR(segs)) { +		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, +				     mac_len);  		goto out; +	}  	outer_hlen = skb_tnl_header_len(skb); +	udp_offset = outer_hlen - tnl_hlen;  	skb = segs;  	do {  		struct udphdr *uh; -		int udp_offset = outer_hlen - tnl_hlen; +		int len;  		skb_reset_inner_headers(skb);  		skb->encapsulation = 1; @@ -2333,31 +2581,20 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,  		skb_reset_mac_header(skb);  		skb_set_network_header(skb, mac_len);  		skb_set_transport_header(skb, udp_offset); +		len = skb->len - udp_offset;  		uh = udp_hdr(skb); -		uh->len = htons(skb->len - udp_offset); - -		/* csum segment if tunnel sets skb with csum. 
*/ -		if (protocol == htons(ETH_P_IP) && unlikely(uh->check)) { -			struct iphdr *iph = ip_hdr(skb); +		uh->len = htons(len); -			uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, -						       skb->len - udp_offset, -						       IPPROTO_UDP, 0); -			uh->check = csum_fold(skb_checksum(skb, udp_offset, -							   skb->len - udp_offset, 0)); -			if (uh->check == 0) -				uh->check = CSUM_MANGLED_0; +		if (need_csum) { +			__be32 delta = htonl(oldlen + len); -		} else if (protocol == htons(ETH_P_IPV6)) { -			struct ipv6hdr *ipv6h = ipv6_hdr(skb); -			u32 len = skb->len - udp_offset; +			uh->check = ~csum_fold((__force __wsum) +					       ((__force u32)uh->check + +						(__force u32)delta)); +			uh->check = gso_make_checksum(skb, ~uh->check); -			uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, -						     len, IPPROTO_UDP, 0); -			uh->check = csum_fold(skb_checksum(skb, udp_offset, len, 0));  			if (uh->check == 0)  				uh->check = CSUM_MANGLED_0; -			skb->ip_summed = CHECKSUM_NONE;  		}  		skb->protocol = protocol; diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 5a681e298b9..f3c27899f62 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -5,30 +5,30 @@  #include <net/protocol.h>  #include <net/inet_common.h> -extern int  	__udp4_lib_rcv(struct sk_buff *, struct udp_table *, int ); -extern void 	__udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); +void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); -extern int	udp_v4_get_port(struct sock *sk, unsigned short snum); +int udp_v4_get_port(struct sock *sk, unsigned short snum); -extern int	udp_setsockopt(struct sock *sk, int level, int optname, -			       char __user *optval, unsigned int optlen); -extern int	udp_getsockopt(struct sock *sk, int level, int optname, -			       char __user *optval, int __user *optlen); +int udp_setsockopt(struct sock *sk, int level, int optname, +		   char __user *optval, unsigned int optlen); +int udp_getsockopt(struct sock *sk, int level, int optname, +		   char __user *optval, int __user *optlen);  #ifdef CONFIG_COMPAT -extern int	compat_udp_setsockopt(struct sock *sk, int level, int optname, -				      char __user *optval, unsigned int optlen); -extern int	compat_udp_getsockopt(struct sock *sk, int level, int optname, -				      char __user *optval, int __user *optlen); +int compat_udp_setsockopt(struct sock *sk, int level, int optname, +			  char __user *optval, unsigned int optlen); +int compat_udp_getsockopt(struct sock *sk, int level, int optname, +			  char __user *optval, int __user *optlen);  #endif -extern int	udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, -			    size_t len, int noblock, int flags, int *addr_len); -extern int	udp_sendpage(struct sock *sk, struct page *page, int offset, -			     size_t size, int flags); -extern int	udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -extern void	udp_destroy_sock(struct sock *sk); +int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +		size_t len, int noblock, int flags, int *addr_len); +int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, +		 int flags); +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +void udp_destroy_sock(struct sock *sk);  #ifdef CONFIG_PROC_FS -extern int	udp4_seq_show(struct seq_file *seq, void *v); +int udp4_seq_show(struct seq_file *seq, void *v);  #endif  #endif	/* _UDP4_IMPL_H */ diff --git a/net/ipv4/udp_offload.c 
b/net/ipv4/udp_offload.c index f35eccaa855..546d2d439dd 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -14,6 +14,17 @@  #include <net/udp.h>  #include <net/protocol.h> +static DEFINE_SPINLOCK(udp_offload_lock); +static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; + +#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) + +struct udp_offload_priv { +	struct udp_offload	*offload; +	struct rcu_head		rcu; +	struct udp_offload_priv __rcu *next; +}; +  static int udp4_ufo_send_check(struct sk_buff *skb)  {  	if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -41,6 +52,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,  {  	struct sk_buff *segs = ERR_PTR(-EINVAL);  	unsigned int mss; +	int offset; +	__wsum csum; + +	if (skb->encapsulation && +	    (skb_shinfo(skb)->gso_type & +	     (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { +		segs = skb_udp_tunnel_segment(skb, features); +		goto out; +	}  	mss = skb_shinfo(skb)->gso_size;  	if (unlikely(skb->len <= mss)) @@ -52,7 +72,10 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,  		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |  				      SKB_GSO_UDP_TUNNEL | -				      SKB_GSO_GRE | SKB_GSO_MPLS) || +				      SKB_GSO_UDP_TUNNEL_CSUM | +				      SKB_GSO_IPIP | +				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM | +				      SKB_GSO_MPLS) ||  			     !(type & (SKB_GSO_UDP))))  			goto out; @@ -62,35 +85,162 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,  		goto out;  	} +	/* Do software UFO. Complete and fill in the UDP checksum as +	 * HW cannot do checksum of UDP packets sent as multiple +	 * IP fragments. +	 */ +	offset = skb_checksum_start_offset(skb); +	csum = skb_checksum(skb, offset, skb->len - offset, 0); +	offset += skb->csum_offset; +	*(__sum16 *)(skb->data + offset) = csum_fold(csum); +	skb->ip_summed = CHECKSUM_NONE; +  	/* Fragment the skb. IP headers of the fragments are updated in  	 * inet_gso_segment()  	 */ -	if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) -		segs = skb_udp_tunnel_segment(skb, features); -	else { -		int offset; -		__wsum csum; - -		/* Do software UFO. Complete and fill in the UDP checksum as -		 * HW cannot do checksum of UDP packets sent as multiple -		 * IP fragments. 
-		 */ -		offset = skb_checksum_start_offset(skb); -		csum = skb_checksum(skb, offset, skb->len - offset, 0); -		offset += skb->csum_offset; -		*(__sum16 *)(skb->data + offset) = csum_fold(csum); -		skb->ip_summed = CHECKSUM_NONE; - -		segs = skb_segment(skb, features); -	} +	segs = skb_segment(skb, features);  out:  	return segs;  } +int udp_add_offload(struct udp_offload *uo) +{ +	struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); + +	if (!new_offload) +		return -ENOMEM; + +	new_offload->offload = uo; + +	spin_lock(&udp_offload_lock); +	new_offload->next = udp_offload_base; +	rcu_assign_pointer(udp_offload_base, new_offload); +	spin_unlock(&udp_offload_lock); + +	return 0; +} +EXPORT_SYMBOL(udp_add_offload); + +static void udp_offload_free_routine(struct rcu_head *head) +{ +	struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); +	kfree(ou_priv); +} + +void udp_del_offload(struct udp_offload *uo) +{ +	struct udp_offload_priv __rcu **head = &udp_offload_base; +	struct udp_offload_priv *uo_priv; + +	spin_lock(&udp_offload_lock); + +	uo_priv = udp_deref_protected(*head); +	for (; uo_priv != NULL; +	     uo_priv = udp_deref_protected(*head)) { +		if (uo_priv->offload == uo) { +			rcu_assign_pointer(*head, +					   udp_deref_protected(uo_priv->next)); +			goto unlock; +		} +		head = &uo_priv->next; +	} +	pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); +unlock: +	spin_unlock(&udp_offload_lock); +	if (uo_priv != NULL) +		call_rcu(&uo_priv->rcu, udp_offload_free_routine); +} +EXPORT_SYMBOL(udp_del_offload); + +static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ +	struct udp_offload_priv *uo_priv; +	struct sk_buff *p, **pp = NULL; +	struct udphdr *uh, *uh2; +	unsigned int hlen, off; +	int flush = 1; + +	if (NAPI_GRO_CB(skb)->udp_mark || +	    (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) +		goto out; + +	/* mark that this skb passed once through the udp gro layer */ +	NAPI_GRO_CB(skb)->udp_mark = 1; + +	off  = skb_gro_offset(skb); +	hlen = off + sizeof(*uh); +	uh   = skb_gro_header_fast(skb, off); +	if (skb_gro_header_hard(skb, hlen)) { +		uh = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!uh)) +			goto out; +	} + +	rcu_read_lock(); +	uo_priv = rcu_dereference(udp_offload_base); +	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { +		if (uo_priv->offload->port == uh->dest && +		    uo_priv->offload->callbacks.gro_receive) +			goto unflush; +	} +	goto out_unlock; + +unflush: +	flush = 0; + +	for (p = *head; p; p = p->next) { +		if (!NAPI_GRO_CB(p)->same_flow) +			continue; + +		uh2 = (struct udphdr   *)(p->data + off); +		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} +	} + +	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ +	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); +	pp = uo_priv->offload->callbacks.gro_receive(head, skb); + +out_unlock: +	rcu_read_unlock(); +out: +	NAPI_GRO_CB(skb)->flush |= flush; +	return pp; +} + +static int udp_gro_complete(struct sk_buff *skb, int nhoff) +{ +	struct udp_offload_priv *uo_priv; +	__be16 newlen = htons(skb->len - nhoff); +	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); +	int err = -ENOSYS; + +	uh->len = newlen; + +	rcu_read_lock(); + +	uo_priv = rcu_dereference(udp_offload_base); +	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { +		if (uo_priv->offload->port == 
uh->dest && +		    uo_priv->offload->callbacks.gro_complete) +			break; +	} + +	if (uo_priv != NULL) +		err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + +	rcu_read_unlock(); +	return err; +} +  static const struct net_offload udpv4_offload = {  	.callbacks = {  		.gso_send_check = udp4_ufo_send_check,  		.gso_segment = udp4_ufo_fragment, +		.gro_receive  =	udp_gro_receive, +		.gro_complete =	udp_gro_complete,  	},  }; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2c46acd4cc3..3b3efbda48e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -70,7 +70,6 @@ static struct inet_protosw udplite4_protosw = {  	.protocol	=  IPPROTO_UDPLITE,  	.prot		=  &udplite_prot,  	.ops		=  &inet_dgram_ops, -	.no_check	=  0,		/* must checksum (RFC 3828) */  	.flags		=  INET_PROTOSW_PERMANENT,  }; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 1f12c8b4586..aac6197b7a7 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -37,15 +37,6 @@ drop:  	return NET_RX_DROP;  } -int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, -		    int encap_type) -{ -	XFRM_SPI_SKB_CB(skb)->family = AF_INET; -	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); -	return xfrm_input(skb, nexthdr, spi, encap_type); -} -EXPORT_SYMBOL(xfrm4_rcv_encap); -  int xfrm4_transport_finish(struct sk_buff *skb, int async)  {  	struct iphdr *iph = ip_hdr(skb); diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e3db3f91511..71acd0014f2 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)  		hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);  	skb_set_network_header(skb, -x->props.header_len - -			            hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); +				    hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));  	if (x->sel.family != AF_INET6)  		skb->network_header += IPV4_BEET_PHMAXLEN;  	skb->mac_header = skb->network_header + diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index b5663c37f08..91771a7c802 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -15,65 +15,6 @@  #include <net/ip.h>  #include <net/xfrm.h> -/* Informational hook. The decap is still done here. 
*/ -static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly; -static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); - -int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler) -{ -	struct xfrm_tunnel __rcu **pprev; -	struct xfrm_tunnel *t; -	int ret = -EEXIST; -	int priority = handler->priority; - -	mutex_lock(&xfrm4_mode_tunnel_input_mutex); - -	for (pprev = &rcv_notify_handlers; -	     (t = rcu_dereference_protected(*pprev, -	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; -	     pprev = &t->next) { -		if (t->priority > priority) -			break; -		if (t->priority == priority) -			goto err; - -	} - -	handler->next = *pprev; -	rcu_assign_pointer(*pprev, handler); - -	ret = 0; - -err: -	mutex_unlock(&xfrm4_mode_tunnel_input_mutex); -	return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); - -int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler) -{ -	struct xfrm_tunnel __rcu **pprev; -	struct xfrm_tunnel *t; -	int ret = -ENOENT; - -	mutex_lock(&xfrm4_mode_tunnel_input_mutex); -	for (pprev = &rcv_notify_handlers; -	     (t = rcu_dereference_protected(*pprev, -	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL; -	     pprev = &t->next) { -		if (t == handler) { -			*pprev = handler->next; -			ret = 0; -			break; -		} -	} -	mutex_unlock(&xfrm4_mode_tunnel_input_mutex); -	synchronize_net(); - -	return ret; -} -EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister); -  static inline void ipip_ecn_decapsulate(struct sk_buff *skb)  {  	struct iphdr *inner_iph = ipip_hdr(skb); @@ -117,24 +58,18 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)  	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?  		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); -	ip_select_ident(skb, dst->child, NULL);  	top_iph->ttl = ip4_dst_hoplimit(dst->child);  	top_iph->saddr = x->props.saddr.a4;  	top_iph->daddr = x->id.daddr.a4; +	ip_select_ident(skb, NULL);  	return 0;  } -#define for_each_input_rcu(head, handler)	\ -	for (handler = rcu_dereference(head);	\ -	     handler != NULL;			\ -	     handler = rcu_dereference(handler->next)) -  static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)  { -	struct xfrm_tunnel *handler;  	int err = -EINVAL;  	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)  	if (!pskb_may_pull(skb, sizeof(struct iphdr)))  		goto out; -	for_each_input_rcu(rcv_notify_handlers, handler) -		handler->handler(skb); -  	err = skb_unclone(skb, GFP_ATOMIC);  	if (err)  		goto out; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index baa0f63731f..d5f6bd9a210 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -25,7 +25,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)  	if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)  		goto out; -	if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) +	if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)  		goto out;  	mtu = dst_mtu(skb_dst(skb)); @@ -62,10 +62,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)  	if (err)  		return err; -	memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - -	skb->protocol = htons(ETH_P_IP); +	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;  	return x->outer_mode->output2(x, skb);  } @@ -73,27 +70,34 @@ EXPORT_SYMBOL(xfrm4_prepare_output);  int xfrm4_output_finish(struct 
sk_buff *skb)  { -#ifdef CONFIG_NETFILTER -	if (!skb_dst(skb)->xfrm) { -		IPCB(skb)->flags |= IPSKB_REROUTED; -		return dst_output(skb); -	} +	memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +	skb->protocol = htons(ETH_P_IP); +#ifdef CONFIG_NETFILTER  	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;  #endif -	skb->protocol = htons(ETH_P_IP);  	return xfrm_output(skb);  } -int xfrm4_output(struct sk_buff *skb) +static int __xfrm4_output(struct sk_buff *skb)  { -	struct dst_entry *dst = skb_dst(skb); -	struct xfrm_state *x = dst->xfrm; +	struct xfrm_state *x = skb_dst(skb)->xfrm; + +#ifdef CONFIG_NETFILTER +	if (!x) { +		IPCB(skb)->flags |= IPSKB_REROUTED; +		return dst_output(skb); +	} +#endif +	return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm4_output(struct sock *sk, struct sk_buff *skb) +{  	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, -			    NULL, dst->dev, -			    x->outer_mode->afinfo->output_finish, +			    NULL, skb_dst(skb)->dev, __xfrm4_output,  			    !(IPCB(skb)->flags & IPSKB_REROUTED));  } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 9a459be24af..6156f68a1e9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -104,9 +104,14 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)  	const struct iphdr *iph = ip_hdr(skb);  	u8 *xprth = skb_network_header(skb) + iph->ihl * 4;  	struct flowi4 *fl4 = &fl->u.ip4; +	int oif = 0; + +	if (skb_dst(skb)) +		oif = skb_dst(skb)->dev->ifindex;  	memset(fl4, 0, sizeof(struct flowi4));  	fl4->flowi4_mark = skb->mark; +	fl4->flowi4_oif = reverse ? skb->skb_iif : oif;  	if (!ip_is_fragment(iph)) {  		switch (iph->protocol) { @@ -235,7 +240,7 @@ static struct dst_ops xfrm4_dst_ops = {  	.destroy =		xfrm4_dst_destroy,  	.ifdown =		xfrm4_dst_ifdown,  	.local_out =		__ip_local_out, -	.gc_thresh =		1024, +	.gc_thresh =		32768,  };  static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { @@ -320,6 +325,7 @@ void __init xfrm4_init(void)  	xfrm4_state_init();  	xfrm4_policy_init(); +	xfrm4_protocol_init();  #ifdef CONFIG_SYSCTL  	register_pernet_subsys(&xfrm4_net_ops);  #endif diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c new file mode 100644 index 00000000000..a2ce0101eaa --- /dev/null +++ b/net/ipv4/xfrm4_protocol.c @@ -0,0 +1,301 @@ +/* xfrm4_protocol.c - Generic xfrm protocol multiplexer. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/tunnel4.c + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; +static DEFINE_MUTEX(xfrm4_protocol_mutex); + +static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) +{ +	switch (protocol) { +	case IPPROTO_ESP: +		return &esp4_handlers; +	case IPPROTO_AH: +		return &ah4_handlers; +	case IPPROTO_COMP: +		return &ipcomp4_handlers; +	} + +	return NULL; +} + +#define for_each_protocol_rcu(head, handler)		\ +	for (handler = rcu_dereference(head);		\ +	     handler != NULL;				\ +	     handler = rcu_dereference(handler->next))	\ + +int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ +	int ret; +	struct xfrm4_protocol *handler; +	struct xfrm4_protocol __rcu **head = proto_handlers(protocol); + +	if (!head) +		return 0; + +	for_each_protocol_rcu(*head, handler) +		if ((ret = handler->cb_handler(skb, err)) <= 0) +			return ret; + +	return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_cb); + +int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, +		    int encap_type) +{ +	int ret; +	struct xfrm4_protocol *handler; +	struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; +	XFRM_SPI_SKB_CB(skb)->family = AF_INET; +	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + +	if (!head) +		goto out; + +	for_each_protocol_rcu(*head, handler) +		if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) +			return ret; + +out: +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_encap); + +static int xfrm4_esp_rcv(struct sk_buff *skb) +{ +	int ret; +	struct xfrm4_protocol *handler; + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + +	for_each_protocol_rcu(esp4_handlers, handler) +		if ((ret = handler->handler(skb)) != -EINVAL) +			return ret; + +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} + +static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +{ +	struct xfrm4_protocol *handler; + +	for_each_protocol_rcu(esp4_handlers, handler) +		if (!handler->err_handler(skb, info)) +			break; +} + +static int xfrm4_ah_rcv(struct sk_buff *skb) +{ +	int ret; +	struct xfrm4_protocol *handler; + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + +	for_each_protocol_rcu(ah4_handlers, handler) +		if ((ret = handler->handler(skb)) != -EINVAL) +			return ret;; + +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} + +static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +{ +	struct xfrm4_protocol *handler; + +	for_each_protocol_rcu(ah4_handlers, handler) +		if (!handler->err_handler(skb, info)) +			break; +} + +static int xfrm4_ipcomp_rcv(struct sk_buff *skb) +{ +	int ret; +	struct xfrm4_protocol *handler; + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + +	for_each_protocol_rcu(ipcomp4_handlers, handler) +		if ((ret = handler->handler(skb)) != -EINVAL) +			return ret; + +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} + +static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +{ +	struct xfrm4_protocol *handler; + +	for_each_protocol_rcu(ipcomp4_handlers, handler) +		if (!handler->err_handler(skb, info)) 
+			break; +} + +static const struct net_protocol esp4_protocol = { +	.handler	=	xfrm4_esp_rcv, +	.err_handler	=	xfrm4_esp_err, +	.no_policy	=	1, +	.netns_ok	=	1, +}; + +static const struct net_protocol ah4_protocol = { +	.handler	=	xfrm4_ah_rcv, +	.err_handler	=	xfrm4_ah_err, +	.no_policy	=	1, +	.netns_ok	=	1, +}; + +static const struct net_protocol ipcomp4_protocol = { +	.handler	=	xfrm4_ipcomp_rcv, +	.err_handler	=	xfrm4_ipcomp_err, +	.no_policy	=	1, +	.netns_ok	=	1, +}; + +static struct xfrm_input_afinfo xfrm4_input_afinfo = { +	.family		=	AF_INET, +	.owner		=	THIS_MODULE, +	.callback	=	xfrm4_rcv_cb, +}; + +static inline const struct net_protocol *netproto(unsigned char protocol) +{ +	switch (protocol) { +	case IPPROTO_ESP: +		return &esp4_protocol; +	case IPPROTO_AH: +		return &ah4_protocol; +	case IPPROTO_COMP: +		return &ipcomp4_protocol; +	} + +	return NULL; +} + +int xfrm4_protocol_register(struct xfrm4_protocol *handler, +			    unsigned char protocol) +{ +	struct xfrm4_protocol __rcu **pprev; +	struct xfrm4_protocol *t; +	bool add_netproto = false; +	int ret = -EEXIST; +	int priority = handler->priority; + +	if (!proto_handlers(protocol) || !netproto(protocol)) +		return -EINVAL; + +	mutex_lock(&xfrm4_protocol_mutex); + +	if (!rcu_dereference_protected(*proto_handlers(protocol), +				       lockdep_is_held(&xfrm4_protocol_mutex))) +		add_netproto = true; + +	for (pprev = proto_handlers(protocol); +	     (t = rcu_dereference_protected(*pprev, +			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; +	     pprev = &t->next) { +		if (t->priority < priority) +			break; +		if (t->priority == priority) +			goto err; +	} + +	handler->next = *pprev; +	rcu_assign_pointer(*pprev, handler); + +	ret = 0; + +err: +	mutex_unlock(&xfrm4_protocol_mutex); + +	if (add_netproto) { +		if (inet_add_protocol(netproto(protocol), protocol)) { +			pr_err("%s: can't add protocol\n", __func__); +			ret = -EAGAIN; +		} +	} + +	return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_register); + +int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, +			      unsigned char protocol) +{ +	struct xfrm4_protocol __rcu **pprev; +	struct xfrm4_protocol *t; +	int ret = -ENOENT; + +	if (!proto_handlers(protocol) || !netproto(protocol)) +		return -EINVAL; + +	mutex_lock(&xfrm4_protocol_mutex); + +	for (pprev = proto_handlers(protocol); +	     (t = rcu_dereference_protected(*pprev, +			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; +	     pprev = &t->next) { +		if (t == handler) { +			*pprev = handler->next; +			ret = 0; +			break; +		} +	} + +	if (!rcu_dereference_protected(*proto_handlers(protocol), +				       lockdep_is_held(&xfrm4_protocol_mutex))) { +		if (inet_del_protocol(netproto(protocol), protocol) < 0) { +			pr_err("%s: can't remove protocol\n", __func__); +			ret = -EAGAIN; +		} +	} + +	mutex_unlock(&xfrm4_protocol_mutex); + +	synchronize_net(); + +	return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_deregister); + +void __init xfrm4_protocol_init(void) +{ +	xfrm_input_register_afinfo(&xfrm4_input_afinfo); +} +EXPORT_SYMBOL(xfrm4_protocol_init); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 0b2a0641526..542074c00c7 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -16,7 +16,7 @@  static int xfrm4_init_flags(struct xfrm_state *x)  { -	if (ipv4_config.no_pmtu_disc) +	if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)  		x->props.flags |= XFRM_STATE_NOPMTUDISC;  	return 0;  }  | 
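Illustrative only (not part of the patch): a minimal userspace sketch of the new UDP_NO_CHECK6_TX/UDP_NO_CHECK6_RX socket options handled in udp_lib_setsockopt()/udp_lib_getsockopt() above. The helper name and error handling are hypothetical, and the option constants are assumed to be exported through the uapi udp.h that accompanies this series.

#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/udp.h>		/* assumed: UDP_NO_CHECK6_TX, UDP_NO_CHECK6_RX */

/* Allow zero UDP checksums over IPv6 on a tunnel socket, both directions.
 * IPPROTO_UDP (== SOL_UDP) selects the UDP socket-option level handled above.
 */
static int udp_allow_zero_csum6(int fd)
{
	int one = 1;

	if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &one, sizeof(one)) < 0)
		return -1;

	return setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_RX, &one, sizeof(one));
}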
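Also illustrative: a sketch of how a UDP-encapsulation driver might plug into the per-port GRO hooks that udp_offload.c now exposes via udp_add_offload()/udp_del_offload(). Everything prefixed my_ (including the port number) is hypothetical; only the udp_offload fields and callback signatures follow the code above.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <net/udp.h>

#define MY_TUNNEL_PORT	4789	/* hypothetical well-known destination port */

static struct sk_buff **my_gro_receive(struct sk_buff **head,
				       struct sk_buff *skb)
{
	/* A real driver would parse its encapsulation header here and clear
	 * NAPI_GRO_CB(p)->same_flow for packets on @head that do not match.
	 * This stub simply declines to aggregate.
	 */
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}

static int my_gro_complete(struct sk_buff *skb, int nhoff)
{
	/* fix up the encapsulation header of the merged skb at offset nhoff */
	return 0;
}

static struct udp_offload my_udp_offload = {
	.port		= htons(MY_TUNNEL_PORT),
	.callbacks	= {
		.gro_receive	= my_gro_receive,
		.gro_complete	= my_gro_complete,
	},
};

static int __init my_tunnel_init(void)
{
	/* udp_gro_receive() will now call back for UDP packets to this port */
	return udp_add_offload(&my_udp_offload);
}

static void __exit my_tunnel_exit(void)
{
	udp_del_offload(&my_udp_offload);
}

module_init(my_tunnel_init);
module_exit(my_tunnel_exit);
MODULE_LICENSE("GPL");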
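Lastly, a sketch of how an IPsec protocol handler would attach to the new xfrm4_protocol multiplexer added in xfrm4_protocol.c. The my_esp_* handlers and their bodies are hypothetical placeholders; only the xfrm4_protocol fields, the -EINVAL "try the next handler" convention and the register/deregister calls follow the code above, and the declarations are assumed to live in the xfrm headers updated by this series.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <net/xfrm.h>

static int my_esp_rcv(struct sk_buff *skb)
{
	/* returning -EINVAL lets xfrm4_esp_rcv() fall through to the next
	 * (lower priority) handler on the chain */
	return -EINVAL;
}

static int my_esp_input(struct sk_buff *skb, int nexthdr, __be32 spi,
			int encap_type)
{
	return -EINVAL;
}

static int my_esp_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;
}

static int my_esp_err(struct sk_buff *skb, u32 info)
{
	return 0;
}

static struct xfrm4_protocol my_esp_protocol = {
	.handler	= my_esp_rcv,
	.input_handler	= my_esp_input,
	.cb_handler	= my_esp_rcv_cb,
	.err_handler	= my_esp_err,
	.priority	= 0,
};

static int __init my_esp_init(void)
{
	/* handlers run in descending priority order; registering a second
	 * handler with the same priority for IPPROTO_ESP fails with -EEXIST */
	return xfrm4_protocol_register(&my_esp_protocol, IPPROTO_ESP);
}

static void __exit my_esp_exit(void)
{
	xfrm4_protocol_deregister(&my_esp_protocol, IPPROTO_ESP);
}

module_init(my_esp_init);
module_exit(my_esp_exit);
MODULE_LICENSE("GPL");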
