Diffstat (limited to 'net/ipv4/af_inet.c')
-rw-r--r--	net/ipv4/af_inet.c	295
1 file changed, 159 insertions(+), 136 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cfeb85cff4f..d156b3c5f36 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -126,9 +126,6 @@
 static struct list_head inetsw[SOCK_MAX];
 static DEFINE_SPINLOCK(inetsw_lock);
 
-struct ipv4_config ipv4_config;
-EXPORT_SYMBOL(ipv4_config);
-
 /* New destruction routine */
 
 void inet_sock_destruct(struct sock *sk)
@@ -245,29 +242,6 @@ out:
 }
 EXPORT_SYMBOL(inet_listen);
 
-u32 inet_ehash_secret __read_mostly;
-EXPORT_SYMBOL(inet_ehash_secret);
-
-u32 ipv6_hash_secret __read_mostly;
-EXPORT_SYMBOL(ipv6_hash_secret);
-
-/*
- * inet_ehash_secret must be set exactly once, and to a non nul value
- * ipv6_hash_secret must be set exactly once.
- */
-void build_ehash_secret(void)
-{
-	u32 rnd;
-
-	do {
-		get_random_bytes(&rnd, sizeof(rnd));
-	} while (rnd == 0);
-
-	if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0)
-		get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
-}
-EXPORT_SYMBOL(build_ehash_secret);
-
 /*
  *	Create an inet socket.
  */
@@ -280,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 	struct inet_sock *inet;
 	struct proto *answer_prot;
 	unsigned char answer_flags;
-	char answer_no_check;
 	int try_loading_module = 0;
 	int err;
 
-	if (unlikely(!inet_ehash_secret))
-		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
-			build_ehash_secret();
-
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
@@ -342,7 +311,6 @@ lookup_protocol:
 
 	sock->ops = answer->ops;
 	answer_prot = answer->prot;
-	answer_no_check = answer->no_check;
 	answer_flags = answer->flags;
 	rcu_read_unlock();
 
@@ -354,7 +322,6 @@ lookup_protocol:
 		goto out;
 
 	err = 0;
-	sk->sk_no_check = answer_no_check;
 	if (INET_PROTOSW_REUSE & answer_flags)
 		sk->sk_reuse = SK_CAN_REUSE;
 
@@ -369,7 +336,7 @@ lookup_protocol:
 			inet->hdrincl = 1;
 	}
 
-	if (ipv4_config.no_pmtu_disc)
+	if (net->ipv4.sysctl_ip_no_pmtu_disc)
 		inet->pmtudisc = IP_PMTUDISC_DONT;
 	else
 		inet->pmtudisc = IP_PMTUDISC_WANT;
@@ -1032,7 +999,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol =   IPPROTO_TCP,
 		.prot =       &tcp_prot,
 		.ops =        &inet_stream_ops,
-		.no_check =   0,
 		.flags =      INET_PROTOSW_PERMANENT |
 			      INET_PROTOSW_ICSK,
 	},
@@ -1042,7 +1008,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol =   IPPROTO_UDP,
 		.prot =       &udp_prot,
 		.ops =        &inet_dgram_ops,
-		.no_check =   UDP_CSUM_DEFAULT,
 		.flags =      INET_PROTOSW_PERMANENT,
        },
 
@@ -1051,7 +1016,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol =   IPPROTO_ICMP,
 		.prot =       &ping_prot,
 		.ops =        &inet_dgram_ops,
-		.no_check =   UDP_CSUM_DEFAULT,
 		.flags =      INET_PROTOSW_REUSE,
        },
 
@@ -1060,7 +1024,6 @@ static struct inet_protosw inetsw_array[] =
 	       .protocol =   IPPROTO_IP,	/* wild card */
 	       .prot =       &raw_prot,
 	       .ops =        &inet_sockraw_ops,
-	       .no_check =   UDP_CSUM_DEFAULT,
 	       .flags =      INET_PROTOSW_REUSE,
        }
 };
@@ -1160,7 +1123,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	fl4 = &inet->cork.fl.u.ip4;
 	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
 			      sk->sk_bound_dev_if, sk->sk_protocol,
-			      inet->inet_sport, inet->inet_dport, sk, false);
+			      inet->inet_sport, inet->inet_dport, sk);
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
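The hunks above drop the global ipv4_config and build_ehash_secret() in favor of per-namespace state and on-demand secrets. For reference, the once-only publication trick the removed function relied on (many racers, exactly one non-zero winner via compare-and-swap) can be modeled in plain C11 atomics. This is an illustrative userspace sketch, not kernel code, and every name in it is made up:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic uint32_t ehash_secret;

static uint32_t get_random_nonzero(void)
{
	uint32_t r;
	do {
		r = (uint32_t)rand();	/* stand-in for get_random_bytes() */
	} while (r == 0);		/* the secret must be non-zero */
	return r;
}

static void build_secret_once(void)
{
	uint32_t expected = 0;
	uint32_t rnd = get_random_nonzero();

	/* Only the first caller to swap 0 -> rnd installs the secret;
	 * later callers see a non-zero value and change nothing. */
	atomic_compare_exchange_strong(&ehash_secret, &expected, rnd);
}

int main(void)
{
	build_secret_once();
	build_secret_once();	/* second call is a no-op */
	printf("secret=%u\n", (unsigned)atomic_load(&ehash_secret));
	return 0;
}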
@@ -1254,36 +1217,36 @@ static int inet_gso_send_check(struct sk_buff *skb)
 	if (ihl < sizeof(*iph))
 		goto out;
 
+	proto = iph->protocol;
+
+	/* Warning: after this point, iph might be no longer valid */
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
-
 	__skb_pull(skb, ihl);
+
 	skb_reset_transport_header(skb);
-	iph = ip_hdr(skb);
-	proto = iph->protocol;
 	err = -EPROTONOSUPPORT;
 
-	rcu_read_lock();
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_send_check))
 		err = ops->callbacks.gso_send_check(skb);
-	rcu_read_unlock();
 
 out:
 	return err;
 }
 
 static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
-	netdev_features_t features)
+					netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
+	unsigned int offset = 0;
+	bool udpfrag, encap;
 	struct iphdr *iph;
 	int proto;
+	int nhoff;
 	int ihl;
 	int id;
-	unsigned int offset = 0;
-	bool tunnel;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
 		     ~(SKB_GSO_TCPV4 |
@@ -1291,12 +1254,18 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
+		       SKB_GSO_GRE_CSUM |
+		       SKB_GSO_IPIP |
+		       SKB_GSO_SIT |
 		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
+		       SKB_GSO_UDP_TUNNEL_CSUM |
 		       SKB_GSO_MPLS |
 		       0)))
 		goto out;
 
+	skb_reset_network_header(skb);
+	nhoff = skb_network_header(skb) - skb_mac_header(skb);
 	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
 		goto out;
 
@@ -1305,42 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (ihl < sizeof(*iph))
 		goto out;
 
+	id = ntohs(iph->id);
+	proto = iph->protocol;
+
+	/* Warning: after this point, iph might be no longer valid */
 	if (unlikely(!pskb_may_pull(skb, ihl)))
 		goto out;
+	__skb_pull(skb, ihl);
 
-	tunnel = !!skb->encapsulation;
+	encap = SKB_GSO_CB(skb)->encap_level > 0;
+	if (encap)
+		features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	SKB_GSO_CB(skb)->encap_level += ihl;
 
-	__skb_pull(skb, ihl);
 	skb_reset_transport_header(skb);
-	iph = ip_hdr(skb);
-	id = ntohs(iph->id);
-	proto = iph->protocol;
+
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
-	rcu_read_lock();
+	if (skb->encapsulation &&
+	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+		udpfrag = proto == IPPROTO_UDP && encap;
+	else
+		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment))
 		segs = ops->callbacks.gso_segment(skb, features);
-	rcu_read_unlock();
 
 	if (IS_ERR_OR_NULL(segs))
 		goto out;
 
 	skb = segs;
 	do {
-		iph = ip_hdr(skb);
-		if (!tunnel && proto == IPPROTO_UDP) {
+		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+		if (udpfrag) {
 			iph->id = htons(id);
 			iph->frag_off = htons(offset >> 3);
 			if (skb->next != NULL)
 				iph->frag_off |= htons(IP_MF);
-			offset += (skb->len - skb->mac_len - iph->ihl * 4);
-		} else  {
+			offset += skb->len - nhoff - ihl;
+		} else {
 			iph->id = htons(id++);
 		}
-		iph->tot_len = htons(skb->len - skb->mac_len);
-		iph->check = 0;
-		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+		iph->tot_len = htons(skb->len - nhoff);
+		ip_send_check(iph);
+		if (encap)
+			skb_reset_inner_headers(skb);
+		skb->network_header = (u8 *)iph - skb->head;
 	} while ((skb = skb->next));
 
 out:
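The reworked inet_gso_segment() fixes up the IP header of each produced segment: UDP fragments all share one IP ID while frag_off advances in 8-byte units (with IP_MF on every fragment but the last), whereas TCP-style segments get consecutive IDs. A minimal userspace model of that loop follows; it uses simplified stand-in types (no sk_buff) and made-up names, purely to illustrate the arithmetic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IP_MF 0x2000

struct seg { unsigned int payload_len; uint16_t id, frag_off; };

static void fixup(struct seg *s, int n, uint16_t id, bool udpfrag)
{
	unsigned int offset = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (udpfrag) {
			s[i].id = id;			/* same ID on every fragment */
			s[i].frag_off = offset >> 3;	/* offset in 8-byte units */
			if (i < n - 1)
				s[i].frag_off |= IP_MF;	/* more fragments follow */
			offset += s[i].payload_len;
		} else {
			s[i].id = id++;			/* consecutive IDs per segment */
			s[i].frag_off = 0;
		}
	}
}

int main(void)
{
	struct seg segs[3] = { {1480}, {1480}, {520} };
	int i;

	fixup(segs, 3, 1000, true);
	for (i = 0; i < 3; i++)
		printf("id=%u frag_off=0x%04x\n", segs[i].id, segs[i].frag_off);
	return 0;
}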
@@ -1392,8 +1372,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
-		iph2 = ip_hdr(p);
-
+		iph2 = (struct iphdr *)(p->data + off);
+		/* The above works because, with the exception of the top
+		 * (inner most) layer, we only aggregate pkts with the same
+		 * hdr length so all the hdrs we'll need to verify will start
+		 * at the same offset.
+		 */
 		if ((iph->protocol ^ iph2->protocol) |
 		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
 		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
@@ -1405,13 +1389,24 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 		NAPI_GRO_CB(p)->flush |=
 			(iph->ttl ^ iph2->ttl) |
 			(iph->tos ^ iph2->tos) |
-			(__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) |
-			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+		/* Save the IP ID check to be included later when we get to
+		 * the transport layer so only the inner most IP ID is checked.
+		 * This is because some GSO/TSO implementations do not
+		 * correctly increment the IP ID for the outer hdrs.
+		 */
+		NAPI_GRO_CB(p)->flush_id =
+			    ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
 		NAPI_GRO_CB(p)->flush |= flush;
 	}
 
 	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_set_network_header(skb, off);
+	/* The above will be needed by the transport layer if there is one
+	 * immediately following this IP hdr.
+	 */
+
 	skb_gro_pull(skb, sizeof(*iph));
 	skb_set_transport_header(skb, skb_gro_offset(skb));
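The flush_id logic above defers the IP ID continuity check to the transport layer, so only the innermost header's ID is enforced. The saved value is zero exactly when the incoming packet's ID equals the held packet's ID plus the number of segments already merged (mod 2^16). A userspace sketch of just that arithmetic, with illustrative names:

#include <stdint.h>
#include <stdio.h>

static uint16_t flush_id(uint16_t held_id, uint16_t held_count,
			 uint16_t new_id)
{
	/* 0 when new_id is exactly held_id + held_count (mod 2^16) */
	return (uint16_t)(held_id + held_count) ^ new_id;
}

int main(void)
{
	printf("%u\n", flush_id(100, 3, 103));	/* 0: IDs are consecutive     */
	printf("%u\n", flush_id(100, 3, 100));	/* non-zero: fixed outer ID   */
	return 0;
}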
@@ -1426,14 +1421,17 @@ out:
 	return pp;
 }
 
-static int inet_gro_complete(struct sk_buff *skb)
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	__be16 newlen = htons(skb->len - skb_network_offset(skb));
-	struct iphdr *iph = ip_hdr(skb);
+	__be16 newlen = htons(skb->len - nhoff);
+	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
 	const struct net_offload *ops;
 	int proto = iph->protocol;
 	int err = -ENOSYS;
 
+	if (skb->encapsulation)
+		skb_set_inner_network_header(skb, nhoff);
+
 	csum_replace2(&iph->check, iph->tot_len, newlen);
 	iph->tot_len = newlen;
 
@@ -1442,7 +1440,11 @@ static int inet_gro_complete(struct sk_buff *skb)
 	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
 		goto out_unlock;
 
-	err = ops->callbacks.gro_complete(skb);
+	/* Only need to add sizeof(*iph) to get to the next hdr below
+	 * because any hdr with option will have been flushed in
+	 * inet_gro_receive().
+	 */
+	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
 
 out_unlock:
 	rcu_read_unlock();
@@ -1472,22 +1474,20 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
 {
 	unsigned long res = 0;
-	int i, j;
+	int i;
 
-	for_each_possible_cpu(i) {
-		for (j = 0; j < SNMP_ARRAY_SZ; j++)
-			res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
-	}
+	for_each_possible_cpu(i)
+		res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
 	return res;
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
 #if BITS_PER_LONG==32
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
 {
 	u64 res = 0;
 	int cpu;
@@ -1498,12 +1498,12 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
 		u64 v;
 		unsigned int start;
 
-		bhptr = per_cpu_ptr(mib[0], cpu);
+		bhptr = per_cpu_ptr(mib, cpu);
 		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
 		do {
-			start = u64_stats_fetch_begin_bh(syncp);
+			start = u64_stats_fetch_begin_irq(syncp);
 			v = *(((u64 *) bhptr) + offt);
-		} while (u64_stats_fetch_retry_bh(syncp, start));
+		} while (u64_stats_fetch_retry_irq(syncp, start));
 
 		res += v;
 	}
@@ -1512,24 +1512,6 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
 EXPORT_SYMBOL_GPL(snmp_fold_field64);
 #endif
 
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
-{
-	BUG_ON(ptr == NULL);
-	ptr[0] = __alloc_percpu(mibsize, align);
-	if (!ptr[0])
-		return -ENOMEM;
-#if SNMP_ARRAY_SZ == 2
-	ptr[1] = __alloc_percpu(mibsize, align);
-	if (!ptr[1]) {
-		free_percpu(ptr[0]);
-		ptr[0] = NULL;
-		return -ENOMEM;
-	}
-#endif
-	return 0;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_init);
-
 #ifdef CONFIG_IP_MULTICAST
 static const struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
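With snmp_mib_init() and the SNMP_ARRAY_SZ two-pointer scheme removed, folding a MIB counter becomes a single walk over per-CPU copies, adding the word found at a fixed byte offset into each copy. A self-contained userspace model of the simplified snmp_fold_field(); a plain array stands in for real per-CPU storage, and all names are illustrative:

#include <stddef.h>
#include <stdio.h>

#define NR_CPUS 4

struct mib { unsigned long in_pkts, out_pkts; };

static struct mib percpu_mib[NR_CPUS];	/* stand-in for alloc_percpu() */

static unsigned long fold_field(size_t offt)
{
	unsigned long res = 0;
	int cpu;

	/* One pass: add the counter at byte offset 'offt' in each copy. */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		res += *(unsigned long *)((char *)&percpu_mib[cpu] + offt);
	return res;
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		percpu_mib[cpu].in_pkts = cpu + 1;	/* 1+2+3+4 = 10 */
	printf("in_pkts=%lu\n", fold_field(offsetof(struct mib, in_pkts)));
	return 0;
}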
@@ -1543,9 +1525,11 @@ static const struct net_protocol tcp_protocol = {
 	.err_handler	=	tcp_v4_err,
 	.no_policy	=	1,
 	.netns_ok	=	1,
+	.icmp_strict_tag_validation = 1,
 };
 
 static const struct net_protocol udp_protocol = {
+	.early_demux =	udp_v4_early_demux,
 	.handler =	udp_rcv,
 	.err_handler =	udp_err,
 	.no_policy =	1,
@@ -1561,29 +1545,32 @@ static const struct net_protocol icmp_protocol = {
 
 static __net_init int ipv4_mib_init_net(struct net *net)
 {
-	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
-			  sizeof(struct tcp_mib),
-			  __alignof__(struct tcp_mib)) < 0)
+	int i;
+
+	net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+	if (!net->mib.tcp_statistics)
 		goto err_tcp_mib;
-	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
-			  sizeof(struct ipstats_mib),
-			  __alignof__(struct ipstats_mib)) < 0)
+	net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+	if (!net->mib.ip_statistics)
 		goto err_ip_mib;
-	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
-			  sizeof(struct linux_mib),
-			  __alignof__(struct linux_mib)) < 0)
+
+	for_each_possible_cpu(i) {
+		struct ipstats_mib *af_inet_stats;
+		af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+		u64_stats_init(&af_inet_stats->syncp);
+	}
+
+	net->mib.net_statistics = alloc_percpu(struct linux_mib);
+	if (!net->mib.net_statistics)
 		goto err_net_mib;
-	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
-			  sizeof(struct udp_mib),
-			  __alignof__(struct udp_mib)) < 0)
+	net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+	if (!net->mib.udp_statistics)
 		goto err_udp_mib;
-	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
-			  sizeof(struct udp_mib),
-			  __alignof__(struct udp_mib)) < 0)
+	net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+	if (!net->mib.udplite_statistics)
 		goto err_udplite_mib;
-	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
-			  sizeof(struct icmp_mib),
-			  __alignof__(struct icmp_mib)) < 0)
+	net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+	if (!net->mib.icmp_statistics)
 		goto err_icmp_mib;
 	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
 					      GFP_KERNEL);
@@ -1594,17 +1581,17 @@ static __net_init int ipv4_mib_init_net(struct net *net)
 	return 0;
 
 err_icmpmsg_mib:
-	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+	free_percpu(net->mib.icmp_statistics);
 err_icmp_mib:
-	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+	free_percpu(net->mib.udplite_statistics);
 err_udplite_mib:
-	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+	free_percpu(net->mib.udp_statistics);
 err_udp_mib:
-	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+	free_percpu(net->mib.net_statistics);
 err_net_mib:
-	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+	free_percpu(net->mib.ip_statistics);
 err_ip_mib:
-	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+	free_percpu(net->mib.tcp_statistics);
 err_tcp_mib:
 	return -ENOMEM;
 }
@@ -1612,12 +1599,12 @@ err_tcp_mib:
 static __net_exit void ipv4_mib_exit_net(struct net *net)
 {
 	kfree(net->mib.icmpmsg_statistics);
-	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
-	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
-	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
-	snmp_mib_free((void __percpu **)net->mib.net_statistics);
-	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
-	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+	free_percpu(net->mib.icmp_statistics);
+	free_percpu(net->mib.udplite_statistics);
+	free_percpu(net->mib.udp_statistics);
+	free_percpu(net->mib.net_statistics);
+	free_percpu(net->mib.ip_statistics);
+	free_percpu(net->mib.tcp_statistics);
 }
 
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
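ipv4_mib_init_net() keeps the classic goto-unwind shape around the new alloc_percpu() calls: each failure label frees whatever the earlier steps allocated, in reverse order. A compact userspace sketch of that pattern, with calloc()/free() standing in for alloc_percpu()/free_percpu() and hypothetical field names:

#include <stdlib.h>

struct mibs { void *tcp, *ip, *udp; };

static int mibs_init(struct mibs *m)
{
	m->tcp = calloc(1, 64);
	if (!m->tcp)
		goto err_tcp;
	m->ip = calloc(1, 64);
	if (!m->ip)
		goto err_ip;
	m->udp = calloc(1, 64);
	if (!m->udp)
		goto err_udp;
	return 0;

err_udp:
	free(m->ip);	/* unwind in reverse allocation order */
err_ip:
	free(m->tcp);
err_tcp:
	return -1;
}

int main(void)
{
	struct mibs m;
	return mibs_init(&m);
}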
@@ -1630,6 +1617,39 @@ static int __init init_ipv4_mibs(void)
 	return register_pernet_subsys(&ipv4_mib_ops);
 }
 
+static __net_init int inet_init_net(struct net *net)
+{
+	/*
+	 * Set defaults for local port range
+	 */
+	seqlock_init(&net->ipv4.ip_local_ports.lock);
+	net->ipv4.ip_local_ports.range[0] =  32768;
+	net->ipv4.ip_local_ports.range[1] =  61000;
+
+	seqlock_init(&net->ipv4.ping_group_range.lock);
+	/*
+	 * Sane defaults - nobody may create ping sockets.
+	 * Boot scripts should set this to distro-specific group.
+	 */
+	net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+	net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+	return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+	.init = inet_init_net,
+	.exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+	return register_pernet_subsys(&af_inet_ops);
+}
+
 static int ipv4_proc_init(void);
 
 /*
@@ -1646,6 +1666,13 @@ static struct packet_offload ip_packet_offload __read_mostly = {
 	},
 };
 
+static const struct net_offload ipip_offload = {
+	.callbacks = {
+		.gso_send_check = inet_gso_send_check,
+		.gso_segment	= inet_gso_segment,
+	},
+};
+
 static int __init ipv4_offload_init(void)
 {
 	/*
@@ -1657,6 +1684,7 @@ static int __init ipv4_offload_init(void)
 		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
 
 	dev_add_offload(&ip_packet_offload);
+	inet_add_offload(&ipip_offload, IPPROTO_IPIP);
 	return 0;
 }
 
@@ -1675,13 +1703,9 @@ static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
 
-	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
-	if (!sysctl_local_reserved_ports)
-		goto out;
-
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out_free_reserved_ports;
+		goto out;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1705,8 +1729,6 @@ static int __init inet_init(void)
 	ip_static_sysctl_init();
 #endif
 
-	tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
-
 	/*
 	 *	Add all the base protocols.
 	 */
@@ -1768,6 +1790,9 @@ static int __init inet_init(void)
 	if (ip_mr_init())
 		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
 #endif
+
+	if (init_inet_pernet_ops())
+		pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
 	/*
 	 *	Initialise per-cpu ipv4 mibs
 	 */
@@ -1790,8 +1815,6 @@ out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
-out_free_reserved_ports:
-	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
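The new inet_init_net() seeds per-namespace defaults (local port range 32768-61000) behind a seqlock, so sysctl writers and lock-free readers can coexist: a reader retries if the sequence count changed mid-read. A single-threaded userspace model of that retry-on-change read; the kernel's seqlock_t with proper memory barriers is the real mechanism, and this sketch is only illustrative:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int seq;
static int range[2] = { 32768, 61000 };	/* defaults from inet_init_net() */

static void read_range(int *lo, int *hi)
{
	unsigned int s;

	do {
		s = atomic_load(&seq);	/* odd would mean a write in progress */
		*lo = range[0];
		*hi = range[1];
	} while (atomic_load(&seq) != s || (s & 1));	/* retry on change */
}

int main(void)
{
	int lo, hi;

	read_range(&lo, &hi);
	printf("local port range: %d-%d\n", lo, hi);
	return 0;
}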
