diff options
Diffstat (limited to 'net/ipv4/af_inet.c')
| -rw-r--r-- | net/ipv4/af_inet.c | 653 | 
1 files changed, 377 insertions, 276 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2b61107df6..d156b3c5f36 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -65,6 +65,8 @@   *		2 of the License, or (at your option) any later version.   */ +#define pr_fmt(fmt) "IPv4: " fmt +  #include <linux/err.h>  #include <linux/errno.h>  #include <linux/types.h> @@ -89,7 +91,6 @@  #include <linux/slab.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/inet.h>  #include <linux/igmp.h> @@ -105,14 +106,15 @@  #include <net/tcp.h>  #include <net/udp.h>  #include <net/udplite.h> +#include <net/ping.h>  #include <linux/skbuff.h>  #include <net/sock.h>  #include <net/raw.h>  #include <net/icmp.h> -#include <net/ipip.h>  #include <net/inet_common.h>  #include <net/xfrm.h>  #include <net/net_namespace.h> +#include <net/secure_seq.h>  #ifdef CONFIG_IP_MROUTE  #include <linux/mroute.h>  #endif @@ -124,9 +126,6 @@  static struct list_head inetsw[SOCK_MAX];  static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); -  /* New destruction routine */  void inet_sock_destruct(struct sock *sk) @@ -153,8 +152,9 @@ void inet_sock_destruct(struct sock *sk)  	WARN_ON(sk->sk_wmem_queued);  	WARN_ON(sk->sk_forward_alloc); -	kfree(inet->opt); +	kfree(rcu_dereference_protected(inet->inet_opt, 1));  	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); +	dst_release(sk->sk_rx_dst);  	sk_refcnt_debug_dec(sk);  }  EXPORT_SYMBOL(inet_sock_destruct); @@ -209,6 +209,26 @@ int inet_listen(struct socket *sock, int backlog)  	 * we can only allow the backlog to be adjusted.  	 */  	if (old_state != TCP_LISTEN) { +		/* Check special setups for testing purpose to enable TFO w/o +		 * requiring TCP_FASTOPEN sockopt. +		 * Note that only TCP sockets (SOCK_STREAM) will reach here. +		 * Also fastopenq may already been allocated because this +		 * socket was in TCP_LISTEN state previously but was +		 * shutdown() (rather than close()). +		 */ +		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && +		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { +			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) +				err = fastopen_init_queue(sk, backlog); +			else if ((sysctl_tcp_fastopen & +				  TFO_SERVER_WO_SOCKOPT2) != 0) +				err = fastopen_init_queue(sk, +				    ((uint)sysctl_tcp_fastopen) >> 16); +			else +				err = 0; +			if (err) +				goto out; +		}  		err = inet_csk_listen_start(sk, backlog);  		if (err)  			goto out; @@ -222,41 +242,6 @@ out:  }  EXPORT_SYMBOL(inet_listen); -u32 inet_ehash_secret __read_mostly; -EXPORT_SYMBOL(inet_ehash_secret); - -/* - * inet_ehash_secret must be set exactly once - */ -void build_ehash_secret(void) -{ -	u32 rnd; - -	do { -		get_random_bytes(&rnd, sizeof(rnd)); -	} while (rnd == 0); - -	cmpxchg(&inet_ehash_secret, 0, rnd); -} -EXPORT_SYMBOL(build_ehash_secret); - -static inline int inet_netns_ok(struct net *net, int protocol) -{ -	int hash; -	const struct net_protocol *ipprot; - -	if (net_eq(net, &init_net)) -		return 1; - -	hash = protocol & (MAX_INET_PROTOS - 1); -	ipprot = rcu_dereference(inet_protos[hash]); - -	if (ipprot == NULL) -		/* raw IP is OK */ -		return 1; -	return ipprot->netns_ok; -} -  /*   *	Create an inet socket.   */ @@ -269,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,  	struct inet_sock *inet;  	struct proto *answer_prot;  	unsigned char answer_flags; -	char answer_no_check;  	int try_loading_module = 0;  	int err; -	if (unlikely(!inet_ehash_secret)) -		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) -			build_ehash_secret(); -  	sock->state = SS_UNCONNECTED;  	/* Look for the requested type/protocol pair. */ @@ -325,16 +305,12 @@ lookup_protocol:  	}  	err = -EPERM; -	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) -		goto out_rcu_unlock; - -	err = -EAFNOSUPPORT; -	if (!inet_netns_ok(net, protocol)) +	if (sock->type == SOCK_RAW && !kern && +	    !ns_capable(net->user_ns, CAP_NET_RAW))  		goto out_rcu_unlock;  	sock->ops = answer->ops;  	answer_prot = answer->prot; -	answer_no_check = answer->no_check;  	answer_flags = answer->flags;  	rcu_read_unlock(); @@ -346,9 +322,8 @@ lookup_protocol:  		goto out;  	err = 0; -	sk->sk_no_check = answer_no_check;  	if (INET_PROTOSW_REUSE & answer_flags) -		sk->sk_reuse = 1; +		sk->sk_reuse = SK_CAN_REUSE;  	inet = inet_sk(sk);  	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; @@ -361,7 +336,7 @@ lookup_protocol:  			inet->hdrincl = 1;  	} -	if (ipv4_config.no_pmtu_disc) +	if (net->ipv4.sysctl_ip_no_pmtu_disc)  		inet->pmtudisc = IP_PMTUDISC_DONT;  	else  		inet->pmtudisc = IP_PMTUDISC_WANT; @@ -380,6 +355,7 @@ lookup_protocol:  	inet->mc_all	= 1;  	inet->mc_index	= 0;  	inet->mc_list	= NULL; +	inet->rcv_tos	= 0;  	sk_refcnt_debug_inc(sk); @@ -451,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;  	struct sock *sk = sock->sk;  	struct inet_sock *inet = inet_sk(sk); +	struct net *net = sock_net(sk);  	unsigned short snum;  	int chk_addr_ret;  	int err; @@ -464,7 +441,17 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	if (addr_len < sizeof(struct sockaddr_in))  		goto out; -	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); +	if (addr->sin_family != AF_INET) { +		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) +		 * only if s_addr is INADDR_ANY. +		 */ +		err = -EAFNOSUPPORT; +		if (addr->sin_family != AF_UNSPEC || +		    addr->sin_addr.s_addr != htonl(INADDR_ANY)) +			goto out; +	} + +	chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);  	/* Not specified by any standard per-se, however it breaks too  	 * many applications when removed.  It is unfortunate since @@ -484,7 +471,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	snum = ntohs(addr->sin_port);  	err = -EACCES; -	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) +	if (snum && snum < PROT_SOCK && +	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))  		goto out;  	/*      We keep a pair of addresses. rcv_saddr is the one @@ -528,7 +516,7 @@ out:  }  EXPORT_SYMBOL(inet_bind); -int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,  		       int addr_len, int flags)  {  	struct sock *sk = sock->sk; @@ -540,15 +528,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,  	if (!inet_sk(sk)->inet_num && inet_autobind(sk))  		return -EAGAIN; -	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); +	return sk->sk_prot->connect(sk, uaddr, addr_len);  }  EXPORT_SYMBOL(inet_dgram_connect); -static long inet_wait_for_connect(struct sock *sk, long timeo) +static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)  {  	DEFINE_WAIT(wait);  	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); +	sk->sk_write_pending += writebias;  	/* Basic assumption: if someone sets sk->sk_err, he _must_  	 * change state of the socket from TCP_SYN_*. @@ -564,6 +553,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)  		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);  	}  	finish_wait(sk_sleep(sk), &wait); +	sk->sk_write_pending -= writebias;  	return timeo;  } @@ -571,8 +561,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)   *	Connect to a remote host. There is regrettably still a little   *	TCP 'magic' in here.   */ -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, -			int addr_len, int flags) +int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +			  int addr_len, int flags)  {  	struct sock *sk = sock->sk;  	int err; @@ -581,8 +571,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	if (addr_len < sizeof(uaddr->sa_family))  		return -EINVAL; -	lock_sock(sk); -  	if (uaddr->sa_family == AF_UNSPEC) {  		err = sk->sk_prot->disconnect(sk, flags);  		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; @@ -622,8 +610,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);  	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { +		int writebias = (sk->sk_protocol == IPPROTO_TCP) && +				tcp_sk(sk)->fastopen_req && +				tcp_sk(sk)->fastopen_req->data ? 1 : 0; +  		/* Error code is set above */ -		if (!timeo || !inet_wait_for_connect(sk, timeo)) +		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))  			goto out;  		err = sock_intr_errno(timeo); @@ -645,7 +637,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	sock->state = SS_CONNECTED;  	err = 0;  out: -	release_sock(sk);  	return err;  sock_error: @@ -655,6 +646,18 @@ sock_error:  		sock->state = SS_DISCONNECTING;  	goto out;  } +EXPORT_SYMBOL(__inet_stream_connect); + +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +			int addr_len, int flags) +{ +	int err; + +	lock_sock(sock->sk); +	err = __inet_stream_connect(sock, uaddr, addr_len, flags); +	release_sock(sock->sk); +	return err; +}  EXPORT_SYMBOL(inet_stream_connect);  /* @@ -672,8 +675,10 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)  	lock_sock(sk2); +	sock_rps_record_flow(sk2);  	WARN_ON(!((1 << sk2->sk_state) & -		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); +		  (TCPF_ESTABLISHED | TCPF_SYN_RECV | +		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));  	sock_graft(sk2, newsock); @@ -880,6 +885,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)  }  EXPORT_SYMBOL(inet_ioctl); +#ifdef CONFIG_COMPAT +static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ +	struct sock *sk = sock->sk; +	int err = -ENOIOCTLCMD; + +	if (sk->sk_prot->compat_ioctl) +		err = sk->sk_prot->compat_ioctl(sk, cmd, arg); + +	return err; +} +#endif +  const struct proto_ops inet_stream_ops = {  	.family		   = PF_INET,  	.owner		   = THIS_MODULE, @@ -903,6 +921,7 @@ const struct proto_ops inet_stream_ops = {  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_sock_common_setsockopt,  	.compat_getsockopt = compat_sock_common_getsockopt, +	.compat_ioctl	   = inet_compat_ioctl,  #endif  };  EXPORT_SYMBOL(inet_stream_ops); @@ -929,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_sock_common_setsockopt,  	.compat_getsockopt = compat_sock_common_getsockopt, +	.compat_ioctl	   = inet_compat_ioctl,  #endif  };  EXPORT_SYMBOL(inet_dgram_ops); @@ -959,6 +979,7 @@ static const struct proto_ops inet_sockraw_ops = {  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_sock_common_setsockopt,  	.compat_getsockopt = compat_sock_common_getsockopt, +	.compat_ioctl	   = inet_compat_ioctl,  #endif  }; @@ -978,7 +999,6 @@ static struct inet_protosw inetsw_array[] =  		.protocol =   IPPROTO_TCP,  		.prot =       &tcp_prot,  		.ops =        &inet_stream_ops, -		.no_check =   0,  		.flags =      INET_PROTOSW_PERMANENT |  			      INET_PROTOSW_ICSK,  	}, @@ -988,17 +1008,22 @@ static struct inet_protosw inetsw_array[] =  		.protocol =   IPPROTO_UDP,  		.prot =       &udp_prot,  		.ops =        &inet_dgram_ops, -		.no_check =   UDP_CSUM_DEFAULT,  		.flags =      INET_PROTOSW_PERMANENT,         }, +       { +		.type =       SOCK_DGRAM, +		.protocol =   IPPROTO_ICMP, +		.prot =       &ping_prot, +		.ops =        &inet_dgram_ops, +		.flags =      INET_PROTOSW_REUSE, +       },         {  	       .type =       SOCK_RAW,  	       .protocol =   IPPROTO_IP,	/* wild card */  	       .prot =       &raw_prot,  	       .ops =        &inet_sockraw_ops, -	       .no_check =   UDP_CSUM_DEFAULT,  	       .flags =      INET_PROTOSW_REUSE,         }  }; @@ -1048,13 +1073,11 @@ out:  	return;  out_permanent: -	printk(KERN_ERR "Attempt to override permanent protocol %d.\n", -	       protocol); +	pr_err("Attempt to override permanent protocol %d\n", protocol);  	goto out;  out_illegal: -	printk(KERN_ERR -	       "Ignoring attempt to register invalid socket type %d.\n", +	pr_err("Ignoring attempt to register invalid socket type %d\n",  	       p->type);  	goto out;  } @@ -1063,8 +1086,7 @@ EXPORT_SYMBOL(inet_register_protosw);  void inet_unregister_protosw(struct inet_protosw *p)  {  	if (INET_PROTOSW_PERMANENT & p->flags) { -		printk(KERN_ERR -		       "Attempt to unregister permanent protocol %d.\n", +		pr_err("Attempt to unregister permanent protocol %d\n",  		       p->protocol);  	} else {  		spin_lock_bh(&inetsw_lock); @@ -1085,34 +1107,36 @@ int sysctl_ip_dynaddr __read_mostly;  static int inet_sk_reselect_saddr(struct sock *sk)  {  	struct inet_sock *inet = inet_sk(sk); -	int err; -	struct rtable *rt;  	__be32 old_saddr = inet->inet_saddr; -	__be32 new_saddr;  	__be32 daddr = inet->inet_daddr; +	struct flowi4 *fl4; +	struct rtable *rt; +	__be32 new_saddr; +	struct ip_options_rcu *inet_opt; -	if (inet->opt && inet->opt->srr) -		daddr = inet->opt->faddr; +	inet_opt = rcu_dereference_protected(inet->inet_opt, +					     sock_owned_by_user(sk)); +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr;  	/* Query new route. */ -	err = ip_route_connect(&rt, daddr, 0, -			       RT_CONN_FLAGS(sk), -			       sk->sk_bound_dev_if, -			       sk->sk_protocol, -			       inet->inet_sport, inet->inet_dport, sk, 0); -	if (err) -		return err; +	fl4 = &inet->cork.fl.u.ip4; +	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), +			      sk->sk_bound_dev_if, sk->sk_protocol, +			      inet->inet_sport, inet->inet_dport, sk); +	if (IS_ERR(rt)) +		return PTR_ERR(rt);  	sk_setup_caps(sk, &rt->dst); -	new_saddr = rt->rt_src; +	new_saddr = fl4->saddr;  	if (new_saddr == old_saddr)  		return 0;  	if (sysctl_ip_dynaddr > 1) { -		printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n", -		       __func__, &old_saddr, &new_saddr); +		pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", +			__func__, &old_saddr, &new_saddr);  	}  	inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; @@ -1134,6 +1158,8 @@ int inet_sk_rebuild_header(struct sock *sk)  	struct inet_sock *inet = inet_sk(sk);  	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);  	__be32 daddr; +	struct ip_options_rcu *inet_opt; +	struct flowi4 *fl4;  	int err;  	/* Route is OK, nothing to do. */ @@ -1141,28 +1167,23 @@ int inet_sk_rebuild_header(struct sock *sk)  		return 0;  	/* Reroute. */ +	rcu_read_lock(); +	inet_opt = rcu_dereference(inet->inet_opt);  	daddr = inet->inet_daddr; -	if (inet->opt && inet->opt->srr) -		daddr = inet->opt->faddr; -{ -	struct flowi fl = { -		.oif = sk->sk_bound_dev_if, -		.mark = sk->sk_mark, -		.fl4_dst = daddr, -		.fl4_src = inet->inet_saddr, -		.fl4_tos = RT_CONN_FLAGS(sk), -		.proto = sk->sk_protocol, -		.flags = inet_sk_flowi_flags(sk), -		.fl_ip_sport = inet->inet_sport, -		.fl_ip_dport = inet->inet_dport, -	}; - -	security_sk_classify_flow(sk, &fl); -	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); -} -	if (!err) +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr; +	rcu_read_unlock(); +	fl4 = &inet->cork.fl.u.ip4; +	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, +				   inet->inet_dport, inet->inet_sport, +				   sk->sk_protocol, RT_CONN_FLAGS(sk), +				   sk->sk_bound_dev_if); +	if (!IS_ERR(rt)) { +		err = 0;  		sk_setup_caps(sk, &rt->dst); -	else { +	} else { +		err = PTR_ERR(rt); +  		/* Routing failed... */  		sk->sk_route_caps = 0;  		/* @@ -1182,8 +1203,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);  static int inet_gso_send_check(struct sk_buff *skb)  { -	struct iphdr *iph; -	const struct net_protocol *ops; +	const struct net_offload *ops; +	const struct iphdr *iph;  	int proto;  	int ihl;  	int err = -EINVAL; @@ -1196,46 +1217,55 @@ static int inet_gso_send_check(struct sk_buff *skb)  	if (ihl < sizeof(*iph))  		goto out; +	proto = iph->protocol; + +	/* Warning: after this point, iph might be no longer valid */  	if (unlikely(!pskb_may_pull(skb, ihl)))  		goto out; -  	__skb_pull(skb, ihl); +  	skb_reset_transport_header(skb); -	iph = ip_hdr(skb); -	proto = iph->protocol & (MAX_INET_PROTOS - 1);  	err = -EPROTONOSUPPORT; -	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (likely(ops && ops->gso_send_check)) -		err = ops->gso_send_check(skb); -	rcu_read_unlock(); +	ops = rcu_dereference(inet_offloads[proto]); +	if (likely(ops && ops->callbacks.gso_send_check)) +		err = ops->callbacks.gso_send_check(skb);  out:  	return err;  } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, +					netdev_features_t features)  {  	struct sk_buff *segs = ERR_PTR(-EINVAL); +	const struct net_offload *ops; +	unsigned int offset = 0; +	bool udpfrag, encap;  	struct iphdr *iph; -	const struct net_protocol *ops;  	int proto; +	int nhoff;  	int ihl;  	int id; -	unsigned int offset = 0; - -	if (!(features & NETIF_F_V4_CSUM)) -		features &= ~NETIF_F_SG;  	if (unlikely(skb_shinfo(skb)->gso_type &  		     ~(SKB_GSO_TCPV4 |  		       SKB_GSO_UDP |  		       SKB_GSO_DODGY |  		       SKB_GSO_TCP_ECN | +		       SKB_GSO_GRE | +		       SKB_GSO_GRE_CSUM | +		       SKB_GSO_IPIP | +		       SKB_GSO_SIT | +		       SKB_GSO_TCPV6 | +		       SKB_GSO_UDP_TUNNEL | +		       SKB_GSO_UDP_TUNNEL_CSUM | +		       SKB_GSO_MPLS |  		       0)))  		goto out; +	skb_reset_network_header(skb); +	nhoff = skb_network_header(skb) - skb_mac_header(skb);  	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))  		goto out; @@ -1244,39 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)  	if (ihl < sizeof(*iph))  		goto out; +	id = ntohs(iph->id); +	proto = iph->protocol; + +	/* Warning: after this point, iph might be no longer valid */  	if (unlikely(!pskb_may_pull(skb, ihl)))  		goto out; -  	__skb_pull(skb, ihl); + +	encap = SKB_GSO_CB(skb)->encap_level > 0; +	if (encap) +		features = skb->dev->hw_enc_features & netif_skb_features(skb); +	SKB_GSO_CB(skb)->encap_level += ihl; +  	skb_reset_transport_header(skb); -	iph = ip_hdr(skb); -	id = ntohs(iph->id); -	proto = iph->protocol & (MAX_INET_PROTOS - 1); +  	segs = ERR_PTR(-EPROTONOSUPPORT); -	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (likely(ops && ops->gso_segment)) -		segs = ops->gso_segment(skb, features); -	rcu_read_unlock(); +	if (skb->encapsulation && +	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) +		udpfrag = proto == IPPROTO_UDP && encap; +	else +		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; -	if (!segs || IS_ERR(segs)) +	ops = rcu_dereference(inet_offloads[proto]); +	if (likely(ops && ops->callbacks.gso_segment)) +		segs = ops->callbacks.gso_segment(skb, features); + +	if (IS_ERR_OR_NULL(segs))  		goto out;  	skb = segs;  	do { -		iph = ip_hdr(skb); -		if (proto == IPPROTO_UDP) { +		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); +		if (udpfrag) {  			iph->id = htons(id);  			iph->frag_off = htons(offset >> 3);  			if (skb->next != NULL)  				iph->frag_off |= htons(IP_MF); -			offset += (skb->len - skb->mac_len - iph->ihl * 4); -		} else +			offset += skb->len - nhoff - ihl; +		} else {  			iph->id = htons(id++); -		iph->tot_len = htons(skb->len - skb->mac_len); -		iph->check = 0; -		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); +		} +		iph->tot_len = htons(skb->len - nhoff); +		ip_send_check(iph); +		if (encap) +			skb_reset_inner_headers(skb); +		skb->network_header = (u8 *)iph - skb->head;  	} while ((skb = skb->next));  out: @@ -1286,10 +1330,10 @@ out:  static struct sk_buff **inet_gro_receive(struct sk_buff **head,  					 struct sk_buff *skb)  { -	const struct net_protocol *ops; +	const struct net_offload *ops;  	struct sk_buff **pp = NULL;  	struct sk_buff *p; -	struct iphdr *iph; +	const struct iphdr *iph;  	unsigned int hlen;  	unsigned int off;  	unsigned int id; @@ -1305,21 +1349,21 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  			goto out;  	} -	proto = iph->protocol & (MAX_INET_PROTOS - 1); +	proto = iph->protocol;  	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (!ops || !ops->gro_receive) +	ops = rcu_dereference(inet_offloads[proto]); +	if (!ops || !ops->callbacks.gro_receive)  		goto out_unlock;  	if (*(u8 *)iph != 0x45)  		goto out_unlock; -	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) +	if (unlikely(ip_fast_csum((u8 *)iph, 5)))  		goto out_unlock;  	id = ntohl(*(__be32 *)&iph->id); -	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); +	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));  	id >>= 16;  	for (p = *head; p; p = p->next) { @@ -1328,10 +1372,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  		if (!NAPI_GRO_CB(p)->same_flow)  			continue; -		iph2 = ip_hdr(p); - +		iph2 = (struct iphdr *)(p->data + off); +		/* The above works because, with the exception of the top +		 * (inner most) layer, we only aggregate pkts with the same +		 * hdr length so all the hdrs we'll need to verify will start +		 * at the same offset. +		 */  		if ((iph->protocol ^ iph2->protocol) | -		    (iph->tos ^ iph2->tos) |  		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |  		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {  			NAPI_GRO_CB(p)->same_flow = 0; @@ -1341,16 +1388,29 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  		/* All fields must match except length and checksum. */  		NAPI_GRO_CB(p)->flush |=  			(iph->ttl ^ iph2->ttl) | -			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); +			(iph->tos ^ iph2->tos) | +			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); +		/* Save the IP ID check to be included later when we get to +		 * the transport layer so only the inner most IP ID is checked. +		 * This is because some GSO/TSO implementations do not +		 * correctly increment the IP ID for the outer hdrs. +		 */ +		NAPI_GRO_CB(p)->flush_id = +			    ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);  		NAPI_GRO_CB(p)->flush |= flush;  	}  	NAPI_GRO_CB(skb)->flush |= flush; +	skb_set_network_header(skb, off); +	/* The above will be needed by the transport layer if there is one +	 * immediately following this IP hdr. +	 */ +  	skb_gro_pull(skb, sizeof(*iph));  	skb_set_transport_header(skb, skb_gro_offset(skb)); -	pp = ops->gro_receive(head, skb); +	pp = ops->callbacks.gro_receive(head, skb);  out_unlock:  	rcu_read_unlock(); @@ -1361,23 +1421,30 @@ out:  	return pp;  } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff)  { -	const struct net_protocol *ops; -	struct iphdr *iph = ip_hdr(skb); -	int proto = iph->protocol & (MAX_INET_PROTOS - 1); +	__be16 newlen = htons(skb->len - nhoff); +	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); +	const struct net_offload *ops; +	int proto = iph->protocol;  	int err = -ENOSYS; -	__be16 newlen = htons(skb->len - skb_network_offset(skb)); + +	if (skb->encapsulation) +		skb_set_inner_network_header(skb, nhoff);  	csum_replace2(&iph->check, iph->tot_len, newlen);  	iph->tot_len = newlen;  	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (WARN_ON(!ops || !ops->gro_complete)) +	ops = rcu_dereference(inet_offloads[proto]); +	if (WARN_ON(!ops || !ops->callbacks.gro_complete))  		goto out_unlock; -	err = ops->gro_complete(skb); +	/* Only need to add sizeof(*iph) to get to the next hdr below +	 * because any hdr with option will have been flushed in +	 * inet_gro_receive(). +	 */ +	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));  out_unlock:  	rcu_read_unlock(); @@ -1407,82 +1474,44 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,  }  EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void __percpu *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib, int offt)  {  	unsigned long res = 0;  	int i; -	for_each_possible_cpu(i) { -		res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); -		res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); -	} +	for_each_possible_cpu(i) +		res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);  	return res;  }  EXPORT_SYMBOL_GPL(snmp_fold_field);  #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)  {  	u64 res = 0;  	int cpu;  	for_each_possible_cpu(cpu) { -		void *bhptr, *userptr; +		void *bhptr;  		struct u64_stats_sync *syncp; -		u64 v_bh, v_user; +		u64 v;  		unsigned int start; -		/* first mib used by softirq context, we must use _bh() accessors */ -		bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu); +		bhptr = per_cpu_ptr(mib, cpu);  		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);  		do { -			start = u64_stats_fetch_begin_bh(syncp); -			v_bh = *(((u64 *) bhptr) + offt); -		} while (u64_stats_fetch_retry_bh(syncp, start)); +			start = u64_stats_fetch_begin_irq(syncp); +			v = *(((u64 *) bhptr) + offt); +		} while (u64_stats_fetch_retry_irq(syncp, start)); -		/* second mib used in USER context */ -		userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu); -		syncp = (struct u64_stats_sync *)(userptr + syncp_offset); -		do { -			start = u64_stats_fetch_begin(syncp); -			v_user = *(((u64 *) userptr) + offt); -		} while (u64_stats_fetch_retry(syncp, start)); - -		res += v_bh + v_user; +		res += v;  	}  	return res;  }  EXPORT_SYMBOL_GPL(snmp_fold_field64);  #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) -{ -	BUG_ON(ptr == NULL); -	ptr[0] = __alloc_percpu(mibsize, align); -	if (!ptr[0]) -		goto err0; -	ptr[1] = __alloc_percpu(mibsize, align); -	if (!ptr[1]) -		goto err1; -	return 0; -err1: -	free_percpu(ptr[0]); -	ptr[0] = NULL; -err0: -	return -ENOMEM; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); - -void snmp_mib_free(void __percpu *ptr[2]) -{ -	BUG_ON(ptr == NULL); -	free_percpu(ptr[0]); -	free_percpu(ptr[1]); -	ptr[0] = ptr[1] = NULL; -} -EXPORT_SYMBOL_GPL(snmp_mib_free); -  #ifdef CONFIG_IP_MULTICAST  static const struct net_protocol igmp_protocol = {  	.handler =	igmp_rcv, @@ -1491,90 +1520,91 @@ static const struct net_protocol igmp_protocol = {  #endif  static const struct net_protocol tcp_protocol = { -	.handler =	tcp_v4_rcv, -	.err_handler =	tcp_v4_err, -	.gso_send_check = tcp_v4_gso_send_check, -	.gso_segment =	tcp_tso_segment, -	.gro_receive =	tcp4_gro_receive, -	.gro_complete =	tcp4_gro_complete, -	.no_policy =	1, -	.netns_ok =	1, +	.early_demux	=	tcp_v4_early_demux, +	.handler	=	tcp_v4_rcv, +	.err_handler	=	tcp_v4_err, +	.no_policy	=	1, +	.netns_ok	=	1, +	.icmp_strict_tag_validation = 1,  };  static const struct net_protocol udp_protocol = { +	.early_demux =	udp_v4_early_demux,  	.handler =	udp_rcv,  	.err_handler =	udp_err, -	.gso_send_check = udp4_ufo_send_check, -	.gso_segment = udp4_ufo_fragment,  	.no_policy =	1,  	.netns_ok =	1,  };  static const struct net_protocol icmp_protocol = {  	.handler =	icmp_rcv, +	.err_handler =	icmp_err,  	.no_policy =	1,  	.netns_ok =	1,  };  static __net_init int ipv4_mib_init_net(struct net *net)  { -	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, -			  sizeof(struct tcp_mib), -			  __alignof__(struct tcp_mib)) < 0) +	int i; + +	net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); +	if (!net->mib.tcp_statistics)  		goto err_tcp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, -			  sizeof(struct ipstats_mib), -			  __alignof__(struct ipstats_mib)) < 0) +	net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); +	if (!net->mib.ip_statistics)  		goto err_ip_mib; -	if (snmp_mib_init((void __percpu **)net->mib.net_statistics, -			  sizeof(struct linux_mib), -			  __alignof__(struct linux_mib)) < 0) + +	for_each_possible_cpu(i) { +		struct ipstats_mib *af_inet_stats; +		af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); +		u64_stats_init(&af_inet_stats->syncp); +	} + +	net->mib.net_statistics = alloc_percpu(struct linux_mib); +	if (!net->mib.net_statistics)  		goto err_net_mib; -	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, -			  sizeof(struct udp_mib), -			  __alignof__(struct udp_mib)) < 0) +	net->mib.udp_statistics = alloc_percpu(struct udp_mib); +	if (!net->mib.udp_statistics)  		goto err_udp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, -			  sizeof(struct udp_mib), -			  __alignof__(struct udp_mib)) < 0) +	net->mib.udplite_statistics = alloc_percpu(struct udp_mib); +	if (!net->mib.udplite_statistics)  		goto err_udplite_mib; -	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, -			  sizeof(struct icmp_mib), -			  __alignof__(struct icmp_mib)) < 0) +	net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); +	if (!net->mib.icmp_statistics)  		goto err_icmp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, -			  sizeof(struct icmpmsg_mib), -			  __alignof__(struct icmpmsg_mib)) < 0) +	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), +					      GFP_KERNEL); +	if (!net->mib.icmpmsg_statistics)  		goto err_icmpmsg_mib;  	tcp_mib_init(net);  	return 0;  err_icmpmsg_mib: -	snmp_mib_free((void __percpu **)net->mib.icmp_statistics); +	free_percpu(net->mib.icmp_statistics);  err_icmp_mib: -	snmp_mib_free((void __percpu **)net->mib.udplite_statistics); +	free_percpu(net->mib.udplite_statistics);  err_udplite_mib: -	snmp_mib_free((void __percpu **)net->mib.udp_statistics); +	free_percpu(net->mib.udp_statistics);  err_udp_mib: -	snmp_mib_free((void __percpu **)net->mib.net_statistics); +	free_percpu(net->mib.net_statistics);  err_net_mib: -	snmp_mib_free((void __percpu **)net->mib.ip_statistics); +	free_percpu(net->mib.ip_statistics);  err_ip_mib: -	snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +	free_percpu(net->mib.tcp_statistics);  err_tcp_mib:  	return -ENOMEM;  }  static __net_exit void ipv4_mib_exit_net(struct net *net)  { -	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics); -	snmp_mib_free((void __percpu **)net->mib.icmp_statistics); -	snmp_mib_free((void __percpu **)net->mib.udplite_statistics); -	snmp_mib_free((void __percpu **)net->mib.udp_statistics); -	snmp_mib_free((void __percpu **)net->mib.net_statistics); -	snmp_mib_free((void __percpu **)net->mib.ip_statistics); -	snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +	kfree(net->mib.icmpmsg_statistics); +	free_percpu(net->mib.icmp_statistics); +	free_percpu(net->mib.udplite_statistics); +	free_percpu(net->mib.udp_statistics); +	free_percpu(net->mib.net_statistics); +	free_percpu(net->mib.ip_statistics); +	free_percpu(net->mib.tcp_statistics);  }  static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1587,37 +1617,95 @@ static int __init init_ipv4_mibs(void)  	return register_pernet_subsys(&ipv4_mib_ops);  } +static __net_init int inet_init_net(struct net *net) +{ +	/* +	 * Set defaults for local port range +	 */ +	seqlock_init(&net->ipv4.ip_local_ports.lock); +	net->ipv4.ip_local_ports.range[0] =  32768; +	net->ipv4.ip_local_ports.range[1] =  61000; + +	seqlock_init(&net->ipv4.ping_group_range.lock); +	/* +	 * Sane defaults - nobody may create ping sockets. +	 * Boot scripts should set this to distro-specific group. +	 */ +	net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); +	net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); +	return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { +	.init = inet_init_net, +	.exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ +	return register_pernet_subsys(&af_inet_ops); +} +  static int ipv4_proc_init(void);  /*   *	IP protocol layer initialiser   */ +static struct packet_offload ip_packet_offload __read_mostly = { +	.type = cpu_to_be16(ETH_P_IP), +	.callbacks = { +		.gso_send_check = inet_gso_send_check, +		.gso_segment = inet_gso_segment, +		.gro_receive = inet_gro_receive, +		.gro_complete = inet_gro_complete, +	}, +}; + +static const struct net_offload ipip_offload = { +	.callbacks = { +		.gso_send_check = inet_gso_send_check, +		.gso_segment	= inet_gso_segment, +	}, +}; + +static int __init ipv4_offload_init(void) +{ +	/* +	 * Add offloads +	 */ +	if (udpv4_offload_init() < 0) +		pr_crit("%s: Cannot add UDP protocol offload\n", __func__); +	if (tcpv4_offload_init() < 0) +		pr_crit("%s: Cannot add TCP protocol offload\n", __func__); + +	dev_add_offload(&ip_packet_offload); +	inet_add_offload(&ipip_offload, IPPROTO_IPIP); +	return 0; +} + +fs_initcall(ipv4_offload_init); +  static struct packet_type ip_packet_type __read_mostly = {  	.type = cpu_to_be16(ETH_P_IP),  	.func = ip_rcv, -	.gso_send_check = inet_gso_send_check, -	.gso_segment = inet_gso_segment, -	.gro_receive = inet_gro_receive, -	.gro_complete = inet_gro_complete,  };  static int __init inet_init(void)  { -	struct sk_buff *dummy_skb;  	struct inet_protosw *q;  	struct list_head *r;  	int rc = -EINVAL; -	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); - -	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); -	if (!sysctl_local_reserved_ports) -		goto out; +	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));  	rc = proto_register(&tcp_prot, 1);  	if (rc) -		goto out_free_reserved_ports; +		goto out;  	rc = proto_register(&udp_prot, 1);  	if (rc) @@ -1627,6 +1715,10 @@ static int __init inet_init(void)  	if (rc)  		goto out_unregister_udp_proto; +	rc = proto_register(&ping_prot, 1); +	if (rc) +		goto out_unregister_raw_proto; +  	/*  	 *	Tell SOCKET that we are alive...  	 */ @@ -1642,14 +1734,14 @@ static int __init inet_init(void)  	 */  	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); +		pr_crit("%s: Cannot add ICMP protocol\n", __func__);  	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); +		pr_crit("%s: Cannot add UDP protocol\n", __func__);  	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); +		pr_crit("%s: Cannot add TCP protocol\n", __func__);  #ifdef CONFIG_IP_MULTICAST  	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); +		pr_crit("%s: Cannot add IGMP protocol\n", __func__);  #endif  	/* Register the socket-side information for inet_create. */ @@ -1682,6 +1774,8 @@ static int __init inet_init(void)  	/* Add UDP-Lite (RFC 3828) */  	udplite4_register(); +	ping_init(); +  	/*  	 *	Set the ICMP layer up  	 */ @@ -1694,14 +1788,17 @@ static int __init inet_init(void)  	 */  #if defined(CONFIG_IP_MROUTE)  	if (ip_mr_init()) -		printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n"); +		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);  #endif + +	if (init_inet_pernet_ops()) +		pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);  	/*  	 *	Initialise per-cpu ipv4 mibs  	 */  	if (init_ipv4_mibs()) -		printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); +		pr_crit("%s: Cannot init ipv4 mibs\n", __func__);  	ipv4_proc_init(); @@ -1712,12 +1809,12 @@ static int __init inet_init(void)  	rc = 0;  out:  	return rc; +out_unregister_raw_proto: +	proto_unregister(&raw_prot);  out_unregister_udp_proto:  	proto_unregister(&udp_prot);  out_unregister_tcp_proto:  	proto_unregister(&tcp_prot); -out_free_reserved_ports: -	kfree(sysctl_local_reserved_ports);  	goto out;  } @@ -1736,11 +1833,15 @@ static int __init ipv4_proc_init(void)  		goto out_tcp;  	if (udp4_proc_init())  		goto out_udp; +	if (ping_proc_init()) +		goto out_ping;  	if (ip_misc_proc_init())  		goto out_misc;  out:  	return rc;  out_misc: +	ping_proc_exit(); +out_ping:  	udp4_proc_exit();  out_udp:  	tcp4_proc_exit();  | 
