Diffstat (limited to 'net/ipv4')
143 files changed, 23359 insertions, 19860 deletions
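Several of the Kconfig hunks below touch the pluggable TCP congestion-control modules (dropping their EXPERIMENTAL dependency) and keep "cubic" as the default. As a hedged illustration of the user-visible knob these options control — not part of the diff itself, just a minimal userspace sketch using the standard TCP_CONGESTION socket option — a program can pick one of the built or loaded algorithms per socket:

```c
/* Minimal sketch: select a congestion-control algorithm for one TCP socket.
 * Assumes the chosen module (here "cubic", the default named in the Kconfig
 * hunk below) is built in or loaded; algorithms currently available are
 * listed in /proc/sys/net/ipv4/tcp_available_congestion_control. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
	const char *algo = "cubic";
	char buf[16];
	socklen_t len = sizeof(buf);

	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Ask the kernel to use the named algorithm for this socket. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, algo, strlen(algo)) < 0)
		perror("setsockopt(TCP_CONGESTION)");

	/* Read back what the socket is actually using. */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, &len) == 0)
		printf("congestion control: %.*s\n", (int)len, buf);

	close(fd);
	return 0;
}
```

The same selection can be made system-wide through the net.ipv4.tcp_congestion_control sysctl; the per-socket option above only overrides it for the one connection.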
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9e95d7fb6d5..05c57f0fcab 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -9,10 +9,7 @@ config IP_MULTICAST  	  intend to participate in the MBONE, a high bandwidth network on top  	  of the Internet which carries audio and video broadcasts. More  	  information about the MBONE is on the WWW at -	  <http://www.savetz.com/mbone/>. Information about the multicast -	  capabilities of the various network cards is contained in -	  <file:Documentation/networking/multicast.txt>. For most people, it's -	  safe to say N. +	  <http://www.savetz.com/mbone/>. For most people, it's safe to say N.  config IP_ADVANCED_ROUTER  	bool "IP: advanced router" @@ -55,45 +52,9 @@ config IP_ADVANCED_ROUTER  	  If unsure, say N here. -choice -	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" -	depends on IP_ADVANCED_ROUTER -	default ASK_IP_FIB_HASH - -config ASK_IP_FIB_HASH -	bool "FIB_HASH" -	---help--- -	  Current FIB is very proven and good enough for most users. - -config IP_FIB_TRIE -	bool "FIB_TRIE" -	---help--- -	  Use new experimental LC-trie as FIB lookup algorithm. -	  This improves lookup performance if you have a large -	  number of routes. - -	  LC-trie is a longest matching prefix lookup algorithm which -	  performs better than FIB_HASH for large routing tables. -	  But, it consumes more memory and is more complex. - -	  LC-trie is described in: - -	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson -	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, -	  June 1999 - -	  An experimental study of compression methods for dynamic tries -	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. -	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/> - -endchoice - -config IP_FIB_HASH -	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER -  config IP_FIB_TRIE_STATS  	bool "FIB TRIE statistics" -	depends on IP_FIB_TRIE +	depends on IP_ADVANCED_ROUTER  	---help---  	  Keep track of statistics on structure of FIB TRIE table.  	  Useful for testing and measuring TRIE performance. @@ -140,6 +101,9 @@ config IP_ROUTE_VERBOSE  	  handled by the klogd daemon which is responsible for kernel messages  	  ("man klogd"). +config IP_ROUTE_CLASSID +	bool +  config IP_PNP  	bool "IP: kernel level autoconfiguration"  	help @@ -196,11 +160,10 @@ config IP_PNP_RARP  	  operating on your network. Read  	  <file:Documentation/filesystems/nfs/nfsroot.txt> for details. -# not yet ready.. -#   bool '    IP: ARP support' CONFIG_IP_PNP_ARP  config NET_IPIP  	tristate "IP: tunneling"  	select INET_TUNNEL +	select NET_IP_TUNNEL  	---help---  	  Tunneling means encapsulating data of one protocol type within  	  another protocol and sending it over a channel that understands the @@ -221,9 +184,14 @@ config NET_IPGRE_DEMUX  	 This is helper module to demultiplex GRE packets on GRE version field criteria.  	 Required by ip_gre and pptp modules. +config NET_IP_TUNNEL +	tristate +	default n +  config NET_IPGRE  	tristate "IP: GRE tunnels over IP"  	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX +	select NET_IP_TUNNEL  	help  	  Tunneling means encapsulating data of one protocol type within  	  another protocol and sending it over a channel that understands the @@ -252,10 +220,8 @@ config IP_MROUTE  	  packets that have several destination addresses. It is needed on the  	  MBONE, a high bandwidth network on top of the Internet which carries  	  audio and video broadcasts. 
In order to do that, you would most -	  likely run the program mrouted. Information about the multicast -	  capabilities of the various network cards is contained in -	  <file:Documentation/networking/multicast.txt>. If you haven't heard -	  about it, you don't need it. +	  likely run the program mrouted. If you haven't heard about it, you +	  don't need it.  config IP_MROUTE_MULTIPLE_TABLES  	bool "IP: multicast policy routing" @@ -293,22 +259,6 @@ config IP_PIMSM_V2  	  gated-5). This routing protocol is not used widely, so say N unless  	  you want to play with it. -config ARPD -	bool "IP: ARP daemon support" -	---help--- -	  The kernel maintains an internal cache which maps IP addresses to -	  hardware addresses on the local network, so that Ethernet/Token Ring/ -	  etc. frames are sent to the proper address on the physical networking -	  layer. Normally, kernel uses the ARP protocol to resolve these -	  mappings. - -	  Saying Y here adds support to have an user space daemon to do this -	  resolution instead. This is useful for implementing an alternate -	  address resolution protocol (e.g. NHRP on mGRE tunnels) and also for -	  testing purposes. - -	  If unsure, say N. -  config SYN_COOKIES  	bool "IP: TCP syncookie support"  	---help--- @@ -345,9 +295,21 @@ config SYN_COOKIES  	  If unsure, say N. +config NET_IPVTI +	tristate "Virtual (secure) IP: tunneling" +	select INET_TUNNEL +	select NET_IP_TUNNEL +	depends on INET_XFRM_MODE_TUNNEL +	---help--- +	  Tunneling means encapsulating data of one protocol type within +	  another protocol and sending it over a channel that understands the +	  encapsulating protocol. This can be used with xfrm mode tunnel to give +	  the notion of a secure tunnel for IPSEC and then use routing protocol +	  on top. +  config INET_AH  	tristate "IP: AH transformation" -	select XFRM +	select XFRM_ALGO  	select CRYPTO  	select CRYPTO_HMAC  	select CRYPTO_MD5 @@ -359,7 +321,7 @@ config INET_AH  config INET_ESP  	tristate "IP: ESP transformation" -	select XFRM +	select XFRM_ALGO  	select CRYPTO  	select CRYPTO_AUTHENC  	select CRYPTO_HMAC @@ -432,7 +394,9 @@ config INET_DIAG  	---help---  	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by  	  native Linux tools such as ss. ss is included in iproute2, currently -	  downloadable at <http://linux-net.osdl.org/index.php/Iproute2>. +	  downloadable at: +	   +	    http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2  	  If unsure, say Y. @@ -440,6 +404,14 @@ config INET_TCP_DIAG  	depends on INET_DIAG  	def_tristate INET_DIAG +config INET_UDP_DIAG +	tristate "UDP: socket monitoring interface" +	depends on INET_DIAG && (IPV6 || IPV6=n) +	default n +	---help--- +	  Support for UDP socket monitoring interface used by the ss tool. +	  If unsure, say Y. +  menuconfig TCP_CONG_ADVANCED  	bool "TCP: advanced congestion control"  	---help--- @@ -502,7 +474,6 @@ config TCP_CONG_HTCP  config TCP_CONG_HSTCP  	tristate "High Speed TCP" -	depends on EXPERIMENTAL  	default n  	---help---  	Sally Floyd's High Speed TCP (RFC 3649) congestion control. 
@@ -513,7 +484,6 @@ config TCP_CONG_HSTCP  config TCP_CONG_HYBLA  	tristate "TCP-Hybla congestion control algorithm" -	depends on EXPERIMENTAL  	default n  	---help---  	TCP-Hybla is a sender-side only change that eliminates penalization of @@ -523,7 +493,6 @@ config TCP_CONG_HYBLA  config TCP_CONG_VEGAS  	tristate "TCP Vegas" -	depends on EXPERIMENTAL  	default n  	---help---  	TCP Vegas is a sender-side only change to TCP that anticipates @@ -534,7 +503,6 @@ config TCP_CONG_VEGAS  config TCP_CONG_SCALABLE  	tristate "Scalable TCP" -	depends on EXPERIMENTAL  	default n  	---help---  	Scalable TCP is a sender-side only change to TCP which uses a @@ -544,7 +512,6 @@ config TCP_CONG_SCALABLE  config TCP_CONG_LP  	tristate "TCP Low Priority" -	depends on EXPERIMENTAL  	default n  	---help---  	TCP Low Priority (TCP-LP), a distributed algorithm whose goal is @@ -554,7 +521,6 @@ config TCP_CONG_LP  config TCP_CONG_VENO  	tristate "TCP Veno" -	depends on EXPERIMENTAL  	default n  	---help---  	TCP Veno is a sender-side only enhancement of TCP to obtain better @@ -566,7 +532,6 @@ config TCP_CONG_VENO  config TCP_CONG_YEAH  	tristate "YeAH TCP" -	depends on EXPERIMENTAL  	select TCP_CONG_VEGAS  	default n  	---help--- @@ -581,7 +546,6 @@ config TCP_CONG_YEAH  config TCP_CONG_ILLINOIS  	tristate "TCP Illinois" -	depends on EXPERIMENTAL  	default n  	---help---  	TCP-Illinois is a sender-side modification of TCP Reno for @@ -645,8 +609,7 @@ config DEFAULT_TCP_CONG  	default "cubic"  config TCP_MD5SIG -	bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)" -	depends on EXPERIMENTAL +	bool "TCP: MD5 Signature Option support (RFC2385)"  	select CRYPTO  	select CRYPTO_MD5  	---help--- @@ -655,4 +618,3 @@ config TCP_MD5SIG  	  on the Internet.  	  If unsure, say N. 
- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4978d22f9a7..f032688d20d 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,21 +7,22 @@ obj-y     := route.o inetpeer.o protocol.o \  	     ip_output.o ip_sockglue.o inet_hashtables.o \  	     inet_timewait_sock.o inet_connection_sock.o \  	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ -	     tcp_minisocks.o tcp_cong.o \ -	     datagram.o raw.o udp.o udplite.o \ -	     arp.o icmp.o devinet.o af_inet.o  igmp.o \ -	     fib_frontend.o fib_semantics.o \ -	     inet_fragment.o +	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ +	     tcp_offload.o datagram.o raw.o udp.o udplite.o \ +	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ +	     fib_frontend.o fib_semantics.o fib_trie.o \ +	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o +obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o  obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o -obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o -obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o  obj-$(CONFIG_PROC_FS) += proc.o  obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o  obj-$(CONFIG_IP_MROUTE) += ipmr.o  obj-$(CONFIG_NET_IPIP) += ipip.o +gre-y := gre_demux.o  obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o  obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_NET_IPVTI) += ip_vti.o  obj-$(CONFIG_SYN_COOKIES) += syncookies.o  obj-$(CONFIG_INET_AH) += ah4.o  obj-$(CONFIG_INET_ESP) += esp4.o @@ -36,6 +37,7 @@ obj-$(CONFIG_IP_PNP) += ipconfig.o  obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/  obj-$(CONFIG_INET_DIAG) += inet_diag.o   obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o  obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o  obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o  obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o @@ -49,7 +51,8 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o  obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o  obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o  obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o +obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o  obj-$(CONFIG_NETLABEL) += cipso_ipv4.o  obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ -		      xfrm4_output.o +		      xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2b61107df6..d156b3c5f36 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -65,6 +65,8 @@   *		2 of the License, or (at your option) any later version.   
*/ +#define pr_fmt(fmt) "IPv4: " fmt +  #include <linux/err.h>  #include <linux/errno.h>  #include <linux/types.h> @@ -89,7 +91,6 @@  #include <linux/slab.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/inet.h>  #include <linux/igmp.h> @@ -105,14 +106,15 @@  #include <net/tcp.h>  #include <net/udp.h>  #include <net/udplite.h> +#include <net/ping.h>  #include <linux/skbuff.h>  #include <net/sock.h>  #include <net/raw.h>  #include <net/icmp.h> -#include <net/ipip.h>  #include <net/inet_common.h>  #include <net/xfrm.h>  #include <net/net_namespace.h> +#include <net/secure_seq.h>  #ifdef CONFIG_IP_MROUTE  #include <linux/mroute.h>  #endif @@ -124,9 +126,6 @@  static struct list_head inetsw[SOCK_MAX];  static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); -  /* New destruction routine */  void inet_sock_destruct(struct sock *sk) @@ -153,8 +152,9 @@ void inet_sock_destruct(struct sock *sk)  	WARN_ON(sk->sk_wmem_queued);  	WARN_ON(sk->sk_forward_alloc); -	kfree(inet->opt); +	kfree(rcu_dereference_protected(inet->inet_opt, 1));  	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); +	dst_release(sk->sk_rx_dst);  	sk_refcnt_debug_dec(sk);  }  EXPORT_SYMBOL(inet_sock_destruct); @@ -209,6 +209,26 @@ int inet_listen(struct socket *sock, int backlog)  	 * we can only allow the backlog to be adjusted.  	 */  	if (old_state != TCP_LISTEN) { +		/* Check special setups for testing purpose to enable TFO w/o +		 * requiring TCP_FASTOPEN sockopt. +		 * Note that only TCP sockets (SOCK_STREAM) will reach here. +		 * Also fastopenq may already been allocated because this +		 * socket was in TCP_LISTEN state previously but was +		 * shutdown() (rather than close()). +		 */ +		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && +		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { +			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) +				err = fastopen_init_queue(sk, backlog); +			else if ((sysctl_tcp_fastopen & +				  TFO_SERVER_WO_SOCKOPT2) != 0) +				err = fastopen_init_queue(sk, +				    ((uint)sysctl_tcp_fastopen) >> 16); +			else +				err = 0; +			if (err) +				goto out; +		}  		err = inet_csk_listen_start(sk, backlog);  		if (err)  			goto out; @@ -222,41 +242,6 @@ out:  }  EXPORT_SYMBOL(inet_listen); -u32 inet_ehash_secret __read_mostly; -EXPORT_SYMBOL(inet_ehash_secret); - -/* - * inet_ehash_secret must be set exactly once - */ -void build_ehash_secret(void) -{ -	u32 rnd; - -	do { -		get_random_bytes(&rnd, sizeof(rnd)); -	} while (rnd == 0); - -	cmpxchg(&inet_ehash_secret, 0, rnd); -} -EXPORT_SYMBOL(build_ehash_secret); - -static inline int inet_netns_ok(struct net *net, int protocol) -{ -	int hash; -	const struct net_protocol *ipprot; - -	if (net_eq(net, &init_net)) -		return 1; - -	hash = protocol & (MAX_INET_PROTOS - 1); -	ipprot = rcu_dereference(inet_protos[hash]); - -	if (ipprot == NULL) -		/* raw IP is OK */ -		return 1; -	return ipprot->netns_ok; -} -  /*   *	Create an inet socket.   */ @@ -269,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,  	struct inet_sock *inet;  	struct proto *answer_prot;  	unsigned char answer_flags; -	char answer_no_check;  	int try_loading_module = 0;  	int err; -	if (unlikely(!inet_ehash_secret)) -		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) -			build_ehash_secret(); -  	sock->state = SS_UNCONNECTED;  	/* Look for the requested type/protocol pair. 
*/ @@ -325,16 +305,12 @@ lookup_protocol:  	}  	err = -EPERM; -	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) -		goto out_rcu_unlock; - -	err = -EAFNOSUPPORT; -	if (!inet_netns_ok(net, protocol)) +	if (sock->type == SOCK_RAW && !kern && +	    !ns_capable(net->user_ns, CAP_NET_RAW))  		goto out_rcu_unlock;  	sock->ops = answer->ops;  	answer_prot = answer->prot; -	answer_no_check = answer->no_check;  	answer_flags = answer->flags;  	rcu_read_unlock(); @@ -346,9 +322,8 @@ lookup_protocol:  		goto out;  	err = 0; -	sk->sk_no_check = answer_no_check;  	if (INET_PROTOSW_REUSE & answer_flags) -		sk->sk_reuse = 1; +		sk->sk_reuse = SK_CAN_REUSE;  	inet = inet_sk(sk);  	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; @@ -361,7 +336,7 @@ lookup_protocol:  			inet->hdrincl = 1;  	} -	if (ipv4_config.no_pmtu_disc) +	if (net->ipv4.sysctl_ip_no_pmtu_disc)  		inet->pmtudisc = IP_PMTUDISC_DONT;  	else  		inet->pmtudisc = IP_PMTUDISC_WANT; @@ -380,6 +355,7 @@ lookup_protocol:  	inet->mc_all	= 1;  	inet->mc_index	= 0;  	inet->mc_list	= NULL; +	inet->rcv_tos	= 0;  	sk_refcnt_debug_inc(sk); @@ -451,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;  	struct sock *sk = sock->sk;  	struct inet_sock *inet = inet_sk(sk); +	struct net *net = sock_net(sk);  	unsigned short snum;  	int chk_addr_ret;  	int err; @@ -464,7 +441,17 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	if (addr_len < sizeof(struct sockaddr_in))  		goto out; -	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); +	if (addr->sin_family != AF_INET) { +		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) +		 * only if s_addr is INADDR_ANY. +		 */ +		err = -EAFNOSUPPORT; +		if (addr->sin_family != AF_UNSPEC || +		    addr->sin_addr.s_addr != htonl(INADDR_ANY)) +			goto out; +	} + +	chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);  	/* Not specified by any standard per-se, however it breaks too  	 * many applications when removed.  It is unfortunate since @@ -484,7 +471,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)  	snum = ntohs(addr->sin_port);  	err = -EACCES; -	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) +	if (snum && snum < PROT_SOCK && +	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))  		goto out;  	/*      We keep a pair of addresses. rcv_saddr is the one @@ -528,7 +516,7 @@ out:  }  EXPORT_SYMBOL(inet_bind); -int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,  		       int addr_len, int flags)  {  	struct sock *sk = sock->sk; @@ -540,15 +528,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,  	if (!inet_sk(sk)->inet_num && inet_autobind(sk))  		return -EAGAIN; -	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); +	return sk->sk_prot->connect(sk, uaddr, addr_len);  }  EXPORT_SYMBOL(inet_dgram_connect); -static long inet_wait_for_connect(struct sock *sk, long timeo) +static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)  {  	DEFINE_WAIT(wait);  	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); +	sk->sk_write_pending += writebias;  	/* Basic assumption: if someone sets sk->sk_err, he _must_  	 * change state of the socket from TCP_SYN_*. 
@@ -564,6 +553,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)  		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);  	}  	finish_wait(sk_sleep(sk), &wait); +	sk->sk_write_pending -= writebias;  	return timeo;  } @@ -571,8 +561,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)   *	Connect to a remote host. There is regrettably still a little   *	TCP 'magic' in here.   */ -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, -			int addr_len, int flags) +int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +			  int addr_len, int flags)  {  	struct sock *sk = sock->sk;  	int err; @@ -581,8 +571,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	if (addr_len < sizeof(uaddr->sa_family))  		return -EINVAL; -	lock_sock(sk); -  	if (uaddr->sa_family == AF_UNSPEC) {  		err = sk->sk_prot->disconnect(sk, flags);  		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; @@ -622,8 +610,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);  	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { +		int writebias = (sk->sk_protocol == IPPROTO_TCP) && +				tcp_sk(sk)->fastopen_req && +				tcp_sk(sk)->fastopen_req->data ? 1 : 0; +  		/* Error code is set above */ -		if (!timeo || !inet_wait_for_connect(sk, timeo)) +		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))  			goto out;  		err = sock_intr_errno(timeo); @@ -645,7 +637,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,  	sock->state = SS_CONNECTED;  	err = 0;  out: -	release_sock(sk);  	return err;  sock_error: @@ -655,6 +646,18 @@ sock_error:  		sock->state = SS_DISCONNECTING;  	goto out;  } +EXPORT_SYMBOL(__inet_stream_connect); + +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +			int addr_len, int flags) +{ +	int err; + +	lock_sock(sock->sk); +	err = __inet_stream_connect(sock, uaddr, addr_len, flags); +	release_sock(sock->sk); +	return err; +}  EXPORT_SYMBOL(inet_stream_connect);  /* @@ -672,8 +675,10 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)  	lock_sock(sk2); +	sock_rps_record_flow(sk2);  	WARN_ON(!((1 << sk2->sk_state) & -		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); +		  (TCPF_ESTABLISHED | TCPF_SYN_RECV | +		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));  	sock_graft(sk2, newsock); @@ -880,6 +885,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)  }  EXPORT_SYMBOL(inet_ioctl); +#ifdef CONFIG_COMPAT +static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ +	struct sock *sk = sock->sk; +	int err = -ENOIOCTLCMD; + +	if (sk->sk_prot->compat_ioctl) +		err = sk->sk_prot->compat_ioctl(sk, cmd, arg); + +	return err; +} +#endif +  const struct proto_ops inet_stream_ops = {  	.family		   = PF_INET,  	.owner		   = THIS_MODULE, @@ -903,6 +921,7 @@ const struct proto_ops inet_stream_ops = {  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_sock_common_setsockopt,  	.compat_getsockopt = compat_sock_common_getsockopt, +	.compat_ioctl	   = inet_compat_ioctl,  #endif  };  EXPORT_SYMBOL(inet_stream_ops); @@ -929,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_sock_common_setsockopt,  	.compat_getsockopt = compat_sock_common_getsockopt, +	.compat_ioctl	   = inet_compat_ioctl,  #endif  };  EXPORT_SYMBOL(inet_dgram_ops); @@ -959,6 +979,7 @@ static const struct proto_ops 
inet_sockraw_ops = {  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_sock_common_setsockopt,  	.compat_getsockopt = compat_sock_common_getsockopt, +	.compat_ioctl	   = inet_compat_ioctl,  #endif  }; @@ -978,7 +999,6 @@ static struct inet_protosw inetsw_array[] =  		.protocol =   IPPROTO_TCP,  		.prot =       &tcp_prot,  		.ops =        &inet_stream_ops, -		.no_check =   0,  		.flags =      INET_PROTOSW_PERMANENT |  			      INET_PROTOSW_ICSK,  	}, @@ -988,17 +1008,22 @@ static struct inet_protosw inetsw_array[] =  		.protocol =   IPPROTO_UDP,  		.prot =       &udp_prot,  		.ops =        &inet_dgram_ops, -		.no_check =   UDP_CSUM_DEFAULT,  		.flags =      INET_PROTOSW_PERMANENT,         }, +       { +		.type =       SOCK_DGRAM, +		.protocol =   IPPROTO_ICMP, +		.prot =       &ping_prot, +		.ops =        &inet_dgram_ops, +		.flags =      INET_PROTOSW_REUSE, +       },         {  	       .type =       SOCK_RAW,  	       .protocol =   IPPROTO_IP,	/* wild card */  	       .prot =       &raw_prot,  	       .ops =        &inet_sockraw_ops, -	       .no_check =   UDP_CSUM_DEFAULT,  	       .flags =      INET_PROTOSW_REUSE,         }  }; @@ -1048,13 +1073,11 @@ out:  	return;  out_permanent: -	printk(KERN_ERR "Attempt to override permanent protocol %d.\n", -	       protocol); +	pr_err("Attempt to override permanent protocol %d\n", protocol);  	goto out;  out_illegal: -	printk(KERN_ERR -	       "Ignoring attempt to register invalid socket type %d.\n", +	pr_err("Ignoring attempt to register invalid socket type %d\n",  	       p->type);  	goto out;  } @@ -1063,8 +1086,7 @@ EXPORT_SYMBOL(inet_register_protosw);  void inet_unregister_protosw(struct inet_protosw *p)  {  	if (INET_PROTOSW_PERMANENT & p->flags) { -		printk(KERN_ERR -		       "Attempt to unregister permanent protocol %d.\n", +		pr_err("Attempt to unregister permanent protocol %d\n",  		       p->protocol);  	} else {  		spin_lock_bh(&inetsw_lock); @@ -1085,34 +1107,36 @@ int sysctl_ip_dynaddr __read_mostly;  static int inet_sk_reselect_saddr(struct sock *sk)  {  	struct inet_sock *inet = inet_sk(sk); -	int err; -	struct rtable *rt;  	__be32 old_saddr = inet->inet_saddr; -	__be32 new_saddr;  	__be32 daddr = inet->inet_daddr; +	struct flowi4 *fl4; +	struct rtable *rt; +	__be32 new_saddr; +	struct ip_options_rcu *inet_opt; -	if (inet->opt && inet->opt->srr) -		daddr = inet->opt->faddr; +	inet_opt = rcu_dereference_protected(inet->inet_opt, +					     sock_owned_by_user(sk)); +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr;  	/* Query new route. 
*/ -	err = ip_route_connect(&rt, daddr, 0, -			       RT_CONN_FLAGS(sk), -			       sk->sk_bound_dev_if, -			       sk->sk_protocol, -			       inet->inet_sport, inet->inet_dport, sk, 0); -	if (err) -		return err; +	fl4 = &inet->cork.fl.u.ip4; +	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), +			      sk->sk_bound_dev_if, sk->sk_protocol, +			      inet->inet_sport, inet->inet_dport, sk); +	if (IS_ERR(rt)) +		return PTR_ERR(rt);  	sk_setup_caps(sk, &rt->dst); -	new_saddr = rt->rt_src; +	new_saddr = fl4->saddr;  	if (new_saddr == old_saddr)  		return 0;  	if (sysctl_ip_dynaddr > 1) { -		printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n", -		       __func__, &old_saddr, &new_saddr); +		pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", +			__func__, &old_saddr, &new_saddr);  	}  	inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; @@ -1134,6 +1158,8 @@ int inet_sk_rebuild_header(struct sock *sk)  	struct inet_sock *inet = inet_sk(sk);  	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);  	__be32 daddr; +	struct ip_options_rcu *inet_opt; +	struct flowi4 *fl4;  	int err;  	/* Route is OK, nothing to do. */ @@ -1141,28 +1167,23 @@ int inet_sk_rebuild_header(struct sock *sk)  		return 0;  	/* Reroute. */ +	rcu_read_lock(); +	inet_opt = rcu_dereference(inet->inet_opt);  	daddr = inet->inet_daddr; -	if (inet->opt && inet->opt->srr) -		daddr = inet->opt->faddr; -{ -	struct flowi fl = { -		.oif = sk->sk_bound_dev_if, -		.mark = sk->sk_mark, -		.fl4_dst = daddr, -		.fl4_src = inet->inet_saddr, -		.fl4_tos = RT_CONN_FLAGS(sk), -		.proto = sk->sk_protocol, -		.flags = inet_sk_flowi_flags(sk), -		.fl_ip_sport = inet->inet_sport, -		.fl_ip_dport = inet->inet_dport, -	}; - -	security_sk_classify_flow(sk, &fl); -	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); -} -	if (!err) +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr; +	rcu_read_unlock(); +	fl4 = &inet->cork.fl.u.ip4; +	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, +				   inet->inet_dport, inet->inet_sport, +				   sk->sk_protocol, RT_CONN_FLAGS(sk), +				   sk->sk_bound_dev_if); +	if (!IS_ERR(rt)) { +		err = 0;  		sk_setup_caps(sk, &rt->dst); -	else { +	} else { +		err = PTR_ERR(rt); +  		/* Routing failed... 
*/  		sk->sk_route_caps = 0;  		/* @@ -1182,8 +1203,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);  static int inet_gso_send_check(struct sk_buff *skb)  { -	struct iphdr *iph; -	const struct net_protocol *ops; +	const struct net_offload *ops; +	const struct iphdr *iph;  	int proto;  	int ihl;  	int err = -EINVAL; @@ -1196,46 +1217,55 @@ static int inet_gso_send_check(struct sk_buff *skb)  	if (ihl < sizeof(*iph))  		goto out; +	proto = iph->protocol; + +	/* Warning: after this point, iph might be no longer valid */  	if (unlikely(!pskb_may_pull(skb, ihl)))  		goto out; -  	__skb_pull(skb, ihl); +  	skb_reset_transport_header(skb); -	iph = ip_hdr(skb); -	proto = iph->protocol & (MAX_INET_PROTOS - 1);  	err = -EPROTONOSUPPORT; -	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (likely(ops && ops->gso_send_check)) -		err = ops->gso_send_check(skb); -	rcu_read_unlock(); +	ops = rcu_dereference(inet_offloads[proto]); +	if (likely(ops && ops->callbacks.gso_send_check)) +		err = ops->callbacks.gso_send_check(skb);  out:  	return err;  } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, +					netdev_features_t features)  {  	struct sk_buff *segs = ERR_PTR(-EINVAL); +	const struct net_offload *ops; +	unsigned int offset = 0; +	bool udpfrag, encap;  	struct iphdr *iph; -	const struct net_protocol *ops;  	int proto; +	int nhoff;  	int ihl;  	int id; -	unsigned int offset = 0; - -	if (!(features & NETIF_F_V4_CSUM)) -		features &= ~NETIF_F_SG;  	if (unlikely(skb_shinfo(skb)->gso_type &  		     ~(SKB_GSO_TCPV4 |  		       SKB_GSO_UDP |  		       SKB_GSO_DODGY |  		       SKB_GSO_TCP_ECN | +		       SKB_GSO_GRE | +		       SKB_GSO_GRE_CSUM | +		       SKB_GSO_IPIP | +		       SKB_GSO_SIT | +		       SKB_GSO_TCPV6 | +		       SKB_GSO_UDP_TUNNEL | +		       SKB_GSO_UDP_TUNNEL_CSUM | +		       SKB_GSO_MPLS |  		       0)))  		goto out; +	skb_reset_network_header(skb); +	nhoff = skb_network_header(skb) - skb_mac_header(skb);  	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))  		goto out; @@ -1244,39 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)  	if (ihl < sizeof(*iph))  		goto out; +	id = ntohs(iph->id); +	proto = iph->protocol; + +	/* Warning: after this point, iph might be no longer valid */  	if (unlikely(!pskb_may_pull(skb, ihl)))  		goto out; -  	__skb_pull(skb, ihl); + +	encap = SKB_GSO_CB(skb)->encap_level > 0; +	if (encap) +		features = skb->dev->hw_enc_features & netif_skb_features(skb); +	SKB_GSO_CB(skb)->encap_level += ihl; +  	skb_reset_transport_header(skb); -	iph = ip_hdr(skb); -	id = ntohs(iph->id); -	proto = iph->protocol & (MAX_INET_PROTOS - 1); +  	segs = ERR_PTR(-EPROTONOSUPPORT); -	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (likely(ops && ops->gso_segment)) -		segs = ops->gso_segment(skb, features); -	rcu_read_unlock(); +	if (skb->encapsulation && +	    skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) +		udpfrag = proto == IPPROTO_UDP && encap; +	else +		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; -	if (!segs || IS_ERR(segs)) +	ops = rcu_dereference(inet_offloads[proto]); +	if (likely(ops && ops->callbacks.gso_segment)) +		segs = ops->callbacks.gso_segment(skb, features); + +	if (IS_ERR_OR_NULL(segs))  		goto out;  	skb = segs;  	do { -		iph = ip_hdr(skb); -		if (proto == IPPROTO_UDP) { +		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); +		if (udpfrag) {  			iph->id = htons(id);  			iph->frag_off = 
htons(offset >> 3);  			if (skb->next != NULL)  				iph->frag_off |= htons(IP_MF); -			offset += (skb->len - skb->mac_len - iph->ihl * 4); -		} else +			offset += skb->len - nhoff - ihl; +		} else {  			iph->id = htons(id++); -		iph->tot_len = htons(skb->len - skb->mac_len); -		iph->check = 0; -		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); +		} +		iph->tot_len = htons(skb->len - nhoff); +		ip_send_check(iph); +		if (encap) +			skb_reset_inner_headers(skb); +		skb->network_header = (u8 *)iph - skb->head;  	} while ((skb = skb->next));  out: @@ -1286,10 +1330,10 @@ out:  static struct sk_buff **inet_gro_receive(struct sk_buff **head,  					 struct sk_buff *skb)  { -	const struct net_protocol *ops; +	const struct net_offload *ops;  	struct sk_buff **pp = NULL;  	struct sk_buff *p; -	struct iphdr *iph; +	const struct iphdr *iph;  	unsigned int hlen;  	unsigned int off;  	unsigned int id; @@ -1305,21 +1349,21 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  			goto out;  	} -	proto = iph->protocol & (MAX_INET_PROTOS - 1); +	proto = iph->protocol;  	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (!ops || !ops->gro_receive) +	ops = rcu_dereference(inet_offloads[proto]); +	if (!ops || !ops->callbacks.gro_receive)  		goto out_unlock;  	if (*(u8 *)iph != 0x45)  		goto out_unlock; -	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) +	if (unlikely(ip_fast_csum((u8 *)iph, 5)))  		goto out_unlock;  	id = ntohl(*(__be32 *)&iph->id); -	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); +	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));  	id >>= 16;  	for (p = *head; p; p = p->next) { @@ -1328,10 +1372,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  		if (!NAPI_GRO_CB(p)->same_flow)  			continue; -		iph2 = ip_hdr(p); - +		iph2 = (struct iphdr *)(p->data + off); +		/* The above works because, with the exception of the top +		 * (inner most) layer, we only aggregate pkts with the same +		 * hdr length so all the hdrs we'll need to verify will start +		 * at the same offset. +		 */  		if ((iph->protocol ^ iph2->protocol) | -		    (iph->tos ^ iph2->tos) |  		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |  		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {  			NAPI_GRO_CB(p)->same_flow = 0; @@ -1341,16 +1388,29 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,  		/* All fields must match except length and checksum. */  		NAPI_GRO_CB(p)->flush |=  			(iph->ttl ^ iph2->ttl) | -			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); +			(iph->tos ^ iph2->tos) | +			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); +		/* Save the IP ID check to be included later when we get to +		 * the transport layer so only the inner most IP ID is checked. +		 * This is because some GSO/TSO implementations do not +		 * correctly increment the IP ID for the outer hdrs. +		 */ +		NAPI_GRO_CB(p)->flush_id = +			    ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);  		NAPI_GRO_CB(p)->flush |= flush;  	}  	NAPI_GRO_CB(skb)->flush |= flush; +	skb_set_network_header(skb, off); +	/* The above will be needed by the transport layer if there is one +	 * immediately following this IP hdr. 
+	 */ +  	skb_gro_pull(skb, sizeof(*iph));  	skb_set_transport_header(skb, skb_gro_offset(skb)); -	pp = ops->gro_receive(head, skb); +	pp = ops->callbacks.gro_receive(head, skb);  out_unlock:  	rcu_read_unlock(); @@ -1361,23 +1421,30 @@ out:  	return pp;  } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff)  { -	const struct net_protocol *ops; -	struct iphdr *iph = ip_hdr(skb); -	int proto = iph->protocol & (MAX_INET_PROTOS - 1); +	__be16 newlen = htons(skb->len - nhoff); +	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); +	const struct net_offload *ops; +	int proto = iph->protocol;  	int err = -ENOSYS; -	__be16 newlen = htons(skb->len - skb_network_offset(skb)); + +	if (skb->encapsulation) +		skb_set_inner_network_header(skb, nhoff);  	csum_replace2(&iph->check, iph->tot_len, newlen);  	iph->tot_len = newlen;  	rcu_read_lock(); -	ops = rcu_dereference(inet_protos[proto]); -	if (WARN_ON(!ops || !ops->gro_complete)) +	ops = rcu_dereference(inet_offloads[proto]); +	if (WARN_ON(!ops || !ops->callbacks.gro_complete))  		goto out_unlock; -	err = ops->gro_complete(skb); +	/* Only need to add sizeof(*iph) to get to the next hdr below +	 * because any hdr with option will have been flushed in +	 * inet_gro_receive(). +	 */ +	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));  out_unlock:  	rcu_read_unlock(); @@ -1407,82 +1474,44 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,  }  EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void __percpu *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib, int offt)  {  	unsigned long res = 0;  	int i; -	for_each_possible_cpu(i) { -		res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); -		res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); -	} +	for_each_possible_cpu(i) +		res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);  	return res;  }  EXPORT_SYMBOL_GPL(snmp_fold_field);  #if BITS_PER_LONG==32 -u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)  {  	u64 res = 0;  	int cpu;  	for_each_possible_cpu(cpu) { -		void *bhptr, *userptr; +		void *bhptr;  		struct u64_stats_sync *syncp; -		u64 v_bh, v_user; +		u64 v;  		unsigned int start; -		/* first mib used by softirq context, we must use _bh() accessors */ -		bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu); +		bhptr = per_cpu_ptr(mib, cpu);  		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);  		do { -			start = u64_stats_fetch_begin_bh(syncp); -			v_bh = *(((u64 *) bhptr) + offt); -		} while (u64_stats_fetch_retry_bh(syncp, start)); +			start = u64_stats_fetch_begin_irq(syncp); +			v = *(((u64 *) bhptr) + offt); +		} while (u64_stats_fetch_retry_irq(syncp, start)); -		/* second mib used in USER context */ -		userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu); -		syncp = (struct u64_stats_sync *)(userptr + syncp_offset); -		do { -			start = u64_stats_fetch_begin(syncp); -			v_user = *(((u64 *) userptr) + offt); -		} while (u64_stats_fetch_retry(syncp, start)); - -		res += v_bh + v_user; +		res += v;  	}  	return res;  }  EXPORT_SYMBOL_GPL(snmp_fold_field64);  #endif -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) -{ -	BUG_ON(ptr == NULL); -	ptr[0] = __alloc_percpu(mibsize, align); -	if (!ptr[0]) -		goto err0; -	ptr[1] = __alloc_percpu(mibsize, align); -	if (!ptr[1]) -		goto err1; -	return 0; -err1: -	free_percpu(ptr[0]); -	
ptr[0] = NULL; -err0: -	return -ENOMEM; -} -EXPORT_SYMBOL_GPL(snmp_mib_init); - -void snmp_mib_free(void __percpu *ptr[2]) -{ -	BUG_ON(ptr == NULL); -	free_percpu(ptr[0]); -	free_percpu(ptr[1]); -	ptr[0] = ptr[1] = NULL; -} -EXPORT_SYMBOL_GPL(snmp_mib_free); -  #ifdef CONFIG_IP_MULTICAST  static const struct net_protocol igmp_protocol = {  	.handler =	igmp_rcv, @@ -1491,90 +1520,91 @@ static const struct net_protocol igmp_protocol = {  #endif  static const struct net_protocol tcp_protocol = { -	.handler =	tcp_v4_rcv, -	.err_handler =	tcp_v4_err, -	.gso_send_check = tcp_v4_gso_send_check, -	.gso_segment =	tcp_tso_segment, -	.gro_receive =	tcp4_gro_receive, -	.gro_complete =	tcp4_gro_complete, -	.no_policy =	1, -	.netns_ok =	1, +	.early_demux	=	tcp_v4_early_demux, +	.handler	=	tcp_v4_rcv, +	.err_handler	=	tcp_v4_err, +	.no_policy	=	1, +	.netns_ok	=	1, +	.icmp_strict_tag_validation = 1,  };  static const struct net_protocol udp_protocol = { +	.early_demux =	udp_v4_early_demux,  	.handler =	udp_rcv,  	.err_handler =	udp_err, -	.gso_send_check = udp4_ufo_send_check, -	.gso_segment = udp4_ufo_fragment,  	.no_policy =	1,  	.netns_ok =	1,  };  static const struct net_protocol icmp_protocol = {  	.handler =	icmp_rcv, +	.err_handler =	icmp_err,  	.no_policy =	1,  	.netns_ok =	1,  };  static __net_init int ipv4_mib_init_net(struct net *net)  { -	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, -			  sizeof(struct tcp_mib), -			  __alignof__(struct tcp_mib)) < 0) +	int i; + +	net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); +	if (!net->mib.tcp_statistics)  		goto err_tcp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, -			  sizeof(struct ipstats_mib), -			  __alignof__(struct ipstats_mib)) < 0) +	net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); +	if (!net->mib.ip_statistics)  		goto err_ip_mib; -	if (snmp_mib_init((void __percpu **)net->mib.net_statistics, -			  sizeof(struct linux_mib), -			  __alignof__(struct linux_mib)) < 0) + +	for_each_possible_cpu(i) { +		struct ipstats_mib *af_inet_stats; +		af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); +		u64_stats_init(&af_inet_stats->syncp); +	} + +	net->mib.net_statistics = alloc_percpu(struct linux_mib); +	if (!net->mib.net_statistics)  		goto err_net_mib; -	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, -			  sizeof(struct udp_mib), -			  __alignof__(struct udp_mib)) < 0) +	net->mib.udp_statistics = alloc_percpu(struct udp_mib); +	if (!net->mib.udp_statistics)  		goto err_udp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, -			  sizeof(struct udp_mib), -			  __alignof__(struct udp_mib)) < 0) +	net->mib.udplite_statistics = alloc_percpu(struct udp_mib); +	if (!net->mib.udplite_statistics)  		goto err_udplite_mib; -	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, -			  sizeof(struct icmp_mib), -			  __alignof__(struct icmp_mib)) < 0) +	net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); +	if (!net->mib.icmp_statistics)  		goto err_icmp_mib; -	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, -			  sizeof(struct icmpmsg_mib), -			  __alignof__(struct icmpmsg_mib)) < 0) +	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), +					      GFP_KERNEL); +	if (!net->mib.icmpmsg_statistics)  		goto err_icmpmsg_mib;  	tcp_mib_init(net);  	return 0;  err_icmpmsg_mib: -	snmp_mib_free((void __percpu **)net->mib.icmp_statistics); +	free_percpu(net->mib.icmp_statistics);  err_icmp_mib: -	snmp_mib_free((void __percpu 
**)net->mib.udplite_statistics); +	free_percpu(net->mib.udplite_statistics);  err_udplite_mib: -	snmp_mib_free((void __percpu **)net->mib.udp_statistics); +	free_percpu(net->mib.udp_statistics);  err_udp_mib: -	snmp_mib_free((void __percpu **)net->mib.net_statistics); +	free_percpu(net->mib.net_statistics);  err_net_mib: -	snmp_mib_free((void __percpu **)net->mib.ip_statistics); +	free_percpu(net->mib.ip_statistics);  err_ip_mib: -	snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +	free_percpu(net->mib.tcp_statistics);  err_tcp_mib:  	return -ENOMEM;  }  static __net_exit void ipv4_mib_exit_net(struct net *net)  { -	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics); -	snmp_mib_free((void __percpu **)net->mib.icmp_statistics); -	snmp_mib_free((void __percpu **)net->mib.udplite_statistics); -	snmp_mib_free((void __percpu **)net->mib.udp_statistics); -	snmp_mib_free((void __percpu **)net->mib.net_statistics); -	snmp_mib_free((void __percpu **)net->mib.ip_statistics); -	snmp_mib_free((void __percpu **)net->mib.tcp_statistics); +	kfree(net->mib.icmpmsg_statistics); +	free_percpu(net->mib.icmp_statistics); +	free_percpu(net->mib.udplite_statistics); +	free_percpu(net->mib.udp_statistics); +	free_percpu(net->mib.net_statistics); +	free_percpu(net->mib.ip_statistics); +	free_percpu(net->mib.tcp_statistics);  }  static __net_initdata struct pernet_operations ipv4_mib_ops = { @@ -1587,37 +1617,95 @@ static int __init init_ipv4_mibs(void)  	return register_pernet_subsys(&ipv4_mib_ops);  } +static __net_init int inet_init_net(struct net *net) +{ +	/* +	 * Set defaults for local port range +	 */ +	seqlock_init(&net->ipv4.ip_local_ports.lock); +	net->ipv4.ip_local_ports.range[0] =  32768; +	net->ipv4.ip_local_ports.range[1] =  61000; + +	seqlock_init(&net->ipv4.ping_group_range.lock); +	/* +	 * Sane defaults - nobody may create ping sockets. +	 * Boot scripts should set this to distro-specific group. 
+	 */ +	net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); +	net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); +	return 0; +} + +static __net_exit void inet_exit_net(struct net *net) +{ +} + +static __net_initdata struct pernet_operations af_inet_ops = { +	.init = inet_init_net, +	.exit = inet_exit_net, +}; + +static int __init init_inet_pernet_ops(void) +{ +	return register_pernet_subsys(&af_inet_ops); +} +  static int ipv4_proc_init(void);  /*   *	IP protocol layer initialiser   */ +static struct packet_offload ip_packet_offload __read_mostly = { +	.type = cpu_to_be16(ETH_P_IP), +	.callbacks = { +		.gso_send_check = inet_gso_send_check, +		.gso_segment = inet_gso_segment, +		.gro_receive = inet_gro_receive, +		.gro_complete = inet_gro_complete, +	}, +}; + +static const struct net_offload ipip_offload = { +	.callbacks = { +		.gso_send_check = inet_gso_send_check, +		.gso_segment	= inet_gso_segment, +	}, +}; + +static int __init ipv4_offload_init(void) +{ +	/* +	 * Add offloads +	 */ +	if (udpv4_offload_init() < 0) +		pr_crit("%s: Cannot add UDP protocol offload\n", __func__); +	if (tcpv4_offload_init() < 0) +		pr_crit("%s: Cannot add TCP protocol offload\n", __func__); + +	dev_add_offload(&ip_packet_offload); +	inet_add_offload(&ipip_offload, IPPROTO_IPIP); +	return 0; +} + +fs_initcall(ipv4_offload_init); +  static struct packet_type ip_packet_type __read_mostly = {  	.type = cpu_to_be16(ETH_P_IP),  	.func = ip_rcv, -	.gso_send_check = inet_gso_send_check, -	.gso_segment = inet_gso_segment, -	.gro_receive = inet_gro_receive, -	.gro_complete = inet_gro_complete,  };  static int __init inet_init(void)  { -	struct sk_buff *dummy_skb;  	struct inet_protosw *q;  	struct list_head *r;  	int rc = -EINVAL; -	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); - -	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); -	if (!sysctl_local_reserved_ports) -		goto out; +	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));  	rc = proto_register(&tcp_prot, 1);  	if (rc) -		goto out_free_reserved_ports; +		goto out;  	rc = proto_register(&udp_prot, 1);  	if (rc) @@ -1627,6 +1715,10 @@ static int __init inet_init(void)  	if (rc)  		goto out_unregister_udp_proto; +	rc = proto_register(&ping_prot, 1); +	if (rc) +		goto out_unregister_raw_proto; +  	/*  	 *	Tell SOCKET that we are alive...  	 */ @@ -1642,14 +1734,14 @@ static int __init inet_init(void)  	 */  	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); +		pr_crit("%s: Cannot add ICMP protocol\n", __func__);  	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); +		pr_crit("%s: Cannot add UDP protocol\n", __func__);  	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); +		pr_crit("%s: Cannot add TCP protocol\n", __func__);  #ifdef CONFIG_IP_MULTICAST  	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) -		printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); +		pr_crit("%s: Cannot add IGMP protocol\n", __func__);  #endif  	/* Register the socket-side information for inet_create. 
*/ @@ -1682,6 +1774,8 @@ static int __init inet_init(void)  	/* Add UDP-Lite (RFC 3828) */  	udplite4_register(); +	ping_init(); +  	/*  	 *	Set the ICMP layer up  	 */ @@ -1694,14 +1788,17 @@ static int __init inet_init(void)  	 */  #if defined(CONFIG_IP_MROUTE)  	if (ip_mr_init()) -		printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n"); +		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);  #endif + +	if (init_inet_pernet_ops()) +		pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);  	/*  	 *	Initialise per-cpu ipv4 mibs  	 */  	if (init_ipv4_mibs()) -		printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); +		pr_crit("%s: Cannot init ipv4 mibs\n", __func__);  	ipv4_proc_init(); @@ -1712,12 +1809,12 @@ static int __init inet_init(void)  	rc = 0;  out:  	return rc; +out_unregister_raw_proto: +	proto_unregister(&raw_prot);  out_unregister_udp_proto:  	proto_unregister(&udp_prot);  out_unregister_tcp_proto:  	proto_unregister(&tcp_prot); -out_free_reserved_ports: -	kfree(sysctl_local_reserved_ports);  	goto out;  } @@ -1736,11 +1833,15 @@ static int __init ipv4_proc_init(void)  		goto out_tcp;  	if (udp4_proc_init())  		goto out_udp; +	if (ping_proc_init()) +		goto out_ping;  	if (ip_misc_proc_init())  		goto out_misc;  out:  	return rc;  out_misc: +	ping_proc_exit(); +out_ping:  	udp4_proc_exit();  out_udp:  	tcp4_proc_exit(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 880a5ec6dce..a2afa89513a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "IPsec: " fmt +  #include <crypto/hash.h>  #include <linux/err.h>  #include <linux/module.h> @@ -73,9 +75,9 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,   * into IP header for icv calculation. Options are already checked   * for validity, so paranoia is not required. 
*/ -static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr) +static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)  { -	unsigned char * optptr = (unsigned char*)(iph+1); +	unsigned char *optptr = (unsigned char *)(iph+1);  	int  l = iph->ihl*4 - sizeof(struct iphdr);  	int  optlen; @@ -136,8 +138,6 @@ static void ah_output_done(struct crypto_async_request *base, int err)  		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));  	} -	err = ah->nexthdr; -  	kfree(AH_SKB_CB(skb)->tmp);  	xfrm_output_resume(skb, err);  } @@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)  	struct iphdr *iph, *top_iph;  	struct ip_auth_hdr *ah;  	struct ah_data *ahp; +	int seqhi_len = 0; +	__be32 *seqhi; +	int sglists = 0; +	struct scatterlist *seqhisg;  	ahp = x->data;  	ahash = ahp->ahash; @@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)  	ah = ip_auth_hdr(skb);  	ihl = ip_hdrlen(skb); +	if (x->props.flags & XFRM_STATE_ESN) { +		sglists = 1; +		seqhi_len = sizeof(*seqhi); +	}  	err = -ENOMEM; -	iph = ah_alloc_tmp(ahash, nfrags, ihl); +	iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);  	if (!iph)  		goto out; - -	icv = ah_tmp_icv(ahash, iph, ihl); +	seqhi = (__be32 *)((char *)iph + ihl); +	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);  	req = ah_tmp_req(ahash, icv);  	sg = ah_req_sg(ahash, req); +	seqhisg = sg + nfrags;  	memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -201,16 +210,24 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)  	top_iph->ttl = 0;  	top_iph->check = 0; -	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; +	if (x->props.flags & XFRM_STATE_ALIGN4) +		ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; +	else +		ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;  	ah->reserved = 0;  	ah->spi = x->id.spi; -	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); +	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); -	sg_init_table(sg, nfrags); -	skb_to_sgvec(skb, sg, 0, skb->len); +	sg_init_table(sg, nfrags + sglists); +	skb_to_sgvec_nomark(skb, sg, 0, skb->len); -	ahash_request_set_crypt(req, sg, icv, skb->len); +	if (x->props.flags & XFRM_STATE_ESN) { +		/* Attach seqhi sg right after packet payload */ +		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); +		sg_set_buf(seqhisg, seqhi, seqhi_len); +	} +	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);  	ahash_request_set_callback(req, 0, ah_output_done, skb);  	AH_SKB_CB(skb)->tmp = iph; @@ -261,12 +278,16 @@ static void ah_input_done(struct crypto_async_request *base, int err)  	if (err)  		goto out; +	err = ah->nexthdr; +  	skb->network_header += ah_hlen;  	memcpy(skb_network_header(skb), work_iph, ihl);  	__skb_pull(skb, ah_hlen + ihl); -	skb_set_transport_header(skb, -ihl); -	err = ah->nexthdr; +	if (x->props.mode == XFRM_MODE_TUNNEL) +		skb_reset_transport_header(skb); +	else +		skb_set_transport_header(skb, -ihl);  out:  	kfree(AH_SKB_CB(skb)->tmp);  	xfrm_input_resume(skb, err); @@ -288,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  	struct ip_auth_hdr *ah;  	struct ah_data *ahp;  	int err = -ENOMEM; +	int seqhi_len = 0; +	__be32 *seqhi; +	int sglists = 0; +	struct scatterlist *seqhisg;  	if (!pskb_may_pull(skb, sizeof(*ah)))  		goto out; @@ -299,37 +324,51 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  	nexthdr = ah->nexthdr;  	ah_hlen = (ah->hdrlen + 2) << 2; -	if (ah_hlen 
!= XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && -	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) -		goto out; +	if (x->props.flags & XFRM_STATE_ALIGN4) { +		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && +		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) +			goto out; +	} else { +		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && +		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) +			goto out; +	}  	if (!pskb_may_pull(skb, ah_hlen))  		goto out;  	/* We are going to _remove_ AH header to keep sockets happy,  	 * so... Later this can change. */ -	if (skb_cloned(skb) && -	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(skb, GFP_ATOMIC))  		goto out;  	skb->ip_summed = CHECKSUM_NONE; -	ah = (struct ip_auth_hdr *)skb->data; -	iph = ip_hdr(skb); -	ihl = ip_hdrlen(skb);  	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)  		goto out;  	nfrags = err; -	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); +	ah = (struct ip_auth_hdr *)skb->data; +	iph = ip_hdr(skb); +	ihl = ip_hdrlen(skb); + +	if (x->props.flags & XFRM_STATE_ESN) { +		sglists = 1; +		seqhi_len = sizeof(*seqhi); +	} + +	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + +				ahp->icv_trunc_len + seqhi_len);  	if (!work_iph)  		goto out; -	auth_data = ah_tmp_auth(work_iph, ihl); +	seqhi = (__be32 *)((char *)work_iph + ihl); +	auth_data = ah_tmp_auth(seqhi, seqhi_len);  	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);  	req = ah_tmp_req(ahash, icv);  	sg = ah_req_sg(ahash, req); +	seqhisg = sg + nfrags;  	memcpy(work_iph, iph, ihl);  	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -348,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  	skb_push(skb, ihl); -	sg_init_table(sg, nfrags); -	skb_to_sgvec(skb, sg, 0, skb->len); +	sg_init_table(sg, nfrags + sglists); +	skb_to_sgvec_nomark(skb, sg, 0, skb->len); -	ahash_request_set_crypt(req, sg, icv, skb->len); +	if (x->props.flags & XFRM_STATE_ESN) { +		/* Attach seqhi sg right after packet payload */ +		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi; +		sg_set_buf(seqhisg, seqhi, seqhi_len); +	} +	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);  	ahash_request_set_callback(req, 0, ah_input_done, skb);  	AH_SKB_CB(skb)->tmp = work_iph; @@ -361,8 +405,6 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  		if (err == -EINPROGRESS)  			goto out; -		if (err == -EBUSY) -			err = NET_XMIT_DROP;  		goto out_free;  	} @@ -373,7 +415,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)  	skb->network_header += ah_hlen;  	memcpy(skb_network_header(skb), work_iph, ihl);  	__skb_pull(skb, ah_hlen + ihl); -	skb_set_transport_header(skb, -ihl); +	if (x->props.mode == XFRM_MODE_TUNNEL) +		skb_reset_transport_header(skb); +	else +		skb_set_transport_header(skb, -ihl);  	err = nexthdr; @@ -383,23 +428,35 @@ out:  	return err;  } -static void ah4_err(struct sk_buff *skb, u32 info) +static int ah4_err(struct sk_buff *skb, u32 info)  {  	struct net *net = dev_net(skb->dev); -	struct iphdr *iph = (struct iphdr *)skb->data; +	const struct iphdr *iph = (const struct iphdr *)skb->data;  	struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));  	struct xfrm_state *x; -	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || -	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) -		return; +	switch (icmp_hdr(skb)->type) { +	case ICMP_DEST_UNREACH: +		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) +			return 0; +	case 
ICMP_REDIRECT: +		break; +	default: +		return 0; +	} -	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); +	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, +			      ah->spi, IPPROTO_AH, AF_INET);  	if (!x) -		return; -	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", -	       ntohl(ah->spi), ntohl(iph->daddr)); +		return 0; + +	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) +		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); +	else +		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);  	xfrm_state_put(x); + +	return 0;  }  static int ah_init_state(struct xfrm_state *x) @@ -438,9 +495,10 @@ static int ah_init_state(struct xfrm_state *x)  	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=  	    crypto_ahash_digestsize(ahash)) { -		printk(KERN_INFO "AH: %s digestsize %u != %hu\n", -		       x->aalg->alg_name, crypto_ahash_digestsize(ahash), -		       aalg_desc->uinfo.auth.icv_fullbits/8); +		pr_info("%s: %s digestsize %u != %hu\n", +			__func__, x->aalg->alg_name, +			crypto_ahash_digestsize(ahash), +			aalg_desc->uinfo.auth.icv_fullbits / 8);  		goto error;  	} @@ -449,8 +507,12 @@ static int ah_init_state(struct xfrm_state *x)  	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); -	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + -					  ahp->icv_trunc_len); +	if (x->props.flags & XFRM_STATE_ALIGN4) +		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + +						  ahp->icv_trunc_len); +	else +		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + +						  ahp->icv_trunc_len);  	if (x->props.mode == XFRM_MODE_TUNNEL)  		x->props.header_len += sizeof(struct iphdr);  	x->data = ahp; @@ -476,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x)  	kfree(ahp);  } +static int ah4_rcv_cb(struct sk_buff *skb, int err) +{ +	return 0; +}  static const struct xfrm_type ah_type =  { @@ -489,21 +555,22 @@ static const struct xfrm_type ah_type =  	.output		= ah_output  }; -static const struct net_protocol ah4_protocol = { +static struct xfrm4_protocol ah4_protocol = {  	.handler	=	xfrm4_rcv, +	.input_handler	=	xfrm_input, +	.cb_handler	=	ah4_rcv_cb,  	.err_handler	=	ah4_err, -	.no_policy	=	1, -	.netns_ok	=	1, +	.priority	=	0,  };  static int __init ah4_init(void)  {  	if (xfrm_register_type(&ah_type, AF_INET) < 0) { -		printk(KERN_INFO "ip ah init: can't add xfrm type\n"); +		pr_info("%s: can't add xfrm type\n", __func__);  		return -EAGAIN;  	} -	if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { -		printk(KERN_INFO "ip ah init: can't add protocol\n"); +	if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { +		pr_info("%s: can't add protocol\n", __func__);  		xfrm_unregister_type(&ah_type, AF_INET);  		return -EAGAIN;  	} @@ -512,10 +579,10 @@ static int __init ah4_init(void)  static void __exit ah4_fini(void)  { -	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) -		printk(KERN_INFO "ip ah close: can't remove protocol\n"); +	if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) +		pr_info("%s: can't remove protocol\n", __func__);  	if (xfrm_unregister_type(&ah_type, AF_INET) < 0) -		printk(KERN_INFO "ip ah close: can't remove xfrm type\n"); +		pr_info("%s: can't remove xfrm type\n", __func__);  }  module_init(ah4_init); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a2fc7b961db..1a9b99e0446 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -73,6 +73,8 @@   *		Jesper D. Brouer:       Proxy ARP PVLAN RFC 3069 support.   
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/module.h>  #include <linux/types.h>  #include <linux/string.h> @@ -89,7 +91,6 @@  #include <linux/etherdevice.h>  #include <linux/fddidevice.h>  #include <linux/if_arp.h> -#include <linux/trdevice.h>  #include <linux/skbuff.h>  #include <linux/proc_fs.h>  #include <linux/seq_file.h> @@ -97,7 +98,6 @@  #include <linux/init.h>  #include <linux/net.h>  #include <linux/rcupdate.h> -#include <linux/jhash.h>  #include <linux/slab.h>  #ifdef CONFIG_SYSCTL  #include <linux/sysctl.h> @@ -113,13 +113,7 @@  #include <net/arp.h>  #include <net/ax25.h>  #include <net/netrom.h> -#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) -#include <net/atmclip.h> -struct neigh_table *clip_tbl_hook; -EXPORT_SYMBOL(clip_tbl_hook); -#endif -#include <asm/system.h>  #include <linux/uaccess.h>  #include <linux/netfilter_arp.h> @@ -127,7 +121,7 @@ EXPORT_SYMBOL(clip_tbl_hook);  /*   *	Interface to generic neighbour cache.   */ -static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd); +static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);  static int arp_constructor(struct neighbour *neigh);  static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);  static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -139,8 +133,6 @@ static const struct neigh_ops arp_generic_ops = {  	.error_report =		arp_error_report,  	.output =		neigh_resolve_output,  	.connected_output =	neigh_connected_output, -	.hh_output =		dev_queue_xmit, -	.queue_xmit =		dev_queue_xmit,  };  static const struct neigh_ops arp_hh_ops = { @@ -149,16 +141,12 @@ static const struct neigh_ops arp_hh_ops = {  	.error_report =		arp_error_report,  	.output =		neigh_resolve_output,  	.connected_output =	neigh_resolve_output, -	.hh_output =		dev_queue_xmit, -	.queue_xmit =		dev_queue_xmit,  };  static const struct neigh_ops arp_direct_ops = {  	.family =		AF_INET, -	.output =		dev_queue_xmit, -	.connected_output =	dev_queue_xmit, -	.hh_output =		dev_queue_xmit, -	.queue_xmit =		dev_queue_xmit, +	.output =		neigh_direct_output, +	.connected_output =	neigh_direct_output,  };  static const struct neigh_ops arp_broken_ops = { @@ -167,13 +155,10 @@ static const struct neigh_ops arp_broken_ops = {  	.error_report =		arp_error_report,  	.output =		neigh_compat_output,  	.connected_output =	neigh_compat_output, -	.hh_output =		dev_queue_xmit, -	.queue_xmit =		dev_queue_xmit,  };  struct neigh_table arp_tbl = {  	.family		= AF_INET, -	.entry_size	= sizeof(struct neighbour) + 4,  	.key_len	= 4,  	.hash		= arp_hash,  	.constructor	= arp_constructor, @@ -181,18 +166,20 @@ struct neigh_table arp_tbl = {  	.id		= "arp_cache",  	.parms		= {  		.tbl			= &arp_tbl, -		.base_reachable_time	= 30 * HZ, -		.retrans_time		= 1 * HZ, -		.gc_staletime		= 60 * HZ,  		.reachable_time		= 30 * HZ, -		.delay_probe_time	= 5 * HZ, -		.queue_len		= 3, -		.ucast_probes		= 3, -		.mcast_probes		= 3, -		.anycast_delay		= 1 * HZ, -		.proxy_delay		= (8 * HZ) / 10, -		.proxy_qlen		= 64, -		.locktime		= 1 * HZ, +		.data	= { +			[NEIGH_VAR_MCAST_PROBES] = 3, +			[NEIGH_VAR_UCAST_PROBES] = 3, +			[NEIGH_VAR_RETRANS_TIME] = 1 * HZ, +			[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, +			[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, +			[NEIGH_VAR_GC_STALETIME] = 60 * HZ, +			[NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, +			[NEIGH_VAR_PROXY_QLEN] = 64, +			[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, +			[NEIGH_VAR_PROXY_DELAY]	= (8 * HZ) / 10, +			
[NEIGH_VAR_LOCKTIME] = 1 * HZ, +		},  	},  	.gc_interval	= 30 * HZ,  	.gc_thresh1	= 128, @@ -209,12 +196,12 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)  	case ARPHRD_IEEE802:  		ip_eth_mc_map(addr, haddr);  		return 0; -	case ARPHRD_IEEE802_TR: -		ip_tr_mc_map(addr, haddr); -		return 0;  	case ARPHRD_INFINIBAND:  		ip_ib_mc_map(addr, dev->broadcast, haddr);  		return 0; +	case ARPHRD_IPGRE: +		ip_ipgre_mc_map(addr, dev->broadcast, haddr); +		return 0;  	default:  		if (dir) {  			memcpy(haddr, dev->broadcast, dev->addr_len); @@ -227,9 +214,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)  static u32 arp_hash(const void *pkey,  		    const struct net_device *dev, -		    __u32 hash_rnd) +		    __u32 *hash_rnd)  { -	return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd); +	return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);  }  static int arp_constructor(struct neighbour *neigh) @@ -256,7 +243,7 @@ static int arp_constructor(struct neighbour *neigh)  	if (!dev->header_ops) {  		neigh->nud_state = NUD_NOARP;  		neigh->ops = &arp_direct_ops; -		neigh->output = neigh->ops->queue_xmit; +		neigh->output = neigh_direct_output;  	} else {  		/* Good devices (checked by reading texts, but only Ethernet is  		   tested) @@ -289,9 +276,9 @@ static int arp_constructor(struct neighbour *neigh)  		default:  			break;  		case ARPHRD_ROSE: -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25)  		case ARPHRD_AX25: -#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#if IS_ENABLED(CONFIG_NETROM)  		case ARPHRD_NETROM:  #endif  			neigh->ops = &arp_broken_ops; @@ -336,7 +323,7 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)  static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)  {  	__be32 saddr = 0; -	u8  *dst_ha = NULL; +	u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;  	struct net_device *dev = neigh->dev;  	__be32 target = *(__be32 *)neigh->primary_key;  	int probes = atomic_read(&neigh->probes); @@ -374,31 +361,27 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)  	if (!saddr)  		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); -	probes -= neigh->parms->ucast_probes; +	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);  	if (probes < 0) {  		if (!(neigh->nud_state & NUD_VALID)) -			printk(KERN_DEBUG -			       "trying to ucast probe in NUD_INVALID\n"); -		dst_ha = neigh->ha; -		read_lock_bh(&neigh->lock); +			pr_debug("trying to ucast probe in NUD_INVALID\n"); +		neigh_ha_snapshot(dst_ha, neigh, dev); +		dst_hw = dst_ha;  	} else { -		probes -= neigh->parms->app_probes; +		probes -= NEIGH_VAR(neigh->parms, APP_PROBES);  		if (probes < 0) { -#ifdef CONFIG_ARPD  			neigh_app_ns(neigh); -#endif  			return;  		}  	}  	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, -		 dst_ha, dev->dev_addr, NULL); -	if (dst_ha) -		read_unlock_bh(&neigh->lock); +		 dst_hw, dev->dev_addr, NULL);  }  static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)  { +	struct net *net = dev_net(in_dev->dev);  	int scope;  	switch (IN_DEV_ARP_IGNORE(in_dev)) { @@ -417,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)  	case 3:	/* Do not reply for scope host addresses */  		sip = 0;  		scope = RT_SCOPE_LINK; +		in_dev = NULL;  		break;  	case 4:	/* Reserved */  	case 5: @@ -428,19 +412,18 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)  	default:  		return 0;  	} -	return 
!inet_confirm_addr(in_dev, sip, tip, scope); +	return !inet_confirm_addr(net, in_dev, sip, tip, scope);  }  static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)  { -	struct flowi fl = { .fl4_dst = sip, -			    .fl4_src = tip };  	struct rtable *rt;  	int flag = 0;  	/*unsigned long now; */  	struct net *net = dev_net(dev); -	if (ip_route_output_key(net, &rt, &fl) < 0) +	rt = ip_route_output(net, sip, tip, 0, 0); +	if (IS_ERR(rt))  		return 1;  	if (rt->dst.dev != dev) {  		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); @@ -466,7 +449,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr,  {  	switch (addr_hint) {  	case RTN_LOCAL: -		printk(KERN_DEBUG "ARP: arp called for own IP address\n"); +		pr_debug("arp called for own IP address\n");  		memcpy(haddr, dev->dev_addr, dev->addr_len);  		return 1;  	case RTN_MULTICAST: @@ -487,13 +470,12 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)  	struct neighbour *n;  	if (!skb_dst(skb)) { -		printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); +		pr_debug("arp_find is called with dst==NULL\n");  		kfree_skb(skb);  		return 1;  	} -	paddr = skb_rtable(skb)->rt_gateway; - +	paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);  	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,  			       paddr, dev))  		return 0; @@ -516,30 +498,6 @@ EXPORT_SYMBOL(arp_find);  /* END OF OBSOLETE FUNCTIONS */ -int arp_bind_neighbour(struct dst_entry *dst) -{ -	struct net_device *dev = dst->dev; -	struct neighbour *n = dst->neighbour; - -	if (dev == NULL) -		return -EINVAL; -	if (n == NULL) { -		__be32 nexthop = ((struct rtable *)dst)->rt_gateway; -		if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) -			nexthop = 0; -		n = __neigh_lookup_errno( -#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) -					 dev->type == ARPHRD_ATM ? 
-					 clip_tbl_hook : -#endif -					 &arp_tbl, &nexthop, dev); -		if (IS_ERR(n)) -			return PTR_ERR(n); -		dst->neighbour = n; -	} -	return 0; -} -  /*   * Check if we can use proxy ARP for this path   */ @@ -623,16 +581,18 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,  	struct sk_buff *skb;  	struct arphdr *arp;  	unsigned char *arp_ptr; +	int hlen = LL_RESERVED_SPACE(dev); +	int tlen = dev->needed_tailroom;  	/*  	 *	Allocate a buffer  	 */ -	skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); +	skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);  	if (skb == NULL)  		return NULL; -	skb_reserve(skb, LL_RESERVED_SPACE(dev)); +	skb_reserve(skb, hlen);  	skb_reset_network_header(skb);  	arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));  	skb->dev = dev; @@ -664,13 +624,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,  		arp->ar_pro = htons(ETH_P_IP);  		break; -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25)  	case ARPHRD_AX25:  		arp->ar_hrd = htons(ARPHRD_AX25);  		arp->ar_pro = htons(AX25_P_IP);  		break; -#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#if IS_ENABLED(CONFIG_NETROM)  	case ARPHRD_NETROM:  		arp->ar_hrd = htons(ARPHRD_NETROM);  		arp->ar_pro = htons(AX25_P_IP); @@ -678,18 +638,12 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,  #endif  #endif -#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) +#if IS_ENABLED(CONFIG_FDDI)  	case ARPHRD_FDDI:  		arp->ar_hrd = htons(ARPHRD_ETHER);  		arp->ar_pro = htons(ETH_P_IP);  		break;  #endif -#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) -	case ARPHRD_IEEE802_TR: -		arp->ar_hrd = htons(ARPHRD_IEEE802); -		arp->ar_pro = htons(ETH_P_IP); -		break; -#endif  	}  	arp->ar_hln = dev->addr_len; @@ -702,11 +656,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,  	arp_ptr += dev->addr_len;  	memcpy(arp_ptr, &src_ip, 4);  	arp_ptr += 4; -	if (target_hw != NULL) -		memcpy(arp_ptr, target_hw, dev->addr_len); -	else -		memset(arp_ptr, 0, dev->addr_len); -	arp_ptr += dev->addr_len; + +	switch (dev->type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) +	case ARPHRD_IEEE1394: +		break; +#endif +	default: +		if (target_hw != NULL) +			memcpy(arp_ptr, target_hw, dev->addr_len); +		else +			memset(arp_ptr, 0, dev->addr_len); +		arp_ptr += dev->addr_len; +	}  	memcpy(arp_ptr, &dest_ip, 4);  	return skb; @@ -770,6 +732,7 @@ static int arp_process(struct sk_buff *skb)  	int addr_type;  	struct neighbour *n;  	struct net *net = dev_net(dev); +	bool is_garp = false;  	/* arp_rcv below verifies the ARP header and verifies the device  	 * is ARP'able. @@ -787,11 +750,10 @@ static int arp_process(struct sk_buff *skb)  			goto out;  		break;  	case ARPHRD_ETHER: -	case ARPHRD_IEEE802_TR:  	case ARPHRD_FDDI:  	case ARPHRD_IEEE802:  		/* -		 * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802 +		 * ETHERNET, and Fibre Channel (which are IEEE 802  		 * devices, according to RFC 2625) devices will accept ARP  		 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).  		 
* This is the case also of FDDI, where the RFC 1390 says that @@ -830,13 +792,21 @@ static int arp_process(struct sk_buff *skb)  	arp_ptr += dev->addr_len;  	memcpy(&sip, arp_ptr, 4);  	arp_ptr += 4; -	arp_ptr += dev->addr_len; +	switch (dev_type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) +	case ARPHRD_IEEE1394: +		break; +#endif +	default: +		arp_ptr += dev->addr_len; +	}  	memcpy(&tip, arp_ptr, 4);  /*   *	Check for bad requests for 127.x.x.x and requests for multicast   *	addresses.  If this is one such, delete it.   */ -	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) +	if (ipv4_is_multicast(tip) || +	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))  		goto out;  /* @@ -898,14 +868,15 @@ static int arp_process(struct sk_buff *skb)  			if (addr_type == RTN_UNICAST  &&  			    (arp_fwd_proxy(in_dev, dev, rt) ||  			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || -			     pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { +			     (rt->dst.dev != dev && +			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {  				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);  				if (n)  					neigh_release(n);  				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||  				    skb->pkt_type == PACKET_HOST || -				    in_dev->arp_parms->proxy_delay == 0) { +				    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {  					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,  						 dev, tip, sha, dev->dev_addr,  						 sha); @@ -923,15 +894,17 @@ static int arp_process(struct sk_buff *skb)  	n = __neigh_lookup(&arp_tbl, &sip, dev, 0); -	if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) { +	if (IN_DEV_ARP_ACCEPT(in_dev)) {  		/* Unsolicited ARP is not accepted by default.  		   It is possible, that this option should be enabled for some  		   devices (strip is candidate)  		 */ +		is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && +			  inet_addr_type(net, sip) == RTN_UNICAST; +  		if (n == NULL && -		    (arp->ar_op == htons(ARPOP_REPLY) || -		     (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && -		    inet_addr_type(net, sip) == RTN_UNICAST) +		    ((arp->ar_op == htons(ARPOP_REPLY)  && +		      inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))  			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);  	} @@ -944,7 +917,10 @@ static int arp_process(struct sk_buff *skb)  		   agents are active. Taking the first reply prevents  		   arp trashing and chooses the fastest router.  		 */ -		override = time_after(jiffies, n->updated + n->parms->locktime); +		override = time_after(jiffies, +				      n->updated + +				      NEIGH_VAR(n->parms, LOCKTIME)) || +			   is_garp;  		/* Broadcast replies and request packets  		   do not assert neighbour reachability. @@ -975,24 +951,25 @@ static void parp_redo(struct sk_buff *skb)  static int arp_rcv(struct sk_buff *skb, struct net_device *dev,  		   struct packet_type *pt, struct net_device *orig_dev)  { -	struct arphdr *arp; +	const struct arphdr *arp; + +	if (dev->flags & IFF_NOARP || +	    skb->pkt_type == PACKET_OTHERHOST || +	    skb->pkt_type == PACKET_LOOPBACK) +		goto freeskb; + +	skb = skb_share_check(skb, GFP_ATOMIC); +	if (!skb) +		goto out_of_mem;  	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  
*/  	if (!pskb_may_pull(skb, arp_hdr_len(dev)))  		goto freeskb;  	arp = arp_hdr(skb); -	if (arp->ar_hln != dev->addr_len || -	    dev->flags & IFF_NOARP || -	    skb->pkt_type == PACKET_OTHERHOST || -	    skb->pkt_type == PACKET_LOOPBACK || -	    arp->ar_pln != 4) +	if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)  		goto freeskb; -	skb = skb_share_check(skb, GFP_ATOMIC); -	if (skb == NULL) -		goto out_of_mem; -  	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));  	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); @@ -1017,14 +994,13 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)  		IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;  		return 0;  	} -	if (__in_dev_get_rcu(dev)) { -		IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on); +	if (__in_dev_get_rtnl(dev)) { +		IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);  		return 0;  	}  	return -ENXIO;  } -/* must be called with rcu_read_lock() */  static int arp_req_set_public(struct net *net, struct arpreq *r,  		struct net_device *dev)  { @@ -1062,19 +1038,17 @@ static int arp_req_set(struct net *net, struct arpreq *r,  	if (r->arp_flags & ATF_PERM)  		r->arp_flags |= ATF_COM;  	if (dev == NULL) { -		struct flowi fl = { .fl4_dst = ip, -				    .fl4_tos = RTO_ONLINK }; -		struct rtable *rt; -		err = ip_route_output_key(net, &rt, &fl); -		if (err != 0) -			return err; +		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + +		if (IS_ERR(rt)) +			return PTR_ERR(rt);  		dev = rt->dst.dev;  		ip_rt_put(rt);  		if (!dev)  			return -EINVAL;  	}  	switch (dev->type) { -#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE) +#if IS_ENABLED(CONFIG_FDDI)  	case ARPHRD_FDDI:  		/*  		 * According to RFC 1390, FDDI devices should accept ARP @@ -1097,7 +1071,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,  	neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);  	err = PTR_ERR(neigh);  	if (!IS_ERR(neigh)) { -		unsigned state = NUD_STALE; +		unsigned int state = NUD_STALE;  		if (r->arp_flags & ATF_PERM)  			state = NUD_PERMANENT;  		err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? 
@@ -1109,7 +1083,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,  	return err;  } -static unsigned arp_state_to_flags(struct neighbour *neigh) +static unsigned int arp_state_to_flags(struct neighbour *neigh)  {  	if (neigh->nud_state&NUD_PERMANENT)  		return ATF_PERM | ATF_COM; @@ -1143,6 +1117,22 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)  	return err;  } +static int arp_invalidate(struct net_device *dev, __be32 ip) +{ +	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); +	int err = -ENXIO; + +	if (neigh) { +		if (neigh->nud_state & ~NUD_NOARP) +			err = neigh_update(neigh, NULL, NUD_FAILED, +					   NEIGH_UPDATE_F_OVERRIDE| +					   NEIGH_UPDATE_F_ADMIN); +		neigh_release(neigh); +	} + +	return err; +} +  static int arp_req_delete_public(struct net *net, struct arpreq *r,  		struct net_device *dev)  { @@ -1161,36 +1151,22 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,  static int arp_req_delete(struct net *net, struct arpreq *r,  			  struct net_device *dev)  { -	int err;  	__be32 ip; -	struct neighbour *neigh;  	if (r->arp_flags & ATF_PUBL)  		return arp_req_delete_public(net, r, dev);  	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;  	if (dev == NULL) { -		struct flowi fl = { .fl4_dst = ip, -				    .fl4_tos = RTO_ONLINK }; -		struct rtable *rt; -		err = ip_route_output_key(net, &rt, &fl); -		if (err != 0) -			return err; +		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); +		if (IS_ERR(rt)) +			return PTR_ERR(rt);  		dev = rt->dst.dev;  		ip_rt_put(rt);  		if (!dev)  			return -EINVAL;  	} -	err = -ENXIO; -	neigh = neigh_lookup(&arp_tbl, &ip, dev); -	if (neigh) { -		if (neigh->nud_state & ~NUD_NOARP) -			err = neigh_update(neigh, NULL, NUD_FAILED, -					   NEIGH_UPDATE_F_OVERRIDE| -					   NEIGH_UPDATE_F_ADMIN); -		neigh_release(neigh); -	} -	return err; +	return arp_invalidate(dev, ip);  }  /* @@ -1206,7 +1182,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	switch (cmd) {  	case SIOCDARP:  	case SIOCSARP: -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EPERM;  	case SIOCGARP:  		err = copy_from_user(&r, arg, sizeof(struct arpreq)); @@ -1226,10 +1202,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	if (!(r.arp_flags & ATF_NETMASK))  		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =  							   htonl(0xFFFFFFFFUL); -	rcu_read_lock(); +	rtnl_lock();  	if (r.arp_dev[0]) {  		err = -ENODEV; -		dev = dev_get_by_name_rcu(net, r.arp_dev); +		dev = __dev_get_by_name(net, r.arp_dev);  		if (dev == NULL)  			goto out; @@ -1256,7 +1232,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)  		break;  	}  out: -	rcu_read_unlock(); +	rtnl_unlock();  	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))  		err = -EFAULT;  	return err; @@ -1265,12 +1241,18 @@ out:  static int arp_netdev_event(struct notifier_block *this, unsigned long event,  			    void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct netdev_notifier_change_info *change_info;  	switch (event) {  	case NETDEV_CHANGEADDR:  		neigh_changeaddr(&arp_tbl, dev); -		rt_cache_flush(dev_net(dev), 0); +		rt_cache_flush(dev_net(dev)); +		break; +	case NETDEV_CHANGE: +		change_info = ptr; +		if (change_info->flags_changed & IFF_NOARP) +			neigh_changeaddr(&arp_tbl, dev);  		break;  	default:  		break; @@ -1311,13 +1293,13 @@ void __init arp_init(void)  	
dev_add_pack(&arp_packet_type);  	arp_proc_init();  #ifdef CONFIG_SYSCTL -	neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); +	neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);  #endif  	register_netdevice_notifier(&arp_netdev_notifier);  }  #ifdef CONFIG_PROC_FS -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25)  /* ------------------------------------------------------------------------ */  /* @@ -1365,7 +1347,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,  	read_lock(&n->lock);  	/* Convert hardware address to XX:XX:XX:XX ... form. */ -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25)  	if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)  		ax2asc2((ax25_address *)n->ha, hbuffer);  	else { @@ -1378,7 +1360,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,  	if (k != 0)  		--k;  	hbuffer[k] = 0; -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if IS_ENABLED(CONFIG_AX25)  	}  #endif  	sprintf(tbuf, "%pI4", n->primary_key); @@ -1451,14 +1433,14 @@ static const struct file_operations arp_seq_fops = {  static int __net_init arp_net_init(struct net *net)  { -	if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops)) +	if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))  		return -ENOMEM;  	return 0;  }  static void __net_exit arp_net_exit(struct net *net)  { -	proc_net_remove(net, "arp"); +	remove_proc_entry("arp", net->proc_net);  }  static struct pernet_operations arp_net_ops = { diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 094e150c626..69e77c8ff28 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -31,8 +31,7 @@   * the GNU General Public License for more details.   *   * You should have received a copy of the GNU General Public License - * along with this program;  if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program;  if not, see <http://www.gnu.org/licenses/>.   *   */ @@ -50,7 +49,7 @@  #include <net/tcp.h>  #include <net/netlabel.h>  #include <net/cipso_ipv4.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <asm/bug.h>  #include <asm/unaligned.h> @@ -112,7 +111,7 @@ int cipso_v4_rbm_strictvalid = 1;  /* The maximum number of category ranges permitted in the ranged category tag   * (tag #5).  You may note that the IETF draft states that the maximum number   * of category ranges is 7, but if the low end of the last category range is - * zero then it is possibile to fit 8 category ranges because the zero should + * zero then it is possible to fit 8 category ranges because the zero should   * be omitted. */  #define CIPSO_V4_TAG_RNG_CAT_MAX      8 @@ -438,7 +437,7 @@ cache_add_failure:   *   * Description:   * Search the DOI definition list for a DOI definition with a DOI value that - * matches @doi.  The caller is responsibile for calling rcu_read_[un]lock(). + * matches @doi.  The caller is responsible for calling rcu_read_[un]lock().   * Returns a pointer to the DOI definition on success and NULL on failure.   
*/  static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) @@ -476,7 +475,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,  	doi = doi_def->doi;  	doi_type = doi_def->type; -	if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN) +	if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)  		goto doi_add_return;  	for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {  		switch (doi_def->tags[iter]) { @@ -1293,7 +1292,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,  			return ret_val;  		/* This will send packets using the "optimized" format when -		 * possibile as specified in  section 3.4.2.6 of the +		 * possible as specified in  section 3.4.2.6 of the  		 * CIPSO draft. */  		if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)  			tag_len = 14; @@ -1336,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,  	secattr->flags |= NETLBL_SECATTR_MLS_LVL;  	if (tag_len > 4) { -		secattr->attr.mls.cat = -		                       netlbl_secattr_catmap_alloc(GFP_ATOMIC); +		secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);  		if (secattr->attr.mls.cat == NULL)  			return -ENOMEM; @@ -1432,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,  	secattr->flags |= NETLBL_SECATTR_MLS_LVL;  	if (tag_len > 4) { -		secattr->attr.mls.cat = -			               netlbl_secattr_catmap_alloc(GFP_ATOMIC); +		secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);  		if (secattr->attr.mls.cat == NULL)  			return -ENOMEM; @@ -1527,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,  	secattr->flags |= NETLBL_SECATTR_MLS_LVL;  	if (tag_len > 4) { -		secattr->attr.mls.cat = -			               netlbl_secattr_catmap_alloc(GFP_ATOMIC); +		secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);  		if (secattr->attr.mls.cat == NULL)  			return -ENOMEM; @@ -1725,8 +1721,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)  		case CIPSO_V4_TAG_LOCAL:  			/* This is a non-standard tag that we only allow for  			 * local connections, so if the incoming interface is -			 * not the loopback device drop the packet. */ -			if (!(skb->dev->flags & IFF_LOOPBACK)) { +			 * not the loopback device drop the packet. Further, +			 * there is no legitimate reason for setting this from +			 * userspace so reject it if skb is NULL. 
*/ +			if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {  				err_offset = opt_iter;  				goto validate_return_locked;  			} @@ -1752,7 +1750,7 @@ validate_return:  }  /** - * cipso_v4_error - Send the correct reponse for a bad packet + * cipso_v4_error - Send the correct response for a bad packet   * @skb: the packet   * @error: the error code   * @gateway: CIPSO gateway flag @@ -1879,7 +1877,7 @@ int cipso_v4_sock_setattr(struct sock *sk,  	unsigned char *buf = NULL;  	u32 buf_len;  	u32 opt_len; -	struct ip_options *opt = NULL; +	struct ip_options_rcu *old, *opt = NULL;  	struct inet_sock *sk_inet;  	struct inet_connection_sock *sk_conn; @@ -1915,22 +1913,25 @@ int cipso_v4_sock_setattr(struct sock *sk,  		ret_val = -ENOMEM;  		goto socket_setattr_failure;  	} -	memcpy(opt->__data, buf, buf_len); -	opt->optlen = opt_len; -	opt->cipso = sizeof(struct iphdr); +	memcpy(opt->opt.__data, buf, buf_len); +	opt->opt.optlen = opt_len; +	opt->opt.cipso = sizeof(struct iphdr);  	kfree(buf);  	buf = NULL;  	sk_inet = inet_sk(sk); + +	old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));  	if (sk_inet->is_icsk) {  		sk_conn = inet_csk(sk); -		if (sk_inet->opt) -			sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen; -		sk_conn->icsk_ext_hdr_len += opt->optlen; +		if (old) +			sk_conn->icsk_ext_hdr_len -= old->opt.optlen; +		sk_conn->icsk_ext_hdr_len += opt->opt.optlen;  		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);  	} -	opt = xchg(&sk_inet->opt, opt); -	kfree(opt); +	rcu_assign_pointer(sk_inet->inet_opt, opt); +	if (old) +		kfree_rcu(old, rcu);  	return 0; @@ -1960,7 +1961,7 @@ int cipso_v4_req_setattr(struct request_sock *req,  	unsigned char *buf = NULL;  	u32 buf_len;  	u32 opt_len; -	struct ip_options *opt = NULL; +	struct ip_options_rcu *opt = NULL;  	struct inet_request_sock *req_inet;  	/* We allocate the maximum CIPSO option size here so we are probably @@ -1988,15 +1989,16 @@ int cipso_v4_req_setattr(struct request_sock *req,  		ret_val = -ENOMEM;  		goto req_setattr_failure;  	} -	memcpy(opt->__data, buf, buf_len); -	opt->optlen = opt_len; -	opt->cipso = sizeof(struct iphdr); +	memcpy(opt->opt.__data, buf, buf_len); +	opt->opt.optlen = opt_len; +	opt->opt.cipso = sizeof(struct iphdr);  	kfree(buf);  	buf = NULL;  	req_inet = inet_rsk(req);  	opt = xchg(&req_inet->opt, opt); -	kfree(opt); +	if (opt) +		kfree_rcu(opt, rcu);  	return 0; @@ -2016,34 +2018,34 @@ req_setattr_failure:   * values on failure.   
*   */ -static int cipso_v4_delopt(struct ip_options **opt_ptr) +static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)  {  	int hdr_delta = 0; -	struct ip_options *opt = *opt_ptr; +	struct ip_options_rcu *opt = *opt_ptr; -	if (opt->srr || opt->rr || opt->ts || opt->router_alert) { +	if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {  		u8 cipso_len;  		u8 cipso_off;  		unsigned char *cipso_ptr;  		int iter;  		int optlen_new; -		cipso_off = opt->cipso - sizeof(struct iphdr); -		cipso_ptr = &opt->__data[cipso_off]; +		cipso_off = opt->opt.cipso - sizeof(struct iphdr); +		cipso_ptr = &opt->opt.__data[cipso_off];  		cipso_len = cipso_ptr[1]; -		if (opt->srr > opt->cipso) -			opt->srr -= cipso_len; -		if (opt->rr > opt->cipso) -			opt->rr -= cipso_len; -		if (opt->ts > opt->cipso) -			opt->ts -= cipso_len; -		if (opt->router_alert > opt->cipso) -			opt->router_alert -= cipso_len; -		opt->cipso = 0; +		if (opt->opt.srr > opt->opt.cipso) +			opt->opt.srr -= cipso_len; +		if (opt->opt.rr > opt->opt.cipso) +			opt->opt.rr -= cipso_len; +		if (opt->opt.ts > opt->opt.cipso) +			opt->opt.ts -= cipso_len; +		if (opt->opt.router_alert > opt->opt.cipso) +			opt->opt.router_alert -= cipso_len; +		opt->opt.cipso = 0;  		memmove(cipso_ptr, cipso_ptr + cipso_len, -			opt->optlen - cipso_off - cipso_len); +			opt->opt.optlen - cipso_off - cipso_len);  		/* determining the new total option length is tricky because of  		 * the padding necessary, the only thing i can think to do at @@ -2052,21 +2054,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)  		 * from there we can determine the new total option length */  		iter = 0;  		optlen_new = 0; -		while (iter < opt->optlen) -			if (opt->__data[iter] != IPOPT_NOP) { -				iter += opt->__data[iter + 1]; +		while (iter < opt->opt.optlen) +			if (opt->opt.__data[iter] != IPOPT_NOP) { +				iter += opt->opt.__data[iter + 1];  				optlen_new = iter;  			} else  				iter++; -		hdr_delta = opt->optlen; -		opt->optlen = (optlen_new + 3) & ~3; -		hdr_delta -= opt->optlen; +		hdr_delta = opt->opt.optlen; +		opt->opt.optlen = (optlen_new + 3) & ~3; +		hdr_delta -= opt->opt.optlen;  	} else {  		/* only the cipso option was present on the socket so we can  		 * remove the entire option struct */  		*opt_ptr = NULL; -		hdr_delta = opt->optlen; -		kfree(opt); +		hdr_delta = opt->opt.optlen; +		kfree_rcu(opt, rcu);  	}  	return hdr_delta; @@ -2083,15 +2085,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)  void cipso_v4_sock_delattr(struct sock *sk)  {  	int hdr_delta; -	struct ip_options *opt; +	struct ip_options_rcu *opt;  	struct inet_sock *sk_inet;  	sk_inet = inet_sk(sk); -	opt = sk_inet->opt; -	if (opt == NULL || opt->cipso == 0) +	opt = rcu_dereference_protected(sk_inet->inet_opt, 1); +	if (opt == NULL || opt->opt.cipso == 0)  		return; -	hdr_delta = cipso_v4_delopt(&sk_inet->opt); +	hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);  	if (sk_inet->is_icsk && hdr_delta > 0) {  		struct inet_connection_sock *sk_conn = inet_csk(sk);  		sk_conn->icsk_ext_hdr_len -= hdr_delta; @@ -2109,12 +2111,12 @@ void cipso_v4_sock_delattr(struct sock *sk)   */  void cipso_v4_req_delattr(struct request_sock *req)  { -	struct ip_options *opt; +	struct ip_options_rcu *opt;  	struct inet_request_sock *req_inet;  	req_inet = inet_rsk(req);  	opt = req_inet->opt; -	if (opt == NULL || opt->cipso == 0) +	if (opt == NULL || opt->opt.cipso == 0)  		return;  	cipso_v4_delopt(&req_inet->opt); @@ -2184,14 +2186,18 @@ getattr_return:   */  int 
cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)  { -	struct ip_options *opt; - -	opt = inet_sk(sk)->opt; -	if (opt == NULL || opt->cipso == 0) -		return -ENOMSG; +	struct ip_options_rcu *opt; +	int res = -ENOMSG; -	return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), -				secattr); +	rcu_read_lock(); +	opt = rcu_dereference(inet_sk(sk)->inet_opt); +	if (opt && opt->opt.cipso) +		res = cipso_v4_getattr(opt->opt.__data + +						opt->opt.cipso - +						sizeof(struct iphdr), +				       secattr); +	rcu_read_unlock(); +	return res;  }  /** diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 174be6caa5c..a3095fdefbe 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  {  	struct inet_sock *inet = inet_sk(sk);  	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; +	struct flowi4 *fl4;  	struct rtable *rt;  	__be32 saddr;  	int oif; @@ -38,6 +39,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  	sk_dst_reset(sk); +	lock_sock(sk); +  	oif = sk->sk_bound_dev_if;  	saddr = inet->inet_saddr;  	if (ipv4_is_multicast(usin->sin_addr.s_addr)) { @@ -46,33 +49,74 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  		if (!saddr)  			saddr = inet->mc_addr;  	} -	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, -			       RT_CONN_FLAGS(sk), oif, -			       sk->sk_protocol, -			       inet->inet_sport, usin->sin_port, sk, 1); -	if (err) { +	fl4 = &inet->cork.fl.u.ip4; +	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, +			      RT_CONN_FLAGS(sk), oif, +			      sk->sk_protocol, +			      inet->inet_sport, usin->sin_port, sk); +	if (IS_ERR(rt)) { +		err = PTR_ERR(rt);  		if (err == -ENETUNREACH) -			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); -		return err; +			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); +		goto out;  	}  	if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {  		ip_rt_put(rt); -		return -EACCES; +		err = -EACCES; +		goto out;  	}  	if (!inet->inet_saddr) -		inet->inet_saddr = rt->rt_src;	/* Update source address */ +		inet->inet_saddr = fl4->saddr;	/* Update source address */  	if (!inet->inet_rcv_saddr) { -		inet->inet_rcv_saddr = rt->rt_src; +		inet->inet_rcv_saddr = fl4->saddr;  		if (sk->sk_prot->rehash)  			sk->sk_prot->rehash(sk);  	} -	inet->inet_daddr = rt->rt_dst; +	inet->inet_daddr = fl4->daddr;  	inet->inet_dport = usin->sin_port;  	sk->sk_state = TCP_ESTABLISHED;  	inet->inet_id = jiffies;  	sk_dst_set(sk, &rt->dst); -	return 0; +	err = 0; +out: +	release_sock(sk); +	return err;  }  EXPORT_SYMBOL(ip4_datagram_connect); + +/* Because UDP xmit path can manipulate sk_dst_cache without holding + * socket lock, we need to use sk_dst_set() here, + * even if we own the socket lock. 
+ */ +void ip4_datagram_release_cb(struct sock *sk) +{ +	const struct inet_sock *inet = inet_sk(sk); +	const struct ip_options_rcu *inet_opt; +	__be32 daddr = inet->inet_daddr; +	struct dst_entry *dst; +	struct flowi4 fl4; +	struct rtable *rt; + +	rcu_read_lock(); + +	dst = __sk_dst_get(sk); +	if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { +		rcu_read_unlock(); +		return; +	} +	inet_opt = rcu_dereference(inet->inet_opt); +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr; +	rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, +				   inet->inet_saddr, inet->inet_dport, +				   inet->inet_sport, sk->sk_protocol, +				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + +	dst = !IS_ERR(rt) ? &rt->dst : NULL; +	sk_dst_set(sk, dst); + +	rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ip4_datagram_release_cb); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3b067704ab3..e9449376b58 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -27,7 +27,6 @@  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/bitops.h>  #include <linux/capability.h>  #include <linux/module.h> @@ -51,10 +50,12 @@  #include <linux/inetdevice.h>  #include <linux/igmp.h>  #include <linux/slab.h> +#include <linux/hash.h>  #ifdef CONFIG_SYSCTL  #include <linux/sysctl.h>  #endif  #include <linux/kmod.h> +#include <linux/netconf.h>  #include <net/arp.h>  #include <net/ip.h> @@ -62,6 +63,9 @@  #include <net/ip_fib.h>  #include <net/rtnetlink.h>  #include <net/net_namespace.h> +#include <net/addrconf.h> + +#include "fib_lookup.h"  static struct ipv4_devconf ipv4_devconf = {  	.data = { @@ -69,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = {  		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,  		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,  		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, +		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, +		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,  	},  }; @@ -79,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = {  		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,  		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,  		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, +		[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, +		[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,  	},  }; @@ -90,8 +98,82 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {  	[IFA_ADDRESS]   	= { .type = NLA_U32 },  	[IFA_BROADCAST] 	= { .type = NLA_U32 },  	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, +	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) }, +	[IFA_FLAGS]		= { .type = NLA_U32 },  }; +#define IN4_ADDR_HSIZE_SHIFT	8 +#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT) + +static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; + +static u32 inet_addr_hash(struct net *net, __be32 addr) +{ +	u32 val = (__force u32) addr ^ net_hash_mix(net); + +	return hash_32(val, IN4_ADDR_HSIZE_SHIFT); +} + +static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) +{ +	u32 hash = inet_addr_hash(net, ifa->ifa_local); + +	ASSERT_RTNL(); +	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); +} + +static void inet_hash_remove(struct in_ifaddr *ifa) +{ +	ASSERT_RTNL(); +	hlist_del_init_rcu(&ifa->hash); +} + +/** + * __ip_dev_find - find the first device with a given source address. 
+ * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU, or RTNL + */ +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) +{ +	u32 hash = inet_addr_hash(net, addr); +	struct net_device *result = NULL; +	struct in_ifaddr *ifa; + +	rcu_read_lock(); +	hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { +		if (ifa->ifa_local == addr) { +			struct net_device *dev = ifa->ifa_dev->dev; + +			if (!net_eq(dev_net(dev), net)) +				continue; +			result = dev; +			break; +		} +	} +	if (!result) { +		struct flowi4 fl4 = { .daddr = addr }; +		struct fib_result res = { 0 }; +		struct fib_table *local; + +		/* Fallback to FIB local table so that communication +		 * over loopback subnets work. +		 */ +		local = fib_get_table(net, RT_TABLE_LOCAL); +		if (local && +		    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && +		    res.type == RTN_LOCAL) +			result = FIB_RES_DEV(res); +	} +	if (result && devref) +		dev_hold(result); +	rcu_read_unlock(); +	return result; +} +EXPORT_SYMBOL(__ip_dev_find); +  static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);  static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -101,10 +183,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  static void devinet_sysctl_register(struct in_device *idev);  static void devinet_sysctl_unregister(struct in_device *idev);  #else -static inline void devinet_sysctl_register(struct in_device *idev) +static void devinet_sysctl_register(struct in_device *idev)  {  } -static inline void devinet_sysctl_unregister(struct in_device *idev) +static void devinet_sysctl_unregister(struct in_device *idev)  {  }  #endif @@ -124,7 +206,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)  	kfree(ifa);  } -static inline void inet_free_ifa(struct in_ifaddr *ifa) +static void inet_free_ifa(struct in_ifaddr *ifa)  {  	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);  } @@ -135,9 +217,9 @@ void in_dev_finish_destroy(struct in_device *idev)  	WARN_ON(idev->ifa_list);  	WARN_ON(idev->mc_list); +	kfree(rcu_dereference_protected(idev->mc_hash, 1));  #ifdef NET_REFCNT_DEBUG -	printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", -	       idev, dev ? dev->name : "NIL"); +	pr_debug("%s: %p=%s\n", __func__, idev, dev ? 
dev->name : "NIL");  #endif  	dev_put(dev);  	if (!idev->dead) @@ -209,7 +291,7 @@ static void inetdev_destroy(struct in_device *in_dev)  		inet_free_ifa(ifa);  	} -	rcu_assign_pointer(dev->ip_ptr, NULL); +	RCU_INIT_POINTER(dev->ip_ptr, NULL);  	devinet_sysctl_unregister(in_dev);  	neigh_parms_release(&arp_tbl, in_dev->arp_parms); @@ -234,7 +316,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)  }  static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, -			 int destroy, struct nlmsghdr *nlh, u32 pid) +			 int destroy, struct nlmsghdr *nlh, u32 portid)  {  	struct in_ifaddr *promote = NULL;  	struct in_ifaddr *ifa, *ifa1 = *ifap; @@ -265,9 +347,10 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  			}  			if (!do_promote) { +				inet_hash_remove(ifa);  				*ifap1 = ifa->ifa_next; -				rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); +				rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);  				blocking_notifier_call_chain(&inetaddr_chain,  						NETDEV_DOWN, ifa);  				inet_free_ifa(ifa); @@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  		}  	} +	/* On promotion all secondaries from subnet are changing +	 * the primary IP, we must remove all their routes silently +	 * and later to add them back with new prefsrc. Do this +	 * while all addresses are on the device list. +	 */ +	for (ifa = promote; ifa; ifa = ifa->ifa_next) { +		if (ifa1->ifa_mask == ifa->ifa_mask && +		    inet_ifa_match(ifa1->ifa_address, ifa)) +			fib_del_ifaddr(ifa, ifa1); +	} +  	/* 2. Unlink it */  	*ifap = ifa1->ifa_next; +	inet_hash_remove(ifa1);  	/* 3. Announce address deletion */ @@ -292,10 +387,11 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  	   is valid, it will try to restore deleted routes... Grr.  	   So that, this order is correct.  	 
*/ -	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid); +	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);  	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);  	if (promote) { +		struct in_ifaddr *next_sec = promote->ifa_next;  		if (prev_prom) {  			prev_prom->ifa_next = promote->ifa_next; @@ -304,10 +400,10 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  		}  		promote->ifa_flags &= ~IFA_F_SECONDARY; -		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); +		rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);  		blocking_notifier_call_chain(&inetaddr_chain,  				NETDEV_UP, promote); -		for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { +		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {  			if (ifa1->ifa_mask != ifa->ifa_mask ||  			    !inet_ifa_match(ifa1->ifa_address, ifa))  					continue; @@ -325,8 +421,12 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,  	__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);  } +static void check_lifetime(struct work_struct *work); + +static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); +  static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, -			     u32 pid) +			     u32 portid)  {  	struct in_device *in_dev = ifa->ifa_dev;  	struct in_ifaddr *ifa1, **ifap, **last_primary; @@ -361,17 +461,22 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,  	}  	if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { -		net_srandom(ifa->ifa_local); +		prandom_seed((__force u32) ifa->ifa_local);  		ifap = last_primary;  	}  	ifa->ifa_next = *ifap;  	*ifap = ifa; +	inet_hash_insert(dev_net(in_dev->dev), ifa); + +	cancel_delayed_work(&check_lifetime_work); +	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); +  	/* Send message first, then call notifier.  	   
Notifier will trigger FIB update, so that  	   listeners of netlink will know about new ifaddr */ -	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid); +	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);  	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);  	return 0; @@ -393,6 +498,7 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)  		return -ENOBUFS;  	}  	ipv4_devconf_setall(in_dev); +	neigh_parms_data_state_setall(in_dev->arp_parms);  	if (ifa->ifa_dev != in_dev) {  		WARN_ON(ifa->ifa_dev);  		in_dev_hold(in_dev); @@ -434,7 +540,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,  	return NULL;  } -static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct nlattr *tb[IFA_MAX+1]; @@ -470,7 +576,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg  		    !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))  			continue; -		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid); +		__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);  		return 0;  	} @@ -479,7 +585,132 @@ errout:  	return err;  } -static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) +#define INFINITY_LIFE_TIME	0xFFFFFFFF + +static void check_lifetime(struct work_struct *work) +{ +	unsigned long now, next, next_sec, next_sched; +	struct in_ifaddr *ifa; +	struct hlist_node *n; +	int i; + +	now = jiffies; +	next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); + +	for (i = 0; i < IN4_ADDR_HSIZE; i++) { +		bool change_needed = false; + +		rcu_read_lock(); +		hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { +			unsigned long age; + +			if (ifa->ifa_flags & IFA_F_PERMANENT) +				continue; + +			/* We try to batch several events at once. */ +			age = (now - ifa->ifa_tstamp + +			       ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + +			if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && +			    age >= ifa->ifa_valid_lft) { +				change_needed = true; +			} else if (ifa->ifa_preferred_lft == +				   INFINITY_LIFE_TIME) { +				continue; +			} else if (age >= ifa->ifa_preferred_lft) { +				if (time_before(ifa->ifa_tstamp + +						ifa->ifa_valid_lft * HZ, next)) +					next = ifa->ifa_tstamp + +					       ifa->ifa_valid_lft * HZ; + +				if (!(ifa->ifa_flags & IFA_F_DEPRECATED)) +					change_needed = true; +			} else if (time_before(ifa->ifa_tstamp + +					       ifa->ifa_preferred_lft * HZ, +					       next)) { +				next = ifa->ifa_tstamp + +				       ifa->ifa_preferred_lft * HZ; +			} +		} +		rcu_read_unlock(); +		if (!change_needed) +			continue; +		rtnl_lock(); +		hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) { +			unsigned long age; + +			if (ifa->ifa_flags & IFA_F_PERMANENT) +				continue; + +			/* We try to batch several events at once. 
*/ +			age = (now - ifa->ifa_tstamp + +			       ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + +			if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && +			    age >= ifa->ifa_valid_lft) { +				struct in_ifaddr **ifap; + +				for (ifap = &ifa->ifa_dev->ifa_list; +				     *ifap != NULL; ifap = &(*ifap)->ifa_next) { +					if (*ifap == ifa) { +						inet_del_ifa(ifa->ifa_dev, +							     ifap, 1); +						break; +					} +				} +			} else if (ifa->ifa_preferred_lft != +				   INFINITY_LIFE_TIME && +				   age >= ifa->ifa_preferred_lft && +				   !(ifa->ifa_flags & IFA_F_DEPRECATED)) { +				ifa->ifa_flags |= IFA_F_DEPRECATED; +				rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); +			} +		} +		rtnl_unlock(); +	} + +	next_sec = round_jiffies_up(next); +	next_sched = next; + +	/* If rounded timeout is accurate enough, accept it. */ +	if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) +		next_sched = next_sec; + +	now = jiffies; +	/* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ +	if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) +		next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; + +	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, +			next_sched - now); +} + +static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, +			     __u32 prefered_lft) +{ +	unsigned long timeout; + +	ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); + +	timeout = addrconf_timeout_fixup(valid_lft, HZ); +	if (addrconf_finite_timeout(timeout)) +		ifa->ifa_valid_lft = timeout; +	else +		ifa->ifa_flags |= IFA_F_PERMANENT; + +	timeout = addrconf_timeout_fixup(prefered_lft, HZ); +	if (addrconf_finite_timeout(timeout)) { +		if (timeout == 0) +			ifa->ifa_flags |= IFA_F_DEPRECATED; +		ifa->ifa_preferred_lft = timeout; +	} +	ifa->ifa_tstamp = jiffies; +	if (!ifa->ifa_cstamp) +		ifa->ifa_cstamp = ifa->ifa_tstamp; +} + +static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, +				       __u32 *pvalid_lft, __u32 *pprefered_lft)  {  	struct nlattr *tb[IFA_MAX+1];  	struct in_ifaddr *ifa; @@ -516,14 +747,17 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)  		goto errout;  	ipv4_devconf_setall(in_dev); +	neigh_parms_data_state_setall(in_dev->arp_parms);  	in_dev_hold(in_dev);  	if (tb[IFA_ADDRESS] == NULL)  		tb[IFA_ADDRESS] = tb[IFA_LOCAL]; +	INIT_HLIST_NODE(&ifa->hash);  	ifa->ifa_prefixlen = ifm->ifa_prefixlen;  	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); -	ifa->ifa_flags = ifm->ifa_flags; +	ifa->ifa_flags = tb[IFA_FLAGS] ? 
nla_get_u32(tb[IFA_FLAGS]) : +					 ifm->ifa_flags;  	ifa->ifa_scope = ifm->ifa_scope;  	ifa->ifa_dev = in_dev; @@ -538,31 +772,87 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)  	else  		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); +	if (tb[IFA_CACHEINFO]) { +		struct ifa_cacheinfo *ci; + +		ci = nla_data(tb[IFA_CACHEINFO]); +		if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { +			err = -EINVAL; +			goto errout_free; +		} +		*pvalid_lft = ci->ifa_valid; +		*pprefered_lft = ci->ifa_prefered; +	} +  	return ifa; +errout_free: +	inet_free_ifa(ifa);  errout:  	return ERR_PTR(err);  } -static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) +{ +	struct in_device *in_dev = ifa->ifa_dev; +	struct in_ifaddr *ifa1, **ifap; + +	if (!ifa->ifa_local) +		return NULL; + +	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; +	     ifap = &ifa1->ifa_next) { +		if (ifa1->ifa_mask == ifa->ifa_mask && +		    inet_ifa_match(ifa1->ifa_address, ifa) && +		    ifa1->ifa_local == ifa->ifa_local) +			return ifa1; +	} +	return NULL; +} + +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct in_ifaddr *ifa; +	struct in_ifaddr *ifa_existing; +	__u32 valid_lft = INFINITY_LIFE_TIME; +	__u32 prefered_lft = INFINITY_LIFE_TIME;  	ASSERT_RTNL(); -	ifa = rtm_to_ifaddr(net, nlh); +	ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);  	if (IS_ERR(ifa))  		return PTR_ERR(ifa); -	return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid); +	ifa_existing = find_matching_ifa(ifa); +	if (!ifa_existing) { +		/* It would be best to check for !NLM_F_CREATE here but +		 * userspace already relies on not having to provide this. +		 */ +		set_ifa_lifetime(ifa, valid_lft, prefered_lft); +		return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); +	} else { +		inet_free_ifa(ifa); + +		if (nlh->nlmsg_flags & NLM_F_EXCL || +		    !(nlh->nlmsg_flags & NLM_F_REPLACE)) +			return -EEXIST; +		ifa = ifa_existing; +		set_ifa_lifetime(ifa, valid_lft, prefered_lft); +		cancel_delayed_work(&check_lifetime_work); +		queue_delayed_work(system_power_efficient_wq, +				&check_lifetime_work, 0); +		rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); +		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); +	} +	return 0;  }  /*   *	Determine a default network mask, based on the IP address.   */ -static inline int inet_abc_len(__be32 addr) +static int inet_abc_len(__be32 addr)  {  	int rc = -1;	/* Something else, probably a multicast. 
*/ @@ -628,16 +918,16 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)  		break;  	case SIOCSIFFLAGS: -		ret = -EACCES; -		if (!capable(CAP_NET_ADMIN)) +		ret = -EPERM; +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			goto out;  		break;  	case SIOCSIFADDR:	/* Set interface address (and family) */  	case SIOCSIFBRDADDR:	/* Set the broadcast address */  	case SIOCSIFDSTADDR:	/* Set the destination address */  	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */ -		ret = -EACCES; -		if (!capable(CAP_NET_ADMIN)) +		ret = -EPERM; +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			goto out;  		ret = -EINVAL;  		if (sin->sin_family != AF_INET) @@ -670,7 +960,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)  			     ifap = &ifa->ifa_next) {  				if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&  				    sin_orig.sin_addr.s_addr == -							ifa->ifa_address) { +							ifa->ifa_local) {  					break; /* found */  				}  			} @@ -730,6 +1020,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)  			ifa = inet_alloc_ifa();  			if (!ifa)  				break; +			INIT_HLIST_NODE(&ifa->hash);  			if (colon)  				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);  			else @@ -756,6 +1047,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)  			ifa->ifa_prefixlen = 32;  			ifa->ifa_mask = inet_make_mask(32);  		} +		set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);  		ret = inet_set_ifa(dev, ifa);  		break; @@ -841,10 +1133,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)  		if (len < (int) sizeof(ifr))  			break;  		memset(&ifr, 0, sizeof(struct ifreq)); -		if (ifa->ifa_label) -			strcpy(ifr.ifr_name, ifa->ifa_label); -		else -			strcpy(ifr.ifr_name, dev->name); +		strcpy(ifr.ifr_name, ifa->ifa_label);  		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;  		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = @@ -950,22 +1239,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,  /*   * Confirm that local IP address exists using wildcards: - * - in_dev: only on this interface, 0=any interface + * - net: netns to check, cannot be NULL + * - in_dev: only on this interface, NULL=any interface   * - dst: only in the same subnet as dst, 0=any dst   * - local: address, 0=autoselect the local address   * - scope: maximum allowed scope value for the local address   */ -__be32 inet_confirm_addr(struct in_device *in_dev, +__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,  			 __be32 dst, __be32 local, int scope)  {  	__be32 addr = 0;  	struct net_device *dev; -	struct net *net; -	if (scope != RT_SCOPE_LINK) +	if (in_dev != NULL)  		return confirm_addr_indev(in_dev, dst, local, scope); -	net = dev_net(in_dev->dev);  	rcu_read_lock();  	for_each_netdev_rcu(net, dev) {  		in_dev = __in_dev_get_rcu(dev); @@ -979,6 +1267,7 @@ __be32 inet_confirm_addr(struct in_device *in_dev,  	return addr;  } +EXPORT_SYMBOL(inet_confirm_addr);  /*   *	Device notifier @@ -1025,17 +1314,32 @@ skip:  	}  } -static inline bool inetdev_valid_mtu(unsigned mtu) +static bool inetdev_valid_mtu(unsigned int mtu)  {  	return mtu >= 68;  } +static void inetdev_send_gratuitous_arp(struct net_device *dev, +					struct in_device *in_dev) + +{ +	struct in_ifaddr *ifa; + +	for (ifa = in_dev->ifa_list; ifa; +	     ifa = ifa->ifa_next) { +		arp_send(ARPOP_REQUEST, ETH_P_ARP, +			 ifa->ifa_local, dev, +			 ifa->ifa_local, NULL, +			 dev->dev_addr, NULL); +	} +} +  
/* Called only under RTNL semaphore */  static int inetdev_event(struct notifier_block *this, unsigned long event,  			 void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct in_device *in_dev = __in_dev_get_rtnl(dev);  	ASSERT_RTNL(); @@ -1059,8 +1363,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,  	switch (event) {  	case NETDEV_REGISTER: -		printk(KERN_DEBUG "inetdev_event: bug\n"); -		rcu_assign_pointer(dev->ip_ptr, NULL); +		pr_debug("%s: bug\n", __func__); +		RCU_INIT_POINTER(dev->ip_ptr, NULL);  		break;  	case NETDEV_UP:  		if (!inetdev_valid_mtu(dev->mtu)) @@ -1069,6 +1373,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,  			struct in_ifaddr *ifa = inet_alloc_ifa();  			if (ifa) { +				INIT_HLIST_NODE(&ifa->hash);  				ifa->ifa_local =  				  ifa->ifa_address = htonl(INADDR_LOOPBACK);  				ifa->ifa_prefixlen = 8; @@ -1077,23 +1382,22 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,  				ifa->ifa_dev = in_dev;  				ifa->ifa_scope = RT_SCOPE_HOST;  				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); +				set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, +						 INFINITY_LIFE_TIME); +				ipv4_devconf_setall(in_dev); +				neigh_parms_data_state_setall(in_dev->arp_parms);  				inet_insert_ifa(ifa);  			}  		}  		ip_mc_up(in_dev);  		/* fall through */ -	case NETDEV_NOTIFY_PEERS:  	case NETDEV_CHANGEADDR: +		if (!IN_DEV_ARP_NOTIFY(in_dev)) +			break; +		/* fall through */ +	case NETDEV_NOTIFY_PEERS:  		/* Send gratuitous ARP to notify of link change */ -		if (IN_DEV_ARP_NOTIFY(in_dev)) { -			struct in_ifaddr *ifa = in_dev->ifa_list; - -			if (ifa) -				arp_send(ARPOP_REQUEST, ETH_P_ARP, -					 ifa->ifa_address, dev, -					 ifa->ifa_address, NULL, -					 dev->dev_addr, NULL); -		} +		inetdev_send_gratuitous_arp(dev, in_dev);  		break;  	case NETDEV_DOWN:  		ip_mc_down(in_dev); @@ -1129,43 +1433,86 @@ static struct notifier_block ip_netdev_notifier = {  	.notifier_call = inetdev_event,  }; -static inline size_t inet_nlmsg_size(void) +static size_t inet_nlmsg_size(void)  {  	return NLMSG_ALIGN(sizeof(struct ifaddrmsg))  	       + nla_total_size(4) /* IFA_ADDRESS */  	       + nla_total_size(4) /* IFA_LOCAL */  	       + nla_total_size(4) /* IFA_BROADCAST */ -	       + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ +	       + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ +	       + nla_total_size(4)  /* IFA_FLAGS */ +	       + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ +} + +static inline u32 cstamp_delta(unsigned long cstamp) +{ +	return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} + +static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, +			 unsigned long tstamp, u32 preferred, u32 valid) +{ +	struct ifa_cacheinfo ci; + +	ci.cstamp = cstamp_delta(cstamp); +	ci.tstamp = cstamp_delta(tstamp); +	ci.ifa_prefered = preferred; +	ci.ifa_valid = valid; + +	return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);  }  static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, -			    u32 pid, u32 seq, int event, unsigned int flags) +			    u32 portid, u32 seq, int event, unsigned int flags)  {  	struct ifaddrmsg *ifm;  	struct nlmsghdr  *nlh; +	u32 preferred, valid; -	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);  	if (nlh == NULL)  		return -EMSGSIZE;  	ifm = nlmsg_data(nlh);  	ifm->ifa_family = AF_INET;  	ifm->ifa_prefixlen = 
ifa->ifa_prefixlen; -	ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; +	ifm->ifa_flags = ifa->ifa_flags;  	ifm->ifa_scope = ifa->ifa_scope;  	ifm->ifa_index = ifa->ifa_dev->dev->ifindex; -	if (ifa->ifa_address) -		NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address); +	if (!(ifm->ifa_flags & IFA_F_PERMANENT)) { +		preferred = ifa->ifa_preferred_lft; +		valid = ifa->ifa_valid_lft; +		if (preferred != INFINITY_LIFE_TIME) { +			long tval = (jiffies - ifa->ifa_tstamp) / HZ; -	if (ifa->ifa_local) -		NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local); - -	if (ifa->ifa_broadcast) -		NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); - -	if (ifa->ifa_label[0]) -		NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); +			if (preferred > tval) +				preferred -= tval; +			else +				preferred = 0; +			if (valid != INFINITY_LIFE_TIME) { +				if (valid > tval) +					valid -= tval; +				else +					valid = 0; +			} +		} +	} else { +		preferred = INFINITY_LIFE_TIME; +		valid = INFINITY_LIFE_TIME; +	} +	if ((ifa->ifa_address && +	     nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || +	    (ifa->ifa_local && +	     nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) || +	    (ifa->ifa_broadcast && +	     nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || +	    (ifa->ifa_label[0] && +	     nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || +	    nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || +	    put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, +			  preferred, valid)) +		goto nla_put_failure;  	return nlmsg_end(skb, nlh); @@ -1184,7 +1531,6 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)  	struct in_device *in_dev;  	struct in_ifaddr *ifa;  	struct hlist_head *head; -	struct hlist_node *node;  	s_h = cb->args[0];  	s_idx = idx = cb->args[1]; @@ -1194,7 +1540,9 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)  		idx = 0;  		head = &net->dev_index_head[h];  		rcu_read_lock(); -		hlist_for_each_entry_rcu(dev, node, head, index_hlist) { +		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ +			  net->dev_base_seq; +		hlist_for_each_entry_rcu(dev, head, index_hlist) {  			if (idx < s_idx)  				goto cont;  			if (h > s_h || idx > s_idx) @@ -1208,12 +1556,13 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)  				if (ip_idx < s_ip_idx)  					continue;  				if (inet_fill_ifaddr(skb, ifa, -					     NETLINK_CB(cb->skb).pid, +					     NETLINK_CB(cb->skb).portid,  					     cb->nlh->nlmsg_seq,  					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {  					rcu_read_unlock();  					goto done;  				} +				nl_dump_check_consistent(cb, nlmsg_hdr(skb));  			}  cont:  			idx++; @@ -1230,7 +1579,7 @@ done:  }  static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, -		      u32 pid) +		      u32 portid)  {  	struct sk_buff *skb;  	u32 seq = nlh ? 
nlh->nlmsg_seq : 0; @@ -1242,14 +1591,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,  	if (skb == NULL)  		goto errout; -	err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0); +	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */  		WARN_ON(err == -EMSGSIZE);  		kfree_skb(skb);  		goto errout;  	} -	rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); +	rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);  	return;  errout:  	if (err < 0) @@ -1258,7 +1607,7 @@ errout:  static size_t inet_get_link_af_size(const struct net_device *dev)  { -	struct in_device *in_dev = __in_dev_get_rtnl(dev); +	struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);  	if (!in_dev)  		return 0; @@ -1268,7 +1617,7 @@ static size_t inet_get_link_af_size(const struct net_device *dev)  static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)  { -	struct in_device *in_dev = __in_dev_get_rtnl(dev); +	struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);  	struct nlattr *nla;  	int i; @@ -1337,6 +1686,232 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)  	return 0;  } +static int inet_netconf_msgsize_devconf(int type) +{ +	int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) +		   + nla_total_size(4);	/* NETCONFA_IFINDEX */ + +	/* type -1 is used for ALL */ +	if (type == -1 || type == NETCONFA_FORWARDING) +		size += nla_total_size(4); +	if (type == -1 || type == NETCONFA_RP_FILTER) +		size += nla_total_size(4); +	if (type == -1 || type == NETCONFA_MC_FORWARDING) +		size += nla_total_size(4); +	if (type == -1 || type == NETCONFA_PROXY_NEIGH) +		size += nla_total_size(4); + +	return size; +} + +static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, +				     struct ipv4_devconf *devconf, u32 portid, +				     u32 seq, int event, unsigned int flags, +				     int type) +{ +	struct nlmsghdr  *nlh; +	struct netconfmsg *ncm; + +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), +			flags); +	if (nlh == NULL) +		return -EMSGSIZE; + +	ncm = nlmsg_data(nlh); +	ncm->ncm_family = AF_INET; + +	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) +		goto nla_put_failure; + +	/* type -1 is used for ALL */ +	if ((type == -1 || type == NETCONFA_FORWARDING) && +	    nla_put_s32(skb, NETCONFA_FORWARDING, +			IPV4_DEVCONF(*devconf, FORWARDING)) < 0) +		goto nla_put_failure; +	if ((type == -1 || type == NETCONFA_RP_FILTER) && +	    nla_put_s32(skb, NETCONFA_RP_FILTER, +			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) +		goto nla_put_failure; +	if ((type == -1 || type == NETCONFA_MC_FORWARDING) && +	    nla_put_s32(skb, NETCONFA_MC_FORWARDING, +			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) +		goto nla_put_failure; +	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && +	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, +			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) +		goto nla_put_failure; + +	return nlmsg_end(skb, nlh); + +nla_put_failure: +	nlmsg_cancel(skb, nlh); +	return -EMSGSIZE; +} + +void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, +				 struct ipv4_devconf *devconf) +{ +	struct sk_buff *skb; +	int err = -ENOBUFS; + +	skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC); +	if (skb == NULL) +		goto errout; + +	err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, +					RTM_NEWNETCONF, 0, type); +	if (err < 0) { +		/* -EMSGSIZE implies BUG in 
inet_netconf_msgsize_devconf() */ +		WARN_ON(err == -EMSGSIZE); +		kfree_skb(skb); +		goto errout; +	} +	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC); +	return; +errout: +	if (err < 0) +		rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); +} + +static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { +	[NETCONFA_IFINDEX]	= { .len = sizeof(int) }, +	[NETCONFA_FORWARDING]	= { .len = sizeof(int) }, +	[NETCONFA_RP_FILTER]	= { .len = sizeof(int) }, +	[NETCONFA_PROXY_NEIGH]	= { .len = sizeof(int) }, +}; + +static int inet_netconf_get_devconf(struct sk_buff *in_skb, +				    struct nlmsghdr *nlh) +{ +	struct net *net = sock_net(in_skb->sk); +	struct nlattr *tb[NETCONFA_MAX+1]; +	struct netconfmsg *ncm; +	struct sk_buff *skb; +	struct ipv4_devconf *devconf; +	struct in_device *in_dev; +	struct net_device *dev; +	int ifindex; +	int err; + +	err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, +			  devconf_ipv4_policy); +	if (err < 0) +		goto errout; + +	err = EINVAL; +	if (!tb[NETCONFA_IFINDEX]) +		goto errout; + +	ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); +	switch (ifindex) { +	case NETCONFA_IFINDEX_ALL: +		devconf = net->ipv4.devconf_all; +		break; +	case NETCONFA_IFINDEX_DEFAULT: +		devconf = net->ipv4.devconf_dflt; +		break; +	default: +		dev = __dev_get_by_index(net, ifindex); +		if (dev == NULL) +			goto errout; +		in_dev = __in_dev_get_rtnl(dev); +		if (in_dev == NULL) +			goto errout; +		devconf = &in_dev->cnf; +		break; +	} + +	err = -ENOBUFS; +	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); +	if (skb == NULL) +		goto errout; + +	err = inet_netconf_fill_devconf(skb, ifindex, devconf, +					NETLINK_CB(in_skb).portid, +					nlh->nlmsg_seq, RTM_NEWNETCONF, 0, +					-1); +	if (err < 0) { +		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ +		WARN_ON(err == -EMSGSIZE); +		kfree_skb(skb); +		goto errout; +	} +	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +errout: +	return err; +} + +static int inet_netconf_dump_devconf(struct sk_buff *skb, +				     struct netlink_callback *cb) +{ +	struct net *net = sock_net(skb->sk); +	int h, s_h; +	int idx, s_idx; +	struct net_device *dev; +	struct in_device *in_dev; +	struct hlist_head *head; + +	s_h = cb->args[0]; +	s_idx = idx = cb->args[1]; + +	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { +		idx = 0; +		head = &net->dev_index_head[h]; +		rcu_read_lock(); +		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^ +			  net->dev_base_seq; +		hlist_for_each_entry_rcu(dev, head, index_hlist) { +			if (idx < s_idx) +				goto cont; +			in_dev = __in_dev_get_rcu(dev); +			if (!in_dev) +				goto cont; + +			if (inet_netconf_fill_devconf(skb, dev->ifindex, +						      &in_dev->cnf, +						      NETLINK_CB(cb->skb).portid, +						      cb->nlh->nlmsg_seq, +						      RTM_NEWNETCONF, +						      NLM_F_MULTI, +						      -1) <= 0) { +				rcu_read_unlock(); +				goto done; +			} +			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: +			idx++; +		} +		rcu_read_unlock(); +	} +	if (h == NETDEV_HASHENTRIES) { +		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, +					      net->ipv4.devconf_all, +					      NETLINK_CB(cb->skb).portid, +					      cb->nlh->nlmsg_seq, +					      RTM_NEWNETCONF, NLM_F_MULTI, +					      -1) <= 0) +			goto done; +		else +			h++; +	} +	if (h == NETDEV_HASHENTRIES + 1) { +		if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, +					      net->ipv4.devconf_dflt, +					      NETLINK_CB(cb->skb).portid, +					      cb->nlh->nlmsg_seq, +					 
     RTM_NEWNETCONF, NLM_F_MULTI, +					      -1) <= 0) +			goto done; +		else +			h++; +	} +done: +	cb->args[0] = h; +	cb->args[1] = idx; + +	return skb->len; +} +  #ifdef CONFIG_SYSCTL  static void devinet_copy_dflt_conf(struct net *net, int i) @@ -1362,6 +1937,12 @@ static void inet_forward_change(struct net *net)  	IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;  	IPV4_DEVCONF_DFLT(net, FORWARDING) = on; +	inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, +				    NETCONFA_IFINDEX_ALL, +				    net->ipv4.devconf_all); +	inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, +				    NETCONFA_IFINDEX_DEFAULT, +				    net->ipv4.devconf_dflt);  	for_each_netdev(net, dev) {  		struct in_device *in_dev; @@ -1369,33 +1950,69 @@ static void inet_forward_change(struct net *net)  			dev_disable_lro(dev);  		rcu_read_lock();  		in_dev = __in_dev_get_rcu(dev); -		if (in_dev) +		if (in_dev) {  			IN_DEV_CONF_SET(in_dev, FORWARDING, on); +			inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, +						    dev->ifindex, &in_dev->cnf); +		}  		rcu_read_unlock();  	}  } -static int devinet_conf_proc(ctl_table *ctl, int write, +static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) +{ +	if (cnf == net->ipv4.devconf_dflt) +		return NETCONFA_IFINDEX_DEFAULT; +	else if (cnf == net->ipv4.devconf_all) +		return NETCONFA_IFINDEX_ALL; +	else { +		struct in_device *idev +			= container_of(cnf, struct in_device, cnf); +		return idev->dev->ifindex; +	} +} + +static int devinet_conf_proc(struct ctl_table *ctl, int write,  			     void __user *buffer,  			     size_t *lenp, loff_t *ppos)  { +	int old_value = *(int *)ctl->data;  	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); +	int new_value = *(int *)ctl->data;  	if (write) {  		struct ipv4_devconf *cnf = ctl->extra1;  		struct net *net = ctl->extra2;  		int i = (int *)ctl->data - cnf->data; +		int ifindex;  		set_bit(i, cnf->state);  		if (cnf == net->ipv4.devconf_dflt)  			devinet_copy_dflt_conf(net, i); +		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || +		    i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) +			if ((new_value == 0) && (old_value != 0)) +				rt_cache_flush(net); + +		if (i == IPV4_DEVCONF_RP_FILTER - 1 && +		    new_value != old_value) { +			ifindex = devinet_conf_ifindex(net, cnf); +			inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, +						    ifindex, cnf); +		} +		if (i == IPV4_DEVCONF_PROXY_ARP - 1 && +		    new_value != old_value) { +			ifindex = devinet_conf_ifindex(net, cnf); +			inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, +						    ifindex, cnf); +		}  	}  	return ret;  } -static int devinet_sysctl_forward(ctl_table *ctl, int write, +static int devinet_sysctl_forward(struct ctl_table *ctl, int write,  				  void __user *buffer,  				  size_t *lenp, loff_t *ppos)  { @@ -1416,23 +2033,31 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,  			}  			if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {  				inet_forward_change(net); -			} else if (*valp) { +			} else {  				struct ipv4_devconf *cnf = ctl->extra1;  				struct in_device *idev =  					container_of(cnf, struct in_device, cnf); -				dev_disable_lro(idev->dev); +				if (*valp) +					dev_disable_lro(idev->dev); +				inet_netconf_notify_devconf(net, +							    NETCONFA_FORWARDING, +							    idev->dev->ifindex, +							    cnf);  			}  			rtnl_unlock(); -			rt_cache_flush(net, 0); -		} +			rt_cache_flush(net); +		} else +			inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, +						    NETCONFA_IFINDEX_DEFAULT, +						    
net->ipv4.devconf_dflt);  	}  	return ret;  } -int ipv4_doint_and_flush(ctl_table *ctl, int write, -			 void __user *buffer, -			 size_t *lenp, loff_t *ppos) +static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, +				void __user *buffer, +				size_t *lenp, loff_t *ppos)  {  	int *valp = ctl->data;  	int val = *valp; @@ -1440,7 +2065,7 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,  	struct net *net = ctl->extra2;  	if (write && *valp != val) -		rt_cache_flush(net, 0); +		rt_cache_flush(net);  	return ret;  } @@ -1471,7 +2096,6 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,  static struct devinet_sysctl_table {  	struct ctl_table_header *sysctl_header;  	struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; -	char *dev_name;  } devinet_sysctl = {  	.devinet_vars = {  		DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", @@ -1498,13 +2122,19 @@ static struct devinet_sysctl_table {  		DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),  		DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),  		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), +		DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, +					"force_igmp_version"), +		DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, +					"igmpv2_unsolicited_report_interval"), +		DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, +					"igmpv3_unsolicited_report_interval"),  		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),  		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), -		DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION, -					      "force_igmp_version"),  		DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,  					      "promote_secondaries"), +		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, +					      "route_localnet"),  	},  }; @@ -1513,16 +2143,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,  {  	int i;  	struct devinet_sysctl_table *t; - -#define DEVINET_CTL_PATH_DEV	3 - -	struct ctl_path devinet_ctl_path[] = { -		{ .procname = "net",  }, -		{ .procname = "ipv4", }, -		{ .procname = "conf", }, -		{ /* to be set */ }, -		{ }, -	}; +	char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];  	t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);  	if (!t) @@ -1534,27 +2155,15 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,  		t->devinet_vars[i].extra2 = net;  	} -	/* -	 * Make a copy of dev_name, because '.procname' is regarded as const -	 * by sysctl and we wouldn't want anyone to change it under our feet -	 * (see SIOCSIFNAME). 
-	 */ -	t->dev_name = kstrdup(dev_name, GFP_KERNEL); -	if (!t->dev_name) -		goto free; - -	devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; +	snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); -	t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, -			t->devinet_vars); +	t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);  	if (!t->sysctl_header) -		goto free_procname; +		goto free;  	p->sysctl = t;  	return 0; -free_procname: -	kfree(t->dev_name);  free:  	kfree(t);  out: @@ -1569,14 +2178,13 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)  		return;  	cnf->sysctl = NULL; -	unregister_sysctl_table(t->sysctl_header); -	kfree(t->dev_name); +	unregister_net_sysctl_table(t->sysctl_header);  	kfree(t);  }  static void devinet_sysctl_register(struct in_device *idev)  { -	neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL); +	neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);  	__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,  					&idev->cnf);  } @@ -1600,12 +2208,6 @@ static struct ctl_table ctl_forward_entry[] = {  	},  	{ },  }; - -static __net_initdata struct ctl_path net_ipv4_path[] = { -	{ .procname = "net", }, -	{ .procname = "ipv4", }, -	{ }, -};  #endif  static __net_init int devinet_init_net(struct net *net) @@ -1651,7 +2253,7 @@ static __net_init int devinet_init_net(struct net *net)  		goto err_reg_dflt;  	err = -ENOMEM; -	forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl); +	forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);  	if (forw_hdr == NULL)  		goto err_reg_ctl;  	net->ipv4.forw_hdr = forw_hdr; @@ -1710,15 +2312,24 @@ static struct rtnl_af_ops inet_af_ops = {  void __init devinet_init(void)  { +	int i; + +	for (i = 0; i < IN4_ADDR_HSIZE; i++) +		INIT_HLIST_HEAD(&inet_addr_lst[i]); +  	register_pernet_subsys(&devinet_ops);  	register_gifconf(PF_INET, inet_gifconf);  	register_netdevice_notifier(&ip_netdev_notifier); +	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); +  	rtnl_af_register(&inet_af_ops); -	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); -	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); -	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); +	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); +	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); +	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); +	rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, +		      inet_netconf_dump_devconf, NULL);  } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 14ca1f1c3fb..360b565918c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "IPsec: " fmt +  #include <crypto/aead.h>  #include <crypto/authenc.h>  #include <linux/err.h> @@ -23,6 +25,8 @@ struct esp_skb_cb {  #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) +static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); +  /*   * Allocate an AEAD request structure with extra space for SG and IV.   * @@ -31,11 +35,14 @@ struct esp_skb_cb {   *   * TODO: Use spare space in skb for this where possible.   
*/ -static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)  {  	unsigned int len; -	len = crypto_aead_ivsize(aead); +	len = seqhilen; + +	len += crypto_aead_ivsize(aead); +  	if (len) {  		len += crypto_aead_alignmask(aead) &  		       ~(crypto_tfm_ctx_alignment() - 1); @@ -50,10 +57,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)  	return kmalloc(len, GFP_ATOMIC);  } -static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ +	return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)  {  	return crypto_aead_ivsize(aead) ? -	       PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; +	       PTR_ALIGN((u8 *)tmp + seqhilen, +			 crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;  }  static inline struct aead_givcrypt_request *esp_tmp_givreq( @@ -109,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)  	struct aead_givcrypt_request *req;  	struct scatterlist *sg;  	struct scatterlist *asg; -	struct esp_data *esp;  	struct sk_buff *trailer;  	void *tmp;  	u8 *iv; @@ -117,46 +128,72 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)  	int blksize;  	int clen;  	int alen; +	int plen; +	int tfclen;  	int nfrags; +	int assoclen; +	int sglists; +	int seqhilen; +	__be32 *seqhi;  	/* skb is pure payload to encrypt */ -	err = -ENOMEM; - -	/* Round to block size */ -	clen = skb->len; - -	esp = x->data; -	aead = esp->aead; +	aead = x->data;  	alen = crypto_aead_authsize(aead); +	tfclen = 0; +	if (x->tfcpad) { +		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); +		u32 padto; + +		padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); +		if (skb->len < padto) +			tfclen = padto - skb->len; +	}  	blksize = ALIGN(crypto_aead_blocksize(aead), 4); -	clen = ALIGN(clen + 2, blksize); -	if (esp->padlen) -		clen = ALIGN(clen, esp->padlen); +	clen = ALIGN(skb->len + 2 + tfclen, blksize); +	plen = clen - skb->len - tfclen; -	if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) +	err = skb_cow_data(skb, tfclen + plen + alen, &trailer); +	if (err < 0)  		goto error;  	nfrags = err; -	tmp = esp_alloc_tmp(aead, nfrags + 1); -	if (!tmp) +	assoclen = sizeof(*esph); +	sglists = 1; +	seqhilen = 0; + +	if (x->props.flags & XFRM_STATE_ESN) { +		sglists += 2; +		seqhilen += sizeof(__be32); +		assoclen += seqhilen; +	} + +	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); +	if (!tmp) { +		err = -ENOMEM;  		goto error; +	} -	iv = esp_tmp_iv(aead, tmp); +	seqhi = esp_tmp_seqhi(tmp); +	iv = esp_tmp_iv(aead, tmp, seqhilen);  	req = esp_tmp_givreq(aead, iv);  	asg = esp_givreq_sg(aead, req); -	sg = asg + 1; +	sg = asg + sglists;  	/* Fill padding... 
*/  	tail = skb_tail_pointer(trailer); +	if (tfclen) { +		memset(tail, 0, tfclen); +		tail += tfclen; +	}  	do {  		int i; -		for (i=0; i<clen-skb->len - 2; i++) +		for (i = 0; i < plen - 2; i++)  			tail[i] = i + 1;  	} while (0); -	tail[clen - skb->len - 2] = (clen - skb->len) - 2; -	tail[clen - skb->len - 1] = *skb_mac_header(skb); +	tail[plen - 2] = plen - 2; +	tail[plen - 1] = *skb_mac_header(skb);  	pskb_put(skb, trailer, clen - skb->len + alen);  	skb_push(skb, -skb_network_offset(skb)); @@ -199,19 +236,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)  	}  	esph->spi = x->id.spi; -	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); +	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);  	sg_init_table(sg, nfrags);  	skb_to_sgvec(skb, sg,  		     esph->enc_data + crypto_aead_ivsize(aead) - skb->data,  		     clen + alen); -	sg_init_one(asg, esph, sizeof(*esph)); + +	if ((x->props.flags & XFRM_STATE_ESN)) { +		sg_init_table(asg, 3); +		sg_set_buf(asg, &esph->spi, sizeof(__be32)); +		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); +		sg_set_buf(asg + 1, seqhi, seqhilen); +		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); +	} else +		sg_init_one(asg, esph, sizeof(*esph));  	aead_givcrypt_set_callback(req, 0, esp_output_done, skb);  	aead_givcrypt_set_crypt(req, sg, sg, clen, iv); -	aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); +	aead_givcrypt_set_assoc(req, asg, assoclen);  	aead_givcrypt_set_giv(req, esph->enc_data, -			      XFRM_SKB_CB(skb)->seq.output); +			      XFRM_SKB_CB(skb)->seq.output.low);  	ESP_SKB_CB(skb)->tmp = tmp;  	err = crypto_aead_givencrypt(req); @@ -229,10 +274,9 @@ error:  static int esp_input_done2(struct sk_buff *skb, int err)  { -	struct iphdr *iph; +	const struct iphdr *iph;  	struct xfrm_state *x = xfrm_input_state(skb); -	struct esp_data *esp = x->data; -	struct crypto_aead *aead = esp->aead; +	struct crypto_aead *aead = x->data;  	int alen = crypto_aead_authsize(aead);  	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);  	int elen = skb->len - hlen; @@ -297,7 +341,10 @@ static int esp_input_done2(struct sk_buff *skb, int err)  	pskb_trim(skb, skb->len - alen - padlen - 2);  	__skb_pull(skb, hlen); -	skb_set_transport_header(skb, -ihl); +	if (x->props.mode == XFRM_MODE_TUNNEL) +		skb_reset_transport_header(skb); +	else +		skb_set_transport_header(skb, -ihl);  	err = nexthdr[1]; @@ -324,12 +371,15 @@ static void esp_input_done(struct crypto_async_request *base, int err)  static int esp_input(struct xfrm_state *x, struct sk_buff *skb)  {  	struct ip_esp_hdr *esph; -	struct esp_data *esp = x->data; -	struct crypto_aead *aead = esp->aead; +	struct crypto_aead *aead = x->data;  	struct aead_request *req;  	struct sk_buff *trailer;  	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);  	int nfrags; +	int assoclen; +	int sglists; +	int seqhilen; +	__be32 *seqhi;  	void *tmp;  	u8 *iv;  	struct scatterlist *sg; @@ -346,16 +396,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)  		goto out;  	nfrags = err; +	assoclen = sizeof(*esph); +	sglists = 1; +	seqhilen = 0; + +	if (x->props.flags & XFRM_STATE_ESN) { +		sglists += 2; +		seqhilen += sizeof(__be32); +		assoclen += seqhilen; +	} +  	err = -ENOMEM; -	tmp = esp_alloc_tmp(aead, nfrags + 1); +	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);  	if (!tmp)  		goto out;  	ESP_SKB_CB(skb)->tmp = tmp; -	iv = esp_tmp_iv(aead, tmp); +	seqhi = esp_tmp_seqhi(tmp); +	iv = esp_tmp_iv(aead, tmp, seqhilen);  	req = esp_tmp_req(aead, iv);  	asg = 
esp_req_sg(aead, req); -	sg = asg + 1; +	sg = asg + sglists;  	skb->ip_summed = CHECKSUM_NONE; @@ -366,11 +427,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)  	sg_init_table(sg, nfrags);  	skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); -	sg_init_one(asg, esph, sizeof(*esph)); + +	if ((x->props.flags & XFRM_STATE_ESN)) { +		sg_init_table(asg, 3); +		sg_set_buf(asg, &esph->spi, sizeof(__be32)); +		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi; +		sg_set_buf(asg + 1, seqhi, seqhilen); +		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); +	} else +		sg_init_one(asg, esph, sizeof(*esph));  	aead_request_set_callback(req, 0, esp_input_done, skb);  	aead_request_set_crypt(req, sg, sg, elen, iv); -	aead_request_set_assoc(req, asg, sizeof(*esph)); +	aead_request_set_assoc(req, asg, assoclen);  	err = crypto_aead_decrypt(req);  	if (err == -EINPROGRESS) @@ -384,66 +453,69 @@ out:  static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)  { -	struct esp_data *esp = x->data; -	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); -	u32 align = max_t(u32, blksize, esp->padlen); -	u32 rem; - -	mtu -= x->props.header_len + crypto_aead_authsize(esp->aead); -	rem = mtu & (align - 1); -	mtu &= ~(align - 1); +	struct crypto_aead *aead = x->data; +	u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); +	unsigned int net_adj;  	switch (x->props.mode) { -	case XFRM_MODE_TUNNEL: -		break; -	default:  	case XFRM_MODE_TRANSPORT: -		/* The worst case */ -		mtu -= blksize - 4; -		mtu += min_t(u32, blksize - 4, rem); -		break;  	case XFRM_MODE_BEET: -		/* The worst case. */ -		mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem); +		net_adj = sizeof(struct iphdr); +		break; +	case XFRM_MODE_TUNNEL: +		net_adj = 0;  		break; +	default: +		BUG();  	} -	return mtu - 2; +	return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - +		 net_adj) & ~(blksize - 1)) + net_adj - 2;  } -static void esp4_err(struct sk_buff *skb, u32 info) +static int esp4_err(struct sk_buff *skb, u32 info)  {  	struct net *net = dev_net(skb->dev); -	struct iphdr *iph = (struct iphdr *)skb->data; +	const struct iphdr *iph = (const struct iphdr *)skb->data;  	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));  	struct xfrm_state *x; -	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || -	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) -		return; +	switch (icmp_hdr(skb)->type) { +	case ICMP_DEST_UNREACH: +		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) +			return 0; +	case ICMP_REDIRECT: +		break; +	default: +		return 0; +	} -	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); +	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, +			      esph->spi, IPPROTO_ESP, AF_INET);  	if (!x) -		return; -	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", -		 ntohl(esph->spi), ntohl(iph->daddr)); +		return 0; + +	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) +		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0); +	else +		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);  	xfrm_state_put(x); + +	return 0;  }  static void esp_destroy(struct xfrm_state *x)  { -	struct esp_data *esp = x->data; +	struct crypto_aead *aead = x->data; -	if (!esp) +	if (!aead)  		return; -	crypto_free_aead(esp->aead); -	kfree(esp); +	crypto_free_aead(aead);  }  static int esp_init_aead(struct xfrm_state *x)  { -	struct esp_data *esp = x->data;  	struct crypto_aead *aead;  	int err; @@ -452,7 +524,7 @@ static int esp_init_aead(struct xfrm_state 
*x)  	if (IS_ERR(aead))  		goto error; -	esp->aead = aead; +	x->data = aead;  	err = crypto_aead_setkey(aead, x->aead->alg_key,  				 (x->aead->alg_key_len + 7) / 8); @@ -469,7 +541,6 @@ error:  static int esp_init_authenc(struct xfrm_state *x)  { -	struct esp_data *esp = x->data;  	struct crypto_aead *aead;  	struct crypto_authenc_key_param *param;  	struct rtattr *rta; @@ -484,17 +555,27 @@ static int esp_init_authenc(struct xfrm_state *x)  		goto error;  	err = -ENAMETOOLONG; -	if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", -		     x->aalg ? x->aalg->alg_name : "digest_null", -		     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) -		goto error; + +	if ((x->props.flags & XFRM_STATE_ESN)) { +		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, +			     "authencesn(%s,%s)", +			     x->aalg ? x->aalg->alg_name : "digest_null", +			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) +			goto error; +	} else { +		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, +			     "authenc(%s,%s)", +			     x->aalg ? x->aalg->alg_name : "digest_null", +			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) +			goto error; +	}  	aead = crypto_alloc_aead(authenc_name, 0, 0);  	err = PTR_ERR(aead);  	if (IS_ERR(aead))  		goto error; -	esp->aead = aead; +	x->data = aead;  	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +  		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); @@ -549,16 +630,11 @@ error:  static int esp_init_state(struct xfrm_state *x)  { -	struct esp_data *esp;  	struct crypto_aead *aead;  	u32 align;  	int err; -	esp = kzalloc(sizeof(*esp), GFP_KERNEL); -	if (esp == NULL) -		return -ENOMEM; - -	x->data = esp; +	x->data = NULL;  	if (x->aead)  		err = esp_init_aead(x); @@ -568,9 +644,7 @@ static int esp_init_state(struct xfrm_state *x)  	if (err)  		goto error; -	aead = esp->aead; - -	esp->padlen = 0; +	aead = x->data;  	x->props.header_len = sizeof(struct ip_esp_hdr) +  			      crypto_aead_ivsize(aead); @@ -594,14 +668,17 @@ static int esp_init_state(struct xfrm_state *x)  	}  	align = ALIGN(crypto_aead_blocksize(aead), 4); -	if (esp->padlen) -		align = max_t(u32, align, esp->padlen); -	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); +	x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);  error:  	return err;  } +static int esp4_rcv_cb(struct sk_buff *skb, int err) +{ +	return 0; +} +  static const struct xfrm_type esp_type =  {  	.description	= "ESP4", @@ -615,21 +692,22 @@ static const struct xfrm_type esp_type =  	.output		= esp_output  }; -static const struct net_protocol esp4_protocol = { +static struct xfrm4_protocol esp4_protocol = {  	.handler	=	xfrm4_rcv, +	.input_handler	=	xfrm_input, +	.cb_handler	=	esp4_rcv_cb,  	.err_handler	=	esp4_err, -	.no_policy	=	1, -	.netns_ok	=	1, +	.priority	=	0,  };  static int __init esp4_init(void)  {  	if (xfrm_register_type(&esp_type, AF_INET) < 0) { -		printk(KERN_INFO "ip esp init: can't add xfrm type\n"); +		pr_info("%s: can't add xfrm type\n", __func__);  		return -EAGAIN;  	} -	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { -		printk(KERN_INFO "ip esp init: can't add protocol\n"); +	if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) { +		pr_info("%s: can't add protocol\n", __func__);  		xfrm_unregister_type(&esp_type, AF_INET);  		return -EAGAIN;  	} @@ -638,10 +716,10 @@ static int __init esp4_init(void)  static void __exit esp4_fini(void)  { -	if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) -		printk(KERN_INFO "ip esp close: can't remove protocol\n"); +	if 
(xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0) +		pr_info("%s: can't remove protocol\n", __func__);  	if (xfrm_unregister_type(&esp_type, AF_INET) < 0) -		printk(KERN_INFO "ip esp close: can't remove xfrm type\n"); +		pr_info("%s: can't remove xfrm type\n", __func__);  }  module_init(esp4_init); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d3a1112b9d9..255aa9946fe 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -15,7 +15,6 @@  #include <linux/module.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/bitops.h>  #include <linux/capability.h>  #include <linux/types.h> @@ -32,6 +31,7 @@  #include <linux/if_addr.h>  #include <linux/if_arp.h>  #include <linux/skbuff.h> +#include <linux/cache.h>  #include <linux/init.h>  #include <linux/list.h>  #include <linux/slab.h> @@ -44,6 +44,7 @@  #include <net/arp.h>  #include <net/ip_fib.h>  #include <net/rtnetlink.h> +#include <net/xfrm.h>  #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -51,11 +52,11 @@ static int __net_init fib4_rules_init(struct net *net)  {  	struct fib_table *local_table, *main_table; -	local_table = fib_hash_table(RT_TABLE_LOCAL); +	local_table = fib_trie_table(RT_TABLE_LOCAL);  	if (local_table == NULL)  		return -ENOMEM; -	main_table  = fib_hash_table(RT_TABLE_MAIN); +	main_table  = fib_trie_table(RT_TABLE_MAIN);  	if (main_table == NULL)  		goto fail; @@ -82,9 +83,27 @@ struct fib_table *fib_new_table(struct net *net, u32 id)  	if (tb)  		return tb; -	tb = fib_hash_table(id); +	tb = fib_trie_table(id);  	if (!tb)  		return NULL; + +	switch (id) { +	case RT_TABLE_LOCAL: +		net->ipv4.fib_local = tb; +		break; + +	case RT_TABLE_MAIN: +		net->ipv4.fib_main = tb; +		break; + +	case RT_TABLE_DEFAULT: +		net->ipv4.fib_default = tb; +		break; + +	default: +		break; +	} +  	h = id & (FIB_TABLE_HASHSZ - 1);  	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);  	return tb; @@ -93,7 +112,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)  struct fib_table *fib_get_table(struct net *net, u32 id)  {  	struct fib_table *tb; -	struct hlist_node *node;  	struct hlist_head *head;  	unsigned int h; @@ -103,7 +121,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id)  	rcu_read_lock();  	head = &net->ipv4.fib_table_hash[h]; -	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { +	hlist_for_each_entry_rcu(tb, head, tb_hlist) {  		if (tb->tb_id == id) {  			rcu_read_unlock();  			return tb; @@ -114,84 +132,34 @@ struct fib_table *fib_get_table(struct net *net, u32 id)  }  #endif /* CONFIG_IP_MULTIPLE_TABLES */ -void fib_select_default(struct net *net, -			const struct flowi *flp, struct fib_result *res) -{ -	struct fib_table *tb; -	int table = RT_TABLE_MAIN; -#ifdef CONFIG_IP_MULTIPLE_TABLES -	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) -		return; -	table = res->r->table; -#endif -	tb = fib_get_table(net, table); -	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) -		fib_table_select_default(tb, flp, res); -} -  static void fib_flush(struct net *net)  {  	int flushed = 0;  	struct fib_table *tb; -	struct hlist_node *node;  	struct hlist_head *head;  	unsigned int h;  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {  		head = &net->ipv4.fib_table_hash[h]; -		hlist_for_each_entry(tb, node, head, tb_hlist) +		hlist_for_each_entry(tb, head, tb_hlist)  			flushed += fib_table_flush(tb);  	}  	if (flushed) -		rt_cache_flush(net, -1); -} - -/** - * __ip_dev_find - find the first device with a given source address. 
- * @net: the net namespace - * @addr: the source address - * @devref: if true, take a reference on the found device - * - * If a caller uses devref=false, it should be protected by RCU, or RTNL - */ -struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) -{ -	struct flowi fl = { -		.fl4_dst = addr, -		.flags = FLOWI_FLAG_MATCH_ANY_IIF -	}; -	struct fib_result res = { 0 }; -	struct net_device *dev = NULL; - -	rcu_read_lock(); -	if (fib_lookup(net, &fl, &res)) { -		rcu_read_unlock(); -		return NULL; -	} -	if (res.type != RTN_LOCAL) -		goto out; -	dev = FIB_RES_DEV(res); - -	if (dev && devref) -		dev_hold(dev); -out: -	rcu_read_unlock(); -	return dev; +		rt_cache_flush(net);  } -EXPORT_SYMBOL(__ip_dev_find);  /*   * Find address type as if only "dev" was present in the system. If   * on_dev is NULL then all interfaces are taken into consideration.   */ -static inline unsigned __inet_dev_addr_type(struct net *net, -					    const struct net_device *dev, -					    __be32 addr) +static inline unsigned int __inet_dev_addr_type(struct net *net, +						const struct net_device *dev, +						__be32 addr)  { -	struct flowi		fl = { .fl4_dst = addr }; +	struct flowi4		fl4 = { .daddr = addr };  	struct fib_result	res; -	unsigned ret = RTN_BROADCAST; +	unsigned int ret = RTN_BROADCAST;  	struct fib_table *local_table;  	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) @@ -199,15 +167,11 @@ static inline unsigned __inet_dev_addr_type(struct net *net,  	if (ipv4_is_multicast(addr))  		return RTN_MULTICAST; -#ifdef CONFIG_IP_MULTIPLE_TABLES -	res.r = NULL; -#endif -  	local_table = fib_get_table(net, RT_TABLE_LOCAL);  	if (local_table) {  		ret = RTN_UNICAST;  		rcu_read_lock(); -		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { +		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {  			if (!dev || dev == res.fi->fib_dev)  				ret = res.type;  		} @@ -229,6 +193,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,  }  EXPORT_SYMBOL(inet_dev_addr_type); +__be32 fib_compute_spec_dst(struct sk_buff *skb) +{ +	struct net_device *dev = skb->dev; +	struct in_device *in_dev; +	struct fib_result res; +	struct rtable *rt; +	struct flowi4 fl4; +	struct net *net; +	int scope; + +	rt = skb_rtable(skb); +	if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == +	    RTCF_LOCAL) +		return ip_hdr(skb)->daddr; + +	in_dev = __in_dev_get_rcu(dev); +	BUG_ON(!in_dev); + +	net = dev_net(dev); + +	scope = RT_SCOPE_UNIVERSE; +	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { +		fl4.flowi4_oif = 0; +		fl4.flowi4_iif = LOOPBACK_IFINDEX; +		fl4.daddr = ip_hdr(skb)->saddr; +		fl4.saddr = 0; +		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); +		fl4.flowi4_scope = scope; +		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; +		if (!fib_lookup(net, &fl4, &res)) +			return FIB_RES_PREFSRC(net, res); +	} else { +		scope = RT_SCOPE_LINK; +	} + +	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); +} +  /* Given (packet source, input interface) and optional (dst, oif, tos):   * - (main) check, that source is valid i.e. not broadcast or our local   *   address. @@ -237,45 +239,35 @@ EXPORT_SYMBOL(inet_dev_addr_type);   * - check, that packet arrived from expected physical interface.   
* called with rcu_read_lock()   */ -int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, -			struct net_device *dev, __be32 *spec_dst, -			u32 *itag, u32 mark) +static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, +				 u8 tos, int oif, struct net_device *dev, +				 int rpf, struct in_device *idev, u32 *itag)  { -	struct in_device *in_dev; -	struct flowi fl = { -		.fl4_dst = src, -		.fl4_src = dst, -		.fl4_tos = tos, -		.mark = mark, -		.iif = oif -	}; +	int ret, no_addr, accept_local;  	struct fib_result res; -	int no_addr, rpf, accept_local; -	bool dev_match; -	int ret; +	struct flowi4 fl4;  	struct net *net; +	bool dev_match; -	no_addr = rpf = accept_local = 0; -	in_dev = __in_dev_get_rcu(dev); -	if (in_dev) { -		no_addr = in_dev->ifa_list == NULL; -		rpf = IN_DEV_RPFILTER(in_dev); -		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); -		if (mark && !IN_DEV_SRC_VMARK(in_dev)) -			fl.mark = 0; -	} +	fl4.flowi4_oif = 0; +	fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; +	fl4.daddr = src; +	fl4.saddr = dst; +	fl4.flowi4_tos = tos; +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE; -	if (in_dev == NULL) -		goto e_inval; +	no_addr = idev->ifa_list == NULL; + +	accept_local = IN_DEV_ACCEPT_LOCAL(idev); +	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;  	net = dev_net(dev); -	if (fib_lookup(net, &fl, &res)) +	if (fib_lookup(net, &fl4, &res))  		goto last_resort;  	if (res.type != RTN_UNICAST) {  		if (res.type != RTN_LOCAL || !accept_local)  			goto e_inval;  	} -	*spec_dst = FIB_RES_PREFSRC(res);  	fib_combine_itag(itag, &res);  	dev_match = false; @@ -300,21 +292,18 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,  		goto last_resort;  	if (rpf == 1)  		goto e_rpf; -	fl.oif = dev->ifindex; +	fl4.flowi4_oif = dev->ifindex;  	ret = 0; -	if (fib_lookup(net, &fl, &res) == 0) { -		if (res.type == RTN_UNICAST) { -			*spec_dst = FIB_RES_PREFSRC(res); +	if (fib_lookup(net, &fl4, &res) == 0) { +		if (res.type == RTN_UNICAST)  			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; -		}  	}  	return ret;  last_resort:  	if (rpf)  		goto e_rpf; -	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);  	*itag = 0;  	return 0; @@ -324,6 +313,21 @@ e_rpf:  	return -EXDEV;  } +/* Ignore rp_filter for packets protected by IPsec. */ +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, +			u8 tos, int oif, struct net_device *dev, +			struct in_device *idev, u32 *itag) +{ +	int r = secpath_exists(skb) ? 
0 : IN_DEV_RPFILTER(idev); + +	if (!r && !fib_num_tclassid_users(dev_net(dev)) && +	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { +		*itag = 0; +		return 0; +	} +	return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); +} +  static inline __be32 sk_extract_addr(struct sockaddr *addr)  {  	return ((struct sockaddr_in *) addr)->sin_addr.s_addr; @@ -482,7 +486,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)  	switch (cmd) {  	case SIOCADDRT:		/* Add a route */  	case SIOCDELRT:		/* Delete a route */ -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EPERM;  		if (copy_from_user(&rt, arg, sizeof(rt))) @@ -552,7 +556,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,  	cfg->fc_flags = rtm->rtm_flags;  	cfg->fc_nlflags = nlh->nlmsg_flags; -	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; +	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;  	cfg->fc_nlinfo.nlh = nlh;  	cfg->fc_nlinfo.nl_net = net; @@ -600,7 +604,7 @@ errout:  	return err;  } -static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct fib_config cfg; @@ -622,7 +626,7 @@ errout:  	return err;  } -static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	struct fib_config cfg; @@ -650,13 +654,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)  	unsigned int h, s_h;  	unsigned int e = 0, s_e;  	struct fib_table *tb; -	struct hlist_node *node;  	struct hlist_head *head;  	int dumped = 0;  	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&  	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) -		return ip_rt_dump(skb, cb); +		return skb->len;  	s_h = cb->args[0];  	s_e = cb->args[1]; @@ -664,7 +667,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)  	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {  		e = 0;  		head = &net->ipv4.fib_table_hash[h]; -		hlist_for_each_entry(tb, node, head, tb_hlist) { +		hlist_for_each_entry(tb, head, tb_hlist) {  			if (e < s_e)  				goto next;  			if (dumped) @@ -740,7 +743,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)  	if (ifa->ifa_flags & IFA_F_SECONDARY) {  		prim = inet_ifa_byprefix(in_dev, prefix, mask);  		if (prim == NULL) { -			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); +			pr_warn("%s: bug: prim == NULL\n", __func__);  			return;  		}  	} @@ -769,30 +772,44 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)  	}  } -static void fib_del_ifaddr(struct in_ifaddr *ifa) +/* Delete primary or secondary address. + * Optionally, on secondary address promotion consider the addresses + * from subnet iprim as deleted, even if they are in device list. + * In this case the secondary ifa can be in device list. 
+ */ +void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)  {  	struct in_device *in_dev = ifa->ifa_dev;  	struct net_device *dev = in_dev->dev;  	struct in_ifaddr *ifa1; -	struct in_ifaddr *prim = ifa; +	struct in_ifaddr *prim = ifa, *prim1 = NULL;  	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;  	__be32 any = ifa->ifa_address & ifa->ifa_mask;  #define LOCAL_OK	1  #define BRD_OK		2  #define BRD0_OK		4  #define BRD1_OK		8 -	unsigned ok = 0; +	unsigned int ok = 0; +	int subnet = 0;		/* Primary network */ +	int gone = 1;		/* Address is missing */ +	int same_prefsrc = 0;	/* Another primary with same IP */ -	if (!(ifa->ifa_flags & IFA_F_SECONDARY)) -		fib_magic(RTM_DELROUTE, -			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, -			  any, ifa->ifa_prefixlen, prim); -	else { +	if (ifa->ifa_flags & IFA_F_SECONDARY) {  		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);  		if (prim == NULL) { -			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); +			pr_warn("%s: bug: prim == NULL\n", __func__);  			return;  		} +		if (iprim && iprim != prim) { +			pr_warn("%s: bug: iprim != prim\n", __func__); +			return; +		} +	} else if (!ipv4_is_zeronet(any) && +		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { +		fib_magic(RTM_DELROUTE, +			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, +			  any, ifa->ifa_prefixlen, prim); +		subnet = 1;  	}  	/* Deletion is more complicated than add. @@ -802,6 +819,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)  	 */  	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { +		if (ifa1 == ifa) { +			/* promotion, keep the IP */ +			gone = 0; +			continue; +		} +		/* Ignore IFAs from our subnet */ +		if (iprim && ifa1->ifa_mask == iprim->ifa_mask && +		    inet_ifa_match(ifa1->ifa_address, iprim)) +			continue; + +		/* Ignore ifa1 if it uses different primary IP (prefsrc) */ +		if (ifa1->ifa_flags & IFA_F_SECONDARY) { +			/* Another address from our subnet? */ +			if (ifa1->ifa_mask == prim->ifa_mask && +			    inet_ifa_match(ifa1->ifa_address, prim)) +				prim1 = prim; +			else { +				/* We reached the secondaries, so +				 * same_prefsrc should be determined. 
+				 */ +				if (!same_prefsrc) +					continue; +				/* Search new prim1 if ifa1 is not +				 * using the current prim1 +				 */ +				if (!prim1 || +				    ifa1->ifa_mask != prim1->ifa_mask || +				    !inet_ifa_match(ifa1->ifa_address, prim1)) +					prim1 = inet_ifa_byprefix(in_dev, +							ifa1->ifa_address, +							ifa1->ifa_mask); +				if (!prim1) +					continue; +				if (prim1->ifa_local != prim->ifa_local) +					continue; +			} +		} else { +			if (prim->ifa_local != ifa1->ifa_local) +				continue; +			prim1 = ifa1; +			if (prim != prim1) +				same_prefsrc = 1; +		}  		if (ifa->ifa_local == ifa1->ifa_local)  			ok |= LOCAL_OK;  		if (ifa->ifa_broadcast == ifa1->ifa_broadcast) @@ -810,19 +870,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)  			ok |= BRD1_OK;  		if (any == ifa1->ifa_broadcast)  			ok |= BRD0_OK; +		/* primary has network specific broadcasts */ +		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { +			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; +			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; + +			if (!ipv4_is_zeronet(any1)) { +				if (ifa->ifa_broadcast == brd1 || +				    ifa->ifa_broadcast == any1) +					ok |= BRD_OK; +				if (brd == brd1 || brd == any1) +					ok |= BRD1_OK; +				if (any == brd1 || any == any1) +					ok |= BRD0_OK; +			} +		}  	}  	if (!(ok & BRD_OK))  		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); -	if (!(ok & BRD1_OK)) -		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); -	if (!(ok & BRD0_OK)) -		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); +	if (subnet && ifa->ifa_prefixlen < 31) { +		if (!(ok & BRD1_OK)) +			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); +		if (!(ok & BRD0_OK)) +			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); +	}  	if (!(ok & LOCAL_OK)) {  		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);  		/* Check, that this local address finally disappeared. */ -		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { +		if (gone && +		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {  			/* And the last, but not the least thing.  			 * We must flush stray FIB entries.  			 
* @@ -843,24 +921,19 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)  {  	struct fib_result       res; -	struct flowi            fl = { -		.mark = frn->fl_mark, -		.fl4_dst = frn->fl_addr, -		.fl4_tos = frn->fl_tos, -		.fl4_scope = frn->fl_scope, +	struct flowi4           fl4 = { +		.flowi4_mark = frn->fl_mark, +		.daddr = frn->fl_addr, +		.flowi4_tos = frn->fl_tos, +		.flowi4_scope = frn->fl_scope,  	}; -#ifdef CONFIG_IP_MULTIPLE_TABLES -	res.r = NULL; -#endif -  	frn->err = -ENOENT;  	if (tb) {  		local_bh_disable();  		frn->tb_id = tb->tb_id; -		rcu_read_lock(); -		frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF); +		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);  		if (!frn->err) {  			frn->prefixlen = res.prefixlen; @@ -868,7 +941,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)  			frn->type = res.type;  			frn->scope = res.scope;  		} -		rcu_read_unlock();  		local_bh_enable();  	}  } @@ -879,35 +951,38 @@ static void nl_fib_input(struct sk_buff *skb)  	struct fib_result_nl *frn;  	struct nlmsghdr *nlh;  	struct fib_table *tb; -	u32 pid; +	u32 portid;  	net = sock_net(skb->sk);  	nlh = nlmsg_hdr(skb); -	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || -	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) +	if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || +	    nlmsg_len(nlh) < sizeof(*frn))  		return; -	skb = skb_clone(skb, GFP_KERNEL); +	skb = netlink_skb_clone(skb, GFP_KERNEL);  	if (skb == NULL)  		return;  	nlh = nlmsg_hdr(skb); -	frn = (struct fib_result_nl *) NLMSG_DATA(nlh); +	frn = (struct fib_result_nl *) nlmsg_data(nlh);  	tb = fib_get_table(net, frn->tb_id_in);  	nl_fib_lookup(frn, tb); -	pid = NETLINK_CB(skb).pid;      /* pid of sending process */ -	NETLINK_CB(skb).pid = 0;        /* from kernel */ +	portid = NETLINK_CB(skb).portid;      /* netlink portid */ +	NETLINK_CB(skb).portid = 0;        /* from kernel */  	NETLINK_CB(skb).dst_group = 0;  /* unicast */ -	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); +	netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);  }  static int __net_init nl_fib_lookup_init(struct net *net)  {  	struct sock *sk; -	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, -				   nl_fib_input, NULL, THIS_MODULE); +	struct netlink_kernel_cfg cfg = { +		.input	= nl_fib_input, +	}; + +	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);  	if (sk == NULL)  		return -EAFNOSUPPORT;  	net->ipv4.fibnl = sk; @@ -920,11 +995,11 @@ static void nl_fib_lookup_exit(struct net *net)  	net->ipv4.fibnl = NULL;  } -static void fib_disable_ip(struct net_device *dev, int force, int delay) +static void fib_disable_ip(struct net_device *dev, int force)  {  	if (fib_sync_down_dev(dev, force))  		fib_flush(dev_net(dev)); -	rt_cache_flush(dev_net(dev), delay); +	rt_cache_flush(dev_net(dev));  	arp_ifdown(dev);  } @@ -932,6 +1007,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,  {  	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;  	struct net_device *dev = ifa->ifa_dev->dev; +	struct net *net = dev_net(dev);  	switch (event) {  	case NETDEV_UP: @@ -939,17 +1015,19 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,  #ifdef CONFIG_IP_ROUTE_MULTIPATH  		fib_sync_up(dev);  #endif -		rt_cache_flush(dev_net(dev), -1); +		atomic_inc(&net->ipv4.dev_addr_genid); +		rt_cache_flush(dev_net(dev));  		break;  	case NETDEV_DOWN: -		fib_del_ifaddr(ifa); +		fib_del_ifaddr(ifa, NULL); 
+		atomic_inc(&net->ipv4.dev_addr_genid);  		if (ifa->ifa_dev->ifa_list == NULL) {  			/* Last address was deleted from this interface.  			 * Disable IP.  			 */ -			fib_disable_ip(dev, 1, 0); +			fib_disable_ip(dev, 1);  		} else { -			rt_cache_flush(dev_net(dev), -1); +			rt_cache_flush(dev_net(dev));  		}  		break;  	} @@ -958,14 +1036,17 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,  static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)  { -	struct net_device *dev = ptr; -	struct in_device *in_dev = __in_dev_get_rtnl(dev); +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct in_device *in_dev; +	struct net *net = dev_net(dev);  	if (event == NETDEV_UNREGISTER) { -		fib_disable_ip(dev, 2, -1); +		fib_disable_ip(dev, 2); +		rt_flush_dev(dev);  		return NOTIFY_DONE;  	} +	in_dev = __in_dev_get_rtnl(dev);  	if (!in_dev)  		return NOTIFY_DONE; @@ -977,17 +1058,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo  #ifdef CONFIG_IP_ROUTE_MULTIPATH  		fib_sync_up(dev);  #endif -		rt_cache_flush(dev_net(dev), -1); +		atomic_inc(&net->ipv4.dev_addr_genid); +		rt_cache_flush(net);  		break;  	case NETDEV_DOWN: -		fib_disable_ip(dev, 0, 0); +		fib_disable_ip(dev, 0);  		break;  	case NETDEV_CHANGEMTU:  	case NETDEV_CHANGE: -		rt_cache_flush(dev_net(dev), 0); -		break; -	case NETDEV_UNREGISTER_BATCH: -		rt_cache_flush_batch(); +		rt_cache_flush(net);  		break;  	}  	return NOTIFY_DONE; @@ -1031,18 +1110,20 @@ static void ip_fib_net_exit(struct net *net)  	fib4_rules_exit(net);  #endif +	rtnl_lock();  	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {  		struct fib_table *tb;  		struct hlist_head *head; -		struct hlist_node *node, *tmp; +		struct hlist_node *tmp;  		head = &net->ipv4.fib_table_hash[i]; -		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { -			hlist_del(node); +		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { +			hlist_del(&tb->tb_hlist);  			fib_table_flush(tb);  			fib_free_table(tb);  		}  	} +	rtnl_unlock();  	kfree(net->ipv4.fib_table_hash);  } @@ -1050,6 +1131,9 @@ static int __net_init fib_net_init(struct net *net)  {  	int error; +#ifdef CONFIG_IP_ROUTE_CLASSID +	net->ipv4.fib_num_tclassid_users = 0; +#endif  	error = ip_fib_net_init(net);  	if (error < 0)  		goto out; @@ -1083,13 +1167,13 @@ static struct pernet_operations fib_net_ops = {  void __init ip_fib_init(void)  { -	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); -	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); -	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); +	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); +	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); +	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);  	register_pernet_subsys(&fib_net_ops);  	register_netdevice_notifier(&fib_netdev_notifier);  	register_inetaddr_notifier(&fib_inetaddr_notifier); -	fib_hash_init(); +	fib_trie_init();  } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c deleted file mode 100644 index b3acb0417b2..00000000000 --- a/net/ipv4/fib_hash.c +++ /dev/null @@ -1,1133 +0,0 @@ -/* - * INET		An implementation of the TCP/IP protocol suite for the LINUX - *		operating system.  INET is implemented using the  BSD Socket - *		interface as the means of communication with the user level. - * - *		IPv4 FIB: lookup engine and maintenance routines. 
- * - * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - *		This program is free software; you can redistribute it and/or - *		modify it under the terms of the GNU General Public License - *		as published by the Free Software Foundation; either version - *		2 of the License, or (at your option) any later version. - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/inetdevice.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/proc_fs.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/init.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/tcp.h> -#include <net/sock.h> -#include <net/ip_fib.h> - -#include "fib_lookup.h" - -static struct kmem_cache *fn_hash_kmem __read_mostly; -static struct kmem_cache *fn_alias_kmem __read_mostly; - -struct fib_node { -	struct hlist_node	fn_hash; -	struct list_head	fn_alias; -	__be32			fn_key; -	struct fib_alias        fn_embedded_alias; -}; - -#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) - -struct fn_zone { -	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/ -	struct hlist_head __rcu	*fz_hash;	/* Hash table pointer	*/ -	seqlock_t		fz_lock; -	u32			fz_hashmask;	/* (fz_divisor - 1)	*/ - -	u8			fz_order;	/* Zone order (0..32)	*/ -	u8			fz_revorder;	/* 32 - fz_order	*/ -	__be32			fz_mask;	/* inet_make_mask(order) */ -#define FZ_MASK(fz)		((fz)->fz_mask) - -	struct hlist_head	fz_embedded_hash[EMBEDDED_HASH_SIZE]; - -	int			fz_nent;	/* Number of entries	*/ -	int			fz_divisor;	/* Hash size (mask+1)	*/ -}; - -struct fn_hash { -	struct fn_zone		*fn_zones[33]; -	struct fn_zone __rcu	*fn_zone_list; -}; - -static inline u32 fn_hash(__be32 key, struct fn_zone *fz) -{ -	u32 h = ntohl(key) >> fz->fz_revorder; -	h ^= (h>>20); -	h ^= (h>>10); -	h ^= (h>>5); -	h &= fz->fz_hashmask; -	return h; -} - -static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) -{ -	return dst & FZ_MASK(fz); -} - -static unsigned int fib_hash_genid; - -#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) - -static struct hlist_head *fz_hash_alloc(int divisor) -{ -	unsigned long size = divisor * sizeof(struct hlist_head); - -	if (size <= PAGE_SIZE) -		return kzalloc(size, GFP_KERNEL); - -	return (struct hlist_head *) -		__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); -} - -/* The fib hash lock must be held when this is called. 
*/ -static inline void fn_rebuild_zone(struct fn_zone *fz, -				   struct hlist_head *old_ht, -				   int old_divisor) -{ -	int i; - -	for (i = 0; i < old_divisor; i++) { -		struct hlist_node *node, *n; -		struct fib_node *f; - -		hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { -			struct hlist_head *new_head; - -			hlist_del_rcu(&f->fn_hash); - -			new_head = rcu_dereference_protected(fz->fz_hash, 1) + -				   fn_hash(f->fn_key, fz); -			hlist_add_head_rcu(&f->fn_hash, new_head); -		} -	} -} - -static void fz_hash_free(struct hlist_head *hash, int divisor) -{ -	unsigned long size = divisor * sizeof(struct hlist_head); - -	if (size <= PAGE_SIZE) -		kfree(hash); -	else -		free_pages((unsigned long)hash, get_order(size)); -} - -static void fn_rehash_zone(struct fn_zone *fz) -{ -	struct hlist_head *ht, *old_ht; -	int old_divisor, new_divisor; -	u32 new_hashmask; - -	new_divisor = old_divisor = fz->fz_divisor; - -	switch (old_divisor) { -	case EMBEDDED_HASH_SIZE: -		new_divisor *= EMBEDDED_HASH_SIZE; -		break; -	case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: -		new_divisor *= (EMBEDDED_HASH_SIZE/2); -		break; -	default: -		if ((old_divisor << 1) > FZ_MAX_DIVISOR) { -			printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); -			return; -		} -		new_divisor = (old_divisor << 1); -		break; -	} - -	new_hashmask = (new_divisor - 1); - -#if RT_CACHE_DEBUG >= 2 -	printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", -	       fz->fz_order, old_divisor); -#endif - -	ht = fz_hash_alloc(new_divisor); - -	if (ht)	{ -		struct fn_zone nfz; - -		memcpy(&nfz, fz, sizeof(nfz)); - -		write_seqlock_bh(&fz->fz_lock); -		old_ht = rcu_dereference_protected(fz->fz_hash, 1); -		RCU_INIT_POINTER(nfz.fz_hash, ht); -		nfz.fz_hashmask = new_hashmask; -		nfz.fz_divisor = new_divisor; -		fn_rebuild_zone(&nfz, old_ht, old_divisor); -		fib_hash_genid++; -		rcu_assign_pointer(fz->fz_hash, ht); -		fz->fz_hashmask = new_hashmask; -		fz->fz_divisor = new_divisor; -		write_sequnlock_bh(&fz->fz_lock); - -		if (old_ht != fz->fz_embedded_hash) { -			synchronize_rcu(); -			fz_hash_free(old_ht, old_divisor); -		} -	} -} - -static void fn_free_node_rcu(struct rcu_head *head) -{ -	struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); - -	kmem_cache_free(fn_hash_kmem, f); -} - -static inline void fn_free_node(struct fib_node *f) -{ -	call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); -} - -static void fn_free_alias_rcu(struct rcu_head *head) -{ -	struct fib_alias *fa = container_of(head, struct fib_alias, rcu); - -	kmem_cache_free(fn_alias_kmem, fa); -} - -static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) -{ -	fib_release_info(fa->fa_info); -	if (fa == &f->fn_embedded_alias) -		fa->fa_info = NULL; -	else -		call_rcu(&fa->rcu, fn_free_alias_rcu); -} - -static struct fn_zone * -fn_new_zone(struct fn_hash *table, int z) -{ -	int i; -	struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL); -	if (!fz) -		return NULL; - -	seqlock_init(&fz->fz_lock); -	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; -	fz->fz_hashmask = fz->fz_divisor - 1; -	RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); -	fz->fz_order = z; -	fz->fz_revorder = 32 - z; -	fz->fz_mask = inet_make_mask(z); - -	/* Find the first not empty zone with more specific mask */ -	for (i = z + 1; i <= 32; i++) -		if (table->fn_zones[i]) -			break; -	if (i > 32) { -		/* No more specific masks, we are the first. 
*/ -		rcu_assign_pointer(fz->fz_next, -				   rtnl_dereference(table->fn_zone_list)); -		rcu_assign_pointer(table->fn_zone_list, fz); -	} else { -		rcu_assign_pointer(fz->fz_next, -				   rtnl_dereference(table->fn_zones[i]->fz_next)); -		rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); -	} -	table->fn_zones[z] = fz; -	fib_hash_genid++; -	return fz; -} - -int fib_table_lookup(struct fib_table *tb, -		     const struct flowi *flp, struct fib_result *res, -		     int fib_flags) -{ -	int err; -	struct fn_zone *fz; -	struct fn_hash *t = (struct fn_hash *)tb->tb_data; - -	rcu_read_lock(); -	for (fz = rcu_dereference(t->fn_zone_list); -	     fz != NULL; -	     fz = rcu_dereference(fz->fz_next)) { -		struct hlist_head *head; -		struct hlist_node *node; -		struct fib_node *f; -		__be32 k; -		unsigned int seq; - -		do { -			seq = read_seqbegin(&fz->fz_lock); -			k = fz_key(flp->fl4_dst, fz); - -			head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); -			hlist_for_each_entry_rcu(f, node, head, fn_hash) { -				if (f->fn_key != k) -					continue; - -				err = fib_semantic_match(&f->fn_alias, -						 flp, res, -						 fz->fz_order, fib_flags); -				if (err <= 0) -					goto out; -			} -		} while (read_seqretry(&fz->fz_lock, seq)); -	} -	err = 1; -out: -	rcu_read_unlock(); -	return err; -} - -void fib_table_select_default(struct fib_table *tb, -			      const struct flowi *flp, struct fib_result *res) -{ -	int order, last_idx; -	struct hlist_node *node; -	struct fib_node *f; -	struct fib_info *fi = NULL; -	struct fib_info *last_resort; -	struct fn_hash *t = (struct fn_hash *)tb->tb_data; -	struct fn_zone *fz = t->fn_zones[0]; -	struct hlist_head *head; - -	if (fz == NULL) -		return; - -	last_idx = -1; -	last_resort = NULL; -	order = -1; - -	rcu_read_lock(); -	head = rcu_dereference(fz->fz_hash); -	hlist_for_each_entry_rcu(f, node, head, fn_hash) { -		struct fib_alias *fa; - -		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { -			struct fib_info *next_fi = fa->fa_info; - -			if (fa->fa_scope != res->scope || -			    fa->fa_type != RTN_UNICAST) -				continue; - -			if (next_fi->fib_priority > res->fi->fib_priority) -				break; -			if (!next_fi->fib_nh[0].nh_gw || -			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) -				continue; - -			fib_alias_accessed(fa); - -			if (fi == NULL) { -				if (next_fi != res->fi) -					break; -			} else if (!fib_detect_death(fi, order, &last_resort, -						&last_idx, tb->tb_default)) { -				fib_result_assign(res, fi); -				tb->tb_default = order; -				goto out; -			} -			fi = next_fi; -			order++; -		} -	} - -	if (order <= 0 || fi == NULL) { -		tb->tb_default = -1; -		goto out; -	} - -	if (!fib_detect_death(fi, order, &last_resort, &last_idx, -				tb->tb_default)) { -		fib_result_assign(res, fi); -		tb->tb_default = order; -		goto out; -	} - -	if (last_idx >= 0) -		fib_result_assign(res, last_resort); -	tb->tb_default = last_idx; -out: -	rcu_read_unlock(); -} - -/* Insert node F to FZ. */ -static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) -{ -	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); - -	hlist_add_head_rcu(&f->fn_hash, head); -} - -/* Return the node in FZ matching KEY. 
*/ -static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) -{ -	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); -	struct hlist_node *node; -	struct fib_node *f; - -	hlist_for_each_entry_rcu(f, node, head, fn_hash) { -		if (f->fn_key == key) -			return f; -	} - -	return NULL; -} - - -static struct fib_alias *fib_fast_alloc(struct fib_node *f) -{ -	struct fib_alias *fa = &f->fn_embedded_alias; - -	if (fa->fa_info != NULL) -		fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); -	return fa; -} - -/* Caller must hold RTNL. */ -int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) -{ -	struct fn_hash *table = (struct fn_hash *) tb->tb_data; -	struct fib_node *new_f = NULL; -	struct fib_node *f; -	struct fib_alias *fa, *new_fa; -	struct fn_zone *fz; -	struct fib_info *fi; -	u8 tos = cfg->fc_tos; -	__be32 key; -	int err; - -	if (cfg->fc_dst_len > 32) -		return -EINVAL; - -	fz = table->fn_zones[cfg->fc_dst_len]; -	if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len))) -		return -ENOBUFS; - -	key = 0; -	if (cfg->fc_dst) { -		if (cfg->fc_dst & ~FZ_MASK(fz)) -			return -EINVAL; -		key = fz_key(cfg->fc_dst, fz); -	} - -	fi = fib_create_info(cfg); -	if (IS_ERR(fi)) -		return PTR_ERR(fi); - -	if (fz->fz_nent > (fz->fz_divisor<<1) && -	    fz->fz_divisor < FZ_MAX_DIVISOR && -	    (cfg->fc_dst_len == 32 || -	     (1 << cfg->fc_dst_len) > fz->fz_divisor)) -		fn_rehash_zone(fz); - -	f = fib_find_node(fz, key); - -	if (!f) -		fa = NULL; -	else -		fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); - -	/* Now fa, if non-NULL, points to the first fib alias -	 * with the same keys [prefix,tos,priority], if such key already -	 * exists or to the node before which we will insert new one. -	 * -	 * If fa is NULL, we will need to allocate a new one and -	 * insert to the head of f. -	 * -	 * If f is NULL, no fib node matched the destination key -	 * and we need to allocate a new one of those as well. -	 */ - -	if (fa && fa->fa_tos == tos && -	    fa->fa_info->fib_priority == fi->fib_priority) { -		struct fib_alias *fa_first, *fa_match; - -		err = -EEXIST; -		if (cfg->fc_nlflags & NLM_F_EXCL) -			goto out; - -		/* We have 2 goals: -		 * 1. Find exact match for type, scope, fib_info to avoid -		 * duplicate routes -		 * 2. 
Find next 'fa' (or head), NLM_F_APPEND inserts before it -		 */ -		fa_match = NULL; -		fa_first = fa; -		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); -		list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { -			if (fa->fa_tos != tos) -				break; -			if (fa->fa_info->fib_priority != fi->fib_priority) -				break; -			if (fa->fa_type == cfg->fc_type && -			    fa->fa_scope == cfg->fc_scope && -			    fa->fa_info == fi) { -				fa_match = fa; -				break; -			} -		} - -		if (cfg->fc_nlflags & NLM_F_REPLACE) { -			u8 state; - -			fa = fa_first; -			if (fa_match) { -				if (fa == fa_match) -					err = 0; -				goto out; -			} -			err = -ENOBUFS; -			new_fa = fib_fast_alloc(f); -			if (new_fa == NULL) -				goto out; - -			new_fa->fa_tos = fa->fa_tos; -			new_fa->fa_info = fi; -			new_fa->fa_type = cfg->fc_type; -			new_fa->fa_scope = cfg->fc_scope; -			state = fa->fa_state; -			new_fa->fa_state = state & ~FA_S_ACCESSED; -			fib_hash_genid++; -			list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - -			fn_free_alias(fa, f); -			if (state & FA_S_ACCESSED) -				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); -			rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, -				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); -			return 0; -		} - -		/* Error if we find a perfect match which -		 * uses the same scope, type, and nexthop -		 * information. -		 */ -		if (fa_match) -			goto out; - -		if (!(cfg->fc_nlflags & NLM_F_APPEND)) -			fa = fa_first; -	} - -	err = -ENOENT; -	if (!(cfg->fc_nlflags & NLM_F_CREATE)) -		goto out; - -	err = -ENOBUFS; - -	if (!f) { -		new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); -		if (new_f == NULL) -			goto out; - -		INIT_HLIST_NODE(&new_f->fn_hash); -		INIT_LIST_HEAD(&new_f->fn_alias); -		new_f->fn_key = key; -		f = new_f; -	} - -	new_fa = fib_fast_alloc(f); -	if (new_fa == NULL) -		goto out; - -	new_fa->fa_info = fi; -	new_fa->fa_tos = tos; -	new_fa->fa_type = cfg->fc_type; -	new_fa->fa_scope = cfg->fc_scope; -	new_fa->fa_state = 0; - -	/* -	 * Insert new entry to the list. -	 */ - -	if (new_f) -		fib_insert_node(fz, new_f); -	list_add_tail_rcu(&new_fa->fa_list, -		 (fa ? 
&fa->fa_list : &f->fn_alias)); -	fib_hash_genid++; - -	if (new_f) -		fz->fz_nent++; -	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - -	rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, -		  &cfg->fc_nlinfo, 0); -	return 0; - -out: -	if (new_f) -		kmem_cache_free(fn_hash_kmem, new_f); -	fib_release_info(fi); -	return err; -} - -int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) -{ -	struct fn_hash *table = (struct fn_hash *)tb->tb_data; -	struct fib_node *f; -	struct fib_alias *fa, *fa_to_delete; -	struct fn_zone *fz; -	__be32 key; - -	if (cfg->fc_dst_len > 32) -		return -EINVAL; - -	if ((fz  = table->fn_zones[cfg->fc_dst_len]) == NULL) -		return -ESRCH; - -	key = 0; -	if (cfg->fc_dst) { -		if (cfg->fc_dst & ~FZ_MASK(fz)) -			return -EINVAL; -		key = fz_key(cfg->fc_dst, fz); -	} - -	f = fib_find_node(fz, key); - -	if (!f) -		fa = NULL; -	else -		fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0); -	if (!fa) -		return -ESRCH; - -	fa_to_delete = NULL; -	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); -	list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { -		struct fib_info *fi = fa->fa_info; - -		if (fa->fa_tos != cfg->fc_tos) -			break; - -		if ((!cfg->fc_type || -		     fa->fa_type == cfg->fc_type) && -		    (cfg->fc_scope == RT_SCOPE_NOWHERE || -		     fa->fa_scope == cfg->fc_scope) && -		    (!cfg->fc_protocol || -		     fi->fib_protocol == cfg->fc_protocol) && -		    fib_nh_match(cfg, fi) == 0) { -			fa_to_delete = fa; -			break; -		} -	} - -	if (fa_to_delete) { -		int kill_fn; - -		fa = fa_to_delete; -		rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len, -			  tb->tb_id, &cfg->fc_nlinfo, 0); - -		kill_fn = 0; -		list_del_rcu(&fa->fa_list); -		if (list_empty(&f->fn_alias)) { -			hlist_del_rcu(&f->fn_hash); -			kill_fn = 1; -		} -		fib_hash_genid++; - -		if (fa->fa_state & FA_S_ACCESSED) -			rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); -		fn_free_alias(fa, f); -		if (kill_fn) { -			fn_free_node(f); -			fz->fz_nent--; -		} - -		return 0; -	} -	return -ESRCH; -} - -static int fn_flush_list(struct fn_zone *fz, int idx) -{ -	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; -	struct hlist_node *node, *n; -	struct fib_node *f; -	int found = 0; - -	hlist_for_each_entry_safe(f, node, n, head, fn_hash) { -		struct fib_alias *fa, *fa_node; -		int kill_f; - -		kill_f = 0; -		list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { -			struct fib_info *fi = fa->fa_info; - -			if (fi && (fi->fib_flags&RTNH_F_DEAD)) { -				list_del_rcu(&fa->fa_list); -				if (list_empty(&f->fn_alias)) { -					hlist_del_rcu(&f->fn_hash); -					kill_f = 1; -				} -				fib_hash_genid++; - -				fn_free_alias(fa, f); -				found++; -			} -		} -		if (kill_f) { -			fn_free_node(f); -			fz->fz_nent--; -		} -	} -	return found; -} - -/* caller must hold RTNL. 
*/ -int fib_table_flush(struct fib_table *tb) -{ -	struct fn_hash *table = (struct fn_hash *) tb->tb_data; -	struct fn_zone *fz; -	int found = 0; - -	for (fz = rtnl_dereference(table->fn_zone_list); -	     fz != NULL; -	     fz = rtnl_dereference(fz->fz_next)) { -		int i; - -		for (i = fz->fz_divisor - 1; i >= 0; i--) -			found += fn_flush_list(fz, i); -	} -	return found; -} - -void fib_free_table(struct fib_table *tb) -{ -	struct fn_hash *table = (struct fn_hash *) tb->tb_data; -	struct fn_zone *fz, *next; - -	next = table->fn_zone_list; -	while (next != NULL) { -		fz = next; -		next = fz->fz_next; - -		if (fz->fz_hash != fz->fz_embedded_hash) -			fz_hash_free(fz->fz_hash, fz->fz_divisor); - -		kfree(fz); -	} - -	kfree(tb); -} - -static inline int -fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, -		     struct fib_table *tb, -		     struct fn_zone *fz, -		     struct hlist_head *head) -{ -	struct hlist_node *node; -	struct fib_node *f; -	int i, s_i; - -	s_i = cb->args[4]; -	i = 0; -	hlist_for_each_entry_rcu(f, node, head, fn_hash) { -		struct fib_alias *fa; - -		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { -			if (i < s_i) -				goto next; - -			if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, -					  cb->nlh->nlmsg_seq, -					  RTM_NEWROUTE, -					  tb->tb_id, -					  fa->fa_type, -					  fa->fa_scope, -					  f->fn_key, -					  fz->fz_order, -					  fa->fa_tos, -					  fa->fa_info, -					  NLM_F_MULTI) < 0) { -				cb->args[4] = i; -				return -1; -			} -next: -			i++; -		} -	} -	cb->args[4] = i; -	return skb->len; -} - -static inline int -fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, -		   struct fib_table *tb, -		   struct fn_zone *fz) -{ -	int h, s_h; -	struct hlist_head *head = rcu_dereference(fz->fz_hash); - -	if (head == NULL) -		return skb->len; -	s_h = cb->args[3]; -	for (h = s_h; h < fz->fz_divisor; h++) { -		if (hlist_empty(head + h)) -			continue; -		if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { -			cb->args[3] = h; -			return -1; -		} -		memset(&cb->args[4], 0, -		       sizeof(cb->args) - 4*sizeof(cb->args[0])); -	} -	cb->args[3] = h; -	return skb->len; -} - -int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, -		   struct netlink_callback *cb) -{ -	int m = 0, s_m; -	struct fn_zone *fz; -	struct fn_hash *table = (struct fn_hash *)tb->tb_data; - -	s_m = cb->args[2]; -	rcu_read_lock(); -	for (fz = rcu_dereference(table->fn_zone_list); -	     fz != NULL; -	     fz = rcu_dereference(fz->fz_next), m++) { -		if (m < s_m) -			continue; -		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { -			cb->args[2] = m; -			rcu_read_unlock(); -			return -1; -		} -		memset(&cb->args[3], 0, -		       sizeof(cb->args) - 3*sizeof(cb->args[0])); -	} -	rcu_read_unlock(); -	cb->args[2] = m; -	return skb->len; -} - -void __init fib_hash_init(void) -{ -	fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), -					 0, SLAB_PANIC, NULL); - -	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), -					  0, SLAB_PANIC, NULL); - -} - -struct fib_table *fib_hash_table(u32 id) -{ -	struct fib_table *tb; - -	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), -		     GFP_KERNEL); -	if (tb == NULL) -		return NULL; - -	tb->tb_id = id; -	tb->tb_default = -1; - -	memset(tb->tb_data, 0, sizeof(struct fn_hash)); -	return tb; -} - -/* ------------------------------------------------------------------------ */ -#ifdef CONFIG_PROC_FS - -struct fib_iter_state { -	struct seq_net_private p; -	struct 
fn_zone	*zone; -	int		bucket; -	struct hlist_head *hash_head; -	struct fib_node *fn; -	struct fib_alias *fa; -	loff_t pos; -	unsigned int genid; -	int valid; -}; - -static struct fib_alias *fib_get_first(struct seq_file *seq) -{ -	struct fib_iter_state *iter = seq->private; -	struct fib_table *main_table; -	struct fn_hash *table; - -	main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); -	table = (struct fn_hash *)main_table->tb_data; - -	iter->bucket    = 0; -	iter->hash_head = NULL; -	iter->fn        = NULL; -	iter->fa        = NULL; -	iter->pos	= 0; -	iter->genid	= fib_hash_genid; -	iter->valid	= 1; - -	for (iter->zone = rcu_dereference(table->fn_zone_list); -	     iter->zone != NULL; -	     iter->zone = rcu_dereference(iter->zone->fz_next)) { -		int maxslot; - -		if (!iter->zone->fz_nent) -			continue; - -		iter->hash_head = rcu_dereference(iter->zone->fz_hash); -		maxslot = iter->zone->fz_divisor; - -		for (iter->bucket = 0; iter->bucket < maxslot; -		     ++iter->bucket, ++iter->hash_head) { -			struct hlist_node *node; -			struct fib_node *fn; - -			hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { -				struct fib_alias *fa; - -				list_for_each_entry(fa, &fn->fn_alias, fa_list) { -					iter->fn = fn; -					iter->fa = fa; -					goto out; -				} -			} -		} -	} -out: -	return iter->fa; -} - -static struct fib_alias *fib_get_next(struct seq_file *seq) -{ -	struct fib_iter_state *iter = seq->private; -	struct fib_node *fn; -	struct fib_alias *fa; - -	/* Advance FA, if any. */ -	fn = iter->fn; -	fa = iter->fa; -	if (fa) { -		BUG_ON(!fn); -		list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { -			iter->fa = fa; -			goto out; -		} -	} - -	fa = iter->fa = NULL; - -	/* Advance FN. */ -	if (fn) { -		struct hlist_node *node = &fn->fn_hash; -		hlist_for_each_entry_continue(fn, node, fn_hash) { -			iter->fn = fn; - -			list_for_each_entry(fa, &fn->fn_alias, fa_list) { -				iter->fa = fa; -				goto out; -			} -		} -	} - -	fn = iter->fn = NULL; - -	/* Advance hash chain. */ -	if (!iter->zone) -		goto out; - -	for (;;) { -		struct hlist_node *node; -		int maxslot; - -		maxslot = iter->zone->fz_divisor; - -		while (++iter->bucket < maxslot) { -			iter->hash_head++; - -			hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { -				list_for_each_entry(fa, &fn->fn_alias, fa_list) { -					iter->fn = fn; -					iter->fa = fa; -					goto out; -				} -			} -		} - -		iter->zone = rcu_dereference(iter->zone->fz_next); - -		if (!iter->zone) -			goto out; - -		iter->bucket = 0; -		iter->hash_head = rcu_dereference(iter->zone->fz_hash); - -		hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { -			list_for_each_entry(fa, &fn->fn_alias, fa_list) { -				iter->fn = fn; -				iter->fa = fa; -				goto out; -			} -		} -	} -out: -	iter->pos++; -	return fa; -} - -static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) -{ -	struct fib_iter_state *iter = seq->private; -	struct fib_alias *fa; - -	if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { -		fa   = iter->fa; -		pos -= iter->pos; -	} else -		fa = fib_get_first(seq); - -	if (fa) -		while (pos && (fa = fib_get_next(seq))) -			--pos; -	return pos ? NULL : fa; -} - -static void *fib_seq_start(struct seq_file *seq, loff_t *pos) -	__acquires(RCU) -{ -	void *v = NULL; - -	rcu_read_lock(); -	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) -		v = *pos ? 
fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; -	return v; -} - -static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ -	++*pos; -	return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); -} - -static void fib_seq_stop(struct seq_file *seq, void *v) -	__releases(RCU) -{ -	rcu_read_unlock(); -} - -static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) -{ -	static const unsigned type2flags[RTN_MAX + 1] = { -		[7] = RTF_REJECT, -		[8] = RTF_REJECT, -	}; -	unsigned flags = type2flags[type]; - -	if (fi && fi->fib_nh->nh_gw) -		flags |= RTF_GATEWAY; -	if (mask == htonl(0xFFFFFFFF)) -		flags |= RTF_HOST; -	flags |= RTF_UP; -	return flags; -} - -/* - *	This outputs /proc/net/route. - * - *	It always works in backward compatibility mode. - *	The format of the file is not supposed to be changed. - */ -static int fib_seq_show(struct seq_file *seq, void *v) -{ -	struct fib_iter_state *iter; -	int len; -	__be32 prefix, mask; -	unsigned flags; -	struct fib_node *f; -	struct fib_alias *fa; -	struct fib_info *fi; - -	if (v == SEQ_START_TOKEN) { -		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " -			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" -			   "\tWindow\tIRTT"); -		goto out; -	} - -	iter	= seq->private; -	f	= iter->fn; -	fa	= iter->fa; -	fi	= fa->fa_info; -	prefix	= f->fn_key; -	mask	= FZ_MASK(iter->zone); -	flags	= fib_flag_trans(fa->fa_type, mask, fi); -	if (fi) -		seq_printf(seq, -			 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", -			 fi->fib_dev ? fi->fib_dev->name : "*", prefix, -			 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, -			 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), -			 fi->fib_window, -			 fi->fib_rtt >> 3, &len); -	else -		seq_printf(seq, -			 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", -			 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len); - -	seq_printf(seq, "%*s\n", 127 - len, ""); -out: -	return 0; -} - -static const struct seq_operations fib_seq_ops = { -	.start  = fib_seq_start, -	.next   = fib_seq_next, -	.stop   = fib_seq_stop, -	.show   = fib_seq_show, -}; - -static int fib_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &fib_seq_ops, -			    sizeof(struct fib_iter_state)); -} - -static const struct file_operations fib_seq_fops = { -	.owner		= THIS_MODULE, -	.open           = fib_seq_open, -	.read           = seq_read, -	.llseek         = seq_lseek, -	.release	= seq_release_net, -}; - -int __net_init fib_proc_init(struct net *net) -{ -	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) -		return -ENOMEM; -	return 0; -} - -void __net_exit fib_proc_exit(struct net *net) -{ -	proc_net_remove(net, "route"); -} -#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index c079cc0ec65..1e4f6600b31 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -10,7 +10,6 @@ struct fib_alias {  	struct fib_info		*fa_info;  	u8			fa_tos;  	u8			fa_type; -	u8			fa_scope;  	u8			fa_state;  	struct rcu_head		rcu;  }; @@ -25,24 +24,15 @@ static inline void fib_alias_accessed(struct fib_alias *fa)  }  /* Exported by fib_semantics.c */ -extern int fib_semantic_match(struct list_head *head, -			      const struct flowi *flp, -			      struct fib_result *res, int prefixlen, int fib_flags); -extern void fib_release_info(struct fib_info *); -extern struct fib_info *fib_create_info(struct fib_config *cfg); -extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); -extern int fib_dump_info(struct 
sk_buff *skb, u32 pid, u32 seq, int event, -			 u32 tb_id, u8 type, u8 scope, __be32 dst, -			 int dst_len, u8 tos, struct fib_info *fi, -			 unsigned int); -extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, -		      int dst_len, u32 tb_id, struct nl_info *info, -		      unsigned int nlm_flags); -extern struct fib_alias *fib_find_alias(struct list_head *fah, -					u8 tos, u32 prio); -extern int fib_detect_death(struct fib_info *fi, int order, -			    struct fib_info **last_resort, -			    int *last_idx, int dflt); +void fib_release_info(struct fib_info *); +struct fib_info *fib_create_info(struct fib_config *cfg); +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, +		  u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, +		  unsigned int); +void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, +	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); +struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);  static inline void fib_result_assign(struct fib_result *res,  				     struct fib_info *fi) @@ -51,4 +41,11 @@ static inline void fib_result_assign(struct fib_result *res,  	res->fi = fi;  } +struct fib_prop { +	int	error; +	u8	scope; +}; + +extern const struct fib_prop fib_props[RTN_MAX + 1]; +  #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7981a24f5c7..f2e15738534 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -26,6 +26,7 @@  #include <linux/init.h>  #include <linux/list.h>  #include <linux/rcupdate.h> +#include <linux/export.h>  #include <net/ip.h>  #include <net/route.h>  #include <net/tcp.h> @@ -41,19 +42,12 @@ struct fib4_rule {  	__be32			srcmask;  	__be32			dst;  	__be32			dstmask; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	u32			tclassid;  #endif  }; -#ifdef CONFIG_NET_CLS_ROUTE -u32 fib_rules_tclass(struct fib_result *res) -{ -	return res->r ? 
((struct fib4_rule *) res->r)->tclassid : 0; -} -#endif - -int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) +int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)  {  	struct fib_lookup_arg arg = {  		.result = res, @@ -61,11 +55,16 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)  	};  	int err; -	err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); -	res->r = arg.rule; - +	err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); +#ifdef CONFIG_IP_ROUTE_CLASSID +	if (arg.rule) +		res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; +	else +		res->tclassid = 0; +#endif  	return err;  } +EXPORT_SYMBOL_GPL(__fib_lookup);  static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,  			    int flags, struct fib_lookup_arg *arg) @@ -95,25 +94,53 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,  	if (!tbl)  		goto errout; -	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); +	err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);  	if (err > 0)  		err = -EAGAIN;  errout:  	return err;  } +static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ +	struct fib_result *result = (struct fib_result *) arg->result; +	struct net_device *dev = NULL; + +	if (result->fi) +		dev = result->fi->fib_dev; + +	/* do not accept result if the route does +	 * not meet the required prefix length +	 */ +	if (result->prefixlen <= rule->suppress_prefixlen) +		goto suppress_route; + +	/* do not accept result if the route uses a device +	 * belonging to a forbidden interface group +	 */ +	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) +		goto suppress_route; + +	return false; + +suppress_route: +	if (!(arg->flags & FIB_LOOKUP_NOREF)) +		fib_info_put(result->fi); +	return true; +}  static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)  {  	struct fib4_rule *r = (struct fib4_rule *) rule; -	__be32 daddr = fl->fl4_dst; -	__be32 saddr = fl->fl4_src; +	struct flowi4 *fl4 = &fl->u.ip4; +	__be32 daddr = fl4->daddr; +	__be32 saddr = fl4->saddr;  	if (((saddr ^ r->src) & r->srcmask) ||  	    ((daddr ^ r->dst) & r->dstmask))  		return 0; -	if (r->tos && (r->tos != fl->fl4_tos)) +	if (r->tos && (r->tos != fl4->flowi4_tos))  		return 0;  	return 1; @@ -165,9 +192,12 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,  	if (frh->dst_len)  		rule4->dst = nla_get_be32(tb[FRA_DST]); -#ifdef CONFIG_NET_CLS_ROUTE -	if (tb[FRA_FLOW]) +#ifdef CONFIG_IP_ROUTE_CLASSID +	if (tb[FRA_FLOW]) {  		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); +		if (rule4->tclassid) +			net->ipv4.fib_num_tclassid_users++; +	}  #endif  	rule4->src_len = frh->src_len; @@ -176,11 +206,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,  	rule4->dstmask = inet_make_mask(rule4->dst_len);  	rule4->tos = frh->tos; +	net->ipv4.fib_has_custom_rules = true;  	err = 0;  errout:  	return err;  } +static void fib4_rule_delete(struct fib_rule *rule) +{ +	struct net *net = rule->fr_net; +#ifdef CONFIG_IP_ROUTE_CLASSID +	struct fib4_rule *rule4 = (struct fib4_rule *) rule; + +	if (rule4->tclassid) +		net->ipv4.fib_num_tclassid_users--; +#endif +	net->ipv4.fib_has_custom_rules = true; +} +  static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,  			     struct nlattr **tb)  { @@ -195,7 +238,7 @@ static int 
fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,  	if (frh->tos && (rule4->tos != frh->tos))  		return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))  		return 0;  #endif @@ -218,15 +261,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,  	frh->src_len = rule4->src_len;  	frh->tos = rule4->tos; -	if (rule4->dst_len) -		NLA_PUT_BE32(skb, FRA_DST, rule4->dst); - -	if (rule4->src_len) -		NLA_PUT_BE32(skb, FRA_SRC, rule4->src); - -#ifdef CONFIG_NET_CLS_ROUTE -	if (rule4->tclassid) -		NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); +	if ((rule4->dst_len && +	     nla_put_be32(skb, FRA_DST, rule4->dst)) || +	    (rule4->src_len && +	     nla_put_be32(skb, FRA_SRC, rule4->src))) +		goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID +	if (rule4->tclassid && +	    nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) +		goto nla_put_failure;  #endif  	return 0; @@ -243,16 +286,18 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)  static void fib4_rule_flush_cache(struct fib_rules_ops *ops)  { -	rt_cache_flush(ops->fro_net, -1); +	rt_cache_flush(ops->fro_net);  } -static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { +static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {  	.family		= AF_INET,  	.rule_size	= sizeof(struct fib4_rule),  	.addr_size	= sizeof(u32),  	.action		= fib4_rule_action, +	.suppress	= fib4_rule_suppress,  	.match		= fib4_rule_match,  	.configure	= fib4_rule_configure, +	.delete		= fib4_rule_delete,  	.compare	= fib4_rule_compare,  	.fill		= fib4_rule_fill,  	.default_pref	= fib_default_rule_pref, @@ -292,6 +337,7 @@ int __net_init fib4_rules_init(struct net *net)  	if (err < 0)  		goto fail;  	net->ipv4.rules_ops = ops; +	net->ipv4.fib_has_custom_rules = false;  	return 0;  fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 12d3dc3df1b..b10cd43a472 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -14,7 +14,6 @@   */  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/bitops.h>  #include <linux/types.h>  #include <linux/kernel.h> @@ -49,7 +48,7 @@  static DEFINE_SPINLOCK(fib_info_lock);  static struct hlist_head *fib_info_hash;  static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_hash_size; +static unsigned int fib_info_hash_size;  static unsigned int fib_info_cnt;  #define DEVINDEX_HASHBITS 8 @@ -90,11 +89,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock);  #define endfor_nexthops(fi) } -static const struct -{ -	int	error; -	u8	scope; -} fib_props[RTN_MAX + 1] = { +const struct fib_prop fib_props[RTN_MAX + 1] = {  	[RTN_UNSPEC] = {  		.error	= 0,  		.scope	= RT_SCOPE_NOWHERE, @@ -145,29 +140,96 @@ static const struct  	},  }; +static void rt_fibinfo_free(struct rtable __rcu **rtp) +{ +	struct rtable *rt = rcu_dereference_protected(*rtp, 1); -/* Release a nexthop info record */ +	if (!rt) +		return; + +	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL); +	 * because we waited an RCU grace period before calling +	 * free_fib_info_rcu() +	 */ + +	dst_free(&rt->dst); +} +static void free_nh_exceptions(struct fib_nh *nh) +{ +	struct fnhe_hash_bucket *hash = nh->nh_exceptions; +	int i; + +	for (i = 0; i < FNHE_HASH_SIZE; i++) { +		struct fib_nh_exception *fnhe; + +		fnhe = rcu_dereference_protected(hash[i].chain, 1); +		while (fnhe) { +			struct fib_nh_exception *next; +			 +			next = rcu_dereference_protected(fnhe->fnhe_next, 
1); + +			rt_fibinfo_free(&fnhe->fnhe_rth_input); +			rt_fibinfo_free(&fnhe->fnhe_rth_output); + +			kfree(fnhe); + +			fnhe = next; +		} +	} +	kfree(hash); +} + +static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) +{ +	int cpu; + +	if (!rtp) +		return; + +	for_each_possible_cpu(cpu) { +		struct rtable *rt; + +		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); +		if (rt) +			dst_free(&rt->dst); +	} +	free_percpu(rtp); +} + +/* Release a nexthop info record */  static void free_fib_info_rcu(struct rcu_head *head)  {  	struct fib_info *fi = container_of(head, struct fib_info, rcu); +	change_nexthops(fi) { +		if (nexthop_nh->nh_dev) +			dev_put(nexthop_nh->nh_dev); +		if (nexthop_nh->nh_exceptions) +			free_nh_exceptions(nexthop_nh); +		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); +		rt_fibinfo_free(&nexthop_nh->nh_rth_input); +	} endfor_nexthops(fi); + +	release_net(fi->fib_net); +	if (fi->fib_metrics != (u32 *) dst_default_metrics) +		kfree(fi->fib_metrics);  	kfree(fi);  }  void free_fib_info(struct fib_info *fi)  {  	if (fi->fib_dead == 0) { -		pr_warning("Freeing alive fib_info %p\n", fi); +		pr_warn("Freeing alive fib_info %p\n", fi);  		return;  	} +	fib_info_cnt--; +#ifdef CONFIG_IP_ROUTE_CLASSID  	change_nexthops(fi) { -		if (nexthop_nh->nh_dev) -			dev_put(nexthop_nh->nh_dev); -		nexthop_nh->nh_dev = NULL; +		if (nexthop_nh->nh_tclassid) +			fi->fib_net->ipv4.fib_num_tclassid_users--;  	} endfor_nexthops(fi); -	fib_info_cnt--; -	release_net(fi->fib_net); +#endif  	call_rcu(&fi->rcu, free_fib_info_rcu);  } @@ -200,7 +262,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)  #ifdef CONFIG_IP_ROUTE_MULTIPATH  		    nh->nh_weight != onh->nh_weight ||  #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  		    nh->nh_tclassid != onh->nh_tclassid ||  #endif  		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) @@ -221,10 +283,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)  static inline unsigned int fib_info_hashfn(const struct fib_info *fi)  { -	unsigned int mask = (fib_hash_size - 1); +	unsigned int mask = (fib_info_hash_size - 1);  	unsigned int val = fi->fib_nhs; -	val ^= fi->fib_protocol; +	val ^= (fi->fib_protocol << 8) | fi->fib_scope;  	val ^= (__force u32)fi->fib_prefsrc;  	val ^= fi->fib_priority;  	for_nexthops(fi) { @@ -237,23 +299,24 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)  static struct fib_info *fib_find_info(const struct fib_info *nfi)  {  	struct hlist_head *head; -	struct hlist_node *node;  	struct fib_info *fi;  	unsigned int hash;  	hash = fib_info_hashfn(nfi);  	head = &fib_info_hash[hash]; -	hlist_for_each_entry(fi, node, head, fib_hash) { +	hlist_for_each_entry(fi, head, fib_hash) {  		if (!net_eq(fi->fib_net, nfi->fib_net))  			continue;  		if (fi->fib_nhs != nfi->fib_nhs)  			continue;  		if (nfi->fib_protocol == fi->fib_protocol && +		    nfi->fib_scope == fi->fib_scope &&  		    nfi->fib_prefsrc == fi->fib_prefsrc &&  		    nfi->fib_priority == fi->fib_priority && +		    nfi->fib_type == fi->fib_type &&  		    memcmp(nfi->fib_metrics, fi->fib_metrics, -			   sizeof(fi->fib_metrics)) == 0 && +			   sizeof(u32) * RTAX_MAX) == 0 &&  		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&  		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))  			return fi; @@ -268,7 +331,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)  int ip_fib_check_default(__be32 gw, struct net_device *dev)  {  	struct 
hlist_head *head; -	struct hlist_node *node;  	struct fib_nh *nh;  	unsigned int hash; @@ -276,7 +338,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)  	hash = fib_devindex_hashfn(dev->ifindex);  	head = &fib_info_devhash[hash]; -	hlist_for_each_entry(nh, node, head, nh_hash) { +	hlist_for_each_entry(nh, head, nh_hash) {  		if (nh->nh_dev == dev &&  		    nh->nh_gw == gw &&  		    !(nh->nh_flags & RTNH_F_DEAD)) { @@ -318,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)  }  void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, -	       int dst_len, u32 tb_id, struct nl_info *info, +	       int dst_len, u32 tb_id, const struct nl_info *info,  	       unsigned int nlm_flags)  {  	struct sk_buff *skb; @@ -329,8 +391,8 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,  	if (skb == NULL)  		goto errout; -	err = fib_dump_info(skb, info->pid, seq, event, tb_id, -			    fa->fa_type, fa->fa_scope, key, dst_len, +	err = fib_dump_info(skb, info->portid, seq, event, tb_id, +			    fa->fa_type, key, dst_len,  			    fa->fa_tos, fa->fa_info, nlm_flags);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -338,7 +400,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,  		kfree_skb(skb);  		goto errout;  	} -	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, +	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,  		    info->nlh, GFP_KERNEL);  	return;  errout: @@ -364,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)  	return NULL;  } -int fib_detect_death(struct fib_info *fi, int order, -		     struct fib_info **last_resort, int *last_idx, int dflt) +static int fib_detect_death(struct fib_info *fi, int order, +			    struct fib_info **last_resort, int *last_idx, +			    int dflt)  {  	struct neighbour *n;  	int state = NUD_NONE; @@ -422,9 +485,11 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,  			nla = nla_find(attrs, attrlen, RTA_GATEWAY);  			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  			nla = nla_find(attrs, attrlen, RTA_FLOW);  			nexthop_nh->nh_tclassid = nla ? 
nla_get_u32(nla) : 0; +			if (nexthop_nh->nh_tclassid) +				fi->fib_net->ipv4.fib_num_tclassid_users++;  #endif  		} @@ -476,7 +541,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)  			nla = nla_find(attrs, attrlen, RTA_GATEWAY);  			if (nla && nla_get_be32(nla) != nh->nh_gw)  				return 1; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  			nla = nla_find(attrs, attrlen, RTA_FLOW);  			if (nla && nla_get_u32(nla) != nh->nh_tclassid)  				return 1; @@ -562,16 +627,17 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,  		}  		rcu_read_lock();  		{ -			struct flowi fl = { -				.fl4_dst = nh->nh_gw, -				.fl4_scope = cfg->fc_scope + 1, -				.oif = nh->nh_oif, +			struct flowi4 fl4 = { +				.daddr = nh->nh_gw, +				.flowi4_scope = cfg->fc_scope + 1, +				.flowi4_oif = nh->nh_oif, +				.flowi4_iif = LOOPBACK_IFINDEX,  			};  			/* It is not necessary, but requires a bit of thinking */ -			if (fl.fl4_scope < RT_SCOPE_LINK) -				fl.fl4_scope = RT_SCOPE_LINK; -			err = fib_lookup(net, &fl, &res); +			if (fl4.flowi4_scope < RT_SCOPE_LINK) +				fl4.flowi4_scope = RT_SCOPE_LINK; +			err = fib_lookup(net, &fl4, &res);  			if (err) {  				rcu_read_unlock();  				return err; @@ -613,14 +679,14 @@ out:  static inline unsigned int fib_laddr_hashfn(__be32 val)  { -	unsigned int mask = (fib_hash_size - 1); +	unsigned int mask = (fib_info_hash_size - 1);  	return ((__force u32)val ^  		((__force u32)val >> 7) ^  		((__force u32)val >> 14)) & mask;  } -static struct hlist_head *fib_hash_alloc(int bytes) +static struct hlist_head *fib_info_hash_alloc(int bytes)  {  	if (bytes <= PAGE_SIZE)  		return kzalloc(bytes, GFP_KERNEL); @@ -630,7 +696,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)  					 get_order(bytes));  } -static void fib_hash_free(struct hlist_head *hash, int bytes) +static void fib_info_hash_free(struct hlist_head *hash, int bytes)  {  	if (!hash)  		return; @@ -641,25 +707,25 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)  		free_pages((unsigned long) hash, get_order(bytes));  } -static void fib_hash_move(struct hlist_head *new_info_hash, -			  struct hlist_head *new_laddrhash, -			  unsigned int new_size) +static void fib_info_hash_move(struct hlist_head *new_info_hash, +			       struct hlist_head *new_laddrhash, +			       unsigned int new_size)  {  	struct hlist_head *old_info_hash, *old_laddrhash; -	unsigned int old_size = fib_hash_size; +	unsigned int old_size = fib_info_hash_size;  	unsigned int i, bytes;  	spin_lock_bh(&fib_info_lock);  	old_info_hash = fib_info_hash;  	old_laddrhash = fib_info_laddrhash; -	fib_hash_size = new_size; +	fib_info_hash_size = new_size;  	for (i = 0; i < old_size; i++) {  		struct hlist_head *head = &fib_info_hash[i]; -		struct hlist_node *node, *n; +		struct hlist_node *n;  		struct fib_info *fi; -		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) { +		hlist_for_each_entry_safe(fi, n, head, fib_hash) {  			struct hlist_head *dest;  			unsigned int new_hash; @@ -674,10 +740,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,  	for (i = 0; i < old_size; i++) {  		struct hlist_head *lhead = &fib_info_laddrhash[i]; -		struct hlist_node *node, *n; +		struct hlist_node *n;  		struct fib_info *fi; -		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) { +		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {  			struct hlist_head *ldest;  			unsigned int new_hash; @@ -693,8 +759,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash,  	
spin_unlock_bh(&fib_info_lock);  	bytes = old_size * sizeof(struct hlist_head *); -	fib_hash_free(old_info_hash, bytes); -	fib_hash_free(old_laddrhash, bytes); +	fib_info_hash_free(old_info_hash, bytes); +	fib_info_hash_free(old_laddrhash, bytes); +} + +__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +{ +	nh->nh_saddr = inet_select_addr(nh->nh_dev, +					nh->nh_gw, +					nh->nh_parent->fib_scope); +	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + +	return nh->nh_saddr;  }  struct fib_info *fib_create_info(struct fib_config *cfg) @@ -705,6 +781,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  	int nhs = 1;  	struct net *net = cfg->fc_nlinfo.nl_net; +	if (cfg->fc_type > RTN_MAX) +		goto err_inval; +  	/* Fast check to catch the most weird cases */  	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)  		goto err_inval; @@ -718,24 +797,24 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  #endif  	err = -ENOBUFS; -	if (fib_info_cnt >= fib_hash_size) { -		unsigned int new_size = fib_hash_size << 1; +	if (fib_info_cnt >= fib_info_hash_size) { +		unsigned int new_size = fib_info_hash_size << 1;  		struct hlist_head *new_info_hash;  		struct hlist_head *new_laddrhash;  		unsigned int bytes;  		if (!new_size) -			new_size = 1; +			new_size = 16;  		bytes = new_size * sizeof(struct hlist_head *); -		new_info_hash = fib_hash_alloc(bytes); -		new_laddrhash = fib_hash_alloc(bytes); +		new_info_hash = fib_info_hash_alloc(bytes); +		new_laddrhash = fib_info_hash_alloc(bytes);  		if (!new_info_hash || !new_laddrhash) { -			fib_hash_free(new_info_hash, bytes); -			fib_hash_free(new_laddrhash, bytes); +			fib_info_hash_free(new_info_hash, bytes); +			fib_info_hash_free(new_laddrhash, bytes);  		} else -			fib_hash_move(new_info_hash, new_laddrhash, new_size); +			fib_info_hash_move(new_info_hash, new_laddrhash, new_size); -		if (!fib_hash_size) +		if (!fib_info_hash_size)  			goto failure;  	} @@ -743,16 +822,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  	if (fi == NULL)  		goto failure;  	fib_info_cnt++; +	if (cfg->fc_mx) { +		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); +		if (!fi->fib_metrics) +			goto failure; +	} else +		fi->fib_metrics = (u32 *) dst_default_metrics;  	fi->fib_net = hold_net(net);  	fi->fib_protocol = cfg->fc_protocol; +	fi->fib_scope = cfg->fc_scope;  	fi->fib_flags = cfg->fc_flags;  	fi->fib_priority = cfg->fc_priority;  	fi->fib_prefsrc = cfg->fc_prefsrc; +	fi->fib_type = cfg->fc_type;  	fi->fib_nhs = nhs;  	change_nexthops(fi) {  		nexthop_nh->nh_parent = fi; +		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); +		if (!nexthop_nh->nh_pcpu_rth_output) +			goto failure;  	} endfor_nexthops(fi)  	if (cfg->fc_mx) { @@ -763,9 +853,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  			int type = nla_type(nla);  			if (type) { +				u32 val; +  				if (type > RTAX_MAX)  					goto err_inval; -				fi->fib_metrics[type - 1] = nla_get_u32(nla); +				val = nla_get_u32(nla); +				if (type == RTAX_ADVMSS && val > 65535 - 40) +					val = 65535 - 40; +				if (type == RTAX_MTU && val > 65535 - 15) +					val = 65535 - 15; +				fi->fib_metrics[type - 1] = val;  			}  		}  	} @@ -779,7 +876,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  			goto err_inval;  		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)  			goto err_inval; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)  			goto 
err_inval;  #endif @@ -792,8 +889,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  		nh->nh_oif = cfg->fc_oif;  		nh->nh_gw = cfg->fc_gw;  		nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  		nh->nh_tclassid = cfg->fc_flow; +		if (nh->nh_tclassid) +			fi->fib_net->ipv4.fib_num_tclassid_users++;  #endif  #ifdef CONFIG_IP_ROUTE_MULTIPATH  		nh->nh_weight = 1; @@ -804,6 +903,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)  			goto err_inval;  		goto link_it; +	} else { +		switch (cfg->fc_type) { +		case RTN_UNICAST: +		case RTN_LOCAL: +		case RTN_BROADCAST: +		case RTN_ANYCAST: +		case RTN_MULTICAST: +			break; +		default: +			goto err_inval; +		}  	}  	if (cfg->fc_scope > RT_SCOPE_HOST) @@ -835,6 +945,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)  				goto err_inval;  	} +	change_nexthops(fi) { +		fib_info_update_nh_saddr(net, nexthop_nh); +	} endfor_nexthops(fi) +  link_it:  	ofi = fib_find_info(fi);  	if (ofi) { @@ -880,92 +994,14 @@ failure:  	return ERR_PTR(err);  } -/* Note! fib_semantic_match intentionally uses  RCU list functions. */ -int fib_semantic_match(struct list_head *head, const struct flowi *flp, -		       struct fib_result *res, int prefixlen, int fib_flags) -{ -	struct fib_alias *fa; -	int nh_sel = 0; - -	list_for_each_entry_rcu(fa, head, fa_list) { -		int err; - -		if (fa->fa_tos && -		    fa->fa_tos != flp->fl4_tos) -			continue; - -		if (fa->fa_scope < flp->fl4_scope) -			continue; - -		fib_alias_accessed(fa); - -		err = fib_props[fa->fa_type].error; -		if (err == 0) { -			struct fib_info *fi = fa->fa_info; - -			if (fi->fib_flags & RTNH_F_DEAD) -				continue; - -			switch (fa->fa_type) { -			case RTN_UNICAST: -			case RTN_LOCAL: -			case RTN_BROADCAST: -			case RTN_ANYCAST: -			case RTN_MULTICAST: -				for_nexthops(fi) { -					if (nh->nh_flags & RTNH_F_DEAD) -						continue; -					if (!flp->oif || flp->oif == nh->nh_oif) -						break; -				} -#ifdef CONFIG_IP_ROUTE_MULTIPATH -				if (nhsel < fi->fib_nhs) { -					nh_sel = nhsel; -					goto out_fill_res; -				} -#else -				if (nhsel < 1) -					goto out_fill_res; -#endif -				endfor_nexthops(fi); -				continue; - -			default: -				pr_warning("fib_semantic_match bad type %#x\n", -					   fa->fa_type); -				return -EINVAL; -			} -		} -		return err; -	} -	return 1; - -out_fill_res: -	res->prefixlen = prefixlen; -	res->nh_sel = nh_sel; -	res->type = fa->fa_type; -	res->scope = fa->fa_scope; -	res->fi = fa->fa_info; -	if (!(fib_flags & FIB_LOOKUP_NOREF)) -		atomic_inc(&res->fi->fib_clntref); -	return 0; -} - -/* Find appropriate source address to this destination */ - -__be32 __fib_res_prefsrc(struct fib_result *res) -{ -	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); -} - -int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, -		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, +int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, +		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,  		  struct fib_info *fi, unsigned int flags)  {  	struct nlmsghdr *nlh;  	struct rtmsg *rtm; -	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);  	if (nlh == NULL)  		return -EMSGSIZE; @@ -978,33 +1014,36 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,  		rtm->rtm_table = tb_id;  	else  		rtm->rtm_table = RT_TABLE_COMPAT; -	
NLA_PUT_U32(skb, RTA_TABLE, tb_id); +	if (nla_put_u32(skb, RTA_TABLE, tb_id)) +		goto nla_put_failure;  	rtm->rtm_type = type;  	rtm->rtm_flags = fi->fib_flags; -	rtm->rtm_scope = scope; +	rtm->rtm_scope = fi->fib_scope;  	rtm->rtm_protocol = fi->fib_protocol; -	if (rtm->rtm_dst_len) -		NLA_PUT_BE32(skb, RTA_DST, dst); - -	if (fi->fib_priority) -		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority); - +	if (rtm->rtm_dst_len && +	    nla_put_be32(skb, RTA_DST, dst)) +		goto nla_put_failure; +	if (fi->fib_priority && +	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) +		goto nla_put_failure;  	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)  		goto nla_put_failure; -	if (fi->fib_prefsrc) -		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc); - +	if (fi->fib_prefsrc && +	    nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc)) +		goto nla_put_failure;  	if (fi->fib_nhs == 1) { -		if (fi->fib_nh->nh_gw) -			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw); - -		if (fi->fib_nh->nh_oif) -			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); -#ifdef CONFIG_NET_CLS_ROUTE -		if (fi->fib_nh[0].nh_tclassid) -			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); +		if (fi->fib_nh->nh_gw && +		    nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) +			goto nla_put_failure; +		if (fi->fib_nh->nh_oif && +		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) +			goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID +		if (fi->fib_nh[0].nh_tclassid && +		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) +			goto nla_put_failure;  #endif  	}  #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1025,11 +1064,13 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,  			rtnh->rtnh_hops = nh->nh_weight - 1;  			rtnh->rtnh_ifindex = nh->nh_oif; -			if (nh->nh_gw) -				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); -#ifdef CONFIG_NET_CLS_ROUTE -			if (nh->nh_tclassid) -				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); +			if (nh->nh_gw && +			    nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw)) +				goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID +			if (nh->nh_tclassid && +			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) +				goto nla_put_failure;  #endif  			/* length of rtnetlink header + attributes */  			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; @@ -1056,13 +1097,12 @@ int fib_sync_down_addr(struct net *net, __be32 local)  	int ret = 0;  	unsigned int hash = fib_laddr_hashfn(local);  	struct hlist_head *head = &fib_info_laddrhash[hash]; -	struct hlist_node *node;  	struct fib_info *fi;  	if (fib_info_laddrhash == NULL || local == 0)  		return 0; -	hlist_for_each_entry(fi, node, head, fib_lhash) { +	hlist_for_each_entry(fi, head, fib_lhash) {  		if (!net_eq(fi->fib_net, net))  			continue;  		if (fi->fib_prefsrc == local) { @@ -1080,13 +1120,12 @@ int fib_sync_down_dev(struct net_device *dev, int force)  	struct fib_info *prev_fi = NULL;  	unsigned int hash = fib_devindex_hashfn(dev->ifindex);  	struct hlist_head *head = &fib_info_devhash[hash]; -	struct hlist_node *node;  	struct fib_nh *nh;  	if (force)  		scope = -1; -	hlist_for_each_entry(nh, node, head, nh_hash) { +	hlist_for_each_entry(nh, head, nh_hash) {  		struct fib_info *fi = nh->nh_parent;  		int dead; @@ -1125,6 +1164,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)  	return ret;  } +/* Must be invoked inside of an RCU protected region.  
*/ +void fib_select_default(struct fib_result *res) +{ +	struct fib_info *fi = NULL, *last_resort = NULL; +	struct list_head *fa_head = res->fa_head; +	struct fib_table *tb = res->table; +	int order = -1, last_idx = -1; +	struct fib_alias *fa; + +	list_for_each_entry_rcu(fa, fa_head, fa_list) { +		struct fib_info *next_fi = fa->fa_info; + +		if (next_fi->fib_scope != res->scope || +		    fa->fa_type != RTN_UNICAST) +			continue; + +		if (next_fi->fib_priority > res->fi->fib_priority) +			break; +		if (!next_fi->fib_nh[0].nh_gw || +		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) +			continue; + +		fib_alias_accessed(fa); + +		if (fi == NULL) { +			if (next_fi != res->fi) +				break; +		} else if (!fib_detect_death(fi, order, &last_resort, +					     &last_idx, tb->tb_default)) { +			fib_result_assign(res, fi); +			tb->tb_default = order; +			goto out; +		} +		fi = next_fi; +		order++; +	} + +	if (order <= 0 || fi == NULL) { +		tb->tb_default = -1; +		goto out; +	} + +	if (!fib_detect_death(fi, order, &last_resort, &last_idx, +				tb->tb_default)) { +		fib_result_assign(res, fi); +		tb->tb_default = order; +		goto out; +	} + +	if (last_idx >= 0) +		fib_result_assign(res, last_resort); +	tb->tb_default = last_idx; +out: +	return; +} +  #ifdef CONFIG_IP_ROUTE_MULTIPATH  /* @@ -1136,7 +1231,6 @@ int fib_sync_up(struct net_device *dev)  	struct fib_info *prev_fi;  	unsigned int hash;  	struct hlist_head *head; -	struct hlist_node *node;  	struct fib_nh *nh;  	int ret; @@ -1148,7 +1242,7 @@ int fib_sync_up(struct net_device *dev)  	head = &fib_info_devhash[hash];  	ret = 0; -	hlist_for_each_entry(nh, node, head, nh_hash) { +	hlist_for_each_entry(nh, head, nh_hash) {  		struct fib_info *fi = nh->nh_parent;  		int alive; @@ -1189,7 +1283,7 @@ int fib_sync_up(struct net_device *dev)   * The algorithm is suboptimal, but it provides really   * fair weighted route distribution.   */ -void fib_select_multipath(const struct flowi *flp, struct fib_result *res) +void fib_select_multipath(struct fib_result *res)  {  	struct fib_info *fi = res->fi;  	int w; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 200eb538fbb..5afeb5aa4c7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -12,7 +12,7 @@   *   *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet   * - * This work is based on the LPC-trie which is originally descibed in: + * This work is based on the LPC-trie which is originally described in:   *   * An experimental study of compression methods for dynamic tries   * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 
@@ -51,7 +51,6 @@  #define VERSION "0.409"  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/bitops.h>  #include <linux/types.h>  #include <linux/kernel.h> @@ -72,6 +71,7 @@  #include <linux/init.h>  #include <linux/list.h>  #include <linux/slab.h> +#include <linux/export.h>  #include <net/net_namespace.h>  #include <net/ip.h>  #include <net/protocol.h> @@ -95,7 +95,7 @@ typedef unsigned int t_key;  #define IS_TNODE(n) (!(n->parent & T_LEAF))  #define IS_LEAF(n) (n->parent & T_LEAF) -struct node { +struct rt_trie_node {  	unsigned long parent;  	t_key key;  }; @@ -109,9 +109,10 @@ struct leaf {  struct leaf_info {  	struct hlist_node hlist; -	struct rcu_head rcu;  	int plen; +	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */  	struct list_head falh; +	struct rcu_head rcu;  };  struct tnode { @@ -123,10 +124,9 @@ struct tnode {  	unsigned int empty_children;	/* KEYLENGTH bits needed */  	union {  		struct rcu_head rcu; -		struct work_struct work;  		struct tnode *tnode_free;  	}; -	struct node *child[0]; +	struct rt_trie_node __rcu *child[0];  };  #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,16 +151,15 @@ struct trie_stat {  };  struct trie { -	struct node *trie; +	struct rt_trie_node __rcu *trie;  #ifdef CONFIG_IP_FIB_TRIE_STATS  	struct trie_use_stats stats;  #endif  }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,  				  int wasfull); -static struct node *resize(struct trie *t, struct tnode *tn); +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);  static struct tnode *inflate(struct trie *t, struct tnode *tn);  static struct tnode *halve(struct trie *t, struct tnode *tn);  /* tnodes to free after resize(); protected by RTNL */ @@ -177,39 +176,58 @@ static const int sync_pages = 128;  static struct kmem_cache *fn_alias_kmem __read_mostly;  static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct node *node) +/* + * caller must hold RTNL + */ +static inline struct tnode *node_parent(const struct rt_trie_node *node)  { -	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); +	unsigned long parent; + +	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); + +	return (struct tnode *)(parent & ~NODE_TYPE_MASK);  } -static inline struct tnode *node_parent_rcu(struct node *node) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)  { -	struct tnode *ret = node_parent(node); +	unsigned long parent; -	return rcu_dereference_rtnl(ret); +	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || +							   lockdep_rtnl_is_held()); + +	return (struct tnode *)(parent & ~NODE_TYPE_MASK);  }  /* Same as rcu_assign_pointer   * but that macro() assumes that value is a pointer.   
*/ -static inline void node_set_parent(struct node *node, struct tnode *ptr) +static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)  {  	smp_wmb();  	node->parent = (unsigned long)ptr | NODE_TYPE(node);  } -static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) +/* + * caller must hold RTNL + */ +static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)  {  	BUG_ON(i >= 1U << tn->bits); -	return tn->child[i]; +	return rtnl_dereference(tn->child[i]);  } -static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)  { -	struct node *ret = tnode_get_child(tn, i); +	BUG_ON(i >= 1U << tn->bits); -	return rcu_dereference_rtnl(ret); +	return rcu_dereference_rtnl(tn->child[i]);  }  static inline int tnode_child_length(const struct tnode *tn) @@ -217,12 +235,12 @@ static inline int tnode_child_length(const struct tnode *tn)  	return 1 << tn->bits;  } -static inline t_key mask_pfx(t_key k, unsigned short l) +static inline t_key mask_pfx(t_key k, unsigned int l)  {  	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);  } -static inline t_key tkey_extract_bits(t_key a, int offset, int bits) +static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)  {  	if (offset < KEYLENGTH)  		return ((t_key)(a << offset)) >> (KEYLENGTH - bits); @@ -347,17 +365,12 @@ static void __leaf_free_rcu(struct rcu_head *head)  static inline void free_leaf(struct leaf *l)  { -	call_rcu_bh(&l->rcu, __leaf_free_rcu); -} - -static void __leaf_info_free_rcu(struct rcu_head *head) -{ -	kfree(container_of(head, struct leaf_info, rcu)); +	call_rcu(&l->rcu, __leaf_free_rcu);  }  static inline void free_leaf_info(struct leaf_info *leaf)  { -	call_rcu(&leaf->rcu, __leaf_info_free_rcu); +	kfree_rcu(leaf, rcu);  }  static struct tnode *tnode_alloc(size_t size) @@ -365,27 +378,19 @@ static struct tnode *tnode_alloc(size_t size)  	if (size <= PAGE_SIZE)  		return kzalloc(size, GFP_KERNEL);  	else -		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); -} - -static void __tnode_vfree(struct work_struct *arg) -{ -	struct tnode *tn = container_of(arg, struct tnode, work); -	vfree(tn); +		return vzalloc(size);  }  static void __tnode_free_rcu(struct rcu_head *head)  {  	struct tnode *tn = container_of(head, struct tnode, rcu);  	size_t size = sizeof(struct tnode) + -		      (sizeof(struct node *) << tn->bits); +		      (sizeof(struct rt_trie_node *) << tn->bits);  	if (size <= PAGE_SIZE)  		kfree(tn); -	else { -		INIT_WORK(&tn->work, __tnode_vfree); -		schedule_work(&tn->work); -	} +	else +		vfree(tn);  }  static inline void tnode_free(struct tnode *tn) @@ -402,7 +407,7 @@ static void tnode_free_safe(struct tnode *tn)  	tn->tnode_free = tnode_free_head;  	tnode_free_head = tn;  	tnode_free_size += sizeof(struct tnode) + -			   (sizeof(struct node *) << tn->bits); +			   (sizeof(struct rt_trie_node *) << tn->bits);  }  static void tnode_free_flush(void) @@ -436,6 +441,7 @@ static struct leaf_info *leaf_info_new(int plen)  	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);  	if (li) {  		li->plen = plen; +		li->mask_plen = ntohl(inet_make_mask(plen));  		INIT_LIST_HEAD(&li->falh);  	}  	return li; @@ -443,7 +449,7 @@ static struct leaf_info *leaf_info_new(int plen)  static struct tnode *tnode_new(t_key key, int pos, int bits)  { -	
size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); +	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);  	struct tnode *tn = tnode_alloc(sz);  	if (tn) { @@ -456,7 +462,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)  	}  	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), -		 sizeof(struct node) << bits); +		 sizeof(struct rt_trie_node *) << bits);  	return tn;  } @@ -465,7 +471,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)   * and no bits are skipped. See discussion in dyntree paper p. 6   */ -static inline int tnode_full(const struct tnode *tn, const struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)  {  	if (n == NULL || IS_LEAF(n))  		return 0; @@ -473,8 +479,8 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)  	return ((struct tnode *) n)->pos == tn->pos + tn->bits;  } -static inline void put_child(struct trie *t, struct tnode *tn, int i, -			     struct node *n) +static inline void put_child(struct tnode *tn, int i, +			     struct rt_trie_node *n)  {  	tnode_put_child_reorg(tn, i, n, -1);  } @@ -484,10 +490,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,    * Update the value of full_children and empty_children.    */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,  				  int wasfull)  { -	struct node *chi = tn->child[i]; +	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);  	int isfull;  	BUG_ON(i >= 1<<tn->bits); @@ -515,7 +521,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,  }  #define MAX_WORK 10 -static struct node *resize(struct trie *t, struct tnode *tn) +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)  {  	int i;  	struct tnode *old_tn; @@ -605,7 +611,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)  	/* Keep root node larger  */ -	if (!node_parent((struct node *)tn)) { +	if (!node_parent((struct rt_trie_node *)tn)) {  		inflate_threshold_use = inflate_threshold_root;  		halve_threshold_use = halve_threshold_root;  	} else { @@ -635,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)  	/* Return if at least one inflate is run */  	if (max_work != MAX_WORK) -		return (struct node *) tn; +		return (struct rt_trie_node *) tn;  	/*  	 * Halve as long as the number of empty children in this @@ -663,9 +669,9 @@ static struct node *resize(struct trie *t, struct tnode *tn)  	if (tn->empty_children == tnode_child_length(tn) - 1) {  one_child:  		for (i = 0; i < tnode_child_length(tn); i++) { -			struct node *n; +			struct rt_trie_node *n; -			n = tn->child[i]; +			n = rtnl_dereference(tn->child[i]);  			if (!n)  				continue; @@ -676,7 +682,21 @@ one_child:  			return n;  		}  	} -	return (struct node *) tn; +	return (struct rt_trie_node *) tn; +} + + +static void tnode_clean_free(struct tnode *tn) +{ +	int i; +	struct tnode *tofree; + +	for (i = 0; i < tnode_child_length(tn); i++) { +		tofree = (struct tnode *)rtnl_dereference(tn->child[i]); +		if (tofree) +			tnode_free(tofree); +	} +	tnode_free(tn);  }  static struct tnode *inflate(struct trie *t, struct tnode *tn) @@ -723,14 +743,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)  				goto nomem;  			} -			put_child(t, tn, 2*i, (struct node *) left); -			put_child(t, tn, 2*i+1, (struct node *) right); +			
put_child(tn, 2*i, (struct rt_trie_node *) left); +			put_child(tn, 2*i+1, (struct rt_trie_node *) right);  		}  	}  	for (i = 0; i < olen; i++) {  		struct tnode *inode; -		struct node *node = tnode_get_child(oldtnode, i); +		struct rt_trie_node *node = tnode_get_child(oldtnode, i);  		struct tnode *left, *right;  		int size, j; @@ -742,12 +762,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)  		if (IS_LEAF(node) || ((struct tnode *) node)->pos >  		   tn->pos + tn->bits - 1) { -			if (tkey_extract_bits(node->key, -					      oldtnode->pos + oldtnode->bits, -					      1) == 0) -				put_child(t, tn, 2*i, node); -			else -				put_child(t, tn, 2*i+1, node); +			put_child(tn, +				tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1), +				node);  			continue;  		} @@ -755,8 +772,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)  		inode = (struct tnode *) node;  		if (inode->bits == 1) { -			put_child(t, tn, 2*i, inode->child[0]); -			put_child(t, tn, 2*i+1, inode->child[1]); +			put_child(tn, 2*i, rtnl_dereference(inode->child[0])); +			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));  			tnode_free_safe(inode);  			continue; @@ -786,46 +803,36 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)  		 */  		left = (struct tnode *) tnode_get_child(tn, 2*i); -		put_child(t, tn, 2*i, NULL); +		put_child(tn, 2*i, NULL);  		BUG_ON(!left);  		right = (struct tnode *) tnode_get_child(tn, 2*i+1); -		put_child(t, tn, 2*i+1, NULL); +		put_child(tn, 2*i+1, NULL);  		BUG_ON(!right);  		size = tnode_child_length(left);  		for (j = 0; j < size; j++) { -			put_child(t, left, j, inode->child[j]); -			put_child(t, right, j, inode->child[j + size]); +			put_child(left, j, rtnl_dereference(inode->child[j])); +			put_child(right, j, rtnl_dereference(inode->child[j + size]));  		} -		put_child(t, tn, 2*i, resize(t, left)); -		put_child(t, tn, 2*i+1, resize(t, right)); +		put_child(tn, 2*i, resize(t, left)); +		put_child(tn, 2*i+1, resize(t, right));  		tnode_free_safe(inode);  	}  	tnode_free_safe(oldtnode);  	return tn;  nomem: -	{ -		int size = tnode_child_length(tn); -		int j; - -		for (j = 0; j < size; j++) -			if (tn->child[j]) -				tnode_free((struct tnode *)tn->child[j]); - -		tnode_free(tn); - -		return ERR_PTR(-ENOMEM); -	} +	tnode_clean_free(tn); +	return ERR_PTR(-ENOMEM);  }  static struct tnode *halve(struct trie *t, struct tnode *tn)  {  	struct tnode *oldtnode = tn; -	struct node *left, *right; +	struct rt_trie_node *left, *right;  	int i;  	int olen = tnode_child_length(tn); @@ -856,7 +863,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)  			if (!newn)  				goto nomem; -			put_child(t, tn, i/2, (struct node *)newn); +			put_child(tn, i/2, (struct rt_trie_node *)newn);  		}  	} @@ -871,37 +878,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)  		if (left == NULL) {  			if (right == NULL)    /* Both are empty */  				continue; -			put_child(t, tn, i/2, right); +			put_child(tn, i/2, right);  			continue;  		}  		if (right == NULL) { -			put_child(t, tn, i/2, left); +			put_child(tn, i/2, left);  			continue;  		}  		/* Two nonempty children */  		newBinNode = (struct tnode *) tnode_get_child(tn, i/2); -		put_child(t, tn, i/2, NULL); -		put_child(t, newBinNode, 0, left); -		put_child(t, newBinNode, 1, right); -		put_child(t, tn, i/2, resize(t, newBinNode)); +		put_child(tn, i/2, NULL); +		put_child(newBinNode, 0, left); +		put_child(newBinNode, 1, right); +		put_child(tn, i/2, resize(t, 
newBinNode));  	}  	tnode_free_safe(oldtnode);  	return tn;  nomem: -	{ -		int size = tnode_child_length(tn); -		int j; - -		for (j = 0; j < size; j++) -			if (tn->child[j]) -				tnode_free((struct tnode *)tn->child[j]); - -		tnode_free(tn); - -		return ERR_PTR(-ENOMEM); -	} +	tnode_clean_free(tn); +	return ERR_PTR(-ENOMEM);  }  /* readside must use rcu_read_lock currently dump routines @@ -910,10 +907,9 @@ nomem:  static struct leaf_info *find_leaf_info(struct leaf *l, int plen)  {  	struct hlist_head *head = &l->list; -	struct hlist_node *node;  	struct leaf_info *li; -	hlist_for_each_entry_rcu(li, node, head, hlist) +	hlist_for_each_entry_rcu(li, head, hlist)  		if (li->plen == plen)  			return li; @@ -933,12 +929,11 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen)  static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)  {  	struct leaf_info *li = NULL, *last = NULL; -	struct hlist_node *node;  	if (hlist_empty(head)) {  		hlist_add_head_rcu(&new->hlist, head);  	} else { -		hlist_for_each_entry(li, node, head, hlist) { +		hlist_for_each_entry(li, head, hlist) {  			if (new->plen > li->plen)  				break; @@ -958,7 +953,7 @@ fib_find_node(struct trie *t, u32 key)  {  	int pos;  	struct tnode *tn; -	struct node *n; +	struct rt_trie_node *n;  	pos = 0;  	n = rcu_dereference_rtnl(t->trie); @@ -993,17 +988,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)  	key = tn->key; -	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { +	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {  		cindex = tkey_extract_bits(key, tp->pos, tp->bits);  		wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); -		tn = (struct tnode *) resize(t, (struct tnode *)tn); +		tn = (struct tnode *)resize(t, tn); -		tnode_put_child_reorg((struct tnode *)tp, cindex, -				      (struct node *)tn, wasfull); +		tnode_put_child_reorg(tp, cindex, +				      (struct rt_trie_node *)tn, wasfull); -		tp = node_parent((struct node *) tn); +		tp = node_parent((struct rt_trie_node *) tn);  		if (!tp) -			rcu_assign_pointer(t->trie, (struct node *)tn); +			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);  		tnode_free_flush();  		if (!tp) @@ -1013,9 +1008,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)  	/* Handle last (top) tnode */  	if (IS_TNODE(tn)) -		tn = (struct tnode *)resize(t, (struct tnode *)tn); +		tn = (struct tnode *)resize(t, tn); -	rcu_assign_pointer(t->trie, (struct node *)tn); +	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);  	tnode_free_flush();  } @@ -1025,7 +1020,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)  {  	int pos, newpos;  	struct tnode *tp = NULL, *tn = NULL; -	struct node *n; +	struct rt_trie_node *n;  	struct leaf *l;  	int missbit;  	struct list_head *fa_head = NULL; @@ -1033,7 +1028,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)  	t_key cindex;  	pos = 0; -	n = t->trie; +	n = rtnl_dereference(t->trie);  	/* If we point to NULL, stop. 
Either the tree is empty and we should  	 * just put a new leaf in if, or we have reached an empty child slot, @@ -1111,10 +1106,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)  	if (t->trie && n == NULL) {  		/* Case 2: n is NULL, and will just insert a new leaf */ -		node_set_parent((struct node *)l, tp); +		node_set_parent((struct rt_trie_node *)l, tp);  		cindex = tkey_extract_bits(key, tp->pos, tp->bits); -		put_child(t, (struct tnode *)tp, cindex, (struct node *)l); +		put_child(tp, cindex, (struct rt_trie_node *)l);  	} else {  		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */  		/* @@ -1122,12 +1117,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)  		 *  first tnode need some special handling  		 */ -		if (tp) -			pos = tp->pos+tp->bits; -		else -			pos = 0; -  		if (n) { +			pos = tp ? tp->pos+tp->bits : 0;  			newpos = tkey_mismatch(key, pos, n->key);  			tn = tnode_new(n->key, newpos, 1);  		} else { @@ -1141,26 +1132,24 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)  			return NULL;  		} -		node_set_parent((struct node *)tn, tp); +		node_set_parent((struct rt_trie_node *)tn, tp);  		missbit = tkey_extract_bits(key, newpos, 1); -		put_child(t, tn, missbit, (struct node *)l); -		put_child(t, tn, 1-missbit, n); +		put_child(tn, missbit, (struct rt_trie_node *)l); +		put_child(tn, 1-missbit, n);  		if (tp) {  			cindex = tkey_extract_bits(key, tp->pos, tp->bits); -			put_child(t, (struct tnode *)tp, cindex, -				  (struct node *)tn); +			put_child(tp, cindex, (struct rt_trie_node *)tn);  		} else { -			rcu_assign_pointer(t->trie, (struct node *)tn); +			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);  			tp = tn;  		}  	}  	if (tp && tp->pos + tp->bits > 32) -		pr_warning("fib_trie" -			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", -			   tp, tp->pos, tp->bits, key, plen); +		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", +			tp, tp->pos, tp->bits, key, plen);  	/* Rebalance the trie */ @@ -1245,7 +1234,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)  			if (fa->fa_info->fib_priority != fi->fib_priority)  				break;  			if (fa->fa_type == cfg->fc_type && -			    fa->fa_scope == cfg->fc_scope &&  			    fa->fa_info == fi) {  				fa_match = fa;  				break; @@ -1271,7 +1259,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)  			new_fa->fa_tos = fa->fa_tos;  			new_fa->fa_info = fi;  			new_fa->fa_type = cfg->fc_type; -			new_fa->fa_scope = cfg->fc_scope;  			state = fa->fa_state;  			new_fa->fa_state = state & ~FA_S_ACCESSED; @@ -1280,7 +1267,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)  			fib_release_info(fi_drop);  			if (state & FA_S_ACCESSED) -				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); +				rt_cache_flush(cfg->fc_nlinfo.nl_net);  			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,  				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1308,7 +1295,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)  	new_fa->fa_info = fi;  	new_fa->fa_tos = tos;  	new_fa->fa_type = cfg->fc_type; -	new_fa->fa_scope = cfg->fc_scope;  	new_fa->fa_state = 0;  	/*  	 * Insert new entry to the list. @@ -1322,10 +1308,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)  		}  	} +	if (!plen) +		tb->tb_num_default++; +  	list_add_tail_rcu(&new_fa->fa_list,  			  (fa ? 
&fa->fa_list : fa_head)); -	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); +	rt_cache_flush(cfg->fc_nlinfo.nl_net);  	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,  		  &cfg->fc_nlinfo, 0);  succeeded: @@ -1340,49 +1329,83 @@ err:  }  /* should be called with rcu_read_lock */ -static int check_leaf(struct trie *t, struct leaf *l, -		      t_key key,  const struct flowi *flp, +static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, +		      t_key key,  const struct flowi4 *flp,  		      struct fib_result *res, int fib_flags)  {  	struct leaf_info *li;  	struct hlist_head *hhead = &l->list; -	struct hlist_node *node; -	hlist_for_each_entry_rcu(li, node, hhead, hlist) { -		int err; -		int plen = li->plen; -		__be32 mask = inet_make_mask(plen); +	hlist_for_each_entry_rcu(li, hhead, hlist) { +		struct fib_alias *fa; -		if (l->key != (key & ntohl(mask))) +		if (l->key != (key & li->mask_plen))  			continue; -		err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); +		list_for_each_entry_rcu(fa, &li->falh, fa_list) { +			struct fib_info *fi = fa->fa_info; +			int nhsel, err; +			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) +				continue; +			if (fi->fib_dead) +				continue; +			if (fa->fa_info->fib_scope < flp->flowi4_scope) +				continue; +			fib_alias_accessed(fa); +			err = fib_props[fa->fa_type].error; +			if (err) {  #ifdef CONFIG_IP_FIB_TRIE_STATS -		if (err <= 0) -			t->stats.semantic_match_passed++; -		else -			t->stats.semantic_match_miss++; +				t->stats.semantic_match_passed++; +#endif +				return err; +			} +			if (fi->fib_flags & RTNH_F_DEAD) +				continue; +			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { +				const struct fib_nh *nh = &fi->fib_nh[nhsel]; + +				if (nh->nh_flags & RTNH_F_DEAD) +					continue; +				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) +					continue; + +#ifdef CONFIG_IP_FIB_TRIE_STATS +				t->stats.semantic_match_passed++; +#endif +				res->prefixlen = li->plen; +				res->nh_sel = nhsel; +				res->type = fa->fa_type; +				res->scope = fa->fa_info->fib_scope; +				res->fi = fi; +				res->table = tb; +				res->fa_head = &li->falh; +				if (!(fib_flags & FIB_LOOKUP_NOREF)) +					atomic_inc(&fi->fib_clntref); +				return 0; +			} +		} + +#ifdef CONFIG_IP_FIB_TRIE_STATS +		t->stats.semantic_match_miss++;  #endif -		if (err <= 0) -			return err;  	}  	return 1;  } -int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, +int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,  		     struct fib_result *res, int fib_flags)  {  	struct trie *t = (struct trie *) tb->tb_data;  	int ret; -	struct node *n; +	struct rt_trie_node *n;  	struct tnode *pn; -	int pos, bits; -	t_key key = ntohl(flp->fl4_dst); -	int chopped_off; +	unsigned int pos, bits; +	t_key key = ntohl(flp->daddr); +	unsigned int chopped_off;  	t_key cindex = 0; -	int current_prefix_length = KEYLENGTH; +	unsigned int current_prefix_length = KEYLENGTH;  	struct tnode *cn;  	t_key pref_mismatch; @@ -1398,7 +1421,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,  	/* Just a leaf? 
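The mask_plen field added to struct leaf_info earlier in this patch (set once in leaf_info_new() as ntohl(inet_make_mask(plen))) lets the rewritten check_leaf() above compare keys without rebuilding the mask per packet. A small sketch of the equivalence, hypothetical names, with one worked value:

	#include <stdint.h>

	/* Host-order mask cached as li->mask_plen == ntohl(inet_make_mask(plen)). */
	static uint32_t toy_mask_plen(int plen)
	{
		return plen ? ~0u << (32 - plen) : 0;
	}

	/* check_leaf() then tests   l->key == (key & li->mask_plen),
	 * e.g. key 0x0a000102 (10.0.1.2) matches a 10.0.1.0/24 leaf because
	 * 0x0a000102 & 0xffffff00 == 0x0a000100 == l->key.
	 */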
*/  	if (IS_LEAF(n)) { -		ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); +		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);  		goto found;  	} @@ -1423,7 +1446,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,  		}  		if (IS_LEAF(n)) { -			ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); +			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);  			if (ret > 0)  				goto backtrace;  			goto found; @@ -1507,7 +1530,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,  		 * state.directly.  		 */  		if (pref_mismatch) { -			int mp = KEYLENGTH - fls(pref_mismatch); +			/* fls(x) = __fls(x) + 1 */ +			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;  			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)  				goto backtrace; @@ -1541,7 +1565,7 @@ backtrace:  		if (chopped_off <= pn->bits) {  			cindex &= ~(1 << (chopped_off-1));  		} else { -			struct tnode *parent = node_parent_rcu((struct node *) pn); +			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);  			if (!parent)  				goto failed; @@ -1562,22 +1586,23 @@ found:  	rcu_read_unlock();  	return ret;  } +EXPORT_SYMBOL_GPL(fib_table_lookup);  /*   * Remove the leaf and return parent.   */  static void trie_leaf_remove(struct trie *t, struct leaf *l)  { -	struct tnode *tp = node_parent((struct node *) l); +	struct tnode *tp = node_parent((struct rt_trie_node *) l);  	pr_debug("entering trie_leaf_remove(%p)\n", l);  	if (tp) {  		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); -		put_child(t, (struct tnode *)tp, cindex, NULL); +		put_child(tp, cindex, NULL);  		trie_rebalance(t, tp);  	} else -		rcu_assign_pointer(t->trie, NULL); +		RCU_INIT_POINTER(t->trie, NULL);  	free_leaf(l);  } @@ -1611,7 +1636,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)  	if (!l)  		return -ESRCH; -	fa_head = get_fa_head(l, plen); +	li = find_leaf_info(l, plen); + +	if (!li) +		return -ESRCH; + +	fa_head = &li->falh;  	fa = fib_find_alias(fa_head, tos, 0);  	if (!fa) @@ -1629,7 +1659,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)  		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&  		    (cfg->fc_scope == RT_SCOPE_NOWHERE || -		     fa->fa_scope == cfg->fc_scope) && +		     fa->fa_info->fib_scope == cfg->fc_scope) && +		    (!cfg->fc_prefsrc || +		     fi->fib_prefsrc == cfg->fc_prefsrc) &&  		    (!cfg->fc_protocol ||  		     fi->fib_protocol == cfg->fc_protocol) &&  		    fib_nh_match(cfg, fi) == 0) { @@ -1645,11 +1677,11 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)  	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,  		  &cfg->fc_nlinfo, 0); -	l = fib_find_node(t, key); -	li = find_leaf_info(l, plen); -  	list_del_rcu(&fa->fa_list); +	if (!plen) +		tb->tb_num_default--; +  	if (list_empty(fa_head)) {  		hlist_del_rcu(&li->hlist);  		free_leaf_info(li); @@ -1659,7 +1691,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)  		trie_leaf_remove(t, l);  	if (fa->fa_state & FA_S_ACCESSED) -		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); +		rt_cache_flush(cfg->fc_nlinfo.nl_net);  	fib_release_info(fa->fa_info);  	alias_free_mem_rcu(fa); @@ -1688,10 +1720,10 @@ static int trie_flush_leaf(struct leaf *l)  {  	int found = 0;  	struct hlist_head *lih = &l->list; -	struct hlist_node *node, *tmp; +	struct hlist_node *tmp;  	struct leaf_info *li = NULL; -	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { +	
hlist_for_each_entry_safe(li, tmp, lih, hlist) {  		found += trie_flush_list(&li->falh);  		if (list_empty(&li->falh)) { @@ -1706,7 +1738,7 @@ static int trie_flush_leaf(struct leaf *l)   * Scan for the next right leaf starting at node p->child[idx]   * Since we have back pointer, no recursion necessary.   */ -static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) +static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)  {  	do {  		t_key idx; @@ -1721,10 +1753,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)  			if (!c)  				continue; -			if (IS_LEAF(c)) { -				prefetch(p->child[idx]); +			if (IS_LEAF(c))  				return (struct leaf *) c; -			}  			/* Rescan start scanning in new node */  			p = (struct tnode *) c; @@ -1732,7 +1762,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)  		}  		/* Node empty, walk back up to parent */ -		c = (struct node *) p; +		c = (struct rt_trie_node *) p;  	} while ((p = node_parent_rcu(c)) != NULL);  	return NULL; /* Root of trie */ @@ -1753,7 +1783,7 @@ static struct leaf *trie_firstleaf(struct trie *t)  static struct leaf *trie_nextleaf(struct leaf *l)  { -	struct node *c = (struct node *) l; +	struct rt_trie_node *c = (struct rt_trie_node *) l;  	struct tnode *p = node_parent_rcu(c);  	if (!p) @@ -1802,80 +1832,6 @@ void fib_free_table(struct fib_table *tb)  	kfree(tb);  } -void fib_table_select_default(struct fib_table *tb, -			      const struct flowi *flp, -			      struct fib_result *res) -{ -	struct trie *t = (struct trie *) tb->tb_data; -	int order, last_idx; -	struct fib_info *fi = NULL; -	struct fib_info *last_resort; -	struct fib_alias *fa = NULL; -	struct list_head *fa_head; -	struct leaf *l; - -	last_idx = -1; -	last_resort = NULL; -	order = -1; - -	rcu_read_lock(); - -	l = fib_find_node(t, 0); -	if (!l) -		goto out; - -	fa_head = get_fa_head(l, 0); -	if (!fa_head) -		goto out; - -	if (list_empty(fa_head)) -		goto out; - -	list_for_each_entry_rcu(fa, fa_head, fa_list) { -		struct fib_info *next_fi = fa->fa_info; - -		if (fa->fa_scope != res->scope || -		    fa->fa_type != RTN_UNICAST) -			continue; - -		if (next_fi->fib_priority > res->fi->fib_priority) -			break; -		if (!next_fi->fib_nh[0].nh_gw || -		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) -			continue; - -		fib_alias_accessed(fa); - -		if (fi == NULL) { -			if (next_fi != res->fi) -				break; -		} else if (!fib_detect_death(fi, order, &last_resort, -					     &last_idx, tb->tb_default)) { -			fib_result_assign(res, fi); -			tb->tb_default = order; -			goto out; -		} -		fi = next_fi; -		order++; -	} -	if (order <= 0 || fi == NULL) { -		tb->tb_default = -1; -		goto out; -	} - -	if (!fib_detect_death(fi, order, &last_resort, &last_idx, -				tb->tb_default)) { -		fib_result_assign(res, fi); -		tb->tb_default = order; -		goto out; -	} -	if (last_idx >= 0) -		fib_result_assign(res, last_resort); -	tb->tb_default = last_idx; -out: -	rcu_read_unlock(); -} -  static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,  			   struct fib_table *tb,  			   struct sk_buff *skb, struct netlink_callback *cb) @@ -1895,12 +1851,11 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,  			continue;  		} -		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, +		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,  				  cb->nlh->nlmsg_seq,  				  RTM_NEWROUTE,  				  tb->tb_id,  				  fa->fa_type, -				  fa->fa_scope,  				  xkey,  				  plen,  				  fa->fa_tos, @@ -1918,14 +1873,13 @@ static int 
fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,  			struct sk_buff *skb, struct netlink_callback *cb)  {  	struct leaf_info *li; -	struct hlist_node *node;  	int i, s_i;  	s_i = cb->args[4];  	i = 0;  	/* rcu_read_lock is hold by caller */ -	hlist_for_each_entry_rcu(li, node, &l->list, hlist) { +	hlist_for_each_entry_rcu(li, &l->list, hlist) {  		if (i < s_i) {  			i++;  			continue; @@ -1990,7 +1944,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,  	return skb->len;  } -void __init fib_hash_init(void) +void __init fib_trie_init(void)  {  	fn_alias_kmem = kmem_cache_create("ip_fib_alias",  					  sizeof(struct fib_alias), @@ -2003,8 +1957,7 @@ void __init fib_hash_init(void)  } -/* Fix more generic FIB names for init later */ -struct fib_table *fib_hash_table(u32 id) +struct fib_table *fib_trie_table(u32 id)  {  	struct fib_table *tb;  	struct trie *t; @@ -2016,13 +1969,11 @@ struct fib_table *fib_hash_table(u32 id)  	tb->tb_id = id;  	tb->tb_default = -1; +	tb->tb_num_default = 0;  	t = (struct trie *) tb->tb_data;  	memset(t, 0, sizeof(*t)); -	if (id == RT_TABLE_LOCAL) -		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION); -  	return tb;  } @@ -2036,7 +1987,7 @@ struct fib_trie_iter {  	unsigned int depth;  }; -static struct node *fib_trie_get_next(struct fib_trie_iter *iter) +static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)  {  	struct tnode *tn = iter->tnode;  	unsigned int cindex = iter->index; @@ -2050,7 +2001,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)  		 iter->tnode, iter->index, iter->depth);  rescan:  	while (cindex < (1<<tn->bits)) { -		struct node *n = tnode_get_child_rcu(tn, cindex); +		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);  		if (n) {  			if (IS_LEAF(n)) { @@ -2069,7 +2020,7 @@ rescan:  	}  	/* Current node exhausted, pop back up */ -	p = node_parent_rcu((struct node *)tn); +	p = node_parent_rcu((struct rt_trie_node *)tn);  	if (p) {  		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;  		tn = p; @@ -2081,10 +2032,10 @@ rescan:  	return NULL;  } -static struct node *fib_trie_get_first(struct fib_trie_iter *iter, +static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,  				       struct trie *t)  { -	struct node *n; +	struct rt_trie_node *n;  	if (!t)  		return NULL; @@ -2108,7 +2059,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,  static void trie_collect_stats(struct trie *t, struct trie_stat *s)  { -	struct node *n; +	struct rt_trie_node *n;  	struct fib_trie_iter iter;  	memset(s, 0, sizeof(*s)); @@ -2118,14 +2069,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)  		if (IS_LEAF(n)) {  			struct leaf *l = (struct leaf *)n;  			struct leaf_info *li; -			struct hlist_node *tmp;  			s->leaves++;  			s->totdepth += iter.depth;  			if (iter.depth > s->maxdepth)  				s->maxdepth = iter.depth; -			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist) +			hlist_for_each_entry_rcu(li, &l->list, hlist)  				++s->prefixes;  		} else {  			const struct tnode *tn = (const struct tnode *) n; @@ -2173,7 +2123,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)  		max--;  	pointers = 0; -	for (i = 1; i <= max; i++) +	for (i = 1; i < max; i++)  		if (stat->nodesizes[i] != 0) {  			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);  			pointers += (1<<i) * stat->nodesizes[i]; @@ -2181,7 +2131,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)  	seq_putc(seq, 
'\n');  	seq_printf(seq, "\tPointers: %u\n", pointers); -	bytes += sizeof(struct node *) * pointers; +	bytes += sizeof(struct rt_trie_node *) * pointers;  	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);  	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);  } @@ -2226,10 +2176,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {  		struct hlist_head *head = &net->ipv4.fib_table_hash[h]; -		struct hlist_node *node;  		struct fib_table *tb; -		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { +		hlist_for_each_entry_rcu(tb, head, tb_hlist) {  			struct trie *t = (struct trie *) tb->tb_data;  			struct trie_stat stat; @@ -2262,7 +2211,7 @@ static const struct file_operations fib_triestat_fops = {  	.release = single_release_net,  }; -static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)  {  	struct fib_trie_iter *iter = seq->private;  	struct net *net = seq_file_net(seq); @@ -2271,11 +2220,10 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {  		struct hlist_head *head = &net->ipv4.fib_table_hash[h]; -		struct hlist_node *node;  		struct fib_table *tb; -		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { -			struct node *n; +		hlist_for_each_entry_rcu(tb, head, tb_hlist) { +			struct rt_trie_node *n;  			for (n = fib_trie_get_first(iter,  						    (struct trie *) tb->tb_data); @@ -2304,7 +2252,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)  	struct fib_table *tb = iter->tb;  	struct hlist_node *tb_node;  	unsigned int h; -	struct node *n; +	struct rt_trie_node *n;  	++*pos;  	/* next node in same table */ @@ -2314,7 +2262,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)  	/* walk rest of this hash chain */  	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); -	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { +	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {  		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);  		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);  		if (n) @@ -2324,7 +2272,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)  	/* new hash chain */  	while (++h < FIB_TABLE_HASHSZ) {  		struct hlist_head *head = &net->ipv4.fib_table_hash[h]; -		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) { +		hlist_for_each_entry_rcu(tb, head, tb_hlist) {  			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);  			if (n)  				goto found; @@ -2390,7 +2338,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)  static int fib_trie_seq_show(struct seq_file *seq, void *v)  {  	const struct fib_trie_iter *iter = seq->private; -	struct node *n = v; +	struct rt_trie_node *n = v;  	if (!node_parent_rcu(n))  		fib_table_print(seq, iter->tb); @@ -2407,13 +2355,12 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)  	} else {  		struct leaf *l = (struct leaf *) n;  		struct leaf_info *li; -		struct hlist_node *node;  		__be32 val = htonl(l->key);  		seq_indent(seq, iter->depth);  		seq_printf(seq, "  |-- %pI4\n", &val); -		hlist_for_each_entry_rcu(li, node, &l->list, hlist) { +		hlist_for_each_entry_rcu(li, &l->list, hlist) {  			struct fib_alias *fa;  			list_for_each_entry_rcu(fa, &li->falh, fa_list) { @@ -2422,7 +2369,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)  			
	seq_indent(seq, iter->depth+1);  				seq_printf(seq, "  /%d %s %s", li->plen,  					   rtn_scope(buf1, sizeof(buf1), -						     fa->fa_scope), +						     fa->fa_info->fib_scope),  					   rtn_type(buf2, sizeof(buf2),  						    fa->fa_type));  				if (fa->fa_tos) @@ -2558,7 +2505,6 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)  {  	struct leaf *l = v;  	struct leaf_info *li; -	struct hlist_node *node;  	if (v == SEQ_START_TOKEN) {  		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2567,7 +2513,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)  		return 0;  	} -	hlist_for_each_entry_rcu(li, node, &l->list, hlist) { +	hlist_for_each_entry_rcu(li, &l->list, hlist) {  		struct fib_alias *fa;  		__be32 mask, prefix; @@ -2577,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)  		list_for_each_entry_rcu(fa, &li->falh, fa_list) {  			const struct fib_info *fi = fa->fa_info;  			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); -			int len;  			if (fa->fa_type == RTN_BROADCAST  			    || fa->fa_type == RTN_MULTICAST)  				continue; +			seq_setwidth(seq, 127); +  			if (fi)  				seq_printf(seq,  					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t" -					 "%d\t%08X\t%d\t%u\t%u%n", +					 "%d\t%08X\t%d\t%u\t%u",  					 fi->fib_dev ? fi->fib_dev->name : "*",  					 prefix,  					 fi->fib_nh->nh_gw, flags, 0, 0, @@ -2595,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)  					 (fi->fib_advmss ?  					  fi->fib_advmss + 40 : 0),  					 fi->fib_window, -					 fi->fib_rtt >> 3, &len); +					 fi->fib_rtt >> 3);  			else  				seq_printf(seq,  					 "*\t%08X\t%08X\t%04X\t%d\t%u\t" -					 "%d\t%08X\t%d\t%u\t%u%n", +					 "%d\t%08X\t%d\t%u\t%u",  					 prefix, 0, flags, 0, 0, 0, -					 mask, 0, 0, 0, &len); +					 mask, 0, 0, 0); -			seq_printf(seq, "%*s\n", 127 - len, ""); +			seq_pad(seq, '\n');  		}  	} @@ -2633,31 +2580,31 @@ static const struct file_operations fib_route_fops = {  int __net_init fib_proc_init(struct net *net)  { -	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops)) +	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))  		goto out1; -	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO, -				  &fib_triestat_fops)) +	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net, +			 &fib_triestat_fops))  		goto out2; -	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops)) +	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))  		goto out3;  	return 0;  out3: -	proc_net_remove(net, "fib_triestat"); +	remove_proc_entry("fib_triestat", net->proc_net);  out2: -	proc_net_remove(net, "fib_trie"); +	remove_proc_entry("fib_trie", net->proc_net);  out1:  	return -ENOMEM;  }  void __net_exit fib_proc_exit(struct net *net)  { -	proc_net_remove(net, "fib_trie"); -	proc_net_remove(net, "fib_triestat"); -	proc_net_remove(net, "route"); +	remove_proc_entry("fib_trie", net->proc_net); +	remove_proc_entry("fib_triestat", net->proc_net); +	remove_proc_entry("route", net->proc_net);  }  #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c deleted file mode 100644 index c6933f2ea31..00000000000 --- a/net/ipv4/gre.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - *	GRE over IPv4 demultiplexer driver - * - *	Authors: Dmitry Kozlov (xeb@mail.ru) - * - *	This program is free software; you can redistribute it and/or - *	modify it under the terms of the GNU General Public License - *	as published by the Free Software Foundation; either version - *	2 
of the License, or (at your option) any later version. - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/netdevice.h> -#include <linux/version.h> -#include <linux/spinlock.h> -#include <net/protocol.h> -#include <net/gre.h> - - -static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; -static DEFINE_SPINLOCK(gre_proto_lock); - -int gre_add_protocol(const struct gre_protocol *proto, u8 version) -{ -	if (version >= GREPROTO_MAX) -		goto err_out; - -	spin_lock(&gre_proto_lock); -	if (gre_proto[version]) -		goto err_out_unlock; - -	rcu_assign_pointer(gre_proto[version], proto); -	spin_unlock(&gre_proto_lock); -	return 0; - -err_out_unlock: -	spin_unlock(&gre_proto_lock); -err_out: -	return -1; -} -EXPORT_SYMBOL_GPL(gre_add_protocol); - -int gre_del_protocol(const struct gre_protocol *proto, u8 version) -{ -	if (version >= GREPROTO_MAX) -		goto err_out; - -	spin_lock(&gre_proto_lock); -	if (rcu_dereference_protected(gre_proto[version], -			lockdep_is_held(&gre_proto_lock)) != proto) -		goto err_out_unlock; -	rcu_assign_pointer(gre_proto[version], NULL); -	spin_unlock(&gre_proto_lock); -	synchronize_rcu(); -	return 0; - -err_out_unlock: -	spin_unlock(&gre_proto_lock); -err_out: -	return -1; -} -EXPORT_SYMBOL_GPL(gre_del_protocol); - -static int gre_rcv(struct sk_buff *skb) -{ -	const struct gre_protocol *proto; -	u8 ver; -	int ret; - -	if (!pskb_may_pull(skb, 12)) -		goto drop; - -	ver = skb->data[1]&0x7f; -	if (ver >= GREPROTO_MAX) -		goto drop; - -	rcu_read_lock(); -	proto = rcu_dereference(gre_proto[ver]); -	if (!proto || !proto->handler) -		goto drop_unlock; -	ret = proto->handler(skb); -	rcu_read_unlock(); -	return ret; - -drop_unlock: -	rcu_read_unlock(); -drop: -	kfree_skb(skb); -	return NET_RX_DROP; -} - -static void gre_err(struct sk_buff *skb, u32 info) -{ -	const struct gre_protocol *proto; -	u8 ver; - -	if (!pskb_may_pull(skb, 12)) -		goto drop; - -	ver = skb->data[1]&0x7f; -	if (ver >= GREPROTO_MAX) -		goto drop; - -	rcu_read_lock(); -	proto = rcu_dereference(gre_proto[ver]); -	if (!proto || !proto->err_handler) -		goto drop_unlock; -	proto->err_handler(skb, info); -	rcu_read_unlock(); -	return; - -drop_unlock: -	rcu_read_unlock(); -drop: -	kfree_skb(skb); -} - -static const struct net_protocol net_gre_protocol = { -	.handler     = gre_rcv, -	.err_handler = gre_err, -	.netns_ok    = 1, -}; - -static int __init gre_init(void) -{ -	pr_info("GRE over IPv4 demultiplexor driver"); - -	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { -		pr_err("gre: can't add protocol\n"); -		return -EAGAIN; -	} - -	return 0; -} - -static void __exit gre_exit(void) -{ -	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); -} - -module_init(gre_init); -module_exit(gre_exit); - -MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); -MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); -MODULE_LICENSE("GPL"); - diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c new file mode 100644 index 00000000000..0485bf7f8f0 --- /dev/null +++ b/net/ipv4/gre_demux.c @@ -0,0 +1,364 @@ +/* + *	GRE over IPv4 demultiplexer driver + * + *	Authors: Dmitry Kozlov (xeb@mail.ru) + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
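Both the gre.c removed above and the gre_demux.c added here dispatch receive and error handling on the GRE version bits: gre_rcv() masks the second header octet with 0x7f and drops anything at or above GREPROTO_MAX. A standalone sketch of that check, with hypothetical toy_* names:

	#include <stdint.h>

	#define TOY_GREPROTO_MAX 2	/* one handler slot per GRE version;
					 * this patch registers the native
					 * handler at GREPROTO_CISCO (0). */

	/* Mirrors the gre_rcv() version test on the second GRE header octet. */
	static int toy_gre_version(const uint8_t *gre_hdr)
	{
		uint8_t ver = gre_hdr[1] & 0x7f;

		return ver < TOY_GREPROTO_MAX ? ver : -1;
	}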
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/if.h> +#include <linux/icmp.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/netdevice.h> +#include <linux/if_tunnel.h> +#include <linux/spinlock.h> +#include <net/protocol.h> +#include <net/gre.h> + +#include <net/icmp.h> +#include <net/route.h> +#include <net/xfrm.h> + +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; +static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX]; + +int gre_add_protocol(const struct gre_protocol *proto, u8 version) +{ +	if (version >= GREPROTO_MAX) +		return -EINVAL; + +	return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ? +		0 : -EBUSY; +} +EXPORT_SYMBOL_GPL(gre_add_protocol); + +int gre_del_protocol(const struct gre_protocol *proto, u8 version) +{ +	int ret; + +	if (version >= GREPROTO_MAX) +		return -EINVAL; + +	ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ? +		0 : -EBUSY; + +	if (ret) +		return ret; + +	synchronize_rcu(); +	return 0; +} +EXPORT_SYMBOL_GPL(gre_del_protocol); + +void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, +		      int hdr_len) +{ +	struct gre_base_hdr *greh; + +	skb_push(skb, hdr_len); + +	skb_reset_transport_header(skb); +	greh = (struct gre_base_hdr *)skb->data; +	greh->flags = tnl_flags_to_gre_flags(tpi->flags); +	greh->protocol = tpi->proto; + +	if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { +		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + +		if (tpi->flags&TUNNEL_SEQ) { +			*ptr = tpi->seq; +			ptr--; +		} +		if (tpi->flags&TUNNEL_KEY) { +			*ptr = tpi->key; +			ptr--; +		} +		if (tpi->flags&TUNNEL_CSUM && +		    !(skb_shinfo(skb)->gso_type & +		      (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { +			*ptr = 0; +			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, +								 skb->len, 0)); +		} +	} +} +EXPORT_SYMBOL_GPL(gre_build_header); + +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, +			    bool *csum_err) +{ +	unsigned int ip_hlen = ip_hdrlen(skb); +	const struct gre_base_hdr *greh; +	__be32 *options; +	int hdr_len; + +	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) +		return -EINVAL; + +	greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); +	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) +		return -EINVAL; + +	tpi->flags = gre_flags_to_tnl_flags(greh->flags); +	hdr_len = ip_gre_calc_hlen(tpi->flags); + +	if (!pskb_may_pull(skb, hdr_len)) +		return -EINVAL; + +	greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen); +	tpi->proto = greh->protocol; + +	options = (__be32 *)(greh + 1); +	if (greh->flags & GRE_CSUM) { +		if (skb_checksum_simple_validate(skb)) { +			*csum_err = true; +			return -EINVAL; +		} +		options++; +	} + +	if (greh->flags & GRE_KEY) { +		tpi->key = *options; +		options++; +	} else +		tpi->key = 0; + +	if (unlikely(greh->flags & GRE_SEQ)) { +		tpi->seq = *options; +		options++; +	} else +		tpi->seq = 0; + +	/* WCCP version 1 and 2 protocol decoding. 
+	 * - Change protocol to IP +	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header +	 */ +	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { +		tpi->proto = htons(ETH_P_IP); +		if ((*(u8 *)options & 0xF0) != 0x40) { +			hdr_len += 4; +			if (!pskb_may_pull(skb, hdr_len)) +				return -EINVAL; +		} +	} + +	return iptunnel_pull_header(skb, hdr_len, tpi->proto); +} + +static int gre_cisco_rcv(struct sk_buff *skb) +{ +	struct tnl_ptk_info tpi; +	int i; +	bool csum_err = false; + +#ifdef CONFIG_NET_IPGRE_BROADCAST +	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { +		/* Looped back packet, drop it! */ +		if (rt_is_output_route(skb_rtable(skb))) +			goto drop; +	} +#endif + +	if (parse_gre_header(skb, &tpi, &csum_err) < 0) +		goto drop; + +	rcu_read_lock(); +	for (i = 0; i < GRE_IP_PROTO_MAX; i++) { +		struct gre_cisco_protocol *proto; +		int ret; + +		proto = rcu_dereference(gre_cisco_proto_list[i]); +		if (!proto) +			continue; +		ret = proto->handler(skb, &tpi); +		if (ret == PACKET_RCVD) { +			rcu_read_unlock(); +			return 0; +		} +	} +	rcu_read_unlock(); + +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: +	kfree_skb(skb); +	return 0; +} + +static void gre_cisco_err(struct sk_buff *skb, u32 info) +{ +	/* All the routers (except for Linux) return only +	 * 8 bytes of packet payload. It means, that precise relaying of +	 * ICMP in the real Internet is absolutely infeasible. +	 * +	 * Moreover, Cisco "wise men" put GRE key to the third word +	 * in GRE header. It makes impossible maintaining even soft +	 * state for keyed +	 * GRE tunnels with enabled checksum. Tell them "thank you". +	 * +	 * Well, I wonder, rfc1812 was written by Cisco employee, +	 * what the hell these idiots break standards established +	 * by themselves??? +	 */ + +	const int type = icmp_hdr(skb)->type; +	const int code = icmp_hdr(skb)->code; +	struct tnl_ptk_info tpi; +	bool csum_err = false; +	int i; + +	if (parse_gre_header(skb, &tpi, &csum_err)) { +		if (!csum_err)		/* ignore csum errors. 
*/ +			return; +	} + +	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { +		ipv4_update_pmtu(skb, dev_net(skb->dev), info, +				skb->dev->ifindex, 0, IPPROTO_GRE, 0); +		return; +	} +	if (type == ICMP_REDIRECT) { +		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, +				IPPROTO_GRE, 0); +		return; +	} + +	rcu_read_lock(); +	for (i = 0; i < GRE_IP_PROTO_MAX; i++) { +		struct gre_cisco_protocol *proto; + +		proto = rcu_dereference(gre_cisco_proto_list[i]); +		if (!proto) +			continue; + +		if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD) +			goto out; + +	} +out: +	rcu_read_unlock(); +} + +static int gre_rcv(struct sk_buff *skb) +{ +	const struct gre_protocol *proto; +	u8 ver; +	int ret; + +	if (!pskb_may_pull(skb, 12)) +		goto drop; + +	ver = skb->data[1]&0x7f; +	if (ver >= GREPROTO_MAX) +		goto drop; + +	rcu_read_lock(); +	proto = rcu_dereference(gre_proto[ver]); +	if (!proto || !proto->handler) +		goto drop_unlock; +	ret = proto->handler(skb); +	rcu_read_unlock(); +	return ret; + +drop_unlock: +	rcu_read_unlock(); +drop: +	kfree_skb(skb); +	return NET_RX_DROP; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ +	const struct gre_protocol *proto; +	const struct iphdr *iph = (const struct iphdr *)skb->data; +	u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; + +	if (ver >= GREPROTO_MAX) +		return; + +	rcu_read_lock(); +	proto = rcu_dereference(gre_proto[ver]); +	if (proto && proto->err_handler) +		proto->err_handler(skb, info); +	rcu_read_unlock(); +} + +static const struct net_protocol net_gre_protocol = { +	.handler     = gre_rcv, +	.err_handler = gre_err, +	.netns_ok    = 1, +}; + +static const struct gre_protocol ipgre_protocol = { +	.handler     = gre_cisco_rcv, +	.err_handler = gre_cisco_err, +}; + +int gre_cisco_register(struct gre_cisco_protocol *newp) +{ +	struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) +					    &gre_cisco_proto_list[newp->priority]; + +	return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY; +} +EXPORT_SYMBOL_GPL(gre_cisco_register); + +int gre_cisco_unregister(struct gre_cisco_protocol *del_proto) +{ +	struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) +					    &gre_cisco_proto_list[del_proto->priority]; +	int ret; + +	ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL; + +	if (ret) +		return ret; + +	synchronize_net(); +	return 0; +} +EXPORT_SYMBOL_GPL(gre_cisco_unregister); + +static int __init gre_init(void) +{ +	pr_info("GRE over IPv4 demultiplexor driver\n"); + +	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { +		pr_err("can't add protocol\n"); +		goto err; +	} + +	if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) { +		pr_info("%s: can't add ipgre handler\n", __func__); +		goto err_gre; +	} + +	return 0; +err_gre: +	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +err: +	return -EAGAIN; +} + +static void __exit gre_exit(void) +{ +	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); +	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +} + +module_init(gre_init); +module_exit(gre_exit); + +MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); +MODULE_AUTHOR("D. 
Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c new file mode 100644 index 00000000000..f0bdd47bbbc --- /dev/null +++ b/net/ipv4/gre_offload.c @@ -0,0 +1,298 @@ +/* + *	IPV4 GSO/GRO offload support + *	Linux INET implementation + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + * + *	GRE GSO support + */ + +#include <linux/skbuff.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/gre.h> + +static int gre_gso_send_check(struct sk_buff *skb) +{ +	if (!skb->encapsulation) +		return -EINVAL; +	return 0; +} + +static struct sk_buff *gre_gso_segment(struct sk_buff *skb, +				       netdev_features_t features) +{ +	struct sk_buff *segs = ERR_PTR(-EINVAL); +	netdev_features_t enc_features; +	int ghl; +	struct gre_base_hdr *greh; +	u16 mac_offset = skb->mac_header; +	int mac_len = skb->mac_len; +	__be16 protocol = skb->protocol; +	int tnl_hlen; +	bool csum; + +	if (unlikely(skb_shinfo(skb)->gso_type & +				~(SKB_GSO_TCPV4 | +				  SKB_GSO_TCPV6 | +				  SKB_GSO_UDP | +				  SKB_GSO_DODGY | +				  SKB_GSO_TCP_ECN | +				  SKB_GSO_GRE | +				  SKB_GSO_GRE_CSUM | +				  SKB_GSO_IPIP))) +		goto out; + +	if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) +		goto out; + +	greh = (struct gre_base_hdr *)skb_transport_header(skb); + +	ghl = skb_inner_network_header(skb) - skb_transport_header(skb); +	if (unlikely(ghl < sizeof(*greh))) +		goto out; + +	csum = !!(greh->flags & GRE_CSUM); +	if (csum) +		skb->encap_hdr_csum = 1; + +	if (unlikely(!pskb_may_pull(skb, ghl))) +		goto out; + +	/* setup inner skb. */ +	skb->protocol = greh->protocol; +	skb->encapsulation = 0; + +	__skb_pull(skb, ghl); +	skb_reset_mac_header(skb); +	skb_set_network_header(skb, skb_inner_network_offset(skb)); +	skb->mac_len = skb_inner_network_offset(skb); + +	/* segment inner packet. */ +	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); +	segs = skb_mac_gso_segment(skb, enc_features); +	if (!segs || IS_ERR(segs)) { +		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); +		goto out; +	} + +	skb = segs; +	tnl_hlen = skb_tnl_header_len(skb); +	do { +		__skb_push(skb, ghl); +		if (csum) { +			__be32 *pcsum; + +			if (skb_has_shared_frag(skb)) { +				int err; + +				err = __skb_linearize(skb); +				if (err) { +					kfree_skb_list(segs); +					segs = ERR_PTR(err); +					goto out; +				} +			} + +			skb_reset_transport_header(skb); + +			greh = (struct gre_base_hdr *) +			    skb_transport_header(skb); +			pcsum = (__be32 *)(greh + 1); +			*pcsum = 0; +			*(__sum16 *)pcsum = gso_make_checksum(skb, 0); +		} +		__skb_push(skb, tnl_hlen - ghl); + +		skb_reset_inner_headers(skb); +		skb->encapsulation = 1; + +		skb_reset_mac_header(skb); +		skb_set_network_header(skb, mac_len); +		skb->mac_len = mac_len; +		skb->protocol = protocol; +	} while ((skb = skb->next)); +out: +	return segs; +} + +/* Compute the whole skb csum in s/w and store it, then verify GRO csum + * starting from gro_offset. 
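The GRE checksum that gre_gso_segment() above rebuilds for each segment, and that gro_skb_checksum()/gre_gro_receive() below verify, is the ordinary Internet one's-complement sum: computed over header plus payload, it folds to 0 when the stored checksum field is already correct (hence the "adds up to 0xffff" comment further down). A self-contained sketch with hypothetical names, not the kernel's optimized csum_* helpers:

	#include <stdint.h>
	#include <stddef.h>

	/* RFC 1071-style 16-bit one's-complement checksum of a short buffer. */
	static uint16_t toy_inet_csum(const uint8_t *data, size_t len)
	{
		uint32_t sum = 0;
		size_t i;

		for (i = 0; i + 1 < len; i += 2)
			sum += (uint32_t)((data[i] << 8) | data[i + 1]);
		if (len & 1)
			sum += (uint32_t)data[len - 1] << 8;

		while (sum >> 16)	/* fold the carries, like csum_fold() */
			sum = (sum & 0xffff) + (sum >> 16);

		return (uint16_t)~sum;	/* 0 iff the buffer already sums to 0xffff */
	}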
+ */ +static __sum16 gro_skb_checksum(struct sk_buff *skb) +{ +	__sum16 sum; + +	skb->csum = skb_checksum(skb, 0, skb->len, 0); +	NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, +		csum_partial(skb->data, skb_gro_offset(skb), 0)); +	sum = csum_fold(NAPI_GRO_CB(skb)->csum); +	if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { +		if (unlikely(!sum) && !skb->csum_complete_sw) +			netdev_rx_csum_fault(skb->dev); +	} else { +		skb->ip_summed = CHECKSUM_COMPLETE; +		skb->csum_complete_sw = 1; +	} + +	return sum; +} + +static struct sk_buff **gre_gro_receive(struct sk_buff **head, +					struct sk_buff *skb) +{ +	struct sk_buff **pp = NULL; +	struct sk_buff *p; +	const struct gre_base_hdr *greh; +	unsigned int hlen, grehlen; +	unsigned int off; +	int flush = 1; +	struct packet_offload *ptype; +	__be16 type; + +	off = skb_gro_offset(skb); +	hlen = off + sizeof(*greh); +	greh = skb_gro_header_fast(skb, off); +	if (skb_gro_header_hard(skb, hlen)) { +		greh = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!greh)) +			goto out; +	} + +	/* Only support version 0 and K (key), C (csum) flags. Note that +	 * although the support for the S (seq#) flag can be added easily +	 * for GRO, this is problematic for GSO hence can not be enabled +	 * here because a GRO pkt may end up in the forwarding path, thus +	 * requiring GSO support to break it up correctly. +	 */ +	if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) +		goto out; + +	type = greh->protocol; + +	rcu_read_lock(); +	ptype = gro_find_receive_by_type(type); +	if (ptype == NULL) +		goto out_unlock; + +	grehlen = GRE_HEADER_SECTION; + +	if (greh->flags & GRE_KEY) +		grehlen += GRE_HEADER_SECTION; + +	if (greh->flags & GRE_CSUM) +		grehlen += GRE_HEADER_SECTION; + +	hlen = off + grehlen; +	if (skb_gro_header_hard(skb, hlen)) { +		greh = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!greh)) +			goto out_unlock; +	} +	if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ +		__sum16 csum = 0; + +		if (skb->ip_summed == CHECKSUM_COMPLETE) +			csum = csum_fold(NAPI_GRO_CB(skb)->csum); +		/* Don't trust csum error calculated/reported by h/w */ +		if (skb->ip_summed == CHECKSUM_NONE || csum != 0) +			csum = gro_skb_checksum(skb); + +		/* GRE CSUM is the 1's complement of the 1's complement sum +		 * of the GRE hdr plus payload so it should add up to 0xffff +		 * (and 0 after csum_fold()) just like the IPv4 hdr csum. +		 */ +		if (csum) +			goto out_unlock; +	} +	flush = 0; + +	for (p = *head; p; p = p->next) { +		const struct gre_base_hdr *greh2; + +		if (!NAPI_GRO_CB(p)->same_flow) +			continue; + +		/* The following checks are needed to ensure only pkts +		 * from the same tunnel are considered for aggregation. +		 * The criteria for "the same tunnel" includes: +		 * 1) same version (we only support version 0 here) +		 * 2) same protocol (we only support ETH_P_IP for now) +		 * 3) same set of flags +		 * 4) same key if the key field is present. 
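gre_gro_receive() above and gre_gro_complete() below both size the GRE header in GRE_HEADER_SECTION (4-byte) steps from the flag bits, after refusing to aggregate anything with flags other than CSUM and KEY. A minimal sketch of that length calculation (toy flag constants in host order; the kernel's GRE_* flags are __be16 values):

	#include <stdint.h>

	#define TOY_GRE_HEADER_SECTION	4	/* flags+proto, and each optional word */
	#define TOY_GRE_CSUM		0x8000
	#define TOY_GRE_KEY		0x2000

	/* Base header plus one 4-byte word per optional field present;
	 * SEQ is not counted because the GRO path refuses to aggregate it.
	 */
	static unsigned int toy_gre_gro_hlen(uint16_t flags)
	{
		unsigned int hlen = TOY_GRE_HEADER_SECTION;

		if (flags & TOY_GRE_CSUM)
			hlen += TOY_GRE_HEADER_SECTION;
		if (flags & TOY_GRE_KEY)
			hlen += TOY_GRE_HEADER_SECTION;
		return hlen;
	}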
+		 */ +		greh2 = (struct gre_base_hdr *)(p->data + off); + +		if (greh2->flags != greh->flags || +		    greh2->protocol != greh->protocol) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} +		if (greh->flags & GRE_KEY) { +			/* compare keys */ +			if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { +				NAPI_GRO_CB(p)->same_flow = 0; +				continue; +			} +		} +	} + +	skb_gro_pull(skb, grehlen); + +	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ +	skb_gro_postpull_rcsum(skb, greh, grehlen); + +	pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: +	rcu_read_unlock(); +out: +	NAPI_GRO_CB(skb)->flush |= flush; + +	return pp; +} + +static int gre_gro_complete(struct sk_buff *skb, int nhoff) +{ +	struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); +	struct packet_offload *ptype; +	unsigned int grehlen = sizeof(*greh); +	int err = -ENOENT; +	__be16 type; + +	skb->encapsulation = 1; +	skb_shinfo(skb)->gso_type = SKB_GSO_GRE; + +	type = greh->protocol; +	if (greh->flags & GRE_KEY) +		grehlen += GRE_HEADER_SECTION; + +	if (greh->flags & GRE_CSUM) +		grehlen += GRE_HEADER_SECTION; + +	rcu_read_lock(); +	ptype = gro_find_complete_by_type(type); +	if (ptype != NULL) +		err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); + +	rcu_read_unlock(); +	return err; +} + +static const struct net_offload gre_offload = { +	.callbacks = { +		.gso_send_check = gre_gso_send_check, +		.gso_segment = gre_gso_segment, +		.gro_receive = gre_gro_receive, +		.gro_complete = gre_gro_complete, +	}, +}; + +static int __init gre_offload_init(void) +{ +	return inet_add_offload(&gre_offload, IPPROTO_GRE); +} +device_initcall(gre_offload_init); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4aa1b7f01ea..42b7bcf8045 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -62,6 +62,8 @@   *   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/module.h>  #include <linux/types.h>  #include <linux/jiffies.h> @@ -83,16 +85,17 @@  #include <net/tcp.h>  #include <net/udp.h>  #include <net/raw.h> +#include <net/ping.h>  #include <linux/skbuff.h>  #include <net/sock.h>  #include <linux/errno.h>  #include <linux/timer.h>  #include <linux/init.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include <net/checksum.h>  #include <net/xfrm.h>  #include <net/inet_common.h> +#include <net/ip_fib.h>  /*   *	Build xmit assembly blocks @@ -108,8 +111,7 @@ struct icmp_bxm {  		__be32	       times[3];  	} data;  	int head_len; -	struct ip_options replyopts; -	unsigned char  optbuf[40]; +	struct ip_options_data replyopts;  };  /* An array of errno for error messages from dest unreach. */ @@ -233,48 +235,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)   *	Send an ICMP frame.   */ -/* - *	Check transmit rate limitation for given message. - *	The rate information is held in the destination cache now. - *	This function is generic and could be used for other purposes - *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. - * - *	Note that the same dst_entry fields are modified by functions in - *	route.c too, but these work for packet destinations while xrlim_allow - *	works for icmp destinations. This means the rate limiting information - *	for one "ip object" is shared - and these ICMPs are twice limited: - *	by source and by destination. - * - *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - *			  SHOULD allow setting of rate limits - * - * 	Shared between ICMPv4 and ICMPv6. 
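The comment above describes a token bucket: the xrlim_allow() being removed just below implements it against dst->rate_tokens/rate_last, while the replacement icmpv4_xrlim_allow() further down charges the same limit to an inet_peer via inet_peer_xrlim_allow(). A minimal standalone sketch of the bucket itself, hypothetical names:

	#include <stdbool.h>

	struct toy_ratelimit {
		unsigned long rate_tokens;	/* accumulated credit, in jiffies */
		unsigned long rate_last;	/* time of last update, in jiffies */
	};

	#define TOY_BURST_FACTOR 6

	/* Credit elapsed time, cap it at a burst of several timeouts, and
	 * spend one timeout's worth for every message that is allowed.
	 */
	static bool toy_xrlim_allow(struct toy_ratelimit *rl, unsigned long now,
				    unsigned long timeout)
	{
		unsigned long token = rl->rate_tokens + (now - rl->rate_last);
		bool ok = false;

		rl->rate_last = now;
		if (token > TOY_BURST_FACTOR * timeout)
			token = TOY_BURST_FACTOR * timeout;
		if (token >= timeout) {
			token -= timeout;
			ok = true;
		}
		rl->rate_tokens = token;
		return ok;
	}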
- */ -#define XRLIM_BURST_FACTOR 6 -int xrlim_allow(struct dst_entry *dst, int timeout) -{ -	unsigned long now, token = dst->rate_tokens; -	int rc = 0; - -	now = jiffies; -	token += now - dst->rate_last; -	dst->rate_last = now; -	if (token > XRLIM_BURST_FACTOR * timeout) -		token = XRLIM_BURST_FACTOR * timeout; -	if (token >= timeout) { -		token -= timeout; -		rc = 1; -	} -	dst->rate_tokens = token; -	return rc; -} -EXPORT_SYMBOL(xrlim_allow); - -static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, -		int type, int code) +static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, +				      struct flowi4 *fl4, int type, int code)  {  	struct dst_entry *dst = &rt->dst; -	int rc = 1; +	bool rc = true;  	if (type > NR_ICMP_TYPES)  		goto out; @@ -288,8 +253,13 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,  		goto out;  	/* Limit if icmp type is enabled in ratemask. */ -	if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) -		rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); +	if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { +		struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); +		rc = inet_peer_xrlim_allow(peer, +					   net->ipv4.sysctl_icmp_ratelimit); +		if (peer) +			inet_putpeer(peer); +	}  out:  	return rc;  } @@ -324,13 +294,14 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,  }  static void icmp_push_reply(struct icmp_bxm *icmp_param, +			    struct flowi4 *fl4,  			    struct ipcm_cookie *ipc, struct rtable **rt)  {  	struct sock *sk;  	struct sk_buff *skb;  	sk = icmp_sk(dev_net((*rt)->dst.dev)); -	if (ip_append_data(sk, icmp_glue_bits, icmp_param, +	if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,  			   icmp_param->data_len+icmp_param->head_len,  			   icmp_param->head_len,  			   ipc, rt, MSG_DONTWAIT) < 0) { @@ -349,7 +320,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,  						 icmp_param->head_len, csum);  		icmph->checksum = csum_fold(csum);  		skb->ip_summed = CHECKSUM_NONE; -		ip_push_pending_frames(sk); +		ip_push_pending_frames(sk, fl4);  	}  } @@ -362,11 +333,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)  	struct ipcm_cookie ipc;  	struct rtable *rt = skb_rtable(skb);  	struct net *net = dev_net(rt->dst.dev); +	struct flowi4 fl4;  	struct sock *sk;  	struct inet_sock *inet; -	__be32 daddr; +	__be32 daddr, saddr; +	u32 mark = IP4_REPLY_MARK(net, skb->mark); -	if (ip_options_echo(&icmp_param->replyopts, skb)) +	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))  		return;  	sk = icmp_xmit_lock(net); @@ -377,31 +350,129 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)  	icmp_param->data.icmph.checksum = 0;  	inet->tos = ip_hdr(skb)->tos; -	daddr = ipc.addr = rt->rt_src; +	sk->sk_mark = mark; +	daddr = ipc.addr = ip_hdr(skb)->saddr; +	saddr = fib_compute_spec_dst(skb);  	ipc.opt = NULL;  	ipc.tx_flags = 0; -	if (icmp_param->replyopts.optlen) { -		ipc.opt = &icmp_param->replyopts; -		if (ipc.opt->srr) -			daddr = icmp_param->replyopts.faddr; -	} -	{ -		struct flowi fl = { .fl4_dst= daddr, -				    .fl4_src = rt->rt_spec_dst, -				    .fl4_tos = RT_TOS(ip_hdr(skb)->tos), -				    .proto = IPPROTO_ICMP }; -		security_skb_classify_flow(skb, &fl); -		if (ip_route_output_key(net, &rt, &fl)) -			goto out_unlock; +	ipc.ttl = 0; +	ipc.tos = -1; + +	if (icmp_param->replyopts.opt.opt.optlen) { +		ipc.opt = &icmp_param->replyopts.opt; +		if (ipc.opt->opt.srr) +			daddr = 
icmp_param->replyopts.opt.opt.faddr;  	} -	if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, +	memset(&fl4, 0, sizeof(fl4)); +	fl4.daddr = daddr; +	fl4.saddr = saddr; +	fl4.flowi4_mark = mark; +	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); +	fl4.flowi4_proto = IPPROTO_ICMP; +	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); +	rt = ip_route_output_key(net, &fl4); +	if (IS_ERR(rt)) +		goto out_unlock; +	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,  			       icmp_param->data.icmph.code)) -		icmp_push_reply(icmp_param, &ipc, &rt); +		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);  	ip_rt_put(rt);  out_unlock:  	icmp_xmit_unlock(sk);  } +static struct rtable *icmp_route_lookup(struct net *net, +					struct flowi4 *fl4, +					struct sk_buff *skb_in, +					const struct iphdr *iph, +					__be32 saddr, u8 tos, u32 mark, +					int type, int code, +					struct icmp_bxm *param) +{ +	struct rtable *rt, *rt2; +	struct flowi4 fl4_dec; +	int err; + +	memset(fl4, 0, sizeof(*fl4)); +	fl4->daddr = (param->replyopts.opt.opt.srr ? +		      param->replyopts.opt.opt.faddr : iph->saddr); +	fl4->saddr = saddr; +	fl4->flowi4_mark = mark; +	fl4->flowi4_tos = RT_TOS(tos); +	fl4->flowi4_proto = IPPROTO_ICMP; +	fl4->fl4_icmp_type = type; +	fl4->fl4_icmp_code = code; +	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); +	rt = __ip_route_output_key(net, fl4); +	if (IS_ERR(rt)) +		return rt; + +	/* No need to clone since we're just using its address. */ +	rt2 = rt; + +	rt = (struct rtable *) xfrm_lookup(net, &rt->dst, +					   flowi4_to_flowi(fl4), NULL, 0); +	if (!IS_ERR(rt)) { +		if (rt != rt2) +			return rt; +	} else if (PTR_ERR(rt) == -EPERM) { +		rt = NULL; +	} else +		return rt; + +	err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); +	if (err) +		goto relookup_failed; + +	if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { +		rt2 = __ip_route_output_key(net, &fl4_dec); +		if (IS_ERR(rt2)) +			err = PTR_ERR(rt2); +	} else { +		struct flowi4 fl4_2 = {}; +		unsigned long orefdst; + +		fl4_2.daddr = fl4_dec.saddr; +		rt2 = ip_route_output_key(net, &fl4_2); +		if (IS_ERR(rt2)) { +			err = PTR_ERR(rt2); +			goto relookup_failed; +		} +		/* Ugh! 
*/ +		orefdst = skb_in->_skb_refdst; /* save old refdst */ +		err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, +				     RT_TOS(tos), rt2->dst.dev); + +		dst_release(&rt2->dst); +		rt2 = skb_rtable(skb_in); +		skb_in->_skb_refdst = orefdst; /* restore old refdst */ +	} + +	if (err) +		goto relookup_failed; + +	rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, +					    flowi4_to_flowi(&fl4_dec), NULL, +					    XFRM_LOOKUP_ICMP); +	if (!IS_ERR(rt2)) { +		dst_release(&rt->dst); +		memcpy(fl4, &fl4_dec, sizeof(*fl4)); +		rt = rt2; +	} else if (PTR_ERR(rt2) == -EPERM) { +		if (rt) +			dst_release(&rt->dst); +		return rt2; +	} else { +		err = PTR_ERR(rt2); +		goto relookup_failed; +	} +	return rt; + +relookup_failed: +	if (rt) +		return rt; +	return ERR_PTR(err); +}  /*   *	Send an ICMP message in response to a situation @@ -418,11 +489,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  {  	struct iphdr *iph;  	int room; -	struct icmp_bxm icmp_param; +	struct icmp_bxm *icmp_param;  	struct rtable *rt = skb_rtable(skb_in);  	struct ipcm_cookie ipc; +	struct flowi4 fl4;  	__be32 saddr;  	u8  tos; +	u32 mark;  	struct net *net;  	struct sock *sk; @@ -438,7 +511,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  	iph = ip_hdr(skb_in);  	if ((u8 *)iph < skb_in->head || -	    (skb_in->network_header + sizeof(*iph)) > skb_in->tail) +	    (skb_network_header(skb_in) + sizeof(*iph)) > +	    skb_tail_pointer(skb_in))  		goto out;  	/* @@ -492,9 +566,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  		}  	} +	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); +	if (!icmp_param) +		return; +  	sk = icmp_xmit_lock(net);  	if (sk == NULL) -		return; +		goto out_free;  	/*  	 *	Construct source address and options. @@ -507,7 +585,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  		rcu_read_lock();  		if (rt_is_input_route(rt) &&  		    net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) -			dev = dev_get_by_index_rcu(net, rt->fl.iif); +			dev = dev_get_by_index_rcu(net, inet_iif(skb_in));  		if (dev)  			saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -519,8 +597,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |  					   IPTOS_PREC_INTERNETCONTROL) :  					  iph->tos; +	mark = IP4_REPLY_MARK(net, skb_in->mark); -	if (ip_options_echo(&icmp_param.replyopts, skb_in)) +	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))  		goto out_unlock; @@ -528,98 +607,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)  	 *	Prepare data for ICMP header.  	 */ -	icmp_param.data.icmph.type	 = type; -	icmp_param.data.icmph.code	 = code; -	icmp_param.data.icmph.un.gateway = info; -	icmp_param.data.icmph.checksum	 = 0; -	icmp_param.skb	  = skb_in; -	icmp_param.offset = skb_network_offset(skb_in); +	icmp_param->data.icmph.type	 = type; +	icmp_param->data.icmph.code	 = code; +	icmp_param->data.icmph.un.gateway = info; +	icmp_param->data.icmph.checksum	 = 0; +	icmp_param->skb	  = skb_in; +	icmp_param->offset = skb_network_offset(skb_in);  	inet_sk(sk)->tos = tos; +	sk->sk_mark = mark;  	ipc.addr = iph->saddr; -	ipc.opt = &icmp_param.replyopts; +	ipc.opt = &icmp_param->replyopts.opt;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1; -	{ -		struct flowi fl = { -			.fl4_dst = icmp_param.replyopts.srr ? 
-				   icmp_param.replyopts.faddr : iph->saddr, -			.fl4_src = saddr, -			.fl4_tos = RT_TOS(tos), -			.proto = IPPROTO_ICMP, -			.fl_icmp_type = type, -			.fl_icmp_code = code, -		}; -		int err; -		struct rtable *rt2; - -		security_skb_classify_flow(skb_in, &fl); -		if (__ip_route_output_key(net, &rt, &fl)) -			goto out_unlock; - -		/* No need to clone since we're just using its address. */ -		rt2 = rt; - -		if (!fl.nl_u.ip4_u.saddr) -			fl.nl_u.ip4_u.saddr = rt->rt_src; - -		err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); -		switch (err) { -		case 0: -			if (rt != rt2) -				goto route_done; -			break; -		case -EPERM: -			rt = NULL; -			break; -		default: -			goto out_unlock; -		} - -		if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) -			goto relookup_failed; - -		if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) -			err = __ip_route_output_key(net, &rt2, &fl); -		else { -			struct flowi fl2 = {}; -			unsigned long orefdst; - -			fl2.fl4_dst = fl.fl4_src; -			if (ip_route_output_key(net, &rt2, &fl2)) -				goto relookup_failed; - -			/* Ugh! */ -			orefdst = skb_in->_skb_refdst; /* save old refdst */ -			err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, -					     RT_TOS(tos), rt2->dst.dev); - -			dst_release(&rt2->dst); -			rt2 = skb_rtable(skb_in); -			skb_in->_skb_refdst = orefdst; /* restore old refdst */ -		} - -		if (err) -			goto relookup_failed; - -		err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL, -				  XFRM_LOOKUP_ICMP); -		switch (err) { -		case 0: -			dst_release(&rt->dst); -			rt = rt2; -			break; -		case -EPERM: -			goto ende; -		default: -relookup_failed: -			if (!rt) -				goto out_unlock; -			break; -		} -	} +	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, +			       type, code, icmp_param); +	if (IS_ERR(rt)) +		goto out_unlock; -route_done: -	if (!icmpv4_xrlim_allow(net, rt, type, code)) +	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))  		goto ende;  	/* RFC says return as much as we can without exceeding 576 bytes. */ @@ -627,36 +634,68 @@ route_done:  	room = dst_mtu(&rt->dst);  	if (room > 576)  		room = 576; -	room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; +	room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;  	room -= sizeof(struct icmphdr); -	icmp_param.data_len = skb_in->len - icmp_param.offset; -	if (icmp_param.data_len > room) -		icmp_param.data_len = room; -	icmp_param.head_len = sizeof(struct icmphdr); +	icmp_param->data_len = skb_in->len - icmp_param->offset; +	if (icmp_param->data_len > room) +		icmp_param->data_len = room; +	icmp_param->head_len = sizeof(struct icmphdr); -	icmp_push_reply(&icmp_param, &ipc, &rt); +	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);  ende:  	ip_rt_put(rt);  out_unlock:  	icmp_xmit_unlock(sk); +out_free: +	kfree(icmp_param);  out:;  }  EXPORT_SYMBOL(icmp_send); +static void icmp_socket_deliver(struct sk_buff *skb, u32 info) +{ +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	const struct net_protocol *ipprot; +	int protocol = iph->protocol; + +	/* Checkin full IP header plus 8 bytes of protocol to +	 * avoid additional coding at protocol handlers. 
+	 */ +	if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) +		return; + +	raw_icmp_error(skb, protocol, info); + +	rcu_read_lock(); +	ipprot = rcu_dereference(inet_protos[protocol]); +	if (ipprot && ipprot->err_handler) +		ipprot->err_handler(skb, info); +	rcu_read_unlock(); +} + +static bool icmp_tag_validation(int proto) +{ +	bool ok; + +	rcu_read_lock(); +	ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; +	rcu_read_unlock(); +	return ok; +} +  /* - *	Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + *	Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + *	ICMP_PARAMETERPROB.   */  static void icmp_unreach(struct sk_buff *skb)  { -	struct iphdr *iph; +	const struct iphdr *iph;  	struct icmphdr *icmph; -	int hash, protocol; -	const struct net_protocol *ipprot; -	u32 info = 0;  	struct net *net; +	u32 info = 0;  	net = dev_net(skb_dst(skb)->dev); @@ -670,7 +709,7 @@ static void icmp_unreach(struct sk_buff *skb)  		goto out_err;  	icmph = icmp_hdr(skb); -	iph   = (struct iphdr *)skb->data; +	iph   = (const struct iphdr *)skb->data;  	if (iph->ihl < 5) /* Mangled header, drop. */  		goto out_err; @@ -683,19 +722,27 @@ static void icmp_unreach(struct sk_buff *skb)  		case ICMP_PORT_UNREACH:  			break;  		case ICMP_FRAG_NEEDED: -			if (ipv4_config.no_pmtu_disc) { -				LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n", +			/* for documentation of the ip_no_pmtu_disc +			 * values please see +			 * Documentation/networking/ip-sysctl.txt +			 */ +			switch (net->ipv4.sysctl_ip_no_pmtu_disc) { +			default: +				LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),  					       &iph->daddr); -			} else { -				info = ip_rt_frag_needed(net, iph, -							 ntohs(icmph->un.frag.mtu), -							 skb->dev); -				if (!info) +				break; +			case 2: +				goto out; +			case 3: +				if (!icmp_tag_validation(iph->protocol))  					goto out; +				/* fall through */ +			case 0: +				info = ntohs(icmph->un.frag.mtu);  			}  			break;  		case ICMP_SR_FAILED: -			LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n", +			LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"),  				       &iph->daddr);  			break;  		default: @@ -718,7 +765,7 @@ static void icmp_unreach(struct sk_buff *skb)  	 */  	/* -	 *	Check the other end isnt violating RFC 1122. Some routers send +	 *	Check the other end isn't violating RFC 1122. Some routers send  	 *	bogus responses to broadcast frames. If you see this message  	 *	first check your netmask matches at both ends, if it does then  	 *	get the other vendor to fix their kit. @@ -726,37 +773,14 @@ static void icmp_unreach(struct sk_buff *skb)  	if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&  	    inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { -		if (net_ratelimit()) -			printk(KERN_WARNING "%pI4 sent an invalid ICMP " -					    "type %u, code %u " -					    "error to a broadcast: %pI4 on %s\n", -			       &ip_hdr(skb)->saddr, -			       icmph->type, icmph->code, -			       &iph->daddr, -			       skb->dev->name); +		net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", +				     &ip_hdr(skb)->saddr, +				     icmph->type, icmph->code, +				     &iph->daddr, skb->dev->name);  		goto out;  	} -	/* Checkin full IP header plus 8 bytes of protocol to -	 * avoid additional coding at protocol handlers. 
-	 */ -	if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) -		goto out; - -	iph = (struct iphdr *)skb->data; -	protocol = iph->protocol; - -	/* -	 *	Deliver ICMP message to raw sockets. Pretty useless feature? -	 */ -	raw_icmp_error(skb, protocol, info); - -	hash = protocol & (MAX_INET_PROTOS - 1); -	rcu_read_lock(); -	ipprot = rcu_dereference(inet_protos[hash]); -	if (ipprot && ipprot->err_handler) -		ipprot->err_handler(skb, info); -	rcu_read_unlock(); +	icmp_socket_deliver(skb, info);  out:  	return; @@ -772,37 +796,15 @@ out_err:  static void icmp_redirect(struct sk_buff *skb)  { -	struct iphdr *iph; - -	if (skb->len < sizeof(struct iphdr)) -		goto out_err; +	if (skb->len < sizeof(struct iphdr)) { +		ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); +		return; +	} -	/* -	 *	Get the copied header of the packet that caused the redirect -	 */  	if (!pskb_may_pull(skb, sizeof(struct iphdr))) -		goto out; - -	iph = (struct iphdr *)skb->data; +		return; -	switch (icmp_hdr(skb)->code & 7) { -	case ICMP_REDIR_NET: -	case ICMP_REDIR_NETTOS: -		/* -		 * As per RFC recommendations now handle it as a host redirect. -		 */ -	case ICMP_REDIR_HOST: -	case ICMP_REDIR_HOSTTOS: -		ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr, -			       icmp_hdr(skb)->un.gateway, -			       iph->saddr, skb->dev); -		break; -	} -out: -	return; -out_err: -	ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); -	goto out; +	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);  }  /* @@ -876,87 +878,6 @@ out_err:  	goto out;  } - -/* - *	Handle ICMP_ADDRESS_MASK requests.  (RFC950) - * - * RFC1122 (3.2.2.9).  A host MUST only send replies to - * ADDRESS_MASK requests if it's been configured as an address mask - * agent.  Receiving a request doesn't constitute implicit permission to - * act as one. Of course, implementing this correctly requires (SHOULD) - * a way to turn the functionality on and off.  Another one for sysctl(), - * I guess. -- MS - * - * RFC1812 (4.3.3.9).	A router MUST implement it. - *			A router SHOULD have switch turning it on/off. - *		      	This switch MUST be ON by default. - * - * Gratuitous replies, zero-source replies are not implemented, - * that complies with RFC. DO NOT implement them!!! All the idea - * of broadcast addrmask replies as specified in RFC950 is broken. - * The problem is that it is not uncommon to have several prefixes - * on one physical interface. Moreover, addrmask agent can even be - * not aware of existing another prefixes. - * If source is zero, addrmask agent cannot choose correct prefix. - * Gratuitous mask announcements suffer from the same problem. - * RFC1812 explains it, but still allows to use ADDRMASK, - * that is pretty silly. --ANK - * - * All these rules are so bizarre, that I removed kernel addrmask - * support at all. It is wrong, it is obsolete, nobody uses it in - * any case. --ANK - * - * Furthermore you can do it with a usermode address agent program - * anyway... - */ - -static void icmp_address(struct sk_buff *skb) -{ -#if 0 -	if (net_ratelimit()) -		printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); -#endif -} - -/* - * RFC1812 (4.3.3.9).	A router SHOULD listen all replies, and complain - *			loudly if an inconsistency is found. 
- * called with rcu_read_lock() - */ - -static void icmp_address_reply(struct sk_buff *skb) -{ -	struct rtable *rt = skb_rtable(skb); -	struct net_device *dev = skb->dev; -	struct in_device *in_dev; -	struct in_ifaddr *ifa; - -	if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) -		return; - -	in_dev = __in_dev_get_rcu(dev); -	if (!in_dev) -		return; - -	if (in_dev->ifa_list && -	    IN_DEV_LOG_MARTIANS(in_dev) && -	    IN_DEV_FORWARD(in_dev)) { -		__be32 _mask, *mp; - -		mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); -		BUG_ON(mp == NULL); -		for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { -			if (*mp == ifa->ifa_mask && -			    inet_ifa_match(rt->rt_src, ifa)) -				break; -		} -		if (!ifa && net_ratelimit()) { -			printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", -			       mp, dev->name, &rt->rt_src); -		} -	} -} -  static void icmp_discard(struct sk_buff *skb)  {  } @@ -992,16 +913,8 @@ int icmp_rcv(struct sk_buff *skb)  	ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); -	switch (skb->ip_summed) { -	case CHECKSUM_COMPLETE: -		if (!csum_fold(skb->csum)) -			break; -		/* fall through */ -	case CHECKSUM_NONE: -		skb->csum = 0; -		if (__skb_checksum_complete(skb)) -			goto error; -	} +	if (skb_checksum_simple_validate(skb)) +		goto csum_error;  	if (!pskb_pull(skb, sizeof(*icmph)))  		goto error; @@ -1048,17 +961,43 @@ int icmp_rcv(struct sk_buff *skb)  drop:  	kfree_skb(skb);  	return 0; +csum_error: +	ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);  error:  	ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);  	goto drop;  } +void icmp_err(struct sk_buff *skb, u32 info) +{ +	struct iphdr *iph = (struct iphdr *)skb->data; +	int offset = iph->ihl<<2; +	struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); +	int type = icmp_hdr(skb)->type; +	int code = icmp_hdr(skb)->code; +	struct net *net = dev_net(skb->dev); + +	/* +	 * Use ping_err to handle all icmp errors except those +	 * triggered by ICMP_ECHOREPLY which sent from kernel. +	 */ +	if (icmph->type != ICMP_ECHOREPLY) { +		ping_err(skb, offset, info); +		return; +	} + +	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) +		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); +	else if (type == ICMP_REDIRECT) +		ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); +} +  /*   *	This table is the definition of how we handle ICMP.   */  static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {  	[ICMP_ECHOREPLY] = { -		.handler = icmp_discard, +		.handler = ping_rcv,  	},  	[1] = {  		.handler = icmp_discard, @@ -1120,10 +1059,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {  		.handler = icmp_discard,  	},  	[ICMP_ADDRESS] = { -		.handler = icmp_address, +		.handler = icmp_discard,  	},  	[ICMP_ADDRESSREPLY] = { -		.handler = icmp_address_reply, +		.handler = icmp_discard,  	},  }; @@ -1157,10 +1096,9 @@ static int __net_init icmp_sk_init(struct net *net)  		net->ipv4.icmp_sk[i] = sk;  		/* Enough space for 2 64K ICMP packets, including -		 * sk_buff struct overhead. +		 * sk_buff/skb_shared_info struct overhead.  		 
*/ -		sk->sk_sndbuf = -			(2 * ((64 * 1024) + sizeof(struct sk_buff))); +		sk->sk_sndbuf =	2 * SKB_TRUESIZE(64 * 1024);  		/*  		 * Speedup sock_wfree() diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e0e77e297de..db710b059ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -73,7 +73,6 @@  #include <linux/module.h>  #include <linux/slab.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/types.h>  #include <linux/kernel.h>  #include <linux/jiffies.h> @@ -89,6 +88,7 @@  #include <linux/if_arp.h>  #include <linux/rtnetlink.h>  #include <linux/times.h> +#include <linux/pkt_sched.h>  #include <net/net_namespace.h>  #include <net/arp.h> @@ -114,7 +114,8 @@  #define IGMP_V1_Router_Present_Timeout		(400*HZ)  #define IGMP_V2_Router_Present_Timeout		(400*HZ) -#define IGMP_Unsolicited_Report_Interval	(10*HZ) +#define IGMP_V2_Unsolicited_Report_Interval	(10*HZ) +#define IGMP_V3_Unsolicited_Report_Interval	(1*HZ)  #define IGMP_Query_Response_Interval		(10*HZ)  #define IGMP_Unsolicited_Report_Count		2 @@ -139,6 +140,29 @@  	 ((in_dev)->mr_v2_seen && \  	  time_before(jiffies, (in_dev)->mr_v2_seen))) +static int unsolicited_report_interval(struct in_device *in_dev) +{ +	int interval_ms, interval_jiffies; + +	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) +		interval_ms = IN_DEV_CONF_GET( +			in_dev, +			IGMPV2_UNSOLICITED_REPORT_INTERVAL); +	else /* v3 */ +		interval_ms = IN_DEV_CONF_GET( +			in_dev, +			IGMPV3_UNSOLICITED_REPORT_INTERVAL); + +	interval_jiffies = msecs_to_jiffies(interval_ms); + +	/* _timer functions can't handle a delay of 0 jiffies so ensure +	 *  we always return a positive value. +	 */ +	if (interval_jiffies <= 0) +		interval_jiffies = 1; +	return interval_jiffies; +} +  static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);  static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);  static void igmpv3_clear_delrec(struct in_device *in_dev); @@ -149,17 +173,11 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc);  static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  			 int sfcount, __be32 *psfsrc, int delta); - -static void ip_mc_list_reclaim(struct rcu_head *head) -{ -	kfree(container_of(head, struct ip_mc_list, rcu)); -} -  static void ip_ma_put(struct ip_mc_list *im)  {  	if (atomic_dec_and_test(&im->refcnt)) {  		in_dev_put(im->interface); -		call_rcu(&im->rcu, ip_mc_list_reclaim); +		kfree_rcu(im, rcu);  	}  } @@ -193,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im)  /* It must be called with locked im->lock */  static void igmp_start_timer(struct ip_mc_list *im, int max_delay)  { -	int tv = net_random() % max_delay; +	int tv = prandom_u32() % max_delay;  	im->tm_running = 1;  	if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -202,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)  static void igmp_gq_start_timer(struct in_device *in_dev)  { -	int tv = net_random() % in_dev->mr_maxdelay; +	int tv = prandom_u32() % in_dev->mr_maxdelay;  	in_dev->mr_gq_running = 1;  	if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) @@ -211,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev)  static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)  { -	int tv = net_random() % delay; +	int tv = prandom_u32() % delay;  	if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))  		in_dev_hold(in_dev); @@ -292,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)  	struct ip_sf_list *psf;  	int scount = 
0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (!is_in(pmc, psf, type, gdeleted, sdeleted))  			continue;  		scount++; @@ -309,9 +327,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)  	struct iphdr *pip;  	struct igmpv3_report *pig;  	struct net *net = dev_net(dev); +	struct flowi4 fl4; +	int hlen = LL_RESERVED_SPACE(dev); +	int tlen = dev->needed_tailroom;  	while (1) { -		skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), +		skb = alloc_skb(size + hlen + tlen,  				GFP_ATOMIC | __GFP_NOWARN);  		if (skb)  			break; @@ -319,27 +340,21 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)  		if (size < 256)  			return NULL;  	} +	skb->priority = TC_PRIO_CONTROL;  	igmp_skb_size(skb) = size; -	{ -		struct flowi fl = { .oif = dev->ifindex, -				    .fl4_dst = IGMPV3_ALL_MCR, -				    .proto = IPPROTO_IGMP }; -		if (ip_route_output_key(net, &rt, &fl)) { -			kfree_skb(skb); -			return NULL; -		} -	} -	if (rt->rt_src == 0) { +	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, +				   0, 0, +				   IPPROTO_IGMP, 0, dev->ifindex); +	if (IS_ERR(rt)) {  		kfree_skb(skb); -		ip_rt_put(rt);  		return NULL;  	}  	skb_dst_set(skb, &rt->dst);  	skb->dev = dev; -	skb_reserve(skb, LL_RESERVED_SPACE(dev)); +	skb_reserve(skb, hlen);  	skb_reset_network_header(skb);  	pip = ip_hdr(skb); @@ -350,15 +365,15 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)  	pip->tos      = 0xc0;  	pip->frag_off = htons(IP_DF);  	pip->ttl      = 1; -	pip->daddr    = rt->rt_dst; -	pip->saddr    = rt->rt_src; +	pip->daddr    = fl4.daddr; +	pip->saddr    = fl4.saddr;  	pip->protocol = IPPROTO_IGMP;  	pip->tot_len  = 0;	/* filled in later */ -	ip_select_ident(pip, &rt->dst, NULL); -	((u8*)&pip[1])[0] = IPOPT_RA; -	((u8*)&pip[1])[1] = 4; -	((u8*)&pip[1])[2] = 0; -	((u8*)&pip[1])[3] = 0; +	ip_select_ident(skb, NULL); +	((u8 *)&pip[1])[0] = IPOPT_RA; +	((u8 *)&pip[1])[1] = 4; +	((u8 *)&pip[1])[2] = 0; +	((u8 *)&pip[1])[3] = 0;  	skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;  	skb_put(skb, sizeof(*pig)); @@ -374,7 +389,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)  static int igmpv3_sendpack(struct sk_buff *skb)  {  	struct igmphdr *pig = igmp_hdr(skb); -	const int igmplen = skb->tail - skb->transport_header; +	const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);  	pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); @@ -448,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,  	}  	first = 1;  	psf_prev = NULL; -	for (psf=*psf_list; psf; psf=psf_next) { +	for (psf = *psf_list; psf; psf = psf_next) {  		__be32 *psrc;  		psf_next = psf->sf_next; @@ -505,7 +520,7 @@ empty_source:  			return skb;  		if (pmc->crcount || isquery) {  			/* make sure we have room for group header */ -			if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) { +			if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {  				igmpv3_sendpack(skb);  				skb = NULL; /* add_grhead will get a new one */  			} @@ -561,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)  	struct ip_sf_list *psf_prev, *psf_next, *psf;  	psf_prev = NULL; -	for (psf=*ppsf; psf; psf = psf_next) { +	for (psf = *ppsf; psf; psf = psf_next) {  		psf_next = psf->sf_next;  		if (psf->sf_crcount == 0) {  			if (psf_prev) @@ -585,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)  	/* deleted MCA's */  	
pmc_prev = NULL; -	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { +	for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {  		pmc_next = pmc->next;  		if (pmc->sfmode == MCAST_INCLUDE) {  			type = IGMPV3_BLOCK_OLD_SOURCES; @@ -657,7 +672,9 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,  	struct net_device *dev = in_dev->dev;  	struct net *net = dev_net(dev);  	__be32	group = pmc ? pmc->multiaddr : 0; +	struct flowi4 fl4;  	__be32	dst; +	int hlen, tlen;  	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)  		return igmpv3_send_report(in_dev, pmc); @@ -666,27 +683,24 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,  	else  		dst = group; -	{ -		struct flowi fl = { .oif = dev->ifindex, -				    .fl4_dst = dst, -				    .proto = IPPROTO_IGMP }; -		if (ip_route_output_key(net, &rt, &fl)) -			return -1; -	} -	if (rt->rt_src == 0) { -		ip_rt_put(rt); +	rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, +				   0, 0, +				   IPPROTO_IGMP, 0, dev->ifindex); +	if (IS_ERR(rt))  		return -1; -	} -	skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); +	hlen = LL_RESERVED_SPACE(dev); +	tlen = dev->needed_tailroom; +	skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);  	if (skb == NULL) {  		ip_rt_put(rt);  		return -1;  	} +	skb->priority = TC_PRIO_CONTROL;  	skb_dst_set(skb, &rt->dst); -	skb_reserve(skb, LL_RESERVED_SPACE(dev)); +	skb_reserve(skb, hlen);  	skb_reset_network_header(skb);  	iph = ip_hdr(skb); @@ -698,13 +712,13 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,  	iph->frag_off = htons(IP_DF);  	iph->ttl      = 1;  	iph->daddr    = dst; -	iph->saddr    = rt->rt_src; +	iph->saddr    = fl4.saddr;  	iph->protocol = IPPROTO_IGMP; -	ip_select_ident(iph, &rt->dst, NULL); -	((u8*)&iph[1])[0] = IPOPT_RA; -	((u8*)&iph[1])[1] = 4; -	((u8*)&iph[1])[2] = 0; -	((u8*)&iph[1])[3] = 0; +	ip_select_ident(skb, NULL); +	((u8 *)&iph[1])[0] = IPOPT_RA; +	((u8 *)&iph[1])[1] = 4; +	((u8 *)&iph[1])[2] = 0; +	((u8 *)&iph[1])[3] = 0;  	ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));  	ih->type = type; @@ -722,7 +736,7 @@ static void igmp_gq_timer_expire(unsigned long data)  	in_dev->mr_gq_running = 0;  	igmpv3_send_report(in_dev, NULL); -	__in_dev_put(in_dev); +	in_dev_put(in_dev);  }  static void igmp_ifc_timer_expire(unsigned long data) @@ -732,9 +746,10 @@ static void igmp_ifc_timer_expire(unsigned long data)  	igmpv3_send_cr(in_dev);  	if (in_dev->mr_ifc_count) {  		in_dev->mr_ifc_count--; -		igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); +		igmp_ifc_start_timer(in_dev, +				     unsolicited_report_interval(in_dev));  	} -	__in_dev_put(in_dev); +	in_dev_put(in_dev);  }  static void igmp_ifc_event(struct in_device *in_dev) @@ -749,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev)  static void igmp_timer_expire(unsigned long data)  { -	struct ip_mc_list *im=(struct ip_mc_list *)data; +	struct ip_mc_list *im = (struct ip_mc_list *)data;  	struct in_device *in_dev = im->interface;  	spin_lock(&im->lock); @@ -757,7 +772,7 @@ static void igmp_timer_expire(unsigned long data)  	if (im->unsolicit_count) {  		im->unsolicit_count--; -		igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); +		igmp_start_timer(im, unsolicited_report_interval(in_dev));  	}  	im->reporter = 1;  	spin_unlock(&im->lock); @@ -779,15 +794,15 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)  	int i, scount;  	scount = 0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { 
+	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (scount == nsrcs)  			break; -		for (i=0; i<nsrcs; i++) { +		for (i = 0; i < nsrcs; i++) {  			/* skip inactive filters */ -			if (pmc->sfcount[MCAST_INCLUDE] || +			if (psf->sf_count[MCAST_INCLUDE] ||  			    pmc->sfcount[MCAST_EXCLUDE] !=  			    psf->sf_count[MCAST_EXCLUDE]) -				continue; +				break;  			if (srcs[i] == psf->sf_inaddr) {  				scount++;  				break; @@ -810,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)  	/* mark INCLUDE-mode sources */  	scount = 0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (scount == nsrcs)  			break; -		for (i=0; i<nsrcs; i++) +		for (i = 0; i < nsrcs; i++)  			if (srcs[i] == psf->sf_inaddr) {  				psf->sf_gsresp = 1;  				scount++; @@ -828,14 +843,15 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)  	return 1;  } -static void igmp_heard_report(struct in_device *in_dev, __be32 group) +/* return true if packet was dropped */ +static bool igmp_heard_report(struct in_device *in_dev, __be32 group)  {  	struct ip_mc_list *im;  	/* Timers are only set for non-local groups */  	if (group == IGMP_ALL_HOSTS) -		return; +		return false;  	rcu_read_lock();  	for_each_pmc_rcu(in_dev, im) { @@ -845,9 +861,11 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)  		}  	}  	rcu_read_unlock(); +	return false;  } -static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, +/* return true if packet was dropped */ +static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,  	int len)  {  	struct igmphdr 		*ih = igmp_hdr(skb); @@ -879,7 +897,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,  		/* clear deleted report items */  		igmpv3_clear_delrec(in_dev);  	} else if (len < 12) { -		return;	/* ignore bogus packet; freed by caller */ +		return true;	/* ignore bogus packet; freed by caller */  	} else if (IGMP_V1_SEEN(in_dev)) {  		/* This is a v3 query with v1 queriers present */  		max_delay = IGMP_Query_Response_Interval; @@ -892,15 +910,17 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,  		 * to be intended in a v3 query.  		 
*/  		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); +		if (!max_delay) +			max_delay = 1;	/* can't mod w/ 0 */  	} else { /* v3 */  		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) -			return; +			return true;  		ih3 = igmpv3_query_hdr(skb);  		if (ih3->nsrcs) {  			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)  					   + ntohs(ih3->nsrcs)*sizeof(__be32))) -				return; +				return true;  			ih3 = igmpv3_query_hdr(skb);  		} @@ -912,9 +932,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,  			in_dev->mr_qrv = ih3->qrv;  		if (!group) { /* general query */  			if (ih3->nsrcs) -				return;	/* no sources allowed */ +				return false;	/* no sources allowed */  			igmp_gq_start_timer(in_dev); -			return; +			return false;  		}  		/* mark sources to include, if group & source-specific */  		mark = ih3->nsrcs != 0; @@ -950,6 +970,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,  			igmp_mod_timer(im, max_delay);  	}  	rcu_read_unlock(); +	return false;  }  /* called in rcu_read_lock() section */ @@ -959,6 +980,7 @@ int igmp_rcv(struct sk_buff *skb)  	struct igmphdr *ih;  	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);  	int len = skb->len; +	bool dropped = true;  	if (in_dev == NULL)  		goto drop; @@ -966,21 +988,13 @@ int igmp_rcv(struct sk_buff *skb)  	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))  		goto drop; -	switch (skb->ip_summed) { -	case CHECKSUM_COMPLETE: -		if (!csum_fold(skb->csum)) -			break; -		/* fall through */ -	case CHECKSUM_NONE: -		skb->csum = 0; -		if (__skb_checksum_complete(skb)) -			goto drop; -	} +	if (skb_checksum_simple_validate(skb)) +		goto drop;  	ih = igmp_hdr(skb);  	switch (ih->type) {  	case IGMP_HOST_MEMBERSHIP_QUERY: -		igmp_heard_query(in_dev, skb, len); +		dropped = igmp_heard_query(in_dev, skb, len);  		break;  	case IGMP_HOST_MEMBERSHIP_REPORT:  	case IGMPV2_HOST_MEMBERSHIP_REPORT: @@ -990,7 +1004,7 @@ int igmp_rcv(struct sk_buff *skb)  		/* don't rely on MC router hearing unicast reports */  		if (skb->pkt_type == PACKET_MULTICAST ||  		    skb->pkt_type == PACKET_BROADCAST) -			igmp_heard_report(in_dev, ih->group); +			dropped = igmp_heard_report(in_dev, ih->group);  		break;  	case IGMP_PIM:  #ifdef CONFIG_IP_PIMSM_V1 @@ -1008,7 +1022,10 @@ int igmp_rcv(struct sk_buff *skb)  	}  drop: -	kfree_skb(skb); +	if (dropped) +		kfree_skb(skb); +	else +		consume_skb(skb);  	return 0;  } @@ -1026,7 +1043,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)  	/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.  	   We will get multicast token leakage, when IFF_MULTICAST -	   is changed. This check should be done in dev->set_multicast_list +	   is changed. This check should be done in ndo_set_rx_mode  	   routine. 
Something sort of:  	   if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }  	   --ANK @@ -1078,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)  		pmc->tomb = im->tomb;  		pmc->sources = im->sources;  		im->tomb = im->sources = NULL; -		for (psf=pmc->sources; psf; psf=psf->sf_next) +		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = pmc->crcount;  	}  	spin_unlock_bh(&im->lock); @@ -1096,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)  	spin_lock_bh(&in_dev->mc_tomb_lock);  	pmc_prev = NULL; -	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { +	for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {  		if (pmc->multiaddr == multiaddr)  			break;  		pmc_prev = pmc; @@ -1109,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)  	}  	spin_unlock_bh(&in_dev->mc_tomb_lock);  	if (pmc) { -		for (psf=pmc->tomb; psf; psf=psf_next) { +		for (psf = pmc->tomb; psf; psf = psf_next) {  			psf_next = psf->sf_next;  			kfree(psf);  		} @@ -1142,7 +1159,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)  		psf = pmc->tomb;  		pmc->tomb = NULL;  		spin_unlock_bh(&pmc->lock); -		for (; psf; psf=psf_next) { +		for (; psf; psf = psf_next) {  			psf_next = psf->sf_next;  			kfree(psf);  		} @@ -1172,20 +1189,18 @@ static void igmp_group_dropped(struct ip_mc_list *im)  	if (!in_dev->dead) {  		if (IGMP_V1_SEEN(in_dev)) -			goto done; +			return;  		if (IGMP_V2_SEEN(in_dev)) {  			if (reporter)  				igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); -			goto done; +			return;  		}  		/* IGMPv3 */  		igmpv3_add_delrec(in_dev, im);  		igmp_ifc_event(in_dev);  	} -done:  #endif -	ip_mc_clear_src(im);  }  static void igmp_group_added(struct ip_mc_list *im) @@ -1222,6 +1237,57 @@ static void igmp_group_added(struct ip_mc_list *im)   *	Multicast list managers   */ +static u32 ip_mc_hash(const struct ip_mc_list *im) +{ +	return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG); +} + +static void ip_mc_hash_add(struct in_device *in_dev, +			   struct ip_mc_list *im) +{ +	struct ip_mc_list __rcu **mc_hash; +	u32 hash; + +	mc_hash = rtnl_dereference(in_dev->mc_hash); +	if (mc_hash) { +		hash = ip_mc_hash(im); +		im->next_hash = mc_hash[hash]; +		rcu_assign_pointer(mc_hash[hash], im); +		return; +	} + +	/* do not use a hash table for small number of items */ +	if (in_dev->mc_count < 4) +		return; + +	mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG, +			  GFP_KERNEL); +	if (!mc_hash) +		return; + +	for_each_pmc_rtnl(in_dev, im) { +		hash = ip_mc_hash(im); +		im->next_hash = mc_hash[hash]; +		RCU_INIT_POINTER(mc_hash[hash], im); +	} + +	rcu_assign_pointer(in_dev->mc_hash, mc_hash); +} + +static void ip_mc_hash_remove(struct in_device *in_dev, +			      struct ip_mc_list *im) +{ +	struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash); +	struct ip_mc_list *aux; + +	if (!mc_hash) +		return; +	mc_hash += ip_mc_hash(im); +	while ((aux = rtnl_dereference(*mc_hash)) != im) +		mc_hash = &aux->next_hash; +	*mc_hash = im->next_hash; +} +  /*   *	A socket has joined a multicast group on device dev. 
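
[Editor's aside, not part of the patch] The igmp.c hunks just above replace the linear walk of in_dev->mc_list with a small per-interface hash table (in_dev->mc_hash) once an interface has joined a handful of groups, so ip_check_mc_rcu() can probe a single bucket instead of the whole membership list. The stand-alone sketch below only illustrates that bucketing idea; the 5-bit table size and the golden-ratio-style multiplier are assumptions standing in for the kernel's MC_HASH_SZ_LOG and hash_32(), and list locking/RCU are deliberately omitted.

```c
/* Simplified user-space illustration of the multicast hash added above.
 * Assumed values: a 5-bit table (32 buckets) and a golden-ratio-style
 * multiplicative hash; the real kernel constants may differ.
 */
#include <stdint.h>
#include <stdio.h>

#define MC_HASH_SZ_LOG 5	/* assumed bucket-count exponent */

struct mc_entry {
	uint32_t multiaddr;		/* multicast group address */
	struct mc_entry *next_hash;	/* per-bucket chain */
};

static unsigned int mc_hash(uint32_t multiaddr)
{
	/* multiplicative hash: keep the top MC_HASH_SZ_LOG bits */
	return (multiaddr * 0x61C88647u) >> (32 - MC_HASH_SZ_LOG);
}

static void mc_hash_add(struct mc_entry **table, struct mc_entry *im)
{
	unsigned int h = mc_hash(im->multiaddr);

	im->next_hash = table[h];	/* push onto the bucket's chain */
	table[h] = im;
}

static struct mc_entry *mc_hash_lookup(struct mc_entry **table,
				       uint32_t multiaddr)
{
	struct mc_entry *im;

	/* probe one bucket instead of walking the whole membership list */
	for (im = table[mc_hash(multiaddr)]; im; im = im->next_hash)
		if (im->multiaddr == multiaddr)
			return im;
	return NULL;
}

int main(void)
{
	struct mc_entry *table[1 << MC_HASH_SZ_LOG] = { NULL };
	struct mc_entry a = { .multiaddr = 0xE0000001u };	/* 224.0.0.1 */
	struct mc_entry b = { .multiaddr = 0xE00000FBu };	/* 224.0.0.251 */

	mc_hash_add(table, &a);
	mc_hash_add(table, &b);
	printf("224.0.0.251 %sfound\n",
	       mc_hash_lookup(table, 0xE00000FBu) ? "" : "not ");
	return 0;
}
```

The same trade-off appears in the patch itself: the hash table is only allocated once in_dev->mc_count grows past a small threshold, so interfaces with one or two groups keep the cheap linked-list behaviour.
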
@@ -1263,6 +1329,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)  	in_dev->mc_count++;  	rcu_assign_pointer(in_dev->mc_list, im); +	ip_mc_hash_add(in_dev, im); +  #ifdef CONFIG_IP_MULTICAST  	igmpv3_del_delrec(in_dev, im->multiaddr);  #endif @@ -1275,16 +1343,17 @@ out:  EXPORT_SYMBOL(ip_mc_inc_group);  /* - *	Resend IGMP JOIN report; used for bonding. - *	Called with rcu_read_lock() + *	Resend IGMP JOIN report; used by netdev notifier.   */ -void ip_mc_rejoin_groups(struct in_device *in_dev) +static void ip_mc_rejoin_groups(struct in_device *in_dev)  {  #ifdef CONFIG_IP_MULTICAST  	struct ip_mc_list *im;  	int type; -	for_each_pmc_rcu(in_dev, im) { +	ASSERT_RTNL(); + +	for_each_pmc_rtnl(in_dev, im) {  		if (im->multiaddr == IGMP_ALL_HOSTS)  			continue; @@ -1301,7 +1370,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev)  	}  #endif  } -EXPORT_SYMBOL(ip_mc_rejoin_groups);  /*   *	A socket has left a multicast group on device dev @@ -1319,9 +1387,11 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)  	     ip = &i->next_rcu) {  		if (i->multiaddr == addr) {  			if (--i->users == 0) { +				ip_mc_hash_remove(in_dev, i);  				*ip = i->next_rcu;  				in_dev->mc_count--;  				igmp_group_dropped(i); +				ip_mc_clear_src(i);  				if (!in_dev->dead)  					ip_rt_multicast_event(in_dev); @@ -1385,13 +1455,9 @@ void ip_mc_init_dev(struct in_device *in_dev)  {  	ASSERT_RTNL(); -	in_dev->mc_tomb = NULL;  #ifdef CONFIG_IP_MULTICAST -	in_dev->mr_gq_running = 0;  	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,  			(unsigned long)in_dev); -	in_dev->mr_ifc_count = 0; -	in_dev->mc_count     = 0;  	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,  			(unsigned long)in_dev);  	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; @@ -1431,7 +1497,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev)  		in_dev->mc_list = i->next_rcu;  		in_dev->mc_count--; -		igmp_group_dropped(i); +		/* We've dropped the groups in ip_mc_down already */ +		ip_mc_clear_src(i);  		ip_ma_put(i);  	}  } @@ -1439,8 +1506,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev)  /* RTNL is locked */  static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)  { -	struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr }; -	struct rtable *rt;  	struct net_device *dev = NULL;  	struct in_device *idev = NULL; @@ -1454,9 +1519,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)  			return NULL;  	} -	if (!dev && !ip_route_output_key(net, &rt, &fl)) { -		dev = rt->dst.dev; -		ip_rt_put(rt); +	if (!dev) { +		struct rtable *rt = ip_route_output(net, +						    imr->imr_multiaddr.s_addr, +						    0, 0, 0); +		if (!IS_ERR(rt)) { +			dev = rt->dst.dev; +			ip_rt_put(rt); +		}  	}  	if (dev) {  		imr->imr_ifindex = dev->ifindex; @@ -1479,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,  	int rv = 0;  	psf_prev = NULL; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (psf->sf_inaddr == *psfsrc)  			break;  		psf_prev = psf; @@ -1552,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  		pmc->sfcount[sfmode]--;  	}  	err = 0; -	for (i=0; i<sfcount; i++) { +	for (i = 0; i < sfcount; i++) {  		int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);  		changerec |= rv > 0; @@ -1572,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  		pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv :  			IGMP_Unsolicited_Report_Count;  		in_dev->mr_ifc_count = pmc->crcount; -		for (psf=pmc->sources; psf; psf = psf->sf_next) +		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = 0;  		igmp_ifc_event(pmc->interface);  	} else if (sf_setstate(pmc) || changerec) { @@ -1588,12 +1658,12 @@ out_unlock:   * Add multicast single-source filter to the interface list   */  static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, -	__be32 *psfsrc, int delta) +	__be32 *psfsrc)  {  	struct ip_sf_list *psf, *psf_prev;  	psf_prev = NULL; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (psf->sf_inaddr == *psfsrc)  			break;  		psf_prev = psf; @@ -1621,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc)  	struct ip_sf_list *psf;  	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; -	for (psf=pmc->sources; psf; psf=psf->sf_next) +	for (psf = pmc->sources; psf; psf = psf->sf_next)  		if (pmc->sfcount[MCAST_EXCLUDE]) {  			psf->sf_oldin = mca_xcount ==  				psf->sf_count[MCAST_EXCLUDE] && @@ -1638,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc)  	int new_in, rv;  	rv = 0; -	for (psf=pmc->sources; psf; psf=psf->sf_next) { +	for (psf = pmc->sources; psf; psf = psf->sf_next) {  		if (pmc->sfcount[MCAST_EXCLUDE]) {  			new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&  				!psf->sf_count[MCAST_INCLUDE]; @@ -1648,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc)  			if (!psf->sf_oldin) {  				struct ip_sf_list *prev = NULL; -				for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) { +				for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {  					if (dpsf->sf_inaddr == psf->sf_inaddr)  						break;  					prev = dpsf; @@ -1670,7 +1740,7 @@ static int sf_setstate(struct ip_mc_list *pmc)  			 * add or update "delete" records if an active filter  			 * is now inactive  			 */ -			for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) +			for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)  				if (dpsf->sf_inaddr == psf->sf_inaddr)  					break;  			if (!dpsf) { @@ -1722,17 +1792,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  	if (!delta)  		pmc->sfcount[sfmode]++;  	err = 0; -	for (i=0; i<sfcount; i++) { -		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); +	for (i = 0; i < sfcount; i++) { +		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);  		if (err)  			break;  	}  	if (err) {  		int j; -		pmc->sfcount[sfmode]--; -		for (j=0; j<i; j++) -			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); +		if (!delta) +			pmc->sfcount[sfmode]--; +		for (j = 0; j < i; j++) +			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);  	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {  #ifdef CONFIG_IP_MULTICAST  		struct ip_sf_list *psf; @@ -1750,7 +1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,  		pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv :  			IGMP_Unsolicited_Report_Count;  		in_dev->mr_ifc_count = pmc->crcount; -		for (psf=pmc->sources; psf; psf = psf->sf_next) +		for (psf = pmc->sources; psf; psf = psf->sf_next)  			psf->sf_crcount = 0;  		igmp_ifc_event(in_dev);  	} else if (sf_setstate(pmc)) { @@ -1765,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)  {  	struct ip_sf_list *psf, *nextpsf; -	for (psf=pmc->tomb; psf; psf=nextpsf) { +	for (psf = pmc->tomb; psf; psf = nextpsf) {  		nextpsf = psf->sf_next;  		kfree(psf);  	}  	pmc->tomb = NULL; -	for (psf=pmc->sources; psf; psf=nextpsf) { +	for (psf = pmc->sources; psf; psf = nextpsf) {  		nextpsf = psf->sf_next;  		kfree(psf);  	} @@ -1836,12 +1907,6 @@ done:  }  EXPORT_SYMBOL(ip_mc_join_group); -static void ip_sf_socklist_reclaim(struct rcu_head *rp) -{ -	kfree(container_of(rp, struct ip_sf_socklist, rcu)); -	/* sk_omem_alloc should have been decreased by the caller*/ -} -  static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,  			   struct in_device *in_dev)  { @@ -1855,21 +1920,13 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,  	}  	err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,  			iml->sfmode, psf->sl_count, psf->sl_addr, 0); -	rcu_assign_pointer(iml->sflist, NULL); +	RCU_INIT_POINTER(iml->sflist, NULL);  	/* decrease mem now to avoid the memleak warning */  	atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); -	call_rcu(&psf->rcu, ip_sf_socklist_reclaim); +	kfree_rcu(psf, rcu);  	return err;  } - -static void ip_mc_socklist_reclaim(struct rcu_head *rp) -{ -	kfree(container_of(rp, struct ip_mc_socklist, rcu)); -	/* sk_omem_alloc should have been decreased by the caller*/ -} - -  /*   *	Ask a socket to leave a group.   */ @@ -1887,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)  	rtnl_lock();  	in_dev = ip_mc_find_dev(net, imr); +	if (!in_dev) { +		ret = -ENODEV; +		goto out; +	}  	ifindex = imr->imr_ifindex;  	for (imlp = &inet->mc_list;  	     (iml = rtnl_dereference(*imlp)) != NULL; @@ -1904,19 +1965,18 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)  		*imlp = iml->next_rcu; -		if (in_dev) -			ip_mc_dec_group(in_dev, group); +		ip_mc_dec_group(in_dev, group);  		rtnl_unlock();  		/* decrease mem now to avoid the memleak warning */  		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); -		call_rcu(&iml->rcu, ip_mc_socklist_reclaim); +		kfree_rcu(iml, rcu);  		return 0;  	} -	if (!in_dev) -		ret = -ENODEV; +out:  	rtnl_unlock();  	return ret;  } +EXPORT_SYMBOL(ip_mc_leave_group);  int ip_mc_source(int add, int omode, struct sock *sk, struct  	ip_mreq_source *mreqs, int ifindex) @@ -1977,7 +2037,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  		if (!psl)  			goto done;	/* err = -EADDRNOTAVAIL */  		rv = !0; -		for (i=0; i<psl->sl_count; i++) { +		for (i = 0; i < psl->sl_count; i++) {  			rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,  				sizeof(__be32));  			if (rv == 0) @@ -1996,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,  			&mreqs->imr_sourceaddr, 1); -		for (j=i+1; j<psl->sl_count; j++) +		for (j = i+1; j < psl->sl_count; j++)  			psl->sl_addr[j-1] = psl->sl_addr[j];  		psl->sl_count--;  		err = 0; @@ -2022,17 +2082,17 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  		newpsl->sl_max = count;  		newpsl->sl_count = count - IP_SFBLOCK;  		if (psl) { -			for (i=0; i<psl->sl_count; i++) +			for (i = 0; i 
< psl->sl_count; i++)  				newpsl->sl_addr[i] = psl->sl_addr[i];  			/* decrease mem now to avoid the memleak warning */  			atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); -			call_rcu(&psl->rcu, ip_sf_socklist_reclaim); +			kfree_rcu(psl, rcu);  		}  		rcu_assign_pointer(pmc->sflist, newpsl);  		psl = newpsl;  	}  	rv = 1;	/* > 0 for insert logic below if sl_count is 0 */ -	for (i=0; i<psl->sl_count; i++) { +	for (i = 0; i < psl->sl_count; i++) {  		rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,  			sizeof(__be32));  		if (rv == 0) @@ -2040,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct  	}  	if (rv == 0)		/* address already there is an error */  		goto done; -	for (j=psl->sl_count-1; j>=i; j--) +	for (j = psl->sl_count-1; j >= i; j--)  		psl->sl_addr[j+1] = psl->sl_addr[j];  	psl->sl_addr[i] = mreqs->imr_sourceaddr;  	psl->sl_count++; @@ -2127,7 +2187,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)  			psl->sl_count, psl->sl_addr, 0);  		/* decrease mem now to avoid the memleak warning */  		atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); -		call_rcu(&psl->rcu, ip_sf_socklist_reclaim); +		kfree_rcu(psl, rcu);  	} else  		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,  			0, NULL, 0); @@ -2239,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,  	    copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {  		return -EFAULT;  	} -	for (i=0; i<copycount; i++) { +	for (i = 0; i < copycount; i++) {  		struct sockaddr_storage ss;  		psin = (struct sockaddr_in *)&ss; @@ -2284,7 +2344,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)  	if (!psl)  		goto unlock; -	for (i=0; i<psl->sl_count; i++) { +	for (i = 0; i < psl->sl_count; i++) {  		if (psl->sl_addr[i] == rmt_addr)  			break;  	} @@ -2324,27 +2384,40 @@ void ip_mc_drop_socket(struct sock *sk)  			ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);  		/* decrease mem now to avoid the memleak warning */  		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); -		call_rcu(&iml->rcu, ip_mc_socklist_reclaim); +		kfree_rcu(iml, rcu);  	}  	rtnl_unlock();  } -int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +/* called with rcu_read_lock() */ +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)  {  	struct ip_mc_list *im; +	struct ip_mc_list __rcu **mc_hash;  	struct ip_sf_list *psf;  	int rv = 0; -	rcu_read_lock(); -	for_each_pmc_rcu(in_dev, im) { -		if (im->multiaddr == mc_addr) -			break; +	mc_hash = rcu_dereference(in_dev->mc_hash); +	if (mc_hash) { +		u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG); + +		for (im = rcu_dereference(mc_hash[hash]); +		     im != NULL; +		     im = rcu_dereference(im->next_hash)) { +			if (im->multiaddr == mc_addr) +				break; +		} +	} else { +		for_each_pmc_rcu(in_dev, im) { +			if (im->multiaddr == mc_addr) +				break; +		}  	}  	if (im && proto == IPPROTO_IGMP) {  		rv = 1;  	} else if (im) {  		if (src_addr) { -			for (psf=im->sources; psf; psf=psf->sf_next) { +			for (psf = im->sources; psf; psf = psf->sf_next) {  				if (psf->sf_inaddr == src_addr)  					break;  			} @@ -2357,7 +2430,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p  		} else  			rv = 1; /* unspecified source; tentatively allow */  	} -	rcu_read_unlock();  	return rv;  } @@ -2457,6 +2529,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)  		struct ip_mc_list *im = 
(struct ip_mc_list *)v;  		struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);  		char   *querier; +		long delta; +  #ifdef CONFIG_IP_MULTICAST  		querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :  			  IGMP_V2_SEEN(state->in_dev) ? "V2" : @@ -2470,11 +2544,12 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)  				   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);  		} +		delta = im->timer.expires - jiffies;  		seq_printf(seq,  			   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",  			   im->multiaddr, im->users, -			   im->tm_running, im->tm_running ? -			   jiffies_to_clock_t(im->timer.expires-jiffies) : 0, +			   im->tm_running, +			   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,  			   im->reporter);  	}  	return 0; @@ -2656,33 +2731,72 @@ static int __net_init igmp_net_init(struct net *net)  {  	struct proc_dir_entry *pde; -	pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops); +	pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);  	if (!pde)  		goto out_igmp; -	pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); +	pde = proc_create("mcfilter", S_IRUGO, net->proc_net, +			  &igmp_mcf_seq_fops);  	if (!pde)  		goto out_mcfilter;  	return 0;  out_mcfilter: -	proc_net_remove(net, "igmp"); +	remove_proc_entry("igmp", net->proc_net);  out_igmp:  	return -ENOMEM;  }  static void __net_exit igmp_net_exit(struct net *net)  { -	proc_net_remove(net, "mcfilter"); -	proc_net_remove(net, "igmp"); +	remove_proc_entry("mcfilter", net->proc_net); +	remove_proc_entry("igmp", net->proc_net);  }  static struct pernet_operations igmp_net_ops = {  	.init = igmp_net_init,  	.exit = igmp_net_exit,  }; +#endif -int __init igmp_mc_proc_init(void) +static int igmp_netdev_event(struct notifier_block *this, +			     unsigned long event, void *ptr)  { -	return register_pernet_subsys(&igmp_net_ops); +	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct in_device *in_dev; + +	switch (event) { +	case NETDEV_RESEND_IGMP: +		in_dev = __in_dev_get_rtnl(dev); +		if (in_dev) +			ip_mc_rejoin_groups(in_dev); +		break; +	default: +		break; +	} +	return NOTIFY_DONE;  } + +static struct notifier_block igmp_notifier = { +	.notifier_call = igmp_netdev_event, +}; + +int __init igmp_mc_init(void) +{ +#if defined(CONFIG_PROC_FS) +	int err; + +	err = register_pernet_subsys(&igmp_net_ops); +	if (err) +		return err; +	err = register_netdevice_notifier(&igmp_notifier); +	if (err) +		goto reg_notif_fail; +	return 0; + +reg_notif_fail: +	unregister_pernet_subsys(&igmp_net_ops); +	return err; +#else +	return register_netdevice_notifier(&igmp_notifier);  #endif +} diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 06f5f8f482f..14d02ea905b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,36 +29,26 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";  EXPORT_SYMBOL(inet_csk_timer_bug_msg);  #endif -/* - * This struct holds the first and last local port number. 
- */ -struct local_ports sysctl_local_ports __read_mostly = { -	.lock = SEQLOCK_UNLOCKED, -	.range = { 32768, 61000 }, -}; - -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - -void inet_get_local_port_range(int *low, int *high) +void inet_get_local_port_range(struct net *net, int *low, int *high)  { -	unsigned seq; +	unsigned int seq; +  	do { -		seq = read_seqbegin(&sysctl_local_ports.lock); +		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); -		*low = sysctl_local_ports.range[0]; -		*high = sysctl_local_ports.range[1]; -	} while (read_seqretry(&sysctl_local_ports.lock, seq)); +		*low = net->ipv4.ip_local_ports.range[0]; +		*high = net->ipv4.ip_local_ports.range[1]; +	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));  }  EXPORT_SYMBOL(inet_get_local_port_range);  int inet_csk_bind_conflict(const struct sock *sk, -			   const struct inet_bind_bucket *tb) +			   const struct inet_bind_bucket *tb, bool relax)  { -	const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);  	struct sock *sk2; -	struct hlist_node *node;  	int reuse = sk->sk_reuse; +	int reuseport = sk->sk_reuseport; +	kuid_t uid = sock_i_uid((struct sock *)sk);  	/*  	 * Unlike other sk lookup places we do not check @@ -67,22 +57,32 @@ int inet_csk_bind_conflict(const struct sock *sk,  	 * one this bucket belongs to.  	 */ -	sk_for_each_bound(sk2, node, &tb->owners) { +	sk_for_each_bound(sk2, &tb->owners) {  		if (sk != sk2 &&  		    !inet_v6_ipv6only(sk2) &&  		    (!sk->sk_bound_dev_if ||  		     !sk2->sk_bound_dev_if ||  		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { -			if (!reuse || !sk2->sk_reuse || -			    sk2->sk_state == TCP_LISTEN) { -				const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); -				if (!sk2_rcv_saddr || !sk_rcv_saddr || -				    sk2_rcv_saddr == sk_rcv_saddr) +			if ((!reuse || !sk2->sk_reuse || +			    sk2->sk_state == TCP_LISTEN) && +			    (!reuseport || !sk2->sk_reuseport || +			    (sk2->sk_state != TCP_TIME_WAIT && +			     !uid_eq(uid, sock_i_uid(sk2))))) { + +				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || +				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr) +					break; +			} +			if (!relax && reuse && sk2->sk_reuse && +			    sk2->sk_state != TCP_LISTEN) { + +				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || +				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)  					break;  			}  		}  	} -	return node != NULL; +	return sk2 != NULL;  }  EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); @@ -93,42 +93,49 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)  {  	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;  	struct inet_bind_hashbucket *head; -	struct hlist_node *node;  	struct inet_bind_bucket *tb;  	int ret, attempts = 5;  	struct net *net = sock_net(sk);  	int smallest_size = -1, smallest_rover; +	kuid_t uid = sock_i_uid(sk);  	local_bh_disable();  	if (!snum) {  		int remaining, rover, low, high;  again: -		inet_get_local_port_range(&low, &high); +		inet_get_local_port_range(net, &low, &high);  		remaining = (high - low) + 1; -		smallest_rover = rover = net_random() % remaining + low; +		smallest_rover = rover = prandom_u32() % remaining + low;  		smallest_size = -1;  		do { -			if (inet_is_reserved_local_port(rover)) +			if (inet_is_local_reserved_port(net, rover))  				goto next_nolock;  			head = &hashinfo->bhash[inet_bhashfn(net, rover,  					hashinfo->bhash_size)];  			spin_lock(&head->lock); -			inet_bind_bucket_for_each(tb, node, &head->chain) +			inet_bind_bucket_for_each(tb, &head->chain)  				if (net_eq(ib_net(tb), net) && 
tb->port == rover) { -					if (tb->fastreuse > 0 && -					    sk->sk_reuse && -					    sk->sk_state != TCP_LISTEN && +					if (((tb->fastreuse > 0 && +					      sk->sk_reuse && +					      sk->sk_state != TCP_LISTEN) || +					     (tb->fastreuseport > 0 && +					      sk->sk_reuseport && +					      uid_eq(tb->fastuid, uid))) &&  					    (tb->num_owners < smallest_size || smallest_size == -1)) {  						smallest_size = tb->num_owners;  						smallest_rover = rover; -						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { -							spin_unlock(&head->lock); +						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && +						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {  							snum = smallest_rover; -							goto have_snum; +							goto tb_found;  						}  					} +					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { +						snum = rover; +						goto tb_found; +					}  					goto next;  				}  			break; @@ -162,7 +169,7 @@ have_snum:  		head = &hashinfo->bhash[inet_bhashfn(net, snum,  				hashinfo->bhash_size)];  		spin_lock(&head->lock); -		inet_bind_bucket_for_each(tb, node, &head->chain) +		inet_bind_bucket_for_each(tb, &head->chain)  			if (net_eq(ib_net(tb), net) && tb->port == snum)  				goto tb_found;  	} @@ -170,18 +177,26 @@ have_snum:  	goto tb_not_found;  tb_found:  	if (!hlist_empty(&tb->owners)) { -		if (tb->fastreuse > 0 && -		    sk->sk_reuse && sk->sk_state != TCP_LISTEN && +		if (sk->sk_reuse == SK_FORCE_REUSE) +			goto success; + +		if (((tb->fastreuse > 0 && +		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) || +		     (tb->fastreuseport > 0 && +		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&  		    smallest_size == -1) {  			goto success;  		} else {  			ret = 1; -			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { -				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && +			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { +				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || +				     (tb->fastreuseport > 0 && +				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&  				    smallest_size != -1 && --attempts >= 0) {  					spin_unlock(&head->lock);  					goto again;  				} +  				goto fail_unlock;  			}  		} @@ -196,9 +211,19 @@ tb_not_found:  			tb->fastreuse = 1;  		else  			tb->fastreuse = 0; -	} else if (tb->fastreuse && -		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) -		tb->fastreuse = 0; +		if (sk->sk_reuseport) { +			tb->fastreuseport = 1; +			tb->fastuid = uid; +		} else +			tb->fastreuseport = 0; +	} else { +		if (tb->fastreuse && +		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) +			tb->fastreuse = 0; +		if (tb->fastreuseport && +		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) +			tb->fastreuseport = 0; +	}  success:  	if (!inet_csk(sk)->icsk_bind_hash)  		inet_bind_hash(sk, tb, snum); @@ -267,7 +292,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)  struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)  {  	struct inet_connection_sock *icsk = inet_csk(sk); +	struct request_sock_queue *queue = &icsk->icsk_accept_queue;  	struct sock *newsk; +	struct request_sock *req;  	int error;  	lock_sock(sk); @@ -280,7 +307,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)  		goto out_err;  	/* Find already established connection */ -	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { +	if (reqsk_queue_empty(queue)) {  		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);  		/* If this is a non blocking socket don't 
sleep */ @@ -292,14 +319,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)  		if (error)  			goto out_err;  	} - -	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); -	WARN_ON(newsk->sk_state == TCP_SYN_RECV); +	req = reqsk_queue_remove(queue); +	newsk = req->sk; + +	sk_acceptq_removed(sk); +	if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { +		spin_lock_bh(&queue->fastopenq->lock); +		if (tcp_rsk(req)->listener) { +			/* We are still waiting for the final ACK from 3WHS +			 * so can't free req now. Instead, we set req->sk to +			 * NULL to signify that the child socket is taken +			 * so reqsk_fastopen_remove() will free the req +			 * when 3WHS finishes (or is aborted). +			 */ +			req->sk = NULL; +			req = NULL; +		} +		spin_unlock_bh(&queue->fastopenq->lock); +	}  out:  	release_sock(sk); +	if (req) +		__reqsk_free(req);  	return newsk;  out_err:  	newsk = NULL; +	req = NULL;  	*err = error;  	goto out;  } @@ -351,27 +396,26 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)  EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);  struct dst_entry *inet_csk_route_req(struct sock *sk, +				     struct flowi4 *fl4,  				     const struct request_sock *req)  {  	struct rtable *rt;  	const struct inet_request_sock *ireq = inet_rsk(req); -	struct ip_options *opt = inet_rsk(req)->opt; -	struct flowi fl = { .oif = sk->sk_bound_dev_if, -			    .mark = sk->sk_mark, -			    .fl4_dst = ((opt && opt->srr) ? -					  opt->faddr : ireq->rmt_addr), -			    .fl4_src = ireq->loc_addr, -			    .fl4_tos = RT_CONN_FLAGS(sk), -			    .proto = sk->sk_protocol, -			    .flags = inet_sk_flowi_flags(sk), -			    .fl_ip_sport = inet_sk(sk)->inet_sport, -			    .fl_ip_dport = ireq->rmt_port }; +	struct ip_options_rcu *opt = inet_rsk(req)->opt;  	struct net *net = sock_net(sk); - -	security_req_classify_flow(req, &fl); -	if (ip_route_output_flow(net, &rt, &fl, sk, 0)) +	int flags = inet_sk_flowi_flags(sk); + +	flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, +			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, +			   sk->sk_protocol, +			   flags, +			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, +			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); +	security_req_classify_flow(req, flowi4_to_flowi(fl4)); +	rt = ip_route_output_flow(net, fl4, sk); +	if (IS_ERR(rt))  		goto no_route; -	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) +	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)  		goto route_err;  	return &rt->dst; @@ -383,13 +427,51 @@ no_route:  }  EXPORT_SYMBOL_GPL(inet_csk_route_req); +struct dst_entry *inet_csk_route_child_sock(struct sock *sk, +					    struct sock *newsk, +					    const struct request_sock *req) +{ +	const struct inet_request_sock *ireq = inet_rsk(req); +	struct inet_sock *newinet = inet_sk(newsk); +	struct ip_options_rcu *opt; +	struct net *net = sock_net(sk); +	struct flowi4 *fl4; +	struct rtable *rt; + +	fl4 = &newinet->cork.fl.u.ip4; + +	rcu_read_lock(); +	opt = rcu_dereference(newinet->inet_opt); +	flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, +			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, +			   sk->sk_protocol, inet_sk_flowi_flags(sk), +			   (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, +			   ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); +	security_req_classify_flow(req, flowi4_to_flowi(fl4)); +	rt = ip_route_output_flow(net, fl4, sk); +	if (IS_ERR(rt)) +		goto no_route; +	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) +		goto route_err; +	rcu_read_unlock(); +	return &rt->dst; + +route_err: +	ip_rt_put(rt); +no_route: +	rcu_read_unlock(); +	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); +	return NULL; +} +EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); +  static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,  				 const u32 rnd, const u32 synq_hsize)  {  	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);  } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  #define AF_INET_FAMILY(fam) ((fam) == AF_INET)  #else  #define AF_INET_FAMILY(fam) 1 @@ -410,9 +492,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,  	     prev = &req->dl_next) {  		const struct inet_request_sock *ireq = inet_rsk(req); -		if (ireq->rmt_port == rport && -		    ireq->rmt_addr == raddr && -		    ireq->loc_addr == laddr && +		if (ireq->ir_rmt_port == rport && +		    ireq->ir_rmt_addr == raddr && +		    ireq->ir_loc_addr == laddr &&  		    AF_INET_FAMILY(req->rsk_ops->family)) {  			WARN_ON(req->sk);  			*prevp = prev; @@ -429,7 +511,8 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; -	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, +	const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr, +				     inet_rsk(req)->ir_rmt_port,  				     lopt->hash_rnd, lopt->nr_table_entries);  	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); @@ -448,21 +531,31 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,  				  int *expire, int *resend)  {  	if (!rskq_defer_accept) { -		*expire = req->retrans >= thresh; +		*expire = req->num_timeout >= thresh;  		*resend = 1;  		return;  	} -	*expire = req->retrans >= thresh && -		  (!inet_rsk(req)->acked || req->retrans >= max_retries); +	*expire = req->num_timeout >= thresh && +		  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);  	/*  	 * Do not resend while waiting for data after ACK,  	 * start to resend on end of deferring period to give  	 * last chance for data or ACK to create established socket.  	 */  	*resend = !inet_rsk(req)->acked || -		  req->retrans >= rskq_defer_accept - 1; +		  req->num_timeout >= rskq_defer_accept - 1;  } +int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) +{ +	int err = req->rsk_ops->rtx_syn_ack(parent, req); + +	if (!err) +		req->num_retrans++; +	return err; +} +EXPORT_SYMBOL(inet_rtx_syn_ack); +  void inet_csk_reqsk_queue_prune(struct sock *parent,  				const unsigned long interval,  				const unsigned long timeout, @@ -482,7 +575,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,  	/* Normally all the openreqs are young and become mature  	 * (i.e. converted to established socket) for first timeout. -	 * If synack was not acknowledged for 3 seconds, it means +	 * If synack was not acknowledged for 1 second, it means  	 * one of the following things: synack was lost, ack was lost,  	 * rtt is high or nobody planned to ack (i.e. synflood).  	 
* When server is a bit loaded, queue is populated with old @@ -523,17 +616,17 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,  				syn_ack_recalc(req, thresh, max_retries,  					       queue->rskq_defer_accept,  					       &expire, &resend); -				if (req->rsk_ops->syn_ack_timeout) -					req->rsk_ops->syn_ack_timeout(parent, req); +				req->rsk_ops->syn_ack_timeout(parent, req);  				if (!expire &&  				    (!resend || -				     !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || +				     !inet_rtx_syn_ack(parent, req) ||  				     inet_rsk(req)->acked)) {  					unsigned long timeo; -					if (req->retrans++ == 0) +					if (req->num_timeout++ == 0)  						lopt->qlen_young--; -					timeo = min((timeout << req->retrans), max_rto); +					timeo = min(timeout << req->num_timeout, +						    max_rto);  					req->expires = now + timeo;  					reqp = &req->dl_next;  					continue; @@ -559,10 +652,19 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,  }  EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); -struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, -			    const gfp_t priority) +/** + *	inet_csk_clone_lock - clone an inet socket, and lock its clone + *	@sk: the socket to clone + *	@req: request_sock + *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + *	Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *inet_csk_clone_lock(const struct sock *sk, +				 const struct request_sock *req, +				 const gfp_t priority)  { -	struct sock *newsk = sk_clone(sk, priority); +	struct sock *newsk = sk_clone_lock(sk, priority);  	if (newsk != NULL) {  		struct inet_connection_sock *newicsk = inet_csk(newsk); @@ -570,11 +672,13 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,  		newsk->sk_state = TCP_SYN_RECV;  		newicsk->icsk_bind_hash = NULL; -		inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; -		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); -		inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; +		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; +		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; +		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);  		newsk->sk_write_space = sk_stream_write_space; +		newsk->sk_mark = inet_rsk(req)->ir_mark; +  		newicsk->icsk_retransmits = 0;  		newicsk->icsk_backoff	  = 0;  		newicsk->icsk_probes_out  = 0; @@ -586,7 +690,7 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,  	}  	return newsk;  } -EXPORT_SYMBOL_GPL(inet_csk_clone); +EXPORT_SYMBOL_GPL(inet_csk_clone_lock);  /*   * At this point, there should be no process reference to this @@ -618,6 +722,23 @@ void inet_csk_destroy_sock(struct sock *sk)  }  EXPORT_SYMBOL(inet_csk_destroy_sock); +/* This function allows to force a closure of a socket after the call to + * tcp/dccp_create_openreq_child(). 
+ */ +void inet_csk_prepare_forced_close(struct sock *sk) +	__releases(&sk->sk_lock.slock) +{ +	/* sk_clone_lock locked the socket and set refcnt to 2 */ +	bh_unlock_sock(sk); +	sock_put(sk); + +	/* The below has to be done to allow calling inet_csk_destroy_sock */ +	sock_set_flag(sk, SOCK_DEAD); +	percpu_counter_inc(sk->sk_prot->orphan_count); +	inet_sk(sk)->inet_num = 0; +} +EXPORT_SYMBOL(inet_csk_prepare_forced_close); +  int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)  {  	struct inet_sock *inet = inet_sk(sk); @@ -659,13 +780,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);  void inet_csk_listen_stop(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk); +	struct request_sock_queue *queue = &icsk->icsk_accept_queue;  	struct request_sock *acc_req;  	struct request_sock *req;  	inet_csk_delete_keepalive_timer(sk);  	/* make all the listen_opt local to us */ -	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); +	acc_req = reqsk_queue_yank_acceptq(queue);  	/* Following specs, it would be better either to send FIN  	 * (and enter FIN-WAIT-1, it is normal close) @@ -675,7 +797,7 @@ void inet_csk_listen_stop(struct sock *sk)  	 * To be honest, we are not able to make either  	 * of the variants now.			--ANK  	 */ -	reqsk_queue_destroy(&icsk->icsk_accept_queue); +	reqsk_queue_destroy(queue);  	while ((req = acc_req) != NULL) {  		struct sock *child = req->sk; @@ -693,6 +815,19 @@ void inet_csk_listen_stop(struct sock *sk)  		percpu_counter_inc(sk->sk_prot->orphan_count); +		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { +			BUG_ON(tcp_sk(child)->fastopen_rsk != req); +			BUG_ON(sk != tcp_rsk(req)->listener); + +			/* Paranoid, to prevent race condition if +			 * an inbound pkt destined for child is +			 * blocked by sock lock in tcp_v4_rcv(). +			 * Also to satisfy an assertion in +			 * tcp_v4_destroy_sock(). +			 */ +			tcp_sk(child)->fastopen_rsk = NULL; +			sock_put(sk); +		}  		inet_csk_destroy_sock(child);  		bh_unlock_sock(child); @@ -702,6 +837,17 @@ void inet_csk_listen_stop(struct sock *sk)  		sk_acceptq_removed(sk);  		__reqsk_free(req);  	} +	if (queue->fastopenq != NULL) { +		/* Free all the reqs queued in rskq_rst_head. 
*/ +		spin_lock_bh(&queue->fastopenq->lock); +		acc_req = queue->fastopenq->rskq_rst_head; +		queue->fastopenq->rskq_rst_head = NULL; +		spin_unlock_bh(&queue->fastopenq->lock); +		while ((req = acc_req) != NULL) { +			acc_req = req->dl_next; +			__reqsk_free(req); +		} +	}  	WARN_ON(sk->sk_ack_backlog);  }  EXPORT_SYMBOL_GPL(inet_csk_listen_stop); @@ -744,3 +890,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,  }  EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);  #endif + +static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) +{ +	const struct inet_sock *inet = inet_sk(sk); +	const struct ip_options_rcu *inet_opt; +	__be32 daddr = inet->inet_daddr; +	struct flowi4 *fl4; +	struct rtable *rt; + +	rcu_read_lock(); +	inet_opt = rcu_dereference(inet->inet_opt); +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr; +	fl4 = &fl->u.ip4; +	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, +				   inet->inet_saddr, inet->inet_dport, +				   inet->inet_sport, sk->sk_protocol, +				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); +	if (IS_ERR(rt)) +		rt = NULL; +	if (rt) +		sk_setup_caps(sk, &rt->dst); +	rcu_read_unlock(); + +	return &rt->dst; +} + +struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) +{ +	struct dst_entry *dst = __sk_dst_check(sk, 0); +	struct inet_sock *inet = inet_sk(sk); + +	if (!dst) { +		dst = inet_csk_rebuild_route(sk, &inet->cork.fl); +		if (!dst) +			goto out; +	} +	dst->ops->update_pmtu(dst, sk, NULL, mtu); + +	dst = __sk_dst_check(sk, 0); +	if (!dst) +		dst = inet_csk_rebuild_route(sk, &inet->cork.fl); +out: +	return dst; +} +EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 2ada17129fc..e34dccbc4d7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -33,6 +33,7 @@  #include <linux/stddef.h>  #include <linux/inet_diag.h> +#include <linux/sock_diag.h>  static const struct inet_diag_handler **inet_diag_table; @@ -43,26 +44,25 @@ struct inet_diag_entry {  	u16 dport;  	u16 family;  	u16 userlocks; +#if IS_ENABLED(CONFIG_IPV6) +	struct in6_addr saddr_storage;	/* for IPv4-mapped-IPv6 addresses */ +	struct in6_addr daddr_storage;	/* for IPv4-mapped-IPv6 addresses */ +#endif  }; -static struct sock *idiagnl; - -#define INET_DIAG_PUT(skb, attrtype, attrlen) \ -	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) -  static DEFINE_MUTEX(inet_diag_table_mutex); -static const struct inet_diag_handler *inet_diag_lock_handler(int type) +static const struct inet_diag_handler *inet_diag_lock_handler(int proto)  { -	if (!inet_diag_table[type]) -		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, -			       NETLINK_INET_DIAG, type); +	if (!inet_diag_table[proto]) +		request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, +			       NETLINK_SOCK_DIAG, AF_INET, proto);  	mutex_lock(&inet_diag_table_mutex); -	if (!inet_diag_table[type]) +	if (!inet_diag_table[proto])  		return ERR_PTR(-ENOENT); -	return inet_diag_table[type]; +	return inet_diag_table[proto];  }  static inline void inet_diag_unlock_handler( @@ -71,71 +71,100 @@ static inline void inet_diag_unlock_handler(  	mutex_unlock(&inet_diag_table_mutex);  } -static int inet_csk_diag_fill(struct sock *sk, -			      struct sk_buff *skb, -			      int ext, u32 pid, u32 seq, u16 nlmsg_flags, +int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, +			      struct sk_buff *skb, struct inet_diag_req_v2 *req, +			      struct user_namespace *user_ns,		      	 +			      u32 
portid, u32 seq, u16 nlmsg_flags,  			      const struct nlmsghdr *unlh)  {  	const struct inet_sock *inet = inet_sk(sk); -	const struct inet_connection_sock *icsk = inet_csk(sk);  	struct inet_diag_msg *r;  	struct nlmsghdr  *nlh; +	struct nlattr *attr;  	void *info = NULL; -	struct inet_diag_meminfo  *minfo = NULL; -	unsigned char	 *b = skb_tail_pointer(skb);  	const struct inet_diag_handler *handler; +	int ext = req->idiag_ext; -	handler = inet_diag_table[unlh->nlmsg_type]; +	handler = inet_diag_table[req->sdiag_protocol];  	BUG_ON(handler == NULL); -	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); -	nlh->nlmsg_flags = nlmsg_flags; +	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), +			nlmsg_flags); +	if (!nlh) +		return -EMSGSIZE; -	r = NLMSG_DATA(nlh); +	r = nlmsg_data(nlh);  	BUG_ON(sk->sk_state == TCP_TIME_WAIT); -	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) -		minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo)); - -	if (ext & (1 << (INET_DIAG_INFO - 1))) -		info = INET_DIAG_PUT(skb, INET_DIAG_INFO, -				     handler->idiag_info_size); - -	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { -		const size_t len = strlen(icsk->icsk_ca_ops->name); - -		strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), -		       icsk->icsk_ca_ops->name); -	} -  	r->idiag_family = sk->sk_family;  	r->idiag_state = sk->sk_state;  	r->idiag_timer = 0;  	r->idiag_retrans = 0;  	r->id.idiag_if = sk->sk_bound_dev_if; -	r->id.idiag_cookie[0] = (u32)(unsigned long)sk; -	r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); +	sock_diag_save_cookie(sk, r->id.idiag_cookie);  	r->id.idiag_sport = inet->inet_sport;  	r->id.idiag_dport = inet->inet_dport; + +	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); +	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); +  	r->id.idiag_src[0] = inet->inet_rcv_saddr;  	r->id.idiag_dst[0] = inet->inet_daddr; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +	if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) +		goto errout; + +	/* IPv6 dual-stack sockets use inet->tos for IPv4 connections, +	 * hence this needs to be included regardless of socket family. 
+	 */ +	if (ext & (1 << (INET_DIAG_TOS - 1))) +		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0) +			goto errout; + +#if IS_ENABLED(CONFIG_IPV6)  	if (r->idiag_family == AF_INET6) { -		struct ipv6_pinfo *np = inet6_sk(sk); -		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, -			       &np->rcv_saddr); -		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, -			       &np->daddr); +		*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; +		*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; + +		if (ext & (1 << (INET_DIAG_TCLASS - 1))) +			if (nla_put_u8(skb, INET_DIAG_TCLASS, +				       inet6_sk(sk)->tclass) < 0) +				goto errout;  	}  #endif +	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); +	r->idiag_inode = sock_i_ino(sk); + +	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { +		struct inet_diag_meminfo minfo = { +			.idiag_rmem = sk_rmem_alloc_get(sk), +			.idiag_wmem = sk->sk_wmem_queued, +			.idiag_fmem = sk->sk_forward_alloc, +			.idiag_tmem = sk_wmem_alloc_get(sk), +		}; + +		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) +			goto errout; +	} + +	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) +		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) +			goto errout; + +	if (icsk == NULL) { +		handler->idiag_get_info(sk, r, NULL); +		goto out; +	} +  #define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) -	if (icsk->icsk_pending == ICSK_TIME_RETRANS) { +	if (icsk->icsk_pending == ICSK_TIME_RETRANS || +	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || +	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {  		r->idiag_timer = 1;  		r->idiag_retrans = icsk->icsk_retransmits;  		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); @@ -153,123 +182,129 @@ static int inet_csk_diag_fill(struct sock *sk,  	}  #undef EXPIRES_IN_MS -	r->idiag_uid = sock_i_uid(sk); -	r->idiag_inode = sock_i_ino(sk); +	if (ext & (1 << (INET_DIAG_INFO - 1))) { +		attr = nla_reserve(skb, INET_DIAG_INFO, +				   sizeof(struct tcp_info)); +		if (!attr) +			goto errout; -	if (minfo) { -		minfo->idiag_rmem = sk_rmem_alloc_get(sk); -		minfo->idiag_wmem = sk->sk_wmem_queued; -		minfo->idiag_fmem = sk->sk_forward_alloc; -		minfo->idiag_tmem = sk_wmem_alloc_get(sk); +		info = nla_data(attr);  	} +	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) +		if (nla_put_string(skb, INET_DIAG_CONG, +				   icsk->icsk_ca_ops->name) < 0) +			goto errout; +  	handler->idiag_get_info(sk, r, info);  	if (sk->sk_state < TCP_TIME_WAIT &&  	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)  		icsk->icsk_ca_ops->get_info(sk, ext, skb); -	nlh->nlmsg_len = skb_tail_pointer(skb) - b; -	return skb->len; +out: +	return nlmsg_end(skb, nlh); -rtattr_failure: -nlmsg_failure: -	nlmsg_trim(skb, b); +errout: +	nlmsg_cancel(skb, nlh);  	return -EMSGSIZE;  } +EXPORT_SYMBOL_GPL(inet_sk_diag_fill); + +static int inet_csk_diag_fill(struct sock *sk, +			      struct sk_buff *skb, struct inet_diag_req_v2 *req, +			      struct user_namespace *user_ns, +			      u32 portid, u32 seq, u16 nlmsg_flags, +			      const struct nlmsghdr *unlh) +{ +	return inet_sk_diag_fill(sk, inet_csk(sk), +			skb, req, user_ns, portid, seq, nlmsg_flags, unlh); +}  static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, -			       struct sk_buff *skb, int ext, u32 pid, -			       u32 seq, u16 nlmsg_flags, +			       struct sk_buff *skb, struct inet_diag_req_v2 *req, +			       u32 portid, u32 seq, u16 nlmsg_flags,  			       const struct nlmsghdr *unlh)  { -	long tmo; +	s32 tmo;  	struct inet_diag_msg *r; -	
const unsigned char *previous_tail = skb_tail_pointer(skb); -	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, -					 unlh->nlmsg_type, sizeof(*r)); +	struct nlmsghdr *nlh; -	r = NLMSG_DATA(nlh); -	BUG_ON(tw->tw_state != TCP_TIME_WAIT); +	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), +			nlmsg_flags); +	if (!nlh) +		return -EMSGSIZE; -	nlh->nlmsg_flags = nlmsg_flags; +	r = nlmsg_data(nlh); +	BUG_ON(tw->tw_state != TCP_TIME_WAIT); -	tmo = tw->tw_ttd - jiffies; +	tmo = tw->tw_ttd - inet_tw_time_stamp();  	if (tmo < 0)  		tmo = 0;  	r->idiag_family	      = tw->tw_family;  	r->idiag_retrans      = 0; +  	r->id.idiag_if	      = tw->tw_bound_dev_if; -	r->id.idiag_cookie[0] = (u32)(unsigned long)tw; -	r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1); +	sock_diag_save_cookie(tw, r->id.idiag_cookie); +  	r->id.idiag_sport     = tw->tw_sport;  	r->id.idiag_dport     = tw->tw_dport; + +	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); +	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); +  	r->id.idiag_src[0]    = tw->tw_rcv_saddr;  	r->id.idiag_dst[0]    = tw->tw_daddr; +  	r->idiag_state	      = tw->tw_substate;  	r->idiag_timer	      = 3; -	r->idiag_expires      = DIV_ROUND_UP(tmo * 1000, HZ); +	r->idiag_expires      = jiffies_to_msecs(tmo);  	r->idiag_rqueue	      = 0;  	r->idiag_wqueue	      = 0;  	r->idiag_uid	      = 0;  	r->idiag_inode	      = 0; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  	if (tw->tw_family == AF_INET6) { -		const struct inet6_timewait_sock *tw6 = -						inet6_twsk((struct sock *)tw); - -		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, -			       &tw6->tw_v6_rcv_saddr); -		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, -			       &tw6->tw_v6_daddr); +		*(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr; +		*(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;  	}  #endif -	nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; -	return skb->len; -nlmsg_failure: -	nlmsg_trim(skb, previous_tail); -	return -EMSGSIZE; + +	return nlmsg_end(skb, nlh);  }  static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, -			int ext, u32 pid, u32 seq, u16 nlmsg_flags, +			struct inet_diag_req_v2 *r, +			struct user_namespace *user_ns, +			u32 portid, u32 seq, u16 nlmsg_flags,  			const struct nlmsghdr *unlh)  {  	if (sk->sk_state == TCP_TIME_WAIT) -		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, -					   skb, ext, pid, seq, nlmsg_flags, -					   unlh); -	return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh); +		return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, +					   nlmsg_flags, unlh); + +	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, +				  nlmsg_flags, unlh);  } -static int inet_diag_get_exact(struct sk_buff *in_skb, -			       const struct nlmsghdr *nlh) +int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, +		const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)  {  	int err;  	struct sock *sk; -	struct inet_diag_req *req = NLMSG_DATA(nlh);  	struct sk_buff *rep; -	struct inet_hashinfo *hashinfo; -	const struct inet_diag_handler *handler; - -	handler = inet_diag_lock_handler(nlh->nlmsg_type); -	if (IS_ERR(handler)) { -		err = PTR_ERR(handler); -		goto unlock; -	} +	struct net *net = sock_net(in_skb->sk); -	hashinfo = handler->idiag_hashinfo;  	err = -EINVAL; - -	if (req->idiag_family == AF_INET) { -		sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], +	if (req->sdiag_family == 
AF_INET) { +		sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],  				 req->id.idiag_dport, req->id.idiag_src[0],  				 req->id.idiag_sport, req->id.idiag_if);  	} -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -	else if (req->idiag_family == AF_INET6) { -		sk = inet6_lookup(&init_net, hashinfo, +#if IS_ENABLED(CONFIG_IPV6) +	else if (req->sdiag_family == AF_INET6) { +		sk = inet6_lookup(net, hashinfo,  				  (struct in6_addr *)req->id.idiag_dst,  				  req->id.idiag_dport,  				  (struct in6_addr *)req->id.idiag_src, @@ -278,50 +313,62 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,  	}  #endif  	else { -		goto unlock; +		goto out_nosk;  	}  	err = -ENOENT;  	if (sk == NULL) -		goto unlock; +		goto out_nosk; -	err = -ESTALE; -	if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || -	     req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && -	    ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || -	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) +	err = sock_diag_check_cookie(sk, req->id.idiag_cookie); +	if (err)  		goto out; -	err = -ENOMEM; -	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + -				     sizeof(struct inet_diag_meminfo) + -				     handler->idiag_info_size + 64)), -			GFP_KERNEL); -	if (!rep) +	rep = nlmsg_new(sizeof(struct inet_diag_msg) + +			sizeof(struct inet_diag_meminfo) + +			sizeof(struct tcp_info) + 64, GFP_KERNEL); +	if (!rep) { +		err = -ENOMEM;  		goto out; +	} -	err = sk_diag_fill(sk, rep, req->idiag_ext, -			   NETLINK_CB(in_skb).pid, +	err = sk_diag_fill(sk, rep, req, +			   sk_user_ns(NETLINK_CB(in_skb).sk), +			   NETLINK_CB(in_skb).portid,  			   nlh->nlmsg_seq, 0, nlh);  	if (err < 0) {  		WARN_ON(err == -EMSGSIZE); -		kfree_skb(rep); +		nlmsg_free(rep);  		goto out;  	} -	err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, +	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,  			      MSG_DONTWAIT);  	if (err > 0)  		err = 0;  out: -	if (sk) { -		if (sk->sk_state == TCP_TIME_WAIT) -			inet_twsk_put((struct inet_timewait_sock *)sk); -		else -			sock_put(sk); -	} -unlock: +	if (sk) +		sock_gen_put(sk); + +out_nosk: +	return err; +} +EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); + +static int inet_diag_get_exact(struct sk_buff *in_skb, +			       const struct nlmsghdr *nlh, +			       struct inet_diag_req_v2 *req) +{ +	const struct inet_diag_handler *handler; +	int err; + +	handler = inet_diag_lock_handler(req->sdiag_protocol); +	if (IS_ERR(handler)) +		err = PTR_ERR(handler); +	else +		err = handler->dump_one(in_skb, nlh, req);  	inet_diag_unlock_handler(handler); +  	return err;  } @@ -352,9 +399,12 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)  } -static int inet_diag_bc_run(const void *bc, int len, -			    const struct inet_diag_entry *entry) +static int inet_diag_bc_run(const struct nlattr *_bc, +		const struct inet_diag_entry *entry)  { +	const void *bc = nla_data(_bc); +	int len = nla_len(_bc); +  	while (len > 0) {  		int yes = 1;  		const struct inet_diag_bc_op *op = bc; @@ -393,25 +443,31 @@ static int inet_diag_bc_run(const void *bc, int len,  				break;  			} -			if (cond->prefix_len == 0) -				break; -  			if (op->code == INET_DIAG_BC_S_COND)  				addr = entry->saddr;  			else  				addr = entry->daddr; +			if (cond->family != AF_UNSPEC && +			    cond->family != entry->family) { +				if (entry->family == AF_INET6 && +				    cond->family == AF_INET) { +					if (addr[0] == 0 && addr[1] == 0 && +					    addr[2] == htonl(0xffff) && +		
			    bitstring_match(addr + 3, +							    cond->addr, +							    cond->prefix_len)) +						break; +				} +				yes = 0; +				break; +			} + +			if (cond->prefix_len == 0) +				break;  			if (bitstring_match(addr, cond->addr,  					    cond->prefix_len))  				break; -			if (entry->family == AF_INET6 && -			    cond->family == AF_INET) { -				if (addr[0] == 0 && addr[1] == 0 && -				    addr[2] == htonl(0xffff) && -				    bitstring_match(addr + 3, cond->addr, -						    cond->prefix_len)) -					break; -			}  			yes = 0;  			break;  		} @@ -428,6 +484,34 @@ static int inet_diag_bc_run(const void *bc, int len,  	return len == 0;  } +int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) +{ +	struct inet_diag_entry entry; +	struct inet_sock *inet = inet_sk(sk); + +	if (bc == NULL) +		return 1; + +	entry.family = sk->sk_family; +#if IS_ENABLED(CONFIG_IPV6) +	if (entry.family == AF_INET6) { + +		entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32; +		entry.daddr = sk->sk_v6_daddr.s6_addr32; +	} else +#endif +	{ +		entry.saddr = &inet->inet_rcv_saddr; +		entry.daddr = &inet->inet_daddr; +	} +	entry.sport = inet->inet_num; +	entry.dport = ntohs(inet->inet_dport); +	entry.userlocks = sk->sk_userlocks; + +	return inet_diag_bc_run(bc, &entry); +} +EXPORT_SYMBOL_GPL(inet_diag_bc_sk); +  static int valid_cc(const void *bc, int len, int cc)  {  	while (len >= 0) { @@ -437,7 +521,7 @@ static int valid_cc(const void *bc, int len, int cc)  			return 0;  		if (cc == len)  			return 1; -		if (op->yes < 4) +		if (op->yes < 4 || op->yes & 3)  			return 0;  		len -= op->yes;  		bc  += op->yes; @@ -445,39 +529,96 @@ static int valid_cc(const void *bc, int len, int cc)  	return 0;  } +/* Validate an inet_diag_hostcond. */ +static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, +			   int *min_len) +{ +	int addr_len; +	struct inet_diag_hostcond *cond; + +	/* Check hostcond space. */ +	*min_len += sizeof(struct inet_diag_hostcond); +	if (len < *min_len) +		return false; +	cond = (struct inet_diag_hostcond *)(op + 1); + +	/* Check address family and address length. */ +	switch (cond->family) { +	case AF_UNSPEC: +		addr_len = 0; +		break; +	case AF_INET: +		addr_len = sizeof(struct in_addr); +		break; +	case AF_INET6: +		addr_len = sizeof(struct in6_addr); +		break; +	default: +		return false; +	} +	*min_len += addr_len; +	if (len < *min_len) +		return false; + +	/* Check prefix length (in bits) vs address length (in bytes). */ +	if (cond->prefix_len > 8 * addr_len) +		return false; + +	return true; +} + +/* Validate a port comparison operator. */ +static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, +					 int len, int *min_len) +{ +	/* Port comparisons put the port in a follow-on inet_diag_bc_op. 
*/ +	*min_len += sizeof(struct inet_diag_bc_op); +	if (len < *min_len) +		return false; +	return true; +} +  static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)  { -	const unsigned char *bc = bytecode; +	const void *bc = bytecode;  	int  len = bytecode_len;  	while (len > 0) { -		struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; +		const struct inet_diag_bc_op *op = bc; +		int min_len = sizeof(struct inet_diag_bc_op);  //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);  		switch (op->code) { -		case INET_DIAG_BC_AUTO:  		case INET_DIAG_BC_S_COND:  		case INET_DIAG_BC_D_COND: +			if (!valid_hostcond(bc, len, &min_len)) +				return -EINVAL; +			break;  		case INET_DIAG_BC_S_GE:  		case INET_DIAG_BC_S_LE:  		case INET_DIAG_BC_D_GE:  		case INET_DIAG_BC_D_LE: -			if (op->yes < 4 || op->yes > len + 4) -				return -EINVAL; -		case INET_DIAG_BC_JMP: -			if (op->no < 4 || op->no > len + 4) -				return -EINVAL; -			if (op->no < len && -			    !valid_cc(bytecode, bytecode_len, len - op->no)) +			if (!valid_port_comparison(bc, len, &min_len))  				return -EINVAL;  			break; +		case INET_DIAG_BC_AUTO: +		case INET_DIAG_BC_JMP:  		case INET_DIAG_BC_NOP: -			if (op->yes < 4 || op->yes > len + 4) -				return -EINVAL;  			break;  		default:  			return -EINVAL;  		} + +		if (op->code != INET_DIAG_BC_NOP) { +			if (op->no < min_len || op->no > len + 4 || op->no & 3) +				return -EINVAL; +			if (op->no < len && +			    !valid_cc(bytecode, bytecode_len, len - op->no)) +				return -EINVAL; +		} + +		if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) +			return -EINVAL;  		bc  += op->yes;  		len -= op->yes;  	} @@ -486,62 +627,35 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)  static int inet_csk_diag_dump(struct sock *sk,  			      struct sk_buff *skb, -			      struct netlink_callback *cb) +			      struct netlink_callback *cb, +			      struct inet_diag_req_v2 *r, +			      const struct nlattr *bc)  { -	struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - -	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { -		struct inet_diag_entry entry; -		const struct nlattr *bc = nlmsg_find_attr(cb->nlh, -							  sizeof(*r), -							  INET_DIAG_REQ_BYTECODE); -		struct inet_sock *inet = inet_sk(sk); - -		entry.family = sk->sk_family; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -		if (entry.family == AF_INET6) { -			struct ipv6_pinfo *np = inet6_sk(sk); - -			entry.saddr = np->rcv_saddr.s6_addr32; -			entry.daddr = np->daddr.s6_addr32; -		} else -#endif -		{ -			entry.saddr = &inet->inet_rcv_saddr; -			entry.daddr = &inet->inet_daddr; -		} -		entry.sport = inet->inet_num; -		entry.dport = ntohs(inet->inet_dport); -		entry.userlocks = sk->sk_userlocks; +	if (!inet_diag_bc_sk(bc, sk)) +		return 0; -		if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) -			return 0; -	} - -	return inet_csk_diag_fill(sk, skb, r->idiag_ext, -				  NETLINK_CB(cb->skb).pid, +	return inet_csk_diag_fill(sk, skb, r, +				  sk_user_ns(NETLINK_CB(cb->skb).sk), +				  NETLINK_CB(cb->skb).portid,  				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);  } -static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, +static int inet_twsk_diag_dump(struct sock *sk,  			       struct sk_buff *skb, -			       struct netlink_callback *cb) +			       struct netlink_callback *cb, +			       struct inet_diag_req_v2 *r, +			       const struct nlattr *bc)  { -	struct inet_diag_req *r = NLMSG_DATA(cb->nlh); +	struct inet_timewait_sock *tw = inet_twsk(sk); -	if 
(nlmsg_attrlen(cb->nlh, sizeof(*r))) { +	if (bc != NULL) {  		struct inet_diag_entry entry; -		const struct nlattr *bc = nlmsg_find_attr(cb->nlh, -							  sizeof(*r), -							  INET_DIAG_REQ_BYTECODE);  		entry.family = tw->tw_family; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  		if (tw->tw_family == AF_INET6) { -			struct inet6_timewait_sock *tw6 = -						inet6_twsk((struct sock *)tw); -			entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32; -			entry.daddr = tw6->tw_v6_daddr.s6_addr32; +			entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32; +			entry.daddr = tw->tw_v6_daddr.s6_addr32;  		} else  #endif  		{ @@ -552,77 +666,109 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,  		entry.dport = ntohs(tw->tw_dport);  		entry.userlocks = 0; -		if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) +		if (!inet_diag_bc_run(bc, &entry))  			return 0;  	} -	return inet_twsk_diag_fill(tw, skb, r->idiag_ext, -				   NETLINK_CB(cb->skb).pid, +	return inet_twsk_diag_fill(tw, skb, r, +				   NETLINK_CB(cb->skb).portid,  				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);  } +/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses + * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. + */ +static inline void inet_diag_req_addrs(const struct sock *sk, +				       const struct request_sock *req, +				       struct inet_diag_entry *entry) +{ +	struct inet_request_sock *ireq = inet_rsk(req); + +#if IS_ENABLED(CONFIG_IPV6) +	if (sk->sk_family == AF_INET6) { +		if (req->rsk_ops->family == AF_INET6) { +			entry->saddr = ireq->ir_v6_loc_addr.s6_addr32; +			entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32; +		} else if (req->rsk_ops->family == AF_INET) { +			ipv6_addr_set_v4mapped(ireq->ir_loc_addr, +					       &entry->saddr_storage); +			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, +					       &entry->daddr_storage); +			entry->saddr = entry->saddr_storage.s6_addr32; +			entry->daddr = entry->daddr_storage.s6_addr32; +		} +	} else +#endif +	{ +		entry->saddr = &ireq->ir_loc_addr; +		entry->daddr = &ireq->ir_rmt_addr; +	} +} +  static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, -			      struct request_sock *req, u32 pid, u32 seq, +			      struct request_sock *req, +			      struct user_namespace *user_ns, +			      u32 portid, u32 seq,  			      const struct nlmsghdr *unlh)  {  	const struct inet_request_sock *ireq = inet_rsk(req);  	struct inet_sock *inet = inet_sk(sk); -	unsigned char *b = skb_tail_pointer(skb);  	struct inet_diag_msg *r;  	struct nlmsghdr *nlh;  	long tmo; -	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); -	nlh->nlmsg_flags = NLM_F_MULTI; -	r = NLMSG_DATA(nlh); +	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), +			NLM_F_MULTI); +	if (!nlh) +		return -EMSGSIZE; +	r = nlmsg_data(nlh);  	r->idiag_family = sk->sk_family;  	r->idiag_state = TCP_SYN_RECV;  	r->idiag_timer = 1; -	r->idiag_retrans = req->retrans; +	r->idiag_retrans = req->num_retrans;  	r->id.idiag_if = sk->sk_bound_dev_if; -	r->id.idiag_cookie[0] = (u32)(unsigned long)req; -	r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); +	sock_diag_save_cookie(req, r->id.idiag_cookie);  	tmo = req->expires - jiffies;  	if (tmo < 0)  		tmo = 0;  	r->id.idiag_sport = inet->inet_sport; -	r->id.idiag_dport = ireq->rmt_port; -	r->id.idiag_src[0] = ireq->loc_addr; -	r->id.idiag_dst[0] = ireq->rmt_addr; +	r->id.idiag_dport = ireq->ir_rmt_port; + +	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); +	
memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + +	r->id.idiag_src[0] = ireq->ir_loc_addr; +	r->id.idiag_dst[0] = ireq->ir_rmt_addr; +  	r->idiag_expires = jiffies_to_msecs(tmo);  	r->idiag_rqueue = 0;  	r->idiag_wqueue = 0; -	r->idiag_uid = sock_i_uid(sk); +	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));  	r->idiag_inode = 0; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  	if (r->idiag_family == AF_INET6) { -		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, -			       &inet6_rsk(req)->loc_addr); -		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, -			       &inet6_rsk(req)->rmt_addr); +		struct inet_diag_entry entry; +		inet_diag_req_addrs(sk, req, &entry); +		memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); +		memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));  	}  #endif -	nlh->nlmsg_len = skb_tail_pointer(skb) - b; -	return skb->len; - -nlmsg_failure: -	nlmsg_trim(skb, b); -	return -1; +	return nlmsg_end(skb, nlh);  }  static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, -			       struct netlink_callback *cb) +			       struct netlink_callback *cb, +			       struct inet_diag_req_v2 *r, +			       const struct nlattr *bc)  {  	struct inet_diag_entry entry; -	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct listen_sock *lopt; -	const struct nlattr *bc = NULL;  	struct inet_sock *inet = inet_sk(sk);  	int j, s_j;  	int reqnum, s_reqnum; @@ -642,9 +788,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,  	if (!lopt || !lopt->qlen)  		goto out; -	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { -		bc = nlmsg_find_attr(cb->nlh, sizeof(*r), -				     INET_DIAG_REQ_BYTECODE); +	if (bc != NULL) {  		entry.sport = inet->inet_num;  		entry.userlocks = sk->sk_userlocks;  	} @@ -658,32 +802,21 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,  			if (reqnum < s_reqnum)  				continue; -			if (r->id.idiag_dport != ireq->rmt_port && +			if (r->id.idiag_dport != ireq->ir_rmt_port &&  			    r->id.idiag_dport)  				continue;  			if (bc) { -				entry.saddr = -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -					(entry.family == AF_INET6) ? -					inet6_rsk(req)->loc_addr.s6_addr32 : -#endif -					&ireq->loc_addr; -				entry.daddr = -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -					(entry.family == AF_INET6) ? 
-					inet6_rsk(req)->rmt_addr.s6_addr32 : -#endif -					&ireq->rmt_addr; -				entry.dport = ntohs(ireq->rmt_port); +				inet_diag_req_addrs(sk, req, &entry); +				entry.dport = ntohs(ireq->ir_rmt_port); -				if (!inet_diag_bc_run(nla_data(bc), -						      nla_len(bc), &entry)) +				if (!inet_diag_bc_run(bc, &entry))  					continue;  			}  			err = inet_diag_fill_req(skb, sk, req, -					       NETLINK_CB(cb->skb).pid, +					       sk_user_ns(NETLINK_CB(cb->skb).sk), +					       NETLINK_CB(cb->skb).portid,  					       cb->nlh->nlmsg_seq, cb->nlh);  			if (err < 0) {  				cb->args[3] = j + 1; @@ -701,19 +834,12 @@ out:  	return err;  } -static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, +		struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)  {  	int i, num;  	int s_i, s_num; -	struct inet_diag_req *r = NLMSG_DATA(cb->nlh); -	const struct inet_diag_handler *handler; -	struct inet_hashinfo *hashinfo; - -	handler = inet_diag_lock_handler(cb->nlh->nlmsg_type); -	if (IS_ERR(handler)) -		goto unlock; - -	hashinfo = handler->idiag_hashinfo; +	struct net *net = sock_net(skb->sk);  	s_i = cb->args[1];  	s_num = num = cb->args[2]; @@ -733,11 +859,18 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)  			sk_nulls_for_each(sk, node, &ilb->head) {  				struct inet_sock *inet = inet_sk(sk); +				if (!net_eq(sock_net(sk), net)) +					continue; +  				if (num < s_num) {  					num++;  					continue;  				} +				if (r->sdiag_family != AF_UNSPEC && +						sk->sk_family != r->sdiag_family) +					goto next_listen; +  				if (r->id.idiag_sport != inet->inet_sport &&  				    r->id.idiag_sport)  					goto next_listen; @@ -747,7 +880,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)  				    cb->args[3] > 0)  					goto syn_recv; -				if (inet_csk_diag_dump(sk, skb, cb) < 0) { +				if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {  					spin_unlock_bh(&ilb->lock);  					goto done;  				} @@ -756,7 +889,7 @@ syn_recv:  				if (!(r->idiag_states & TCPF_SYN_RECV))  					goto next_listen; -				if (inet_diag_dump_reqs(skb, sk, cb) < 0) { +				if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {  					spin_unlock_bh(&ilb->lock);  					goto done;  				} @@ -778,7 +911,7 @@ skip_listen_ht:  	}  	if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) -		goto unlock; +		goto out;  	for (i = s_i; i <= hashinfo->ehash_mask; i++) {  		struct inet_ehash_bucket *head = &hashinfo->ehash[i]; @@ -788,8 +921,7 @@ skip_listen_ht:  		num = 0; -		if (hlist_nulls_empty(&head->chain) && -			hlist_nulls_empty(&head->twchain)) +		if (hlist_nulls_empty(&head->chain))  			continue;  		if (i > s_i) @@ -797,19 +929,31 @@ skip_listen_ht:  		spin_lock_bh(lock);  		sk_nulls_for_each(sk, node, &head->chain) { -			struct inet_sock *inet = inet_sk(sk); +			int res; +			int state; +			if (!net_eq(sock_net(sk), net)) +				continue;  			if (num < s_num)  				goto next_normal; -			if (!(r->idiag_states & (1 << sk->sk_state))) +			state = (sk->sk_state == TCP_TIME_WAIT) ? 
+				inet_twsk(sk)->tw_substate : sk->sk_state; +			if (!(r->idiag_states & (1 << state)))  				goto next_normal; -			if (r->id.idiag_sport != inet->inet_sport && +			if (r->sdiag_family != AF_UNSPEC && +			    sk->sk_family != r->sdiag_family) +				goto next_normal; +			if (r->id.idiag_sport != htons(sk->sk_num) &&  			    r->id.idiag_sport)  				goto next_normal; -			if (r->id.idiag_dport != inet->inet_dport && +			if (r->id.idiag_dport != sk->sk_dport &&  			    r->id.idiag_dport)  				goto next_normal; -			if (inet_csk_diag_dump(sk, skb, cb) < 0) { +			if (sk->sk_state == TCP_TIME_WAIT) +				res = inet_twsk_diag_dump(sk, skb, cb, r, bc); +			else +				res = inet_csk_diag_dump(sk, skb, cb, r, bc); +			if (res < 0) {  				spin_unlock_bh(lock);  				goto done;  			} @@ -817,43 +961,95 @@ next_normal:  			++num;  		} -		if (r->idiag_states & TCPF_TIME_WAIT) { -			struct inet_timewait_sock *tw; - -			inet_twsk_for_each(tw, node, -				    &head->twchain) { - -				if (num < s_num) -					goto next_dying; -				if (r->id.idiag_sport != tw->tw_sport && -				    r->id.idiag_sport) -					goto next_dying; -				if (r->id.idiag_dport != tw->tw_dport && -				    r->id.idiag_dport) -					goto next_dying; -				if (inet_twsk_diag_dump(tw, skb, cb) < 0) { -					spin_unlock_bh(lock); -					goto done; -				} -next_dying: -				++num; -			} -		}  		spin_unlock_bh(lock);  	}  done:  	cb->args[1] = i;  	cb->args[2] = num; -unlock: +out: +	; +} +EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); + +static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, +		struct inet_diag_req_v2 *r, struct nlattr *bc) +{ +	const struct inet_diag_handler *handler; +	int err = 0; + +	handler = inet_diag_lock_handler(r->sdiag_protocol); +	if (!IS_ERR(handler)) +		handler->dump(skb, cb, r, bc); +	else +		err = PTR_ERR(handler);  	inet_diag_unlock_handler(handler); -	return skb->len; + +	return err ? 
: skb->len; +} + +static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ +	struct nlattr *bc = NULL; +	int hdrlen = sizeof(struct inet_diag_req_v2); + +	if (nlmsg_attrlen(cb->nlh, hdrlen)) +		bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); + +	return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); +} + +static inline int inet_diag_type2proto(int type) +{ +	switch (type) { +	case TCPDIAG_GETSOCK: +		return IPPROTO_TCP; +	case DCCPDIAG_GETSOCK: +		return IPPROTO_DCCP; +	default: +		return 0; +	}  } -static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)  { +	struct inet_diag_req *rc = nlmsg_data(cb->nlh); +	struct inet_diag_req_v2 req; +	struct nlattr *bc = NULL;  	int hdrlen = sizeof(struct inet_diag_req); +	req.sdiag_family = AF_UNSPEC; /* compatibility */ +	req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); +	req.idiag_ext = rc->idiag_ext; +	req.idiag_states = rc->idiag_states; +	req.id = rc->id; + +	if (nlmsg_attrlen(cb->nlh, hdrlen)) +		bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); + +	return __inet_diag_dump(skb, cb, &req, bc); +} + +static int inet_diag_get_exact_compat(struct sk_buff *in_skb, +			       const struct nlmsghdr *nlh) +{ +	struct inet_diag_req *rc = nlmsg_data(nlh); +	struct inet_diag_req_v2 req; + +	req.sdiag_family = rc->idiag_family; +	req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type); +	req.idiag_ext = rc->idiag_ext; +	req.idiag_states = rc->idiag_states; +	req.id = rc->id; + +	return inet_diag_get_exact(in_skb, nlh, &req); +} + +static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) +{ +	int hdrlen = sizeof(struct inet_diag_req); +	struct net *net = sock_net(skb->sk); +  	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||  	    nlmsg_len(nlh) < hdrlen)  		return -EINVAL; @@ -869,29 +1065,62 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  			    inet_diag_bc_audit(nla_data(attr), nla_len(attr)))  				return -EINVAL;  		} - -		return netlink_dump_start(idiagnl, skb, nlh, -					  inet_diag_dump, NULL); +		{ +			struct netlink_dump_control c = { +				.dump = inet_diag_dump_compat, +			}; +			return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); +		}  	} -	return inet_diag_get_exact(skb, nlh); +	return inet_diag_get_exact_compat(skb, nlh);  } -static DEFINE_MUTEX(inet_diag_mutex); - -static void inet_diag_rcv(struct sk_buff *skb) +static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)  { -	mutex_lock(&inet_diag_mutex); -	netlink_rcv_skb(skb, &inet_diag_rcv_msg); -	mutex_unlock(&inet_diag_mutex); +	int hdrlen = sizeof(struct inet_diag_req_v2); +	struct net *net = sock_net(skb->sk); + +	if (nlmsg_len(h) < hdrlen) +		return -EINVAL; + +	if (h->nlmsg_flags & NLM_F_DUMP) { +		if (nlmsg_attrlen(h, hdrlen)) { +			struct nlattr *attr; +			attr = nlmsg_find_attr(h, hdrlen, +					       INET_DIAG_REQ_BYTECODE); +			if (attr == NULL || +			    nla_len(attr) < sizeof(struct inet_diag_bc_op) || +			    inet_diag_bc_audit(nla_data(attr), nla_len(attr))) +				return -EINVAL; +		} +		{ +			struct netlink_dump_control c = { +				.dump = inet_diag_dump, +			}; +			return netlink_dump_start(net->diag_nlsk, skb, h, &c); +		} +	} + +	return inet_diag_get_exact(skb, h, nlmsg_data(h));  } +static const struct sock_diag_handler inet_diag_handler = { +	.family = AF_INET, +	.dump = inet_diag_handler_dump, +}; + +static const struct 
sock_diag_handler inet6_diag_handler = { +	.family = AF_INET6, +	.dump = inet_diag_handler_dump, +}; +  int inet_diag_register(const struct inet_diag_handler *h)  {  	const __u16 type = h->idiag_type;  	int err = -EINVAL; -	if (type >= INET_DIAG_GETSOCK_MAX) +	if (type >= IPPROTO_MAX)  		goto out;  	mutex_lock(&inet_diag_table_mutex); @@ -910,7 +1139,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h)  {  	const __u16 type = h->idiag_type; -	if (type >= INET_DIAG_GETSOCK_MAX) +	if (type >= IPPROTO_MAX)  		return;  	mutex_lock(&inet_diag_table_mutex); @@ -921,7 +1150,7 @@ EXPORT_SYMBOL_GPL(inet_diag_unregister);  static int __init inet_diag_init(void)  { -	const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * +	const int inet_diag_table_size = (IPPROTO_MAX *  					  sizeof(struct inet_diag_handler *));  	int err = -ENOMEM; @@ -929,25 +1158,35 @@ static int __init inet_diag_init(void)  	if (!inet_diag_table)  		goto out; -	idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0, -					inet_diag_rcv, NULL, THIS_MODULE); -	if (idiagnl == NULL) -		goto out_free_table; -	err = 0; +	err = sock_diag_register(&inet_diag_handler); +	if (err) +		goto out_free_nl; + +	err = sock_diag_register(&inet6_diag_handler); +	if (err) +		goto out_free_inet; + +	sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);  out:  	return err; -out_free_table: + +out_free_inet: +	sock_diag_unregister(&inet_diag_handler); +out_free_nl:  	kfree(inet_diag_table);  	goto out;  }  static void __exit inet_diag_exit(void)  { -	netlink_kernel_release(idiagnl); +	sock_diag_unregister(&inet6_diag_handler); +	sock_diag_unregister(&inet_diag_handler); +	sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);  	kfree(inet_diag_table);  }  module_init(inet_diag_init);  module_exit(inet_diag_exit);  MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 5ff2a51b6d0..3b01959bf4b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -21,7 +21,30 @@  #include <linux/rtnetlink.h>  #include <linux/slab.h> +#include <net/sock.h>  #include <net/inet_frag.h> +#include <net/inet_ecn.h> + +/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. 
+ *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field + */ +const u8 ip_frag_ecn_table[16] = { +	/* at least one fragment had CE, and others ECT_0 or ECT_1 */ +	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE, +	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE, +	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE, + +	/* invalid combinations : drop frame */ +	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, +	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, +	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, +	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, +	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, +	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, +	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, +}; +EXPORT_SYMBOL(ip_frag_ecn_table);  static void inet_frag_secret_rebuild(unsigned long dummy)  { @@ -29,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)  	unsigned long now = jiffies;  	int i; +	/* Per bucket lock NOT needed here, due to write lock protection */  	write_lock(&f->lock); +  	get_random_bytes(&f->rnd, sizeof(u32));  	for (i = 0; i < INETFRAGS_HASHSZ; i++) { +		struct inet_frag_bucket *hb;  		struct inet_frag_queue *q; -		struct hlist_node *p, *n; +		struct hlist_node *n; -		hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) { +		hb = &f->hash[i]; +		hlist_for_each_entry_safe(q, n, &hb->chain, list) {  			unsigned int hval = f->hashfn(q);  			if (hval != i) { +				struct inet_frag_bucket *hb_dest; +  				hlist_del(&q->list);  				/* Relink to new hash chain. */ -				hlist_add_head(&q->list, &f->hash[hval]); +				hb_dest = &f->hash[hval]; +				hlist_add_head(&q->list, &hb_dest->chain);  			}  		}  	} @@ -55,14 +85,14 @@ void inet_frags_init(struct inet_frags *f)  {  	int i; -	for (i = 0; i < INETFRAGS_HASHSZ; i++) -		INIT_HLIST_HEAD(&f->hash[i]); +	for (i = 0; i < INETFRAGS_HASHSZ; i++) { +		struct inet_frag_bucket *hb = &f->hash[i]; +		spin_lock_init(&hb->chain_lock); +		INIT_HLIST_HEAD(&hb->chain); +	}  	rwlock_init(&f->lock); -	f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ -				   (jiffies ^ (jiffies >> 6))); -  	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,  			(unsigned long)f);  	f->secret_timer.expires = jiffies + f->secret_interval; @@ -73,8 +103,9 @@ EXPORT_SYMBOL(inet_frags_init);  void inet_frags_init_net(struct netns_frags *nf)  {  	nf->nqueues = 0; -	atomic_set(&nf->mem, 0); +	init_frag_mem_limit(nf);  	INIT_LIST_HEAD(&nf->lru_list); +	spin_lock_init(&nf->lru_lock);  }  EXPORT_SYMBOL(inet_frags_init_net); @@ -89,18 +120,28 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)  	nf->low_thresh = 0;  	local_bh_disable(); -	inet_frag_evictor(nf, f); +	inet_frag_evictor(nf, f, true);  	local_bh_enable(); + +	percpu_counter_destroy(&nf->mem);  }  EXPORT_SYMBOL(inet_frags_exit_net);  static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)  { -	write_lock(&f->lock); +	struct inet_frag_bucket *hb; +	unsigned int hash; + +	read_lock(&f->lock); +	hash = f->hashfn(fq); +	hb = &f->hash[hash]; + +	spin_lock(&hb->chain_lock);  	hlist_del(&fq->list); -	list_del(&fq->lru_list); -	fq->net->nqueues--; -	write_unlock(&f->lock); +	spin_unlock(&hb->chain_lock); + +	read_unlock(&f->lock); +	inet_frag_lru_del(fq);  }  void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) @@ -117,12 +158,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)  
EXPORT_SYMBOL(inet_frag_kill);  static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, -		struct sk_buff *skb, int *work) +		struct sk_buff *skb)  { -	if (work) -		*work -= skb->truesize; - -	atomic_sub(skb->truesize, &nf->mem);  	if (f->skb_free)  		f->skb_free(skb);  	kfree_skb(skb); @@ -133,6 +170,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,  {  	struct sk_buff *fp;  	struct netns_frags *nf; +	unsigned int sum, sum_truesize = 0;  	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));  	WARN_ON(del_timer(&q->timer) != 0); @@ -143,13 +181,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,  	while (fp) {  		struct sk_buff *xp = fp->next; -		frag_kfree_skb(nf, f, fp, work); +		sum_truesize += fp->truesize; +		frag_kfree_skb(nf, f, fp);  		fp = xp;  	} - +	sum = sum_truesize + f->qsize;  	if (work) -		*work -= f->qsize; -	atomic_sub(f->qsize, &nf->mem); +		*work -= sum; +	sub_frag_mem_limit(q, sum);  	if (f->destructor)  		f->destructor(q); @@ -158,23 +197,32 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,  }  EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)  {  	struct inet_frag_queue *q;  	int work, evicted = 0; -	work = atomic_read(&nf->mem) - nf->low_thresh; -	while (work > 0) { -		read_lock(&f->lock); +	if (!force) { +		if (frag_mem_limit(nf) <= nf->high_thresh) +			return 0; +	} + +	work = frag_mem_limit(nf) - nf->low_thresh; +	while (work > 0 || force) { +		spin_lock(&nf->lru_lock); +  		if (list_empty(&nf->lru_list)) { -			read_unlock(&f->lock); +			spin_unlock(&nf->lru_lock);  			break;  		}  		q = list_first_entry(&nf->lru_list,  				struct inet_frag_queue, lru_list);  		atomic_inc(&q->refcnt); -		read_unlock(&f->lock); +		/* Remove q from list to avoid several CPUs grabbing it */ +		list_del_init(&q->lru_list); + +		spin_unlock(&nf->lru_lock);  		spin_lock(&q->lock);  		if (!(q->last_in & INET_FRAG_COMPLETE)) @@ -194,28 +242,30 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,  		struct inet_frag_queue *qp_in, struct inet_frags *f,  		void *arg)  { +	struct inet_frag_bucket *hb;  	struct inet_frag_queue *qp; -#ifdef CONFIG_SMP -	struct hlist_node *n; -#endif  	unsigned int hash; -	write_lock(&f->lock); +	read_lock(&f->lock); /* Protects against hash rebuild */  	/*  	 * While we stayed w/o the lock other CPU could update  	 * the rnd seed, so we need to re-calculate the hash  	 * chain. Fortunatelly the qp_in can be used to get one.  	 */  	hash = f->hashfn(qp_in); +	hb = &f->hash[hash]; +	spin_lock(&hb->chain_lock); +  #ifdef CONFIG_SMP  	/* With SMP race we have to recheck hash table, because  	 * such entry could be created on other cpu, while we -	 * promoted read lock to write lock. +	 * released the hash bucket lock.  	 
*/ -	hlist_for_each_entry(qp, n, &f->hash[hash], list) { +	hlist_for_each_entry(qp, &hb->chain, list) {  		if (qp->net == nf && f->match(qp, arg)) {  			atomic_inc(&qp->refcnt); -			write_unlock(&f->lock); +			spin_unlock(&hb->chain_lock); +			read_unlock(&f->lock);  			qp_in->last_in |= INET_FRAG_COMPLETE;  			inet_frag_put(qp_in, f);  			return qp; @@ -227,10 +277,11 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,  		atomic_inc(&qp->refcnt);  	atomic_inc(&qp->refcnt); -	hlist_add_head(&qp->list, &f->hash[hash]); -	list_add_tail(&qp->lru_list, &nf->lru_list); -	nf->nqueues++; -	write_unlock(&f->lock); +	hlist_add_head(&qp->list, &hb->chain); +	inet_frag_lru_add(nf, qp); +	spin_unlock(&hb->chain_lock); +	read_unlock(&f->lock); +  	return qp;  } @@ -243,12 +294,14 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,  	if (q == NULL)  		return NULL; +	q->net = nf;  	f->constructor(q, arg); -	atomic_add(f->qsize, &nf->mem); +	add_frag_mem_limit(q, f->qsize); +  	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);  	spin_lock_init(&q->lock);  	atomic_set(&q->refcnt, 1); -	q->net = nf; +	INIT_LIST_HEAD(&q->lru_list);  	return q;  } @@ -269,18 +322,40 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,  		struct inet_frags *f, void *key, unsigned int hash)  	__releases(&f->lock)  { +	struct inet_frag_bucket *hb;  	struct inet_frag_queue *q; -	struct hlist_node *n; +	int depth = 0; + +	hb = &f->hash[hash]; -	hlist_for_each_entry(q, n, &f->hash[hash], list) { +	spin_lock(&hb->chain_lock); +	hlist_for_each_entry(q, &hb->chain, list) {  		if (q->net == nf && f->match(q, key)) {  			atomic_inc(&q->refcnt); +			spin_unlock(&hb->chain_lock);  			read_unlock(&f->lock);  			return q;  		} +		depth++;  	} +	spin_unlock(&hb->chain_lock);  	read_unlock(&f->lock); -	return inet_frag_create(nf, f, key); +	if (depth <= INETFRAGS_MAXDEPTH) +		return inet_frag_create(nf, f, key); +	else +		return ERR_PTR(-ENOBUFS);  }  EXPORT_SYMBOL(inet_frag_find); + +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, +				   const char *prefix) +{ +	static const char msg[] = "inet_frag_find: Fragment hash bucket" +		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) +		". Dropping fragment.\n"; + +	if (PTR_ERR(q) == -ENOBUFS) +		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); +} +EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1b344f30b46..43116e8c8e1 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -21,8 +21,34 @@  #include <net/inet_connection_sock.h>  #include <net/inet_hashtables.h> +#include <net/secure_seq.h>  #include <net/ip.h> +static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, +				 const __u16 lport, const __be32 faddr, +				 const __be16 fport) +{ +	static u32 inet_ehash_secret __read_mostly; + +	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); + +	return __inet_ehashfn(laddr, lport, faddr, fport, +			      inet_ehash_secret + net_hash_mix(net)); +} + + +static unsigned int inet_sk_ehashfn(const struct sock *sk) +{ +	const struct inet_sock *inet = inet_sk(sk); +	const __be32 laddr = inet->inet_rcv_saddr; +	const __u16 lport = inet->inet_num; +	const __be32 faddr = inet->inet_daddr; +	const __be16 fport = inet->inet_dport; +	struct net *net = sock_net(sk); + +	return inet_ehashfn(net, laddr, lport, faddr, fport); +} +  /*   * Allocate and initialize a new local port bind bucket.   
* The bindhash mutex for snum's hash chain must be held here. @@ -38,6 +64,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,  		write_pnet(&tb->ib_net, hold_net(net));  		tb->port      = snum;  		tb->fastreuse = 0; +		tb->fastreuseport = 0;  		tb->num_owners = 0;  		INIT_HLIST_HEAD(&tb->owners);  		hlist_add_head(&tb->node, &head->chain); @@ -118,13 +145,12 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)  		 * that the listener socket's icsk_bind_hash is the same  		 * as that of the child socket. We have to look up or  		 * create a new bind bucket for the child here. */ -		struct hlist_node *node; -		inet_bind_bucket_for_each(tb, node, &head->chain) { +		inet_bind_bucket_for_each(tb, &head->chain) {  			if (net_eq(ib_net(tb), sock_net(sk)) &&  			    tb->port == port)  				break;  		} -		if (!node) { +		if (!tb) {  			tb = inet_bind_bucket_create(table->bind_bucket_cachep,  						     sock_net(sk), head, port);  			if (!tb) { @@ -133,8 +159,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)  			}  		}  	} -	sk_add_bind_node(child, &tb->owners); -	inet_csk(child)->icsk_bind_hash = tb; +	inet_bind_hash(child, tb, port);  	spin_unlock(&head->lock);  	return 0; @@ -151,16 +176,16 @@ static inline int compute_score(struct sock *sk, struct net *net,  	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&  			!ipv6_only_sock(sk)) {  		__be32 rcv_saddr = inet->inet_rcv_saddr; -		score = sk->sk_family == PF_INET ? 1 : 0; +		score = sk->sk_family == PF_INET ? 2 : 1;  		if (rcv_saddr) {  			if (rcv_saddr != daddr)  				return -1; -			score += 2; +			score += 4;  		}  		if (sk->sk_bound_dev_if) {  			if (sk->sk_bound_dev_if != dif)  				return -1; -			score += 2; +			score += 4;  		}  	}  	return score; @@ -176,6 +201,7 @@ static inline int compute_score(struct sock *sk, struct net *net,  struct sock *__inet_lookup_listener(struct net *net,  				    struct inet_hashinfo *hashinfo, +				    const __be32 saddr, __be16 sport,  				    const __be32 daddr, const unsigned short hnum,  				    const int dif)  { @@ -183,17 +209,29 @@ struct sock *__inet_lookup_listener(struct net *net,  	struct hlist_nulls_node *node;  	unsigned int hash = inet_lhashfn(net, hnum);  	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; -	int score, hiscore; +	int score, hiscore, matches = 0, reuseport = 0; +	u32 phash = 0;  	rcu_read_lock();  begin:  	result = NULL; -	hiscore = -1; +	hiscore = 0;  	sk_nulls_for_each_rcu(sk, node, &ilb->head) {  		score = compute_score(sk, net, hnum, daddr, dif);  		if (score > hiscore) {  			result = sk;  			hiscore = score; +			reuseport = sk->sk_reuseport; +			if (reuseport) { +				phash = inet_ehashfn(net, daddr, hnum, +						     saddr, sport); +				matches = 1; +			} +		} else if (score == hiscore && reuseport) { +			matches++; +			if (((u64)phash * matches) >> 32 == 0) +				result = sk; +			phash = next_pseudo_random32(phash);  		}  	}  	/* @@ -217,13 +255,26 @@ begin:  }  EXPORT_SYMBOL_GPL(__inet_lookup_listener); -struct sock * __inet_lookup_established(struct net *net, +/* All sockets share common refcount, but have different destructors */ +void sock_gen_put(struct sock *sk) +{ +	if (!atomic_dec_and_test(&sk->sk_refcnt)) +		return; + +	if (sk->sk_state == TCP_TIME_WAIT) +		inet_twsk_free(inet_twsk(sk)); +	else +		sk_free(sk); +} +EXPORT_SYMBOL_GPL(sock_gen_put); + +struct sock *__inet_lookup_established(struct net *net,  				  struct inet_hashinfo *hashinfo,  				  const __be32 saddr, const __be16 
sport,  				  const __be32 daddr, const u16 hnum,  				  const int dif)  { -	INET_ADDR_COOKIE(acookie, saddr, daddr) +	INET_ADDR_COOKIE(acookie, saddr, daddr);  	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);  	struct sock *sk;  	const struct hlist_nulls_node *node; @@ -237,16 +288,18 @@ struct sock * __inet_lookup_established(struct net *net,  	rcu_read_lock();  begin:  	sk_nulls_for_each_rcu(sk, node, &head->chain) { -		if (INET_MATCH(sk, net, hash, acookie, -					saddr, daddr, ports, dif)) { +		if (sk->sk_hash != hash) +			continue; +		if (likely(INET_MATCH(sk, net, acookie, +				      saddr, daddr, ports, dif))) {  			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) -				goto begintw; -			if (unlikely(!INET_MATCH(sk, net, hash, acookie, -				saddr, daddr, ports, dif))) { -				sock_put(sk); +				goto out; +			if (unlikely(!INET_MATCH(sk, net, acookie, +						 saddr, daddr, ports, dif))) { +				sock_gen_put(sk);  				goto begin;  			} -			goto out; +			goto found;  		}  	}  	/* @@ -256,33 +309,9 @@ begin:  	 */  	if (get_nulls_value(node) != slot)  		goto begin; - -begintw: -	/* Must check for a TIME_WAIT'er before going to listener hash. */ -	sk_nulls_for_each_rcu(sk, node, &head->twchain) { -		if (INET_TW_MATCH(sk, net, hash, acookie, -					saddr, daddr, ports, dif)) { -			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { -				sk = NULL; -				goto out; -			} -			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie, -				 saddr, daddr, ports, dif))) { -				sock_put(sk); -				goto begintw; -			} -			goto out; -		} -	} -	/* -	 * if the nulls value we got at the end of this lookup is -	 * not the expected one, we must restart lookup. -	 * We probably met an item that was moved to another chain. -	 */ -	if (get_nulls_value(node) != slot) -		goto begintw; -	sk = NULL;  out: +	sk = NULL; +found:  	rcu_read_unlock();  	return sk;  } @@ -298,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,  	__be32 daddr = inet->inet_rcv_saddr;  	__be32 saddr = inet->inet_daddr;  	int dif = sk->sk_bound_dev_if; -	INET_ADDR_COOKIE(acookie, saddr, daddr) +	INET_ADDR_COOKIE(acookie, saddr, daddr);  	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);  	struct net *net = sock_net(sk);  	unsigned int hash = inet_ehashfn(net, daddr, lport, @@ -307,35 +336,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,  	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);  	struct sock *sk2;  	const struct hlist_nulls_node *node; -	struct inet_timewait_sock *tw; +	struct inet_timewait_sock *tw = NULL;  	int twrefcnt = 0;  	spin_lock(lock); -	/* Check TIME-WAIT sockets first. */ -	sk_nulls_for_each(sk2, node, &head->twchain) { -		tw = inet_twsk(sk2); - -		if (INET_TW_MATCH(sk2, net, hash, acookie, -					saddr, daddr, ports, dif)) { -			if (twsk_unique(sk, sk2, twp)) -				goto unique; -			else -				goto not_unique; -		} -	} -	tw = NULL; - -	/* And established part... */  	sk_nulls_for_each(sk2, node, &head->chain) { -		if (INET_MATCH(sk2, net, hash, acookie, -					saddr, daddr, ports, dif)) +		if (sk2->sk_hash != hash) +			continue; + +		if (likely(INET_MATCH(sk2, net, acookie, +					 saddr, daddr, ports, dif))) { +			if (sk2->sk_state == TCP_TIME_WAIT) { +				tw = inet_twsk(sk2); +				if (twsk_unique(sk, sk2, twp)) +					break; +			}  			goto not_unique; +		}  	} -unique:  	/* Must record num and sport now. Otherwise we will see -	 * in hash table socket with a funny identity. 
*/ +	 * in hash table socket with a funny identity. +	 */  	inet->inet_num = lport;  	inet->inet_sport = htons(lport);  	sk->sk_hash = hash; @@ -444,7 +467,7 @@ void inet_unhash(struct sock *sk)  		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);  	spin_lock_bh(lock); -	done =__sk_nulls_del_node_init_rcu(sk); +	done = __sk_nulls_del_node_init_rcu(sk);  	if (done)  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);  	spin_unlock_bh(lock); @@ -469,16 +492,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,  		int i, remaining, low, high, port;  		static u32 hint;  		u32 offset = hint + port_offset; -		struct hlist_node *node;  		struct inet_timewait_sock *tw = NULL; -		inet_get_local_port_range(&low, &high); +		inet_get_local_port_range(net, &low, &high);  		remaining = (high - low) + 1;  		local_bh_disable();  		for (i = 1; i <= remaining; i++) {  			port = low + (i + offset) % remaining; -			if (inet_is_reserved_local_port(port)) +			if (inet_is_local_reserved_port(net, port))  				continue;  			head = &hinfo->bhash[inet_bhashfn(net, port,  					hinfo->bhash_size)]; @@ -488,10 +510,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,  			 * because the established check is already  			 * unique enough.  			 */ -			inet_bind_bucket_for_each(tb, node, &head->chain) { +			inet_bind_bucket_for_each(tb, &head->chain) {  				if (net_eq(ib_net(tb), net) &&  				    tb->port == port) { -					if (tb->fastreuse >= 0) +					if (tb->fastreuse >= 0 || +					    tb->fastreuseport >= 0)  						goto next_port;  					WARN_ON(hlist_empty(&tb->owners));  					if (!check_established(death_row, sk, @@ -508,6 +531,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,  				break;  			}  			tb->fastreuse = -1; +			tb->fastreuseport = -1;  			goto ok;  		next_port: diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 47038cb6c13..f17ea49b28f 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -29,6 +29,7 @@  #include <linux/module.h>  #include <linux/if_vlan.h>  #include <linux/inet_lro.h> +#include <net/checksum.h>  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); @@ -51,8 +52,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");   * Basic tcp checks whether packet is suitable for LRO   */ -static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, -			    int len, struct net_lro_desc *lro_desc) +static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, +			    int len, const struct net_lro_desc *lro_desc)  {          /* check ip header: don't aggregate padded frames */  	if (ntohs(iph->tot_len) != len) @@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)  		*(p+2) = lro_desc->tcp_rcv_tsecr;  	} +	csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));  	iph->tot_len = htons(lro_desc->ip_tot_len); -	iph->check = 0; -	iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); -  	tcph->check = 0;  	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);  	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); @@ -146,8 +145,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)  }  static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, -			  struct iphdr *iph, struct tcphdr *tcph, -			  u16 vlan_tag, struct vlan_group *vgrp) +			  struct iphdr *iph, struct tcphdr *tcph)  {  	int nr_frags;  	__be32 *ptr; @@ -173,8 +171,6 @@ static void 
lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,  	}  	lro_desc->mss = tcp_data_len; -	lro_desc->vgrp = vgrp; -	lro_desc->vlan_tag = vlan_tag;  	lro_desc->active = 1;  	lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, @@ -234,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,  	lro_desc->last_skb = skb;  } -static void lro_add_frags(struct net_lro_desc *lro_desc, -			  int len, int hlen, int truesize, -			  struct skb_frag_struct *skb_frags, -			  struct iphdr *iph, struct tcphdr *tcph) -{ -	struct sk_buff *skb = lro_desc->parent; -	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - -	lro_add_common(lro_desc, iph, tcph, tcp_data_len); - -	skb->truesize += truesize; - -	skb_frags[0].page_offset += hlen; -	skb_frags[0].size -= hlen; - -	while (tcp_data_len > 0) { -		*(lro_desc->next_frag) = *skb_frags; -		tcp_data_len -= skb_frags->size; -		lro_desc->next_frag++; -		skb_frags++; -		skb_shinfo(skb)->nr_frags++; -	} -}  static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,  			      struct iphdr *iph, @@ -309,29 +282,17 @@ static void lro_flush(struct net_lro_mgr *lro_mgr,  	skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; -	if (lro_desc->vgrp) { -		if (lro_mgr->features & LRO_F_NAPI) -			vlan_hwaccel_receive_skb(lro_desc->parent, -						 lro_desc->vgrp, -						 lro_desc->vlan_tag); -		else -			vlan_hwaccel_rx(lro_desc->parent, -					lro_desc->vgrp, -					lro_desc->vlan_tag); - -	} else { -		if (lro_mgr->features & LRO_F_NAPI) -			netif_receive_skb(lro_desc->parent); -		else -			netif_rx(lro_desc->parent); -	} +	if (lro_mgr->features & LRO_F_NAPI) +		netif_receive_skb(lro_desc->parent); +	else +		netif_rx(lro_desc->parent);  	LRO_INC_STATS(lro_mgr, flushed);  	lro_clear_desc(lro_desc);  }  static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, -			  struct vlan_group *vgrp, u16 vlan_tag, void *priv) +			  void *priv)  {  	struct net_lro_desc *lro_desc;  	struct iphdr *iph; @@ -360,7 +321,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,  			goto out;  		skb->ip_summed = lro_mgr->ip_summed_aggr; -		lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp); +		lro_init_desc(lro_desc, skb, iph, tcph);  		LRO_INC_STATS(lro_mgr, aggregated);  		return 0;  	} @@ -387,134 +348,11 @@ out:  	return 1;  } - -static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, -				   struct skb_frag_struct *frags, -				   int len, int true_size, -				   void *mac_hdr, -				   int hlen, __wsum sum, -				   u32 ip_summed) -{ -	struct sk_buff *skb; -	struct skb_frag_struct *skb_frags; -	int data_len = len; -	int hdr_len = min(len, hlen); - -	skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); -	if (!skb) -		return NULL; - -	skb_reserve(skb, lro_mgr->frag_align_pad); -	skb->len = len; -	skb->data_len = len - hdr_len; -	skb->truesize += true_size; -	skb->tail += hdr_len; - -	memcpy(skb->data, mac_hdr, hdr_len); - -	skb_frags = skb_shinfo(skb)->frags; -	while (data_len > 0) { -		*skb_frags = *frags; -		data_len -= frags->size; -		skb_frags++; -		frags++; -		skb_shinfo(skb)->nr_frags++; -	} - -	skb_shinfo(skb)->frags[0].page_offset += hdr_len; -	skb_shinfo(skb)->frags[0].size -= hdr_len; - -	skb->ip_summed = ip_summed; -	skb->csum = sum; -	skb->protocol = eth_type_trans(skb, lro_mgr->dev); -	return skb; -} - -static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, -					  struct skb_frag_struct *frags, -					  int len, int true_size, -					  struct vlan_group 
*vgrp, -					  u16 vlan_tag, void *priv, __wsum sum) -{ -	struct net_lro_desc *lro_desc; -	struct iphdr *iph; -	struct tcphdr *tcph; -	struct sk_buff *skb; -	u64 flags; -	void *mac_hdr; -	int mac_hdr_len; -	int hdr_len = LRO_MAX_PG_HLEN; -	int vlan_hdr_len = 0; - -	if (!lro_mgr->get_frag_header || -	    lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, -				     (void *)&tcph, &flags, priv)) { -		mac_hdr = page_address(frags->page) + frags->page_offset; -		goto out1; -	} - -	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) -		goto out1; - -	hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); -	mac_hdr_len = (int)((void *)(iph) - mac_hdr); - -	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); -	if (!lro_desc) -		goto out1; - -	if (!lro_desc->active) { /* start new lro session */ -		if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) -			goto out1; - -		skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, -				  hdr_len, 0, lro_mgr->ip_summed_aggr); -		if (!skb) -			goto out; - -		if ((skb->protocol == htons(ETH_P_8021Q)) && -		    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) -			vlan_hdr_len = VLAN_HLEN; - -		iph = (void *)(skb->data + vlan_hdr_len); -		tcph = (void *)((u8 *)skb->data + vlan_hdr_len -				+ IP_HDR_LEN(iph)); - -		lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL); -		LRO_INC_STATS(lro_mgr, aggregated); -		return NULL; -	} - -	if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) -		goto out2; - -	if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) -		goto out2; - -	lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); -	LRO_INC_STATS(lro_mgr, aggregated); - -	if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || -	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) -		lro_flush(lro_mgr, lro_desc); - -	return NULL; - -out2: /* send aggregated packets to the stack */ -	lro_flush(lro_mgr, lro_desc); - -out1:  /* Original packet has to be posted to the stack */ -	skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, -			  hdr_len, sum, lro_mgr->ip_summed); -out: -	return skb; -} -  void lro_receive_skb(struct net_lro_mgr *lro_mgr,  		     struct sk_buff *skb,  		     void *priv)  { -	if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) { +	if (__lro_proc_skb(lro_mgr, skb, priv)) {  		if (lro_mgr->features & LRO_F_NAPI)  			netif_receive_skb(skb);  		else @@ -523,59 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,  }  EXPORT_SYMBOL(lro_receive_skb); -void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, -				  struct sk_buff *skb, -				  struct vlan_group *vgrp, -				  u16 vlan_tag, -				  void *priv) -{ -	if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) { -		if (lro_mgr->features & LRO_F_NAPI) -			vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); -		else -			vlan_hwaccel_rx(skb, vgrp, vlan_tag); -	} -} -EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb); - -void lro_receive_frags(struct net_lro_mgr *lro_mgr, -		       struct skb_frag_struct *frags, -		       int len, int true_size, void *priv, __wsum sum) -{ -	struct sk_buff *skb; - -	skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0, -				 priv, sum); -	if (!skb) -		return; - -	if (lro_mgr->features & LRO_F_NAPI) -		netif_receive_skb(skb); -	else -		netif_rx(skb); -} -EXPORT_SYMBOL(lro_receive_frags); - -void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr, -				    struct skb_frag_struct *frags, -				    int len, int true_size, -				    struct vlan_group *vgrp, -				    u16 vlan_tag, void *priv, 
__wsum sum) -{ -	struct sk_buff *skb; - -	skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp, -				 vlan_tag, priv, sum); -	if (!skb) -		return; - -	if (lro_mgr->features & LRO_F_NAPI) -		vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); -	else -		vlan_hwaccel_rx(skb, vgrp, vlan_tag); -} -EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags); -  void lro_flush_all(struct net_lro_mgr *lro_mgr)  {  	int i; @@ -587,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr)  	}  }  EXPORT_SYMBOL(lro_flush_all); - -void lro_flush_pkt(struct net_lro_mgr *lro_mgr, -		  struct iphdr *iph, struct tcphdr *tcph) -{ -	struct net_lro_desc *lro_desc; - -	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); -	if (lro_desc->active) -		lro_flush(lro_mgr, lro_desc); -} -EXPORT_SYMBOL(lro_flush_pkt); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c5af909cf70..6d592f8555f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -11,6 +11,7 @@  #include <linux/kernel.h>  #include <linux/kmemcheck.h>  #include <linux/slab.h> +#include <linux/module.h>  #include <net/inet_hashtables.h>  #include <net/inet_timewait_sock.h>  #include <net/ip.h> @@ -86,19 +87,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,  	refcnt += inet_twsk_bind_unhash(tw, hashinfo);  	spin_unlock(&bhead->lock); -#ifdef SOCK_REFCNT_DEBUG -	if (atomic_read(&tw->tw_refcnt) != 1) { -		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", -		       tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); -	} -#endif -	while (refcnt) { -		inet_twsk_put(tw); -		refcnt--; -	} +	BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); +	atomic_sub(refcnt, &tw->tw_refcnt);  } -static noinline void inet_twsk_free(struct inet_timewait_sock *tw) +void inet_twsk_free(struct inet_timewait_sock *tw)  {  	struct module *owner = tw->tw_prot->owner;  	twsk_destructor((struct sock *)tw); @@ -117,6 +110,18 @@ void inet_twsk_put(struct inet_timewait_sock *tw)  }  EXPORT_SYMBOL_GPL(inet_twsk_put); +static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, +				   struct hlist_nulls_head *list) +{ +	hlist_nulls_add_head_rcu(&tw->tw_node, list); +} + +static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, +				    struct hlist_head *list) +{ +	hlist_add_head(&tw->tw_bind_node, list); +} +  /*   * Enter the time wait state. This is called with locally disabled BH.   * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -145,26 +150,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,  	spin_lock(lock);  	/* -	 * Step 2: Hash TW into TIMEWAIT chain. -	 * Should be done before removing sk from established chain -	 * because readers are lockless and search established first. +	 * Step 2: Hash TW into tcp ehash chain. +	 * Notes : +	 * - tw_refcnt is set to 3 because : +	 * - We have one reference from bhash chain. +	 * - We have one reference from ehash chain. +	 * We can use atomic_set() because prior spin_lock()/spin_unlock() +	 * committed into memory all tw fields.  	 */ -	inet_twsk_add_node_rcu(tw, &ehead->twchain); +	atomic_set(&tw->tw_refcnt, 1 + 1 + 1); +	inet_twsk_add_node_rcu(tw, &ehead->chain); -	/* Step 3: Remove SK from established hash. 
*/ +	/* Step 3: Remove SK from hash chain */  	if (__sk_nulls_del_node_init_rcu(sk))  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -	/* -	 * Notes : -	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc() -	 * - We add one reference for the bhash link -	 * - We add one reference for the ehash link -	 * - We want this refcnt update done before allowing other -	 *   threads to find this tw in ehash chain. -	 */ -	atomic_add(1 + 1 + 1, &tw->tw_refcnt); -  	spin_unlock(lock);  }  EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); @@ -183,6 +183,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat  		tw->tw_daddr	    = inet->inet_daddr;  		tw->tw_rcv_saddr    = inet->inet_rcv_saddr;  		tw->tw_bound_dev_if = sk->sk_bound_dev_if; +		tw->tw_tos	    = inet->tos;  		tw->tw_num	    = inet->inet_num;  		tw->tw_state	    = TCP_TIME_WAIT;  		tw->tw_substate	    = state; @@ -214,7 +215,6 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,  				    const int slot)  {  	struct inet_timewait_sock *tw; -	struct hlist_node *node;  	unsigned int killed;  	int ret; @@ -227,7 +227,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,  	killed = 0;  	ret = 0;  rescan: -	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { +	inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {  		__inet_twsk_del_dead_node(tw);  		spin_unlock(&twdr->death_lock);  		__inet_twsk_kill(tw, twdr->hashinfo); @@ -261,7 +261,7 @@ rescan:  void inet_twdr_hangman(unsigned long data)  {  	struct inet_timewait_death_row *twdr; -	int unsigned need_timer; +	unsigned int need_timer;  	twdr = (struct inet_timewait_death_row *)data;  	spin_lock(&twdr->death_lock); @@ -386,11 +386,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,  			if (slot >= INET_TWDR_TWKILL_SLOTS)  				slot = INET_TWDR_TWKILL_SLOTS - 1;  		} -		tw->tw_ttd = jiffies + timeo; +		tw->tw_ttd = inet_tw_time_stamp() + timeo;  		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);  		list = &twdr->cells[slot];  	} else { -		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); +		tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);  		if (twdr->twcal_hand < 0) {  			twdr->twcal_hand = 0; @@ -436,10 +436,10 @@ void inet_twdr_twcal_tick(unsigned long data)  	for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {  		if (time_before_eq(j, now)) { -			struct hlist_node *node, *safe; +			struct hlist_node *safe;  			struct inet_timewait_sock *tw; -			inet_twsk_for_each_inmate_safe(tw, node, safe, +			inet_twsk_for_each_inmate_safe(tw, safe,  						       &twdr->twcal_row[slot]) {  				__inet_twsk_del_dead_node(tw);  				__inet_twsk_kill(tw, twdr->hashinfo); @@ -489,7 +489,9 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,  restart_rcu:  		rcu_read_lock();  restart: -		sk_nulls_for_each_rcu(sk, node, &head->twchain) { +		sk_nulls_for_each_rcu(sk, node, &head->chain) { +			if (sk->sk_state != TCP_TIME_WAIT) +				continue;  			tw = inet_twsk(sk);  			if ((tw->tw_family != family) ||  				atomic_read(&twsk_net(tw)->count)) @@ -505,7 +507,9 @@ restart:  			}  			rcu_read_unlock(); +			local_bh_disable();  			inet_twsk_deschedule(tw, twdr); +			local_bh_enable();  			inet_twsk_put(tw);  			goto restart_rcu;  		} diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d9bc85751c7..bd5f5928167 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -17,27 +17,16 @@  #include <linux/kernel.h>  #include <linux/mm.h>  #include <linux/net.h> +#include 
<linux/workqueue.h>  #include <net/ip.h>  #include <net/inetpeer.h> +#include <net/secure_seq.h>  /*   *  Theory of operations.   *  We keep one entry for each peer IP address.  The nodes contains long-living   *  information about the peer which doesn't depend on routes. - *  At this moment this information consists only of ID field for the next - *  outgoing IP packet.  This field is incremented with each packet as encoded - *  in inet_getid() function (include/net/inetpeer.h). - *  At the moment of writing this notes identifier of IP packets is generated - *  to be unpredictable using this code only for packets subjected - *  (actually or potentially) to defragmentation.  I.e. DF packets less than - *  PMTU in size uses a constant ID and do not use this code (see - *  ip_select_ident() in include/net/ip.h).   * - *  Route cache entries hold references to our nodes. - *  New cache entries get references via lookup by destination IP address in - *  the avl tree.  The reference is grabbed only when it's needed i.e. only - *  when we try to output IP packet which needs an unpredictable ID (see - *  __ip_select_ident() in net/ipv4/route.c).   *  Nodes are removed only when reference counter goes to 0.   *  When it's happened the node may be removed when a sufficient amount of   *  time has been passed since its last use.  The less-recently-used entry can @@ -54,21 +43,21 @@   *  1.  Nodes may appear in the tree only with the pool lock held.   *  2.  Nodes may disappear from the tree only with the pool lock held   *      AND reference count being 0. - *  3.  Nodes appears and disappears from unused node list only under - *      "inet_peer_unused_lock". - *  4.  Global variable peer_total is modified under the pool lock. - *  5.  struct inet_peer fields modification: + *  3.  Global variable peer_total is modified under the pool lock. + *  4.  struct inet_peer fields modification:   *		avl_left, avl_right, avl_parent, avl_height: pool lock - *		unused: unused node list lock   *		refcnt: atomically against modifications on other CPU;   *		   usually under some other lock to prevent node disappearing - *		dtime: unused node list lock   *		daddr: unchangeable - *		ip_id_count: atomic value (no lock needed)   */  static struct kmem_cache *peer_cachep __read_mostly; +static LIST_HEAD(gc_list); +static const int gc_delay = 60 * HZ; +static struct delayed_work gc_work; +static DEFINE_SPINLOCK(gc_lock); +  #define node_height(x) x->avl_height  #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) @@ -79,23 +68,32 @@ static const struct inet_peer peer_fake_node = {  	.avl_height	= 0  }; -struct inet_peer_base { -	struct inet_peer __rcu *root; -	spinlock_t	lock; -	int		total; -}; +void inet_peer_base_init(struct inet_peer_base *bp) +{ +	bp->root = peer_avl_empty_rcu; +	seqlock_init(&bp->lock); +	bp->flush_seq = ~0U; +	bp->total = 0; +} +EXPORT_SYMBOL_GPL(inet_peer_base_init); -static struct inet_peer_base v4_peers = { -	.root		= peer_avl_empty_rcu, -	.lock		= __SPIN_LOCK_UNLOCKED(v4_peers.lock), -	.total		= 0, -}; +static atomic_t v4_seq = ATOMIC_INIT(0); +static atomic_t v6_seq = ATOMIC_INIT(0); -static struct inet_peer_base v6_peers = { -	.root		= peer_avl_empty_rcu, -	.lock		= __SPIN_LOCK_UNLOCKED(v6_peers.lock), -	.total		= 0, -}; +static atomic_t *inetpeer_seq_ptr(int family) +{ +	return (family == AF_INET ? 
&v4_seq : &v6_seq); +} + +static inline void flush_check(struct inet_peer_base *base, int family) +{ +	atomic_t *fp = inetpeer_seq_ptr(family); + +	if (unlikely(base->flush_seq != atomic_read(fp))) { +		inetpeer_invalidate_tree(base); +		base->flush_seq = atomic_read(fp); +	} +}  #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ @@ -104,20 +102,53 @@ int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries m  					 * aggressively at this stage */  int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */  int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */ -int inet_peer_gc_mintime __read_mostly = 10 * HZ; -int inet_peer_gc_maxtime __read_mostly = 120 * HZ; - -static struct { -	struct list_head	list; -	spinlock_t		lock; -} unused_peers = { -	.list			= LIST_HEAD_INIT(unused_peers.list), -	.lock			= __SPIN_LOCK_UNLOCKED(unused_peers.lock), -}; -static void peer_check_expire(unsigned long dummy); -static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); +static void inetpeer_gc_worker(struct work_struct *work) +{ +	struct inet_peer *p, *n, *c; +	struct list_head list; + +	spin_lock_bh(&gc_lock); +	list_replace_init(&gc_list, &list); +	spin_unlock_bh(&gc_lock); + +	if (list_empty(&list)) +		return; + +	list_for_each_entry_safe(p, n, &list, gc_list) { + +		if (need_resched()) +			cond_resched(); +		c = rcu_dereference_protected(p->avl_left, 1); +		if (c != peer_avl_empty) { +			list_add_tail(&c->gc_list, &list); +			p->avl_left = peer_avl_empty_rcu; +		} + +		c = rcu_dereference_protected(p->avl_right, 1); +		if (c != peer_avl_empty) { +			list_add_tail(&c->gc_list, &list); +			p->avl_right = peer_avl_empty_rcu; +		} + +		n = list_entry(p->gc_list.next, struct inet_peer, gc_list); + +		if (!atomic_read(&p->refcnt)) { +			list_del(&p->gc_list); +			kmem_cache_free(peer_cachep, p); +		} +	} + +	if (list_empty(&list)) +		return; + +	spin_lock_bh(&gc_lock); +	list_splice(&list, &gc_list); +	spin_unlock_bh(&gc_lock); + +	schedule_delayed_work(&gc_work, gc_delay); +}  /* Called from ip_output.c:ip_init  */  void __init inet_initpeers(void) @@ -142,23 +173,7 @@ void __init inet_initpeers(void)  			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,  			NULL); -	/* All the timers, started at system startup tend -	   to synchronize. Perturb it a bit. -	 */ -	peer_periodic_timer.expires = jiffies -		+ net_random() % inet_peer_gc_maxtime -		+ inet_peer_gc_maxtime; -	add_timer(&peer_periodic_timer); -} - -/* Called with or without local BH being disabled. */ -static void unlink_from_unused(struct inet_peer *p) -{ -	if (!list_empty(&p->unused)) { -		spin_lock_bh(&unused_peers.lock); -		list_del_init(&p->unused); -		spin_unlock_bh(&unused_peers.lock); -	} +	INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);  }  static int addr_compare(const struct inetpeer_addr *a, @@ -167,9 +182,9 @@ static int addr_compare(const struct inetpeer_addr *a,  	int i, n = (a->family == AF_INET ? 1 : 4);  	for (i = 0; i < n; i++) { -		if (a->a6[i] == b->a6[i]) +		if (a->addr.a6[i] == b->addr.a6[i])  			continue; -		if (a->a6[i] < b->a6[i]) +		if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])  			return -1;  		return 1;  	} @@ -177,6 +192,9 @@ static int addr_compare(const struct inetpeer_addr *a,  	return 0;  } +#define rcu_deref_locked(X, BASE)				\ +	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) +  /*   * Called with local BH disabled and the pool lock held.   
*/ @@ -187,9 +205,8 @@ static int addr_compare(const struct inetpeer_addr *a,  								\  	stackptr = _stack;					\  	*stackptr++ = &_base->root;				\ -	for (u = rcu_dereference_protected(_base->root,		\ -			lockdep_is_held(&_base->lock));		\ -	     u != peer_avl_empty; ) {				\ +	for (u = rcu_deref_locked(_base->root, _base);		\ +	     u != peer_avl_empty;) {				\  		int cmp = addr_compare(_daddr, &u->daddr);	\  		if (cmp == 0)					\  			break;					\ @@ -198,41 +215,38 @@ static int addr_compare(const struct inetpeer_addr *a,  		else						\  			v = &u->avl_right;			\  		*stackptr++ = v;				\ -		u = rcu_dereference_protected(*v,		\ -			lockdep_is_held(&_base->lock));		\ +		u = rcu_deref_locked(*v, _base);		\  	}							\  	u;							\  })  /* - * Called with rcu_read_lock_bh() + * Called with rcu_read_lock()   * Because we hold no lock against a writer, its quite possible we fall   * in an endless loop.   * But every pointer we follow is guaranteed to be valid thanks to RCU.   * We exit from this function if number of links exceeds PEER_MAXDEPTH   */ -static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, -				       struct inet_peer_base *base) +static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, +				    struct inet_peer_base *base)  { -	struct inet_peer *u = rcu_dereference_bh(base->root); +	struct inet_peer *u = rcu_dereference(base->root);  	int count = 0;  	while (u != peer_avl_empty) {  		int cmp = addr_compare(daddr, &u->daddr);  		if (cmp == 0) {  			/* Before taking a reference, check if this entry was -			 * deleted, unlink_from_pool() sets refcnt=-1 to make -			 * distinction between an unused entry (refcnt=0) and -			 * a freed one. +			 * deleted (refcnt=-1)  			 */ -			if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) +			if (!atomic_add_unless(&u->refcnt, 1, -1))  				u = NULL;  			return u;  		}  		if (cmp == -1) -			u = rcu_dereference_bh(u->avl_left); +			u = rcu_dereference(u->avl_left);  		else -			u = rcu_dereference_bh(u->avl_right); +			u = rcu_dereference(u->avl_right);  		if (unlikely(++count == PEER_MAXDEPTH))  			break;  	} @@ -246,13 +260,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,  	struct inet_peer __rcu **v;				\  	*stackptr++ = &start->avl_left;				\  	v = &start->avl_left;					\ -	for (u = rcu_dereference_protected(*v,			\ -			lockdep_is_held(&base->lock));		\ -	     u->avl_right != peer_avl_empty_rcu; ) {		\ +	for (u = rcu_deref_locked(*v, base);			\ +	     u->avl_right != peer_avl_empty_rcu;) {		\  		v = &u->avl_right;				\  		*stackptr++ = v;				\ -		u = rcu_dereference_protected(*v,		\ -			lockdep_is_held(&base->lock));		\ +		u = rcu_deref_locked(*v, base);			\  	}							\  	u;							\  }) @@ -271,21 +283,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],  	while (stackend > stack) {  		nodep = *--stackend; -		node = rcu_dereference_protected(*nodep, -				lockdep_is_held(&base->lock)); -		l = rcu_dereference_protected(node->avl_left, -				lockdep_is_held(&base->lock)); -		r = rcu_dereference_protected(node->avl_right, -				lockdep_is_held(&base->lock)); +		node = rcu_deref_locked(*nodep, base); +		l = rcu_deref_locked(node->avl_left, base); +		r = rcu_deref_locked(node->avl_right, base);  		lh = node_height(l);  		rh = node_height(r);  		if (lh > rh + 1) { /* l: RH+2 */  			struct inet_peer *ll, *lr, *lrl, *lrr;  			int lrh; -			ll = rcu_dereference_protected(l->avl_left, -				lockdep_is_held(&base->lock)); -			lr = rcu_dereference_protected(l->avl_right, -			
	lockdep_is_held(&base->lock)); +			ll = rcu_deref_locked(l->avl_left, base); +			lr = rcu_deref_locked(l->avl_right, base);  			lrh = node_height(lr);  			if (lrh <= node_height(ll)) {	/* ll: RH+1 */  				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */ @@ -296,10 +303,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],  				l->avl_height = node->avl_height + 1;  				RCU_INIT_POINTER(*nodep, l);  			} else { /* ll: RH, lr: RH+1 */ -				lrl = rcu_dereference_protected(lr->avl_left, -					lockdep_is_held(&base->lock));	/* lrl: RH or RH-1 */ -				lrr = rcu_dereference_protected(lr->avl_right, -					lockdep_is_held(&base->lock));	/* lrr: RH or RH-1 */ +				lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ +				lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */  				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */  				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */  				node->avl_height = rh + 1; /* node: RH+1 */ @@ -314,10 +319,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],  		} else if (rh > lh + 1) { /* r: LH+2 */  			struct inet_peer *rr, *rl, *rlr, *rll;  			int rlh; -			rr = rcu_dereference_protected(r->avl_right, -				lockdep_is_held(&base->lock)); -			rl = rcu_dereference_protected(r->avl_left, -				lockdep_is_held(&base->lock)); +			rr = rcu_deref_locked(r->avl_right, base); +			rl = rcu_deref_locked(r->avl_left, base);  			rlh = node_height(rl);  			if (rlh <= node_height(rr)) {	/* rr: LH+1 */  				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */ @@ -328,10 +331,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],  				r->avl_height = node->avl_height + 1;  				RCU_INIT_POINTER(*nodep, r);  			} else { /* rr: RH, rl: RH+1 */ -				rlr = rcu_dereference_protected(rl->avl_right, -					lockdep_is_held(&base->lock));	/* rlr: LH or LH-1 */ -				rll = rcu_dereference_protected(rl->avl_left, -					lockdep_is_held(&base->lock));	/* rll: LH or LH-1 */ +				rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ +				rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */  				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */  				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */  				node->avl_height = lh + 1; /* node: LH+1 */ @@ -365,217 +366,214 @@ static void inetpeer_free_rcu(struct rcu_head *head)  	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));  } -/* May be called with local BH enabled. */ -static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) +static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, +			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])  { -	int do_free; - -	do_free = 0; - -	spin_lock_bh(&base->lock); -	/* Check the reference counter.  It was artificially incremented by 1 -	 * in cleanup() function to prevent sudden disappearing.  If we can -	 * atomically (because of lockless readers) take this last reference, -	 * it's safe to remove the node and free it later. -	 * We use refcnt=-1 to alert lockless readers this entry is deleted. 
-	 */ -	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { -		struct inet_peer __rcu **stack[PEER_MAXDEPTH]; -		struct inet_peer __rcu ***stackptr, ***delp; -		if (lookup(&p->daddr, stack, base) != p) -			BUG(); -		delp = stackptr - 1; /* *delp[0] == p */ -		if (p->avl_left == peer_avl_empty_rcu) { -			*delp[0] = p->avl_right; -			--stackptr; -		} else { -			/* look for a node to insert instead of p */ -			struct inet_peer *t; -			t = lookup_rightempty(p, base); -			BUG_ON(rcu_dereference_protected(*stackptr[-1], -					lockdep_is_held(&base->lock)) != t); -			**--stackptr = t->avl_left; -			/* t is removed, t->daddr > x->daddr for any -			 * x in p->avl_left subtree. -			 * Put t in the old place of p. */ -			RCU_INIT_POINTER(*delp[0], t); -			t->avl_left = p->avl_left; -			t->avl_right = p->avl_right; -			t->avl_height = p->avl_height; -			BUG_ON(delp[1] != &p->avl_left); -			delp[1] = &t->avl_left; /* was &p->avl_left */ -		} -		peer_avl_rebalance(stack, stackptr, base); -		base->total--; -		do_free = 1; +	struct inet_peer __rcu ***stackptr, ***delp; + +	if (lookup(&p->daddr, stack, base) != p) +		BUG(); +	delp = stackptr - 1; /* *delp[0] == p */ +	if (p->avl_left == peer_avl_empty_rcu) { +		*delp[0] = p->avl_right; +		--stackptr; +	} else { +		/* look for a node to insert instead of p */ +		struct inet_peer *t; +		t = lookup_rightempty(p, base); +		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); +		**--stackptr = t->avl_left; +		/* t is removed, t->daddr > x->daddr for any +		 * x in p->avl_left subtree. +		 * Put t in the old place of p. */ +		RCU_INIT_POINTER(*delp[0], t); +		t->avl_left = p->avl_left; +		t->avl_right = p->avl_right; +		t->avl_height = p->avl_height; +		BUG_ON(delp[1] != &p->avl_left); +		delp[1] = &t->avl_left; /* was &p->avl_left */  	} -	spin_unlock_bh(&base->lock); - -	if (do_free) -		call_rcu_bh(&p->rcu, inetpeer_free_rcu); -	else -		/* The node is used again.  Decrease the reference counter -		 * back.  The loop "cleanup -> unlink_from_unused -		 *   -> unlink_from_pool -> putpeer -> link_to_unused -		 *   -> cleanup (for the same node)" -		 * doesn't really exist because the entry will have a -		 * recent deletion time and will not be cleaned again soon. -		 */ -		inet_putpeer(p); -} - -static struct inet_peer_base *family_to_base(int family) -{ -	return (family == AF_INET ? &v4_peers : &v6_peers); +	peer_avl_rebalance(stack, stackptr, base); +	base->total--; +	call_rcu(&p->rcu, inetpeer_free_rcu);  } -static struct inet_peer_base *peer_to_base(struct inet_peer *p) +/* perform garbage collect on all items stacked during a lookup */ +static int inet_peer_gc(struct inet_peer_base *base, +			struct inet_peer __rcu **stack[PEER_MAXDEPTH], +			struct inet_peer __rcu ***stackptr)  { -	return family_to_base(p->daddr.family); -} +	struct inet_peer *p, *gchead = NULL; +	__u32 delta, ttl; +	int cnt = 0; -/* May be called with local BH enabled. */ -static int cleanup_once(unsigned long ttl) -{ -	struct inet_peer *p = NULL; - -	/* Remove the first entry from the list of unused nodes. */ -	spin_lock_bh(&unused_peers.lock); -	if (!list_empty(&unused_peers.list)) { -		__u32 delta; - -		p = list_first_entry(&unused_peers.list, struct inet_peer, unused); -		delta = (__u32)jiffies - p->dtime; - -		if (delta < ttl) { -			/* Do not prune fresh entries. 
*/ -			spin_unlock_bh(&unused_peers.lock); -			return -1; +	if (base->total >= inet_peer_threshold) +		ttl = 0; /* be aggressive */ +	else +		ttl = inet_peer_maxttl +				- (inet_peer_maxttl - inet_peer_minttl) / HZ * +					base->total / inet_peer_threshold * HZ; +	stackptr--; /* last stack slot is peer_avl_empty */ +	while (stackptr > stack) { +		stackptr--; +		p = rcu_deref_locked(**stackptr, base); +		if (atomic_read(&p->refcnt) == 0) { +			smp_rmb(); +			delta = (__u32)jiffies - p->dtime; +			if (delta >= ttl && +			    atomic_cmpxchg(&p->refcnt, 0, -1) == 0) { +				p->gc_next = gchead; +				gchead = p; +			}  		} - -		list_del_init(&p->unused); - -		/* Grab an extra reference to prevent node disappearing -		 * before unlink_from_pool() call. */ -		atomic_inc(&p->refcnt);  	} -	spin_unlock_bh(&unused_peers.lock); - -	if (p == NULL) -		/* It means that the total number of USED entries has -		 * grown over inet_peer_threshold.  It shouldn't really -		 * happen because of entry limits in route cache. */ -		return -1; - -	unlink_from_pool(p, peer_to_base(p)); -	return 0; +	while ((p = gchead) != NULL) { +		gchead = p->gc_next; +		cnt++; +		unlink_from_pool(p, base, stack); +	} +	return cnt;  } -/* Called with or without local BH being disabled. */ -struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) +struct inet_peer *inet_getpeer(struct inet_peer_base *base, +			       const struct inetpeer_addr *daddr, +			       int create)  {  	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; -	struct inet_peer_base *base = family_to_base(AF_INET);  	struct inet_peer *p; +	unsigned int sequence; +	int invalidated, gccnt = 0; + +	flush_check(base, daddr->family); -	/* Look up for the address quickly, lockless. +	/* Attempt a lockless lookup first.  	 * Because of a concurrent writer, we might not find an existing entry.  	 */ -	rcu_read_lock_bh(); -	p = lookup_rcu_bh(daddr, base); -	rcu_read_unlock_bh(); +	rcu_read_lock(); +	sequence = read_seqbegin(&base->lock); +	p = lookup_rcu(daddr, base); +	invalidated = read_seqretry(&base->lock, sequence); +	rcu_read_unlock(); -	if (p) { -		/* The existing node has been found. -		 * Remove the entry from unused list if it was there. -		 */ -		unlink_from_unused(p); +	if (p)  		return p; -	} + +	/* If no writer did a change during our lookup, we can return early. */ +	if (!create && !invalidated) +		return NULL;  	/* retry an exact lookup, taking the lock before.  	 * At least, nodes should be hot in our cache.  	 */ -	spin_lock_bh(&base->lock); +	write_seqlock_bh(&base->lock); +relookup:  	p = lookup(daddr, stack, base);  	if (p != peer_avl_empty) {  		atomic_inc(&p->refcnt); -		spin_unlock_bh(&base->lock); -		/* Remove the entry from unused list if it was there. */ -		unlink_from_unused(p); +		write_sequnlock_bh(&base->lock);  		return p;  	} +	if (!gccnt) { +		gccnt = inet_peer_gc(base, stack, stackptr); +		if (gccnt && create) +			goto relookup; +	}  	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;  	if (p) {  		p->daddr = *daddr;  		atomic_set(&p->refcnt, 1);  		atomic_set(&p->rid, 0); -		atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); -		p->tcp_ts_stamp = 0; -		INIT_LIST_HEAD(&p->unused); - +		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; +		p->rate_tokens = 0; +		/* 60*HZ is arbitrary, but chosen enough high so that the first +		 * calculation of tokens is at its maximum. +		 */ +		p->rate_last = jiffies - 60*HZ; +		INIT_LIST_HEAD(&p->gc_list);  		/* Link the node. 
*/  		link_to_pool(p, base);  		base->total++;  	} -	spin_unlock_bh(&base->lock); - -	if (base->total >= inet_peer_threshold) -		/* Remove one less-recently-used entry. */ -		cleanup_once(0); +	write_sequnlock_bh(&base->lock);  	return p;  } +EXPORT_SYMBOL_GPL(inet_getpeer); -static int compute_total(void) +void inet_putpeer(struct inet_peer *p)  { -	return v4_peers.total + v6_peers.total; +	p->dtime = (__u32)jiffies; +	smp_mb__before_atomic(); +	atomic_dec(&p->refcnt);  } -EXPORT_SYMBOL_GPL(inet_getpeer); +EXPORT_SYMBOL_GPL(inet_putpeer); -/* Called with local BH disabled. */ -static void peer_check_expire(unsigned long dummy) +/* + *	Check transmit rate limitation for given message. + *	The rate information is held in the inet_peer entries now. + *	This function is generic and could be used for other purposes + *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + *	Note that the same inet_peer fields are modified by functions in + *	route.c too, but these work for packet destinations while xrlim_allow + *	works for icmp destinations. This means the rate limiting information + *	for one "ip object" is shared - and these ICMPs are twice limited: + *	by source and by destination. + * + *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + *			  SHOULD allow setting of rate limits + * + * 	Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)  { -	unsigned long now = jiffies; -	int ttl, total; - -	total = compute_total(); -	if (total >= inet_peer_threshold) -		ttl = inet_peer_minttl; -	else -		ttl = inet_peer_maxttl -				- (inet_peer_maxttl - inet_peer_minttl) / HZ * -					total / inet_peer_threshold * HZ; -	while (!cleanup_once(ttl)) { -		if (jiffies != now) -			break; +	unsigned long now, token; +	bool rc = false; + +	if (!peer) +		return true; + +	token = peer->rate_tokens; +	now = jiffies; +	token += now - peer->rate_last; +	peer->rate_last = now; +	if (token > XRLIM_BURST_FACTOR * timeout) +		token = XRLIM_BURST_FACTOR * timeout; +	if (token >= timeout) { +		token -= timeout; +		rc = true;  	} +	peer->rate_tokens = token; +	return rc; +} +EXPORT_SYMBOL(inet_peer_xrlim_allow); -	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime -	 * interval depending on the total number of entries (more entries, -	 * less interval). 
*/ -	total = compute_total(); -	if (total >= inet_peer_threshold) -		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; -	else -		peer_periodic_timer.expires = jiffies -			+ inet_peer_gc_maxtime -			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * -				total / inet_peer_threshold * HZ; -	add_timer(&peer_periodic_timer); +static void inetpeer_inval_rcu(struct rcu_head *head) +{ +	struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); + +	spin_lock_bh(&gc_lock); +	list_add_tail(&p->gc_list, &gc_list); +	spin_unlock_bh(&gc_lock); + +	schedule_delayed_work(&gc_work, gc_delay);  } -void inet_putpeer(struct inet_peer *p) +void inetpeer_invalidate_tree(struct inet_peer_base *base)  { -	local_bh_disable(); +	struct inet_peer *root; + +	write_seqlock_bh(&base->lock); -	if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) { -		list_add_tail(&p->unused, &unused_peers.list); -		p->dtime = (__u32)jiffies; -		spin_unlock(&unused_peers.lock); +	root = rcu_deref_locked(base->root, base); +	if (root != peer_avl_empty) { +		base->root = peer_avl_empty_rcu; +		base->total = 0; +		call_rcu(&root->gc_rcu, inetpeer_inval_rcu);  	} -	local_bh_enable(); +	write_sequnlock_bh(&base->lock);  } -EXPORT_SYMBOL_GPL(inet_putpeer); +EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 99461f09320..3a83ce5efa8 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,11 +39,30 @@  #include <net/route.h>  #include <net/xfrm.h> +static bool ip_may_fragment(const struct sk_buff *skb) +{ +	return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || +		skb->ignore_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ +	if (skb->len <= mtu) +		return false; + +	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) +		return false; + +	return true; +} + +  static int ip_forward_finish(struct sk_buff *skb)  { -	struct ip_options * opt	= &(IPCB(skb)->opt); +	struct ip_options *opt	= &(IPCB(skb)->opt);  	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); +	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);  	if (unlikely(opt->optlen))  		ip_forward_options(skb); @@ -53,9 +72,14 @@ static int ip_forward_finish(struct sk_buff *skb)  int ip_forward(struct sk_buff *skb)  { +	u32 mtu;  	struct iphdr *iph;	/* Our header */  	struct rtable *rt;	/* Route we use */ -	struct ip_options * opt	= &(IPCB(skb)->opt); +	struct ip_options *opt	= &(IPCB(skb)->opt); + +	/* that should never happen */ +	if (skb->pkt_type != PACKET_HOST) +		goto drop;  	if (skb_warn_if_lro(skb))  		goto drop; @@ -66,9 +90,6 @@ int ip_forward(struct sk_buff *skb)  	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))  		return NET_RX_SUCCESS; -	if (skb->pkt_type != PACKET_HOST) -		goto drop; -  	skb_forward_csum(skb);  	/* @@ -84,14 +105,15 @@ int ip_forward(struct sk_buff *skb)  	rt = skb_rtable(skb); -	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) +	if (opt->is_strictroute && rt->rt_uses_gateway)  		goto sr_failed; -	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && -		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { +	IPCB(skb)->flags |= IPSKB_FORWARDED; +	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); +	if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {  		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);  		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, -			  htonl(dst_mtu(&rt->dst))); +			  htonl(mtu));  		
goto drop;  	} diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e6215bdd96c..ed32313e307 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -20,6 +20,8 @@   *		Patrick McHardy :	LRU queue of frag heads for evictor.   */ +#define pr_fmt(fmt) "IPv4: " fmt +  #include <linux/compiler.h>  #include <linux/module.h>  #include <linux/types.h> @@ -45,6 +47,7 @@  #include <linux/udp.h>  #include <linux/inet.h>  #include <linux/netfilter_ipv4.h> +#include <net/inet_ecn.h>  /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6   * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -70,11 +73,17 @@ struct ipq {  	__be32		daddr;  	__be16		id;  	u8		protocol; +	u8		ecn; /* RFC3168 support */  	int             iif;  	unsigned int    rid;  	struct inet_peer *peer;  }; +static inline u8 ip4_frag_ecn(u8 tos) +{ +	return 1 << (tos & INET_ECN_MASK); +} +  static struct inet_frags ip4_frags;  int ip_frag_nqueues(struct net *net) @@ -84,7 +93,7 @@ int ip_frag_nqueues(struct net *net)  int ip_frag_mem(struct net *net)  { -	return atomic_read(&net->ipv4.frags.mem); +	return sum_frag_mem_limit(&net->ipv4.frags);  }  static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -97,6 +106,7 @@ struct ip4_create_arg {  static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)  { +	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));  	return jhash_3words((__force u32)id << 16 | prot,  			    (__force u32)saddr, (__force u32)daddr,  			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1); @@ -110,38 +120,36 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)  	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);  } -static int ip4_frag_match(struct inet_frag_queue *q, void *a) +static bool ip4_frag_match(struct inet_frag_queue *q, void *a)  {  	struct ipq *qp;  	struct ip4_create_arg *arg = a;  	qp = container_of(q, struct ipq, q);  	return	qp->id == arg->iph->id && -			qp->saddr == arg->iph->saddr && -			qp->daddr == arg->iph->daddr && -			qp->protocol == arg->iph->protocol && -			qp->user == arg->user; -} - -/* Memory Tracking Functions. */ -static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) -{ -	atomic_sub(skb->truesize, &nf->mem); -	kfree_skb(skb); +		qp->saddr == arg->iph->saddr && +		qp->daddr == arg->iph->daddr && +		qp->protocol == arg->iph->protocol && +		qp->user == arg->user;  }  static void ip4_frag_init(struct inet_frag_queue *q, void *a)  {  	struct ipq *qp = container_of(q, struct ipq, q); +	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, +					       frags); +	struct net *net = container_of(ipv4, struct net, ipv4); +  	struct ip4_create_arg *arg = a;  	qp->protocol = arg->iph->protocol;  	qp->id = arg->iph->id; +	qp->ecn = ip4_frag_ecn(arg->iph->tos);  	qp->saddr = arg->iph->saddr;  	qp->daddr = arg->iph->daddr;  	qp->user = arg->user;  	qp->peer = sysctl_ipfrag_max_dist ? 
-		inet_getpeer_v4(arg->iph->saddr, 1) : NULL; +		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;  }  static __inline__ void ip4_frag_free(struct inet_frag_queue *q) @@ -176,7 +184,7 @@ static void ip_evictor(struct net *net)  {  	int evicted; -	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); +	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);  	if (evicted)  		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);  } @@ -204,31 +212,31 @@ static void ip_expire(unsigned long arg)  	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {  		struct sk_buff *head = qp->q.fragments; +		const struct iphdr *iph; +		int err;  		rcu_read_lock();  		head->dev = dev_get_by_index_rcu(net, qp->iif);  		if (!head->dev)  			goto out_rcu_unlock; +		/* skb has no dst, perform route lookup again */ +		iph = ip_hdr(head); +		err = ip_route_input_noref(head, iph->daddr, iph->saddr, +					   iph->tos, head->dev); +		if (err) +			goto out_rcu_unlock; +  		/* -		 * Only search router table for the head fragment, -		 * when defraging timeout at PRE_ROUTING HOOK. +		 * Only an end host needs to send an ICMP +		 * "Fragment Reassembly Timeout" message, per RFC792.  		 */ -		if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { -			const struct iphdr *iph = ip_hdr(head); -			int err = ip_route_input(head, iph->daddr, iph->saddr, -						 iph->tos, head->dev); -			if (unlikely(err)) -				goto out_rcu_unlock; - -			/* -			 * Only an end host needs to send an ICMP -			 * "Fragment Reassembly Timeout" message, per RFC792. -			 */ -			if (skb_rtable(head)->rt_type != RTN_LOCAL) -				goto out_rcu_unlock; +		if (qp->user == IP_DEFRAG_AF_PACKET || +		    ((qp->user >= IP_DEFRAG_CONNTRACK_IN) && +		     (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) && +		     (skb_rtable(head)->rt_type != RTN_LOCAL))) +			goto out_rcu_unlock; -		}  		/* Send an ICMP "Fragment Reassembly Timeout" message. */  		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); @@ -256,14 +264,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)  	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);  	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); -	if (q == NULL) -		goto out_nomem; - +	if (IS_ERR_OR_NULL(q)) { +		inet_frag_maybe_warn_overflow(q, pr_fmt()); +		return NULL; +	}  	return container_of(q, struct ipq, q); - -out_nomem: -	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); -	return NULL;  }  /* Is the fragment too far ahead to be part of ipq? 
*/ @@ -297,6 +302,7 @@ static inline int ip_frag_too_far(struct ipq *qp)  static int ip_frag_reinit(struct ipq *qp)  {  	struct sk_buff *fp; +	unsigned int sum_truesize = 0;  	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {  		atomic_inc(&qp->q.refcnt); @@ -306,9 +312,12 @@ static int ip_frag_reinit(struct ipq *qp)  	fp = qp->q.fragments;  	do {  		struct sk_buff *xp = fp->next; -		frag_kfree_skb(qp->q.net, fp); + +		sum_truesize += fp->truesize; +		kfree_skb(fp);  		fp = xp;  	} while (fp); +	sub_frag_mem_limit(&qp->q, sum_truesize);  	qp->q.last_in = 0;  	qp->q.len = 0; @@ -316,6 +325,7 @@ static int ip_frag_reinit(struct ipq *qp)  	qp->q.fragments = NULL;  	qp->q.fragments_tail = NULL;  	qp->iif = 0; +	qp->ecn = 0;  	return 0;  } @@ -328,6 +338,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)  	int flags, offset;  	int ihl, end;  	int err = -ENOENT; +	u8 ecn;  	if (qp->q.last_in & INET_FRAG_COMPLETE)  		goto err; @@ -339,6 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)  		goto err;  	} +	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);  	offset = ntohs(ip_hdr(skb)->frag_off);  	flags = offset & ~IP_OFFSET;  	offset &= IP_OFFSET; @@ -352,7 +364,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)  	/* Is this the final fragment? */  	if ((flags & IP_MF) == 0) {  		/* If we already have some bits beyond end -		 * or have different end, the segment is corrrupted. +		 * or have different end, the segment is corrupted.  		 */  		if (end < qp->q.len ||  		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) @@ -450,7 +462,8 @@ found:  				qp->q.fragments = next;  			qp->q.meat -= free_it->len; -			frag_kfree_skb(qp->q.net, free_it); +			sub_frag_mem_limit(&qp->q, free_it->truesize); +			kfree_skb(free_it);  		}  	} @@ -472,17 +485,27 @@ found:  	}  	qp->q.stamp = skb->tstamp;  	qp->q.meat += skb->len; -	atomic_add(skb->truesize, &qp->q.net->mem); +	qp->ecn |= ecn; +	add_frag_mem_limit(&qp->q, skb->truesize);  	if (offset == 0)  		qp->q.last_in |= INET_FRAG_FIRST_IN; +	if (ip_hdr(skb)->frag_off & htons(IP_DF) && +	    skb->len + ihl > qp->q.max_size) +		qp->q.max_size = skb->len + ihl; +  	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && -	    qp->q.meat == qp->q.len) -		return ip_frag_reasm(qp, prev, dev); +	    qp->q.meat == qp->q.len) { +		unsigned long orefdst = skb->_skb_refdst; + +		skb->_skb_refdst = 0UL; +		err = ip_frag_reasm(qp, prev, dev); +		skb->_skb_refdst = orefdst; +		return err; +	} -	write_lock(&ip4_frags.lock); -	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); -	write_unlock(&ip4_frags.lock); +	skb_dst_drop(skb); +	inet_frag_lru_move(&qp->q);  	return -EINPROGRESS;  err: @@ -502,9 +525,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,  	int len;  	int ihlen;  	int err; +	int sum_truesize; +	u8 ecn;  	ipq_kill(qp); +	ecn = ip_frag_ecn_table[qp->ecn]; +	if (unlikely(ecn == 0xff)) { +		err = -EINVAL; +		goto out_fail; +	}  	/* Make the one we just received the head. */  	if (prev) {  		head = prev->next; @@ -520,7 +550,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,  		skb_morph(head, qp->q.fragments);  		head->next = qp->q.fragments->next; -		kfree_skb(qp->q.fragments); +		consume_skb(qp->q.fragments);  		qp->q.fragments = head;  	} @@ -536,7 +566,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,  		goto out_oversize;  	/* Head of list must not be cloned. 
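The new qp->ecn field is a four-bit set recording which ECN codepoints have appeared among a datagram's fragments; ip_frag_ecn_table (defined elsewhere, not shown in this hunk) then folds that set into the codepoint stamped on the reassembled header, or 0xff for combinations that RFC 3168/6040 treat as invalid. A sketch of just the bitset bookkeeping:

#include <stdint.h>

#define INET_ECN_MASK	3	/* low two bits of the TOS byte */

/* bit 0 = Not-ECT, bit 1 = ECT(1), bit 2 = ECT(0), bit 3 = CE */
static inline uint8_t ip4_frag_ecn_sketch(uint8_t tos)
{
	return 1 << (tos & INET_ECN_MASK);
}

/* per-datagram accumulation, mirroring qp->ecn |= ip4_frag_ecn(tos) */
static inline uint8_t reasm_ecn_accumulate(uint8_t seen, uint8_t tos)
{
	return seen | ip4_frag_ecn_sketch(tos);
}
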
*/ -	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(head, GFP_ATOMIC))  		goto out_nomem;  	/* If the first fragment is fragmented itself, we split @@ -552,51 +582,65 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,  		head->next = clone;  		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;  		skb_frag_list_init(head); -		for (i=0; i<skb_shinfo(head)->nr_frags; i++) -			plen += skb_shinfo(head)->frags[i].size; +		for (i = 0; i < skb_shinfo(head)->nr_frags; i++) +			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);  		clone->len = clone->data_len = head->data_len - plen;  		head->data_len -= clone->len;  		head->len -= clone->len;  		clone->csum = 0;  		clone->ip_summed = head->ip_summed; -		atomic_add(clone->truesize, &qp->q.net->mem); +		add_frag_mem_limit(&qp->q, clone->truesize);  	} -	skb_shinfo(head)->frag_list = head->next;  	skb_push(head, head->data - skb_network_header(head)); -	for (fp=head->next; fp; fp = fp->next) { -		head->data_len += fp->len; -		head->len += fp->len; +	sum_truesize = head->truesize; +	for (fp = head->next; fp;) { +		bool headstolen; +		int delta; +		struct sk_buff *next = fp->next; + +		sum_truesize += fp->truesize;  		if (head->ip_summed != fp->ip_summed)  			head->ip_summed = CHECKSUM_NONE;  		else if (head->ip_summed == CHECKSUM_COMPLETE)  			head->csum = csum_add(head->csum, fp->csum); -		head->truesize += fp->truesize; + +		if (skb_try_coalesce(head, fp, &headstolen, &delta)) { +			kfree_skb_partial(fp, headstolen); +		} else { +			if (!skb_shinfo(head)->frag_list) +				skb_shinfo(head)->frag_list = fp; +			head->data_len += fp->len; +			head->len += fp->len; +			head->truesize += fp->truesize; +		} +		fp = next;  	} -	atomic_sub(head->truesize, &qp->q.net->mem); +	sub_frag_mem_limit(&qp->q, sum_truesize);  	head->next = NULL;  	head->dev = dev;  	head->tstamp = qp->q.stamp; +	IPCB(head)->frag_max_size = qp->q.max_size;  	iph = ip_hdr(head); -	iph->frag_off = 0; +	/* max_size != 0 implies at least one fragment had IP_DF set */ +	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;  	iph->tot_len = htons(len); +	iph->tos |= ecn;  	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);  	qp->q.fragments = NULL;  	qp->q.fragments_tail = NULL;  	return 0;  out_nomem: -	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " -			      "queue %p\n", qp); +	LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), +		       qp);  	err = -ENOMEM;  	goto out_fail;  out_oversize: -	if (net_ratelimit()) -		printk(KERN_INFO "Oversized IP packet from %pI4.\n", -			&qp->saddr); +	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);  out_fail:  	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);  	return err; @@ -612,8 +656,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)  	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);  	/* Start by cleaning up the memory. 
*/ -	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) -		ip_evictor(net); +	ip_evictor(net);  	/* Lookup (or create) queue header */  	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { @@ -634,6 +677,41 @@ int ip_defrag(struct sk_buff *skb, u32 user)  }  EXPORT_SYMBOL(ip_defrag); +struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) +{ +	struct iphdr iph; +	u32 len; + +	if (skb->protocol != htons(ETH_P_IP)) +		return skb; + +	if (!skb_copy_bits(skb, 0, &iph, sizeof(iph))) +		return skb; + +	if (iph.ihl < 5 || iph.version != 4) +		return skb; + +	len = ntohs(iph.tot_len); +	if (skb->len < len || len < (iph.ihl * 4)) +		return skb; + +	if (ip_is_fragment(&iph)) { +		skb = skb_share_check(skb, GFP_ATOMIC); +		if (skb) { +			if (!pskb_may_pull(skb, iph.ihl*4)) +				return skb; +			if (pskb_trim_rcsum(skb, len)) +				return skb; +			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); +			if (ip_defrag(skb, user)) +				return NULL; +			skb_clear_hash(skb); +		} +	} +	return skb; +} +EXPORT_SYMBOL(ip_check_defrag); +  #ifdef CONFIG_SYSCTL  static int zero; @@ -695,9 +773,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)  		table[0].data = &net->ipv4.frags.high_thresh;  		table[1].data = &net->ipv4.frags.low_thresh;  		table[2].data = &net->ipv4.frags.timeout; + +		/* Don't export sysctls to unprivileged users */ +		if (net->user_ns != &init_user_ns) +			table[0].procname = NULL;  	} -	hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); +	hdr = register_net_sysctl(net, "net/ipv4", table);  	if (hdr == NULL)  		goto err_reg; @@ -722,7 +804,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)  static void ip4_frags_ctl_register(void)  { -	register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); +	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);  }  #else  static inline int ip4_frags_ns_ctl_register(struct net *net) @@ -741,14 +823,22 @@ static inline void ip4_frags_ctl_register(void)  static int __net_init ipv4_frags_init_net(struct net *net)  { -	/* -	 * Fragment cache limits. We will commit 256K at one time. Should we -	 * cross that limit we will prune down to 192K. This should cope with -	 * even the most extreme cases without allowing an attacker to -	 * measurably harm machine performance. +	/* Fragment cache limits. +	 * +	 * The fragment memory accounting code, (tries to) account for +	 * the real memory usage, by measuring both the size of frag +	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) +	 * and the SKB's truesize. +	 * +	 * A 64K fragment consumes 129736 bytes (44*2944)+200 +	 * (1500 truesize == 2944, sizeof(struct ipq) == 200) +	 * +	 * We will commit 4MB at one time. Should we cross that limit +	 * we will prune down to 3MB, making room for approx 8 big 64K +	 * fragments 8x128k.  	 */ -	net->ipv4.frags.high_thresh = 256 * 1024; -	net->ipv4.frags.low_thresh = 192 * 1024; +	net->ipv4.frags.high_thresh = 4 * 1024 * 1024; +	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;  	/*  	 * Important NOTE! Fragment queue must be destroyed before MSL expires.  	 
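The figures in the new threshold comment above can be cross-checked directly; the inputs (44 fragments per 64KB datagram at a 1500-byte MTU, 2944 bytes of truesize per skb, 200 bytes for struct ipq) are taken from that comment rather than measured here:

#include <stdio.h>

int main(void)
{
	unsigned long frags    = 44;	/* ~64KB datagram, 1500-byte MTU */
	unsigned long truesize = 2944;	/* skb truesize of a 1500-byte frame */
	unsigned long ipq      = 200;	/* sizeof(struct ipq), per the comment */
	unsigned long per_dgram = frags * truesize + ipq;

	printf("one 64KB datagram: %lu bytes\n", per_dgram);	/* 129736 */
	printf("datagrams under the 4MB high_thresh: ~%lu\n",
	       4UL * 1024 * 1024 / per_dgram);			/* ~32 */
	printf("high - low threshold headroom: %lu bytes (8 x 128KB)\n",
	       4UL * 1024 * 1024 - 3UL * 1024 * 1024);		/* 1MB */
	return 0;
}
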
* RFC791 is wrong proposing to prolongate timer each fragment arrival diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 258c98d5fa7..9b842544aea 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -10,6 +10,8 @@   *   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/capability.h>  #include <linux/module.h>  #include <linux/types.h> @@ -35,7 +37,7 @@  #include <net/ip.h>  #include <net/icmp.h>  #include <net/protocol.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h>  #include <net/arp.h>  #include <net/checksum.h>  #include <net/dsfield.h> @@ -46,7 +48,7 @@  #include <net/rtnetlink.h>  #include <net/gre.h> -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  #include <net/ipv6.h>  #include <net/ip6_fib.h>  #include <net/ip6_route.h> @@ -65,7 +67,7 @@     it is infeasible task. The most general solutions would be     to keep skb->encapsulation counter (sort of local ttl),     and silently drop packet when it expires. It is a good -   solution, but it supposes maintaing new variable in ALL +   solution, but it supposes maintaining new variable in ALL     skb, even if no tunneling is used.     Current solution: xmit_recursion breaks dead loops. This is a percpu @@ -91,14 +93,14 @@     One of them is to parse packet trying to detect inner encapsulation     made by our node. It is difficult or even impossible, especially, -   taking into account fragmentation. TO be short, tt is not solution at all. +   taking into account fragmentation. TO be short, ttl is not solution at all.     Current solution: The solution was UNEXPECTEDLY SIMPLE.     We force DF flag on tunnels with preconfigured hop limit,     that is ALL. :-) Well, it does not remove the problem completely,     but exponential growth of network traffic is changed to linear     (branches, that exceed pmtu are pruned) and tunnel mtu -   fastly degrades to value <68, where looping stops. +   rapidly degrades to value <68, where looping stops.     Yes, it is not good if there exists a router in the loop,     which does not force DF, even when encapsulating packets have DF set.     But it is not our problem! Nobody could accuse us, we made @@ -106,399 +108,54 @@     fatal route to network, even if it were you who configured     fatal static route: you are innocent. :-) - - -   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain -   practically identical code. It would be good to glue them -   together, but it is not very evident, how to make them modular. -   sit is integral part of IPv6, ipip and gre are naturally modular. -   We could extract common parts (hash table, ioctl etc) -   to a separate module (ip_tunnel.c). -     Alexey Kuznetsov.   */ +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); +  static struct rtnl_link_ops ipgre_link_ops __read_mostly;  static int ipgre_tunnel_init(struct net_device *dev); -static void ipgre_tunnel_setup(struct net_device *dev); -static int ipgre_tunnel_bind_dev(struct net_device *dev); - -/* Fallback tunnel: no source, no destination, no key, no options */ - -#define HASH_SIZE  16  static int ipgre_net_id __read_mostly; -struct ipgre_net { -	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; - -	struct net_device *fb_tunnel_dev; -}; - -/* Tunnel hash table */ - -/* -   4 hash tables: - -   3: (remote,local) -   2: (remote,*) -   1: (*,local) -   0: (*,*) - -   We require exact key match i.e. 
if a key is present in packet -   it will match only tunnel with the same key; if it is not present, -   it will match only keyless tunnel. - -   All keysless packets, if not matched configured keyless tunnels -   will match fallback tunnel. - */ - -#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) - -#define tunnels_r_l	tunnels[3] -#define tunnels_r	tunnels[2] -#define tunnels_l	tunnels[1] -#define tunnels_wc	tunnels[0] -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -#define for_each_ip_tunnel_rcu(start) \ -	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - -/* often modified stats are per cpu, other are shared (netdev->stats) */ -struct pcpu_tstats { -	unsigned long	rx_packets; -	unsigned long	rx_bytes; -	unsigned long	tx_packets; -	unsigned long	tx_bytes; -}; - -static struct net_device_stats *ipgre_get_stats(struct net_device *dev) -{ -	struct pcpu_tstats sum = { 0 }; -	int i; - -	for_each_possible_cpu(i) { -		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - -		sum.rx_packets += tstats->rx_packets; -		sum.rx_bytes   += tstats->rx_bytes; -		sum.tx_packets += tstats->tx_packets; -		sum.tx_bytes   += tstats->tx_bytes; -	} -	dev->stats.rx_packets = sum.rx_packets; -	dev->stats.rx_bytes   = sum.rx_bytes; -	dev->stats.tx_packets = sum.tx_packets; -	dev->stats.tx_bytes   = sum.tx_bytes; -	return &dev->stats; -} - -/* Given src, dst and key, find appropriate for input tunnel. */ - -static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, -					      __be32 remote, __be32 local, -					      __be32 key, __be16 gre_proto) -{ -	struct net *net = dev_net(dev); -	int link = dev->ifindex; -	unsigned int h0 = HASH(remote); -	unsigned int h1 = HASH(key); -	struct ip_tunnel *t, *cand = NULL; -	struct ipgre_net *ign = net_generic(net, ipgre_net_id); -	int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 
-		       ARPHRD_ETHER : ARPHRD_IPGRE; -	int score, cand_score = 4; - -	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { -		if (local != t->parms.iph.saddr || -		    remote != t->parms.iph.daddr || -		    key != t->parms.i_key || -		    !(t->dev->flags & IFF_UP)) -			continue; - -		if (t->dev->type != ARPHRD_IPGRE && -		    t->dev->type != dev_type) -			continue; - -		score = 0; -		if (t->parms.link != link) -			score |= 1; -		if (t->dev->type != dev_type) -			score |= 2; -		if (score == 0) -			return t; - -		if (score < cand_score) { -			cand = t; -			cand_score = score; -		} -	} - -	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { -		if (remote != t->parms.iph.daddr || -		    key != t->parms.i_key || -		    !(t->dev->flags & IFF_UP)) -			continue; - -		if (t->dev->type != ARPHRD_IPGRE && -		    t->dev->type != dev_type) -			continue; - -		score = 0; -		if (t->parms.link != link) -			score |= 1; -		if (t->dev->type != dev_type) -			score |= 2; -		if (score == 0) -			return t; - -		if (score < cand_score) { -			cand = t; -			cand_score = score; -		} -	} - -	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) { -		if ((local != t->parms.iph.saddr && -		     (local != t->parms.iph.daddr || -		      !ipv4_is_multicast(local))) || -		    key != t->parms.i_key || -		    !(t->dev->flags & IFF_UP)) -			continue; - -		if (t->dev->type != ARPHRD_IPGRE && -		    t->dev->type != dev_type) -			continue; - -		score = 0; -		if (t->parms.link != link) -			score |= 1; -		if (t->dev->type != dev_type) -			score |= 2; -		if (score == 0) -			return t; - -		if (score < cand_score) { -			cand = t; -			cand_score = score; -		} -	} - -	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) { -		if (t->parms.i_key != key || -		    !(t->dev->flags & IFF_UP)) -			continue; - -		if (t->dev->type != ARPHRD_IPGRE && -		    t->dev->type != dev_type) -			continue; - -		score = 0; -		if (t->parms.link != link) -			score |= 1; -		if (t->dev->type != dev_type) -			score |= 2; -		if (score == 0) -			return t; - -		if (score < cand_score) { -			cand = t; -			cand_score = score; -		} -	} - -	if (cand != NULL) -		return cand; - -	dev = ign->fb_tunnel_dev; -	if (dev->flags & IFF_UP) -		return netdev_priv(dev); +static int gre_tap_net_id __read_mostly; -	return NULL; -} - -static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, -		struct ip_tunnel_parm *parms) -{ -	__be32 remote = parms->iph.daddr; -	__be32 local = parms->iph.saddr; -	__be32 key = parms->i_key; -	unsigned int h = HASH(key); -	int prio = 0; - -	if (local) -		prio |= 1; -	if (remote && !ipv4_is_multicast(remote)) { -		prio |= 2; -		h ^= HASH(remote); -	} - -	return &ign->tunnels[prio][h]; -} - -static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, -		struct ip_tunnel *t) -{ -	return __ipgre_bucket(ign, &t->parms); -} - -static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) +static int ipgre_err(struct sk_buff *skb, u32 info, +		     const struct tnl_ptk_info *tpi)  { -	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); - -	rcu_assign_pointer(t->next, rtnl_dereference(*tp)); -	rcu_assign_pointer(*tp, t); -} -static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) -{ -	struct ip_tunnel __rcu **tp; -	struct ip_tunnel *iter; - -	for (tp = ipgre_bucket(ign, t); -	     (iter = rtnl_dereference(*tp)) != NULL; -	     tp = &iter->next) { -		if (t == iter) { -			rcu_assign_pointer(*tp, t->next); -			break; -		} -	} -} - -static struct ip_tunnel *ipgre_tunnel_find(struct net *net, -					   struct ip_tunnel_parm 
*parms, -					   int type) -{ -	__be32 remote = parms->iph.daddr; -	__be32 local = parms->iph.saddr; -	__be32 key = parms->i_key; -	int link = parms->link; -	struct ip_tunnel *t; -	struct ip_tunnel __rcu **tp; -	struct ipgre_net *ign = net_generic(net, ipgre_net_id); - -	for (tp = __ipgre_bucket(ign, parms); -	     (t = rtnl_dereference(*tp)) != NULL; -	     tp = &t->next) -		if (local == t->parms.iph.saddr && -		    remote == t->parms.iph.daddr && -		    key == t->parms.i_key && -		    link == t->parms.link && -		    type == t->dev->type) -			break; - -	return t; -} - -static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, -		struct ip_tunnel_parm *parms, int create) -{ -	struct ip_tunnel *t, *nt; -	struct net_device *dev; -	char name[IFNAMSIZ]; -	struct ipgre_net *ign = net_generic(net, ipgre_net_id); - -	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE); -	if (t || !create) -		return t; - -	if (parms->name[0]) -		strlcpy(name, parms->name, IFNAMSIZ); -	else -		strcpy(name, "gre%d"); - -	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); -	if (!dev) -		return NULL; - -	dev_net_set(dev, net); - -	if (strchr(name, '%')) { -		if (dev_alloc_name(dev, name) < 0) -			goto failed_free; -	} - -	nt = netdev_priv(dev); -	nt->parms = *parms; -	dev->rtnl_link_ops = &ipgre_link_ops; - -	dev->mtu = ipgre_tunnel_bind_dev(dev); - -	if (register_netdevice(dev) < 0) -		goto failed_free; - -	dev_hold(dev); -	ipgre_tunnel_link(ign, nt); -	return nt; - -failed_free: -	free_netdev(dev); -	return NULL; -} - -static void ipgre_tunnel_uninit(struct net_device *dev) -{ -	struct net *net = dev_net(dev); -	struct ipgre_net *ign = net_generic(net, ipgre_net_id); - -	ipgre_tunnel_unlink(ign, netdev_priv(dev)); -	dev_put(dev); -} - - -static void ipgre_err(struct sk_buff *skb, u32 info) -{ +	/* All the routers (except for Linux) return only +	   8 bytes of packet payload. It means, that precise relaying of +	   ICMP in the real Internet is absolutely infeasible. -/* All the routers (except for Linux) return only -   8 bytes of packet payload. It means, that precise relaying of -   ICMP in the real Internet is absolutely infeasible. +	   Moreover, Cisco "wise men" put GRE key to the third word +	   in GRE header. It makes impossible maintaining even soft +	   state for keyed GRE tunnels with enabled checksum. Tell +	   them "thank you". -   Moreover, Cisco "wise men" put GRE key to the third word -   in GRE header. It makes impossible maintaining even soft state for keyed -   GRE tunnels with enabled checksum. Tell them "thank you". - -   Well, I wonder, rfc1812 was written by Cisco employee, -   what the hell these idiots break standrads established -   by themself??? - */ - -	struct iphdr *iph = (struct iphdr *)skb->data; -	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2)); -	int grehlen = (iph->ihl<<2) + 4; +	   Well, I wonder, rfc1812 was written by Cisco employee, +	   what the hell these idiots break standards established +	   by themselves??? 
+	   */ +	struct net *net = dev_net(skb->dev); +	struct ip_tunnel_net *itn; +	const struct iphdr *iph;  	const int type = icmp_hdr(skb)->type;  	const int code = icmp_hdr(skb)->code;  	struct ip_tunnel *t; -	__be16 flags; - -	flags = p[0]; -	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { -		if (flags&(GRE_VERSION|GRE_ROUTING)) -			return; -		if (flags&GRE_KEY) { -			grehlen += 4; -			if (flags&GRE_CSUM) -				grehlen += 4; -		} -	} - -	/* If only 8 bytes returned, keyed message will be dropped here */ -	if (skb_headlen(skb) < grehlen) -		return;  	switch (type) {  	default:  	case ICMP_PARAMETERPROB: -		return; +		return PACKET_RCVD;  	case ICMP_DEST_UNREACH:  		switch (code) {  		case ICMP_SR_FAILED:  		case ICMP_PORT_UNREACH:  			/* Impossible event. */ -			return; -		case ICMP_FRAG_NEEDED: -			/* Soft state for pmtu is maintained by IP core. */ -			return; +			return PACKET_RCVD;  		default:  			/* All others are translated to HOST_UNREACH.  			   rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -509,627 +166,173 @@ static void ipgre_err(struct sk_buff *skb, u32 info)  		break;  	case ICMP_TIME_EXCEEDED:  		if (code != ICMP_EXC_TTL) -			return; +			return PACKET_RCVD; +		break; + +	case ICMP_REDIRECT:  		break;  	} -	rcu_read_lock(); -	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, -				flags & GRE_KEY ? -				*(((__be32 *)p) + (grehlen / 4) - 1) : 0, -				p[1]); -	if (t == NULL || t->parms.iph.daddr == 0 || +	if (tpi->proto == htons(ETH_P_TEB)) +		itn = net_generic(net, gre_tap_net_id); +	else +		itn = net_generic(net, ipgre_net_id); + +	iph = (const struct iphdr *)(icmp_hdr(skb) + 1); +	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, +			     iph->daddr, iph->saddr, tpi->key); + +	if (t == NULL) +		return PACKET_REJECT; + +	if (t->parms.iph.daddr == 0 ||  	    ipv4_is_multicast(t->parms.iph.daddr)) -		goto out; +		return PACKET_RCVD;  	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) -		goto out; +		return PACKET_RCVD;  	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))  		t->err_count++;  	else  		t->err_count = 1;  	t->err_time = jiffies; -out: -	rcu_read_unlock(); +	return PACKET_RCVD;  } -static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)  { -	if (INET_ECN_is_ce(iph->tos)) { -		if (skb->protocol == htons(ETH_P_IP)) { -			IP_ECN_set_ce(ip_hdr(skb)); -		} else if (skb->protocol == htons(ETH_P_IPV6)) { -			IP6_ECN_set_ce(ipv6_hdr(skb)); -		} -	} -} - -static inline u8 -ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) -{ -	u8 inner = 0; -	if (skb->protocol == htons(ETH_P_IP)) -		inner = old_iph->tos; -	else if (skb->protocol == htons(ETH_P_IPV6)) -		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); -	return INET_ECN_encapsulate(tos, inner); -} - -static int ipgre_rcv(struct sk_buff *skb) -{ -	struct iphdr *iph; -	u8     *h; -	__be16    flags; -	__sum16   csum = 0; -	__be32 key = 0; -	u32    seqno = 0; +	struct net *net = dev_net(skb->dev); +	struct ip_tunnel_net *itn; +	const struct iphdr *iph;  	struct ip_tunnel *tunnel; -	int    offset = 4; -	__be16 gre_proto; -	if (!pskb_may_pull(skb, 16)) -		goto drop_nolock; +	if (tpi->proto == htons(ETH_P_TEB)) +		itn = net_generic(net, gre_tap_net_id); +	else +		itn = net_generic(net, ipgre_net_id);  	iph = ip_hdr(skb); -	h = skb->data; -	flags = *(__be16*)h; +	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, +				  iph->saddr, iph->daddr, 
tpi->key); -	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { -		/* - Version must be 0. -		   - We do not support routing headers. -		 */ -		if (flags&(GRE_VERSION|GRE_ROUTING)) -			goto drop_nolock; - -		if (flags&GRE_CSUM) { -			switch (skb->ip_summed) { -			case CHECKSUM_COMPLETE: -				csum = csum_fold(skb->csum); -				if (!csum) -					break; -				/* fall through */ -			case CHECKSUM_NONE: -				skb->csum = 0; -				csum = __skb_checksum_complete(skb); -				skb->ip_summed = CHECKSUM_COMPLETE; -			} -			offset += 4; -		} -		if (flags&GRE_KEY) { -			key = *(__be32*)(h + offset); -			offset += 4; -		} -		if (flags&GRE_SEQ) { -			seqno = ntohl(*(__be32*)(h + offset)); -			offset += 4; -		} +	if (tunnel) { +		skb_pop_mac_header(skb); +		ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); +		return PACKET_RCVD;  	} - -	gre_proto = *(__be16 *)(h + 2); - -	rcu_read_lock(); -	if ((tunnel = ipgre_tunnel_lookup(skb->dev, -					  iph->saddr, iph->daddr, key, -					  gre_proto))) { -		struct pcpu_tstats *tstats; - -		secpath_reset(skb); - -		skb->protocol = gre_proto; -		/* WCCP version 1 and 2 protocol decoding. -		 * - Change protocol to IP -		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header -		 */ -		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { -			skb->protocol = htons(ETH_P_IP); -			if ((*(h + offset) & 0xF0) != 0x40) -				offset += 4; -		} - -		skb->mac_header = skb->network_header; -		__pskb_pull(skb, offset); -		skb_postpull_rcsum(skb, skb_transport_header(skb), offset); -		skb->pkt_type = PACKET_HOST; -#ifdef CONFIG_NET_IPGRE_BROADCAST -		if (ipv4_is_multicast(iph->daddr)) { -			/* Looped back packet, drop it! */ -			if (rt_is_output_route(skb_rtable(skb))) -				goto drop; -			tunnel->dev->stats.multicast++; -			skb->pkt_type = PACKET_BROADCAST; -		} -#endif - -		if (((flags&GRE_CSUM) && csum) || -		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { -			tunnel->dev->stats.rx_crc_errors++; -			tunnel->dev->stats.rx_errors++; -			goto drop; -		} -		if (tunnel->parms.i_flags&GRE_SEQ) { -			if (!(flags&GRE_SEQ) || -			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { -				tunnel->dev->stats.rx_fifo_errors++; -				tunnel->dev->stats.rx_errors++; -				goto drop; -			} -			tunnel->i_seqno = seqno + 1; -		} - -		/* Warning: All skb pointers will be invalidated! 
*/ -		if (tunnel->dev->type == ARPHRD_ETHER) { -			if (!pskb_may_pull(skb, ETH_HLEN)) { -				tunnel->dev->stats.rx_length_errors++; -				tunnel->dev->stats.rx_errors++; -				goto drop; -			} - -			iph = ip_hdr(skb); -			skb->protocol = eth_type_trans(skb, tunnel->dev); -			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); -		} - -		tstats = this_cpu_ptr(tunnel->dev->tstats); -		tstats->rx_packets++; -		tstats->rx_bytes += skb->len; - -		__skb_tunnel_rx(skb, tunnel->dev); - -		skb_reset_network_header(skb); -		ipgre_ecn_decapsulate(iph, skb); - -		netif_rx(skb); - -		rcu_read_unlock(); -		return 0; -	} -	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - -drop: -	rcu_read_unlock(); -drop_nolock: -	kfree_skb(skb); -	return 0; +	return PACKET_REJECT;  } -static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, +		       const struct iphdr *tnl_params, +		       __be16 proto)  {  	struct ip_tunnel *tunnel = netdev_priv(dev); -	struct pcpu_tstats *tstats; -	struct iphdr  *old_iph = ip_hdr(skb); -	struct iphdr  *tiph; -	u8     tos; -	__be16 df; -	struct rtable *rt;     			/* Route to the other host */ -	struct net_device *tdev;		/* Device to other host */ -	struct iphdr  *iph;			/* Our new IP header */ -	unsigned int max_headroom;		/* The extra header space needed */ -	int    gre_hlen; -	__be32 dst; -	int    mtu; - -	if (dev->type == ARPHRD_ETHER) -		IPCB(skb)->flags = 0; - -	if (dev->header_ops && dev->type == ARPHRD_IPGRE) { -		gre_hlen = 0; -		tiph = (struct iphdr *)skb->data; -	} else { -		gre_hlen = tunnel->hlen; -		tiph = &tunnel->parms.iph; -	} - -	if ((dst = tiph->daddr) == 0) { -		/* NBMA tunnel */ - -		if (skb_dst(skb) == NULL) { -			dev->stats.tx_fifo_errors++; -			goto tx_error; -		} - -		if (skb->protocol == htons(ETH_P_IP)) { -			rt = skb_rtable(skb); -			if ((dst = rt->rt_gateway) == 0) -				goto tx_error_icmp; -		} -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -		else if (skb->protocol == htons(ETH_P_IPV6)) { -			struct in6_addr *addr6; -			int addr_type; -			struct neighbour *neigh = skb_dst(skb)->neighbour; - -			if (neigh == NULL) -				goto tx_error; +	struct tnl_ptk_info tpi; -			addr6 = (struct in6_addr *)&neigh->primary_key; -			addr_type = ipv6_addr_type(addr6); +	tpi.flags = tunnel->parms.o_flags; +	tpi.proto = proto; +	tpi.key = tunnel->parms.o_key; +	if (tunnel->parms.o_flags & TUNNEL_SEQ) +		tunnel->o_seqno++; +	tpi.seq = htonl(tunnel->o_seqno); -			if (addr_type == IPV6_ADDR_ANY) { -				addr6 = &ipv6_hdr(skb)->daddr; -				addr_type = ipv6_addr_type(addr6); -			} +	/* Push GRE header. 
*/ +	gre_build_header(skb, &tpi, tunnel->hlen); -			if ((addr_type & IPV6_ADDR_COMPATv4) == 0) -				goto tx_error_icmp; - -			dst = addr6->s6_addr32[3]; -		} -#endif -		else -			goto tx_error; -	} - -	tos = tiph->tos; -	if (tos == 1) { -		tos = 0; -		if (skb->protocol == htons(ETH_P_IP)) -			tos = old_iph->tos; -		else if (skb->protocol == htons(ETH_P_IPV6)) -			tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); -	} - -	{ -		struct flowi fl = { -			.oif = tunnel->parms.link, -			.fl4_dst = dst, -			.fl4_src = tiph->saddr, -			.fl4_tos = RT_TOS(tos), -			.fl_gre_key = tunnel->parms.o_key -		}; -		if (ip_route_output_key(dev_net(dev), &rt, &fl)) { -			dev->stats.tx_carrier_errors++; -			goto tx_error; -		} -	} -	tdev = rt->dst.dev; - -	if (tdev == dev) { -		ip_rt_put(rt); -		dev->stats.collisions++; -		goto tx_error; -	} - -	df = tiph->frag_off; -	if (df) -		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; -	else -		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - -	if (skb_dst(skb)) -		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - -	if (skb->protocol == htons(ETH_P_IP)) { -		df |= (old_iph->frag_off&htons(IP_DF)); +	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); +} -		if ((old_iph->frag_off&htons(IP_DF)) && -		    mtu < ntohs(old_iph->tot_len)) { -			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); -			ip_rt_put(rt); -			goto tx_error; -		} -	} -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -	else if (skb->protocol == htons(ETH_P_IPV6)) { -		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); - -		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { -			if ((tunnel->parms.iph.daddr && -			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) || -			    rt6->rt6i_dst.plen == 128) { -				rt6->rt6i_flags |= RTF_MODIFIED; -				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; -			} -		} +static netdev_tx_t ipgre_xmit(struct sk_buff *skb, +			      struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	const struct iphdr *tnl_params; -		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { -			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -			ip_rt_put(rt); -			goto tx_error; -		} -	} -#endif +	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); +	if (IS_ERR(skb)) +		goto out; -	if (tunnel->err_count > 0) { -		if (time_before(jiffies, -				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { -			tunnel->err_count--; +	if (dev->header_ops) { +		/* Need space for new headers */ +		if (skb_cow_head(skb, dev->needed_headroom - +				      (tunnel->hlen + sizeof(struct iphdr)))) +			goto free_skb; -			dst_link_failure(skb); -		} else -			tunnel->err_count = 0; -	} +		tnl_params = (const struct iphdr *)skb->data; -	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; - -	if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| -	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { -		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); -		if (max_headroom > dev->needed_headroom) -			dev->needed_headroom = max_headroom; -		if (!new_skb) { -			ip_rt_put(rt); -			dev->stats.tx_dropped++; -			dev_kfree_skb(skb); -			return NETDEV_TX_OK; -		} -		if (skb->sk) -			skb_set_owner_w(new_skb, skb->sk); -		dev_kfree_skb(skb); -		skb = new_skb; -		old_iph = ip_hdr(skb); -	} +		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing +		 * to gre header. 
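The tnl_ptk_info filled in by __gre_xmit() above maps onto the GRE wire format of RFC 2784/2890: a four-byte base header (flags plus protocol), followed, when the matching flag bit is set, by checksum+reserved, key, and sequence number, in that order. A hedged sketch of that layout and of the header-length rule (the real builders are gre_build_header() and ip_gre_calc_hlen() in the shared GRE/tunnel code, not in this file):

#include <stdint.h>

/* Base GRE header as it follows the outer IP header. */
struct gre_base_hdr_sketch {
	uint16_t flags;		/* C, K, S bits and version, network order */
	uint16_t protocol;	/* e.g. ETH_P_TEB for gretap frames */
};

/* Optional words, each present only when its flag bit is set:
 *   uint16_t csum; uint16_t reserved;   with GRE_CSUM
 *   uint32_t key;                       with GRE_KEY
 *   uint32_t seq;                       with GRE_SEQ
 */

static unsigned int gre_hlen_sketch(int has_csum, int has_key, int has_seq)
{
	/* 4-byte base header plus 4 bytes per optional field */
	return 4 + 4 * (!!has_csum + !!has_key + !!has_seq);
}
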
+		 */ +		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); +	} else { +		if (skb_cow_head(skb, dev->needed_headroom)) +			goto free_skb; -	skb_reset_transport_header(skb); -	skb_push(skb, gre_hlen); -	skb_reset_network_header(skb); -	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); -	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | -			      IPSKB_REROUTED); -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); - -	/* -	 *	Push down and install the IPIP header. -	 */ - -	iph 			=	ip_hdr(skb); -	iph->version		=	4; -	iph->ihl		=	sizeof(struct iphdr) >> 2; -	iph->frag_off		=	df; -	iph->protocol		=	IPPROTO_GRE; -	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb); -	iph->daddr		=	rt->rt_dst; -	iph->saddr		=	rt->rt_src; - -	if ((iph->ttl = tiph->ttl) == 0) { -		if (skb->protocol == htons(ETH_P_IP)) -			iph->ttl = old_iph->ttl; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -		else if (skb->protocol == htons(ETH_P_IPV6)) -			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; -#endif -		else -			iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); +		tnl_params = &tunnel->parms.iph;  	} -	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; -	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ? -				   htons(ETH_P_TEB) : skb->protocol; - -	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { -		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); +	__gre_xmit(skb, dev, tnl_params, skb->protocol); -		if (tunnel->parms.o_flags&GRE_SEQ) { -			++tunnel->o_seqno; -			*ptr = htonl(tunnel->o_seqno); -			ptr--; -		} -		if (tunnel->parms.o_flags&GRE_KEY) { -			*ptr = tunnel->parms.o_key; -			ptr--; -		} -		if (tunnel->parms.o_flags&GRE_CSUM) { -			*ptr = 0; -			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); -		} -	} - -	nf_reset(skb); -	tstats = this_cpu_ptr(dev->tstats); -	__IPTUNNEL_XMIT(tstats, &dev->stats);  	return NETDEV_TX_OK; -tx_error_icmp: -	dst_link_failure(skb); - -tx_error: -	dev->stats.tx_errors++; -	dev_kfree_skb(skb); +free_skb: +	kfree_skb(skb); +out: +	dev->stats.tx_dropped++;  	return NETDEV_TX_OK;  } -static int ipgre_tunnel_bind_dev(struct net_device *dev) +static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, +				struct net_device *dev)  { -	struct net_device *tdev = NULL; -	struct ip_tunnel *tunnel; -	struct iphdr *iph; -	int hlen = LL_MAX_HEADER; -	int mtu = ETH_DATA_LEN; -	int addend = sizeof(struct iphdr) + 4; - -	tunnel = netdev_priv(dev); -	iph = &tunnel->parms.iph; - -	/* Guess output device to choose reasonable mtu and needed_headroom */ - -	if (iph->daddr) { -		struct flowi fl = { -			.oif = tunnel->parms.link, -			.fl4_dst = iph->daddr, -			.fl4_src = iph->saddr, -			.fl4_tos = RT_TOS(iph->tos), -			.proto = IPPROTO_GRE, -			.fl_gre_key = tunnel->parms.o_key -		}; -		struct rtable *rt; - -		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { -			tdev = rt->dst.dev; -			ip_rt_put(rt); -		} - -		if (dev->type != ARPHRD_ETHER) -			dev->flags |= IFF_POINTOPOINT; -	} +	struct ip_tunnel *tunnel = netdev_priv(dev); -	if (!tdev && tunnel->parms.link) -		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); +	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); +	if (IS_ERR(skb)) +		goto out; -	if (tdev) { -		hlen = tdev->hard_header_len + tdev->needed_headroom; -		mtu = tdev->mtu; -	} -	dev->iflink = tunnel->parms.link; - -	/* Precalculate GRE options length */ -	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { -		if (tunnel->parms.o_flags&GRE_CSUM) -			addend += 4; -		if 
(tunnel->parms.o_flags&GRE_KEY) -			addend += 4; -		if (tunnel->parms.o_flags&GRE_SEQ) -			addend += 4; -	} -	dev->needed_headroom = addend + hlen; -	mtu -= dev->hard_header_len + addend; +	if (skb_cow_head(skb, dev->needed_headroom)) +		goto free_skb; -	if (mtu < 68) -		mtu = 68; +	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); -	tunnel->hlen = addend; +	return NETDEV_TX_OK; -	return mtu; +free_skb: +	kfree_skb(skb); +out: +	dev->stats.tx_dropped++; +	return NETDEV_TX_OK;  } -static int -ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ioctl(struct net_device *dev, +			      struct ifreq *ifr, int cmd)  {  	int err = 0;  	struct ip_tunnel_parm p; -	struct ip_tunnel *t; -	struct net *net = dev_net(dev); -	struct ipgre_net *ign = net_generic(net, ipgre_net_id); - -	switch (cmd) { -	case SIOCGETTUNNEL: -		t = NULL; -		if (dev == ign->fb_tunnel_dev) { -			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { -				err = -EFAULT; -				break; -			} -			t = ipgre_tunnel_locate(net, &p, 0); -		} -		if (t == NULL) -			t = netdev_priv(dev); -		memcpy(&p, &t->parms, sizeof(p)); -		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) -			err = -EFAULT; -		break; - -	case SIOCADDTUNNEL: -	case SIOCCHGTUNNEL: -		err = -EPERM; -		if (!capable(CAP_NET_ADMIN)) -			goto done; -		err = -EFAULT; -		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) -			goto done; - -		err = -EINVAL; +	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) +		return -EFAULT; +	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {  		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||  		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||  		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) -			goto done; -		if (p.iph.ttl) -			p.iph.frag_off |= htons(IP_DF); - -		if (!(p.i_flags&GRE_KEY)) -			p.i_key = 0; -		if (!(p.o_flags&GRE_KEY)) -			p.o_key = 0; - -		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - -		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { -			if (t != NULL) { -				if (t->dev != dev) { -					err = -EEXIST; -					break; -				} -			} else { -				unsigned int nflags = 0; - -				t = netdev_priv(dev); - -				if (ipv4_is_multicast(p.iph.daddr)) -					nflags = IFF_BROADCAST; -				else if (p.iph.daddr) -					nflags = IFF_POINTOPOINT; - -				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { -					err = -EINVAL; -					break; -				} -				ipgre_tunnel_unlink(ign, t); -				synchronize_net(); -				t->parms.iph.saddr = p.iph.saddr; -				t->parms.iph.daddr = p.iph.daddr; -				t->parms.i_key = p.i_key; -				t->parms.o_key = p.o_key; -				memcpy(dev->dev_addr, &p.iph.saddr, 4); -				memcpy(dev->broadcast, &p.iph.daddr, 4); -				ipgre_tunnel_link(ign, t); -				netdev_state_change(dev); -			} -		} - -		if (t) { -			err = 0; -			if (cmd == SIOCCHGTUNNEL) { -				t->parms.iph.ttl = p.iph.ttl; -				t->parms.iph.tos = p.iph.tos; -				t->parms.iph.frag_off = p.iph.frag_off; -				if (t->parms.link != p.link) { -					t->parms.link = p.link; -					dev->mtu = ipgre_tunnel_bind_dev(dev); -					netdev_state_change(dev); -				} -			} -			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) -				err = -EFAULT; -		} else -			err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); -		break; - -	case SIOCDELTUNNEL: -		err = -EPERM; -		if (!capable(CAP_NET_ADMIN)) -			goto done; - -		if (dev == ign->fb_tunnel_dev) { -			err = -EFAULT; -			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) -				goto done; -			err = -ENOENT; -			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL) -				goto done; -			err = -EPERM; -			if (t == netdev_priv(ign->fb_tunnel_dev)) -				goto done; -			dev = t->dev; -		} -		unregister_netdevice(dev); -		err = 0; -		break; - -	default: -		err = -EINVAL; +			return -EINVAL;  	} +	p.i_flags = gre_flags_to_tnl_flags(p.i_flags); +	p.o_flags = gre_flags_to_tnl_flags(p.o_flags); -done: -	return err; -} +	err = ip_tunnel_ioctl(dev, &p, cmd); +	if (err) +		return err; -static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ -	struct ip_tunnel *tunnel = netdev_priv(dev); -	if (new_mtu < 68 || -	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen) -		return -EINVAL; -	dev->mtu = new_mtu; +	p.i_flags = tnl_flags_to_gre_flags(p.i_flags); +	p.o_flags = tnl_flags_to_gre_flags(p.o_flags); + +	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) +		return -EFAULT;  	return 0;  } @@ -1159,38 +362,36 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)     ...     ftp fec0:6666:6666::193.233.7.65     ... -   */ -  static int ipgre_header(struct sk_buff *skb, struct net_device *dev,  			unsigned short type,  			const void *daddr, const void *saddr, unsigned int len)  {  	struct ip_tunnel *t = netdev_priv(dev); -	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); -	__be16 *p = (__be16*)(iph+1); +	struct iphdr *iph; +	struct gre_base_hdr *greh; -	memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); -	p[0]		= t->parms.o_flags; -	p[1]		= htons(type); +	iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); +	greh = (struct gre_base_hdr *)(iph+1); +	greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags); +	greh->protocol = htons(type); -	/* -	 *	Set the source hardware address. -	 */ +	memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); +	/* Set the source hardware address. 
*/  	if (saddr)  		memcpy(&iph->saddr, saddr, 4);  	if (daddr)  		memcpy(&iph->daddr, daddr, 4);  	if (iph->daddr) -		return t->hlen; +		return t->hlen + sizeof(*iph); -	return -t->hlen; +	return -(t->hlen + sizeof(*iph));  }  static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)  { -	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); +	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);  	memcpy(haddr, &iph->saddr, 4);  	return 4;  } @@ -1206,17 +407,16 @@ static int ipgre_open(struct net_device *dev)  	struct ip_tunnel *t = netdev_priv(dev);  	if (ipv4_is_multicast(t->parms.iph.daddr)) { -		struct flowi fl = { -			.oif = t->parms.link, -			.fl4_dst = t->parms.iph.daddr, -			.fl4_src = t->parms.iph.saddr, -			.fl4_tos = RT_TOS(t->parms.iph.tos), -			.proto = IPPROTO_GRE, -			.fl_gre_key = t->parms.o_key -		}; +		struct flowi4 fl4;  		struct rtable *rt; -		if (ip_route_output_key(dev_net(dev), &rt, &fl)) +		rt = ip_route_output_gre(t->net, &fl4, +					 t->parms.iph.daddr, +					 t->parms.iph.saddr, +					 t->parms.o_key, +					 RT_TOS(t->parms.iph.tos), +					 t->parms.link); +		if (IS_ERR(rt))  			return -EADDRNOTAVAIL;  		dev = rt->dst.dev;  		ip_rt_put(rt); @@ -1234,62 +434,77 @@ static int ipgre_close(struct net_device *dev)  	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {  		struct in_device *in_dev; -		in_dev = inetdev_by_index(dev_net(dev), t->mlink); +		in_dev = inetdev_by_index(t->net, t->mlink);  		if (in_dev)  			ip_mc_dec_group(in_dev, t->parms.iph.daddr);  	}  	return 0;  } -  #endif  static const struct net_device_ops ipgre_netdev_ops = {  	.ndo_init		= ipgre_tunnel_init, -	.ndo_uninit		= ipgre_tunnel_uninit, +	.ndo_uninit		= ip_tunnel_uninit,  #ifdef CONFIG_NET_IPGRE_BROADCAST  	.ndo_open		= ipgre_open,  	.ndo_stop		= ipgre_close,  #endif -	.ndo_start_xmit		= ipgre_tunnel_xmit, +	.ndo_start_xmit		= ipgre_xmit,  	.ndo_do_ioctl		= ipgre_tunnel_ioctl, -	.ndo_change_mtu		= ipgre_tunnel_change_mtu, -	.ndo_get_stats		= ipgre_get_stats, +	.ndo_change_mtu		= ip_tunnel_change_mtu, +	.ndo_get_stats64	= ip_tunnel_get_stats64,  }; -static void ipgre_dev_free(struct net_device *dev) -{ -	free_percpu(dev->tstats); -	free_netdev(dev); -} +#define GRE_FEATURES (NETIF_F_SG |		\ +		      NETIF_F_FRAGLIST |	\ +		      NETIF_F_HIGHDMA |		\ +		      NETIF_F_HW_CSUM)  static void ipgre_tunnel_setup(struct net_device *dev)  {  	dev->netdev_ops		= &ipgre_netdev_ops; -	dev->destructor 	= ipgre_dev_free; -  	dev->type		= ARPHRD_IPGRE; -	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4; -	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4; -	dev->flags		= IFF_NOARP; -	dev->iflink		= 0; -	dev->addr_len		= 4; -	dev->features		|= NETIF_F_NETNS_LOCAL; -	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE; +	ip_tunnel_setup(dev, ipgre_net_id);  } -static int ipgre_tunnel_init(struct net_device *dev) +static void __gre_tunnel_init(struct net_device *dev)  {  	struct ip_tunnel *tunnel; -	struct iphdr *iph;  	tunnel = netdev_priv(dev); -	iph = &tunnel->parms.iph; +	tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); +	tunnel->parms.iph.protocol = IPPROTO_GRE; + +	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4; +	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4; + +	dev->features		|= GRE_FEATURES; +	dev->hw_features	|= GRE_FEATURES; + +	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { +		/* TCP offload with GRE SEQ is not supported. 
*/ +		dev->features    |= NETIF_F_GSO_SOFTWARE; +		dev->hw_features |= NETIF_F_GSO_SOFTWARE; +		/* Can use a lockless transmit, unless we generate +		 * output sequences +		 */ +		dev->features |= NETIF_F_LLTX; +	} +} + +static int ipgre_tunnel_init(struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct iphdr *iph = &tunnel->parms.iph; -	tunnel->dev = dev; -	strcpy(tunnel->parms.name, dev->name); +	__gre_tunnel_init(dev); -	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); -	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); +	memcpy(dev->dev_addr, &iph->saddr, 4); +	memcpy(dev->broadcast, &iph->daddr, 4); + +	dev->flags		= IFF_NOARP; +	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE; +	dev->addr_len		= 4;  	if (iph->daddr) {  #ifdef CONFIG_NET_IPGRE_BROADCAST @@ -1303,100 +518,31 @@ static int ipgre_tunnel_init(struct net_device *dev)  	} else  		dev->header_ops = &ipgre_header_ops; -	dev->tstats = alloc_percpu(struct pcpu_tstats); -	if (!dev->tstats) -		return -ENOMEM; - -	return 0; +	return ip_tunnel_init(dev);  } -static void ipgre_fb_tunnel_init(struct net_device *dev) -{ -	struct ip_tunnel *tunnel = netdev_priv(dev); -	struct iphdr *iph = &tunnel->parms.iph; - -	tunnel->dev = dev; -	strcpy(tunnel->parms.name, dev->name); - -	iph->version		= 4; -	iph->protocol		= IPPROTO_GRE; -	iph->ihl		= 5; -	tunnel->hlen		= sizeof(struct iphdr) + 4; - -	dev_hold(dev); -} - - -static const struct gre_protocol ipgre_protocol = { -	.handler     = ipgre_rcv, -	.err_handler = ipgre_err, +static struct gre_cisco_protocol ipgre_protocol = { +	.handler        = ipgre_rcv, +	.err_handler    = ipgre_err, +	.priority       = 0,  }; -static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) -{ -	int prio; - -	for (prio = 0; prio < 4; prio++) { -		int h; -		for (h = 0; h < HASH_SIZE; h++) { -			struct ip_tunnel *t; - -			t = rtnl_dereference(ign->tunnels[prio][h]); - -			while (t != NULL) { -				unregister_netdevice_queue(t->dev, head); -				t = rtnl_dereference(t->next); -			} -		} -	} -} -  static int __net_init ipgre_init_net(struct net *net)  { -	struct ipgre_net *ign = net_generic(net, ipgre_net_id); -	int err; - -	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", -					   ipgre_tunnel_setup); -	if (!ign->fb_tunnel_dev) { -		err = -ENOMEM; -		goto err_alloc_dev; -	} -	dev_net_set(ign->fb_tunnel_dev, net); - -	ipgre_fb_tunnel_init(ign->fb_tunnel_dev); -	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops; - -	if ((err = register_netdev(ign->fb_tunnel_dev))) -		goto err_reg_dev; - -	rcu_assign_pointer(ign->tunnels_wc[0], -			   netdev_priv(ign->fb_tunnel_dev)); -	return 0; - -err_reg_dev: -	ipgre_dev_free(ign->fb_tunnel_dev); -err_alloc_dev: -	return err; +	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);  }  static void __net_exit ipgre_exit_net(struct net *net)  { -	struct ipgre_net *ign; -	LIST_HEAD(list); - -	ign = net_generic(net, ipgre_net_id); -	rtnl_lock(); -	ipgre_destroy_tunnels(ign, &list); -	unregister_netdevice_many(&list); -	rtnl_unlock(); +	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); +	ip_tunnel_delete_net(itn, &ipgre_link_ops);  }  static struct pernet_operations ipgre_net_ops = {  	.init = ipgre_init_net,  	.exit = ipgre_exit_net,  	.id   = &ipgre_net_id, -	.size = sizeof(struct ipgre_net), +	.size = sizeof(struct ip_tunnel_net),  };  static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1441,8 +587,8 @@ out:  	return ipgre_tunnel_validate(tb, data);  } -static 
void ipgre_netlink_parms(struct nlattr *data[], -				struct ip_tunnel_parm *parms) +static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], +			       struct ip_tunnel_parm *parms)  {  	memset(parms, 0, sizeof(*parms)); @@ -1455,10 +601,10 @@ static void ipgre_netlink_parms(struct nlattr *data[],  		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);  	if (data[IFLA_GRE_IFLAGS]) -		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); +		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));  	if (data[IFLA_GRE_OFLAGS]) -		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); +		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));  	if (data[IFLA_GRE_IKEY])  		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1482,145 +628,47 @@ static void ipgre_netlink_parms(struct nlattr *data[],  		parms->iph.frag_off = htons(IP_DF);  } -static int ipgre_tap_init(struct net_device *dev) +static int gre_tap_init(struct net_device *dev)  { -	struct ip_tunnel *tunnel; - -	tunnel = netdev_priv(dev); - -	tunnel->dev = dev; -	strcpy(tunnel->parms.name, dev->name); - -	ipgre_tunnel_bind_dev(dev); +	__gre_tunnel_init(dev); -	dev->tstats = alloc_percpu(struct pcpu_tstats); -	if (!dev->tstats) -		return -ENOMEM; - -	return 0; +	return ip_tunnel_init(dev);  } -static const struct net_device_ops ipgre_tap_netdev_ops = { -	.ndo_init		= ipgre_tap_init, -	.ndo_uninit		= ipgre_tunnel_uninit, -	.ndo_start_xmit		= ipgre_tunnel_xmit, +static const struct net_device_ops gre_tap_netdev_ops = { +	.ndo_init		= gre_tap_init, +	.ndo_uninit		= ip_tunnel_uninit, +	.ndo_start_xmit		= gre_tap_xmit,  	.ndo_set_mac_address 	= eth_mac_addr,  	.ndo_validate_addr	= eth_validate_addr, -	.ndo_change_mtu		= ipgre_tunnel_change_mtu, -	.ndo_get_stats		= ipgre_get_stats, +	.ndo_change_mtu		= ip_tunnel_change_mtu, +	.ndo_get_stats64	= ip_tunnel_get_stats64,  };  static void ipgre_tap_setup(struct net_device *dev)  { -  	ether_setup(dev); - -	dev->netdev_ops		= &ipgre_tap_netdev_ops; -	dev->destructor 	= ipgre_dev_free; - -	dev->iflink		= 0; -	dev->features		|= NETIF_F_NETNS_LOCAL; +	dev->netdev_ops		= &gre_tap_netdev_ops; +	dev->priv_flags 	|= IFF_LIVE_ADDR_CHANGE; +	ip_tunnel_setup(dev, gre_tap_net_id);  } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], -			 struct nlattr *data[]) +static int ipgre_newlink(struct net *src_net, struct net_device *dev, +			 struct nlattr *tb[], struct nlattr *data[])  { -	struct ip_tunnel *nt; -	struct net *net = dev_net(dev); -	struct ipgre_net *ign = net_generic(net, ipgre_net_id); -	int mtu; -	int err; - -	nt = netdev_priv(dev); -	ipgre_netlink_parms(data, &nt->parms); - -	if (ipgre_tunnel_find(net, &nt->parms, dev->type)) -		return -EEXIST; - -	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) -		random_ether_addr(dev->dev_addr); - -	mtu = ipgre_tunnel_bind_dev(dev); -	if (!tb[IFLA_MTU]) -		dev->mtu = mtu; - -	/* Can use a lockless transmit, unless we generate output sequences */ -	if (!(nt->parms.o_flags & GRE_SEQ)) -		dev->features |= NETIF_F_LLTX; - -	err = register_netdevice(dev); -	if (err) -		goto out; - -	dev_hold(dev); -	ipgre_tunnel_link(ign, nt); +	struct ip_tunnel_parm p; -out: -	return err; +	ipgre_netlink_parms(data, tb, &p); +	return ip_tunnel_newlink(dev, tb, &p);  }  static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],  			    struct nlattr *data[])  { -	struct ip_tunnel *t, *nt; -	struct net *net = dev_net(dev); -	struct ipgre_net *ign = net_generic(net, 
ipgre_net_id);  	struct ip_tunnel_parm p; -	int mtu; - -	if (dev == ign->fb_tunnel_dev) -		return -EINVAL; - -	nt = netdev_priv(dev); -	ipgre_netlink_parms(data, &p); - -	t = ipgre_tunnel_locate(net, &p, 0); - -	if (t) { -		if (t->dev != dev) -			return -EEXIST; -	} else { -		t = nt; - -		if (dev->type != ARPHRD_ETHER) { -			unsigned int nflags = 0; - -			if (ipv4_is_multicast(p.iph.daddr)) -				nflags = IFF_BROADCAST; -			else if (p.iph.daddr) -				nflags = IFF_POINTOPOINT; - -			if ((dev->flags ^ nflags) & -			    (IFF_POINTOPOINT | IFF_BROADCAST)) -				return -EINVAL; -		} -		ipgre_tunnel_unlink(ign, t); -		t->parms.iph.saddr = p.iph.saddr; -		t->parms.iph.daddr = p.iph.daddr; -		t->parms.i_key = p.i_key; -		if (dev->type != ARPHRD_ETHER) { -			memcpy(dev->dev_addr, &p.iph.saddr, 4); -			memcpy(dev->broadcast, &p.iph.daddr, 4); -		} -		ipgre_tunnel_link(ign, t); -		netdev_state_change(dev); -	} - -	t->parms.o_key = p.o_key; -	t->parms.iph.ttl = p.iph.ttl; -	t->parms.iph.tos = p.iph.tos; -	t->parms.iph.frag_off = p.iph.frag_off; - -	if (t->parms.link != p.link) { -		t->parms.link = p.link; -		mtu = ipgre_tunnel_bind_dev(dev); -		if (!tb[IFLA_MTU]) -			dev->mtu = mtu; -		netdev_state_change(dev); -	} - -	return 0; +	ipgre_netlink_parms(data, tb, &p); +	return ip_tunnel_changelink(dev, tb, &p);  }  static size_t ipgre_get_size(const struct net_device *dev) @@ -1654,17 +702,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)  	struct ip_tunnel *t = netdev_priv(dev);  	struct ip_tunnel_parm *p = &t->parms; -	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link); -	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags); -	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags); -	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key); -	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key); -	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr); -	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr); -	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl); -	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos); -	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF))); - +	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || +	    nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || +	    nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || +	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || +	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || +	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || +	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || +	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || +	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || +	    nla_put_u8(skb, IFLA_GRE_PMTUDISC, +		       !!(p->iph.frag_off & htons(IP_DF)))) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -1693,6 +742,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {  	.validate	= ipgre_tunnel_validate,  	.newlink	= ipgre_newlink,  	.changelink	= ipgre_changelink, +	.dellink	= ip_tunnel_dellink,  	.get_size	= ipgre_get_size,  	.fill_info	= ipgre_fill_info,  }; @@ -1706,27 +756,46 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {  	.validate	= ipgre_tap_validate,  	.newlink	= ipgre_newlink,  	.changelink	= ipgre_changelink, +	.dellink	= ip_tunnel_dellink,  	.get_size	= ipgre_get_size,  	.fill_info	= ipgre_fill_info,  }; -/* - *	And now the modules code and kernel interface. 
- */ +static int __net_init ipgre_tap_init_net(struct net *net) +{ +	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); +} + +static void __net_exit ipgre_tap_exit_net(struct net *net) +{ +	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); +	ip_tunnel_delete_net(itn, &ipgre_tap_ops); +} + +static struct pernet_operations ipgre_tap_net_ops = { +	.init = ipgre_tap_init_net, +	.exit = ipgre_tap_exit_net, +	.id   = &gre_tap_net_id, +	.size = sizeof(struct ip_tunnel_net), +};  static int __init ipgre_init(void)  {  	int err; -	printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); +	pr_info("GRE over IPv4 tunneling driver\n");  	err = register_pernet_device(&ipgre_net_ops);  	if (err < 0)  		return err; -	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); +	err = register_pernet_device(&ipgre_tap_net_ops); +	if (err < 0) +		goto pnet_tap_faied; + +	err = gre_cisco_register(&ipgre_protocol);  	if (err < 0) { -		printk(KERN_INFO "ipgre init: can't add protocol\n"); +		pr_info("%s: can't add protocol\n", __func__);  		goto add_proto_failed;  	} @@ -1738,24 +807,25 @@ static int __init ipgre_init(void)  	if (err < 0)  		goto tap_ops_failed; -out: -	return err; +	return 0;  tap_ops_failed:  	rtnl_link_unregister(&ipgre_link_ops);  rtnl_link_failed: -	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); +	gre_cisco_unregister(&ipgre_protocol);  add_proto_failed: +	unregister_pernet_device(&ipgre_tap_net_ops); +pnet_tap_faied:  	unregister_pernet_device(&ipgre_net_ops); -	goto out; +	return err;  }  static void __exit ipgre_fini(void)  {  	rtnl_link_unregister(&ipgre_tap_ops);  	rtnl_link_unregister(&ipgre_link_ops); -	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) -		printk(KERN_INFO "ipgre close: can't remove protocol\n"); +	gre_cisco_unregister(&ipgre_protocol); +	unregister_pernet_device(&ipgre_tap_net_ops);  	unregister_pernet_device(&ipgre_net_ops);  } @@ -1764,4 +834,5 @@ module_exit(ipgre_fini);  MODULE_LICENSE("GPL");  MODULE_ALIAS_RTNL_LINK("gre");  MODULE_ALIAS_RTNL_LINK("gretap"); -MODULE_ALIAS("gre0"); +MODULE_ALIAS_NETDEV("gre0"); +MODULE_ALIAS_NETDEV("gretap0"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc26cb..3d4da2c16b6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -113,7 +113,8 @@   *		2 of the License, or (at your option) any later version.   
*/ -#include <asm/system.h> +#define pr_fmt(fmt) "IPv4: " fmt +  #include <linux/module.h>  #include <linux/types.h>  #include <linux/kernel.h> @@ -140,6 +141,7 @@  #include <net/icmp.h>  #include <net/raw.h>  #include <net/checksum.h> +#include <net/inet_ecn.h>  #include <linux/netfilter_ipv4.h>  #include <net/xfrm.h>  #include <linux/mroute.h> @@ -148,7 +150,7 @@  /*   *	Process Router Attention IP option (RFC 2113)   */ -int ip_call_ra_chain(struct sk_buff *skb) +bool ip_call_ra_chain(struct sk_buff *skb)  {  	struct ip_ra_chain *ra;  	u8 protocol = ip_hdr(skb)->protocol; @@ -165,9 +167,9 @@ int ip_call_ra_chain(struct sk_buff *skb)  		    (!sk->sk_bound_dev_if ||  		     sk->sk_bound_dev_if == dev->ifindex) &&  		    net_eq(sock_net(sk), dev_net(dev))) { -			if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { +			if (ip_is_fragment(ip_hdr(skb))) {  				if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) -					return 1; +					return true;  			}  			if (last) {  				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -180,42 +182,30 @@ int ip_call_ra_chain(struct sk_buff *skb)  	if (last) {  		raw_rcv(last, skb); -		return 1; +		return true;  	} -	return 0; +	return false;  }  static int ip_local_deliver_finish(struct sk_buff *skb)  {  	struct net *net = dev_net(skb->dev); -	__skb_pull(skb, ip_hdrlen(skb)); - -	/* Point into the IP datagram, just past the header. */ -	skb_reset_transport_header(skb); +	__skb_pull(skb, skb_network_header_len(skb));  	rcu_read_lock();  	{  		int protocol = ip_hdr(skb)->protocol; -		int hash, raw;  		const struct net_protocol *ipprot; +		int raw;  	resubmit:  		raw = raw_local_deliver(skb, protocol); -		hash = protocol & (MAX_INET_PROTOS - 1); -		ipprot = rcu_dereference(inet_protos[hash]); +		ipprot = rcu_dereference(inet_protos[protocol]);  		if (ipprot != NULL) {  			int ret; -			if (!net_eq(net, &init_net) && !ipprot->netns_ok) { -				if (net_ratelimit()) -					printk("%s: proto %d isn't netns-ready\n", -						__func__, protocol); -				kfree_skb(skb); -				goto out; -			} -  			if (!ipprot->no_policy) {  				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {  					kfree_skb(skb); @@ -236,9 +226,11 @@ static int ip_local_deliver_finish(struct sk_buff *skb)  					icmp_send(skb, ICMP_DEST_UNREACH,  						  ICMP_PROT_UNREACH, 0);  				} -			} else +				kfree_skb(skb); +			} else {  				IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); -			kfree_skb(skb); +				consume_skb(skb); +			}  		}  	}   out: @@ -256,7 +248,7 @@ int ip_local_deliver(struct sk_buff *skb)  	 *	Reassemble IP fragments.  	 
*/ -	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { +	if (ip_is_fragment(ip_hdr(skb))) {  		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))  			return 0;  	} @@ -265,10 +257,10 @@ int ip_local_deliver(struct sk_buff *skb)  		       ip_local_deliver_finish);  } -static inline int ip_rcv_options(struct sk_buff *skb) +static inline bool ip_rcv_options(struct sk_buff *skb)  {  	struct ip_options *opt; -	struct iphdr *iph; +	const struct iphdr *iph;  	struct net_device *dev = skb->dev;  	/* It looks as overkill, because not all @@ -297,10 +289,10 @@ static inline int ip_rcv_options(struct sk_buff *skb)  		if (in_dev) {  			if (!IN_DEV_SOURCE_ROUTE(in_dev)) { -				if (IN_DEV_LOG_MARTIANS(in_dev) && -				    net_ratelimit()) -					printk(KERN_INFO "source route option %pI4 -> %pI4\n", -					       &iph->saddr, &iph->daddr); +				if (IN_DEV_LOG_MARTIANS(in_dev)) +					net_info_ratelimited("source route option %pI4 -> %pI4\n", +							     &iph->saddr, +							     &iph->daddr);  				goto drop;  			}  		} @@ -309,38 +301,47 @@ static inline int ip_rcv_options(struct sk_buff *skb)  			goto drop;  	} -	return 0; +	return false;  drop: -	return -1; +	return true;  } +int sysctl_ip_early_demux __read_mostly = 1; +EXPORT_SYMBOL(sysctl_ip_early_demux); +  static int ip_rcv_finish(struct sk_buff *skb)  {  	const struct iphdr *iph = ip_hdr(skb);  	struct rtable *rt; +	if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { +		const struct net_protocol *ipprot; +		int protocol = iph->protocol; + +		ipprot = rcu_dereference(inet_protos[protocol]); +		if (ipprot && ipprot->early_demux) { +			ipprot->early_demux(skb); +			/* must reload iph, skb->head might have changed */ +			iph = ip_hdr(skb); +		} +	} +  	/*  	 *	Initialise the virtual path cache for the packet. It describes  	 *	how the packet travels inside Linux networking.  	 */ -	if (skb_dst(skb) == NULL) { +	if (!skb_dst(skb)) {  		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,  					       iph->tos, skb->dev);  		if (unlikely(err)) { -			if (err == -EHOSTUNREACH) -				IP_INC_STATS_BH(dev_net(skb->dev), -						IPSTATS_MIB_INADDRERRORS); -			else if (err == -ENETUNREACH) -				IP_INC_STATS_BH(dev_net(skb->dev), -						IPSTATS_MIB_INNOROUTES); -			else if (err == -EXDEV) +			if (err == -EXDEV)  				NET_INC_STATS_BH(dev_net(skb->dev),  						 LINUX_MIB_IPRPFILTER);  			goto drop;  		}  	} -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	if (unlikely(skb_dst(skb)->tclassid)) {  		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);  		u32 idx = skb_dst(skb)->tclassid; @@ -374,7 +375,7 @@ drop:   */  int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)  { -	struct iphdr *iph; +	const struct iphdr *iph;  	u32 len;  	/* When the interface is in promisc. 
mode, drop all the crap @@ -410,13 +411,20 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,  	if (iph->ihl < 5 || iph->version != 4)  		goto inhdr_error; +	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); +	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); +	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); +	IP_ADD_STATS_BH(dev_net(dev), +			IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), +			max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); +  	if (!pskb_may_pull(skb, iph->ihl*4))  		goto inhdr_error;  	iph = ip_hdr(skb);  	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) -		goto inhdr_error; +		goto csum_error;  	len = ntohs(iph->tot_len);  	if (skb->len < len) { @@ -434,6 +442,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,  		goto drop;  	} +	skb->transport_header = skb->network_header + iph->ihl*4; +  	/* Remove any debris in the socket control block */  	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); @@ -443,6 +453,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,  	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,  		       ip_rcv_finish); +csum_error: +	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);  inhdr_error:  	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);  drop: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1906fa35860..ad382499bac 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -9,11 +9,14 @@   *   */ +#define pr_fmt(fmt) "IPv4: " fmt +  #include <linux/capability.h>  #include <linux/module.h>  #include <linux/slab.h>  #include <linux/types.h>  #include <asm/uaccess.h> +#include <asm/unaligned.h>  #include <linux/skbuff.h>  #include <linux/ip.h>  #include <linux/icmp.h> @@ -24,6 +27,7 @@  #include <net/icmp.h>  #include <net/route.h>  #include <net/cipso_ipv4.h> +#include <net/ip_fib.h>  /*   * Write options to IP header, record destination address to @@ -36,8 +40,8 @@   * saddr is address of outgoing interface.   */ -void ip_options_build(struct sk_buff * skb, struct ip_options * opt, -			    __be32 daddr, struct rtable *rt, int is_frag) +void ip_options_build(struct sk_buff *skb, struct ip_options *opt, +		      __be32 daddr, struct rtable *rt, int is_frag)  {  	unsigned char *iph = skb_network_header(skb); @@ -50,9 +54,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,  	if (!is_frag) {  		if (opt->rr_needaddr) -			ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); +			ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);  		if (opt->ts_needaddr) -			ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); +			ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);  		if (opt->ts_needtime) {  			struct timespec tv;  			__be32 midtime; @@ -83,28 +87,23 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,   * NOTE: dopt cannot point to skb.   
*/ -int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) +int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)  { -	struct ip_options *sopt; +	const struct ip_options *sopt;  	unsigned char *sptr, *dptr;  	int soffset, doffset;  	int	optlen; -	__be32	daddr;  	memset(dopt, 0, sizeof(struct ip_options));  	sopt = &(IPCB(skb)->opt); -	if (sopt->optlen == 0) { -		dopt->optlen = 0; +	if (sopt->optlen == 0)  		return 0; -	}  	sptr = skb_network_header(skb);  	dptr = dopt->__data; -	daddr = skb_rtable(skb)->rt_spec_dst; -  	if (sopt->rr) {  		optlen  = sptr[sopt->rr+1];  		soffset = sptr[sopt->rr+2]; @@ -140,11 +139,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)  				} else {  					dopt->ts_needtime = 0; -					if (soffset + 8 <= optlen) { +					if (soffset + 7 <= optlen) {  						__be32 addr; -						memcpy(&addr, sptr+soffset-1, 4); -						if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { +						memcpy(&addr, dptr+soffset-1, 4); +						if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {  							dopt->ts_needtime = 1;  							soffset += 8;  						} @@ -157,7 +156,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)  		dopt->optlen += optlen;  	}  	if (sopt->srr) { -		unsigned char * start = sptr+sopt->srr; +		unsigned char *start = sptr+sopt->srr;  		__be32 faddr;  		optlen  = start[1]; @@ -168,7 +167,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)  		soffset -= 4;  		if (soffset > 3) {  			memcpy(&faddr, &start[soffset-1], 4); -			for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) +			for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)  				memcpy(&dptr[doffset-1], &start[soffset-1], 4);  			/*  			 * RFC1812 requires to fix illegal source routes. @@ -178,6 +177,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)  				doffset -= 4;  		}  		if (doffset > 3) { +			__be32 daddr = fib_compute_spec_dst(skb); +  			memcpy(&start[doffset-1], &daddr, 4);  			dopt->faddr = faddr;  			dptr[0] = start[0]; @@ -209,10 +210,10 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)   *	Simple and stupid 8), but the most efficient way.   */ -void ip_options_fragment(struct sk_buff * skb) +void ip_options_fragment(struct sk_buff *skb)  {  	unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); -	struct ip_options * opt = &(IPCB(skb)->opt); +	struct ip_options *opt = &(IPCB(skb)->opt);  	int  l = opt->optlen;  	int  optlen; @@ -226,7 +227,7 @@ void ip_options_fragment(struct sk_buff * skb)  			continue;  		}  		optlen = optptr[1]; -		if (optlen<2 || optlen>l) +		if (optlen < 2 || optlen > l)  		  return;  		if (!IPOPT_COPIED(*optptr))  			memset(optptr, IPOPT_NOOP, optlen); @@ -240,6 +241,15 @@ void ip_options_fragment(struct sk_buff * skb)  	opt->ts_needtime = 0;  } +/* helper used by ip_options_compile() to call fib_compute_spec_dst() + * at most one time. + */ +static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) +{ +	if (*spec_dst == htonl(INADDR_ANY)) +		*spec_dst = fib_compute_spec_dst(skb); +} +  /*   * Verify options and fill pointers in struct options.   * Caller should clear *opt, and set opt->data. 
@@ -247,14 +257,14 @@ void ip_options_fragment(struct sk_buff * skb)   */  int ip_options_compile(struct net *net, -		       struct ip_options * opt, struct sk_buff * skb) +		       struct ip_options *opt, struct sk_buff *skb)  { -	int l; -	unsigned char * iph; -	unsigned char * optptr; -	int optlen; -	unsigned char * pp_ptr = NULL; +	__be32 spec_dst = htonl(INADDR_ANY); +	unsigned char *pp_ptr = NULL;  	struct rtable *rt = NULL; +	unsigned char *optptr; +	unsigned char *iph; +	int optlen, l;  	if (skb != NULL) {  		rt = skb_rtable(skb); @@ -265,27 +275,31 @@ int ip_options_compile(struct net *net,  	for (l = opt->optlen; l > 0; ) {  		switch (*optptr) { -		      case IPOPT_END: -			for (optptr++, l--; l>0; optptr++, l--) { +		case IPOPT_END: +			for (optptr++, l--; l > 0; optptr++, l--) {  				if (*optptr != IPOPT_END) {  					*optptr = IPOPT_END;  					opt->is_changed = 1;  				}  			}  			goto eol; -		      case IPOPT_NOOP: +		case IPOPT_NOOP:  			l--;  			optptr++;  			continue;  		} +		if (unlikely(l < 2)) { +			pp_ptr = optptr; +			goto error; +		}  		optlen = optptr[1]; -		if (optlen<2 || optlen>l) { +		if (optlen < 2 || optlen > l) {  			pp_ptr = optptr;  			goto error;  		}  		switch (*optptr) { -		      case IPOPT_SSRR: -		      case IPOPT_LSRR: +		case IPOPT_SSRR: +		case IPOPT_LSRR:  			if (optlen < 3) {  				pp_ptr = optptr + 1;  				goto error; @@ -311,7 +325,7 @@ int ip_options_compile(struct net *net,  			opt->is_strictroute = (optptr[0] == IPOPT_SSRR);  			opt->srr = optptr - iph;  			break; -		      case IPOPT_RR: +		case IPOPT_RR:  			if (opt->rr) {  				pp_ptr = optptr;  				goto error; @@ -329,8 +343,9 @@ int ip_options_compile(struct net *net,  					pp_ptr = optptr + 2;  					goto error;  				} -				if (skb) { -					memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); +				if (rt) { +					spec_dst_fill(&spec_dst, skb); +					memcpy(&optptr[optptr[2]-1], &spec_dst, 4);  					opt->is_changed = 1;  				}  				optptr[2] += 4; @@ -338,7 +353,7 @@ int ip_options_compile(struct net *net,  			}  			opt->rr = optptr - iph;  			break; -		      case IPOPT_TIMESTAMP: +		case IPOPT_TIMESTAMP:  			if (opt->ts) {  				pp_ptr = optptr;  				goto error; @@ -352,52 +367,50 @@ int ip_options_compile(struct net *net,  				goto error;  			}  			if (optptr[2] <= optlen) { -				__be32 *timeptr = NULL; -				if (optptr[2]+3 > optptr[1]) { +				unsigned char *timeptr = NULL; +				if (optptr[2]+3 > optlen) {  					pp_ptr = optptr + 2;  					goto error;  				}  				switch (optptr[3]&0xF) { -				      case IPOPT_TS_TSONLY: -					opt->ts = optptr - iph; +				case IPOPT_TS_TSONLY:  					if (skb) -						timeptr = (__be32*)&optptr[optptr[2]-1]; +						timeptr = &optptr[optptr[2]-1];  					opt->ts_needtime = 1;  					optptr[2] += 4;  					break; -				      case IPOPT_TS_TSANDADDR: -					if (optptr[2]+7 > optptr[1]) { +				case IPOPT_TS_TSANDADDR: +					if (optptr[2]+7 > optlen) {  						pp_ptr = optptr + 2;  						goto error;  					} -					opt->ts = optptr - iph; -					if (skb) { -						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); -						timeptr = (__be32*)&optptr[optptr[2]+3]; +					if (rt)  { +						spec_dst_fill(&spec_dst, skb); +						memcpy(&optptr[optptr[2]-1], &spec_dst, 4); +						timeptr = &optptr[optptr[2]+3];  					}  					opt->ts_needaddr = 1;  					opt->ts_needtime = 1;  					optptr[2] += 8;  					break; -				      case IPOPT_TS_PRESPEC: -					if (optptr[2]+7 > optptr[1]) { +				case IPOPT_TS_PRESPEC: +					if (optptr[2]+7 > optlen) {  						pp_ptr = optptr + 2;  						goto error;  			
		} -					opt->ts = optptr - iph;  					{  						__be32 addr;  						memcpy(&addr, &optptr[optptr[2]-1], 4);  						if (inet_addr_type(net, addr) == RTN_UNICAST)  							break;  						if (skb) -							timeptr = (__be32*)&optptr[optptr[2]+3]; +							timeptr = &optptr[optptr[2]+3];  					}  					opt->ts_needtime = 1;  					optptr[2] += 8;  					break; -				      default: -					if (!skb && !capable(CAP_NET_RAW)) { +				default: +					if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {  						pp_ptr = optptr + 3;  						goto error;  					} @@ -405,26 +418,26 @@ int ip_options_compile(struct net *net,  				}  				if (timeptr) {  					struct timespec tv; -					__be32  midtime; +					u32  midtime;  					getnstimeofday(&tv); -					midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); -					memcpy(timeptr, &midtime, sizeof(__be32)); +					midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; +					put_unaligned_be32(midtime, timeptr);  					opt->is_changed = 1;  				} -			} else { -				unsigned overflow = optptr[3]>>4; +			} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { +				unsigned int overflow = optptr[3]>>4;  				if (overflow == 15) {  					pp_ptr = optptr + 3;  					goto error;  				} -				opt->ts = optptr - iph;  				if (skb) {  					optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);  					opt->is_changed = 1;  				}  			} +			opt->ts = optptr - iph;  			break; -		      case IPOPT_RA: +		case IPOPT_RA:  			if (optlen < 4) {  				pp_ptr = optptr + 1;  				goto error; @@ -432,8 +445,8 @@ int ip_options_compile(struct net *net,  			if (optptr[2] == 0 && optptr[3] == 0)  				opt->router_alert = optptr - iph;  			break; -		      case IPOPT_CIPSO: -			if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) { +		case IPOPT_CIPSO: +			if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {  				pp_ptr = optptr;  				goto error;  			} @@ -443,10 +456,10 @@ int ip_options_compile(struct net *net,  				goto error;  			}  			break; -		      case IPOPT_SEC: -		      case IPOPT_SID: -		      default: -			if (!skb && !capable(CAP_NET_RAW)) { +		case IPOPT_SEC: +		case IPOPT_SID: +		default: +			if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {  				pp_ptr = optptr;  				goto error;  			} @@ -472,20 +485,20 @@ EXPORT_SYMBOL(ip_options_compile);   *	Undo all the changes done by ip_options_compile().   
*/ -void ip_options_undo(struct ip_options * opt) +void ip_options_undo(struct ip_options *opt)  {  	if (opt->srr) { -		unsigned  char * optptr = opt->__data+opt->srr-sizeof(struct  iphdr); +		unsigned  char *optptr = opt->__data+opt->srr-sizeof(struct  iphdr);  		memmove(optptr+7, optptr+3, optptr[1]-7);  		memcpy(optptr+3, &opt->faddr, 4);  	}  	if (opt->rr_needaddr) { -		unsigned  char * optptr = opt->__data+opt->rr-sizeof(struct  iphdr); +		unsigned  char *optptr = opt->__data+opt->rr-sizeof(struct  iphdr);  		optptr[2] -= 4;  		memset(&optptr[optptr[2]-1], 0, 4);  	}  	if (opt->ts) { -		unsigned  char * optptr = opt->__data+opt->ts-sizeof(struct  iphdr); +		unsigned  char *optptr = opt->__data+opt->ts-sizeof(struct  iphdr);  		if (opt->ts_needtime) {  			optptr[2] -= 4;  			memset(&optptr[optptr[2]-1], 0, 4); @@ -499,19 +512,19 @@ void ip_options_undo(struct ip_options * opt)  	}  } -static struct ip_options *ip_options_get_alloc(const int optlen) +static struct ip_options_rcu *ip_options_get_alloc(const int optlen)  { -	return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3), +	return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),  		       GFP_KERNEL);  } -static int ip_options_get_finish(struct net *net, struct ip_options **optp, -				 struct ip_options *opt, int optlen) +static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, +				 struct ip_options_rcu *opt, int optlen)  {  	while (optlen & 3) -		opt->__data[optlen++] = IPOPT_END; -	opt->optlen = optlen; -	if (optlen && ip_options_compile(net, opt, NULL)) { +		opt->opt.__data[optlen++] = IPOPT_END; +	opt->opt.optlen = optlen; +	if (optlen && ip_options_compile(net, &opt->opt, NULL)) {  		kfree(opt);  		return -EINVAL;  	} @@ -520,42 +533,42 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp,  	return 0;  } -int ip_options_get_from_user(struct net *net, struct ip_options **optp, +int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,  			     unsigned char __user *data, int optlen)  { -	struct ip_options *opt = ip_options_get_alloc(optlen); +	struct ip_options_rcu *opt = ip_options_get_alloc(optlen);  	if (!opt)  		return -ENOMEM; -	if (optlen && copy_from_user(opt->__data, data, optlen)) { +	if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {  		kfree(opt);  		return -EFAULT;  	}  	return ip_options_get_finish(net, optp, opt, optlen);  } -int ip_options_get(struct net *net, struct ip_options **optp, +int ip_options_get(struct net *net, struct ip_options_rcu **optp,  		   unsigned char *data, int optlen)  { -	struct ip_options *opt = ip_options_get_alloc(optlen); +	struct ip_options_rcu *opt = ip_options_get_alloc(optlen);  	if (!opt)  		return -ENOMEM;  	if (optlen) -		memcpy(opt->__data, data, optlen); +		memcpy(opt->opt.__data, data, optlen);  	return ip_options_get_finish(net, optp, opt, optlen);  }  void ip_forward_options(struct sk_buff *skb)  { -	struct   ip_options * opt	= &(IPCB(skb)->opt); -	unsigned char * optptr; +	struct   ip_options *opt	= &(IPCB(skb)->opt); +	unsigned char *optptr;  	struct rtable *rt = skb_rtable(skb);  	unsigned char *raw = skb_network_header(skb);  	if (opt->rr_needaddr) {  		optptr = (unsigned char *)raw + opt->rr; -		ip_rt_get_source(&optptr[optptr[2]-5], rt); +		ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);  		opt->is_changed = 1;  	}  	if (opt->srr_is_hit) { @@ -563,25 +576,27 @@ void ip_forward_options(struct sk_buff *skb)  		optptr = raw + opt->srr; -		for ( srrptr=optptr[2], srrspace 
= optptr[1]; +		for ( srrptr = optptr[2], srrspace = optptr[1];  		     srrptr <= srrspace;  		     srrptr += 4  		     ) {  			if (srrptr + 3 > srrspace)  				break; -			if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) +			if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)  				break;  		}  		if (srrptr + 3 <= srrspace) {  			opt->is_changed = 1; -			ip_rt_get_source(&optptr[srrptr-1], rt); -			ip_hdr(skb)->daddr = rt->rt_dst; +			ip_hdr(skb)->daddr = opt->nexthop; +			ip_rt_get_source(&optptr[srrptr-1], skb, rt);  			optptr[2] = srrptr+4; -		} else if (net_ratelimit()) -			printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); +		} else { +			net_crit_ratelimited("%s(): Argh! Destination lost!\n", +					     __func__); +		}  		if (opt->ts_needaddr) {  			optptr = raw + opt->ts; -			ip_rt_get_source(&optptr[optptr[2]-9], rt); +			ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);  			opt->is_changed = 1;  		}  	} @@ -603,7 +618,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)  	unsigned long orefdst;  	int err; -	if (!opt->srr) +	if (!rt)  		return 0;  	if (skb->pkt_type != PACKET_HOST) @@ -617,7 +632,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)  	if (rt->rt_type != RTN_LOCAL)  		return -EINVAL; -	for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { +	for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {  		if (srrptr + 3 > srrspace) {  			icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));  			return -EINVAL; @@ -637,11 +652,12 @@ int ip_options_rcv_srr(struct sk_buff *skb)  		if (rt2->rt_type != RTN_LOCAL)  			break;  		/* Superfast 8) loopback forward */ -		memcpy(&iph->daddr, &optptr[srrptr-1], 4); +		iph->daddr = nexthop;  		opt->is_changed = 1;  	}  	if (srrptr <= srrspace) {  		opt->srr_is_hit = 1; +		opt->nexthop = nexthop;  		opt->is_changed = 1;  	}  	return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5090c7ff525..8d3b6b0e985 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -43,7 +43,6 @@   */  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/module.h>  #include <linux/types.h>  #include <linux/kernel.h> @@ -82,9 +81,10 @@  #include <linux/tcp.h>  int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; +EXPORT_SYMBOL(sysctl_ip_default_ttl);  /* Generate a checksum for an outgoing IP datagram. */ -__inline__ void ip_send_check(struct iphdr *iph) +void ip_send_check(struct iphdr *iph)  {  	iph->check = 0;  	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); @@ -101,36 +101,24 @@ int __ip_local_out(struct sk_buff *skb)  		       skb_dst(skb)->dev, dst_output);  } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)  {  	int err;  	err = __ip_local_out(skb);  	if (likely(err == 1)) -		err = dst_output(skb); +		err = dst_output_sk(sk, skb);  	return err;  } -EXPORT_SYMBOL_GPL(ip_local_out); - -/* dev_loopback_xmit for use with netfilter. 
*/ -static int ip_dev_loopback_xmit(struct sk_buff *newskb) -{ -	skb_reset_mac_header(newskb); -	__skb_pull(newskb, skb_network_offset(newskb)); -	newskb->pkt_type = PACKET_LOOPBACK; -	newskb->ip_summed = CHECKSUM_UNNECESSARY; -	WARN_ON(!skb_dst(newskb)); -	netif_rx_ni(newskb); -	return 0; -} +EXPORT_SYMBOL_GPL(ip_local_out_sk);  static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)  {  	int ttl = inet->uc_ttl;  	if (ttl < 0) -		ttl = dst_metric(dst, RTAX_HOPLIMIT); +		ttl = ip4_dst_hoplimit(dst);  	return ttl;  } @@ -139,14 +127,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)   *   */  int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, -			  __be32 saddr, __be32 daddr, struct ip_options *opt) +			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)  {  	struct inet_sock *inet = inet_sk(sk);  	struct rtable *rt = skb_rtable(skb);  	struct iphdr *iph;  	/* Build the IP header. */ -	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); +	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));  	skb_reset_network_header(skb);  	iph = ip_hdr(skb);  	iph->version  = 4; @@ -157,14 +145,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,  	else  		iph->frag_off = 0;  	iph->ttl      = ip_select_ttl(inet, &rt->dst); -	iph->daddr    = rt->rt_dst; -	iph->saddr    = rt->rt_src; +	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr); +	iph->saddr    = saddr;  	iph->protocol = sk->sk_protocol; -	ip_select_ident(iph, &rt->dst, sk); +	ip_select_ident(skb, sk); -	if (opt && opt->optlen) { -		iph->ihl += opt->optlen>>2; -		ip_options_build(skb, opt, daddr, rt, 0); +	if (opt && opt->opt.optlen) { +		iph->ihl += opt->opt.optlen>>2; +		ip_options_build(skb, &opt->opt, daddr, rt, 0);  	}  	skb->priority = sk->sk_priority; @@ -181,6 +169,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)  	struct rtable *rt = (struct rtable *)dst;  	struct net_device *dev = dst->dev;  	unsigned int hh_len = LL_RESERVED_SPACE(dev); +	struct neighbour *neigh; +	u32 nexthop;  	if (rt->rt_type == RTN_MULTICAST) {  		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -198,27 +188,69 @@ static inline int ip_finish_output2(struct sk_buff *skb)  		}  		if (skb->sk)  			skb_set_owner_w(skb2, skb->sk); -		kfree_skb(skb); +		consume_skb(skb);  		skb = skb2;  	} -	if (dst->hh) -		return neigh_hh_output(dst->hh, skb); -	else if (dst->neighbour) -		return dst->neighbour->output(skb); +	rcu_read_lock_bh(); +	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); +	neigh = __ipv4_neigh_lookup_noref(dev, nexthop); +	if (unlikely(!neigh)) +		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); +	if (!IS_ERR(neigh)) { +		int res = dst_neigh_output(dst, neigh, skb); -	if (net_ratelimit()) -		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); +		rcu_read_unlock_bh(); +		return res; +	} +	rcu_read_unlock_bh(); + +	net_dbg_ratelimited("%s: No header cache and no neighbour!\n", +			    __func__);  	kfree_skb(skb);  	return -EINVAL;  } -static inline int ip_skb_dst_mtu(struct sk_buff *skb) +static int ip_finish_output_gso(struct sk_buff *skb)  { -	struct inet_sock *inet = skb->sk ? 
inet_sk(skb->sk) : NULL; +	netdev_features_t features; +	struct sk_buff *segs; +	int ret = 0; + +	/* common case: locally created skb or seglen is <= mtu */ +	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || +	      skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb)) +		return ip_finish_output2(skb); -	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ? -	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); +	/* Slowpath -  GSO segment length is exceeding the dst MTU. +	 * +	 * This can happen in two cases: +	 * 1) TCP GRO packet, DF bit not set +	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly +	 * from host network stack. +	 */ +	features = netif_skb_features(skb); +	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); +	if (IS_ERR(segs)) { +		kfree_skb(skb); +		return -ENOMEM; +	} + +	consume_skb(skb); + +	do { +		struct sk_buff *nskb = segs->next; +		int err; + +		segs->next = NULL; +		err = ip_fragment(segs, ip_finish_output2); + +		if (err && ret == 0) +			ret = err; +		segs = nskb; +	} while (segs); + +	return ret;  }  static int ip_finish_output(struct sk_buff *skb) @@ -230,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb)  		return dst_output(skb);  	}  #endif -	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) +	if (skb_is_gso(skb)) +		return ip_finish_output_gso(skb); + +	if (skb->len > ip_skb_dst_mtu(skb))  		return ip_fragment(skb, ip_finish_output2); -	else -		return ip_finish_output2(skb); + +	return ip_finish_output2(skb);  } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb)  { -	struct sock *sk = skb->sk;  	struct rtable *rt = skb_rtable(skb);  	struct net_device *dev = rt->dst.dev; @@ -274,7 +308,7 @@ int ip_mc_output(struct sk_buff *skb)  			if (newskb)  				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,  					newskb, NULL, newskb->dev, -					ip_dev_loopback_xmit); +					dev_loopback_xmit);  		}  		/* Multicasts with ttl 0 must not go beyond the host */ @@ -289,7 +323,7 @@ int ip_mc_output(struct sk_buff *skb)  		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);  		if (newskb)  			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, -				NULL, newskb->dev, ip_dev_loopback_xmit); +				NULL, newskb->dev, dev_loopback_xmit);  	}  	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, @@ -297,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb)  			    !(IPCB(skb)->flags & IPSKB_REROUTED));  } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb)  {  	struct net_device *dev = skb_dst(skb)->dev; @@ -311,11 +345,26 @@ int ip_output(struct sk_buff *skb)  			    !(IPCB(skb)->flags & IPSKB_REROUTED));  } -int ip_queue_xmit(struct sk_buff *skb) +/* + * copy saddr and daddr, possibly using 64bit load/stores + * Equivalent to : + *   iph->saddr = fl4->saddr; + *   iph->daddr = fl4->daddr; + */ +static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) +{ +	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != +		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); +	memcpy(&iph->saddr, &fl4->saddr, +	       sizeof(fl4->saddr) + sizeof(fl4->daddr)); +} + +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)  { -	struct sock *sk = skb->sk;  	struct inet_sock *inet = inet_sk(sk); -	struct ip_options *opt = inet->opt; +	struct ip_options_rcu *inet_opt; +	struct flowi4 *fl4;  	struct rtable *rt;  	struct iphdr *iph;  	int res; @@ -324,6 +373,8 @@ int 
ip_queue_xmit(struct sk_buff *skb)  	 * f.e. by something like SCTP.  	 */  	rcu_read_lock(); +	inet_opt = rcu_dereference(inet->inet_opt); +	fl4 = &fl->u.ip4;  	rt = skb_rtable(skb);  	if (rt != NULL)  		goto packet_routed; @@ -335,59 +386,53 @@ int ip_queue_xmit(struct sk_buff *skb)  		/* Use correct destination address if we have options. */  		daddr = inet->inet_daddr; -		if(opt && opt->srr) -			daddr = opt->faddr; - -		{ -			struct flowi fl = { .oif = sk->sk_bound_dev_if, -					    .mark = sk->sk_mark, -					    .fl4_dst = daddr, -					    .fl4_src = inet->inet_saddr, -					    .fl4_tos = RT_CONN_FLAGS(sk), -					    .proto = sk->sk_protocol, -					    .flags = inet_sk_flowi_flags(sk), -					    .fl_ip_sport = inet->inet_sport, -					    .fl_ip_dport = inet->inet_dport }; - -			/* If this fails, retransmit mechanism of transport layer will -			 * keep trying until route appears or the connection times -			 * itself out. -			 */ -			security_sk_classify_flow(sk, &fl); -			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) -				goto no_route; -		} +		if (inet_opt && inet_opt->opt.srr) +			daddr = inet_opt->opt.faddr; + +		/* If this fails, retransmit mechanism of transport layer will +		 * keep trying until route appears or the connection times +		 * itself out. +		 */ +		rt = ip_route_output_ports(sock_net(sk), fl4, sk, +					   daddr, inet->inet_saddr, +					   inet->inet_dport, +					   inet->inet_sport, +					   sk->sk_protocol, +					   RT_CONN_FLAGS(sk), +					   sk->sk_bound_dev_if); +		if (IS_ERR(rt)) +			goto no_route;  		sk_setup_caps(sk, &rt->dst);  	}  	skb_dst_set_noref(skb, &rt->dst);  packet_routed: -	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) +	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)  		goto no_route;  	/* OK, we know where to send it, allocate and build IP header. */ -	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); +	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));  	skb_reset_network_header(skb);  	iph = ip_hdr(skb);  	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); -	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) +	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)  		iph->frag_off = htons(IP_DF);  	else  		iph->frag_off = 0;  	iph->ttl      = ip_select_ttl(inet, &rt->dst);  	iph->protocol = sk->sk_protocol; -	iph->saddr    = rt->rt_src; -	iph->daddr    = rt->rt_dst; +	ip_copy_addrs(iph, fl4); +  	/* Transport layer set skb->h.foo itself. */ -	if (opt && opt->optlen) { -		iph->ihl += opt->optlen >> 2; -		ip_options_build(skb, opt, inet->inet_daddr, rt, 0); +	if (inet_opt && inet_opt->opt.optlen) { +		iph->ihl += inet_opt->opt.optlen >> 2; +		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);  	} -	ip_select_ident_more(iph, &rt->dst, sk, -			     (skb_shinfo(skb)->gso_segs ?: 1) - 1); +	ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); +	/* TODO : should we use skb->sk here instead of sk ? 
*/  	skb->priority = sk->sk_priority;  	skb->mark = sk->sk_mark; @@ -421,10 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)  	to->tc_index = from->tc_index;  #endif  	nf_copy(to, from); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ -    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) -	to->nf_trace = from->nf_trace; -#endif  #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)  	to->ipvs_property = from->ipvs_property;  #endif @@ -458,10 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))  	iph = ip_hdr(skb); -	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { +	mtu = ip_skb_dst_mtu(skb); +	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || +		     (IPCB(skb)->frag_max_size && +		      IPCB(skb)->frag_max_size > mtu))) {  		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);  		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, -			  htonl(ip_skb_dst_mtu(skb))); +			  htonl(mtu));  		kfree_skb(skb);  		return -EMSGSIZE;  	} @@ -471,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))  	 */  	hlen = iph->ihl * 4; -	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */ +	mtu = mtu - hlen;	/* Size of data space */  #ifdef CONFIG_BRIDGE_NETFILTER  	if (skb->nf_bridge)  		mtu -= nf_bridge_mtu_reduction(skb); @@ -491,7 +535,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))  		if (first_len - hlen > mtu ||  		    ((first_len - hlen) & 7) || -		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) || +		    ip_is_fragment(iph) ||  		    skb_cloned(skb))  			goto slow_path; @@ -584,6 +628,11 @@ slow_path_clean:  	}  slow_path: +	/* for offloaded checksums cleanup checksum before fragmentation */ +	if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb)) +		goto fail; +	iph = ip_hdr(skb); +  	left = skb->len - hlen;		/* Space per frame */  	ptr = hlen;		/* Where to start from */ @@ -608,7 +657,7 @@ slow_path:  		/* IF: it doesn't fit, use 'mtu' - the data space left */  		if (len > mtu)  			len = mtu; -		/* IF: we are not sending upto and including the packet end +		/* IF: we are not sending up to and including the packet end  		   then align the next start on an eight byte boundary */  		if (len < left)	{  			len &= ~7; @@ -691,7 +740,7 @@ slow_path:  		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);  	} -	kfree_skb(skb); +	consume_skb(skb);  	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);  	return err; @@ -732,10 +781,11 @@ csum_page(struct page *page, int offset, int copy)  }  static inline int ip_ufo_append_data(struct sock *sk, +			struct sk_buff_head *queue,  			int getfrag(void *from, char *to, int offset, int len,  			       int odd, struct sk_buff *skb),  			void *from, int length, int hh_len, int fragheaderlen, -			int transhdrlen, int mtu, unsigned int flags) +			int transhdrlen, int maxfraglen, unsigned int flags)  {  	struct sk_buff *skb;  	int err; @@ -744,7 +794,7 @@ static inline int ip_ufo_append_data(struct sock *sk,  	 * device, so create one single skb packet containing complete  	 * udp datagram  	 */ -	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { +	if ((skb = skb_peek_tail(queue)) == NULL) {  		skb = sock_alloc_send_skb(sk,  			hh_len + fragheaderlen + transhdrlen + 20,  			(flags & MSG_DONTWAIT), &err); @@ -764,104 +814,62 @@ static inline int ip_ufo_append_data(struct sock *sk,  		/* initialize protocol header pointer */  		skb->transport_header = skb->network_header + fragheaderlen; -		
skb->ip_summed = CHECKSUM_PARTIAL;  		skb->csum = 0; -		sk->sk_sndmsg_off = 0; -		/* specify the length of each IP datagram fragment */ -		skb_shinfo(skb)->gso_size = mtu - fragheaderlen; -		skb_shinfo(skb)->gso_type = SKB_GSO_UDP; -		__skb_queue_tail(&sk->sk_write_queue, skb); + +		__skb_queue_tail(queue, skb); +	} else if (skb_is_gso(skb)) { +		goto append;  	} +	skb->ip_summed = CHECKSUM_PARTIAL; +	/* specify the length of each IP datagram fragment */ +	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen; +	skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + +append:  	return skb_append_datato_frags(sk, skb, getfrag, from,  				       (length - transhdrlen));  } -/* - *	ip_append_data() and ip_append_page() can make one large IP datagram - *	from many pieces of data. Each pieces will be holded on the socket - *	until ip_push_pending_frames() is called. Each piece can be a page - *	or non-page data. - * - *	Not only UDP, other transport protocols - e.g. raw sockets - can use - *	this interface potentially. - * - *	LATER: length must be adjusted by pad at tail, when it is required. - */ -int ip_append_data(struct sock *sk, -		   int getfrag(void *from, char *to, int offset, int len, -			       int odd, struct sk_buff *skb), -		   void *from, int length, int transhdrlen, -		   struct ipcm_cookie *ipc, struct rtable **rtp, -		   unsigned int flags) +static int __ip_append_data(struct sock *sk, +			    struct flowi4 *fl4, +			    struct sk_buff_head *queue, +			    struct inet_cork *cork, +			    struct page_frag *pfrag, +			    int getfrag(void *from, char *to, int offset, +					int len, int odd, struct sk_buff *skb), +			    void *from, int length, int transhdrlen, +			    unsigned int flags)  {  	struct inet_sock *inet = inet_sk(sk);  	struct sk_buff *skb; -	struct ip_options *opt = NULL; +	struct ip_options *opt = cork->opt;  	int hh_len;  	int exthdrlen;  	int mtu;  	int copy;  	int err;  	int offset = 0; -	unsigned int maxfraglen, fragheaderlen; +	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;  	int csummode = CHECKSUM_NONE; -	struct rtable *rt; +	struct rtable *rt = (struct rtable *)cork->dst; -	if (flags&MSG_PROBE) -		return 0; +	skb = skb_peek_tail(queue); -	if (skb_queue_empty(&sk->sk_write_queue)) { -		/* -		 * setup for corking. -		 */ -		opt = ipc->opt; -		if (opt) { -			if (inet->cork.opt == NULL) { -				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); -				if (unlikely(inet->cork.opt == NULL)) -					return -ENOBUFS; -			} -			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); -			inet->cork.flags |= IPCORK_OPT; -			inet->cork.addr = ipc->addr; -		} -		rt = *rtp; -		if (unlikely(!rt)) -			return -EFAULT; -		/* -		 * We steal reference to this route, caller should not release it -		 */ -		*rtp = NULL; -		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? -					    rt->dst.dev->mtu : -					    dst_mtu(rt->dst.path); -		inet->cork.dst = &rt->dst; -		inet->cork.length = 0; -		sk->sk_sndmsg_page = NULL; -		sk->sk_sndmsg_off = 0; -		exthdrlen = rt->dst.header_len; -		length += exthdrlen; -		transhdrlen += exthdrlen; -	} else { -		rt = (struct rtable *)inet->cork.dst; -		if (inet->cork.flags & IPCORK_OPT) -			opt = inet->cork.opt; +	exthdrlen = !skb ? rt->dst.header_len : 0; +	mtu = cork->fragsize; -		transhdrlen = 0; -		exthdrlen = 0; -		mtu = inet->cork.fragsize; -	}  	hh_len = LL_RESERVED_SPACE(rt->dst.dev);  	fragheaderlen = sizeof(struct iphdr) + (opt ? 
opt->optlen : 0);  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; +	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; -	if (inet->cork.length + length > 0xFFFF - fragheaderlen) { -		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, -			       mtu-exthdrlen); +	if (cork->length + length > maxnonfragsize - fragheaderlen) { +		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, +			       mtu - (opt ? opt->optlen : 0));  		return -EMSGSIZE;  	} @@ -875,15 +883,13 @@ int ip_append_data(struct sock *sk,  	    !exthdrlen)  		csummode = CHECKSUM_PARTIAL; -	skb = skb_peek_tail(&sk->sk_write_queue); - -	inet->cork.length += length; +	cork->length += length;  	if (((length > mtu) || (skb && skb_is_gso(skb))) &&  	    (sk->sk_protocol == IPPROTO_UDP) && -	    (rt->dst.dev->features & NETIF_F_UFO)) { -		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, -					 fragheaderlen, transhdrlen, mtu, -					 flags); +	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { +		err = ip_ufo_append_data(sk, queue, getfrag, from, length, +					 hh_len, fragheaderlen, transhdrlen, +					 maxfraglen, flags);  		if (err)  			goto error;  		return 0; @@ -933,17 +939,16 @@ alloc_new_skb:  			else  				alloclen = fraglen; +			alloclen += exthdrlen; +  			/* The last fragment gets additional space at tail.  			 * Note, with MSG_MORE we overallocate on fragments,  			 * because we have no idea what fragment will be  			 * the last.  			 */ -			if (datalen == length + fraggap) { +			if (datalen == length + fraggap)  				alloclen += rt->dst.trailer_len; -				/* make sure mtu is not reached */ -				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len) -					datalen -= ALIGN(rt->dst.trailer_len, 8); -			} +  			if (transhdrlen) {  				skb = sock_alloc_send_skb(sk,  						alloclen + hh_len + 15, @@ -960,7 +965,7 @@ alloc_new_skb:  				else  					/* only the initial fragment is  					   time stamped */ -					ipc->tx_flags = 0; +					cork->tx_flags = 0;  			}  			if (skb == NULL)  				goto error; @@ -971,16 +976,16 @@ alloc_new_skb:  			skb->ip_summed = csummode;  			skb->csum = 0;  			skb_reserve(skb, hh_len); -			skb_shinfo(skb)->tx_flags = ipc->tx_flags; +			skb_shinfo(skb)->tx_flags = cork->tx_flags;  			/*  			 *	Find where to start putting bytes.  			 */ -			data = skb_put(skb, fraglen); +			data = skb_put(skb, fraglen + exthdrlen);  			skb_set_network_header(skb, exthdrlen);  			skb->transport_header = (skb->network_header +  						 fragheaderlen); -			data += fragheaderlen; +			data += fragheaderlen + exthdrlen;  			if (fraggap) {  				skb->csum = skb_copy_and_csum_bits( @@ -1008,7 +1013,7 @@ alloc_new_skb:  			/*  			 * Put the packet on the pending queue.  			 
*/ -			__skb_queue_tail(&sk->sk_write_queue, skb); +			__skb_queue_tail(queue, skb);  			continue;  		} @@ -1027,46 +1032,30 @@ alloc_new_skb:  			}  		} else {  			int i = skb_shinfo(skb)->nr_frags; -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; -			struct page *page = sk->sk_sndmsg_page; -			int off = sk->sk_sndmsg_off; -			unsigned int left; - -			if (page && (left = PAGE_SIZE - off) > 0) { -				if (copy >= left) -					copy = left; -				if (page != frag->page) { -					if (i == MAX_SKB_FRAGS) { -						err = -EMSGSIZE; -						goto error; -					} -					get_page(page); -					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); -					frag = &skb_shinfo(skb)->frags[i]; -				} -			} else if (i < MAX_SKB_FRAGS) { -				if (copy > PAGE_SIZE) -					copy = PAGE_SIZE; -				page = alloc_pages(sk->sk_allocation, 0); -				if (page == NULL)  { -					err = -ENOMEM; -					goto error; -				} -				sk->sk_sndmsg_page = page; -				sk->sk_sndmsg_off = 0; -				skb_fill_page_desc(skb, i, page, 0, 0); -				frag = &skb_shinfo(skb)->frags[i]; -			} else { -				err = -EMSGSIZE; -				goto error; -			} -			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { -				err = -EFAULT; +			err = -ENOMEM; +			if (!sk_page_frag_refill(sk, pfrag))  				goto error; + +			if (!skb_can_coalesce(skb, i, pfrag->page, +					      pfrag->offset)) { +				err = -EMSGSIZE; +				if (i == MAX_SKB_FRAGS) +					goto error; + +				__skb_fill_page_desc(skb, i, pfrag->page, +						     pfrag->offset, 0); +				skb_shinfo(skb)->nr_frags = ++i; +				get_page(pfrag->page);  			} -			sk->sk_sndmsg_off += copy; -			frag->size += copy; +			copy = min_t(int, copy, pfrag->size - pfrag->offset); +			if (getfrag(from, +				    page_address(pfrag->page) + pfrag->offset, +				    offset, copy, skb->len, skb) < 0) +				goto error_efault; + +			pfrag->offset += copy; +			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);  			skb->len += copy;  			skb->data_len += copy;  			skb->truesize += copy; @@ -1078,24 +1067,104 @@ alloc_new_skb:  	return 0; +error_efault: +	err = -EFAULT;  error: -	inet->cork.length -= length; +	cork->length -= length;  	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);  	return err;  } -ssize_t	ip_append_page(struct sock *sk, struct page *page, +static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, +			 struct ipcm_cookie *ipc, struct rtable **rtp) +{ +	struct ip_options_rcu *opt; +	struct rtable *rt; + +	/* +	 * setup for corking. +	 */ +	opt = ipc->opt; +	if (opt) { +		if (cork->opt == NULL) { +			cork->opt = kmalloc(sizeof(struct ip_options) + 40, +					    sk->sk_allocation); +			if (unlikely(cork->opt == NULL)) +				return -ENOBUFS; +		} +		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); +		cork->flags |= IPCORK_OPT; +		cork->addr = ipc->addr; +	} +	rt = *rtp; +	if (unlikely(!rt)) +		return -EFAULT; +	/* +	 * We steal reference to this route, caller should not release it +	 */ +	*rtp = NULL; +	cork->fragsize = ip_sk_use_pmtu(sk) ? +			 dst_mtu(&rt->dst) : rt->dst.dev->mtu; +	cork->dst = &rt->dst; +	cork->length = 0; +	cork->ttl = ipc->ttl; +	cork->tos = ipc->tos; +	cork->priority = ipc->priority; +	cork->tx_flags = ipc->tx_flags; + +	return 0; +} + +/* + *	ip_append_data() and ip_append_page() can make one large IP datagram + *	from many pieces of data. Each pieces will be holded on the socket + *	until ip_push_pending_frames() is called. Each piece can be a page + *	or non-page data. 
+ * + *	Not only UDP, other transport protocols - e.g. raw sockets - can use + *	this interface potentially. + * + *	LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, struct flowi4 *fl4, +		   int getfrag(void *from, char *to, int offset, int len, +			       int odd, struct sk_buff *skb), +		   void *from, int length, int transhdrlen, +		   struct ipcm_cookie *ipc, struct rtable **rtp, +		   unsigned int flags) +{ +	struct inet_sock *inet = inet_sk(sk); +	int err; + +	if (flags&MSG_PROBE) +		return 0; + +	if (skb_queue_empty(&sk->sk_write_queue)) { +		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); +		if (err) +			return err; +	} else { +		transhdrlen = 0; +	} + +	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, +				sk_page_frag(sk), getfrag, +				from, length, transhdrlen, flags); +} + +ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,  		       int offset, size_t size, int flags)  {  	struct inet_sock *inet = inet_sk(sk);  	struct sk_buff *skb;  	struct rtable *rt;  	struct ip_options *opt = NULL; +	struct inet_cork *cork;  	int hh_len;  	int mtu;  	int len;  	int err; -	unsigned int maxfraglen, fragheaderlen, fraggap; +	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;  	if (inet->hdrincl)  		return -EPERM; @@ -1106,28 +1175,31 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,  	if (skb_queue_empty(&sk->sk_write_queue))  		return -EINVAL; -	rt = (struct rtable *)inet->cork.dst; -	if (inet->cork.flags & IPCORK_OPT) -		opt = inet->cork.opt; +	cork = &inet->cork.base; +	rt = (struct rtable *)cork->dst; +	if (cork->flags & IPCORK_OPT) +		opt = cork->opt;  	if (!(rt->dst.dev->features&NETIF_F_SG))  		return -EOPNOTSUPP;  	hh_len = LL_RESERVED_SPACE(rt->dst.dev); -	mtu = inet->cork.fragsize; +	mtu = cork->fragsize;  	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);  	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; +	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; -	if (inet->cork.length + size > 0xFFFF - fragheaderlen) { -		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); +	if (cork->length + size > maxnonfragsize - fragheaderlen) { +		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, +			       mtu - (opt ? 
opt->optlen : 0));  		return -EMSGSIZE;  	}  	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)  		return -EINVAL; -	inet->cork.length += size; +	cork->length += size;  	if ((size + skb->len > mtu) &&  	    (sk->sk_protocol == IPPROTO_UDP) &&  	    (rt->dst.dev->features & NETIF_F_UFO)) { @@ -1197,7 +1269,7 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,  		if (len > size)  			len = size;  		if (skb_can_coalesce(skb, i, page, offset)) { -			skb_shinfo(skb)->frags[i-1].size += len; +			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);  		} else if (i < MAX_SKB_FRAGS) {  			get_page(page);  			skb_fill_page_desc(skb, i, page, offset, len); @@ -1222,45 +1294,47 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,  	return 0;  error: -	inet->cork.length -= size; +	cork->length -= size;  	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);  	return err;  } -static void ip_cork_release(struct inet_sock *inet) +static void ip_cork_release(struct inet_cork *cork)  { -	inet->cork.flags &= ~IPCORK_OPT; -	kfree(inet->cork.opt); -	inet->cork.opt = NULL; -	dst_release(inet->cork.dst); -	inet->cork.dst = NULL; +	cork->flags &= ~IPCORK_OPT; +	kfree(cork->opt); +	cork->opt = NULL; +	dst_release(cork->dst); +	cork->dst = NULL;  }  /*   *	Combined all pending IP fragments on the socket as one IP datagram   *	and push them out.   */ -int ip_push_pending_frames(struct sock *sk) +struct sk_buff *__ip_make_skb(struct sock *sk, +			      struct flowi4 *fl4, +			      struct sk_buff_head *queue, +			      struct inet_cork *cork)  {  	struct sk_buff *skb, *tmp_skb;  	struct sk_buff **tail_skb;  	struct inet_sock *inet = inet_sk(sk);  	struct net *net = sock_net(sk);  	struct ip_options *opt = NULL; -	struct rtable *rt = (struct rtable *)inet->cork.dst; +	struct rtable *rt = (struct rtable *)cork->dst;  	struct iphdr *iph;  	__be16 df = 0;  	__u8 ttl; -	int err = 0; -	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) +	if ((skb = __skb_dequeue(queue)) == NULL)  		goto out;  	tail_skb = &(skb_shinfo(skb)->frag_list);  	/* move skb->data to ip header from ext header */  	if (skb->data < skb_network_header(skb))  		__skb_pull(skb, skb_network_offset(skb)); -	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { +	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {  		__skb_pull(tmp_skb, skb_network_header_len(skb));  		*tail_skb = tmp_skb;  		tail_skb = &(tmp_skb->next); @@ -1275,84 +1349,141 @@ int ip_push_pending_frames(struct sock *sk)  	 * to fragment the frame generated here. No matter, what transforms  	 * how transforms change size of the packet, it will come out.  	 */ -	if (inet->pmtudisc < IP_PMTUDISC_DO) -		skb->local_df = 1; +	skb->ignore_df = ip_sk_ignore_df(sk);  	/* DF bit is set when we want to see DF on outgoing frames. -	 * If local_df is set too, we still allow to fragment this frame +	 * If ignore_df is set too, we still allow to fragment this frame  	 * locally. 
*/ -	if (inet->pmtudisc >= IP_PMTUDISC_DO || +	if (inet->pmtudisc == IP_PMTUDISC_DO || +	    inet->pmtudisc == IP_PMTUDISC_PROBE ||  	    (skb->len <= dst_mtu(&rt->dst) &&  	     ip_dont_fragment(sk, &rt->dst)))  		df = htons(IP_DF); -	if (inet->cork.flags & IPCORK_OPT) -		opt = inet->cork.opt; +	if (cork->flags & IPCORK_OPT) +		opt = cork->opt; -	if (rt->rt_type == RTN_MULTICAST) +	if (cork->ttl != 0) +		ttl = cork->ttl; +	else if (rt->rt_type == RTN_MULTICAST)  		ttl = inet->mc_ttl;  	else  		ttl = ip_select_ttl(inet, &rt->dst); -	iph = (struct iphdr *)skb->data; +	iph = ip_hdr(skb);  	iph->version = 4;  	iph->ihl = 5; -	if (opt) { -		iph->ihl += opt->optlen>>2; -		ip_options_build(skb, opt, inet->cork.addr, rt, 0); -	} -	iph->tos = inet->tos; +	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;  	iph->frag_off = df; -	ip_select_ident(iph, &rt->dst, sk);  	iph->ttl = ttl;  	iph->protocol = sk->sk_protocol; -	iph->saddr = rt->rt_src; -	iph->daddr = rt->rt_dst; +	ip_copy_addrs(iph, fl4); +	ip_select_ident(skb, sk); -	skb->priority = sk->sk_priority; +	if (opt) { +		iph->ihl += opt->optlen>>2; +		ip_options_build(skb, opt, cork->addr, rt, 0); +	} + +	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;  	skb->mark = sk->sk_mark;  	/*  	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec  	 * on dst refcount  	 */ -	inet->cork.dst = NULL; +	cork->dst = NULL;  	skb_dst_set(skb, &rt->dst);  	if (iph->protocol == IPPROTO_ICMP)  		icmp_out_count(net, ((struct icmphdr *)  			skb_transport_header(skb))->type); -	/* Netfilter gets whole the not fragmented skb. */ +	ip_cork_release(cork); +out: +	return skb; +} + +int ip_send_skb(struct net *net, struct sk_buff *skb) +{ +	int err; +  	err = ip_local_out(skb);  	if (err) {  		if (err > 0)  			err = net_xmit_errno(err);  		if (err) -			goto error; +			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);  	} -out: -	ip_cork_release(inet);  	return err; +} -error: -	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); -	goto out; +int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) +{ +	struct sk_buff *skb; + +	skb = ip_finish_skb(sk, fl4); +	if (!skb) +		return 0; + +	/* Netfilter gets whole the not fragmented skb. */ +	return ip_send_skb(sock_net(sk), skb);  }  /*   *	Throw away all pending data on the socket.   
*/ -void ip_flush_pending_frames(struct sock *sk) +static void __ip_flush_pending_frames(struct sock *sk, +				      struct sk_buff_head *queue, +				      struct inet_cork *cork)  {  	struct sk_buff *skb; -	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) +	while ((skb = __skb_dequeue_tail(queue)) != NULL)  		kfree_skb(skb); -	ip_cork_release(inet_sk(sk)); +	ip_cork_release(cork);  } +void ip_flush_pending_frames(struct sock *sk) +{ +	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); +} + +struct sk_buff *ip_make_skb(struct sock *sk, +			    struct flowi4 *fl4, +			    int getfrag(void *from, char *to, int offset, +					int len, int odd, struct sk_buff *skb), +			    void *from, int length, int transhdrlen, +			    struct ipcm_cookie *ipc, struct rtable **rtp, +			    unsigned int flags) +{ +	struct inet_cork cork; +	struct sk_buff_head queue; +	int err; + +	if (flags & MSG_PROBE) +		return NULL; + +	__skb_queue_head_init(&queue); + +	cork.flags = 0; +	cork.addr = 0; +	cork.opt = NULL; +	err = ip_setup_cork(sk, &cork, ipc, rtp); +	if (err) +		return ERR_PTR(err); + +	err = __ip_append_data(sk, fl4, &queue, &cork, +			       &current->task_frag, getfrag, +			       from, length, transhdrlen, flags); +	if (err) { +		__ip_flush_pending_frames(sk, &queue, &cork); +		return ERR_PTR(err); +	} + +	return __ip_make_skb(sk, fl4, &queue, &cork); +}  /*   *	Fetch data from kernel space and fill in checksum if needed. @@ -1369,74 +1500,88 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,  /*   *	Generic function to send a packet as reply to another packet. - *	Used to send TCP resets so far. ICMP should use this function too. + *	Used to send some TCP resets/acks so far.   * - *	Should run single threaded per socket because it uses the sock - *     	structure to pass arguments. + *	Use a fake percpu inet socket to avoid false sharing and contention.   
*/ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, -		   unsigned int len) +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { +	.sk = { +		.__sk_common = { +			.skc_refcnt = ATOMIC_INIT(1), +		}, +		.sk_wmem_alloc	= ATOMIC_INIT(1), +		.sk_allocation	= GFP_ATOMIC, +		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE), +	}, +	.pmtudisc	= IP_PMTUDISC_WANT, +	.uc_ttl		= -1, +}; + +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, +			   __be32 saddr, const struct ip_reply_arg *arg, +			   unsigned int len)  { -	struct inet_sock *inet = inet_sk(sk); -	struct { -		struct ip_options	opt; -		char			data[40]; -	} replyopts; +	struct ip_options_data replyopts;  	struct ipcm_cookie ipc; -	__be32 daddr; +	struct flowi4 fl4;  	struct rtable *rt = skb_rtable(skb); +	struct sk_buff *nskb; +	struct sock *sk; +	struct inet_sock *inet; -	if (ip_options_echo(&replyopts.opt, skb)) +	if (ip_options_echo(&replyopts.opt.opt, skb))  		return; -	daddr = ipc.addr = rt->rt_src; +	ipc.addr = daddr;  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1; -	if (replyopts.opt.optlen) { +	if (replyopts.opt.opt.optlen) {  		ipc.opt = &replyopts.opt; -		if (ipc.opt->srr) -			daddr = replyopts.opt.faddr; +		if (replyopts.opt.opt.srr) +			daddr = replyopts.opt.opt.faddr;  	} -	{ -		struct flowi fl = { .oif = arg->bound_dev_if, -				    .fl4_dst = daddr, -				    .fl4_src = rt->rt_spec_dst, -				    .fl4_tos = RT_TOS(ip_hdr(skb)->tos), -				    .fl_ip_sport = tcp_hdr(skb)->dest, -				    .fl_ip_dport = tcp_hdr(skb)->source, -				    .proto = sk->sk_protocol, -				    .flags = ip_reply_arg_flowi_flags(arg) }; -		security_skb_classify_flow(skb, &fl); -		if (ip_route_output_key(sock_net(sk), &rt, &fl)) -			return; -	} +	flowi4_init_output(&fl4, arg->bound_dev_if, +			   IP4_REPLY_MARK(net, skb->mark), +			   RT_TOS(arg->tos), +			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, +			   ip_reply_arg_flowi_flags(arg), +			   daddr, saddr, +			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest); +	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); +	rt = ip_route_output_key(net, &fl4); +	if (IS_ERR(rt)) +		return; -	/* And let IP do all the hard work. +	inet = &get_cpu_var(unicast_sock); -	   This chunk is not reenterable, hence spinlock. -	   Note that it uses the fact, that this function is called -	   with locally disabled BH and that sk cannot be already spinlocked. 
-	 */ -	bh_lock_sock(sk); -	inet->tos = ip_hdr(skb)->tos; +	inet->tos = arg->tos; +	sk = &inet->sk;  	sk->sk_priority = skb->priority;  	sk->sk_protocol = ip_hdr(skb)->protocol;  	sk->sk_bound_dev_if = arg->bound_dev_if; -	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, +	sock_net_set(sk, net); +	__skb_queue_head_init(&sk->sk_write_queue); +	sk->sk_sndbuf = sysctl_wmem_default; +	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,  		       &ipc, &rt, MSG_DONTWAIT); -	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { +	nskb = skb_peek(&sk->sk_write_queue); +	if (nskb) {  		if (arg->csumoffset >= 0) -			*((__sum16 *)skb_transport_header(skb) + -			  arg->csumoffset) = csum_fold(csum_add(skb->csum, +			*((__sum16 *)skb_transport_header(nskb) + +			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,  								arg->csum)); -		skb->ip_summed = CHECKSUM_NONE; -		ip_push_pending_frames(sk); +		nskb->ip_summed = CHECKSUM_NONE; +		skb_orphan(nskb); +		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); +		ip_push_pending_frames(sk, &fl4);  	} -	bh_unlock_sock(sk); +	put_cpu_var(unicast_sock);  	ip_rt_put(rt);  } @@ -1446,7 +1591,7 @@ void __init ip_init(void)  	ip_rt_init();  	inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) -	igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) +	igmp_mc_init();  #endif  } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3948c86e59c..64741b93863 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -33,12 +33,14 @@  #include <linux/netfilter.h>  #include <linux/route.h>  #include <linux/mroute.h> +#include <net/inet_ecn.h>  #include <net/route.h>  #include <net/xfrm.h>  #include <net/compat.h> -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  #include <net/transp_v6.h>  #endif +#include <net/ip_fib.h>  #include <linux/errqueue.h>  #include <asm/uaccess.h> @@ -57,17 +59,9 @@  static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)  { -	struct in_pktinfo info; -	struct rtable *rt = skb_rtable(skb); +	struct in_pktinfo info = *PKTINFO_SKB_CB(skb);  	info.ipi_addr.s_addr = ip_hdr(skb)->daddr; -	if (rt) { -		info.ipi_ifindex = rt->rt_iif; -		info.ipi_spec_dst.s_addr = rt->rt_spec_dst; -	} else { -		info.ipi_ifindex = 0; -		info.ipi_spec_dst.s_addr = 0; -	}  	put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);  } @@ -96,7 +90,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)  static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)  {  	unsigned char optbuf[sizeof(struct ip_options) + 40]; -	struct ip_options * opt = (struct ip_options *)optbuf; +	struct ip_options *opt = (struct ip_options *)optbuf;  	if (IPCB(skb)->opt.optlen == 0)  		return; @@ -131,7 +125,7 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)  static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)  {  	struct sockaddr_in sin; -	struct iphdr *iph = ip_hdr(skb); +	const struct iphdr *iph = ip_hdr(skb);  	__be16 *ports = (__be16 *)skb_transport_header(skb);  	if (skb_transport_offset(skb) + 4 > skb->len) @@ -153,7 +147,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)  void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)  {  	struct inet_sock *inet = inet_sk(skb->sk); -	unsigned flags = inet->cmsg_flags; +	unsigned int flags = inet->cmsg_flags;  	/* Ordered by supposed usage frequency */  	if (flags & 1) @@ -192,14 
+186,31 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)  }  EXPORT_SYMBOL(ip_cmsg_recv); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) +int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, +		 bool allow_ipv6)  { -	int err; +	int err, val;  	struct cmsghdr *cmsg;  	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {  		if (!CMSG_OK(msg, cmsg))  			return -EINVAL; +#if defined(CONFIG_IPV6) +		if (allow_ipv6 && +		    cmsg->cmsg_level == SOL_IPV6 && +		    cmsg->cmsg_type == IPV6_PKTINFO) { +			struct in6_pktinfo *src_info; + +			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) +				return -EINVAL; +			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); +			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) +				return -EINVAL; +			ipc->oif = src_info->ipi6_ifindex; +			ipc->addr = src_info->ipi6_addr.s6_addr32[3]; +			continue; +		} +#endif  		if (cmsg->cmsg_level != SOL_IP)  			continue;  		switch (cmsg->cmsg_type) { @@ -220,6 +231,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)  			ipc->addr = info->ipi_spec_dst.s_addr;  			break;  		} +		case IP_TTL: +			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) +				return -EINVAL; +			val = *(int *)CMSG_DATA(cmsg); +			if (val < 1 || val > 255) +				return -EINVAL; +			ipc->ttl = val; +			break; +		case IP_TOS: +			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) +				return -EINVAL; +			val = *(int *)CMSG_DATA(cmsg); +			if (val < 0 || val > 255) +				return -EINVAL; +			ipc->tos = val; +			ipc->priority = rt_tos2priority(ipc->tos); +			break; +  		default:  			return -EINVAL;  		} @@ -373,11 +402,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf  /*   *	Handle MSG_ERRQUEUE   */ -int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)  {  	struct sock_exterr_skb *serr;  	struct sk_buff *skb, *skb2; -	struct sockaddr_in *sin; +	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);  	struct {  		struct sock_extended_err ee;  		struct sockaddr_in	 offender; @@ -403,13 +432,13 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)  	serr = SKB_EXT_ERR(skb); -	sin = (struct sockaddr_in *)msg->msg_name;  	if (sin) {  		sin->sin_family = AF_INET;  		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +  						   serr->addr_offset);  		sin->sin_port = serr->port;  		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); +		*addr_len = sizeof(*sin);  	}  	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); @@ -462,18 +491,28 @@ static int do_ip_setsockopt(struct sock *sk, int level,  	struct inet_sock *inet = inet_sk(sk);  	int val = 0, err; -	if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | -			     (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | -			     (1<<IP_RETOPTS) | (1<<IP_TOS) | -			     (1<<IP_TTL) | (1<<IP_HDRINCL) | -			     (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | -			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | -			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | -			     (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) || -	    optname == IP_MULTICAST_TTL || -	    optname == IP_MULTICAST_ALL || -	    optname == IP_MULTICAST_LOOP || -	    optname == IP_RECVORIGDSTADDR) { +	switch (optname) { +	case IP_PKTINFO: +	case IP_RECVTTL: +	case IP_RECVOPTS: +	case IP_RECVTOS: +	case IP_RETOPTS: +	case IP_TOS: +	case IP_TTL: +	case IP_HDRINCL: +	case IP_MTU_DISCOVER: +	case IP_RECVERR: +	case 
IP_ROUTER_ALERT: +	case IP_FREEBIND: +	case IP_PASSSEC: +	case IP_TRANSPARENT: +	case IP_MINTTL: +	case IP_NODEFRAG: +	case IP_UNICAST_IF: +	case IP_MULTICAST_TTL: +	case IP_MULTICAST_ALL: +	case IP_MULTICAST_LOOP: +	case IP_RECVORIGDSTADDR:  		if (optlen >= sizeof(int)) {  			if (get_user(val, (int __user *) optval))  				return -EFAULT; @@ -497,32 +536,36 @@ static int do_ip_setsockopt(struct sock *sk, int level,  	switch (optname) {  	case IP_OPTIONS:  	{ -		struct ip_options *opt = NULL; +		struct ip_options_rcu *old, *opt = NULL; +  		if (optlen > 40)  			goto e_inval;  		err = ip_options_get_from_user(sock_net(sk), &opt,  					       optval, optlen);  		if (err)  			break; +		old = rcu_dereference_protected(inet->inet_opt, +						sock_owned_by_user(sk));  		if (inet->is_icsk) {  			struct inet_connection_sock *icsk = inet_csk(sk); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  			if (sk->sk_family == PF_INET ||  			    (!((1 << sk->sk_state) &  			       (TCPF_LISTEN | TCPF_CLOSE)) &&  			     inet->inet_daddr != LOOPBACK4_IPV6)) {  #endif -				if (inet->opt) -					icsk->icsk_ext_hdr_len -= inet->opt->optlen; +				if (old) +					icsk->icsk_ext_hdr_len -= old->opt.optlen;  				if (opt) -					icsk->icsk_ext_hdr_len += opt->optlen; +					icsk->icsk_ext_hdr_len += opt->opt.optlen;  				icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  			}  #endif  		} -		opt = xchg(&inet->opt, opt); -		kfree(opt); +		rcu_assign_pointer(inet->inet_opt, opt); +		if (old) +			kfree_rcu(old, rcu);  		break;  	}  	case IP_PKTINFO: @@ -569,8 +612,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,  		break;  	case IP_TOS:	/* This sets both TOS and Precedence */  		if (sk->sk_type == SOCK_STREAM) { -			val &= ~3; -			val |= inet->tos & 3; +			val &= ~INET_ECN_MASK; +			val |= inet->tos & INET_ECN_MASK;  		}  		if (inet->tos != val) {  			inet->tos = val; @@ -581,7 +624,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,  	case IP_TTL:  		if (optlen < 1)  			goto e_inval; -		if (val != -1 && (val < 0 || val > 255)) +		if (val != -1 && (val < 1 || val > 255))  			goto e_inval;  		inet->uc_ttl = val;  		break; @@ -600,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,  		inet->nodefrag = val ? 
1 : 0;  		break;  	case IP_MTU_DISCOVER: -		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) +		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)  			goto e_inval;  		inet->pmtudisc = val;  		break; @@ -625,6 +668,35 @@ static int do_ip_setsockopt(struct sock *sk, int level,  			goto e_inval;  		inet->mc_loop = !!val;  		break; +	case IP_UNICAST_IF: +	{ +		struct net_device *dev = NULL; +		int ifindex; + +		if (optlen != sizeof(int)) +			goto e_inval; + +		ifindex = (__force int)ntohl((__force __be32)val); +		if (ifindex == 0) { +			inet->uc_index = 0; +			err = 0; +			break; +		} + +		dev = dev_get_by_index(sock_net(sk), ifindex); +		err = -EADDRNOTAVAIL; +		if (!dev) +			break; +		dev_put(dev); + +		err = -EINVAL; +		if (sk->sk_bound_dev_if) +			break; + +		inet->uc_index = ifindex; +		err = 0; +		break; +	}  	case IP_MULTICAST_IF:  	{  		struct ip_mreqn mreq; @@ -645,10 +717,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,  				break;  		} else {  			memset(&mreq, 0, sizeof(mreq)); -			if (optlen >= sizeof(struct in_addr) && -			    copy_from_user(&mreq.imr_address, optval, -					   sizeof(struct in_addr))) -				break; +			if (optlen >= sizeof(struct ip_mreq)) { +				if (copy_from_user(&mreq, optval, +						   sizeof(struct ip_mreq))) +					break; +			} else if (optlen >= sizeof(struct in_addr)) { +				if (copy_from_user(&mreq.imr_address, optval, +						   sizeof(struct in_addr))) +					break; +			}  		}  		if (!mreq.imr_ifindex) { @@ -946,13 +1023,14 @@ mc_msf_out:  	case IP_IPSEC_POLICY:  	case IP_XFRM_POLICY:  		err = -EPERM; -		if (!capable(CAP_NET_ADMIN)) +		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  			break;  		err = xfrm_user_policy(sk, optname, optval, optlen);  		break;  	case IP_TRANSPARENT: -		if (!capable(CAP_NET_ADMIN)) { +		if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && +		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {  			err = -EPERM;  			break;  		} @@ -982,20 +1060,29 @@ e_inval:  }  /** - * ip_queue_rcv_skb - Queue an skb into sock receive queue + * ipv4_pktinfo_prepare - transfert some info from rtable to skb   * @sk: socket   * @skb: buffer   * - * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option - * is not set, we drop skb dst entry now, while dst cache line is hot. + * To support IP_CMSG_PKTINFO option, we store rt_iif and specific + * destination in skb->cb[] before dst drop. + * This way, receiver doesn't make cache line misses to read rtable.   
*/ -int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)  { -	if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) -		skb_dst_drop(skb); -	return sock_queue_rcv_skb(sk, skb); +	struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); +	bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || +		       ipv6_sk_rxinfo(sk); + +	if (prepare && skb_rtable(skb)) { +		pktinfo->ipi_ifindex = inet_iif(skb); +		pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); +	} else { +		pktinfo->ipi_ifindex = 0; +		pktinfo->ipi_spec_dst.s_addr = 0; +	} +	skb_dst_drop(skb);  } -EXPORT_SYMBOL(ip_queue_rcv_skb);  int ip_setsockopt(struct sock *sk, int level,  		int optname, char __user *optval, unsigned int optlen) @@ -1058,7 +1145,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt);   */  static int do_ip_getsockopt(struct sock *sk, int level, int optname, -			    char __user *optval, int __user *optlen) +			    char __user *optval, int __user *optlen, unsigned int flags)  {  	struct inet_sock *inet = inet_sk(sk);  	int val; @@ -1081,12 +1168,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,  	case IP_OPTIONS:  	{  		unsigned char optbuf[sizeof(struct ip_options)+40]; -		struct ip_options * opt = (struct ip_options *)optbuf; +		struct ip_options *opt = (struct ip_options *)optbuf; +		struct ip_options_rcu *inet_opt; + +		inet_opt = rcu_dereference_protected(inet->inet_opt, +						     sock_owned_by_user(sk));  		opt->optlen = 0; -		if (inet->opt) -			memcpy(optbuf, inet->opt, -			       sizeof(struct ip_options)+ -			       inet->opt->optlen); +		if (inet_opt) +			memcpy(optbuf, &inet_opt->opt, +			       sizeof(struct ip_options) + +			       inet_opt->opt.optlen);  		release_sock(sk);  		if (opt->optlen == 0) @@ -1163,6 +1254,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,  	case IP_MULTICAST_LOOP:  		val = inet->mc_loop;  		break; +	case IP_UNICAST_IF: +		val = (__force int)htonl((__u32) inet->uc_index); +		break;  	case IP_MULTICAST_IF:  	{  		struct in_addr addr; @@ -1227,7 +1321,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,  		msg.msg_control = optval;  		msg.msg_controllen = len; -		msg.msg_flags = 0; +		msg.msg_flags = flags;  		if (inet->cmsg_flags & IP_CMSG_PKTINFO) {  			struct in_pktinfo info; @@ -1241,6 +1335,10 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,  			int hlim = inet->mc_ttl;  			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);  		} +		if (inet->cmsg_flags & IP_CMSG_TOS) { +			int tos = inet->rcv_tos; +			put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); +		}  		len -= msg.msg_controllen;  		return put_user(len, optlen);  	} @@ -1281,7 +1379,7 @@ int ip_getsockopt(struct sock *sk, int level,  {  	int err; -	err = do_ip_getsockopt(sk, level, optname, optval, optlen); +	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);  #ifdef CONFIG_NETFILTER  	/* we need to exclude all possible ENOPROTOOPTs except default case */  	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && @@ -1314,7 +1412,8 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,  		return compat_mc_getsockopt(sk, level, optname, optval, optlen,  			ip_getsockopt); -	err = do_ip_getsockopt(sk, level, optname, optval, optlen); +	err = do_ip_getsockopt(sk, level, optname, optval, optlen, +		MSG_CMSG_COMPAT);  #ifdef CONFIG_NETFILTER  	/* we need to exclude all possible ENOPROTOOPTs except default case */ diff --git 
a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c new file mode 100644 index 00000000000..6f9de61dce5 --- /dev/null +++ b/net/ipv4/ip_tunnel.c @@ -0,0 +1,1062 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/rculist.h> +#include <linux/err.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ip_tunnels.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> + +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#endif + +static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) +{ +	return hash_32((__force u32)key ^ (__force u32)remote, +			 IP_TNL_HASH_BITS); +} + +static void __tunnel_dst_set(struct ip_tunnel_dst *idst, +			     struct dst_entry *dst) +{ +	struct dst_entry *old_dst; + +	dst_clone(dst); +	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); +	dst_release(old_dst); +} + +static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +{ +	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); +} + +static void tunnel_dst_reset(struct ip_tunnel *t) +{ +	tunnel_dst_set(t, NULL); +} + +void ip_tunnel_dst_reset_all(struct ip_tunnel *t) +{ +	int i; + +	for_each_possible_cpu(i) +		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); +} +EXPORT_SYMBOL(ip_tunnel_dst_reset_all); + +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) +{ +	struct dst_entry *dst; + +	rcu_read_lock(); +	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); +	if (dst && !atomic_inc_not_zero(&dst->__refcnt)) +		dst = NULL; +	if (dst) { +		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { +			tunnel_dst_reset(t); +			dst_release(dst); +			dst = NULL; +		} +	} +	rcu_read_unlock(); +	return (struct rtable *)dst; +} + +static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, +				__be16 flags, __be32 key) +{ +	if (p->i_flags & TUNNEL_KEY) { +		if (flags & TUNNEL_KEY) +			return key == p->i_key; +		else +			/* key expected, none present */ +			return false; +	} else +		return !(flags & 
TUNNEL_KEY); +} + +/* Fallback tunnel: no source, no destination, no key, no options + +   Tunnel hash table: +   We require exact key match i.e. if a key is present in packet +   it will match only tunnel with the same key; if it is not present, +   it will match only keyless tunnel. + +   All keysless packets, if not matched configured keyless tunnels +   will match fallback tunnel. +   Given src, dst and key, find appropriate for input tunnel. +*/ +struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, +				   int link, __be16 flags, +				   __be32 remote, __be32 local, +				   __be32 key) +{ +	unsigned int hash; +	struct ip_tunnel *t, *cand = NULL; +	struct hlist_head *head; + +	hash = ip_tunnel_hash(key, remote); +	head = &itn->tunnels[hash]; + +	hlist_for_each_entry_rcu(t, head, hash_node) { +		if (local != t->parms.iph.saddr || +		    remote != t->parms.iph.daddr || +		    !(t->dev->flags & IFF_UP)) +			continue; + +		if (!ip_tunnel_key_match(&t->parms, flags, key)) +			continue; + +		if (t->parms.link == link) +			return t; +		else +			cand = t; +	} + +	hlist_for_each_entry_rcu(t, head, hash_node) { +		if (remote != t->parms.iph.daddr || +		    t->parms.iph.saddr != 0 || +		    !(t->dev->flags & IFF_UP)) +			continue; + +		if (!ip_tunnel_key_match(&t->parms, flags, key)) +			continue; + +		if (t->parms.link == link) +			return t; +		else if (!cand) +			cand = t; +	} + +	hash = ip_tunnel_hash(key, 0); +	head = &itn->tunnels[hash]; + +	hlist_for_each_entry_rcu(t, head, hash_node) { +		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && +		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) +			continue; + +		if (!(t->dev->flags & IFF_UP)) +			continue; + +		if (!ip_tunnel_key_match(&t->parms, flags, key)) +			continue; + +		if (t->parms.link == link) +			return t; +		else if (!cand) +			cand = t; +	} + +	if (flags & TUNNEL_NO_KEY) +		goto skip_key_lookup; + +	hlist_for_each_entry_rcu(t, head, hash_node) { +		if (t->parms.i_key != key || +		    t->parms.iph.saddr != 0 || +		    t->parms.iph.daddr != 0 || +		    !(t->dev->flags & IFF_UP)) +			continue; + +		if (t->parms.link == link) +			return t; +		else if (!cand) +			cand = t; +	} + +skip_key_lookup: +	if (cand) +		return cand; + +	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) +		return netdev_priv(itn->fb_tunnel_dev); + + +	return NULL; +} +EXPORT_SYMBOL_GPL(ip_tunnel_lookup); + +static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, +				    struct ip_tunnel_parm *parms) +{ +	unsigned int h; +	__be32 remote; +	__be32 i_key = parms->i_key; + +	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) +		remote = parms->iph.daddr; +	else +		remote = 0; + +	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) +		i_key = 0; + +	h = ip_tunnel_hash(i_key, remote); +	return &itn->tunnels[h]; +} + +static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) +{ +	struct hlist_head *head = ip_bucket(itn, &t->parms); + +	hlist_add_head_rcu(&t->hash_node, head); +} + +static void ip_tunnel_del(struct ip_tunnel *t) +{ +	hlist_del_init_rcu(&t->hash_node); +} + +static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, +					struct ip_tunnel_parm *parms, +					int type) +{ +	__be32 remote = parms->iph.daddr; +	__be32 local = parms->iph.saddr; +	__be32 key = parms->i_key; +	__be16 flags = parms->i_flags; +	int link = parms->link; +	struct ip_tunnel *t = NULL; +	struct hlist_head *head = ip_bucket(itn, parms); + +	hlist_for_each_entry_rcu(t, head, 
hash_node) { +		if (local == t->parms.iph.saddr && +		    remote == t->parms.iph.daddr && +		    link == t->parms.link && +		    type == t->dev->type && +		    ip_tunnel_key_match(&t->parms, flags, key)) +			break; +	} +	return t; +} + +static struct net_device *__ip_tunnel_create(struct net *net, +					     const struct rtnl_link_ops *ops, +					     struct ip_tunnel_parm *parms) +{ +	int err; +	struct ip_tunnel *tunnel; +	struct net_device *dev; +	char name[IFNAMSIZ]; + +	if (parms->name[0]) +		strlcpy(name, parms->name, IFNAMSIZ); +	else { +		if (strlen(ops->kind) > (IFNAMSIZ - 3)) { +			err = -E2BIG; +			goto failed; +		} +		strlcpy(name, ops->kind, IFNAMSIZ); +		strncat(name, "%d", 2); +	} + +	ASSERT_RTNL(); +	dev = alloc_netdev(ops->priv_size, name, ops->setup); +	if (!dev) { +		err = -ENOMEM; +		goto failed; +	} +	dev_net_set(dev, net); + +	dev->rtnl_link_ops = ops; + +	tunnel = netdev_priv(dev); +	tunnel->parms = *parms; +	tunnel->net = net; + +	err = register_netdevice(dev); +	if (err) +		goto failed_free; + +	return dev; + +failed_free: +	free_netdev(dev); +failed: +	return ERR_PTR(err); +} + +static inline void init_tunnel_flow(struct flowi4 *fl4, +				    int proto, +				    __be32 daddr, __be32 saddr, +				    __be32 key, __u8 tos, int oif) +{ +	memset(fl4, 0, sizeof(*fl4)); +	fl4->flowi4_oif = oif; +	fl4->daddr = daddr; +	fl4->saddr = saddr; +	fl4->flowi4_tos = tos; +	fl4->flowi4_proto = proto; +	fl4->fl4_gre_key = key; +} + +static int ip_tunnel_bind_dev(struct net_device *dev) +{ +	struct net_device *tdev = NULL; +	struct ip_tunnel *tunnel = netdev_priv(dev); +	const struct iphdr *iph; +	int hlen = LL_MAX_HEADER; +	int mtu = ETH_DATA_LEN; +	int t_hlen = tunnel->hlen + sizeof(struct iphdr); + +	iph = &tunnel->parms.iph; + +	/* Guess output device to choose reasonable mtu and needed_headroom */ +	if (iph->daddr) { +		struct flowi4 fl4; +		struct rtable *rt; + +		init_tunnel_flow(&fl4, iph->protocol, iph->daddr, +				 iph->saddr, tunnel->parms.o_key, +				 RT_TOS(iph->tos), tunnel->parms.link); +		rt = ip_route_output_key(tunnel->net, &fl4); + +		if (!IS_ERR(rt)) { +			tdev = rt->dst.dev; +			tunnel_dst_set(tunnel, &rt->dst); +			ip_rt_put(rt); +		} +		if (dev->type != ARPHRD_ETHER) +			dev->flags |= IFF_POINTOPOINT; +	} + +	if (!tdev && tunnel->parms.link) +		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); + +	if (tdev) { +		hlen = tdev->hard_header_len + tdev->needed_headroom; +		mtu = tdev->mtu; +	} +	dev->iflink = tunnel->parms.link; + +	dev->needed_headroom = t_hlen + hlen; +	mtu -= (dev->hard_header_len + t_hlen); + +	if (mtu < 68) +		mtu = 68; + +	return mtu; +} + +static struct ip_tunnel *ip_tunnel_create(struct net *net, +					  struct ip_tunnel_net *itn, +					  struct ip_tunnel_parm *parms) +{ +	struct ip_tunnel *nt; +	struct net_device *dev; + +	BUG_ON(!itn->fb_tunnel_dev); +	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); +	if (IS_ERR(dev)) +		return ERR_CAST(dev); + +	dev->mtu = ip_tunnel_bind_dev(dev); + +	nt = netdev_priv(dev); +	ip_tunnel_add(itn, nt); +	return nt; +} + +int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, +		  const struct tnl_ptk_info *tpi, bool log_ecn_error) +{ +	struct pcpu_sw_netstats *tstats; +	const struct iphdr *iph = ip_hdr(skb); +	int err; + +#ifdef CONFIG_NET_IPGRE_BROADCAST +	if (ipv4_is_multicast(iph->daddr)) { +		tunnel->dev->stats.multicast++; +		skb->pkt_type = PACKET_BROADCAST; +	} +#endif + +	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) || +	     
((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { +		tunnel->dev->stats.rx_crc_errors++; +		tunnel->dev->stats.rx_errors++; +		goto drop; +	} + +	if (tunnel->parms.i_flags&TUNNEL_SEQ) { +		if (!(tpi->flags&TUNNEL_SEQ) || +		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { +			tunnel->dev->stats.rx_fifo_errors++; +			tunnel->dev->stats.rx_errors++; +			goto drop; +		} +		tunnel->i_seqno = ntohl(tpi->seq) + 1; +	} + +	skb_reset_network_header(skb); + +	err = IP_ECN_decapsulate(iph, skb); +	if (unlikely(err)) { +		if (log_ecn_error) +			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", +					&iph->saddr, iph->tos); +		if (err > 1) { +			++tunnel->dev->stats.rx_frame_errors; +			++tunnel->dev->stats.rx_errors; +			goto drop; +		} +	} + +	tstats = this_cpu_ptr(tunnel->dev->tstats); +	u64_stats_update_begin(&tstats->syncp); +	tstats->rx_packets++; +	tstats->rx_bytes += skb->len; +	u64_stats_update_end(&tstats->syncp); + +	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); + +	if (tunnel->dev->type == ARPHRD_ETHER) { +		skb->protocol = eth_type_trans(skb, tunnel->dev); +		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); +	} else { +		skb->dev = tunnel->dev; +	} + +	gro_cells_receive(&tunnel->gro_cells, skb); +	return 0; + +drop: +	kfree_skb(skb); +	return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_rcv); + +static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, +			    struct rtable *rt, __be16 df) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; +	int mtu; + +	if (df) +		mtu = dst_mtu(&rt->dst) - dev->hard_header_len +					- sizeof(struct iphdr) - tunnel->hlen; +	else +		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; + +	if (skb_dst(skb)) +		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + +	if (skb->protocol == htons(ETH_P_IP)) { +		if (!skb_is_gso(skb) && +		    (df & htons(IP_DF)) && mtu < pkt_size) { +			memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +			return -E2BIG; +		} +	} +#if IS_ENABLED(CONFIG_IPV6) +	else if (skb->protocol == htons(ETH_P_IPV6)) { +		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + +		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && +			   mtu >= IPV6_MIN_MTU) { +			if ((tunnel->parms.iph.daddr && +			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) || +			    rt6->rt6i_dst.plen == 128) { +				rt6->rt6i_flags |= RTF_MODIFIED; +				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); +			} +		} + +		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && +					mtu < pkt_size) { +			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); +			return -E2BIG; +		} +	} +#endif +	return 0; +} + +void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, +		    const struct iphdr *tnl_params, const u8 protocol) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	const struct iphdr *inner_iph; +	struct flowi4 fl4; +	u8     tos, ttl; +	__be16 df; +	struct rtable *rt;		/* Route to the other host */ +	unsigned int max_headroom;	/* The extra header space needed */ +	__be32 dst; +	int err; +	bool connected; + +	inner_iph = (const struct iphdr *)skb_inner_network_header(skb); +	connected = (tunnel->parms.iph.daddr != 0); + +	dst = tnl_params->daddr; +	if (dst == 0) { +		/* NBMA tunnel */ + +		if (skb_dst(skb) == NULL) { +			dev->stats.tx_fifo_errors++; +			goto tx_error; +		} + +		if (skb->protocol == htons(ETH_P_IP)) { +			rt = skb_rtable(skb); +			dst = rt_nexthop(rt, 
inner_iph->daddr); +		} +#if IS_ENABLED(CONFIG_IPV6) +		else if (skb->protocol == htons(ETH_P_IPV6)) { +			const struct in6_addr *addr6; +			struct neighbour *neigh; +			bool do_tx_error_icmp; +			int addr_type; + +			neigh = dst_neigh_lookup(skb_dst(skb), +						 &ipv6_hdr(skb)->daddr); +			if (neigh == NULL) +				goto tx_error; + +			addr6 = (const struct in6_addr *)&neigh->primary_key; +			addr_type = ipv6_addr_type(addr6); + +			if (addr_type == IPV6_ADDR_ANY) { +				addr6 = &ipv6_hdr(skb)->daddr; +				addr_type = ipv6_addr_type(addr6); +			} + +			if ((addr_type & IPV6_ADDR_COMPATv4) == 0) +				do_tx_error_icmp = true; +			else { +				do_tx_error_icmp = false; +				dst = addr6->s6_addr32[3]; +			} +			neigh_release(neigh); +			if (do_tx_error_icmp) +				goto tx_error_icmp; +		} +#endif +		else +			goto tx_error; + +		connected = false; +	} + +	tos = tnl_params->tos; +	if (tos & 0x1) { +		tos &= ~0x1; +		if (skb->protocol == htons(ETH_P_IP)) { +			tos = inner_iph->tos; +			connected = false; +		} else if (skb->protocol == htons(ETH_P_IPV6)) { +			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); +			connected = false; +		} +	} + +	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, +			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + +	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL; + +	if (!rt) { +		rt = ip_route_output_key(tunnel->net, &fl4); + +		if (IS_ERR(rt)) { +			dev->stats.tx_carrier_errors++; +			goto tx_error; +		} +		if (connected) +			tunnel_dst_set(tunnel, &rt->dst); +	} + +	if (rt->dst.dev == dev) { +		ip_rt_put(rt); +		dev->stats.collisions++; +		goto tx_error; +	} + +	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) { +		ip_rt_put(rt); +		goto tx_error; +	} + +	if (tunnel->err_count > 0) { +		if (time_before(jiffies, +				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { +			tunnel->err_count--; + +			memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +			dst_link_failure(skb); +		} else +			tunnel->err_count = 0; +	} + +	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); +	ttl = tnl_params->ttl; +	if (ttl == 0) { +		if (skb->protocol == htons(ETH_P_IP)) +			ttl = inner_iph->ttl; +#if IS_ENABLED(CONFIG_IPV6) +		else if (skb->protocol == htons(ETH_P_IPV6)) +			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; +#endif +		else +			ttl = ip4_dst_hoplimit(&rt->dst); +	} + +	df = tnl_params->frag_off; +	if (skb->protocol == htons(ETH_P_IP)) +		df |= (inner_iph->frag_off&htons(IP_DF)); + +	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) +			+ rt->dst.header_len; +	if (max_headroom > dev->needed_headroom) +		dev->needed_headroom = max_headroom; + +	if (skb_cow_head(skb, dev->needed_headroom)) { +		ip_rt_put(rt); +		dev->stats.tx_dropped++; +		kfree_skb(skb); +		return; +	} + +	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, +			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); +	iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + +	return; + +#if IS_ENABLED(CONFIG_IPV6) +tx_error_icmp: +	dst_link_failure(skb); +#endif +tx_error: +	dev->stats.tx_errors++; +	kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(ip_tunnel_xmit); + +static void ip_tunnel_update(struct ip_tunnel_net *itn, +			     struct ip_tunnel *t, +			     struct net_device *dev, +			     struct ip_tunnel_parm *p, +			     bool set_mtu) +{ +	ip_tunnel_del(t); +	t->parms.iph.saddr = p->iph.saddr; +	t->parms.iph.daddr = p->iph.daddr; +	t->parms.i_key = p->i_key; +	t->parms.o_key = p->o_key; +	if (dev->type != ARPHRD_ETHER) { +		memcpy(dev->dev_addr, 
&p->iph.saddr, 4); +		memcpy(dev->broadcast, &p->iph.daddr, 4); +	} +	ip_tunnel_add(itn, t); + +	t->parms.iph.ttl = p->iph.ttl; +	t->parms.iph.tos = p->iph.tos; +	t->parms.iph.frag_off = p->iph.frag_off; + +	if (t->parms.link != p->link) { +		int mtu; + +		t->parms.link = p->link; +		mtu = ip_tunnel_bind_dev(dev); +		if (set_mtu) +			dev->mtu = mtu; +	} +	ip_tunnel_dst_reset_all(t); +	netdev_state_change(dev); +} + +int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +{ +	int err = 0; +	struct ip_tunnel *t = netdev_priv(dev); +	struct net *net = t->net; +	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); + +	BUG_ON(!itn->fb_tunnel_dev); +	switch (cmd) { +	case SIOCGETTUNNEL: +		if (dev == itn->fb_tunnel_dev) { +			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); +			if (t == NULL) +				t = netdev_priv(dev); +		} +		memcpy(p, &t->parms, sizeof(*p)); +		break; + +	case SIOCADDTUNNEL: +	case SIOCCHGTUNNEL: +		err = -EPERM; +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +			goto done; +		if (p->iph.ttl) +			p->iph.frag_off |= htons(IP_DF); +		if (!(p->i_flags & VTI_ISVTI)) { +			if (!(p->i_flags & TUNNEL_KEY)) +				p->i_key = 0; +			if (!(p->o_flags & TUNNEL_KEY)) +				p->o_key = 0; +		} + +		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + +		if (!t && (cmd == SIOCADDTUNNEL)) { +			t = ip_tunnel_create(net, itn, p); +			err = PTR_ERR_OR_ZERO(t); +			break; +		} +		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { +			if (t != NULL) { +				if (t->dev != dev) { +					err = -EEXIST; +					break; +				} +			} else { +				unsigned int nflags = 0; + +				if (ipv4_is_multicast(p->iph.daddr)) +					nflags = IFF_BROADCAST; +				else if (p->iph.daddr) +					nflags = IFF_POINTOPOINT; + +				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { +					err = -EINVAL; +					break; +				} + +				t = netdev_priv(dev); +			} +		} + +		if (t) { +			err = 0; +			ip_tunnel_update(itn, t, dev, p, true); +		} else { +			err = -ENOENT; +		} +		break; + +	case SIOCDELTUNNEL: +		err = -EPERM; +		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) +			goto done; + +		if (dev == itn->fb_tunnel_dev) { +			err = -ENOENT; +			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); +			if (t == NULL) +				goto done; +			err = -EPERM; +			if (t == netdev_priv(itn->fb_tunnel_dev)) +				goto done; +			dev = t->dev; +		} +		unregister_netdevice(dev); +		err = 0; +		break; + +	default: +		err = -EINVAL; +	} + +done: +	return err; +} +EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); + +int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	int t_hlen = tunnel->hlen + sizeof(struct iphdr); + +	if (new_mtu < 68 || +	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) +		return -EINVAL; +	dev->mtu = new_mtu; +	return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); + +static void ip_tunnel_dev_free(struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); + +	gro_cells_destroy(&tunnel->gro_cells); +	free_percpu(tunnel->dst_cache); +	free_percpu(dev->tstats); +	free_netdev(dev); +} + +void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct ip_tunnel_net *itn; + +	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); + +	if (itn->fb_tunnel_dev != dev) { +		ip_tunnel_del(netdev_priv(dev)); +		unregister_netdevice_queue(dev, head); +	} +} +EXPORT_SYMBOL_GPL(ip_tunnel_dellink); + +int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, +				 
 struct rtnl_link_ops *ops, char *devname) +{ +	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); +	struct ip_tunnel_parm parms; +	unsigned int i; + +	for (i = 0; i < IP_TNL_HASH_SIZE; i++) +		INIT_HLIST_HEAD(&itn->tunnels[i]); + +	if (!ops) { +		itn->fb_tunnel_dev = NULL; +		return 0; +	} + +	memset(&parms, 0, sizeof(parms)); +	if (devname) +		strlcpy(parms.name, devname, IFNAMSIZ); + +	rtnl_lock(); +	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); +	/* FB netdevice is special: we have one, and only one per netns. +	 * Allowing to move it to another netns is clearly unsafe. +	 */ +	if (!IS_ERR(itn->fb_tunnel_dev)) { +		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; +		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); +		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); +	} +	rtnl_unlock(); + +	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); +} +EXPORT_SYMBOL_GPL(ip_tunnel_init_net); + +static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, +			      struct rtnl_link_ops *ops) +{ +	struct net *net = dev_net(itn->fb_tunnel_dev); +	struct net_device *dev, *aux; +	int h; + +	for_each_netdev_safe(net, dev, aux) +		if (dev->rtnl_link_ops == ops) +			unregister_netdevice_queue(dev, head); + +	for (h = 0; h < IP_TNL_HASH_SIZE; h++) { +		struct ip_tunnel *t; +		struct hlist_node *n; +		struct hlist_head *thead = &itn->tunnels[h]; + +		hlist_for_each_entry_safe(t, n, thead, hash_node) +			/* If dev is in the same netns, it has already +			 * been added to the list by the previous loop. +			 */ +			if (!net_eq(dev_net(t->dev), net)) +				unregister_netdevice_queue(t->dev, head); +	} +} + +void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +{ +	LIST_HEAD(list); + +	rtnl_lock(); +	ip_tunnel_destroy(itn, &list, ops); +	unregister_netdevice_many(&list); +	rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); + +int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], +		      struct ip_tunnel_parm *p) +{ +	struct ip_tunnel *nt; +	struct net *net = dev_net(dev); +	struct ip_tunnel_net *itn; +	int mtu; +	int err; + +	nt = netdev_priv(dev); +	itn = net_generic(net, nt->ip_tnl_net_id); + +	if (ip_tunnel_find(itn, p, dev->type)) +		return -EEXIST; + +	nt->net = net; +	nt->parms = *p; +	err = register_netdevice(dev); +	if (err) +		goto out; + +	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) +		eth_hw_addr_random(dev); + +	mtu = ip_tunnel_bind_dev(dev); +	if (!tb[IFLA_MTU]) +		dev->mtu = mtu; + +	ip_tunnel_add(itn, nt); + +out: +	return err; +} +EXPORT_SYMBOL_GPL(ip_tunnel_newlink); + +int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], +			 struct ip_tunnel_parm *p) +{ +	struct ip_tunnel *t; +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct net *net = tunnel->net; +	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); + +	if (dev == itn->fb_tunnel_dev) +		return -EINVAL; + +	t = ip_tunnel_find(itn, p, dev->type); + +	if (t) { +		if (t->dev != dev) +			return -EEXIST; +	} else { +		t = tunnel; + +		if (dev->type != ARPHRD_ETHER) { +			unsigned int nflags = 0; + +			if (ipv4_is_multicast(p->iph.daddr)) +				nflags = IFF_BROADCAST; +			else if (p->iph.daddr) +				nflags = IFF_POINTOPOINT; + +			if ((dev->flags ^ nflags) & +			    (IFF_POINTOPOINT | IFF_BROADCAST)) +				return -EINVAL; +		} +	} + +	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]); +	return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_changelink); + +int ip_tunnel_init(struct net_device *dev) +{ +	
struct ip_tunnel *tunnel = netdev_priv(dev); +	struct iphdr *iph = &tunnel->parms.iph; +	int err; + +	dev->destructor	= ip_tunnel_dev_free; +	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); +	if (!dev->tstats) +		return -ENOMEM; + +	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); +	if (!tunnel->dst_cache) { +		free_percpu(dev->tstats); +		return -ENOMEM; +	} + +	err = gro_cells_init(&tunnel->gro_cells, dev); +	if (err) { +		free_percpu(tunnel->dst_cache); +		free_percpu(dev->tstats); +		return err; +	} + +	tunnel->dev = dev; +	tunnel->net = dev_net(dev); +	strcpy(tunnel->parms.name, dev->name); +	iph->version		= 4; +	iph->ihl		= 5; + +	return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_init); + +void ip_tunnel_uninit(struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct net *net = tunnel->net; +	struct ip_tunnel_net *itn; + +	itn = net_generic(net, tunnel->ip_tnl_net_id); +	/* fb_tunnel_dev will be unregisted in net-exit call. */ +	if (itn->fb_tunnel_dev != dev) +		ip_tunnel_del(netdev_priv(dev)); + +	ip_tunnel_dst_reset_all(tunnel); +} +EXPORT_SYMBOL_GPL(ip_tunnel_uninit); + +/* Do least required initialization, rest of init is done in tunnel_init call */ +void ip_tunnel_setup(struct net_device *dev, int net_id) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	tunnel->ip_tnl_net_id = net_id; +} +EXPORT_SYMBOL_GPL(ip_tunnel_setup); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c new file mode 100644 index 00000000000..f4c987bb7e9 --- /dev/null +++ b/net/ipv4/ip_tunnel_core.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> + +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/ip_tunnels.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> + +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, +		  __be32 src, __be32 dst, __u8 proto, +		  __u8 tos, __u8 ttl, __be16 df, bool xnet) +{ +	int pkt_len = skb->len; +	struct iphdr *iph; +	int err; + +	skb_scrub_packet(skb, xnet); + +	skb_clear_hash(skb); +	skb_dst_set(skb, &rt->dst); +	memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + +	/* Push down and install the IP header. 
*/ +	skb_push(skb, sizeof(struct iphdr)); +	skb_reset_network_header(skb); + +	iph = ip_hdr(skb); + +	iph->version	=	4; +	iph->ihl	=	sizeof(struct iphdr) >> 2; +	iph->frag_off	=	df; +	iph->protocol	=	proto; +	iph->tos	=	tos; +	iph->daddr	=	dst; +	iph->saddr	=	src; +	iph->ttl	=	ttl; +	__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); + +	err = ip_local_out_sk(sk, skb); +	if (unlikely(net_xmit_eval(err))) +		pkt_len = 0; +	return pkt_len; +} +EXPORT_SYMBOL_GPL(iptunnel_xmit); + +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +{ +	if (unlikely(!pskb_may_pull(skb, hdr_len))) +		return -ENOMEM; + +	skb_pull_rcsum(skb, hdr_len); + +	if (inner_proto == htons(ETH_P_TEB)) { +		struct ethhdr *eh = (struct ethhdr *)skb->data; + +		if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) +			return -ENOMEM; + +		if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN)) +			skb->protocol = eh->h_proto; +		else +			skb->protocol = htons(ETH_P_802_2); + +	} else { +		skb->protocol = inner_proto; +	} + +	nf_reset(skb); +	secpath_reset(skb); +	skb_clear_hash_if_not_l4(skb); +	skb_dst_drop(skb); +	skb->vlan_tci = 0; +	skb_set_queue_mapping(skb, 0); +	skb->pkt_type = PACKET_HOST; +	return 0; +} +EXPORT_SYMBOL_GPL(iptunnel_pull_header); + +struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, +					 bool csum_help, +					 int gso_type_mask) +{ +	int err; + +	if (likely(!skb->encapsulation)) { +		skb_reset_inner_headers(skb); +		skb->encapsulation = 1; +	} + +	if (skb_is_gso(skb)) { +		err = skb_unclone(skb, GFP_ATOMIC); +		if (unlikely(err)) +			goto error; +		skb_shinfo(skb)->gso_type |= gso_type_mask; +		return skb; +	} + +	/* If packet is not gso and we are resolving any partial checksum, +	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL +	 * on the outer header without confusing devices that implement +	 * NETIF_F_IP_CSUM with encapsulation. 
+	 */ +	if (csum_help) +		skb->encapsulation = 0; + +	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { +		err = skb_checksum_help(skb); +		if (unlikely(err)) +			goto error; +	} else if (skb->ip_summed != CHECKSUM_PARTIAL) +		skb->ip_summed = CHECKSUM_NONE; + +	return skb; +error: +	kfree_skb(skb); +	return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); + +/* Often modified stats are per cpu, other are shared (netdev->stats) */ +struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, +						struct rtnl_link_stats64 *tot) +{ +	int i; + +	for_each_possible_cpu(i) { +		const struct pcpu_sw_netstats *tstats = +						   per_cpu_ptr(dev->tstats, i); +		u64 rx_packets, rx_bytes, tx_packets, tx_bytes; +		unsigned int start; + +		do { +			start = u64_stats_fetch_begin_irq(&tstats->syncp); +			rx_packets = tstats->rx_packets; +			tx_packets = tstats->tx_packets; +			rx_bytes = tstats->rx_bytes; +			tx_bytes = tstats->tx_bytes; +		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + +		tot->rx_packets += rx_packets; +		tot->tx_packets += tx_packets; +		tot->rx_bytes   += rx_bytes; +		tot->tx_bytes   += tx_bytes; +	} + +	tot->multicast = dev->stats.multicast; + +	tot->rx_crc_errors = dev->stats.rx_crc_errors; +	tot->rx_fifo_errors = dev->stats.rx_fifo_errors; +	tot->rx_length_errors = dev->stats.rx_length_errors; +	tot->rx_frame_errors = dev->stats.rx_frame_errors; +	tot->rx_errors = dev->stats.rx_errors; + +	tot->tx_fifo_errors = dev->stats.tx_fifo_errors; +	tot->tx_carrier_errors = dev->stats.tx_carrier_errors; +	tot->tx_dropped = dev->stats.tx_dropped; +	tot->tx_aborted_errors = dev->stats.tx_aborted_errors; +	tot->tx_errors = dev->stats.tx_errors; + +	tot->collisions  = dev->stats.collisions; + +	return tot; +} +EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c new file mode 100644 index 00000000000..b8960f3527f --- /dev/null +++ b/net/ipv4/ip_vti.c @@ -0,0 +1,603 @@ +/* + *	Linux NET3: IP/IP protocol decoder modified to support + *		    virtual tunnel interface + * + *	Authors: + *		Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
+ * + */ + +/* +   This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c + +   For comments look at net/ipv4/ip_gre.c --ANK + */ + + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/netfilter_ipv4.h> +#include <linux/if_ether.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/ip_tunnels.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +static struct rtnl_link_ops vti_link_ops __read_mostly; + +static int vti_net_id __read_mostly; +static int vti_tunnel_init(struct net_device *dev); + +static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, +		     int encap_type) +{ +	struct ip_tunnel *tunnel; +	const struct iphdr *iph = ip_hdr(skb); +	struct net *net = dev_net(skb->dev); +	struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + +	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, +				  iph->saddr, iph->daddr, 0); +	if (tunnel != NULL) { +		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) +			goto drop; + +		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; +		skb->mark = be32_to_cpu(tunnel->parms.i_key); + +		return xfrm_input(skb, nexthdr, spi, encap_type); +	} + +	return -EINVAL; +drop: +	kfree_skb(skb); +	return 0; +} + +static int vti_rcv(struct sk_buff *skb) +{ +	XFRM_SPI_SKB_CB(skb)->family = AF_INET; +	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + +	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); +} + +static int vti_rcv_cb(struct sk_buff *skb, int err) +{ +	unsigned short family; +	struct net_device *dev; +	struct pcpu_sw_netstats *tstats; +	struct xfrm_state *x; +	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; + +	if (!tunnel) +		return 1; + +	dev = tunnel->dev; + +	if (err) { +		dev->stats.rx_errors++; +		dev->stats.rx_dropped++; + +		return 0; +	} + +	x = xfrm_input_state(skb); +	family = x->inner_mode->afinfo->family; + +	if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) +		return -EPERM; + +	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); +	skb->dev = dev; + +	tstats = this_cpu_ptr(dev->tstats); + +	u64_stats_update_begin(&tstats->syncp); +	tstats->rx_packets++; +	tstats->rx_bytes += skb->len; +	u64_stats_update_end(&tstats->syncp); + +	return 0; +} + +static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) +{ +	xfrm_address_t *daddr = (xfrm_address_t *)&dst; +	xfrm_address_t *saddr = (xfrm_address_t *)&src; + +	/* if there is no transform then this tunnel is not functional. +	 * Or if the xfrm is not mode tunnel. 
+	 */ +	if (!x || x->props.mode != XFRM_MODE_TUNNEL || +	    x->props.family != AF_INET) +		return false; + +	if (!dst) +		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); + +	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) +		return false; + +	return true; +} + +static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, +			    struct flowi *fl) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct ip_tunnel_parm *parms = &tunnel->parms; +	struct dst_entry *dst = skb_dst(skb); +	struct net_device *tdev;	/* Device to other host */ +	int err; + +	if (!dst) { +		dev->stats.tx_carrier_errors++; +		goto tx_error_icmp; +	} + +	dst_hold(dst); +	dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0); +	if (IS_ERR(dst)) { +		dev->stats.tx_carrier_errors++; +		goto tx_error_icmp; +	} + +	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { +		dev->stats.tx_carrier_errors++; +		dst_release(dst); +		goto tx_error_icmp; +	} + +	tdev = dst->dev; + +	if (tdev == dev) { +		dst_release(dst); +		dev->stats.collisions++; +		goto tx_error; +	} + +	if (tunnel->err_count > 0) { +		if (time_before(jiffies, +				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { +			tunnel->err_count--; +			dst_link_failure(skb); +		} else +			tunnel->err_count = 0; +	} + +	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); +	skb_dst_set(skb, dst); +	skb->dev = skb_dst(skb)->dev; + +	err = dst_output(skb); +	if (net_xmit_eval(err) == 0) +		err = skb->len; +	iptunnel_xmit_stats(err, &dev->stats, dev->tstats); +	return NETDEV_TX_OK; + +tx_error_icmp: +	dst_link_failure(skb); +tx_error: +	dev->stats.tx_errors++; +	kfree_skb(skb); +	return NETDEV_TX_OK; +} + +/* This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
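 * "Filled properly" here means that skb->protocol is already valid (it
 * selects the address family handed to xfrm_decode_session() below) and
 * that the routing layer has attached a dst_entry, since vti_xmit()
 * above bumps tx_carrier_errors and drops the packet when skb_dst() is
 * NULL.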
+ */ +static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct flowi fl; + +	memset(&fl, 0, sizeof(fl)); + +	skb->mark = be32_to_cpu(tunnel->parms.o_key); + +	switch (skb->protocol) { +	case htons(ETH_P_IP): +		xfrm_decode_session(skb, &fl, AF_INET); +		memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +		break; +	case htons(ETH_P_IPV6): +		xfrm_decode_session(skb, &fl, AF_INET6); +		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); +		break; +	default: +		dev->stats.tx_errors++; +		dev_kfree_skb(skb); +		return NETDEV_TX_OK; +	} + +	return vti_xmit(skb, dev, &fl); +} + +static int vti4_err(struct sk_buff *skb, u32 info) +{ +	__be32 spi; +	__u32 mark; +	struct xfrm_state *x; +	struct ip_tunnel *tunnel; +	struct ip_esp_hdr *esph; +	struct ip_auth_hdr *ah ; +	struct ip_comp_hdr *ipch; +	struct net *net = dev_net(skb->dev); +	const struct iphdr *iph = (const struct iphdr *)skb->data; +	int protocol = iph->protocol; +	struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + +	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, +				  iph->daddr, iph->saddr, 0); +	if (!tunnel) +		return -1; + +	mark = be32_to_cpu(tunnel->parms.o_key); + +	switch (protocol) { +	case IPPROTO_ESP: +		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); +		spi = esph->spi; +		break; +	case IPPROTO_AH: +		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); +		spi = ah->spi; +		break; +	case IPPROTO_COMP: +		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); +		spi = htonl(ntohs(ipch->cpi)); +		break; +	default: +		return 0; +	} + +	switch (icmp_hdr(skb)->type) { +	case ICMP_DEST_UNREACH: +		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) +			return 0; +	case ICMP_REDIRECT: +		break; +	default: +		return 0; +	} + +	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, +			      spi, protocol, AF_INET); +	if (!x) +		return 0; + +	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) +		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); +	else +		ipv4_redirect(skb, net, 0, 0, protocol, 0); +	xfrm_state_put(x); + +	return 0; +} + +static int +vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ +	int err = 0; +	struct ip_tunnel_parm p; + +	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) +		return -EFAULT; + +	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { +		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || +		    p.iph.ihl != 5) +			return -EINVAL; +	} + +	if (!(p.i_flags & GRE_KEY)) +		p.i_key = 0; +	if (!(p.o_flags & GRE_KEY)) +		p.o_key = 0; + +	p.i_flags = VTI_ISVTI; + +	err = ip_tunnel_ioctl(dev, &p, cmd); +	if (err) +		return err; + +	if (cmd != SIOCDELTUNNEL) { +		p.i_flags |= GRE_KEY; +		p.o_flags |= GRE_KEY; +	} + +	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) +		return -EFAULT; +	return 0; +} + +static const struct net_device_ops vti_netdev_ops = { +	.ndo_init	= vti_tunnel_init, +	.ndo_uninit	= ip_tunnel_uninit, +	.ndo_start_xmit	= vti_tunnel_xmit, +	.ndo_do_ioctl	= vti_tunnel_ioctl, +	.ndo_change_mtu	= ip_tunnel_change_mtu, +	.ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +static void vti_tunnel_setup(struct net_device *dev) +{ +	dev->netdev_ops		= &vti_netdev_ops; +	dev->type		= ARPHRD_TUNNEL; +	ip_tunnel_setup(dev, vti_net_id); +} + +static int vti_tunnel_init(struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct iphdr *iph = &tunnel->parms.iph; + +	memcpy(dev->dev_addr, &iph->saddr, 4); +	memcpy(dev->broadcast, 
&iph->daddr, 4); + +	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr); +	dev->mtu		= ETH_DATA_LEN; +	dev->flags		= IFF_NOARP; +	dev->iflink		= 0; +	dev->addr_len		= 4; +	dev->features		|= NETIF_F_LLTX; +	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE; + +	return ip_tunnel_init(dev); +} + +static void __net_init vti_fb_tunnel_init(struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct iphdr *iph = &tunnel->parms.iph; + +	iph->version		= 4; +	iph->protocol		= IPPROTO_IPIP; +	iph->ihl		= 5; +} + +static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { +	.handler	=	vti_rcv, +	.input_handler	=	vti_input, +	.cb_handler	=	vti_rcv_cb, +	.err_handler	=	vti4_err, +	.priority	=	100, +}; + +static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { +	.handler	=	vti_rcv, +	.input_handler	=	vti_input, +	.cb_handler	=	vti_rcv_cb, +	.err_handler	=	vti4_err, +	.priority	=	100, +}; + +static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { +	.handler	=	vti_rcv, +	.input_handler	=	vti_input, +	.cb_handler	=	vti_rcv_cb, +	.err_handler	=	vti4_err, +	.priority	=	100, +}; + +static int __net_init vti_init_net(struct net *net) +{ +	int err; +	struct ip_tunnel_net *itn; + +	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); +	if (err) +		return err; +	itn = net_generic(net, vti_net_id); +	vti_fb_tunnel_init(itn->fb_tunnel_dev); +	return 0; +} + +static void __net_exit vti_exit_net(struct net *net) +{ +	struct ip_tunnel_net *itn = net_generic(net, vti_net_id); +	ip_tunnel_delete_net(itn, &vti_link_ops); +} + +static struct pernet_operations vti_net_ops = { +	.init = vti_init_net, +	.exit = vti_exit_net, +	.id   = &vti_net_id, +	.size = sizeof(struct ip_tunnel_net), +}; + +static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ +	return 0; +} + +static void vti_netlink_parms(struct nlattr *data[], +			      struct ip_tunnel_parm *parms) +{ +	memset(parms, 0, sizeof(*parms)); + +	parms->iph.protocol = IPPROTO_IPIP; + +	if (!data) +		return; + +	parms->i_flags = VTI_ISVTI; + +	if (data[IFLA_VTI_LINK]) +		parms->link = nla_get_u32(data[IFLA_VTI_LINK]); + +	if (data[IFLA_VTI_IKEY]) +		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); + +	if (data[IFLA_VTI_OKEY]) +		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); + +	if (data[IFLA_VTI_LOCAL]) +		parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); + +	if (data[IFLA_VTI_REMOTE]) +		parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); + +} + +static int vti_newlink(struct net *src_net, struct net_device *dev, +		       struct nlattr *tb[], struct nlattr *data[]) +{ +	struct ip_tunnel_parm parms; + +	vti_netlink_parms(data, &parms); +	return ip_tunnel_newlink(dev, tb, &parms); +} + +static int vti_changelink(struct net_device *dev, struct nlattr *tb[], +			  struct nlattr *data[]) +{ +	struct ip_tunnel_parm p; + +	vti_netlink_parms(data, &p); +	return ip_tunnel_changelink(dev, tb, &p); +} + +static size_t vti_get_size(const struct net_device *dev) +{ +	return +		/* IFLA_VTI_LINK */ +		nla_total_size(4) + +		/* IFLA_VTI_IKEY */ +		nla_total_size(4) + +		/* IFLA_VTI_OKEY */ +		nla_total_size(4) + +		/* IFLA_VTI_LOCAL */ +		nla_total_size(4) + +		/* IFLA_VTI_REMOTE */ +		nla_total_size(4) + +		0; +} + +static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ +	struct ip_tunnel *t = netdev_priv(dev); +	struct ip_tunnel_parm *p = &t->parms; + +	nla_put_u32(skb, IFLA_VTI_LINK, p->link); +	nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); +	nla_put_be32(skb, 
IFLA_VTI_OKEY, p->o_key); +	nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); +	nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); + +	return 0; +} + +static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { +	[IFLA_VTI_LINK]		= { .type = NLA_U32 }, +	[IFLA_VTI_IKEY]		= { .type = NLA_U32 }, +	[IFLA_VTI_OKEY]		= { .type = NLA_U32 }, +	[IFLA_VTI_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) }, +	[IFLA_VTI_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) }, +}; + +static struct rtnl_link_ops vti_link_ops __read_mostly = { +	.kind		= "vti", +	.maxtype	= IFLA_VTI_MAX, +	.policy		= vti_policy, +	.priv_size	= sizeof(struct ip_tunnel), +	.setup		= vti_tunnel_setup, +	.validate	= vti_tunnel_validate, +	.newlink	= vti_newlink, +	.changelink	= vti_changelink, +	.get_size	= vti_get_size, +	.fill_info	= vti_fill_info, +}; + +static int __init vti_init(void) +{ +	int err; + +	pr_info("IPv4 over IPSec tunneling driver\n"); + +	err = register_pernet_device(&vti_net_ops); +	if (err < 0) +		return err; +	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); +	if (err < 0) { +		unregister_pernet_device(&vti_net_ops); +		pr_info("vti init: can't register tunnel\n"); + +		return err; +	} + +	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); +	if (err < 0) { +		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); +		unregister_pernet_device(&vti_net_ops); +		pr_info("vti init: can't register tunnel\n"); + +		return err; +	} + +	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); +	if (err < 0) { +		xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); +		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); +		unregister_pernet_device(&vti_net_ops); +		pr_info("vti init: can't register tunnel\n"); + +		return err; +	} + +	err = rtnl_link_register(&vti_link_ops); +	if (err < 0) +		goto rtnl_link_failed; + +	return err; + +rtnl_link_failed: +	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); +	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); +	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); +	unregister_pernet_device(&vti_net_ops); +	return err; +} + +static void __exit vti_fini(void) +{ +	rtnl_link_unregister(&vti_link_ops); +	if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP)) +		pr_info("vti close: can't deregister tunnel\n"); +	if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH)) +		pr_info("vti close: can't deregister tunnel\n"); +	if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP)) +		pr_info("vti close: can't deregister tunnel\n"); + + +	unregister_pernet_device(&vti_net_ops); +} + +module_init(vti_init); +module_exit(vti_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("vti"); +MODULE_ALIAS_NETDEV("ip_vti0"); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 629067571f0..c0855d50a3f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -23,26 +23,37 @@  #include <net/protocol.h>  #include <net/sock.h> -static void ipcomp4_err(struct sk_buff *skb, u32 info) +static int ipcomp4_err(struct sk_buff *skb, u32 info)  {  	struct net *net = dev_net(skb->dev);  	__be32 spi; -	struct iphdr *iph = (struct iphdr *)skb->data; +	const struct iphdr *iph = (const struct iphdr *)skb->data;  	struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));  	struct xfrm_state *x; -	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || -	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) -		return; +	switch (icmp_hdr(skb)->type) { +	case ICMP_DEST_UNREACH: +		if 
(icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) +			return 0; +	case ICMP_REDIRECT: +		break; +	default: +		return 0; +	}  	spi = htonl(ntohs(ipch->cpi)); -	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, +	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,  			      spi, IPPROTO_COMP, AF_INET);  	if (!x) -		return; -	NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", -		 spi, &iph->daddr); +		return 0; + +	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) +		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0); +	else +		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);  	xfrm_state_put(x); + +	return 0;  }  /* We always hold one tunnel user reference to indicate a tunnel */ @@ -63,6 +74,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)  	t->props.mode = x->props.mode;  	t->props.saddr.a4 = x->props.saddr.a4;  	t->props.flags = x->props.flags; +	t->props.extra_flags = x->props.extra_flags;  	memcpy(&t->mark, &x->mark, sizeof(t->mark));  	if (xfrm_init_state(t)) @@ -137,6 +149,11 @@ out:  	return err;  } +static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) +{ +	return 0; +} +  static const struct xfrm_type ipcomp_type = {  	.description	= "IPCOMP4",  	.owner		= THIS_MODULE, @@ -147,20 +164,22 @@ static const struct xfrm_type ipcomp_type = {  	.output		= ipcomp_output  }; -static const struct net_protocol ipcomp4_protocol = { +static struct xfrm4_protocol ipcomp4_protocol = {  	.handler	=	xfrm4_rcv, +	.input_handler	=	xfrm_input, +	.cb_handler	=	ipcomp4_rcv_cb,  	.err_handler	=	ipcomp4_err, -	.no_policy	=	1, +	.priority	=	0,  };  static int __init ipcomp4_init(void)  {  	if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { -		printk(KERN_INFO "ipcomp init: can't add xfrm type\n"); +		pr_info("%s: can't add xfrm type\n", __func__);  		return -EAGAIN;  	} -	if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { -		printk(KERN_INFO "ipcomp init: can't add protocol\n"); +	if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) { +		pr_info("%s: can't add protocol\n", __func__);  		xfrm_unregister_type(&ipcomp_type, AF_INET);  		return -EAGAIN;  	} @@ -169,10 +188,10 @@ static int __init ipcomp4_init(void)  static void __exit ipcomp4_fini(void)  { -	if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) -		printk(KERN_INFO "ip ipcomp close: can't remove protocol\n"); +	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) +		pr_info("%s: can't remove protocol\n", __func__);  	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) -		printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n"); +		pr_info("%s: can't remove xfrm type\n", __func__);  }  module_init(ipcomp4_init); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 2b097752426..b3e86ea7b71 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -54,6 +54,7 @@  #include <linux/delay.h>  #include <linux/nfs_fs.h>  #include <linux/slab.h> +#include <linux/export.h>  #include <net/net_namespace.h>  #include <net/arp.h>  #include <net/ip.h> @@ -87,8 +88,8 @@  #endif  /* Define the friendly delay before and after opening net devices */ -#define CONF_PRE_OPEN		500	/* Before opening: 1/2 second */ -#define CONF_POST_OPEN		1	/* After opening: 1 second */ +#define CONF_POST_OPEN		10	/* After opening: 10 msecs */ +#define CONF_CARRIER_TIMEOUT	120000	/* Wait for carrier timeout */  /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */  #define CONF_OPEN_RETRIES 	2	/* (Re)open devices twice */ @@ -135,12 
+136,14 @@ __be32 ic_myaddr = NONE;		/* My IP address */  static __be32 ic_netmask = NONE;	/* Netmask for local subnet */  __be32 ic_gateway = NONE;	/* Gateway IP address */ +__be32 ic_addrservaddr = NONE;	/* IP Address of the IP addresses'server */ +  __be32 ic_servaddr = NONE;	/* Boot server IP address */  __be32 root_server_addr = NONE;	/* Address of NFS server */  u8 root_server_path[256] = { 0, };	/* Path to mount as root */ -u32 ic_dev_xid;		/* Device under configuration */ +__be32 ic_dev_xid;		/* Device under configuration */  /* vendor class identifier */  static char vendor_class_identifier[253] __initdata; @@ -188,14 +191,14 @@ struct ic_device {  static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */  static struct net_device *ic_dev __initdata = NULL;	/* Selected device */ -static bool __init ic_device_match(struct net_device *dev) +static bool __init ic_is_init_dev(struct net_device *dev)  { -	if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : +	if (dev->flags & IFF_LOOPBACK) +		return false; +	return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :  	    (!(dev->flags & IFF_LOOPBACK) &&  	     (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && -	     strncmp(dev->name, "dummy", 5))) -		return true; -	return false; +	     strncmp(dev->name, "dummy", 5));  }  static int __init ic_open_devs(void) @@ -203,6 +206,7 @@ static int __init ic_open_devs(void)  	struct ic_device *d, **last;  	struct net_device *dev;  	unsigned short oflags; +	unsigned long start, next_msg;  	last = &ic_first_dev;  	rtnl_lock(); @@ -212,18 +216,17 @@ static int __init ic_open_devs(void)  		if (!(dev->flags & IFF_LOOPBACK))  			continue;  		if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) -			printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); +			pr_err("IP-Config: Failed to open %s\n", dev->name);  	}  	for_each_netdev(&init_net, dev) { -		if (dev->flags & IFF_LOOPBACK) -			continue; -		if (ic_device_match(dev)) { +		if (ic_is_init_dev(dev)) {  			int able = 0;  			if (dev->mtu >= 364)  				able |= IC_BOOTP;  			else -				printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu); +				pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small", +					dev->name, dev->mtu);  			if (!(dev->flags & IFF_NOARP))  				able |= IC_RARP;  			able &= ic_proto_enabled; @@ -231,7 +234,8 @@ static int __init ic_open_devs(void)  				continue;  			oflags = dev->flags;  			if (dev_change_flags(dev, oflags | IFF_UP) < 0) { -				printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); +				pr_err("IP-Config: Failed to open %s\n", +				       dev->name);  				continue;  			}  			if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { @@ -252,15 +256,42 @@ static int __init ic_open_devs(void)  				dev->name, able, d->xid));  		}  	} + +	/* no point in waiting if we could not bring up at least one device */ +	if (!ic_first_dev) +		goto have_carrier; + +	/* wait for a carrier on at least one device */ +	start = jiffies; +	next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); +	while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { +		int wait, elapsed; + +		for_each_netdev(&init_net, dev) +			if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) +				goto have_carrier; + +		msleep(1); + +		if (time_before(jiffies, next_msg)) +			continue; + +		elapsed = jiffies_to_msecs(jiffies - start); +		wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000; +		pr_info("Waiting up to %d more seconds for network.\n", wait); +		
next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12); +	} +have_carrier:  	rtnl_unlock();  	*last = NULL;  	if (!ic_first_dev) {  		if (user_dev_name[0]) -			printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); +			pr_err("IP-Config: Device `%s' not found\n", +			       user_dev_name);  		else -			printk(KERN_ERR "IP-Config: No network devices available.\n"); +			pr_err("IP-Config: No network devices available\n");  		return -ENODEV;  	}  	return 0; @@ -344,17 +375,20 @@ static int __init ic_setup_if(void)  	strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);  	set_sockaddr(sin, ic_myaddr, 0);  	if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { -		printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); +		pr_err("IP-Config: Unable to set interface address (%d)\n", +		       err);  		return -1;  	}  	set_sockaddr(sin, ic_netmask, 0);  	if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { -		printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); +		pr_err("IP-Config: Unable to set interface netmask (%d)\n", +		       err);  		return -1;  	}  	set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);  	if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { -		printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); +		pr_err("IP-Config: Unable to set interface broadcast address (%d)\n", +		       err);  		return -1;  	}  	/* Handle the case where we need non-standard MTU on the boot link (a network @@ -365,8 +399,8 @@ static int __init ic_setup_if(void)  		strcpy(ir.ifr_name, ic_dev->name);  		ir.ifr_mtu = ic_dev_mtu;  		if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) -			printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n", -			                 ic_dev_mtu, err); +			pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", +			       ic_dev_mtu, err);  	}  	return 0;  } @@ -381,7 +415,7 @@ static int __init ic_setup_routes(void)  		memset(&rm, 0, sizeof(rm));  		if ((ic_gateway ^ ic_myaddr) & ic_netmask) { -			printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); +			pr_err("IP-Config: Gateway not on directly connected network\n");  			return -1;  		}  		set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); @@ -389,7 +423,8 @@ static int __init ic_setup_routes(void)  		set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);  		rm.rt_flags = RTF_UP | RTF_GATEWAY;  		if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { -			printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); +			pr_err("IP-Config: Cannot add default route (%d)\n", +			       err);  			return -1;  		}  	} @@ -422,8 +457,8 @@ static int __init ic_defaults(void)  		else if (IN_CLASSC(ntohl(ic_myaddr)))  			ic_netmask = htonl(IN_CLASSC_NET);  		else { -			printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n", -				&ic_myaddr); +			pr_err("IP-Config: Unable to guess netmask for address %pI4\n", +			       &ic_myaddr);  			return -1;  		}  		printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); @@ -536,6 +571,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt  	if (ic_myaddr == NONE)  		ic_myaddr = tip;  	ic_servaddr = sip; +	ic_addrservaddr = sip;  	ic_got_reply = IC_RARP;  drop_unlock: @@ -561,6 +597,17 @@ static void __init ic_rarp_send_if(struct ic_device *d)  #endif  /* + *  Predefine Nameservers + */ +static inline void __init ic_nameservers_predef(void) +{ +	int i; + +	for (i = 0; i < CONF_NAMESERVERS_MAX; i++) +	
	ic_nameservers[i] = NONE; +} + +/*   *	DHCP/BOOTP support.   */ @@ -673,8 +720,8 @@ ic_dhcp_init_options(u8 *options)  			e += len;  		}  		if (*vendor_class_identifier) { -			printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", -			       vendor_class_identifier); +			pr_info("DHCP: sending class identifier \"%s\"\n", +				vendor_class_identifier);  			*e++ = 60;	/* Class-identifier */  			len = strlen(vendor_class_identifier);  			*e++ = len; @@ -725,10 +772,7 @@ static void __init ic_bootp_init_ext(u8 *e)   */  static inline void __init ic_bootp_init(void)  { -	int i; - -	for (i = 0; i < CONF_NAMESERVERS_MAX; i++) -		ic_nameservers[i] = NONE; +	ic_nameservers_predef();  	dev_add_pack(&bootp_packet_type);  } @@ -752,13 +796,15 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d  	struct sk_buff *skb;  	struct bootp_pkt *b;  	struct iphdr *h; +	int hlen = LL_RESERVED_SPACE(dev); +	int tlen = dev->needed_tailroom;  	/* Allocate packet */ -	skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15, +	skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15,  			GFP_KERNEL);  	if (!skb)  		return; -	skb_reserve(skb, LL_RESERVED_SPACE(dev)); +	skb_reserve(skb, hlen);  	b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));  	memset(b, 0, sizeof(struct bootp_pkt)); @@ -784,8 +830,6 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d  	b->op = BOOTP_REQUEST;  	if (dev->type < 256) /* check for false types */  		b->htype = dev->type; -	else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ -		b->htype = ARPHRD_IEEE802;  	else if (dev->type == ARPHRD_FDDI)  		b->htype = ARPHRD_ETHER;  	else { @@ -811,8 +855,13 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d  	skb->dev = dev;  	skb->protocol = htons(ETH_P_IP);  	if (dev_hard_header(skb, dev, ntohs(skb->protocol), -			    dev->broadcast, dev->dev_addr, skb->len) < 0 || -	    dev_queue_xmit(skb) < 0) +			    dev->broadcast, dev->dev_addr, skb->len) < 0) { +		kfree_skb(skb); +		printk("E"); +		return; +	} + +	if (dev_queue_xmit(skb) < 0)  		printk("E");  } @@ -837,9 +886,9 @@ static int __init ic_bootp_string(char *dest, char *src, int len, int max)   */  static void __init ic_do_bootp_ext(u8 *ext)  { -       u8 servers; -       int i; -	u16 mtu; +	u8 servers; +	int i; +	__be16 mtu;  #ifdef IPCONFIG_DEBUG  	u8 *c; @@ -851,41 +900,44 @@ static void __init ic_do_bootp_ext(u8 *ext)  #endif  	switch (*ext++) { -		case 1:		/* Subnet mask */ -			if (ic_netmask == NONE) -				memcpy(&ic_netmask, ext+1, 4); -			break; -		case 3:		/* Default gateway */ -			if (ic_gateway == NONE) -				memcpy(&ic_gateway, ext+1, 4); -			break; -		case 6:		/* DNS server */ -			servers= *ext/4; -			if (servers > CONF_NAMESERVERS_MAX) -				servers = CONF_NAMESERVERS_MAX; -			for (i = 0; i < servers; i++) { -				if (ic_nameservers[i] == NONE) -					memcpy(&ic_nameservers[i], ext+1+4*i, 4); -			} -			break; -		case 12:	/* Host name */ -			ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); -			ic_host_name_set = 1; -			break; -		case 15:	/* Domain name (DNS) */ -			ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); -			break; -		case 17:	/* Root path */ -			if (!root_server_path[0]) -				ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); -			break; -		case 26:	/* Interface MTU */ -			memcpy(&mtu, ext+1, sizeof(mtu)); -			ic_dev_mtu = ntohs(mtu); -			break; -		case 40:	/* NIS Domain name (_not_ 
DNS) */ -			ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); -			break; +	case 1:		/* Subnet mask */ +		if (ic_netmask == NONE) +			memcpy(&ic_netmask, ext+1, 4); +		break; +	case 3:		/* Default gateway */ +		if (ic_gateway == NONE) +			memcpy(&ic_gateway, ext+1, 4); +		break; +	case 6:		/* DNS server */ +		servers= *ext/4; +		if (servers > CONF_NAMESERVERS_MAX) +			servers = CONF_NAMESERVERS_MAX; +		for (i = 0; i < servers; i++) { +			if (ic_nameservers[i] == NONE) +				memcpy(&ic_nameservers[i], ext+1+4*i, 4); +		} +		break; +	case 12:	/* Host name */ +		ic_bootp_string(utsname()->nodename, ext+1, *ext, +				__NEW_UTS_LEN); +		ic_host_name_set = 1; +		break; +	case 15:	/* Domain name (DNS) */ +		ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); +		break; +	case 17:	/* Root path */ +		if (!root_server_path[0]) +			ic_bootp_string(root_server_path, ext+1, *ext, +					sizeof(root_server_path)); +		break; +	case 26:	/* Interface MTU */ +		memcpy(&mtu, ext+1, sizeof(mtu)); +		ic_dev_mtu = ntohs(mtu); +		break; +	case 40:	/* NIS Domain name (_not_ DNS) */ +		ic_bootp_string(utsname()->domainname, ext+1, *ext, +				__NEW_UTS_LEN); +		break;  	}  } @@ -922,10 +974,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str  		goto drop;  	/* Fragments are not supported */ -	if (h->frag_off & htons(IP_OFFSET | IP_MF)) { -		if (net_ratelimit()) -			printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented " -			       "reply.\n"); +	if (ip_is_fragment(h)) { +		net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");  		goto drop;  	} @@ -973,17 +1023,14 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str  	/* Is it a reply to our BOOTP request? */  	if (b->op != BOOTP_REPLY ||  	    b->xid != d->xid) { -		if (net_ratelimit()) -			printk(KERN_ERR "DHCP/BOOTP: Reply not for us, " -			       "op[%x] xid[%x]\n", -			       b->op, b->xid); +		net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", +				    b->op, b->xid);  		goto drop_unlock;  	}  	/* Is it a reply for the device we are configuring? */  	if (b->xid != ic_dev_xid) { -		if (net_ratelimit()) -			printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n"); +		net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");  		goto drop_unlock;  	} @@ -1035,7 +1082,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str  				ic_servaddr = server_id;  #ifdef IPCONFIG_DEBUG  				printk("DHCP: Offered address %pI4 by server %pI4\n", -				       &ic_myaddr, &ic_servaddr); +				       &ic_myaddr, &b->iph.saddr);  #endif  				/* The DHCP indicated server address takes  				 * precedence over the bootp header one if @@ -1080,6 +1127,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str  	ic_dev = dev;  	ic_myaddr = b->your_ip;  	ic_servaddr = b->server_ip; +	ic_addrservaddr = b->iph.saddr;  	if (ic_gateway == NONE && b->relay_ip)  		ic_gateway = b->relay_ip;  	if (ic_nameservers[0] == NONE) @@ -1121,17 +1169,17 @@ static int __init ic_dynamic(void)  	 * are missing, and without DHCP/BOOTP/RARP we are unable to get it.  	 
*/  	if (!ic_proto_enabled) { -		printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); +		pr_err("IP-Config: Incomplete network configuration information\n");  		return -1;  	}  #ifdef IPCONFIG_BOOTP  	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) -		printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n"); +		pr_err("DHCP/BOOTP: No suitable device found\n");  #endif  #ifdef IPCONFIG_RARP  	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) -		printk(KERN_ERR "RARP: No suitable device found.\n"); +		pr_err("RARP: No suitable device found\n");  #endif  	if (!ic_proto_have_if) @@ -1158,17 +1206,17 @@ static int __init ic_dynamic(void)  	 * [Actually we could now, but the nothing else running note still  	 *  applies.. - AC]  	 */ -	printk(KERN_NOTICE "Sending %s%s%s requests .", -	       do_bootp -		? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", -	       (do_bootp && do_rarp) ? " and " : "", -	       do_rarp ? "RARP" : ""); +	pr_notice("Sending %s%s%s requests .", +		  do_bootp +		  ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", +		  (do_bootp && do_rarp) ? " and " : "", +		  do_rarp ? "RARP" : "");  	start_jiffies = jiffies;  	d = ic_first_dev;  	retries = CONF_SEND_RETRIES;  	get_random_bytes(&timeout, sizeof(timeout)); -	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); +	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);  	for (;;) {  		/* Track the device we are configuring */  		ic_dev_xid = d->xid; @@ -1191,13 +1239,13 @@ static int __init ic_dynamic(void)  		    (ic_proto_enabled & IC_USE_DHCP) &&  		    ic_dhcp_msgtype != DHCPACK) {  			ic_got_reply = 0; -			printk(KERN_CONT ","); +			pr_cont(",");  			continue;  		}  #endif /* IPCONFIG_DHCP */  		if (ic_got_reply) { -			printk(KERN_CONT " OK\n"); +			pr_cont(" OK\n");  			break;  		} @@ -1205,7 +1253,7 @@ static int __init ic_dynamic(void)  			continue;  		if (! --retries) { -			printk(KERN_CONT " timed out!\n"); +			pr_cont(" timed out!\n");  			break;  		} @@ -1215,7 +1263,7 @@ static int __init ic_dynamic(void)  		if (timeout > CONF_TIMEOUT_MAX)  			timeout = CONF_TIMEOUT_MAX; -		printk(KERN_CONT "."); +		pr_cont(".");  	}  #ifdef IPCONFIG_BOOTP @@ -1235,8 +1283,8 @@ static int __init ic_dynamic(void)  	printk("IP-Config: Got %s answer from %pI4, ",  		((ic_got_reply & IC_RARP) ? "RARP"  		 : (ic_proto_enabled & IC_USE_DHCP) ? 
"DHCP" : "BOOTP"), -		&ic_servaddr); -	printk(KERN_CONT "my address is %pI4\n", &ic_myaddr); +	       &ic_addrservaddr); +	pr_cont("my address is %pI4\n", &ic_myaddr);  	return 0;  } @@ -1324,14 +1372,13 @@ static int __init wait_for_devices(void)  {  	int i; -	msleep(CONF_PRE_OPEN);  	for (i = 0; i < DEVICE_WAIT_MAX; i++) {  		struct net_device *dev;  		int found = 0;  		rtnl_lock();  		for_each_netdev(&init_net, dev) { -			if (ic_device_match(dev)) { +			if (ic_is_init_dev(dev)) {  				found = 1;  				break;  			} @@ -1355,9 +1402,10 @@ static int __init ip_auto_config(void)  	int retries = CONF_OPEN_RETRIES;  #endif  	int err; +	unsigned int i;  #ifdef CONFIG_PROC_FS -	proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); +	proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops);  #endif /* CONFIG_PROC_FS */  	if (!ic_enable) @@ -1378,7 +1426,7 @@ static int __init ip_auto_config(void)  		return err;  	/* Give drivers a chance to settle */ -	ssleep(CONF_POST_OPEN); +	msleep(CONF_POST_OPEN);  	/*  	 * If the config information is insufficient (e.g., our IP address or @@ -1413,24 +1461,22 @@ static int __init ip_auto_config(void)  			 */  #ifdef CONFIG_ROOT_NFS  			if (ROOT_DEV ==  Root_NFS) { -				printk(KERN_ERR -					"IP-Config: Retrying forever (NFS root)...\n"); +				pr_err("IP-Config: Retrying forever (NFS root)...\n");  				goto try_try_again;  			}  #endif  			if (--retries) { -				printk(KERN_ERR -				       "IP-Config: Reopening network devices...\n"); +				pr_err("IP-Config: Reopening network devices...\n");  				goto try_try_again;  			}  			/* Oh, well.  At least we tried. */ -			printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); +			pr_err("IP-Config: Auto-configuration of network failed\n");  			return -1;  		}  #else /* !DYNAMIC */ -		printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); +		pr_err("IP-Config: Incomplete network configuration information\n");  		ic_close_devs();  		return -1;  #endif /* IPCONFIG_DYNAMIC */ @@ -1444,7 +1490,7 @@ static int __init ip_auto_config(void)  		root_server_addr = addr;  	/* -	 * Use defaults whereever applicable. +	 * Use defaults wherever applicable.  	 */  	if (ic_defaults() < 0)  		return -1; @@ -1468,19 +1514,27 @@ static int __init ip_auto_config(void)  	/*  	 * Clue in the operator.  	 
*/ -	printk("IP-Config: Complete:\n"); -	printk("     device=%s", ic_dev->name); -	printk(KERN_CONT ", addr=%pI4", &ic_myaddr); -	printk(KERN_CONT ", mask=%pI4", &ic_netmask); -	printk(KERN_CONT ", gw=%pI4", &ic_gateway); -	printk(KERN_CONT ",\n     host=%s, domain=%s, nis-domain=%s", -	       utsname()->nodename, ic_domain, utsname()->domainname); -	printk(KERN_CONT ",\n     bootserver=%pI4", &ic_servaddr); -	printk(KERN_CONT ", rootserver=%pI4", &root_server_addr); -	printk(KERN_CONT ", rootpath=%s", root_server_path); +	pr_info("IP-Config: Complete:\n"); + +	pr_info("     device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n", +		ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr, +		&ic_myaddr, &ic_netmask, &ic_gateway); +	pr_info("     host=%s, domain=%s, nis-domain=%s\n", +		utsname()->nodename, ic_domain, utsname()->domainname); +	pr_info("     bootserver=%pI4, rootserver=%pI4, rootpath=%s", +		&ic_servaddr, &root_server_addr, root_server_path);  	if (ic_dev_mtu) -		printk(KERN_CONT ", mtu=%d", ic_dev_mtu); -	printk(KERN_CONT "\n"); +		pr_cont(", mtu=%d", ic_dev_mtu); +	for (i = 0; i < CONF_NAMESERVERS_MAX; i++) +		if (ic_nameservers[i] != NONE) { +			pr_info("     nameserver%u=%pI4", +				i, &ic_nameservers[i]); +			break; +		} +	for (i++; i < CONF_NAMESERVERS_MAX; i++) +		if (ic_nameservers[i] != NONE) +			pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); +	pr_cont("\n");  #endif /* !SILENT */  	return 0; @@ -1551,6 +1605,8 @@ static int __init ip_auto_config_setup(char *addrs)  		return 1;  	} +	ic_nameservers_predef(); +  	/* Parse string for static IP assignment.  */  	ip = addrs;  	while (ip && *ip) { @@ -1594,6 +1650,20 @@ static int __init ip_auto_config_setup(char *addrs)  					ic_enable = 0;  				}  				break; +			case 7: +				if (CONF_NAMESERVERS_MAX >= 1) { +					ic_nameservers[0] = in_aton(ip); +					if (ic_nameservers[0] == ANY) +						ic_nameservers[0] = NONE; +				} +				break; +			case 8: +				if (CONF_NAMESERVERS_MAX >= 2) { +					ic_nameservers[1] = in_aton(ip); +					if (ic_nameservers[1] == ANY) +						ic_nameservers[1] = NONE; +				} +				break;  			}  		}  		ip = cp; @@ -1602,22 +1672,21 @@ static int __init ip_auto_config_setup(char *addrs)  	return 1;  } +__setup("ip=", ip_auto_config_setup);  static int __init nfsaddrs_config_setup(char *addrs)  {  	return ip_auto_config_setup(addrs);  } +__setup("nfsaddrs=", nfsaddrs_config_setup);  static int __init vendor_class_identifier_setup(char *addrs)  {  	if (strlcpy(vendor_class_identifier, addrs,  		    sizeof(vendor_class_identifier))  	    >= sizeof(vendor_class_identifier)) -		printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"", -		       vendor_class_identifier); +		pr_warn("DHCP: vendorclass too long, truncated to \"%s\"", +			vendor_class_identifier);  	return 1;  } - -__setup("ip=", ip_auto_config_setup); -__setup("nfsaddrs=", nfsaddrs_config_setup);  __setup("dhcpclass=", vendor_class_identifier_setup); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 988f52fba54..62eaa005e14 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -111,206 +111,20 @@  #include <net/sock.h>  #include <net/ip.h>  #include <net/icmp.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h>  #include <net/inet_ecn.h>  #include <net/xfrm.h>  #include <net/net_namespace.h>  #include <net/netns/generic.h> -#define HASH_SIZE  16 -#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, 
"Log packets received with corrupted ECN");  static int ipip_net_id __read_mostly; -struct ipip_net { -	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; -	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; -	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; -	struct ip_tunnel __rcu *tunnels_wc[1]; -	struct ip_tunnel __rcu **tunnels[4]; - -	struct net_device *fb_tunnel_dev; -};  static int ipip_tunnel_init(struct net_device *dev); -static void ipip_tunnel_setup(struct net_device *dev); -static void ipip_dev_free(struct net_device *dev); - -/* - * Locking : hash tables are protected by RCU and RTNL - */ - -#define for_each_ip_tunnel_rcu(start) \ -	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - -/* often modified stats are per cpu, other are shared (netdev->stats) */ -struct pcpu_tstats { -	unsigned long	rx_packets; -	unsigned long	rx_bytes; -	unsigned long	tx_packets; -	unsigned long	tx_bytes; -}; - -static struct net_device_stats *ipip_get_stats(struct net_device *dev) -{ -	struct pcpu_tstats sum = { 0 }; -	int i; - -	for_each_possible_cpu(i) { -		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); - -		sum.rx_packets += tstats->rx_packets; -		sum.rx_bytes   += tstats->rx_bytes; -		sum.tx_packets += tstats->tx_packets; -		sum.tx_bytes   += tstats->tx_bytes; -	} -	dev->stats.rx_packets = sum.rx_packets; -	dev->stats.rx_bytes   = sum.rx_bytes; -	dev->stats.tx_packets = sum.tx_packets; -	dev->stats.tx_bytes   = sum.tx_bytes; -	return &dev->stats; -} - -static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, -		__be32 remote, __be32 local) -{ -	unsigned int h0 = HASH(remote); -	unsigned int h1 = HASH(local); -	struct ip_tunnel *t; -	struct ipip_net *ipn = net_generic(net, ipip_net_id); - -	for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) -		if (local == t->parms.iph.saddr && -		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) -			return t; - -	for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) -		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) -			return t; - -	for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) -		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) -			return t; - -	t = rcu_dereference(ipn->tunnels_wc[0]); -	if (t && (t->dev->flags&IFF_UP)) -		return t; -	return NULL; -} - -static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, -		struct ip_tunnel_parm *parms) -{ -	__be32 remote = parms->iph.daddr; -	__be32 local = parms->iph.saddr; -	unsigned int h = 0; -	int prio = 0; - -	if (remote) { -		prio |= 2; -		h ^= HASH(remote); -	} -	if (local) { -		prio |= 1; -		h ^= HASH(local); -	} -	return &ipn->tunnels[prio][h]; -} - -static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, -		struct ip_tunnel *t) -{ -	return __ipip_bucket(ipn, &t->parms); -} - -static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) -{ -	struct ip_tunnel __rcu **tp; -	struct ip_tunnel *iter; - -	for (tp = ipip_bucket(ipn, t); -	     (iter = rtnl_dereference(*tp)) != NULL; -	     tp = &iter->next) { -		if (t == iter) { -			rcu_assign_pointer(*tp, t->next); -			break; -		} -	} -} - -static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) -{ -	struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - -	rcu_assign_pointer(t->next, rtnl_dereference(*tp)); -	rcu_assign_pointer(*tp, t); -} - -static struct ip_tunnel * ipip_tunnel_locate(struct net *net, -		struct ip_tunnel_parm *parms, int create) -{ -	__be32 remote = parms->iph.daddr; -	__be32 local = parms->iph.saddr; -	struct ip_tunnel *t, *nt; -	struct ip_tunnel 
__rcu **tp; -	struct net_device *dev; -	char name[IFNAMSIZ]; -	struct ipip_net *ipn = net_generic(net, ipip_net_id); - -	for (tp = __ipip_bucket(ipn, parms); -		 (t = rtnl_dereference(*tp)) != NULL; -		 tp = &t->next) { -		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) -			return t; -	} -	if (!create) -		return NULL; - -	if (parms->name[0]) -		strlcpy(name, parms->name, IFNAMSIZ); -	else -		strcpy(name, "tunl%d"); - -	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); -	if (dev == NULL) -		return NULL; - -	dev_net_set(dev, net); - -	if (strchr(name, '%')) { -		if (dev_alloc_name(dev, name) < 0) -			goto failed_free; -	} - -	nt = netdev_priv(dev); -	nt->parms = *parms; - -	if (ipip_tunnel_init(dev) < 0) -		goto failed_free; - -	if (register_netdevice(dev) < 0) -		goto failed_free; - -	dev_hold(dev); -	ipip_tunnel_link(ipn, nt); -	return nt; - -failed_free: -	ipip_dev_free(dev); -	return NULL; -} - -/* called with RTNL */ -static void ipip_tunnel_uninit(struct net_device *dev) -{ -	struct net *net = dev_net(dev); -	struct ipip_net *ipn = net_generic(net, ipip_net_id); - -	if (dev == ipn->fb_tunnel_dev) -		rcu_assign_pointer(ipn->tunnels_wc[0], NULL); -	else -		ipip_tunnel_unlink(ipn, netdev_priv(dev)); -	dev_put(dev); -} +static struct rtnl_link_ops ipip_link_ops __read_mostly;  static int ipip_err(struct sk_buff *skb, u32 info)  { @@ -319,45 +133,35 @@ static int ipip_err(struct sk_buff *skb, u32 info)     8 bytes of packet payload. It means, that precise relaying of     ICMP in the real Internet is absolutely infeasible.   */ -	struct iphdr *iph = (struct iphdr *)skb->data; -	const int type = icmp_hdr(skb)->type; -	const int code = icmp_hdr(skb)->code; +	struct net *net = dev_net(skb->dev); +	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); +	const struct iphdr *iph = (const struct iphdr *)skb->data;  	struct ip_tunnel *t;  	int err; +	const int type = icmp_hdr(skb)->type; +	const int code = icmp_hdr(skb)->code; -	switch (type) { -	default: -	case ICMP_PARAMETERPROB: -		return 0; - -	case ICMP_DEST_UNREACH: -		switch (code) { -		case ICMP_SR_FAILED: -		case ICMP_PORT_UNREACH: -			/* Impossible event. */ -			return 0; -		case ICMP_FRAG_NEEDED: -			/* Soft state for pmtu is maintained by IP core. */ -			return 0; -		default: -			/* All others are translated to HOST_UNREACH. -			   rfc2003 contains "deep thoughts" about NET_UNREACH, -			   I believe they are just ether pollution. 
--ANK -			 */ -			break; -		} -		break; -	case ICMP_TIME_EXCEEDED: -		if (code != ICMP_EXC_TTL) -			return 0; -		break; +	err = -ENOENT; +	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, +			     iph->daddr, iph->saddr, 0); +	if (t == NULL) +		goto out; + +	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { +		ipv4_update_pmtu(skb, dev_net(skb->dev), info, +				 t->parms.link, 0, IPPROTO_IPIP, 0); +		err = 0; +		goto out;  	} -	err = -ENOENT; +	if (type == ICMP_REDIRECT) { +		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, +			      IPPROTO_IPIP, 0); +		err = 0; +		goto out; +	} -	rcu_read_lock(); -	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); -	if (t == NULL || t->parms.iph.daddr == 0) +	if (t->parms.iph.daddr == 0)  		goto out;  	err = 0; @@ -369,543 +173,312 @@ static int ipip_err(struct sk_buff *skb, u32 info)  	else  		t->err_count = 1;  	t->err_time = jiffies; +  out: -	rcu_read_unlock();  	return err;  } -static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph, -					struct sk_buff *skb) -{ -	struct iphdr *inner_iph = ip_hdr(skb); - -	if (INET_ECN_is_ce(outer_iph->tos)) -		IP_ECN_set_ce(inner_iph); -} +static const struct tnl_ptk_info tpi = { +	/* no tunnel info required for ipip. */ +	.proto = htons(ETH_P_IP), +};  static int ipip_rcv(struct sk_buff *skb)  { +	struct net *net = dev_net(skb->dev); +	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);  	struct ip_tunnel *tunnel; -	const struct iphdr *iph = ip_hdr(skb); - -	rcu_read_lock(); -	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); -	if (tunnel != NULL) { -		struct pcpu_tstats *tstats; - -		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { -			rcu_read_unlock(); -			kfree_skb(skb); -			return 0; -		} - -		secpath_reset(skb); - -		skb->mac_header = skb->network_header; -		skb_reset_network_header(skb); -		skb->protocol = htons(ETH_P_IP); -		skb->pkt_type = PACKET_HOST; - -		tstats = this_cpu_ptr(tunnel->dev->tstats); -		tstats->rx_packets++; -		tstats->rx_bytes += skb->len; - -		__skb_tunnel_rx(skb, tunnel->dev); - -		ipip_ecn_decapsulate(iph, skb); - -		netif_rx(skb); - -		rcu_read_unlock(); -		return 0; +	const struct iphdr *iph; + +	iph = ip_hdr(skb); +	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, +			iph->saddr, iph->daddr, 0); +	if (tunnel) { +		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) +			goto drop; +		if (iptunnel_pull_header(skb, 0, tpi.proto)) +			goto drop; +		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);  	} -	rcu_read_unlock();  	return -1; + +drop: +	kfree_skb(skb); +	return 0;  }  /*   *	This function assumes it is being called from dev_queue_xmit()   *	and that skb is filled properly by that function.   
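 *
 *	After this change the body reduces to offload handling via
 *	iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP) plus the generic
 *	ip_tunnel_xmit(), which builds the outer IPIP header in place of
 *	the open-coded route lookup and header construction removed here.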
*/ -  static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)  {  	struct ip_tunnel *tunnel = netdev_priv(dev); -	struct pcpu_tstats *tstats; -	struct iphdr  *tiph = &tunnel->parms.iph; -	u8     tos = tunnel->parms.iph.tos; -	__be16 df = tiph->frag_off; -	struct rtable *rt;     			/* Route to the other host */ -	struct net_device *tdev;		/* Device to other host */ -	struct iphdr  *old_iph = ip_hdr(skb); -	struct iphdr  *iph;			/* Our new IP header */ -	unsigned int max_headroom;		/* The extra header space needed */ -	__be32 dst = tiph->daddr; -	int    mtu; - -	if (skb->protocol != htons(ETH_P_IP)) -		goto tx_error; - -	if (tos & 1) -		tos = old_iph->tos; - -	if (!dst) { -		/* NBMA tunnel */ -		if ((rt = skb_rtable(skb)) == NULL) { -			dev->stats.tx_fifo_errors++; -			goto tx_error; -		} -		if ((dst = rt->rt_gateway) == 0) -			goto tx_error_icmp; -	} +	const struct iphdr  *tiph = &tunnel->parms.iph; -	{ -		struct flowi fl = { -			.oif = tunnel->parms.link, -			.fl4_dst = dst, -			.fl4_src= tiph->saddr, -			.fl4_tos = RT_TOS(tos), -			.proto = IPPROTO_IPIP -		}; - -		if (ip_route_output_key(dev_net(dev), &rt, &fl)) { -			dev->stats.tx_carrier_errors++; -			goto tx_error_icmp; -		} -	} -	tdev = rt->dst.dev; - -	if (tdev == dev) { -		ip_rt_put(rt); -		dev->stats.collisions++; +	if (unlikely(skb->protocol != htons(ETH_P_IP)))  		goto tx_error; -	} - -	df |= old_iph->frag_off & htons(IP_DF); - -	if (df) { -		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - -		if (mtu < 68) { -			dev->stats.collisions++; -			ip_rt_put(rt); -			goto tx_error; -		} -		if (skb_dst(skb)) -			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - -		if ((old_iph->frag_off & htons(IP_DF)) && -		    mtu < ntohs(old_iph->tot_len)) { -			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, -				  htonl(mtu)); -			ip_rt_put(rt); -			goto tx_error; -		} -	} - -	if (tunnel->err_count > 0) { -		if (time_before(jiffies, -				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { -			tunnel->err_count--; -			dst_link_failure(skb); -		} else -			tunnel->err_count = 0; -	} - -	/* -	 * Okay, now see if we can stuff it in the buffer as-is. -	 */ -	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); - -	if (skb_headroom(skb) < max_headroom || skb_shared(skb) || -	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { -		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); -		if (!new_skb) { -			ip_rt_put(rt); -			dev->stats.tx_dropped++; -			dev_kfree_skb(skb); -			return NETDEV_TX_OK; -		} -		if (skb->sk) -			skb_set_owner_w(new_skb, skb->sk); -		dev_kfree_skb(skb); -		skb = new_skb; -		old_iph = ip_hdr(skb); -	} +	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); +	if (IS_ERR(skb)) +		goto out; -	skb->transport_header = skb->network_header; -	skb_push(skb, sizeof(struct iphdr)); -	skb_reset_network_header(skb); -	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); -	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | -			      IPSKB_REROUTED); -	skb_dst_drop(skb); -	skb_dst_set(skb, &rt->dst); - -	/* -	 *	Push down and install the IPIP header. 
-	 */ - -	iph 			=	ip_hdr(skb); -	iph->version		=	4; -	iph->ihl		=	sizeof(struct iphdr)>>2; -	iph->frag_off		=	df; -	iph->protocol		=	IPPROTO_IPIP; -	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos); -	iph->daddr		=	rt->rt_dst; -	iph->saddr		=	rt->rt_src; - -	if ((iph->ttl = tiph->ttl) == 0) -		iph->ttl	=	old_iph->ttl; - -	nf_reset(skb); -	tstats = this_cpu_ptr(dev->tstats); -	__IPTUNNEL_XMIT(tstats, &dev->stats); +	ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);  	return NETDEV_TX_OK; -tx_error_icmp: -	dst_link_failure(skb);  tx_error: +	kfree_skb(skb); +out:  	dev->stats.tx_errors++; -	dev_kfree_skb(skb);  	return NETDEV_TX_OK;  } -static void ipip_tunnel_bind_dev(struct net_device *dev) -{ -	struct net_device *tdev = NULL; -	struct ip_tunnel *tunnel; -	struct iphdr *iph; - -	tunnel = netdev_priv(dev); -	iph = &tunnel->parms.iph; - -	if (iph->daddr) { -		struct flowi fl = { -			.oif = tunnel->parms.link, -			.fl4_dst = iph->daddr, -			.fl4_src = iph->saddr, -			.fl4_tos = RT_TOS(iph->tos), -			.proto = IPPROTO_IPIP -		}; -		struct rtable *rt; - -		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { -			tdev = rt->dst.dev; -			ip_rt_put(rt); -		} -		dev->flags |= IFF_POINTOPOINT; -	} - -	if (!tdev && tunnel->parms.link) -		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); - -	if (tdev) { -		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); -		dev->mtu = tdev->mtu - sizeof(struct iphdr); -	} -	dev->iflink = tunnel->parms.link; -} -  static int -ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)  {  	int err = 0;  	struct ip_tunnel_parm p; -	struct ip_tunnel *t; -	struct net *net = dev_net(dev); -	struct ipip_net *ipn = net_generic(net, ipip_net_id); - -	switch (cmd) { -	case SIOCGETTUNNEL: -		t = NULL; -		if (dev == ipn->fb_tunnel_dev) { -			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { -				err = -EFAULT; -				break; -			} -			t = ipip_tunnel_locate(net, &p, 0); -		} -		if (t == NULL) -			t = netdev_priv(dev); -		memcpy(&p, &t->parms, sizeof(p)); -		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) -			err = -EFAULT; -		break; - -	case SIOCADDTUNNEL: -	case SIOCCHGTUNNEL: -		err = -EPERM; -		if (!capable(CAP_NET_ADMIN)) -			goto done; - -		err = -EFAULT; -		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) -			goto done; - -		err = -EINVAL; + +	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) +		return -EFAULT; + +	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {  		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||  		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) -			goto done; -		if (p.iph.ttl) -			p.iph.frag_off |= htons(IP_DF); - -		t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - -		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { -			if (t != NULL) { -				if (t->dev != dev) { -					err = -EEXIST; -					break; -				} -			} else { -				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || -				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { -					err = -EINVAL; -					break; -				} -				t = netdev_priv(dev); -				ipip_tunnel_unlink(ipn, t); -				synchronize_net(); -				t->parms.iph.saddr = p.iph.saddr; -				t->parms.iph.daddr = p.iph.daddr; -				memcpy(dev->dev_addr, &p.iph.saddr, 4); -				memcpy(dev->broadcast, &p.iph.daddr, 4); -				ipip_tunnel_link(ipn, t); -				netdev_state_change(dev); -			} -		} - -		if (t) { -			err = 0; -			if (cmd == SIOCCHGTUNNEL) { -				
t->parms.iph.ttl = p.iph.ttl; -				t->parms.iph.tos = p.iph.tos; -				t->parms.iph.frag_off = p.iph.frag_off; -				if (t->parms.link != p.link) { -					t->parms.link = p.link; -					ipip_tunnel_bind_dev(dev); -					netdev_state_change(dev); -				} -			} -			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) -				err = -EFAULT; -		} else -			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); -		break; - -	case SIOCDELTUNNEL: -		err = -EPERM; -		if (!capable(CAP_NET_ADMIN)) -			goto done; - -		if (dev == ipn->fb_tunnel_dev) { -			err = -EFAULT; -			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) -				goto done; -			err = -ENOENT; -			if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL) -				goto done; -			err = -EPERM; -			if (t->dev == ipn->fb_tunnel_dev) -				goto done; -			dev = t->dev; -		} -		unregister_netdevice(dev); -		err = 0; -		break; - -	default: -		err = -EINVAL; +			return -EINVAL;  	} -done: -	return err; -} +	p.i_key = p.o_key = p.i_flags = p.o_flags = 0; +	if (p.iph.ttl) +		p.iph.frag_off |= htons(IP_DF); + +	err = ip_tunnel_ioctl(dev, &p, cmd); +	if (err) +		return err; + +	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) +		return -EFAULT; -static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ -	if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) -		return -EINVAL; -	dev->mtu = new_mtu;  	return 0;  }  static const struct net_device_ops ipip_netdev_ops = { -	.ndo_uninit	= ipip_tunnel_uninit, +	.ndo_init       = ipip_tunnel_init, +	.ndo_uninit     = ip_tunnel_uninit,  	.ndo_start_xmit	= ipip_tunnel_xmit,  	.ndo_do_ioctl	= ipip_tunnel_ioctl, -	.ndo_change_mtu	= ipip_tunnel_change_mtu, -	.ndo_get_stats  = ipip_get_stats, +	.ndo_change_mtu = ip_tunnel_change_mtu, +	.ndo_get_stats64 = ip_tunnel_get_stats64,  }; -static void ipip_dev_free(struct net_device *dev) -{ -	free_percpu(dev->tstats); -	free_netdev(dev); -} +#define IPIP_FEATURES (NETIF_F_SG |		\ +		       NETIF_F_FRAGLIST |	\ +		       NETIF_F_HIGHDMA |	\ +		       NETIF_F_GSO_SOFTWARE |	\ +		       NETIF_F_HW_CSUM)  static void ipip_tunnel_setup(struct net_device *dev)  {  	dev->netdev_ops		= &ipip_netdev_ops; -	dev->destructor		= ipip_dev_free;  	dev->type		= ARPHRD_TUNNEL; -	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr); -	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr);  	dev->flags		= IFF_NOARP;  	dev->iflink		= 0;  	dev->addr_len		= 4; -	dev->features		|= NETIF_F_NETNS_LOCAL;  	dev->features		|= NETIF_F_LLTX;  	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE; + +	dev->features		|= IPIP_FEATURES; +	dev->hw_features	|= IPIP_FEATURES; +	ip_tunnel_setup(dev, ipip_net_id);  }  static int ipip_tunnel_init(struct net_device *dev)  {  	struct ip_tunnel *tunnel = netdev_priv(dev); -	tunnel->dev = dev; -	strcpy(tunnel->parms.name, dev->name); -  	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);  	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); -	ipip_tunnel_bind_dev(dev); - -	dev->tstats = alloc_percpu(struct pcpu_tstats); -	if (!dev->tstats) -		return -ENOMEM; - -	return 0; +	tunnel->hlen = 0; +	tunnel->parms.iph.protocol = IPPROTO_IPIP; +	return ip_tunnel_init(dev);  } -static int __net_init ipip_fb_tunnel_init(struct net_device *dev) +static void ipip_netlink_parms(struct nlattr *data[], +			       struct ip_tunnel_parm *parms)  { -	struct ip_tunnel *tunnel = netdev_priv(dev); -	struct iphdr *iph = &tunnel->parms.iph; -	struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id); +	memset(parms, 0, sizeof(*parms)); -	tunnel->dev = dev; -	
strcpy(tunnel->parms.name, dev->name); +	parms->iph.version = 4; +	parms->iph.protocol = IPPROTO_IPIP; +	parms->iph.ihl = 5; -	iph->version		= 4; -	iph->protocol		= IPPROTO_IPIP; -	iph->ihl		= 5; +	if (!data) +		return; -	dev->tstats = alloc_percpu(struct pcpu_tstats); -	if (!dev->tstats) -		return -ENOMEM; +	if (data[IFLA_IPTUN_LINK]) +		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); -	dev_hold(dev); -	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); -	return 0; -} +	if (data[IFLA_IPTUN_LOCAL]) +		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); -static struct xfrm_tunnel ipip_handler __read_mostly = { -	.handler	=	ipip_rcv, -	.err_handler	=	ipip_err, -	.priority	=	1, -}; +	if (data[IFLA_IPTUN_REMOTE]) +		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); -static const char banner[] __initconst = -	KERN_INFO "IPv4 over IPv4 tunneling driver\n"; +	if (data[IFLA_IPTUN_TTL]) { +		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); +		if (parms->iph.ttl) +			parms->iph.frag_off = htons(IP_DF); +	} -static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) +	if (data[IFLA_IPTUN_TOS]) +		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); + +	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) +		parms->iph.frag_off = htons(IP_DF); +} + +static int ipip_newlink(struct net *src_net, struct net_device *dev, +			struct nlattr *tb[], struct nlattr *data[])  { -	int prio; - -	for (prio = 1; prio < 4; prio++) { -		int h; -		for (h = 0; h < HASH_SIZE; h++) { -			struct ip_tunnel *t; - -			t = rtnl_dereference(ipn->tunnels[prio][h]); -			while (t != NULL) { -				unregister_netdevice_queue(t->dev, head); -				t = rtnl_dereference(t->next); -			} -		} -	} +	struct ip_tunnel_parm p; + +	ipip_netlink_parms(data, &p); +	return ip_tunnel_newlink(dev, tb, &p);  } -static int __net_init ipip_init_net(struct net *net) +static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], +			   struct nlattr *data[])  { -	struct ipip_net *ipn = net_generic(net, ipip_net_id); -	int err; +	struct ip_tunnel_parm p; -	ipn->tunnels[0] = ipn->tunnels_wc; -	ipn->tunnels[1] = ipn->tunnels_l; -	ipn->tunnels[2] = ipn->tunnels_r; -	ipn->tunnels[3] = ipn->tunnels_r_l; - -	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), -					   "tunl0", -					   ipip_tunnel_setup); -	if (!ipn->fb_tunnel_dev) { -		err = -ENOMEM; -		goto err_alloc_dev; -	} -	dev_net_set(ipn->fb_tunnel_dev, net); +	ipip_netlink_parms(data, &p); -	err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); -	if (err) -		goto err_reg_dev; +	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || +	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) +		return -EINVAL; + +	return ip_tunnel_changelink(dev, tb, &p); +} -	if ((err = register_netdev(ipn->fb_tunnel_dev))) -		goto err_reg_dev; +static size_t ipip_get_size(const struct net_device *dev) +{ +	return +		/* IFLA_IPTUN_LINK */ +		nla_total_size(4) + +		/* IFLA_IPTUN_LOCAL */ +		nla_total_size(4) + +		/* IFLA_IPTUN_REMOTE */ +		nla_total_size(4) + +		/* IFLA_IPTUN_TTL */ +		nla_total_size(1) + +		/* IFLA_IPTUN_TOS */ +		nla_total_size(1) + +		/* IFLA_IPTUN_PMTUDISC */ +		nla_total_size(1) + +		0; +} +static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ +	struct ip_tunnel *tunnel = netdev_priv(dev); +	struct ip_tunnel_parm *parm = &tunnel->parms; + +	if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || +	    nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || +	    nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || 
+	    nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || +	    nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || +	    nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, +		       !!(parm->iph.frag_off & htons(IP_DF)))) +		goto nla_put_failure;  	return 0; -err_reg_dev: -	ipip_dev_free(ipn->fb_tunnel_dev); -err_alloc_dev: -	/* nothing */ -	return err; +nla_put_failure: +	return -EMSGSIZE; +} + +static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { +	[IFLA_IPTUN_LINK]		= { .type = NLA_U32 }, +	[IFLA_IPTUN_LOCAL]		= { .type = NLA_U32 }, +	[IFLA_IPTUN_REMOTE]		= { .type = NLA_U32 }, +	[IFLA_IPTUN_TTL]		= { .type = NLA_U8 }, +	[IFLA_IPTUN_TOS]		= { .type = NLA_U8 }, +	[IFLA_IPTUN_PMTUDISC]		= { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ipip_link_ops __read_mostly = { +	.kind		= "ipip", +	.maxtype	= IFLA_IPTUN_MAX, +	.policy		= ipip_policy, +	.priv_size	= sizeof(struct ip_tunnel), +	.setup		= ipip_tunnel_setup, +	.newlink	= ipip_newlink, +	.changelink	= ipip_changelink, +	.dellink	= ip_tunnel_dellink, +	.get_size	= ipip_get_size, +	.fill_info	= ipip_fill_info, +}; + +static struct xfrm_tunnel ipip_handler __read_mostly = { +	.handler	=	ipip_rcv, +	.err_handler	=	ipip_err, +	.priority	=	1, +}; + +static int __net_init ipip_init_net(struct net *net) +{ +	return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");  }  static void __net_exit ipip_exit_net(struct net *net)  { -	struct ipip_net *ipn = net_generic(net, ipip_net_id); -	LIST_HEAD(list); - -	rtnl_lock(); -	ipip_destroy_tunnels(ipn, &list); -	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); -	unregister_netdevice_many(&list); -	rtnl_unlock(); +	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); +	ip_tunnel_delete_net(itn, &ipip_link_ops);  }  static struct pernet_operations ipip_net_ops = {  	.init = ipip_init_net,  	.exit = ipip_exit_net,  	.id   = &ipip_net_id, -	.size = sizeof(struct ipip_net), +	.size = sizeof(struct ip_tunnel_net),  };  static int __init ipip_init(void)  {  	int err; -	printk(banner); +	pr_info("ipip: IPv4 over IPv4 tunneling driver\n");  	err = register_pernet_device(&ipip_net_ops);  	if (err < 0)  		return err;  	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);  	if (err < 0) { -		unregister_pernet_device(&ipip_net_ops); -		printk(KERN_INFO "ipip init: can't register tunnel\n"); +		pr_info("%s: can't register tunnel\n", __func__); +		goto xfrm_tunnel_failed;  	} +	err = rtnl_link_register(&ipip_link_ops); +	if (err < 0) +		goto rtnl_link_failed; + +out:  	return err; + +rtnl_link_failed: +	xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +xfrm_tunnel_failed: +	unregister_pernet_device(&ipip_net_ops); +	goto out;  }  static void __exit ipip_fini(void)  { +	rtnl_link_unregister(&ipip_link_ops);  	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) -		printk(KERN_INFO "ipip close: can't deregister tunnel\n"); +		pr_info("%s: can't deregister tunnel\n", __func__);  	unregister_pernet_device(&ipip_net_ops);  } @@ -913,4 +486,5 @@ static void __exit ipip_fini(void)  module_init(ipip_init);  module_exit(ipip_fini);  MODULE_LICENSE("GPL"); -MODULE_ALIAS("tunl0"); +MODULE_ALIAS_RTNL_LINK("ipip"); +MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3f3a9afd73e..65bcaa78904 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -26,7 +26,6 @@   *   */ -#include <asm/system.h>  #include <asm/uaccess.h>  #include <linux/types.h>  #include <linux/capability.h> @@ -60,10 +59,13 @@  #include <linux/notifier.h>  #include <linux/if_arp.h>  #include 
<linux/netfilter_ipv4.h> -#include <net/ipip.h> +#include <linux/compat.h> +#include <linux/export.h> +#include <net/ip_tunnels.h>  #include <net/checksum.h>  #include <net/netlink.h>  #include <net/fib_rules.h> +#include <linux/netconf.h>  #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)  #define CONFIG_IP_PIMSM	1 @@ -82,8 +84,8 @@ struct mr_table {  	struct vif_device	vif_table[MAXVIFS];  	int			maxvif;  	atomic_t		cache_resolve_queue_len; -	int			mroute_do_assert; -	int			mroute_do_pim; +	bool			mroute_do_assert; +	bool			mroute_do_pim;  #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)  	int			mroute_reg_vif_num;  #endif @@ -123,13 +125,18 @@ static DEFINE_SPINLOCK(mfc_unres_lock);  static struct kmem_cache *mrt_cachep __read_mostly;  static struct mr_table *ipmr_new_table(struct net *net, u32 id); -static int ip_mr_forward(struct net *net, struct mr_table *mrt, -			 struct sk_buff *skb, struct mfc_cache *cache, -			 int local); +static void ipmr_free_table(struct mr_table *mrt); + +static void ip_mr_forward(struct net *net, struct mr_table *mrt, +			  struct sk_buff *skb, struct mfc_cache *cache, +			  int local);  static int ipmr_cache_report(struct mr_table *mrt,  			     struct sk_buff *pkt, vifi_t vifi, int assert);  static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,  			      struct mfc_cache *c, struct rtmsg *rtm); +static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, +				 int cmd); +static void mroute_clean_tables(struct mr_table *mrt);  static void ipmr_expire_process(unsigned long arg);  #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -147,14 +154,18 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)  	return NULL;  } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,  			   struct mr_table **mrt)  { -	struct ipmr_result res; -	struct fib_lookup_arg arg = { .result = &res, };  	int err; +	struct ipmr_result res; +	struct fib_lookup_arg arg = { +		.result = &res, +		.flags = FIB_LOOKUP_NOREF, +	}; -	err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); +	err = fib_rules_lookup(net->ipv4.mr_rules_ops, +			       flowi4_to_flowi(flp4), 0, &arg);  	if (err < 0)  		return err;  	*mrt = res.mrt; @@ -216,7 +227,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,  	return 0;  } -static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { +static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {  	.family		= RTNL_FAMILY_IPMR,  	.rule_size	= sizeof(struct ipmr_rule),  	.addr_size	= sizeof(u32), @@ -269,7 +280,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)  	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {  		list_del(&mrt->list); -		kfree(mrt); +		ipmr_free_table(mrt);  	}  	fib_rules_unregister(net->ipv4.mr_rules_ops);  } @@ -282,7 +293,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)  	return net->ipv4.mrt;  } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,  			   struct mr_table **mrt)  {  	*mrt = net->ipv4.mrt; @@ -297,7 +308,7 @@ static int __net_init ipmr_rules_init(struct net *net)  static void __net_exit ipmr_rules_exit(struct net *net)  { -	kfree(net->ipv4.mrt); +	ipmr_free_table(net->ipv4.mrt);  }  #endif @@ -334,6 +345,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)  	return mrt;  } +static 
void ipmr_free_table(struct mr_table *mrt) +{ +	del_timer_sync(&mrt->ipmr_expire_timer); +	mroute_clean_tables(mrt); +	kfree(mrt); +} +  /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */  static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) @@ -410,6 +428,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)  				goto failure;  			ipv4_devconf_setall(in_dev); +			neigh_parms_data_state_setall(in_dev->arp_parms);  			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;  			if (dev_open(dev)) @@ -434,14 +453,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)  {  	struct net *net = dev_net(dev);  	struct mr_table *mrt; -	struct flowi fl = { -		.oif		= dev->ifindex, -		.iif		= skb->skb_iif, -		.mark		= skb->mark, +	struct flowi4 fl4 = { +		.flowi4_oif	= dev->ifindex, +		.flowi4_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX, +		.flowi4_mark	= skb->mark,  	};  	int err; -	err = ipmr_fib_lookup(net, &fl, &mrt); +	err = ipmr_fib_lookup(net, &fl4, &mrt);  	if (err < 0) {  		kfree_skb(skb);  		return err; @@ -465,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev)  	dev->type		= ARPHRD_PIMREG;  	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;  	dev->flags		= IFF_NOARP; -	dev->netdev_ops		= &reg_vif_netdev_ops, +	dev->netdev_ops		= &reg_vif_netdev_ops;  	dev->destructor		= free_netdev;  	dev->features		|= NETIF_F_NETNS_LOCAL;  } @@ -502,6 +521,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)  	}  	ipv4_devconf_setall(in_dev); +	neigh_parms_data_state_setall(in_dev->arp_parms);  	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;  	rcu_read_unlock(); @@ -522,8 +542,8 @@ failure:  }  #endif -/* - *	Delete a VIF entry +/** + *	vif_delete - Delete a VIF entry   *	@notify: Set to 1, if the caller is a notifier_call   */ @@ -570,6 +590,9 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,  	in_dev = __in_dev_get_rtnl(dev);  	if (in_dev) {  		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; +		inet_netconf_notify_devconf(dev_net(dev), +					    NETCONFA_MC_FORWARDING, +					    dev->ifindex, &in_dev->cnf);  		ip_rt_multicast_event(in_dev);  	} @@ -608,13 +631,13 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)  		if (ip_hdr(skb)->version == 0) {  			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));  			nlh->nlmsg_type = NLMSG_ERROR; -			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); +			nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));  			skb_trim(skb, nlh->nlmsg_len); -			e = NLMSG_DATA(nlh); +			e = nlmsg_data(nlh);  			e->error = -ETIMEDOUT;  			memset(&e->msg, 0, sizeof(e->msg)); -			rtnl_unicast(skb, net, NETLINK_CB(skb).pid); +			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);  		} else {  			kfree_skb(skb);  		} @@ -653,6 +676,7 @@ static void ipmr_expire_process(unsigned long arg)  		}  		list_del(&c->list); +		mroute_netlink_event(mrt, c, RTM_DELROUTE);  		ipmr_destroy_unres(mrt, c);  	} @@ -760,6 +784,8 @@ static int vif_add(struct net *net, struct mr_table *mrt,  		return -EADDRNOTAVAIL;  	}  	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; +	inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex, +				    &in_dev->cnf);  	ip_rt_multicast_event(in_dev);  	/* Fill in the VIF structures */ @@ -807,6 +833,49 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,  	return NULL;  } +/* Look for a (*,*,oif) entry */ +static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table 
*mrt, +						    int vifi) +{ +	int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY)); +	struct mfc_cache *c; + +	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) +		if (c->mfc_origin == htonl(INADDR_ANY) && +		    c->mfc_mcastgrp == htonl(INADDR_ANY) && +		    c->mfc_un.res.ttls[vifi] < 255) +			return c; + +	return NULL; +} + +/* Look for a (*,G) entry */ +static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, +					     __be32 mcastgrp, int vifi) +{ +	int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY)); +	struct mfc_cache *c, *proxy; + +	if (mcastgrp == htonl(INADDR_ANY)) +		goto skip; + +	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) +		if (c->mfc_origin == htonl(INADDR_ANY) && +		    c->mfc_mcastgrp == mcastgrp) { +			if (c->mfc_un.res.ttls[vifi] < 255) +				return c; + +			/* It's ok if the vifi is part of the static tree */ +			proxy = ipmr_cache_find_any_parent(mrt, +							   c->mfc_parent); +			if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) +				return c; +		} + +skip: +	return ipmr_cache_find_any_parent(mrt, vifi); +} +  /*   *	Allocate a multicast cache entry   */ @@ -846,19 +915,19 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,  		if (ip_hdr(skb)->version == 0) {  			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); -			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { +			if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {  				nlh->nlmsg_len = skb_tail_pointer(skb) -  						 (u8 *)nlh;  			} else {  				nlh->nlmsg_type = NLMSG_ERROR; -				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); +				nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));  				skb_trim(skb, nlh->nlmsg_len); -				e = NLMSG_DATA(nlh); +				e = nlmsg_data(nlh);  				e->error = -EMSGSIZE;  				memset(&e->msg, 0, sizeof(e->msg));  			} -			rtnl_unicast(skb, net, NETLINK_CB(skb).pid); +			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);  		} else {  			ip_mr_forward(net, mrt, skb, c, 0);  		} @@ -916,7 +985,7 @@ static int ipmr_cache_report(struct mr_table *mrt,  	/* Copy the IP header */ -	skb->network_header = skb->tail; +	skb_set_network_header(skb, skb->len);  	skb_put(skb, ihl);  	skb_copy_to_linear_data(skb, pkt->data, ihl);  	ip_hdr(skb)->protocol = 0;	/* Flag to the kernel this is a route add */ @@ -947,8 +1016,7 @@ static int ipmr_cache_report(struct mr_table *mrt,  	ret = sock_queue_rcv_skb(mroute_sk, skb);  	rcu_read_unlock();  	if (ret < 0) { -		if (net_ratelimit()) -			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); +		net_warn_ratelimited("mroute: pending queue full, dropping entries\n");  		kfree_skb(skb);  	} @@ -1009,6 +1077,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)  		atomic_inc(&mrt->cache_resolve_queue_len);  		list_add(&c->list, &mrt->mfc_unres_queue); +		mroute_netlink_event(mrt, c, RTM_NEWROUTE);  		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)  			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); @@ -1032,7 +1101,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)   *	MFC cache manipulation by user space mroute daemon   */ -static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) +static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)  {  	int line;  	struct mfc_cache *c, *next; @@ -1041,9 +1110,10 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)  	list_for_each_entry_safe(c, next, 
&mrt->mfc_cache_array[line], list) {  		if (c->mfc_origin == mfc->mfcc_origin.s_addr && -		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { +		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && +		    (parent == -1 || parent == c->mfc_parent)) {  			list_del_rcu(&c->list); - +			mroute_netlink_event(mrt, c, RTM_DELROUTE);  			ipmr_cache_free(c);  			return 0;  		} @@ -1052,7 +1122,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)  }  static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, -			struct mfcctl *mfc, int mrtsock) +			struct mfcctl *mfc, int mrtsock, int parent)  {  	bool found = false;  	int line; @@ -1065,7 +1135,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,  	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {  		if (c->mfc_origin == mfc->mfcc_origin.s_addr && -		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { +		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && +		    (parent == -1 || parent == c->mfc_parent)) {  			found = true;  			break;  		} @@ -1078,10 +1149,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,  		if (!mrtsock)  			c->mfc_flags |= MFC_STATIC;  		write_unlock_bh(&mrt_lock); +		mroute_netlink_event(mrt, c, RTM_NEWROUTE);  		return 0;  	} -	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) +	if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) && +	    !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))  		return -EINVAL;  	c = ipmr_cache_alloc(); @@ -1120,6 +1193,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,  		ipmr_cache_resolve(net, mrt, uc, c);  		ipmr_cache_free(uc);  	} +	mroute_netlink_event(mrt, c, RTM_NEWROUTE);  	return 0;  } @@ -1148,6 +1222,7 @@ static void mroute_clean_tables(struct mr_table *mrt)  			if (c->mfc_flags & MFC_STATIC)  				continue;  			list_del_rcu(&c->list); +			mroute_netlink_event(mrt, c, RTM_DELROUTE);  			ipmr_cache_free(c);  		}  	} @@ -1156,6 +1231,7 @@ static void mroute_clean_tables(struct mr_table *mrt)  		spin_lock_bh(&mfc_unres_lock);  		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {  			list_del(&c->list); +			mroute_netlink_event(mrt, c, RTM_DELROUTE);  			ipmr_destroy_unres(mrt, c);  		}  		spin_unlock_bh(&mfc_unres_lock); @@ -1174,7 +1250,10 @@ static void mrtsock_destruct(struct sock *sk)  	ipmr_for_each_table(mrt, net) {  		if (sk == rtnl_dereference(mrt->mroute_sk)) {  			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; -			rcu_assign_pointer(mrt->mroute_sk, NULL); +			inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, +						    NETCONFA_IFINDEX_ALL, +						    net->ipv4.devconf_all); +			RCU_INIT_POINTER(mrt->mroute_sk, NULL);  			mroute_clean_tables(mrt);  		}  	} @@ -1190,29 +1269,30 @@ static void mrtsock_destruct(struct sock *sk)  int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)  { -	int ret; +	int ret, parent = 0;  	struct vifctl vif;  	struct mfcctl mfc;  	struct net *net = sock_net(sk);  	struct mr_table *mrt; +	if (sk->sk_type != SOCK_RAW || +	    inet_sk(sk)->inet_num != IPPROTO_IGMP) +		return -EOPNOTSUPP; +  	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT);  	if (mrt == NULL)  		return -ENOENT;  	if (optname != MRT_INIT) { -		if (sk != rcu_dereference_raw(mrt->mroute_sk) && -		    !capable(CAP_NET_ADMIN)) +		if (sk != rcu_access_pointer(mrt->mroute_sk) && +		    !ns_capable(net->user_ns, CAP_NET_ADMIN))  			return -EACCES;  	}  	switch (optname) {  	case MRT_INIT: -		if (sk->sk_type != SOCK_RAW || -		    inet_sk(sk)->inet_num != IPPROTO_IGMP) -			return -EOPNOTSUPP;  		if (optlen != sizeof(int)) -			return -ENOPROTOOPT; +			return -EINVAL;  		rtnl_lock();  		if (rtnl_dereference(mrt->mroute_sk)) { @@ -1224,11 +1304,14 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi  		if (ret == 0) {  			rcu_assign_pointer(mrt->mroute_sk, sk);  			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; +			inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, +						    NETCONFA_IFINDEX_ALL, +						    net->ipv4.devconf_all);  		}  		rtnl_unlock();  		return ret;  	case MRT_DONE: -		if (sk != rcu_dereference_raw(mrt->mroute_sk)) +		if (sk != rcu_access_pointer(mrt->mroute_sk))  			return -EACCES;  		return ip_ra_control(sk, 0, NULL);  	case MRT_ADD_VIF: @@ -1255,16 +1338,22 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi  		 */  	case MRT_ADD_MFC:  	case MRT_DEL_MFC: +		parent = -1; +	case MRT_ADD_MFC_PROXY: +	case MRT_DEL_MFC_PROXY:  		if (optlen != sizeof(mfc))  			return -EINVAL;  		if (copy_from_user(&mfc, optval, sizeof(mfc)))  			return -EFAULT; +		if (parent == 0) +			parent = mfc.mfcc_parent;  		rtnl_lock(); -		if (optname == MRT_DEL_MFC) -			ret = ipmr_mfc_delete(mrt, &mfc); +		if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) +			ret = ipmr_mfc_delete(mrt, &mfc, parent);  		else  			ret = ipmr_mfc_add(net, mrt, &mfc, -					   sk == rtnl_dereference(mrt->mroute_sk)); +					   sk == rtnl_dereference(mrt->mroute_sk), +					   parent);  		rtnl_unlock();  		return ret;  		/* @@ -1273,9 +1362,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi  	case MRT_ASSERT:  	{  		int v; +		if (optlen != sizeof(v)) +			return -EINVAL;  		if (get_user(v, (int __user *)optval))  			return -EFAULT; -		mrt->mroute_do_assert = (v) ? 1 : 0; +		mrt->mroute_do_assert = v;  		return 0;  	}  #ifdef CONFIG_IP_PIMSM @@ -1283,9 +1374,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi  	{  		int v; +		if (optlen != sizeof(v)) +			return -EINVAL;  		if (get_user(v, (int __user *)optval))  			return -EFAULT; -		v = (v) ? 1 : 0; +		v = !!v;  		rtnl_lock();  		ret = 0; @@ -1307,6 +1400,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi  		if (get_user(v, (u32 __user *)optval))  			return -EFAULT; +		/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ +		if (v != RT_TABLE_DEFAULT && v >= 1000000000) +			return -EINVAL; +  		rtnl_lock();  		ret = 0;  		if (sk == rtnl_dereference(mrt->mroute_sk)) { @@ -1314,7 +1411,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi  		} else {  			if (!ipmr_new_table(net, v))  				ret = -ENOMEM; -			raw_sk(sk)->ipmr_table = v; +			else +				raw_sk(sk)->ipmr_table = v;  		}  		rtnl_unlock();  		return ret; @@ -1340,6 +1438,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int  	struct net *net = sock_net(sk);  	struct mr_table *mrt; +	if (sk->sk_type != SOCK_RAW || +	    inet_sk(sk)->inet_num != IPPROTO_IGMP) +		return -EOPNOTSUPP; +  	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT);  	if (mrt == NULL)  		return -ENOENT; @@ -1434,15 +1536,89 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)  	}  } +#ifdef CONFIG_COMPAT +struct compat_sioc_sg_req { +	struct in_addr src; +	struct in_addr grp; +	compat_ulong_t pktcnt; +	compat_ulong_t bytecnt; +	compat_ulong_t wrong_if; +}; + +struct compat_sioc_vif_req { +	vifi_t	vifi;		/* Which iface */ +	compat_ulong_t icount; +	compat_ulong_t ocount; +	compat_ulong_t ibytes; +	compat_ulong_t obytes; +}; + +int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ +	struct compat_sioc_sg_req sr; +	struct compat_sioc_vif_req vr; +	struct vif_device *vif; +	struct mfc_cache *c; +	struct net *net = sock_net(sk); +	struct mr_table *mrt; + +	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); +	if (mrt == NULL) +		return -ENOENT; + +	switch (cmd) { +	case SIOCGETVIFCNT: +		if (copy_from_user(&vr, arg, sizeof(vr))) +			return -EFAULT; +		if (vr.vifi >= mrt->maxvif) +			return -EINVAL; +		read_lock(&mrt_lock); +		vif = &mrt->vif_table[vr.vifi]; +		if (VIF_EXISTS(mrt, vr.vifi)) { +			vr.icount = vif->pkt_in; +			vr.ocount = vif->pkt_out; +			vr.ibytes = vif->bytes_in; +			vr.obytes = vif->bytes_out; +			read_unlock(&mrt_lock); + +			if (copy_to_user(arg, &vr, sizeof(vr))) +				return -EFAULT; +			return 0; +		} +		read_unlock(&mrt_lock); +		return -EADDRNOTAVAIL; +	case SIOCGETSGCNT: +		if (copy_from_user(&sr, arg, sizeof(sr))) +			return -EFAULT; + +		rcu_read_lock(); +		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); +		if (c) { +			sr.pktcnt = c->mfc_un.res.pkt; +			sr.bytecnt = c->mfc_un.res.bytes; +			sr.wrong_if = c->mfc_un.res.wrong_if; +			rcu_read_unlock(); + +			if (copy_to_user(arg, &sr, sizeof(sr))) +				return -EFAULT; +			return 0; +		} +		rcu_read_unlock(); +		return -EADDRNOTAVAIL; +	default: +		return -ENOIOCTLCMD; +	} +} +#endif +  static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)  { -	struct net_device *dev = ptr; +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct net *net = dev_net(dev);  	struct mr_table *mrt;  	struct vif_device *v;  	int ct; -	LIST_HEAD(list);  	if (event != NETDEV_UNREGISTER)  		return NOTIFY_DONE; @@ -1451,10 +1627,9 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v  		v = &mrt->vif_table[0];  		for (ct = 0; ct < mrt->maxvif; ct++, v++) {  			if (v->dev == dev) -				vif_delete(mrt, ct, 1, &list); +				vif_delete(mrt, ct, 1, NULL);  		}  	} -	unregister_netdevice_many(&list);  	return NOTIFY_DONE;  } @@ -1472,7 +1647,7 @@ static struct notifier_block ip_mr_notifier = {  static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)  {  	struct iphdr *iph; -	struct iphdr *old_iph = ip_hdr(skb); +	const struct iphdr *old_iph = ip_hdr(skb);  	skb_push(skb, sizeof(struct iphdr));  	skb->transport_header = skb->network_header; @@ -1488,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)  	iph->protocol	=	IPPROTO_IPIP;  	iph->ihl	=	5;  	iph->tot_len	=	htons(skb->len); -	ip_select_ident(iph, skb_dst(skb), NULL); +	ip_select_ident(skb, NULL);  	ip_send_check(iph);  	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1500,6 +1675,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)  	struct ip_options *opt = &(IPCB(skb)->opt);  	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); +	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);  	if 
(unlikely(opt->optlen))  		ip_forward_options(skb); @@ -1518,6 +1694,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,  	struct vif_device *vif = &mrt->vif_table[vifi];  	struct net_device *dev;  	struct rtable *rt; +	struct flowi4 fl4;  	int    encap = 0;  	if (vif->dev == NULL) @@ -1535,26 +1712,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,  #endif  	if (vif->flags & VIFF_TUNNEL) { -		struct flowi fl = { -			.oif = vif->link, -			.fl4_dst = vif->remote, -			.fl4_src = vif->local, -			.fl4_tos = RT_TOS(iph->tos), -			.proto = IPPROTO_IPIP -		}; - -		if (ip_route_output_key(net, &rt, &fl)) +		rt = ip_route_output_ports(net, &fl4, NULL, +					   vif->remote, vif->local, +					   0, 0, +					   IPPROTO_IPIP, +					   RT_TOS(iph->tos), vif->link); +		if (IS_ERR(rt))  			goto out_free;  		encap = sizeof(struct iphdr);  	} else { -		struct flowi fl = { -			.oif = vif->link, -			.fl4_dst = iph->daddr, -			.fl4_tos = RT_TOS(iph->tos), -			.proto = IPPROTO_IPIP -		}; - -		if (ip_route_output_key(net, &rt, &fl)) +		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, +					   0, 0, +					   IPPROTO_IPIP, +					   RT_TOS(iph->tos), vif->link); +		if (IS_ERR(rt))  			goto out_free;  	} @@ -1629,23 +1800,34 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)  /* "local" means that we should preserve one skb (for local delivery) */ -static int ip_mr_forward(struct net *net, struct mr_table *mrt, -			 struct sk_buff *skb, struct mfc_cache *cache, -			 int local) +static void ip_mr_forward(struct net *net, struct mr_table *mrt, +			  struct sk_buff *skb, struct mfc_cache *cache, +			  int local)  {  	int psend = -1;  	int vif, ct; +	int true_vifi = ipmr_find_vif(mrt, skb->dev);  	vif = cache->mfc_parent;  	cache->mfc_un.res.pkt++;  	cache->mfc_un.res.bytes += skb->len; +	if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { +		struct mfc_cache *cache_proxy; + +		/* For an (*,G) entry, we only check that the incomming +		 * interface is part of the static tree. +		 */ +		cache_proxy = ipmr_cache_find_any_parent(mrt, vif); +		if (cache_proxy && +		    cache_proxy->mfc_un.res.ttls[true_vifi] < 255) +			goto forward; +	} +  	/*  	 * Wrong interface: drop packet and (maybe) send PIM assert.  	 */  	if (mrt->vif_table[vif].dev != skb->dev) { -		int true_vifi; -  		if (rt_is_output_route(skb_rtable(skb))) {  			/* It is our own packet, looped back.  			 * Very complicated situation... @@ -1662,7 +1844,6 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,  		}  		cache->mfc_un.res.wrong_if++; -		true_vifi = ipmr_find_vif(mrt, skb->dev);  		if (true_vifi >= 0 && mrt->mroute_do_assert &&  		    /* pimsm uses asserts, when switching from RPT to SPT, @@ -1680,15 +1861,34 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,  		goto dont_forward;  	} +forward:  	mrt->vif_table[vif].pkt_in++;  	mrt->vif_table[vif].bytes_in += skb->len;  	/*  	 *	Forward the frame  	 */ +	if (cache->mfc_origin == htonl(INADDR_ANY) && +	    cache->mfc_mcastgrp == htonl(INADDR_ANY)) { +		if (true_vifi >= 0 && +		    true_vifi != cache->mfc_parent && +		    ip_hdr(skb)->ttl > +				cache->mfc_un.res.ttls[cache->mfc_parent]) { +			/* It's an (*,*) entry and the packet is not coming from +			 * the upstream: forward the packet to the upstream +			 * only. 
+			 */ +			psend = cache->mfc_parent; +			goto last_forward; +		} +		goto dont_forward; +	}  	for (ct = cache->mfc_un.res.maxvif - 1;  	     ct >= cache->mfc_un.res.minvif; ct--) { -		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { +		/* For (*,G) entry, don't forward to the incoming interface */ +		if ((cache->mfc_origin != htonl(INADDR_ANY) || +		     ct != true_vifi) && +		    ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {  			if (psend != -1) {  				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -1699,6 +1899,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,  			psend = ct;  		}  	} +last_forward:  	if (psend != -1) {  		if (local) {  			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -1707,16 +1908,38 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,  				ipmr_queue_xmit(net, mrt, skb2, cache, psend);  		} else {  			ipmr_queue_xmit(net, mrt, skb, cache, psend); -			return 0; +			return;  		}  	}  dont_forward:  	if (!local)  		kfree_skb(skb); -	return 0;  } +static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) +{ +	struct rtable *rt = skb_rtable(skb); +	struct iphdr *iph = ip_hdr(skb); +	struct flowi4 fl4 = { +		.daddr = iph->daddr, +		.saddr = iph->saddr, +		.flowi4_tos = RT_TOS(iph->tos), +		.flowi4_oif = (rt_is_output_route(rt) ? +			       skb->dev->ifindex : 0), +		.flowi4_iif = (rt_is_output_route(rt) ? +			       LOOPBACK_IFINDEX : +			       skb->dev->ifindex), +		.flowi4_mark = skb->mark, +	}; +	struct mr_table *mrt; +	int err; + +	err = ipmr_fib_lookup(net, &fl4, &mrt); +	if (err) +		return ERR_PTR(err); +	return mrt; +}  /*   *	Multicast packets for forwarding arrive here @@ -1729,7 +1952,6 @@ int ip_mr_input(struct sk_buff *skb)  	struct net *net = dev_net(skb->dev);  	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;  	struct mr_table *mrt; -	int err;  	/* Packet is looped back after forward, it should not be  	 * forwarded second time, but still can be delivered locally. 
@@ -1737,12 +1959,11 @@ int ip_mr_input(struct sk_buff *skb)  	if (IPCB(skb)->flags & IPSKB_FORWARDED)  		goto dont_forward; -	err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); -	if (err < 0) { +	mrt = ipmr_rt_fib_lookup(net, skb); +	if (IS_ERR(mrt)) {  		kfree_skb(skb); -		return err; +		return PTR_ERR(mrt);  	} -  	if (!local) {  		if (IPCB(skb)->opt.router_alert) {  			if (ip_call_ra_chain(skb)) @@ -1767,6 +1988,13 @@ int ip_mr_input(struct sk_buff *skb)  	/* already under rcu_read_lock() */  	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); +	if (cache == NULL) { +		int vif = ipmr_find_vif(mrt, skb->dev); + +		if (vif >= 0) +			cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, +						    vif); +	}  	/*  	 *	No usable cache entry @@ -1844,9 +2072,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,  	skb_reset_network_header(skb);  	skb->protocol = htons(ETH_P_IP);  	skb->ip_summed = CHECKSUM_NONE; -	skb->pkt_type = PACKET_HOST; -	skb_tunnel_rx(skb, reg_dev); +	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));  	netif_rx(skb); @@ -1870,9 +2097,9 @@ int pim_rcv_v1(struct sk_buff *skb)  	pim = igmp_hdr(skb); -	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) +	mrt = ipmr_rt_fib_lookup(net, skb); +	if (IS_ERR(mrt))  		goto drop; -  	if (!mrt->mroute_do_pim ||  	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)  		goto drop; @@ -1902,9 +2129,9 @@ static int pim_rcv(struct sk_buff *skb)  	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))  		goto drop; -	if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) +	mrt = ipmr_rt_fib_lookup(net, skb); +	if (IS_ERR(mrt))  		goto drop; -  	if (__pim_rcv(mrt, skb, sizeof(*pim))) {  drop:  		kfree_skb(skb); @@ -1918,54 +2145,66 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,  {  	int ct;  	struct rtnexthop *nhp; -	u8 *b = skb_tail_pointer(skb); -	struct rtattr *mp_head; +	struct nlattr *mp_attr; +	struct rta_mfc_stats mfcs;  	/* If cache is unresolved, don't try to parse IIF and OIF */  	if (c->mfc_parent >= MAXVIFS)  		return -ENOENT; -	if (VIF_EXISTS(mrt, c->mfc_parent)) -		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); +	if (VIF_EXISTS(mrt, c->mfc_parent) && +	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) +		return -EMSGSIZE; -	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); +	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) +		return -EMSGSIZE;  	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {  		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { -			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) -				goto rtattr_failure; -			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); +			if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { +				nla_nest_cancel(skb, mp_attr); +				return -EMSGSIZE; +			} +  			nhp->rtnh_flags = 0;  			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];  			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;  			nhp->rtnh_len = sizeof(*nhp);  		}  	} -	mp_head->rta_type = RTA_MULTIPATH; -	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + +	nla_nest_end(skb, mp_attr); + +	mfcs.mfcs_packets = c->mfc_un.res.pkt; +	mfcs.mfcs_bytes = c->mfc_un.res.bytes; +	mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; +	if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0) +		return -EMSGSIZE; +  	rtm->rtm_type = RTN_MULTICAST;  	return 1; - -rtattr_failure: -	nlmsg_trim(skb, b); -	return -EMSGSIZE;  } -int 
ipmr_get_route(struct net *net, -		   struct sk_buff *skb, struct rtmsg *rtm, int nowait) +int ipmr_get_route(struct net *net, struct sk_buff *skb, +		   __be32 saddr, __be32 daddr, +		   struct rtmsg *rtm, int nowait)  { -	int err; -	struct mr_table *mrt;  	struct mfc_cache *cache; -	struct rtable *rt = skb_rtable(skb); +	struct mr_table *mrt; +	int err;  	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);  	if (mrt == NULL)  		return -ENOENT;  	rcu_read_lock(); -	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); +	cache = ipmr_cache_find(mrt, saddr, daddr); +	if (cache == NULL && skb->dev) { +		int vif = ipmr_find_vif(mrt, skb->dev); +		if (vif >= 0) +			cache = ipmr_cache_find_any(mrt, daddr, vif); +	}  	if (cache == NULL) {  		struct sk_buff *skb2;  		struct iphdr *iph; @@ -1997,8 +2236,8 @@ int ipmr_get_route(struct net *net,  		skb_reset_network_header(skb2);  		iph = ip_hdr(skb2);  		iph->ihl = sizeof(struct iphdr) >> 2; -		iph->saddr = rt->rt_src; -		iph->daddr = rt->rt_dst; +		iph->saddr = saddr; +		iph->daddr = daddr;  		iph->version = 0;  		err = ipmr_cache_unresolved(mrt, vif, skb2);  		read_unlock(&mrt_lock); @@ -2016,12 +2255,14 @@ int ipmr_get_route(struct net *net,  }  static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, -			    u32 pid, u32 seq, struct mfc_cache *c) +			    u32 portid, u32 seq, struct mfc_cache *c, int cmd, +			    int flags)  {  	struct nlmsghdr *nlh;  	struct rtmsg *rtm; +	int err; -	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); +	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);  	if (nlh == NULL)  		return -EMSGSIZE; @@ -2031,16 +2272,22 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,  	rtm->rtm_src_len  = 32;  	rtm->rtm_tos      = 0;  	rtm->rtm_table    = mrt->id; -	NLA_PUT_U32(skb, RTA_TABLE, mrt->id); +	if (nla_put_u32(skb, RTA_TABLE, mrt->id)) +		goto nla_put_failure;  	rtm->rtm_type     = RTN_MULTICAST;  	rtm->rtm_scope    = RT_SCOPE_UNIVERSE; -	rtm->rtm_protocol = RTPROT_UNSPEC; +	if (c->mfc_flags & MFC_STATIC) +		rtm->rtm_protocol = RTPROT_STATIC; +	else +		rtm->rtm_protocol = RTPROT_MROUTED;  	rtm->rtm_flags    = 0; -	NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin); -	NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp); - -	if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) +	if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || +	    nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) +		goto nla_put_failure; +	err = __ipmr_fill_mroute(mrt, skb, c, rtm); +	/* do not break the dump if cache is unresolved */ +	if (err < 0 && err != -ENOENT)  		goto nla_put_failure;  	return nlmsg_end(skb, nlh); @@ -2050,6 +2297,52 @@ nla_put_failure:  	return -EMSGSIZE;  } +static size_t mroute_msgsize(bool unresolved, int maxvif) +{ +	size_t len = +		NLMSG_ALIGN(sizeof(struct rtmsg)) +		+ nla_total_size(4)	/* RTA_TABLE */ +		+ nla_total_size(4)	/* RTA_SRC */ +		+ nla_total_size(4)	/* RTA_DST */ +		; + +	if (!unresolved) +		len = len +		      + nla_total_size(4)	/* RTA_IIF */ +		      + nla_total_size(0)	/* RTA_MULTIPATH */ +		      + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) +						/* RTA_MFC_STATS */ +		      + nla_total_size(sizeof(struct rta_mfc_stats)) +		; + +	return len; +} + +static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, +				 int cmd) +{ +	struct net *net = read_pnet(&mrt->net); +	struct sk_buff *skb; +	int err = -ENOBUFS; + +	skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), +			GFP_ATOMIC); +	if (skb == NULL) +		goto errout; + +	err = 
ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); +	if (err < 0) +		goto errout; + +	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC); +	return; + +errout: +	kfree_skb(skb); +	if (err < 0) +		rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); +} +  static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)  {  	struct net *net = sock_net(skb->sk); @@ -2074,15 +2367,33 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)  				if (e < s_e)  					goto next_entry;  				if (ipmr_fill_mroute(mrt, skb, -						     NETLINK_CB(cb->skb).pid, +						     NETLINK_CB(cb->skb).portid,  						     cb->nlh->nlmsg_seq, -						     mfc) < 0) +						     mfc, RTM_NEWROUTE, +						     NLM_F_MULTI) < 0)  					goto done;  next_entry:  				e++;  			}  			e = s_e = 0;  		} +		spin_lock_bh(&mfc_unres_lock); +		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { +			if (e < s_e) +				goto next_entry2; +			if (ipmr_fill_mroute(mrt, skb, +					     NETLINK_CB(cb->skb).portid, +					     cb->nlh->nlmsg_seq, +					     mfc, RTM_NEWROUTE, +					     NLM_F_MULTI) < 0) { +				spin_unlock_bh(&mfc_unres_lock); +				goto done; +			} +next_entry2: +			e++; +		} +		spin_unlock_bh(&mfc_unres_lock); +		e = s_e = 0;  		s_h = 0;  next_table:  		t++; @@ -2398,16 +2709,16 @@ static int __net_init ipmr_net_init(struct net *net)  #ifdef CONFIG_PROC_FS  	err = -ENOMEM; -	if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops)) +	if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))  		goto proc_vif_fail; -	if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops)) +	if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))  		goto proc_cache_fail;  #endif  	return 0;  #ifdef CONFIG_PROC_FS  proc_cache_fail: -	proc_net_remove(net, "ip_mr_vif"); +	remove_proc_entry("ip_mr_vif", net->proc_net);  proc_vif_fail:  	ipmr_rules_exit(net);  #endif @@ -2418,8 +2729,8 @@ fail:  static void __net_exit ipmr_net_exit(struct net *net)  {  #ifdef CONFIG_PROC_FS -	proc_net_remove(net, "ip_mr_cache"); -	proc_net_remove(net, "ip_mr_vif"); +	remove_proc_entry("ip_mr_cache", net->proc_net); +	remove_proc_entry("ip_mr_vif", net->proc_net);  #endif  	ipmr_rules_exit(net);  } @@ -2449,12 +2760,13 @@ int __init ip_mr_init(void)  		goto reg_notif_fail;  #ifdef CONFIG_IP_PIMSM_V2  	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { -		printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n"); +		pr_err("%s: can't add PIM protocol\n", __func__);  		err = -EAGAIN;  		goto add_proto_fail;  	}  #endif -	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute); +	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, +		      NULL, ipmr_rtm_dumproute, NULL);  	return 0;  #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 994a1f29ebb..7ebd6e37875 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -1,76 +1,67 @@ -/* IPv4 specific functions of netfilter core */ +/* + * IPv4 specific functions of netfilter core + * + * Rusty Russell (C) 2000 -- This code is GPL. 
+ * Patrick McHardy (C) 2006-2012 + */  #include <linux/kernel.h>  #include <linux/netfilter.h>  #include <linux/netfilter_ipv4.h>  #include <linux/ip.h>  #include <linux/skbuff.h>  #include <linux/gfp.h> +#include <linux/export.h>  #include <net/route.h>  #include <net/xfrm.h>  #include <net/ip.h>  #include <net/netfilter/nf_queue.h>  /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) +int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)  {  	struct net *net = dev_net(skb_dst(skb)->dev);  	const struct iphdr *iph = ip_hdr(skb);  	struct rtable *rt; -	struct flowi fl = {}; -	unsigned long orefdst; +	struct flowi4 fl4 = {}; +	__be32 saddr = iph->saddr; +	__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;  	unsigned int hh_len; -	unsigned int type; -	type = inet_addr_type(net, iph->saddr); -	if (skb->sk && inet_sk(skb->sk)->transparent) -		type = RTN_LOCAL;  	if (addr_type == RTN_UNSPEC) -		addr_type = type; +		addr_type = inet_addr_type(net, saddr); +	if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) +		flags |= FLOWI_FLAG_ANYSRC; +	else +		saddr = 0;  	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause  	 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.  	 */ -	if (addr_type == RTN_LOCAL) { -		fl.fl4_dst = iph->daddr; -		if (type == RTN_LOCAL) -			fl.fl4_src = iph->saddr; -		fl.fl4_tos = RT_TOS(iph->tos); -		fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; -		fl.mark = skb->mark; -		fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; -		if (ip_route_output_key(net, &rt, &fl) != 0) -			return -1; - -		/* Drop old route. */ -		skb_dst_drop(skb); -		skb_dst_set(skb, &rt->dst); -	} else { -		/* non-local src, find valid iif to satisfy -		 * rp-filter when calling ip_route_input. */ -		fl.fl4_dst = iph->saddr; -		if (ip_route_output_key(net, &rt, &fl) != 0) -			return -1; - -		orefdst = skb->_skb_refdst; -		if (ip_route_input(skb, iph->daddr, iph->saddr, -				   RT_TOS(iph->tos), rt->dst.dev) != 0) { -			dst_release(&rt->dst); -			return -1; -		} -		dst_release(&rt->dst); -		refdst_drop(orefdst); -	} +	fl4.daddr = iph->daddr; +	fl4.saddr = saddr; +	fl4.flowi4_tos = RT_TOS(iph->tos); +	fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; +	fl4.flowi4_mark = skb->mark; +	fl4.flowi4_flags = flags; +	rt = ip_route_output_key(net, &fl4); +	if (IS_ERR(rt)) +		return PTR_ERR(rt); + +	/* Drop old route. */ +	skb_dst_drop(skb); +	skb_dst_set(skb, &rt->dst);  	if (skb_dst(skb)->error) -		return -1; +		return skb_dst(skb)->error;  #ifdef CONFIG_XFRM  	if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && -	    xfrm_decode_session(skb, &fl, AF_INET) == 0) { +	    xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {  		struct dst_entry *dst = skb_dst(skb);  		skb_dst_set(skb, NULL); -		if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) -			return -1; +		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); +		if (IS_ERR(dst)) +			return PTR_ERR(dst);  		skb_dst_set(skb, dst);  	}  #endif @@ -78,49 +69,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)  	/* Change in oif may mean change in hh_len. 
*/  	hh_len = skb_dst(skb)->dev->hard_header_len;  	if (skb_headroom(skb) < hh_len && -	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) -		return -1; +	    pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), +				0, GFP_ATOMIC)) +		return -ENOMEM;  	return 0;  }  EXPORT_SYMBOL(ip_route_me_harder); -#ifdef CONFIG_XFRM -int ip_xfrm_me_harder(struct sk_buff *skb) -{ -	struct flowi fl; -	unsigned int hh_len; -	struct dst_entry *dst; - -	if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) -		return 0; -	if (xfrm_decode_session(skb, &fl, AF_INET) < 0) -		return -1; - -	dst = skb_dst(skb); -	if (dst->xfrm) -		dst = ((struct xfrm_dst *)dst)->route; -	dst_hold(dst); - -	if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) -		return -1; - -	skb_dst_drop(skb); -	skb_dst_set(skb, dst); - -	/* Change in oif may mean change in hh_len. */ -	hh_len = skb_dst(skb)->dev->hard_header_len; -	if (skb_headroom(skb) < hh_len && -	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) -		return -1; -	return 0; -} -EXPORT_SYMBOL(ip_xfrm_me_harder); -#endif - -void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *); -EXPORT_SYMBOL(ip_nat_decode_session); -  /*   * Extra routing may needed on local out, as the QUEUE target never   * returns control to the table. @@ -217,9 +173,14 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,  	return csum;  } -static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip_route(struct net *net, struct dst_entry **dst, +		       struct flowi *fl, bool strict __always_unused)  { -	return ip_route_output_key(&init_net, (struct rtable **)dst, fl); +	struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); +	if (IS_ERR(rt)) +		return PTR_ERR(rt); +	*dst = &rt->dst; +	return 0;  }  static const struct nf_afinfo nf_ip_afinfo = { @@ -232,25 +193,15 @@ static const struct nf_afinfo nf_ip_afinfo = {  	.route_key_size		= sizeof(struct ip_rt_info),  }; -static int ipv4_netfilter_init(void) +static int __init ipv4_netfilter_init(void)  {  	return nf_register_afinfo(&nf_ip_afinfo);  } -static void ipv4_netfilter_fini(void) +static void __exit ipv4_netfilter_fini(void)  {  	nf_unregister_afinfo(&nf_ip_afinfo);  }  module_init(ipv4_netfilter_init);  module_exit(ipv4_netfilter_fini); - -#ifdef CONFIG_SYSCTL -struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { -	{ .procname = "net", }, -	{ .procname = "ipv4", }, -	{ .procname = "netfilter", }, -	{ } -}; -EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); -#endif /* CONFIG_SYSCTL */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index babd1a2bae5..a26ce035e3f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -27,7 +27,7 @@ config NF_CONNTRACK_IPV4  config NF_CONNTRACK_PROC_COMPAT  	bool "proc/sysctl compatibility with old connection tracking" -	depends on NF_CONNTRACK_IPV4 +	depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4  	default y  	help  	  This option enables /proc and sysctl compatibility with the old @@ -36,18 +36,41 @@ config NF_CONNTRACK_PROC_COMPAT  	  If unsure, say Y. -config IP_NF_QUEUE -	tristate "IP Userspace queueing via NETLINK (OBSOLETE)" -	depends on NETFILTER_ADVANCED +config NF_TABLES_IPV4 +	depends on NF_TABLES +	tristate "IPv4 nf_tables support"  	help -	  Netfilter has the ability to queue packets to user space: the -	  netlink device can be used to access them using this driver. +	  This option enables the IPv4 support for nf_tables. 
-	  This option enables the old IPv4-only "ip_queue" implementation -	  which has been obsoleted by the new "nfnetlink_queue" code (see -	  CONFIG_NETFILTER_NETLINK_QUEUE). +config NFT_CHAIN_ROUTE_IPV4 +	depends on NF_TABLES_IPV4 +	tristate "IPv4 nf_tables route chain support" +	help +	  This option enables the "route" chain for IPv4 in nf_tables. This +	  chain type is used to force packet re-routing after mangling header +	  fields such as the source, destination, type of service and +	  the packet mark. + +config NFT_CHAIN_NAT_IPV4 +	depends on NF_TABLES_IPV4 +	depends on NF_NAT_IPV4 && NFT_NAT +	tristate "IPv4 nf_tables nat chain support" +	help +	  This option enables the "nat" chain for IPv4 in nf_tables. This +	  chain type is used to perform Network Address Translation (NAT) +	  packet transformations such as the source, destination address and +	  source and destination ports. + +config NFT_REJECT_IPV4 +	depends on NF_TABLES_IPV4 +	default NFT_REJECT +	tristate -	  To compile it as a module, choose M here.  If unsure, say N. +config NF_TABLES_ARP +	depends on NF_TABLES +	tristate "ARP nf_tables support" +	help +	  This option enables the ARP support for nf_tables.  config IP_NF_IPTABLES  	tristate "IP tables support (required for filtering/masq/NAT)" @@ -64,16 +87,6 @@ config IP_NF_IPTABLES  if IP_NF_IPTABLES  # The matches. -config IP_NF_MATCH_ADDRTYPE -	tristate '"addrtype" address type match support' -	depends on NETFILTER_ADVANCED -	help -	  This option allows you to match what routing thinks of an address, -	  eg. UNICAST, LOCAL, BROADCAST, ... - -	  If you want to compile it as a module, say M here and read -	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'. -  config IP_NF_MATCH_AH  	tristate '"ah" match support'  	depends on NETFILTER_ADVANCED @@ -86,11 +99,21 @@ config IP_NF_MATCH_AH  config IP_NF_MATCH_ECN  	tristate '"ecn" match support'  	depends on NETFILTER_ADVANCED -	help -	  This option adds a `ECN' match, which allows you to match against -	  the IPv4 and TCP header ECN fields. +	select NETFILTER_XT_MATCH_ECN +	---help--- +	This is a backwards-compat option for the user's convenience +	(e.g. when running oldconfig). It selects +	CONFIG_NETFILTER_XT_MATCH_ECN. + +config IP_NF_MATCH_RPFILTER +	tristate '"rpfilter" reverse path filter match support' +	depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW) +	---help--- +	  This option allows you to match packets whose replies would +	  go out via the interface the packet came in.  	  To compile it as a module, choose M here.  If unsure, say N. +	  The module will be called ipt_rpfilter.  config IP_NF_MATCH_TTL  	tristate '"ttl" match support' @@ -123,17 +146,21 @@ config IP_NF_TARGET_REJECT  	  To compile it as a module, choose M here.  If unsure, say N. -config IP_NF_TARGET_LOG -	tristate "LOG target support" -	default m if NETFILTER_ADVANCED=n +config IP_NF_TARGET_SYNPROXY +	tristate "SYNPROXY target support" +	depends on NF_CONNTRACK && NETFILTER_ADVANCED +	select NETFILTER_SYNPROXY +	select SYN_COOKIES  	help -	  This option adds a `LOG' target, which allows you to create rules in -	  any iptables table which records the packet header to the syslog. +	  The SYNPROXY target allows you to intercept TCP connections and +	  establish them using syncookies before they are passed on to the +	  server. This allows to avoid conntrack and server resource usage +	  during SYN-flood attacks. -	  To compile it as a module, choose M here.  If unsure, say N. +	  To compile it as a module, choose M here. 
If unsure, say N.  config IP_NF_TARGET_ULOG -	tristate "ULOG target support" +	tristate "ULOG target support (obsolete)"  	default m if NETFILTER_ADVANCED=n  	---help--- @@ -152,25 +179,22 @@ config IP_NF_TARGET_ULOG  	  To compile it as a module, choose M here.  If unsure, say N.  # NAT + specific targets: nf_conntrack -config NF_NAT -	tristate "Full NAT" +config NF_NAT_IPV4 +	tristate "IPv4 NAT"  	depends on NF_CONNTRACK_IPV4  	default m if NETFILTER_ADVANCED=n +	select NF_NAT  	help -	  The Full NAT option allows masquerading, port forwarding and other +	  The IPv4 NAT option allows masquerading, port forwarding and other  	  forms of full Network Address Port Translation.  It is controlled by  	  the `nat' table in iptables: see the man page for iptables(8).  	  To compile it as a module, choose M here.  If unsure, say N. -config NF_NAT_NEEDED -	bool -	depends on NF_NAT -	default y +if NF_NAT_IPV4  config IP_NF_TARGET_MASQUERADE  	tristate "MASQUERADE target support" -	depends on NF_NAT  	default m if NETFILTER_ADVANCED=n  	help  	  Masquerading is a special case of NAT: all outgoing connections are @@ -183,31 +207,29 @@ config IP_NF_TARGET_MASQUERADE  config IP_NF_TARGET_NETMAP  	tristate "NETMAP target support" -	depends on NF_NAT  	depends on NETFILTER_ADVANCED -	help -	  NETMAP is an implementation of static 1:1 NAT mapping of network -	  addresses. It maps the network address part, while keeping the host -	  address part intact. - -	  To compile it as a module, choose M here.  If unsure, say N. +	select NETFILTER_XT_TARGET_NETMAP +	---help--- +	This is a backwards-compat option for the user's convenience +	(e.g. when running oldconfig). It selects +	CONFIG_NETFILTER_XT_TARGET_NETMAP.  config IP_NF_TARGET_REDIRECT  	tristate "REDIRECT target support" -	depends on NF_NAT  	depends on NETFILTER_ADVANCED -	help -	  REDIRECT is a special case of NAT: all incoming connections are -	  mapped onto the incoming interface's address, causing the packets to -	  come to the local machine instead of passing through.  This is -	  useful for transparent proxies. +	select NETFILTER_XT_TARGET_REDIRECT +	---help--- +	This is a backwards-compat option for the user's convenience +	(e.g. when running oldconfig). It selects +	CONFIG_NETFILTER_XT_TARGET_REDIRECT. -	  To compile it as a module, choose M here.  If unsure, say N. +endif  config NF_NAT_SNMP_BASIC  	tristate "Basic SNMP-ALG support" -	depends on NF_NAT +	depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4  	depends on NETFILTER_ADVANCED +	default NF_NAT && NF_CONNTRACK_SNMP  	---help---  	  This module implements an Application Layer Gateway (ALG) for @@ -227,61 +249,21 @@ config NF_NAT_SNMP_BASIC  #           <expr> '&&' <expr>                   (6)  #  # (6) Returns the result of min(/expr/, /expr/). 
-config NF_NAT_PROTO_DCCP -	tristate -	depends on NF_NAT && NF_CT_PROTO_DCCP -	default NF_NAT && NF_CT_PROTO_DCCP  config NF_NAT_PROTO_GRE  	tristate -	depends on NF_NAT && NF_CT_PROTO_GRE - -config NF_NAT_PROTO_UDPLITE -	tristate -	depends on NF_NAT && NF_CT_PROTO_UDPLITE -	default NF_NAT && NF_CT_PROTO_UDPLITE - -config NF_NAT_PROTO_SCTP -	tristate -	default NF_NAT && NF_CT_PROTO_SCTP -	depends on NF_NAT && NF_CT_PROTO_SCTP -	select LIBCRC32C - -config NF_NAT_FTP -	tristate -	depends on NF_CONNTRACK && NF_NAT -	default NF_NAT && NF_CONNTRACK_FTP - -config NF_NAT_IRC -	tristate -	depends on NF_CONNTRACK && NF_NAT -	default NF_NAT && NF_CONNTRACK_IRC - -config NF_NAT_TFTP -	tristate -	depends on NF_CONNTRACK && NF_NAT -	default NF_NAT && NF_CONNTRACK_TFTP - -config NF_NAT_AMANDA -	tristate -	depends on NF_CONNTRACK && NF_NAT -	default NF_NAT && NF_CONNTRACK_AMANDA +	depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE  config NF_NAT_PPTP  	tristate -	depends on NF_CONNTRACK && NF_NAT -	default NF_NAT && NF_CONNTRACK_PPTP +	depends on NF_CONNTRACK && NF_NAT_IPV4 +	default NF_NAT_IPV4 && NF_CONNTRACK_PPTP  	select NF_NAT_PROTO_GRE  config NF_NAT_H323  	tristate -	depends on NF_CONNTRACK && NF_NAT -	default NF_NAT && NF_CONNTRACK_H323 - -config NF_NAT_SIP -	tristate -	depends on NF_CONNTRACK && NF_NAT -	default NF_NAT && NF_CONNTRACK_SIP +	depends on NF_CONNTRACK && NF_NAT_IPV4 +	default NF_NAT_IPV4 && NF_CONNTRACK_H323  # mangle + specific targets  config IP_NF_MANGLE @@ -295,8 +277,8 @@ config IP_NF_MANGLE  	  To compile it as a module, choose M here.  If unsure, say N.  config IP_NF_TARGET_CLUSTERIP -	tristate "CLUSTERIP target support (EXPERIMENTAL)" -	depends on IP_NF_MANGLE && EXPERIMENTAL +	tristate "CLUSTERIP target support" +	depends on IP_NF_MANGLE  	depends on NF_CONNTRACK_IPV4  	depends on NETFILTER_ADVANCED  	select NF_CONNTRACK_MARK @@ -334,7 +316,6 @@ config IP_NF_TARGET_TTL  # raw + specific targets  config IP_NF_RAW  	tristate  'raw table support (required for NOTRACK/TRACE)' -	depends on NETFILTER_ADVANCED  	help  	  This option adds a `raw' table to iptables. 
This table is the very  	  first in the netfilter framework and hooks in at the PREROUTING diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19eb59d0103..90b82405331 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -10,32 +10,28 @@ nf_conntrack_ipv4-objs	+= nf_conntrack_l3proto_ipv4_compat.o  endif  endif -nf_nat-y		:= nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o -iptable_nat-y	:= nf_nat_rule.o nf_nat_standalone.o -  # connection tracking  obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o -obj-$(CONFIG_NF_NAT) += nf_nat.o +nf_nat_ipv4-y		:= nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o +obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o  # defrag  obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o  # NAT helpers (nf_conntrack) -obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o -obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o  obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o -obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o  obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o -obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o  obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o  # NAT protocols (nf_nat) -obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o  obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o -obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o -obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o + +obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o +obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o +obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o  # generic IP tables   obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o @@ -43,23 +39,20 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o  # the three instances of ip_tables  obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o  obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_NF_NAT) += iptable_nat.o +obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o  obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o  obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o  # matches -obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o  obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o -obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o +obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o  # targets  obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o  obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o -obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o  obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o -obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o -obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o  obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o  obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o  # generic ARP tables @@ -68,6 +61,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o  # just filtering instance of ARP tables for now  obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o - -obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o - diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3fac340a28d..f95b6f93814 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -6,6 +6,7 @@   * Some ARP specific bits are:   *   * Copyright (C) 2002 David S. 
Miller (davem@redhat.com) + * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net>   *   */  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -76,7 +77,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,  }  /* - * Unfortunatly, _b and _mask are not aligned to an int (or long int) + * Unfortunately, _b and _mask are not aligned to an int (or long int)   * Some arches dont care, unrolling the loop is a win on them.   * For other arches, we only have a 16bit alignement.   */ @@ -221,9 +222,8 @@ static inline int arp_checkentry(const struct arpt_arp *arp)  static unsigned int  arpt_error(struct sk_buff *skb, const struct xt_action_param *par)  { -	if (net_ratelimit()) -		pr_err("arp_tables: error: '%s'\n", -		       (const char *)par->targinfo); +	net_err_ratelimited("arp_tables: error: '%s'\n", +			    (const char *)par->targinfo);  	return NF_DROP;  } @@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,  	void *table_base;  	const struct xt_table_info *private;  	struct xt_action_param acpar; +	unsigned int addend;  	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))  		return NF_DROP; @@ -267,8 +268,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,  	indev = in ? in->name : nulldevname;  	outdev = out ? out->name : nulldevname; -	xt_info_rdlock_bh(); +	local_bh_disable(); +	addend = xt_write_recseq_begin();  	private = table->private; +	/* +	 * Ensure we load private-> members after we've fetched the base +	 * pointer. +	 */ +	smp_read_barrier_depends();  	table_base = private->entries[smp_processor_id()];  	e = get_entry(table_base, private->hook_entry[hook]); @@ -301,7 +308,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,  			if (v < 0) {  				/* Pop from stack? */  				if (v != XT_RETURN) { -					verdict = (unsigned)(-v) - 1; +					verdict = (unsigned int)(-v) - 1;  					break;  				}  				e = back; @@ -338,7 +345,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,  			/* Verdict */  			break;  	} while (!acpar.hotdrop); -	xt_info_rdunlock_bh(); +	xt_write_recseq_end(addend); +	local_bh_enable();  	if (acpar.hotdrop)  		return NF_DROP; @@ -710,42 +718,25 @@ static void get_counters(const struct xt_table_info *t,  	struct arpt_entry *iter;  	unsigned int cpu;  	unsigned int i; -	unsigned int curcpu = get_cpu(); - -	/* Instead of clearing (by a previous call to memset()) -	 * the counters and using adds, we set the counters -	 * with data used by 'current' CPU -	 * -	 * Bottom half has to be disabled to prevent deadlock -	 * if new softirq were to run and call ipt_do_table -	 */ -	local_bh_disable(); -	i = 0; -	xt_entry_foreach(iter, t->entries[curcpu], t->size) { -		SET_COUNTER(counters[i], iter->counters.bcnt, -			    iter->counters.pcnt); -		++i; -	} -	local_bh_enable(); -	/* Processing counters from other cpus, we can let bottom half enabled, -	 * (preemption is disabled) -	 */  	for_each_possible_cpu(cpu) { -		if (cpu == curcpu) -			continue; +		seqcount_t *s = &per_cpu(xt_recseq, cpu); +  		i = 0; -		local_bh_disable(); -		xt_info_wrlock(cpu);  		xt_entry_foreach(iter, t->entries[cpu], t->size) { -			ADD_COUNTER(counters[i], iter->counters.bcnt, -				    iter->counters.pcnt); +			u64 bcnt, pcnt; +			unsigned int start; + +			do { +				start = read_seqcount_begin(s); +				bcnt = iter->counters.bcnt; +				pcnt = iter->counters.pcnt; +			} while (read_seqcount_retry(s, start)); + +			ADD_COUNTER(counters[i], bcnt, pcnt);  			++i;  		} -		xt_info_wrunlock(cpu); -		local_bh_enable();  	} -	put_cpu();  }  static struct xt_counters 
*alloc_counters(const struct xt_table *table) @@ -759,7 +750,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)  	 * about).  	 */  	countersize = sizeof(struct xt_counters) * private->number; -	counters = vmalloc(countersize); +	counters = vzalloc(countersize);  	if (counters == NULL)  		return ERR_PTR(-ENOMEM); @@ -883,6 +874,7 @@ static int compat_table_info(const struct xt_table_info *info,  	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));  	newinfo->initial_entries = 0;  	loc_cpu_entry = info->entries[raw_smp_processor_id()]; +	xt_compat_init_offsets(NFPROTO_ARP, info->number);  	xt_entry_foreach(iter, loc_cpu_entry, info->size) {  		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);  		if (ret != 0) @@ -915,7 +907,7 @@ static int get_info(struct net *net, void __user *user,  #endif  	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),  				    "arptable_%s", name); -	if (t && !IS_ERR(t)) { +	if (!IS_ERR_OR_NULL(t)) {  		struct arpt_getinfo info;  		const struct xt_table_info *private = t->private;  #ifdef CONFIG_COMPAT @@ -972,7 +964,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,  	}  	t = xt_find_table_lock(net, NFPROTO_ARP, get.name); -	if (t && !IS_ERR(t)) { +	if (!IS_ERR_OR_NULL(t)) {  		const struct xt_table_info *private = t->private;  		duprintf("t->private->number = %u\n", @@ -1007,7 +999,7 @@ static int __do_replace(struct net *net, const char *name,  	struct arpt_entry *iter;  	ret = 0; -	counters = vmalloc(num_counters * sizeof(struct xt_counters)); +	counters = vzalloc(num_counters * sizeof(struct xt_counters));  	if (!counters) {  		ret = -ENOMEM;  		goto out; @@ -1015,7 +1007,7 @@ static int __do_replace(struct net *net, const char *name,  	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),  				    "arptable_%s", name); -	if (!t || IS_ERR(t)) { +	if (IS_ERR_OR_NULL(t)) {  		ret = t ? PTR_ERR(t) : -ENOENT;  		goto free_newinfo_counters_untrans;  	} @@ -1052,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,  	xt_free_table_info(oldinfo);  	if (copy_to_user(counters_ptr, counters, -			 sizeof(struct xt_counters) * num_counters) != 0) -		ret = -EFAULT; +			 sizeof(struct xt_counters) * num_counters) != 0) { +		/* Silent error, can't fail, new table is already in place */ +		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); +	}  	vfree(counters);  	xt_table_unlock(t);  	return ret; @@ -1082,6 +1076,7 @@ static int do_replace(struct net *net, const void __user *user,  	/* overflow check */  	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))  		return -ENOMEM; +	tmp.name[sizeof(tmp.name)-1] = 0;  	newinfo = xt_alloc_table_info(tmp.size);  	if (!newinfo) @@ -1130,6 +1125,7 @@ static int do_add_counters(struct net *net, const void __user *user,  	int ret = 0;  	void *loc_cpu_entry;  	struct arpt_entry *iter; +	unsigned int addend;  #ifdef CONFIG_COMPAT  	struct compat_xt_counters_info compat_tmp; @@ -1170,7 +1166,7 @@ static int do_add_counters(struct net *net, const void __user *user,  	}  	t = xt_find_table_lock(net, NFPROTO_ARP, name); -	if (!t || IS_ERR(t)) { +	if (IS_ERR_OR_NULL(t)) {  		ret = t ? 
PTR_ERR(t) : -ENOENT;  		goto free;  	} @@ -1186,12 +1182,12 @@ static int do_add_counters(struct net *net, const void __user *user,  	/* Choose the copy that is on our node */  	curcpu = smp_processor_id();  	loc_cpu_entry = private->entries[curcpu]; -	xt_info_wrlock(curcpu); +	addend = xt_write_recseq_begin();  	xt_entry_foreach(iter, loc_cpu_entry, private->size) {  		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);  		++i;  	} -	xt_info_wrunlock(curcpu); +	xt_write_recseq_end(addend);   unlock_up_free:  	local_bh_enable();  	xt_table_unlock(t); @@ -1350,6 +1346,7 @@ static int translate_compat_table(const char *name,  	duprintf("translate_compat_table: size %u\n", info->size);  	j = 0;  	xt_compat_lock(NFPROTO_ARP); +	xt_compat_init_offsets(NFPROTO_ARP, number);  	/* Walk through entries, checking offsets. */  	xt_entry_foreach(iter0, entry0, total_size) {  		ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1503,6 +1500,7 @@ static int compat_do_replace(struct net *net, void __user *user,  		return -ENOMEM;  	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))  		return -ENOMEM; +	tmp.name[sizeof(tmp.name)-1] = 0;  	newinfo = xt_alloc_table_info(tmp.size);  	if (!newinfo) @@ -1543,7 +1541,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -1656,7 +1654,7 @@ static int compat_get_entries(struct net *net,  	xt_compat_lock(NFPROTO_ARP);  	t = xt_find_table_lock(net, NFPROTO_ARP, get.name); -	if (t && !IS_ERR(t)) { +	if (!IS_ERR_OR_NULL(t)) {  		const struct xt_table_info *private = t->private;  		struct xt_table_info info; @@ -1687,7 +1685,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -1708,7 +1706,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -1732,7 +1730,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -1755,6 +1753,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len  			ret = -EFAULT;  			break;  		} +		rev.name[sizeof(rev.name)-1] = 0;  		try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,  							 rev.revision, 1, &ret), @@ -1886,7 +1885,7 @@ static int __init arp_tables_init(void)  	if (ret < 0)  		goto err1; -	/* Noone else will be downing sem now, so we won't sleep */ +	/* No one else will be downing sem now, so we won't sleep */  	ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));  	if (ret < 0)  		goto err2; diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index b8ddcc480ed..a5e52a9f0a1 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par)  	if (mangle->flags & ~ARPT_MANGLE_MASK ||  	    !(mangle->flags & ARPT_MANGLE_MASK)) -		return false; +		return -EINVAL;  	if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&  	   mangle->target != 
XT_CONTINUE) -		return false; -	return true; +		return -EINVAL; +	return 0;  }  static struct xt_target arpt_mangle_reg __read_mostly = { diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 79ca5e70d49..802ddecb30b 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -27,13 +27,14 @@ static const struct xt_table packet_filter = {  /* The work comes in here from netfilter.c */  static unsigned int -arptable_filter_hook(unsigned int hook, struct sk_buff *skb, +arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		     const struct net_device *in, const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  {  	const struct net *net = dev_net((in != NULL) ? in : out); -	return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); +	return arpt_do_table(skb, ops->hooknum, in, out, +			     net->ipv4.arptable_filter);  }  static struct nf_hook_ops *arpfilter_ops __read_mostly; @@ -48,9 +49,7 @@ static int __net_init arptable_filter_net_init(struct net *net)  	net->ipv4.arptable_filter =  		arpt_register_table(net, &packet_filter, repl);  	kfree(repl); -	if (IS_ERR(net->ipv4.arptable_filter)) -		return PTR_ERR(net->ipv4.arptable_filter); -	return 0; +	return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);  }  static void __net_exit arptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c deleted file mode 100644 index d2c1311cb28..00000000000 --- a/net/ipv4/netfilter/ip_queue.c +++ /dev/null @@ -1,637 +0,0 @@ -/* - * This is a module which is used for queueing IPv4 packets and - * communicating with userspace via netlink. - * - * (C) 2000-2002 James Morris <jmorris@intercode.com.au> - * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4/ip_queue.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netlink.h> -#include <linux/spinlock.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/security.h> -#include <linux/net.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <net/net_namespace.h> -#include <net/sock.h> -#include <net/route.h> -#include <net/netfilter/nf_queue.h> -#include <net/ip.h> - -#define IPQ_QMAX_DEFAULT 1024 -#define IPQ_PROC_FS_NAME "ip_queue" -#define NET_IPQ_QMAX 2088 -#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" - -typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); - -static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; -static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; -static DEFINE_SPINLOCK(queue_lock); -static int peer_pid __read_mostly; -static unsigned int copy_range __read_mostly; -static unsigned int queue_total; -static unsigned int queue_dropped = 0; -static unsigned int queue_user_dropped = 0; -static struct sock *ipqnl __read_mostly; -static LIST_HEAD(queue_list); -static DEFINE_MUTEX(ipqnl_mutex); - -static inline void -__ipq_enqueue_entry(struct nf_queue_entry *entry) -{ -       list_add_tail(&entry->list, &queue_list); -       queue_total++; -} - -static inline int -__ipq_set_mode(unsigned char mode, unsigned int range) -{ -	int status = 0; - -	switch(mode) { -	case IPQ_COPY_NONE: -	case IPQ_COPY_META: -		copy_mode = mode; -		copy_range = 0; -		break; - -	case IPQ_COPY_PACKET: -		if (range > 0xFFFF) -			range = 0xFFFF; -		copy_range = range; -		copy_mode = mode; -		break; - -	default: -		status = -EINVAL; - -	} -	return status; -} - -static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); - -static inline void -__ipq_reset(void) -{ -	peer_pid = 0; -	net_disable_timestamp(); -	__ipq_set_mode(IPQ_COPY_NONE, 0); -	__ipq_flush(NULL, 0); -} - -static struct nf_queue_entry * -ipq_find_dequeue_entry(unsigned long id) -{ -	struct nf_queue_entry *entry = NULL, *i; - -	spin_lock_bh(&queue_lock); - -	list_for_each_entry(i, &queue_list, list) { -		if ((unsigned long)i == id) { -			entry = i; -			break; -		} -	} - -	if (entry) { -		list_del(&entry->list); -		queue_total--; -	} - -	spin_unlock_bh(&queue_lock); -	return entry; -} - -static void -__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ -	struct nf_queue_entry *entry, *next; - -	list_for_each_entry_safe(entry, next, &queue_list, list) { -		if (!cmpfn || cmpfn(entry, data)) { -			list_del(&entry->list); -			queue_total--; -			nf_reinject(entry, NF_DROP); -		} -	} -} - -static void -ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ -	spin_lock_bh(&queue_lock); -	__ipq_flush(cmpfn, data); -	spin_unlock_bh(&queue_lock); -} - -static struct sk_buff * -ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) -{ -	sk_buff_data_t old_tail; -	size_t size = 0; -	size_t data_len = 0; -	struct sk_buff *skb; -	struct ipq_packet_msg *pmsg; -	struct nlmsghdr *nlh; -	struct timeval tv; - -	switch (ACCESS_ONCE(copy_mode)) { -	case IPQ_COPY_META: -	case IPQ_COPY_NONE: -		size = NLMSG_SPACE(sizeof(*pmsg)); -		break; - -	case IPQ_COPY_PACKET: -		if (entry->skb->ip_summed == CHECKSUM_PARTIAL && -		    (*errp = skb_checksum_help(entry->skb))) -			return NULL; - -		data_len = ACCESS_ONCE(copy_range); -		if 
(data_len == 0 || data_len > entry->skb->len) -			data_len = entry->skb->len; - -		size = NLMSG_SPACE(sizeof(*pmsg) + data_len); -		break; - -	default: -		*errp = -EINVAL; -		return NULL; -	} - -	skb = alloc_skb(size, GFP_ATOMIC); -	if (!skb) -		goto nlmsg_failure; - -	old_tail = skb->tail; -	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); -	pmsg = NLMSG_DATA(nlh); -	memset(pmsg, 0, sizeof(*pmsg)); - -	pmsg->packet_id       = (unsigned long )entry; -	pmsg->data_len        = data_len; -	tv = ktime_to_timeval(entry->skb->tstamp); -	pmsg->timestamp_sec   = tv.tv_sec; -	pmsg->timestamp_usec  = tv.tv_usec; -	pmsg->mark            = entry->skb->mark; -	pmsg->hook            = entry->hook; -	pmsg->hw_protocol     = entry->skb->protocol; - -	if (entry->indev) -		strcpy(pmsg->indev_name, entry->indev->name); -	else -		pmsg->indev_name[0] = '\0'; - -	if (entry->outdev) -		strcpy(pmsg->outdev_name, entry->outdev->name); -	else -		pmsg->outdev_name[0] = '\0'; - -	if (entry->indev && entry->skb->dev) { -		pmsg->hw_type = entry->skb->dev->type; -		pmsg->hw_addrlen = dev_parse_header(entry->skb, -						    pmsg->hw_addr); -	} - -	if (data_len) -		if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) -			BUG(); - -	nlh->nlmsg_len = skb->tail - old_tail; -	return skb; - -nlmsg_failure: -	*errp = -EINVAL; -	printk(KERN_ERR "ip_queue: error creating packet message\n"); -	return NULL; -} - -static int -ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) -{ -	int status = -EINVAL; -	struct sk_buff *nskb; - -	if (copy_mode == IPQ_COPY_NONE) -		return -EAGAIN; - -	nskb = ipq_build_packet_message(entry, &status); -	if (nskb == NULL) -		return status; - -	spin_lock_bh(&queue_lock); - -	if (!peer_pid) -		goto err_out_free_nskb; - -	if (queue_total >= queue_maxlen) { -		queue_dropped++; -		status = -ENOSPC; -		if (net_ratelimit()) -			  printk (KERN_WARNING "ip_queue: full at %d entries, " -				  "dropping packets(s). 
Dropped: %d\n", queue_total, -				  queue_dropped); -		goto err_out_free_nskb; -	} - -	/* netlink_unicast will either free the nskb or attach it to a socket */ -	status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); -	if (status < 0) { -		queue_user_dropped++; -		goto err_out_unlock; -	} - -	__ipq_enqueue_entry(entry); - -	spin_unlock_bh(&queue_lock); -	return status; - -err_out_free_nskb: -	kfree_skb(nskb); - -err_out_unlock: -	spin_unlock_bh(&queue_lock); -	return status; -} - -static int -ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e) -{ -	int diff; -	struct iphdr *user_iph = (struct iphdr *)v->payload; -	struct sk_buff *nskb; - -	if (v->data_len < sizeof(*user_iph)) -		return 0; -	diff = v->data_len - e->skb->len; -	if (diff < 0) { -		if (pskb_trim(e->skb, v->data_len)) -			return -ENOMEM; -	} else if (diff > 0) { -		if (v->data_len > 0xFFFF) -			return -EINVAL; -		if (diff > skb_tailroom(e->skb)) { -			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), -					       diff, GFP_ATOMIC); -			if (!nskb) { -				printk(KERN_WARNING "ip_queue: error " -				      "in mangle, dropping packet\n"); -				return -ENOMEM; -			} -			kfree_skb(e->skb); -			e->skb = nskb; -		} -		skb_put(e->skb, diff); -	} -	if (!skb_make_writable(e->skb, v->data_len)) -		return -ENOMEM; -	skb_copy_to_linear_data(e->skb, v->payload, v->data_len); -	e->skb->ip_summed = CHECKSUM_NONE; - -	return 0; -} - -static int -ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) -{ -	struct nf_queue_entry *entry; - -	if (vmsg->value > NF_MAX_VERDICT) -		return -EINVAL; - -	entry = ipq_find_dequeue_entry(vmsg->id); -	if (entry == NULL) -		return -ENOENT; -	else { -		int verdict = vmsg->value; - -		if (vmsg->data_len && vmsg->data_len == len) -			if (ipq_mangle_ipv4(vmsg, entry) < 0) -				verdict = NF_DROP; - -		nf_reinject(entry, verdict); -		return 0; -	} -} - -static int -ipq_set_mode(unsigned char mode, unsigned int range) -{ -	int status; - -	spin_lock_bh(&queue_lock); -	status = __ipq_set_mode(mode, range); -	spin_unlock_bh(&queue_lock); -	return status; -} - -static int -ipq_receive_peer(struct ipq_peer_msg *pmsg, -		 unsigned char type, unsigned int len) -{ -	int status = 0; - -	if (len < sizeof(*pmsg)) -		return -EINVAL; - -	switch (type) { -	case IPQM_MODE: -		status = ipq_set_mode(pmsg->msg.mode.value, -				      pmsg->msg.mode.range); -		break; - -	case IPQM_VERDICT: -		if (pmsg->msg.verdict.value > NF_MAX_VERDICT) -			status = -EINVAL; -		else -			status = ipq_set_verdict(&pmsg->msg.verdict, -						 len - sizeof(*pmsg)); -			break; -	default: -		status = -EINVAL; -	} -	return status; -} - -static int -dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) -{ -	if (entry->indev) -		if (entry->indev->ifindex == ifindex) -			return 1; -	if (entry->outdev) -		if (entry->outdev->ifindex == ifindex) -			return 1; -#ifdef CONFIG_BRIDGE_NETFILTER -	if (entry->skb->nf_bridge) { -		if (entry->skb->nf_bridge->physindev && -		    entry->skb->nf_bridge->physindev->ifindex == ifindex) -			return 1; -		if (entry->skb->nf_bridge->physoutdev && -		    entry->skb->nf_bridge->physoutdev->ifindex == ifindex) -			return 1; -	} -#endif -	return 0; -} - -static void -ipq_dev_drop(int ifindex) -{ -	ipq_flush(dev_cmp, ifindex); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static inline void -__ipq_rcv_skb(struct sk_buff *skb) -{ -	int status, type, pid, flags, nlmsglen, skblen; -	struct nlmsghdr *nlh; - -	skblen = skb->len; -	if (skblen < sizeof(*nlh)) -		
return; - -	nlh = nlmsg_hdr(skb); -	nlmsglen = nlh->nlmsg_len; -	if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) -		return; - -	pid = nlh->nlmsg_pid; -	flags = nlh->nlmsg_flags; - -	if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) -		RCV_SKB_FAIL(-EINVAL); - -	if (flags & MSG_TRUNC) -		RCV_SKB_FAIL(-ECOMM); - -	type = nlh->nlmsg_type; -	if (type < NLMSG_NOOP || type >= IPQM_MAX) -		RCV_SKB_FAIL(-EINVAL); - -	if (type <= IPQM_BASE) -		return; - -	if (security_netlink_recv(skb, CAP_NET_ADMIN)) -		RCV_SKB_FAIL(-EPERM); - -	spin_lock_bh(&queue_lock); - -	if (peer_pid) { -		if (peer_pid != pid) { -			spin_unlock_bh(&queue_lock); -			RCV_SKB_FAIL(-EBUSY); -		} -	} else { -		net_enable_timestamp(); -		peer_pid = pid; -	} - -	spin_unlock_bh(&queue_lock); - -	status = ipq_receive_peer(NLMSG_DATA(nlh), type, -				  nlmsglen - NLMSG_LENGTH(0)); -	if (status < 0) -		RCV_SKB_FAIL(status); - -	if (flags & NLM_F_ACK) -		netlink_ack(skb, nlh, 0); -} - -static void -ipq_rcv_skb(struct sk_buff *skb) -{ -	mutex_lock(&ipqnl_mutex); -	__ipq_rcv_skb(skb); -	mutex_unlock(&ipqnl_mutex); -} - -static int -ipq_rcv_dev_event(struct notifier_block *this, -		  unsigned long event, void *ptr) -{ -	struct net_device *dev = ptr; - -	if (!net_eq(dev_net(dev), &init_net)) -		return NOTIFY_DONE; - -	/* Drop any packets associated with the downed device */ -	if (event == NETDEV_DOWN) -		ipq_dev_drop(dev->ifindex); -	return NOTIFY_DONE; -} - -static struct notifier_block ipq_dev_notifier = { -	.notifier_call	= ipq_rcv_dev_event, -}; - -static int -ipq_rcv_nl_event(struct notifier_block *this, -		 unsigned long event, void *ptr) -{ -	struct netlink_notify *n = ptr; - -	if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { -		spin_lock_bh(&queue_lock); -		if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) -			__ipq_reset(); -		spin_unlock_bh(&queue_lock); -	} -	return NOTIFY_DONE; -} - -static struct notifier_block ipq_nl_notifier = { -	.notifier_call	= ipq_rcv_nl_event, -}; - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *ipq_sysctl_header; - -static ctl_table ipq_table[] = { -	{ -		.procname	= NET_IPQ_QMAX_NAME, -		.data		= &queue_maxlen, -		.maxlen		= sizeof(queue_maxlen), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{ } -}; -#endif - -#ifdef CONFIG_PROC_FS -static int ip_queue_show(struct seq_file *m, void *v) -{ -	spin_lock_bh(&queue_lock); - -	seq_printf(m, -		      "Peer PID          : %d\n" -		      "Copy mode         : %hu\n" -		      "Copy range        : %u\n" -		      "Queue length      : %u\n" -		      "Queue max. 
length : %u\n" -		      "Queue dropped     : %u\n" -		      "Netlink dropped   : %u\n", -		      peer_pid, -		      copy_mode, -		      copy_range, -		      queue_total, -		      queue_maxlen, -		      queue_dropped, -		      queue_user_dropped); - -	spin_unlock_bh(&queue_lock); -	return 0; -} - -static int ip_queue_open(struct inode *inode, struct file *file) -{ -	return single_open(file, ip_queue_show, NULL); -} - -static const struct file_operations ip_queue_proc_fops = { -	.open		= ip_queue_open, -	.read		= seq_read, -	.llseek		= seq_lseek, -	.release	= single_release, -	.owner		= THIS_MODULE, -}; -#endif - -static const struct nf_queue_handler nfqh = { -	.name	= "ip_queue", -	.outfn	= &ipq_enqueue_packet, -}; - -static int __init ip_queue_init(void) -{ -	int status = -ENOMEM; -	struct proc_dir_entry *proc __maybe_unused; - -	netlink_register_notifier(&ipq_nl_notifier); -	ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, -				      ipq_rcv_skb, NULL, THIS_MODULE); -	if (ipqnl == NULL) { -		printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); -		goto cleanup_netlink_notifier; -	} - -#ifdef CONFIG_PROC_FS -	proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, -			   &ip_queue_proc_fops); -	if (!proc) { -		printk(KERN_ERR "ip_queue: failed to create proc entry\n"); -		goto cleanup_ipqnl; -	} -#endif -	register_netdevice_notifier(&ipq_dev_notifier); -#ifdef CONFIG_SYSCTL -	ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table); -#endif -	status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh); -	if (status < 0) { -		printk(KERN_ERR "ip_queue: failed to register queue handler\n"); -		goto cleanup_sysctl; -	} -	return status; - -cleanup_sysctl: -#ifdef CONFIG_SYSCTL -	unregister_sysctl_table(ipq_sysctl_header); -#endif -	unregister_netdevice_notifier(&ipq_dev_notifier); -	proc_net_remove(&init_net, IPQ_PROC_FS_NAME); -cleanup_ipqnl: __maybe_unused -	netlink_kernel_release(ipqnl); -	mutex_lock(&ipqnl_mutex); -	mutex_unlock(&ipqnl_mutex); - -cleanup_netlink_notifier: -	netlink_unregister_notifier(&ipq_nl_notifier); -	return status; -} - -static void __exit ip_queue_fini(void) -{ -	nf_unregister_queue_handlers(&nfqh); - -	ipq_flush(NULL, 0); - -#ifdef CONFIG_SYSCTL -	unregister_sysctl_table(ipq_sysctl_header); -#endif -	unregister_netdevice_notifier(&ipq_dev_notifier); -	proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - -	netlink_kernel_release(ipqnl); -	mutex_lock(&ipqnl_mutex); -	mutex_unlock(&ipqnl_mutex); - -	netlink_unregister_notifier(&ipq_nl_notifier); -} - -MODULE_DESCRIPTION("IPv4 packet queue handler"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL); - -module_init(ip_queue_init); -module_exit(ip_queue_fini); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a846d633b3b..99e810f8467 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -3,6 +3,7 @@   *   * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. 
Neuling   * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -68,15 +69,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)  }  EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); -/* -   We keep a set of rules for each CPU, so we can avoid write-locking -   them in the softirq when updating the counters and therefore -   only need to read-lock in the softirq; doing a write_lock_bh() in user -   context stops packets coming through and allows user context to read -   the counters or update the rules. - -   Hence the start of any table is given by get_table() below.  */ -  /* Returns whether matches rule or not. */  /* Performance critical - called for every packet */  static inline bool @@ -162,8 +154,7 @@ ip_checkentry(const struct ipt_ip *ip)  static unsigned int  ipt_error(struct sk_buff *skb, const struct xt_action_param *par)  { -	if (net_ratelimit()) -		pr_info("error: `%s'\n", (const char *)par->targinfo); +	net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);  	return NF_DROP;  } @@ -192,8 +183,7 @@ ipt_get_target_c(const struct ipt_entry *e)  	return ipt_get_target((struct ipt_entry *)e);  } -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ -    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)  static const char *const hooknames[] = {  	[NF_INET_PRE_ROUTING]		= "PREROUTING",  	[NF_INET_LOCAL_IN]		= "INPUT", @@ -269,6 +259,7 @@ static void trace_packet(const struct sk_buff *skb,  	const char *hookname, *chainname, *comment;  	const struct ipt_entry *iter;  	unsigned int rulenum = 0; +	struct net *net = dev_net(in ? in : out);  	table_base = private->entries[smp_processor_id()];  	root = get_entry(table_base, private->hook_entry[hook]); @@ -281,7 +272,7 @@ static void trace_packet(const struct sk_buff *skb,  		    &chainname, &comment, &rulenum) != 0)  			break; -	nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, +	nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo,  		      "TRACE: %s:%s:%s:%u ",  		      tablename, chainname, comment, rulenum);  } @@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,  	unsigned int *stackptr, origptr, cpu;  	const struct xt_table_info *private;  	struct xt_action_param acpar; +	unsigned int addend;  	/* Initialization */  	ip = ip_hdr(skb); @@ -331,9 +323,15 @@ ipt_do_table(struct sk_buff *skb,  	acpar.hooknum = hook;  	IP_NF_ASSERT(table->valid_hooks & (1 << hook)); -	xt_info_rdlock_bh(); +	local_bh_disable(); +	addend = xt_write_recseq_begin();  	private = table->private;  	cpu        = smp_processor_id(); +	/* +	 * Ensure we load private-> members after we've fetched the base +	 * pointer. 
+	 */ +	smp_read_barrier_depends();  	table_base = private->entries[cpu];  	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];  	stackptr   = per_cpu_ptr(private->stackptr, cpu); @@ -369,8 +367,7 @@ ipt_do_table(struct sk_buff *skb,  		t = ipt_get_target(e);  		IP_NF_ASSERT(t->u.kernel.target); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ -    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)  		/* The packet is traced: log it */  		if (unlikely(skb->nf_trace))  			trace_packet(skb, hook, in, out, @@ -384,10 +381,10 @@ ipt_do_table(struct sk_buff *skb,  			if (v < 0) {  				/* Pop from stack? */  				if (v != XT_RETURN) { -					verdict = (unsigned)(-v) - 1; +					verdict = (unsigned int)(-v) - 1;  					break;  				} -				if (*stackptr == 0) { +				if (*stackptr <= origptr) {  					e = get_entry(table_base,  					    private->underflow[hook]);  					pr_debug("Underflow (this is normal) " @@ -427,10 +424,12 @@ ipt_do_table(struct sk_buff *skb,  			/* Verdict */  			break;  	} while (!acpar.hotdrop); -	xt_info_rdunlock_bh();  	pr_debug("Exiting %s; resetting sp from %u to %u\n",  		 __func__, *stackptr, origptr);  	*stackptr = origptr; + 	xt_write_recseq_end(addend); + 	local_bh_enable(); +  #ifdef DEBUG_ALLOW_ALL  	return NF_ACCEPT;  #else @@ -571,7 +570,7 @@ check_entry(const struct ipt_entry *e, const char *name)  	const struct xt_entry_target *t;  	if (!ip_checkentry(&e->ip)) { -		duprintf("ip check failed %p %s.\n", e, par->match->name); +		duprintf("ip check failed %p %s.\n", e, name);  		return -EINVAL;  	} @@ -884,42 +883,25 @@ get_counters(const struct xt_table_info *t,  	struct ipt_entry *iter;  	unsigned int cpu;  	unsigned int i; -	unsigned int curcpu = get_cpu(); - -	/* Instead of clearing (by a previous call to memset()) -	 * the counters and using adds, we set the counters -	 * with data used by 'current' CPU. -	 * -	 * Bottom half has to be disabled to prevent deadlock -	 * if new softirq were to run and call ipt_do_table -	 */ -	local_bh_disable(); -	i = 0; -	xt_entry_foreach(iter, t->entries[curcpu], t->size) { -		SET_COUNTER(counters[i], iter->counters.bcnt, -			    iter->counters.pcnt); -		++i; -	} -	local_bh_enable(); -	/* Processing counters from other cpus, we can let bottom half enabled, -	 * (preemption is disabled) -	 */  	for_each_possible_cpu(cpu) { -		if (cpu == curcpu) -			continue; +		seqcount_t *s = &per_cpu(xt_recseq, cpu); +  		i = 0; -		local_bh_disable(); -		xt_info_wrlock(cpu);  		xt_entry_foreach(iter, t->entries[cpu], t->size) { -			ADD_COUNTER(counters[i], iter->counters.bcnt, -				    iter->counters.pcnt); +			u64 bcnt, pcnt; +			unsigned int start; + +			do { +				start = read_seqcount_begin(s); +				bcnt = iter->counters.bcnt; +				pcnt = iter->counters.pcnt; +			} while (read_seqcount_retry(s, start)); + +			ADD_COUNTER(counters[i], bcnt, pcnt);  			++i; /* macro does multi eval of i */  		} -		xt_info_wrunlock(cpu); -		local_bh_enable();  	} -	put_cpu();  }  static struct xt_counters *alloc_counters(const struct xt_table *table) @@ -932,7 +914,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)  	   (other than comefrom, which userspace doesn't care  	   about). 
*/  	countersize = sizeof(struct xt_counters) * private->number; -	counters = vmalloc(countersize); +	counters = vzalloc(countersize);  	if (counters == NULL)  		return ERR_PTR(-ENOMEM); @@ -1080,6 +1062,7 @@ static int compat_table_info(const struct xt_table_info *info,  	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));  	newinfo->initial_entries = 0;  	loc_cpu_entry = info->entries[raw_smp_processor_id()]; +	xt_compat_init_offsets(AF_INET, info->number);  	xt_entry_foreach(iter, loc_cpu_entry, info->size) {  		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);  		if (ret != 0) @@ -1112,7 +1095,7 @@ static int get_info(struct net *net, void __user *user,  #endif  	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),  				    "iptable_%s", name); -	if (t && !IS_ERR(t)) { +	if (!IS_ERR_OR_NULL(t)) {  		struct ipt_getinfo info;  		const struct xt_table_info *private = t->private;  #ifdef CONFIG_COMPAT @@ -1171,7 +1154,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,  	}  	t = xt_find_table_lock(net, AF_INET, get.name); -	if (t && !IS_ERR(t)) { +	if (!IS_ERR_OR_NULL(t)) {  		const struct xt_table_info *private = t->private;  		duprintf("t->private->number = %u\n", private->number);  		if (get.size == private->size) @@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,  	struct ipt_entry *iter;  	ret = 0; -	counters = vmalloc(num_counters * sizeof(struct xt_counters)); +	counters = vzalloc(num_counters * sizeof(struct xt_counters));  	if (!counters) {  		ret = -ENOMEM;  		goto out; @@ -1211,7 +1194,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,  	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),  				    "iptable_%s", name); -	if (!t || IS_ERR(t)) { +	if (IS_ERR_OR_NULL(t)) {  		ret = t ? PTR_ERR(t) : -ENOENT;  		goto free_newinfo_counters_untrans;  	} @@ -1248,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,  	xt_free_table_info(oldinfo);  	if (copy_to_user(counters_ptr, counters, -			 sizeof(struct xt_counters) * num_counters) != 0) -		ret = -EFAULT; +			 sizeof(struct xt_counters) * num_counters) != 0) { +		/* Silent error, can't fail, new table is already in place */ +		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); +	}  	vfree(counters);  	xt_table_unlock(t);  	return ret; @@ -1278,6 +1263,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)  	/* overflow check */  	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))  		return -ENOMEM; +	tmp.name[sizeof(tmp.name)-1] = 0;  	newinfo = xt_alloc_table_info(tmp.size);  	if (!newinfo) @@ -1327,6 +1313,7 @@ do_add_counters(struct net *net, const void __user *user,  	int ret = 0;  	void *loc_cpu_entry;  	struct ipt_entry *iter; +	unsigned int addend;  #ifdef CONFIG_COMPAT  	struct compat_xt_counters_info compat_tmp; @@ -1367,7 +1354,7 @@ do_add_counters(struct net *net, const void __user *user,  	}  	t = xt_find_table_lock(net, AF_INET, name); -	if (!t || IS_ERR(t)) { +	if (IS_ERR_OR_NULL(t)) {  		ret = t ? 
PTR_ERR(t) : -ENOENT;  		goto free;  	} @@ -1383,12 +1370,12 @@ do_add_counters(struct net *net, const void __user *user,  	/* Choose the copy that is on our node */  	curcpu = smp_processor_id();  	loc_cpu_entry = private->entries[curcpu]; -	xt_info_wrlock(curcpu); +	addend = xt_write_recseq_begin();  	xt_entry_foreach(iter, loc_cpu_entry, private->size) {  		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);  		++i;  	} -	xt_info_wrunlock(curcpu); +	xt_write_recseq_end(addend);   unlock_up_free:  	local_bh_enable();  	xt_table_unlock(t); @@ -1681,6 +1668,7 @@ translate_compat_table(struct net *net,  	duprintf("translate_compat_table: size %u\n", info->size);  	j = 0;  	xt_compat_lock(AF_INET); +	xt_compat_init_offsets(AF_INET, number);  	/* Walk through entries, checking offsets. */  	xt_entry_foreach(iter0, entry0, total_size) {  		ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1822,6 +1810,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)  		return -ENOMEM;  	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))  		return -ENOMEM; +	tmp.name[sizeof(tmp.name)-1] = 0;  	newinfo = xt_alloc_table_info(tmp.size);  	if (!newinfo) @@ -1864,7 +1853,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -1949,7 +1938,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,  	xt_compat_lock(AF_INET);  	t = xt_find_table_lock(net, AF_INET, get.name); -	if (t && !IS_ERR(t)) { +	if (!IS_ERR_OR_NULL(t)) {  		const struct xt_table_info *private = t->private;  		struct xt_table_info info;  		duprintf("t->private->number = %u\n", private->number); @@ -1979,7 +1968,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -2001,7 +1990,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -2026,7 +2015,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  {  	int ret; -	if (!capable(CAP_NET_ADMIN)) +	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))  		return -EPERM;  	switch (cmd) { @@ -2051,6 +2040,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)  			ret = -EFAULT;  			break;  		} +		rev.name[sizeof(rev.name)-1] = 0;  		if (cmd == IPT_SO_GET_REVISION_TARGET)  			target = 1; @@ -2245,7 +2235,7 @@ static int __init ip_tables_init(void)  	if (ret < 0)  		goto err1; -	/* Noone else will be downing sem now, so we won't sleep */ +	/* No one else will be downing sem now, so we won't sleep */  	ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));  	if (ret < 0)  		goto err2; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1e26a489765..2510c02c2d2 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -28,6 +28,7 @@  #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>  #include <net/netfilter/nf_conntrack.h>  #include <net/net_namespace.h> +#include <net/netns/generic.h>  #include <net/checksum.h>  #include <net/ip.h> @@ -57,15 +58,21 @@ struct clusterip_config {  	struct rcu_head rcu;  }; -static 
LIST_HEAD(clusterip_configs); +#ifdef CONFIG_PROC_FS +static const struct file_operations clusterip_proc_fops; +#endif -/* clusterip_lock protects the clusterip_configs list */ -static DEFINE_SPINLOCK(clusterip_lock); +static int clusterip_net_id __read_mostly; + +struct clusterip_net { +	struct list_head configs; +	/* lock protects the configs list */ +	spinlock_t lock;  #ifdef CONFIG_PROC_FS -static const struct file_operations clusterip_proc_fops; -static struct proc_dir_entry *clusterip_procdir; +	struct proc_dir_entry *procdir;  #endif +};  static inline void  clusterip_config_get(struct clusterip_config *c) @@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c)  static inline void  clusterip_config_entry_put(struct clusterip_config *c)  { +	struct net *net = dev_net(c->dev); +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +  	local_bh_disable(); -	if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) { +	if (atomic_dec_and_lock(&c->entries, &cn->lock)) {  		list_del_rcu(&c->list); -		spin_unlock(&clusterip_lock); +		spin_unlock(&cn->lock);  		local_bh_enable();  		dev_mc_del(c->dev, c->clustermac); @@ -105,7 +115,7 @@ clusterip_config_entry_put(struct clusterip_config *c)  		 * functions are also incrementing the refcount on their own,  		 * so it's safe to remove the entry even if it's in use. */  #ifdef CONFIG_PROC_FS -		remove_proc_entry(c->pde->name, c->pde->parent); +		proc_remove(c->pde);  #endif  		return;  	} @@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c)  }  static struct clusterip_config * -__clusterip_config_find(__be32 clusterip) +__clusterip_config_find(struct net *net, __be32 clusterip)  {  	struct clusterip_config *c; +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); -	list_for_each_entry_rcu(c, &clusterip_configs, list) { +	list_for_each_entry_rcu(c, &cn->configs, list) {  		if (c->clusterip == clusterip)  			return c;  	} @@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip)  }  static inline struct clusterip_config * -clusterip_config_find_get(__be32 clusterip, int entry) +clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)  {  	struct clusterip_config *c;  	rcu_read_lock_bh(); -	c = __clusterip_config_find(clusterip); +	c = __clusterip_config_find(net, clusterip);  	if (c) {  		if (unlikely(!atomic_inc_not_zero(&c->refcount)))  			c = NULL; @@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,  			struct net_device *dev)  {  	struct clusterip_config *c; +	struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);  	c = kzalloc(sizeof(*c), GFP_ATOMIC);  	if (!c) @@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,  		/* create proc dir entry */  		sprintf(buffer, "%pI4", &ip);  		c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, -					  clusterip_procdir, +					  cn->procdir,  					  &clusterip_proc_fops, c);  		if (!c->pde) {  			kfree(c); @@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,  	}  #endif -	spin_lock_bh(&clusterip_lock); -	list_add_rcu(&c->list, &clusterip_configs); -	spin_unlock_bh(&clusterip_lock); +	spin_lock_bh(&cn->lock); +	list_add_rcu(&c->list, &cn->configs); +	spin_unlock_bh(&cn->lock);  	return c;  } @@ -246,8 +258,7 @@ clusterip_hashfn(const struct sk_buff *skb,  			dport = ports[1];  		}  	} else { -		if (net_ratelimit()) -			pr_info("unknown protocol %u\n", iph->protocol); +		
net_info_ratelimited("unknown protocol %u\n", iph->protocol);  	}  	switch (config->hash_mode) { @@ -300,19 +311,14 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)  	 * that the ->target() function isn't called after ->destroy() */  	ct = nf_ct_get(skb, &ctinfo); -	if (ct == NULL) { -		pr_info("no conntrack!\n"); -			/* FIXME: need to drop invalid ones, since replies -			 * to outgoing connections of other nodes will be -			 * marked as INVALID */ +	if (ct == NULL)  		return NF_DROP; -	}  	/* special case: ICMP error handling. conntrack distinguishes between  	 * error messages (RELATED) and information requests (see below) */  	if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&  	    (ctinfo == IP_CT_RELATED || -	     ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) +	     ctinfo == IP_CT_RELATED_REPLY))  		return XT_CONTINUE;  	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, @@ -322,19 +328,19 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)  	hash = clusterip_hashfn(skb, cipinfo->config);  	switch (ctinfo) { -		case IP_CT_NEW: -			ct->mark = hash; -			break; -		case IP_CT_RELATED: -		case IP_CT_RELATED+IP_CT_IS_REPLY: -			/* FIXME: we don't handle expectations at the -			 * moment.  they can arrive on a different node than -			 * the master connection (e.g. FTP passive mode) */ -		case IP_CT_ESTABLISHED: -		case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: -			break; -		default: -			break; +	case IP_CT_NEW: +		ct->mark = hash; +		break; +	case IP_CT_RELATED: +	case IP_CT_RELATED_REPLY: +		/* FIXME: we don't handle expectations at the moment. +		 * They can arrive on a different node than +		 * the master connection (e.g. FTP passive mode) */ +	case IP_CT_ESTABLISHED: +	case IP_CT_ESTABLISHED_REPLY: +		break; +	default:			/* Prevent gcc warnings */ +		break;  	}  #ifdef DEBUG @@ -376,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)  	/* FIXME: further sanity checks */ -	config = clusterip_config_find_get(e->ip.dst.s_addr, 1); +	config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);  	if (!config) {  		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {  			pr_info("no config found for %pI4, need 'new'\n", @@ -390,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)  				return -EINVAL;  			} -			dev = dev_get_by_name(&init_net, e->ip.iniface); +			dev = dev_get_by_name(par->net, e->ip.iniface);  			if (!dev) {  				pr_info("no such interface %s\n",  					e->ip.iniface); @@ -400,7 +406,6 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)  			config = clusterip_config_init(cipinfo,  							e->ip.dst.s_addr, dev);  			if (!config) { -				pr_info("cannot allocate config\n");  				dev_put(dev);  				return -ENOMEM;  			} @@ -490,7 +495,7 @@ static void arp_print(struct arp_payload *payload)  #endif  static unsigned int -arp_mangle(unsigned int hook, +arp_mangle(const struct nf_hook_ops *ops,  	   struct sk_buff *skb,  	   const struct net_device *in,  	   const struct net_device *out, @@ -499,6 +504,7 @@ arp_mangle(unsigned int hook,  	struct arphdr *arp = arp_hdr(skb);  	struct arp_payload *payload;  	struct clusterip_config *c; +	struct net *net = dev_net(in ? 
in : out);  	/* we don't care about non-ethernet and non-ipv4 ARP */  	if (arp->ar_hrd != htons(ARPHRD_ETHER) || @@ -515,7 +521,7 @@ arp_mangle(unsigned int hook,  	/* if there is no clusterip configuration for the arp reply's  	 * source ip, we don't want to mangle it */ -	c = clusterip_config_find_get(payload->src_ip, 0); +	c = clusterip_config_find_get(net, payload->src_ip, 0);  	if (!c)  		return NF_ACCEPT; @@ -638,7 +644,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)  	if (!ret) {  		struct seq_file *sf = file->private_data; -		struct clusterip_config *c = PDE(inode)->data; +		struct clusterip_config *c = PDE_DATA(inode);  		sf->private = c; @@ -650,7 +656,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)  static int clusterip_proc_release(struct inode *inode, struct file *file)  { -	struct clusterip_config *c = PDE(inode)->data; +	struct clusterip_config *c = PDE_DATA(inode);  	int ret;  	ret = seq_release(inode, file); @@ -664,20 +670,28 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)  static ssize_t clusterip_proc_write(struct file *file, const char __user *input,  				size_t size, loff_t *ofs)  { -	struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data; +	struct clusterip_config *c = PDE_DATA(file_inode(file));  #define PROC_WRITELEN	10  	char buffer[PROC_WRITELEN+1];  	unsigned long nodenum; +	int rc; -	if (copy_from_user(buffer, input, PROC_WRITELEN)) +	if (size > PROC_WRITELEN) +		return -EIO; +	if (copy_from_user(buffer, input, size))  		return -EFAULT; +	buffer[size] = 0;  	if (*buffer == '+') { -		nodenum = simple_strtoul(buffer+1, NULL, 10); +		rc = kstrtoul(buffer+1, 10, &nodenum); +		if (rc) +			return rc;  		if (clusterip_add_node(c, nodenum))  			return -ENOMEM;  	} else if (*buffer == '-') { -		nodenum = simple_strtoul(buffer+1, NULL,10); +		rc = kstrtoul(buffer+1, 10, &nodenum); +		if (rc) +			return rc;  		if (clusterip_del_node(c, nodenum))  			return -ENOENT;  	} else @@ -697,48 +711,75 @@ static const struct file_operations clusterip_proc_fops = {  #endif /* CONFIG_PROC_FS */ +static int clusterip_net_init(struct net *net) +{ +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); + +	INIT_LIST_HEAD(&cn->configs); + +	spin_lock_init(&cn->lock); + +#ifdef CONFIG_PROC_FS +	cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); +	if (!cn->procdir) { +		pr_err("Unable to proc dir entry\n"); +		return -ENOMEM; +	} +#endif /* CONFIG_PROC_FS */ + +	return 0; +} + +static void clusterip_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS +	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +	proc_remove(cn->procdir); +#endif +} + +static struct pernet_operations clusterip_net_ops = { +	.init = clusterip_net_init, +	.exit = clusterip_net_exit, +	.id   = &clusterip_net_id, +	.size = sizeof(struct clusterip_net), +}; +  static int __init clusterip_tg_init(void)  {  	int ret; -	ret = xt_register_target(&clusterip_tg_reg); +	ret = register_pernet_subsys(&clusterip_net_ops);  	if (ret < 0)  		return ret; +	ret = xt_register_target(&clusterip_tg_reg); +	if (ret < 0) +		goto cleanup_subsys; +  	ret = nf_register_hook(&cip_arp_ops);  	if (ret < 0)  		goto cleanup_target; -#ifdef CONFIG_PROC_FS -	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); -	if (!clusterip_procdir) { -		pr_err("Unable to proc dir entry\n"); -		ret = -ENOMEM; -		goto cleanup_hook; -	} -#endif /* CONFIG_PROC_FS */ -  	pr_info("ClusterIP Version %s loaded successfully\n",  		
CLUSTERIP_VERSION); +  	return 0; -#ifdef CONFIG_PROC_FS -cleanup_hook: -	nf_unregister_hook(&cip_arp_ops); -#endif /* CONFIG_PROC_FS */  cleanup_target:  	xt_unregister_target(&clusterip_tg_reg); +cleanup_subsys: +	unregister_pernet_subsys(&clusterip_net_ops);  	return ret;  }  static void __exit clusterip_tg_exit(void)  {  	pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); -#ifdef CONFIG_PROC_FS -	remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); -#endif +  	nf_unregister_hook(&cip_arp_ops);  	xt_unregister_target(&clusterip_tg_reg); +	unregister_pernet_subsys(&clusterip_net_ops);  	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */  	rcu_barrier_bh(); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c deleted file mode 100644 index 72ffc8fda2e..00000000000 --- a/net/ipv4/netfilter/ipt_LOG.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * This is a module which is used for logging packets. - */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_LOG.h> -#include <net/netfilter/nf_log.h> -#include <net/netfilter/xt_log.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); - -/* One level of recursion won't kill us */ -static void dump_packet(struct sbuff *m, -			const struct nf_loginfo *info, -			const struct sk_buff *skb, -			unsigned int iphoff) -{ -	struct iphdr _iph; -	const struct iphdr *ih; -	unsigned int logflags; - -	if (info->type == NF_LOG_TYPE_LOG) -		logflags = info->u.log.logflags; -	else -		logflags = NF_LOG_MASK; - -	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); -	if (ih == NULL) { -		sb_add(m, "TRUNCATED"); -		return; -	} - -	/* Important fields: -	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
*/ -	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ -	sb_add(m, "SRC=%pI4 DST=%pI4 ", -	       &ih->saddr, &ih->daddr); - -	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ -	sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", -	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, -	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); - -	/* Max length: 6 "CE DF MF " */ -	if (ntohs(ih->frag_off) & IP_CE) -		sb_add(m, "CE "); -	if (ntohs(ih->frag_off) & IP_DF) -		sb_add(m, "DF "); -	if (ntohs(ih->frag_off) & IP_MF) -		sb_add(m, "MF "); - -	/* Max length: 11 "FRAG:65535 " */ -	if (ntohs(ih->frag_off) & IP_OFFSET) -		sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - -	if ((logflags & IPT_LOG_IPOPT) && -	    ih->ihl * 4 > sizeof(struct iphdr)) { -		const unsigned char *op; -		unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; -		unsigned int i, optsize; - -		optsize = ih->ihl * 4 - sizeof(struct iphdr); -		op = skb_header_pointer(skb, iphoff+sizeof(_iph), -					optsize, _opt); -		if (op == NULL) { -			sb_add(m, "TRUNCATED"); -			return; -		} - -		/* Max length: 127 "OPT (" 15*4*2chars ") " */ -		sb_add(m, "OPT ("); -		for (i = 0; i < optsize; i++) -			sb_add(m, "%02X", op[i]); -		sb_add(m, ") "); -	} - -	switch (ih->protocol) { -	case IPPROTO_TCP: { -		struct tcphdr _tcph; -		const struct tcphdr *th; - -		/* Max length: 10 "PROTO=TCP " */ -		sb_add(m, "PROTO=TCP "); - -		if (ntohs(ih->frag_off) & IP_OFFSET) -			break; - -		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ -		th = skb_header_pointer(skb, iphoff + ih->ihl * 4, -					sizeof(_tcph), &_tcph); -		if (th == NULL) { -			sb_add(m, "INCOMPLETE [%u bytes] ", -			       skb->len - iphoff - ih->ihl*4); -			break; -		} - -		/* Max length: 20 "SPT=65535 DPT=65535 " */ -		sb_add(m, "SPT=%u DPT=%u ", -		       ntohs(th->source), ntohs(th->dest)); -		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ -		if (logflags & IPT_LOG_TCPSEQ) -			sb_add(m, "SEQ=%u ACK=%u ", -			       ntohl(th->seq), ntohl(th->ack_seq)); -		/* Max length: 13 "WINDOW=65535 " */ -		sb_add(m, "WINDOW=%u ", ntohs(th->window)); -		/* Max length: 9 "RES=0x3F " */ -		sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); -		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ -		if (th->cwr) -			sb_add(m, "CWR "); -		if (th->ece) -			sb_add(m, "ECE "); -		if (th->urg) -			sb_add(m, "URG "); -		if (th->ack) -			sb_add(m, "ACK "); -		if (th->psh) -			sb_add(m, "PSH "); -		if (th->rst) -			sb_add(m, "RST "); -		if (th->syn) -			sb_add(m, "SYN "); -		if (th->fin) -			sb_add(m, "FIN "); -		/* Max length: 11 "URGP=65535 " */ -		sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - -		if ((logflags & IPT_LOG_TCPOPT) && -		    th->doff * 4 > sizeof(struct tcphdr)) { -			unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; -			const unsigned char *op; -			unsigned int i, optsize; - -			optsize = th->doff * 4 - sizeof(struct tcphdr); -			op = skb_header_pointer(skb, -						iphoff+ih->ihl*4+sizeof(_tcph), -						optsize, _opt); -			if (op == NULL) { -				sb_add(m, "TRUNCATED"); -				return; -			} - -			/* Max length: 127 "OPT (" 15*4*2chars ") " */ -			sb_add(m, "OPT ("); -			for (i = 0; i < optsize; i++) -				sb_add(m, "%02X", op[i]); -			sb_add(m, ") "); -		} -		break; -	} -	case IPPROTO_UDP: -	case IPPROTO_UDPLITE: { -		struct udphdr _udph; -		const struct udphdr *uh; - -		if (ih->protocol == IPPROTO_UDP) -			/* Max length: 10 "PROTO=UDP "     */ -			sb_add(m, "PROTO=UDP " ); -		else	/* Max length: 14 
"PROTO=UDPLITE " */ -			sb_add(m, "PROTO=UDPLITE "); - -		if (ntohs(ih->frag_off) & IP_OFFSET) -			break; - -		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ -		uh = skb_header_pointer(skb, iphoff+ih->ihl*4, -					sizeof(_udph), &_udph); -		if (uh == NULL) { -			sb_add(m, "INCOMPLETE [%u bytes] ", -			       skb->len - iphoff - ih->ihl*4); -			break; -		} - -		/* Max length: 20 "SPT=65535 DPT=65535 " */ -		sb_add(m, "SPT=%u DPT=%u LEN=%u ", -		       ntohs(uh->source), ntohs(uh->dest), -		       ntohs(uh->len)); -		break; -	} -	case IPPROTO_ICMP: { -		struct icmphdr _icmph; -		const struct icmphdr *ich; -		static const size_t required_len[NR_ICMP_TYPES+1] -			= { [ICMP_ECHOREPLY] = 4, -			    [ICMP_DEST_UNREACH] -			    = 8 + sizeof(struct iphdr), -			    [ICMP_SOURCE_QUENCH] -			    = 8 + sizeof(struct iphdr), -			    [ICMP_REDIRECT] -			    = 8 + sizeof(struct iphdr), -			    [ICMP_ECHO] = 4, -			    [ICMP_TIME_EXCEEDED] -			    = 8 + sizeof(struct iphdr), -			    [ICMP_PARAMETERPROB] -			    = 8 + sizeof(struct iphdr), -			    [ICMP_TIMESTAMP] = 20, -			    [ICMP_TIMESTAMPREPLY] = 20, -			    [ICMP_ADDRESS] = 12, -			    [ICMP_ADDRESSREPLY] = 12 }; - -		/* Max length: 11 "PROTO=ICMP " */ -		sb_add(m, "PROTO=ICMP "); - -		if (ntohs(ih->frag_off) & IP_OFFSET) -			break; - -		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ -		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, -					 sizeof(_icmph), &_icmph); -		if (ich == NULL) { -			sb_add(m, "INCOMPLETE [%u bytes] ", -			       skb->len - iphoff - ih->ihl*4); -			break; -		} - -		/* Max length: 18 "TYPE=255 CODE=255 " */ -		sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); - -		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ -		if (ich->type <= NR_ICMP_TYPES && -		    required_len[ich->type] && -		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { -			sb_add(m, "INCOMPLETE [%u bytes] ", -			       skb->len - iphoff - ih->ihl*4); -			break; -		} - -		switch (ich->type) { -		case ICMP_ECHOREPLY: -		case ICMP_ECHO: -			/* Max length: 19 "ID=65535 SEQ=65535 " */ -			sb_add(m, "ID=%u SEQ=%u ", -			       ntohs(ich->un.echo.id), -			       ntohs(ich->un.echo.sequence)); -			break; - -		case ICMP_PARAMETERPROB: -			/* Max length: 14 "PARAMETER=255 " */ -			sb_add(m, "PARAMETER=%u ", -			       ntohl(ich->un.gateway) >> 24); -			break; -		case ICMP_REDIRECT: -			/* Max length: 24 "GATEWAY=255.255.255.255 " */ -			sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); -			/* Fall through */ -		case ICMP_DEST_UNREACH: -		case ICMP_SOURCE_QUENCH: -		case ICMP_TIME_EXCEEDED: -			/* Max length: 3+maxlen */ -			if (!iphoff) { /* Only recurse once. 
*/ -				sb_add(m, "["); -				dump_packet(m, info, skb, -					    iphoff + ih->ihl*4+sizeof(_icmph)); -				sb_add(m, "] "); -			} - -			/* Max length: 10 "MTU=65535 " */ -			if (ich->type == ICMP_DEST_UNREACH && -			    ich->code == ICMP_FRAG_NEEDED) -				sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); -		} -		break; -	} -	/* Max Length */ -	case IPPROTO_AH: { -		struct ip_auth_hdr _ahdr; -		const struct ip_auth_hdr *ah; - -		if (ntohs(ih->frag_off) & IP_OFFSET) -			break; - -		/* Max length: 9 "PROTO=AH " */ -		sb_add(m, "PROTO=AH "); - -		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ -		ah = skb_header_pointer(skb, iphoff+ih->ihl*4, -					sizeof(_ahdr), &_ahdr); -		if (ah == NULL) { -			sb_add(m, "INCOMPLETE [%u bytes] ", -			       skb->len - iphoff - ih->ihl*4); -			break; -		} - -		/* Length: 15 "SPI=0xF1234567 " */ -		sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); -		break; -	} -	case IPPROTO_ESP: { -		struct ip_esp_hdr _esph; -		const struct ip_esp_hdr *eh; - -		/* Max length: 10 "PROTO=ESP " */ -		sb_add(m, "PROTO=ESP "); - -		if (ntohs(ih->frag_off) & IP_OFFSET) -			break; - -		/* Max length: 25 "INCOMPLETE [65535 bytes] " */ -		eh = skb_header_pointer(skb, iphoff+ih->ihl*4, -					sizeof(_esph), &_esph); -		if (eh == NULL) { -			sb_add(m, "INCOMPLETE [%u bytes] ", -			       skb->len - iphoff - ih->ihl*4); -			break; -		} - -		/* Length: 15 "SPI=0xF1234567 " */ -		sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); -		break; -	} -	/* Max length: 10 "PROTO 255 " */ -	default: -		sb_add(m, "PROTO=%u ", ih->protocol); -	} - -	/* Max length: 15 "UID=4294967295 " */ -	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { -		read_lock_bh(&skb->sk->sk_callback_lock); -		if (skb->sk->sk_socket && skb->sk->sk_socket->file) -			sb_add(m, "UID=%u GID=%u ", -				skb->sk->sk_socket->file->f_cred->fsuid, -				skb->sk->sk_socket->file->f_cred->fsgid); -		read_unlock_bh(&skb->sk->sk_callback_lock); -	} - -	/* Max length: 16 "MARK=0xFFFFFFFF " */ -	if (!iphoff && skb->mark) -		sb_add(m, "MARK=0x%x ", skb->mark); - -	/* Proto    Max log string length */ -	/* IP:      40+46+6+11+127 = 230 */ -	/* TCP:     10+max(25,20+30+13+9+32+11+127) = 252 */ -	/* UDP:     10+max(25,20) = 35 */ -	/* UDPLITE: 14+max(25,20) = 39 */ -	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ -	/* ESP:     10+max(25)+15 = 50 */ -	/* AH:      9+max(25)+15 = 49 */ -	/* unknown: 10 */ - -	/* (ICMP allows recursion one level deep) */ -	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */ -	/* maxlen = 230+   91  + 230 + 252 = 803 */ -} - -static void dump_mac_header(struct sbuff *m, -			    const struct nf_loginfo *info, -			    const struct sk_buff *skb) -{ -	struct net_device *dev = skb->dev; -	unsigned int logflags = 0; - -	if (info->type == NF_LOG_TYPE_LOG) -		logflags = info->u.log.logflags; - -	if (!(logflags & IPT_LOG_MACDECODE)) -		goto fallback; - -	switch (dev->type) { -	case ARPHRD_ETHER: -		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", -		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, -		       ntohs(eth_hdr(skb)->h_proto)); -		return; -	default: -		break; -	} - -fallback: -	sb_add(m, "MAC="); -	if (dev->hard_header_len && -	    skb->mac_header != skb->network_header) { -		const unsigned char *p = skb_mac_header(skb); -		unsigned int i; - -		sb_add(m, "%02x", *p++); -		for (i = 1; i < dev->hard_header_len; i++, p++) -			sb_add(m, ":%02x", *p); -	} -	sb_add(m, " "); -} - -static struct nf_loginfo default_loginfo = { -	.type	= NF_LOG_TYPE_LOG, -	.u = { -		.log = { -			.level    = 5, -			.logflags = 
NF_LOG_MASK, -		}, -	}, -}; - -static void -ipt_log_packet(u_int8_t pf, -	       unsigned int hooknum, -	       const struct sk_buff *skb, -	       const struct net_device *in, -	       const struct net_device *out, -	       const struct nf_loginfo *loginfo, -	       const char *prefix) -{ -	struct sbuff *m = sb_open(); - -	if (!loginfo) -		loginfo = &default_loginfo; - -	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, -	       prefix, -	       in ? in->name : "", -	       out ? out->name : ""); -#ifdef CONFIG_BRIDGE_NETFILTER -	if (skb->nf_bridge) { -		const struct net_device *physindev; -		const struct net_device *physoutdev; - -		physindev = skb->nf_bridge->physindev; -		if (physindev && in != physindev) -			sb_add(m, "PHYSIN=%s ", physindev->name); -		physoutdev = skb->nf_bridge->physoutdev; -		if (physoutdev && out != physoutdev) -			sb_add(m, "PHYSOUT=%s ", physoutdev->name); -	} -#endif - -	/* MAC logging for input path only. */ -	if (in && !out) -		dump_mac_header(m, loginfo, skb); - -	dump_packet(m, loginfo, skb, 0); - -	sb_close(m); -} - -static unsigned int -log_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ -	const struct ipt_log_info *loginfo = par->targinfo; -	struct nf_loginfo li; - -	li.type = NF_LOG_TYPE_LOG; -	li.u.log.level = loginfo->level; -	li.u.log.logflags = loginfo->logflags; - -	ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li, -		       loginfo->prefix); -	return XT_CONTINUE; -} - -static int log_tg_check(const struct xt_tgchk_param *par) -{ -	const struct ipt_log_info *loginfo = par->targinfo; - -	if (loginfo->level >= 8) { -		pr_debug("level %u >= 8\n", loginfo->level); -		return -EINVAL; -	} -	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { -		pr_debug("prefix is not null-terminated\n"); -		return -EINVAL; -	} -	return 0; -} - -static struct xt_target log_tg_reg __read_mostly = { -	.name		= "LOG", -	.family		= NFPROTO_IPV4, -	.target		= log_tg, -	.targetsize	= sizeof(struct ipt_log_info), -	.checkentry	= log_tg_check, -	.me		= THIS_MODULE, -}; - -static struct nf_logger ipt_log_logger __read_mostly = { -	.name		= "ipt_LOG", -	.logfn		= &ipt_log_packet, -	.me		= THIS_MODULE, -}; - -static int __init log_tg_init(void) -{ -	int ret; - -	ret = xt_register_target(&log_tg_reg); -	if (ret < 0) -		return ret; -	nf_log_register(NFPROTO_IPV4, &ipt_log_logger); -	return 0; -} - -static void __exit log_tg_exit(void) -{ -	nf_log_unregister(&ipt_log_logger); -	xt_unregister_target(&log_tg_reg); -} - -module_init(log_tg_init); -module_exit(log_tg_exit); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index d2ed9dc74eb..00352ce0f0d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -19,9 +19,9 @@  #include <net/ip.h>  #include <net/checksum.h>  #include <net/route.h> -#include <net/netfilter/nf_nat_rule.h>  #include <linux/netfilter_ipv4.h>  #include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h>  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -30,9 +30,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT");  /* FIXME: Multiple targets. 
--RR */  static int masquerade_tg_check(const struct xt_tgchk_param *par)  { -	const struct nf_nat_multi_range_compat *mr = par->targinfo; +	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; -	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { +	if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {  		pr_debug("bad MAP_IPS.\n");  		return -EINVAL;  	} @@ -50,9 +50,9 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)  	struct nf_conn_nat *nat;  	enum ip_conntrack_info ctinfo;  	struct nf_nat_range newrange; -	const struct nf_nat_multi_range_compat *mr; +	const struct nf_nat_ipv4_multi_range_compat *mr;  	const struct rtable *rt; -	__be32 newsrc; +	__be32 newsrc, nh;  	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); @@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)  	nat = nfct_nat(ct);  	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || -			    ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); +			    ctinfo == IP_CT_RELATED_REPLY));  	/* Source address is 0.0.0.0 - locally generated packet that is  	 * probably not supposed to be masqueraded. @@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)  	mr = par->targinfo;  	rt = skb_rtable(skb); -	newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); +	nh = rt_nexthop(rt, ip_hdr(skb)->daddr); +	newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);  	if (!newsrc) {  		pr_info("%s ate my IP address\n", par->out->name);  		return NF_DROP; @@ -79,13 +80,16 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)  	nat->masq_index = par->out->ifindex;  	/* Transfer from original range. */ -	newrange = ((struct nf_nat_range) -		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, -		  newsrc, newsrc, -		  mr->range[0].min, mr->range[0].max }); +	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); +	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); +	newrange.flags       = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; +	newrange.min_addr.ip = newsrc; +	newrange.max_addr.ip = newsrc; +	newrange.min_proto   = mr->range[0].min; +	newrange.max_proto   = mr->range[0].max;  	/* Hand modified range to generic setup. 
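   Note on the hunk above: the old IPv4-only range could be copied into the
   target's struct with one compound literal; the family-independent
   struct nf_nat_range keeps its addresses in union nf_inet_addr, so the
   address members are now zeroed and the IPv4 fields assigned one by one.
   A minimal sketch of the equivalent designated-initializer form
   (illustration only, not part of this patch; an initializer zero-fills
   the unnamed members, which is what the two memset() calls do for the
   already-declared local):

	struct nf_nat_range newrange = {
		.flags     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS,
		.min_addr  = { .ip = newsrc },	/* masquerade to a single address */
		.max_addr  = { .ip = newsrc },
		.min_proto = mr->range[0].min,	/* keep the rule's port range */
		.max_proto = mr->range[0].max,
	};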
*/ -	return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); +	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);  }  static int @@ -95,7 +99,8 @@ device_cmp(struct nf_conn *i, void *ifindex)  	if (!nat)  		return 0; - +	if (nf_ct_l3num(i) != NFPROTO_IPV4) +		return 0;  	return nat->masq_index == (int)(long)ifindex;  } @@ -103,7 +108,7 @@ static int masq_device_event(struct notifier_block *this,  			     unsigned long event,  			     void *ptr)  { -	const struct net_device *dev = ptr; +	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);  	struct net *net = dev_net(dev);  	if (event == NETDEV_DOWN) { @@ -113,7 +118,7 @@ static int masq_device_event(struct notifier_block *this,  		NF_CT_ASSERT(dev->ifindex != 0);  		nf_ct_iterate_cleanup(net, device_cmp, -				      (void *)(long)dev->ifindex); +				      (void *)(long)dev->ifindex, 0, 0);  	}  	return NOTIFY_DONE; @@ -124,7 +129,10 @@ static int masq_inet_event(struct notifier_block *this,  			   void *ptr)  {  	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; -	return masq_device_event(this, event, dev); +	struct netdev_notifier_info info; + +	netdev_notifier_info_init(&info, dev); +	return masq_device_event(this, event, &info);  }  static struct notifier_block masq_dev_notifier = { @@ -139,7 +147,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {  	.name		= "MASQUERADE",  	.family		= NFPROTO_IPV4,  	.target		= masquerade_tg, -	.targetsize	= sizeof(struct nf_nat_multi_range_compat), +	.targetsize	= sizeof(struct nf_nat_ipv4_multi_range_compat),  	.table		= "nat",  	.hooks		= 1 << NF_INET_POST_ROUTING,  	.checkentry	= masquerade_tg_check, diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c deleted file mode 100644 index 6cdb298f103..00000000000 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ /dev/null @@ -1,98 +0,0 @@ -/* NETMAP - static NAT mapping of IP network addresses (1:1). - * The mapping can be applied to source (POSTROUTING), - * destination (PREROUTING), or both (with separate rules). - */ - -/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat_rule.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); -MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); - -static int netmap_tg_check(const struct xt_tgchk_param *par) -{ -	const struct nf_nat_multi_range_compat *mr = par->targinfo; - -	if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { -		pr_debug("bad MAP_IPS.\n"); -		return -EINVAL; -	} -	if (mr->rangesize != 1) { -		pr_debug("bad rangesize %u.\n", mr->rangesize); -		return -EINVAL; -	} -	return 0; -} - -static unsigned int -netmap_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ -	struct nf_conn *ct; -	enum ip_conntrack_info ctinfo; -	__be32 new_ip, netmask; -	const struct nf_nat_multi_range_compat *mr = par->targinfo; -	struct nf_nat_range newrange; - -	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || -		     par->hooknum == NF_INET_POST_ROUTING || -		     par->hooknum == NF_INET_LOCAL_OUT || -		     par->hooknum == NF_INET_LOCAL_IN); -	ct = nf_ct_get(skb, &ctinfo); - -	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - -	if (par->hooknum == NF_INET_PRE_ROUTING || -	    par->hooknum == NF_INET_LOCAL_OUT) -		new_ip = ip_hdr(skb)->daddr & ~netmask; -	else -		new_ip = ip_hdr(skb)->saddr & ~netmask; -	new_ip |= mr->range[0].min_ip & netmask; - -	newrange = ((struct nf_nat_range) -		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, -		  new_ip, new_ip, -		  mr->range[0].min, mr->range[0].max }); - -	/* Hand modified range to generic setup. */ -	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); -} - -static struct xt_target netmap_tg_reg __read_mostly = { -	.name 		= "NETMAP", -	.family		= NFPROTO_IPV4, -	.target 	= netmap_tg, -	.targetsize	= sizeof(struct nf_nat_multi_range_compat), -	.table		= "nat", -	.hooks		= (1 << NF_INET_PRE_ROUTING) | -			  (1 << NF_INET_POST_ROUTING) | -			  (1 << NF_INET_LOCAL_OUT) | -			  (1 << NF_INET_LOCAL_IN), -	.checkentry 	= netmap_tg_check, -	.me 		= THIS_MODULE -}; - -static int __init netmap_tg_init(void) -{ -	return xt_register_target(&netmap_tg_reg); -} - -static void __exit netmap_tg_exit(void) -{ -	xt_unregister_target(&netmap_tg_reg); -} - -module_init(netmap_tg_init); -module_exit(netmap_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c deleted file mode 100644 index 18a0656505a..00000000000 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ /dev/null @@ -1,110 +0,0 @@ -/* Redirect.  Simple mapping which alters dst to a local IP address. */ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/timer.h> -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/netdevice.h> -#include <linux/if.h> -#include <linux/inetdevice.h> -#include <net/protocol.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat_rule.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); - -/* FIXME: Take multiple ranges --RR */ -static int redirect_tg_check(const struct xt_tgchk_param *par) -{ -	const struct nf_nat_multi_range_compat *mr = par->targinfo; - -	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { -		pr_debug("bad MAP_IPS.\n"); -		return -EINVAL; -	} -	if (mr->rangesize != 1) { -		pr_debug("bad rangesize %u.\n", mr->rangesize); -		return -EINVAL; -	} -	return 0; -} - -static unsigned int -redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ -	struct nf_conn *ct; -	enum ip_conntrack_info ctinfo; -	__be32 newdst; -	const struct nf_nat_multi_range_compat *mr = par->targinfo; -	struct nf_nat_range newrange; - -	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || -		     par->hooknum == NF_INET_LOCAL_OUT); - -	ct = nf_ct_get(skb, &ctinfo); -	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - -	/* Local packets: make them go to loopback */ -	if (par->hooknum == NF_INET_LOCAL_OUT) -		newdst = htonl(0x7F000001); -	else { -		struct in_device *indev; -		struct in_ifaddr *ifa; - -		newdst = 0; - -		rcu_read_lock(); -		indev = __in_dev_get_rcu(skb->dev); -		if (indev && (ifa = indev->ifa_list)) -			newdst = ifa->ifa_local; -		rcu_read_unlock(); - -		if (!newdst) -			return NF_DROP; -	} - -	/* Transfer from original range. */ -	newrange = ((struct nf_nat_range) -		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, -		  newdst, newdst, -		  mr->range[0].min, mr->range[0].max }); - -	/* Hand modified range to generic setup. 
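   Background: REDIRECT is the DNAT sibling of MASQUERADE; only the rule's
   port range is kept, while the address range is forced to newdst
   (127.0.0.1 for locally generated packets, otherwise the primary address
   of the interface the packet arrived on).  The classic use is a transparent
   proxy, e.g. (illustration only):

	iptables -t nat -A PREROUTING -p tcp --dport 80 -j REDIRECT --to-ports 3128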
*/ -	return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST); -} - -static struct xt_target redirect_tg_reg __read_mostly = { -	.name		= "REDIRECT", -	.family		= NFPROTO_IPV4, -	.target		= redirect_tg, -	.targetsize	= sizeof(struct nf_nat_multi_range_compat), -	.table		= "nat", -	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), -	.checkentry	= redirect_tg_check, -	.me		= THIS_MODULE, -}; - -static int __init redirect_tg_init(void) -{ -	return xt_register_target(&redirect_tg_reg); -} - -static void __exit redirect_tg_exit(void) -{ -	xt_unregister_target(&redirect_tg_reg); -} - -module_init(redirect_tg_init); -module_exit(redirect_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 43eec80c0e7..5b6e0df4ccf 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -17,10 +17,6 @@  #include <linux/udp.h>  #include <linux/icmp.h>  #include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h> -#include <net/route.h> -#include <net/dst.h>  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter_ipv4/ip_tables.h>  #include <linux/netfilter_ipv4/ipt_REJECT.h> @@ -28,114 +24,12 @@  #include <linux/netfilter_bridge.h>  #endif +#include <net/netfilter/ipv4/nf_reject.h> +  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");  MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); -/* Send RST reply */ -static void send_reset(struct sk_buff *oldskb, int hook) -{ -	struct sk_buff *nskb; -	const struct iphdr *oiph; -	struct iphdr *niph; -	const struct tcphdr *oth; -	struct tcphdr _otcph, *tcph; -	unsigned int addr_type; - -	/* IP header checks: fragment. */ -	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) -		return; - -	oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), -				 sizeof(_otcph), &_otcph); -	if (oth == NULL) -		return; - -	/* No RST for RST. 
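   Rationale: per RFC 793 a reset is never answered with another reset,
   otherwise two stacks could bounce RSTs back and forth indefinitely.  The
   sequence numbers chosen a few lines down follow the same RFC: if the
   offending segment carried an ACK, the reset reuses that ack_seq as its own
   SEQ; otherwise it is sent as RST+ACK with

	ack_seq = seq + syn + fin + payload_len

   i.e. it acknowledges exactly what the peer transmitted.  This open-coded
   helper is what the shared nf_send_reset() from
   <net/netfilter/ipv4/nf_reject.h> replaces in the hunk below.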
*/ -	if (oth->rst) -		return; - -	/* Check checksum */ -	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) -		return; -	oiph = ip_hdr(oldskb); - -	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + -			 LL_MAX_HEADER, GFP_ATOMIC); -	if (!nskb) -		return; - -	skb_reserve(nskb, LL_MAX_HEADER); - -	skb_reset_network_header(nskb); -	niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); -	niph->version	= 4; -	niph->ihl	= sizeof(struct iphdr) / 4; -	niph->tos	= 0; -	niph->id	= 0; -	niph->frag_off	= htons(IP_DF); -	niph->protocol	= IPPROTO_TCP; -	niph->check	= 0; -	niph->saddr	= oiph->daddr; -	niph->daddr	= oiph->saddr; - -	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); -	memset(tcph, 0, sizeof(*tcph)); -	tcph->source	= oth->dest; -	tcph->dest	= oth->source; -	tcph->doff	= sizeof(struct tcphdr) / 4; - -	if (oth->ack) -		tcph->seq = oth->ack_seq; -	else { -		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + -				      oldskb->len - ip_hdrlen(oldskb) - -				      (oth->doff << 2)); -		tcph->ack = 1; -	} - -	tcph->rst	= 1; -	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, -				    niph->daddr, 0); -	nskb->ip_summed = CHECKSUM_PARTIAL; -	nskb->csum_start = (unsigned char *)tcph - nskb->head; -	nskb->csum_offset = offsetof(struct tcphdr, check); - -	addr_type = RTN_UNSPEC; -	if (hook != NF_INET_FORWARD -#ifdef CONFIG_BRIDGE_NETFILTER -	    || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) -#endif -	   ) -		addr_type = RTN_LOCAL; - -	/* ip_route_me_harder expects skb->dst to be set */ -	skb_dst_set_noref(nskb, skb_dst(oldskb)); - -	nskb->protocol = htons(ETH_P_IP); -	if (ip_route_me_harder(nskb, addr_type)) -		goto free_nskb; - -	niph->ttl	= dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); - -	/* "Never happens" */ -	if (nskb->len > dst_mtu(skb_dst(nskb))) -		goto free_nskb; - -	nf_ct_attach(nskb, oldskb); - -	ip_local_out(nskb); -	return; - - free_nskb: -	kfree_skb(nskb); -} - -static inline void send_unreach(struct sk_buff *skb_in, int code) -{ -	icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); -} -  static unsigned int  reject_tg(struct sk_buff *skb, const struct xt_action_param *par)  { @@ -143,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)  	switch (reject->with) {  	case IPT_ICMP_NET_UNREACHABLE: -		send_unreach(skb, ICMP_NET_UNREACH); +		nf_send_unreach(skb, ICMP_NET_UNREACH);  		break;  	case IPT_ICMP_HOST_UNREACHABLE: -		send_unreach(skb, ICMP_HOST_UNREACH); +		nf_send_unreach(skb, ICMP_HOST_UNREACH);  		break;  	case IPT_ICMP_PROT_UNREACHABLE: -		send_unreach(skb, ICMP_PROT_UNREACH); +		nf_send_unreach(skb, ICMP_PROT_UNREACH);  		break;  	case IPT_ICMP_PORT_UNREACHABLE: -		send_unreach(skb, ICMP_PORT_UNREACH); +		nf_send_unreach(skb, ICMP_PORT_UNREACH);  		break;  	case IPT_ICMP_NET_PROHIBITED: -		send_unreach(skb, ICMP_NET_ANO); +		nf_send_unreach(skb, ICMP_NET_ANO);  		break;  	case IPT_ICMP_HOST_PROHIBITED: -		send_unreach(skb, ICMP_HOST_ANO); +		nf_send_unreach(skb, ICMP_HOST_ANO);  		break;  	case IPT_ICMP_ADMIN_PROHIBITED: -		send_unreach(skb, ICMP_PKT_FILTERED); +		nf_send_unreach(skb, ICMP_PKT_FILTERED);  		break;  	case IPT_TCP_RESET: -		send_reset(skb, par->hooknum); +		nf_send_reset(skb, par->hooknum);  	case IPT_ICMP_ECHOREPLY:  		/* Doesn't happen. 
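   (IPT_ICMP_ECHOREPLY is refused at rule load time by the target's
   checkentry, outside this hunk, so this case only terminates the deliberate
   fall-through from IPT_TCP_RESET above.)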
*/  		break; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c new file mode 100644 index 00000000000..a313c3fbeb4 --- /dev/null +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +static struct iphdr * +synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +{ +	struct iphdr *iph; + +	skb_reset_network_header(skb); +	iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); +	iph->version	= 4; +	iph->ihl	= sizeof(*iph) / 4; +	iph->tos	= 0; +	iph->id		= 0; +	iph->frag_off	= htons(IP_DF); +	iph->ttl	= sysctl_ip_default_ttl; +	iph->protocol	= IPPROTO_TCP; +	iph->check	= 0; +	iph->saddr	= saddr; +	iph->daddr	= daddr; + +	return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, +		  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, +		  struct iphdr *niph, struct tcphdr *nth, +		  unsigned int tcp_hdr_size) +{ +	nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); +	nskb->ip_summed   = CHECKSUM_PARTIAL; +	nskb->csum_start  = (unsigned char *)nth - nskb->head; +	nskb->csum_offset = offsetof(struct tcphdr, check); + +	skb_dst_set_noref(nskb, skb_dst(skb)); +	nskb->protocol = htons(ETH_P_IP); +	if (ip_route_me_harder(nskb, RTN_UNSPEC)) +		goto free_nskb; + +	if (nfct) { +		nskb->nfct = nfct; +		nskb->nfctinfo = ctinfo; +		nf_conntrack_get(nfct); +	} + +	ip_local_out(nskb); +	return; + +free_nskb: +	kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, +			    const struct synproxy_options *opts) +{ +	struct sk_buff *nskb; +	struct iphdr *iph, *niph; +	struct tcphdr *nth; +	unsigned int tcp_hdr_size; +	u16 mss = opts->mss; + +	iph = ip_hdr(skb); + +	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); +	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, +			 GFP_ATOMIC); +	if (nskb == NULL) +		return; +	skb_reserve(nskb, MAX_TCP_HEADER); + +	niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + +	skb_reset_transport_header(nskb); +	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); +	nth->source	= th->dest; +	nth->dest	= th->source; +	nth->seq	= htonl(__cookie_v4_init_sequence(iph, th, &mss)); +	nth->ack_seq	= htonl(ntohl(th->seq) + 1); +	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; +	if (opts->options & XT_SYNPROXY_OPT_ECN) +		tcp_flag_word(nth) |= TCP_FLAG_ECE; +	nth->doff	= tcp_hdr_size / 4; +	nth->window	= 0; +	nth->check	= 0; +	nth->urg_ptr	= 0; + +	synproxy_build_options(nth, opts); + +	synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, +			  niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, +			 const struct sk_buff *skb, const struct tcphdr *th, +			 const struct synproxy_options *opts, u32 recv_seq) +{ +	struct sk_buff *nskb; +	struct iphdr *iph, *niph; +	struct tcphdr *nth; +	unsigned int tcp_hdr_size; + +	iph = ip_hdr(skb); + +	
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); +	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, +			 GFP_ATOMIC); +	if (nskb == NULL) +		return; +	skb_reserve(nskb, MAX_TCP_HEADER); + +	niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + +	skb_reset_transport_header(nskb); +	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); +	nth->source	= th->source; +	nth->dest	= th->dest; +	nth->seq	= htonl(recv_seq - 1); +	/* ack_seq is used to relay our ISN to the synproxy hook to initialize +	 * sequence number translation once a connection tracking entry exists. +	 */ +	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1); +	tcp_flag_word(nth) = TCP_FLAG_SYN; +	if (opts->options & XT_SYNPROXY_OPT_ECN) +		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; +	nth->doff	= tcp_hdr_size / 4; +	nth->window	= th->window; +	nth->check	= 0; +	nth->urg_ptr	= 0; + +	synproxy_build_options(nth, opts); + +	synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, +			  niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, +			 const struct ip_ct_tcp *state, +			 const struct sk_buff *skb, const struct tcphdr *th, +			 const struct synproxy_options *opts) +{ +	struct sk_buff *nskb; +	struct iphdr *iph, *niph; +	struct tcphdr *nth; +	unsigned int tcp_hdr_size; + +	iph = ip_hdr(skb); + +	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); +	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, +			 GFP_ATOMIC); +	if (nskb == NULL) +		return; +	skb_reserve(nskb, MAX_TCP_HEADER); + +	niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + +	skb_reset_transport_header(nskb); +	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); +	nth->source	= th->dest; +	nth->dest	= th->source; +	nth->seq	= htonl(ntohl(th->ack_seq)); +	nth->ack_seq	= htonl(ntohl(th->seq) + 1); +	tcp_flag_word(nth) = TCP_FLAG_ACK; +	nth->doff	= tcp_hdr_size / 4; +	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); +	nth->check	= 0; +	nth->urg_ptr	= 0; + +	synproxy_build_options(nth, opts); + +	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, +			 const struct sk_buff *skb, const struct tcphdr *th, +			 const struct synproxy_options *opts) +{ +	struct sk_buff *nskb; +	struct iphdr *iph, *niph; +	struct tcphdr *nth; +	unsigned int tcp_hdr_size; + +	iph = ip_hdr(skb); + +	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); +	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, +			 GFP_ATOMIC); +	if (nskb == NULL) +		return; +	skb_reserve(nskb, MAX_TCP_HEADER); + +	niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + +	skb_reset_transport_header(nskb); +	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); +	nth->source	= th->source; +	nth->dest	= th->dest; +	nth->seq	= htonl(ntohl(th->seq) + 1); +	nth->ack_seq	= th->ack_seq; +	tcp_flag_word(nth) = TCP_FLAG_ACK; +	nth->doff	= tcp_hdr_size / 4; +	nth->window	= ntohs(htons(th->window) >> opts->wscale); +	nth->check	= 0; +	nth->urg_ptr	= 0; + +	synproxy_build_options(nth, opts); + +	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, +			 const struct sk_buff *skb, const struct tcphdr *th, +			 struct synproxy_options *opts, u32 recv_seq) +{ +	int mss; + +	mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); +	if (mss == 0) { +		this_cpu_inc(snet->stats->cookie_invalid); +		return 
false; +	} + +	this_cpu_inc(snet->stats->cookie_valid); +	opts->mss = mss; +	opts->options |= XT_SYNPROXY_OPT_MSS; + +	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) +		synproxy_check_timestamp_cookie(opts); + +	synproxy_send_server_syn(snet, skb, th, opts, recv_seq); +	return true; +} + +static unsigned int +synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ +	const struct xt_synproxy_info *info = par->targinfo; +	struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); +	struct synproxy_options opts = {}; +	struct tcphdr *th, _th; + +	if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) +		return NF_DROP; + +	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); +	if (th == NULL) +		return NF_DROP; + +	if (!synproxy_parse_options(skb, par->thoff, th, &opts)) +		return NF_DROP; + +	if (th->syn && !(th->ack || th->fin || th->rst)) { +		/* Initial SYN from client */ +		this_cpu_inc(snet->stats->syn_received); + +		if (th->ece && th->cwr) +			opts.options |= XT_SYNPROXY_OPT_ECN; + +		opts.options &= info->options; +		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) +			synproxy_init_timestamp_cookie(info, &opts); +		else +			opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | +					  XT_SYNPROXY_OPT_SACK_PERM | +					  XT_SYNPROXY_OPT_ECN); + +		synproxy_send_client_synack(skb, th, &opts); +		return NF_DROP; + +	} else if (th->ack && !(th->fin || th->rst || th->syn)) { +		/* ACK from client */ +		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); +		return NF_DROP; +	} + +	return XT_CONTINUE; +} + +static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops, +				       struct sk_buff *skb, +				       const struct net_device *in, +				       const struct net_device *out, +				       int (*okfn)(struct sk_buff *)) +{ +	struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct; +	struct nf_conn_synproxy *synproxy; +	struct synproxy_options opts = {}; +	const struct ip_ct_tcp *state; +	struct tcphdr *th, _th; +	unsigned int thoff; + +	ct = nf_ct_get(skb, &ctinfo); +	if (ct == NULL) +		return NF_ACCEPT; + +	synproxy = nfct_synproxy(ct); +	if (synproxy == NULL) +		return NF_ACCEPT; + +	if (nf_is_loopback_packet(skb)) +		return NF_ACCEPT; + +	thoff = ip_hdrlen(skb); +	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); +	if (th == NULL) +		return NF_DROP; + +	state = &ct->proto.tcp; +	switch (state->state) { +	case TCP_CONNTRACK_CLOSE: +		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { +			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - +						      ntohl(th->seq) + 1); +			break; +		} + +		if (!th->syn || th->ack || +		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) +			break; + +		/* Reopened connection - reset the sequence number and timestamp +		 * adjustments, they will get initialized once the connection is +		 * reestablished. +		 */ +		nf_ct_seqadj_init(ct, ctinfo, 0); +		synproxy->tsoff = 0; +		this_cpu_inc(snet->stats->conn_reopened); + +		/* fall through */ +	case TCP_CONNTRACK_SYN_SENT: +		if (!synproxy_parse_options(skb, thoff, th, &opts)) +			return NF_DROP; + +		if (!th->syn && th->ack && +		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { +			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, +			 * therefore we need to add 1 to make the SYN sequence +			 * number match the one of first SYN. 
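			 * For orientation, the relay implemented by this file as a whole
			 * (summary only, not wording from the patch):
			 *   1. client SYN:  synproxy_tg4() answers with a SYN/ACK whose ISN
			 *      is a syncookie and whose window is zero, then drops the SYN,
			 *      so no state is created for spoofed senders.
			 *   2. client ACK:  the cookie is validated via __cookie_v4_check(),
			 *      the MSS is recovered from it, and a SYN carrying the client's
			 *      original sequence number is forged towards the real server.
			 *   3. server SYN/ACK:  this hook ACKs the server, sends the client
			 *      a window-update ACK so it may start transmitting, and seeds
			 *      nf_ct_seqadj() with (cookie ISN - server ISN) so that later
			 *      segments are translated between the two sequence spaces.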
+			 */ +			if (synproxy_recv_client_ack(snet, skb, th, &opts, +						     ntohl(th->seq) + 1)) +				this_cpu_inc(snet->stats->cookie_retrans); + +			return NF_DROP; +		} + +		synproxy->isn = ntohl(th->ack_seq); +		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) +			synproxy->its = opts.tsecr; +		break; +	case TCP_CONNTRACK_SYN_RECV: +		if (!th->syn || !th->ack) +			break; + +		if (!synproxy_parse_options(skb, thoff, th, &opts)) +			return NF_DROP; + +		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) +			synproxy->tsoff = opts.tsval - synproxy->its; + +		opts.options &= ~(XT_SYNPROXY_OPT_MSS | +				  XT_SYNPROXY_OPT_WSCALE | +				  XT_SYNPROXY_OPT_SACK_PERM); + +		swap(opts.tsval, opts.tsecr); +		synproxy_send_server_ack(snet, state, skb, th, &opts); + +		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + +		swap(opts.tsval, opts.tsecr); +		synproxy_send_client_ack(snet, skb, th, &opts); + +		consume_skb(skb); +		return NF_STOLEN; +	default: +		break; +	} + +	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); +	return NF_ACCEPT; +} + +static int synproxy_tg4_check(const struct xt_tgchk_param *par) +{ +	const struct ipt_entry *e = par->entryinfo; + +	if (e->ip.proto != IPPROTO_TCP || +	    e->ip.invflags & XT_INV_PROTO) +		return -EINVAL; + +	return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ +	nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg4_reg __read_mostly = { +	.name		= "SYNPROXY", +	.family		= NFPROTO_IPV4, +	.hooks		= (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), +	.target		= synproxy_tg4, +	.targetsize	= sizeof(struct xt_synproxy_info), +	.checkentry	= synproxy_tg4_check, +	.destroy	= synproxy_tg4_destroy, +	.me		= THIS_MODULE, +}; + +static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { +	{ +		.hook		= ipv4_synproxy_hook, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_LOCAL_IN, +		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1, +	}, +	{ +		.hook		= ipv4_synproxy_hook, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_POST_ROUTING, +		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1, +	}, +}; + +static int __init synproxy_tg4_init(void) +{ +	int err; + +	err = nf_register_hooks(ipv4_synproxy_ops, +				ARRAY_SIZE(ipv4_synproxy_ops)); +	if (err < 0) +		goto err1; + +	err = xt_register_target(&synproxy_tg4_reg); +	if (err < 0) +		goto err2; + +	return 0; + +err2: +	nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +err1: +	return err; +} + +static void __exit synproxy_tg4_exit(void) +{ +	xt_unregister_target(&synproxy_tg4_reg); +	nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +} + +module_init(synproxy_tg4_init); +module_exit(synproxy_tg4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 446e0f467a1..9cb993cd224 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -4,6 +4,7 @@   * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>   * (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2005-2007 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -37,7 +38,7 @@  #include <linux/skbuff.h>  #include <linux/kernel.h>  #include 
<linux/timer.h> -#include <linux/netlink.h> +#include <net/netlink.h>  #include <linux/netdevice.h>  #include <linux/mm.h>  #include <linux/moduleparam.h> @@ -45,6 +46,7 @@  #include <linux/netfilter/x_tables.h>  #include <linux/netfilter_ipv4/ipt_ULOG.h>  #include <net/netfilter/nf_log.h> +#include <net/netns/generic.h>  #include <net/sock.h>  #include <linux/bitops.h>  #include <asm/unaligned.h> @@ -65,7 +67,7 @@ static unsigned int flushtimeout = 10;  module_param(flushtimeout, uint, 0600);  MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); -static int nflog = 1; +static bool nflog = true;  module_param(nflog, bool, 0400);  MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); @@ -78,20 +80,26 @@ typedef struct {  	struct timer_list timer;	/* the timer function */  } ulog_buff_t; -static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];	/* array of buffers */ +static int ulog_net_id __read_mostly; +struct ulog_net { +	unsigned int nlgroup[ULOG_MAXNLGROUPS]; +	ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; +	struct sock *nflognl; +	spinlock_t lock; +}; -static struct sock *nflognl;		/* our socket */ -static DEFINE_SPINLOCK(ulog_lock);	/* spinlock */ +static struct ulog_net *ulog_pernet(struct net *net) +{ +	return net_generic(net, ulog_net_id); +}  /* send one ulog_buff_t to userspace */ -static void ulog_send(unsigned int nlgroupnum) +static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)  { -	ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; +	ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum]; -	if (timer_pending(&ub->timer)) { -		pr_debug("ulog_send: timer was pending, deleting\n"); -		del_timer(&ub->timer); -	} +	pr_debug("ulog_send: timer is deleting\n"); +	del_timer(&ub->timer);  	if (!ub->skb) {  		pr_debug("ulog_send: nothing to send\n"); @@ -105,7 +113,8 @@ static void ulog_send(unsigned int nlgroupnum)  	NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;  	pr_debug("throwing %d packets to netlink group %u\n",  		 ub->qlen, nlgroupnum + 1); -	netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); +	netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1, +			  GFP_ATOMIC);  	ub->qlen = 0;  	ub->skb = NULL; @@ -116,13 +125,17 @@ static void ulog_send(unsigned int nlgroupnum)  /* timer function to flush queue in flushtimeout time */  static void ulog_timer(unsigned long data)  { +	unsigned int groupnum = *((unsigned int *)data); +	struct ulog_net *ulog = container_of((void *)data, +					     struct ulog_net, +					     nlgroup[groupnum]);  	pr_debug("timer function called, calling ulog_send\n");  	/* lock to protect against somebody modifying our structure  	 * from ipt_ulog_target at the same time */ -	spin_lock_bh(&ulog_lock); -	ulog_send(data); -	spin_unlock_bh(&ulog_lock); +	spin_lock_bh(&ulog->lock); +	ulog_send(ulog, groupnum); +	spin_unlock_bh(&ulog->lock);  }  static struct sk_buff *ulog_alloc_skb(unsigned int size) @@ -135,10 +148,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)  	 * due to slab allocator restrictions */  	n = max(size, nlbufsiz); -	skb = alloc_skb(n, GFP_ATOMIC); +	skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);  	if (!skb) { -		pr_debug("cannot alloc whole buffer %ub!\n", n); -  		if (n > size) {  			/* try to allocate only as much as we need for  			 * current packet */ @@ -152,7 +163,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)  	return skb;  } -static void ipt_ulog_packet(unsigned int hooknum, +static void ipt_ulog_packet(struct net *net, +			    unsigned 
int hooknum,  			    const struct sk_buff *skb,  			    const struct net_device *in,  			    const struct net_device *out, @@ -164,6 +176,7 @@ static void ipt_ulog_packet(unsigned int hooknum,  	size_t size, copy_len;  	struct nlmsghdr *nlh;  	struct timeval tv; +	struct ulog_net *ulog = ulog_pernet(net);  	/* ffs == find first bit set, necessary because userspace  	 * is already shifting groupnumber, but we need unshifted. @@ -176,11 +189,11 @@ static void ipt_ulog_packet(unsigned int hooknum,  	else  		copy_len = loginfo->copy_range; -	size = NLMSG_SPACE(sizeof(*pm) + copy_len); +	size = nlmsg_total_size(sizeof(*pm) + copy_len); -	ub = &ulog_buffers[groupnum]; +	ub = &ulog->ulog_buffers[groupnum]; -	spin_lock_bh(&ulog_lock); +	spin_lock_bh(&ulog->lock);  	if (!ub->skb) {  		if (!(ub->skb = ulog_alloc_skb(size))) @@ -190,7 +203,7 @@ static void ipt_ulog_packet(unsigned int hooknum,  		/* either the queue len is too high or we don't have  		 * enough room in nlskb left. send it to userspace. */ -		ulog_send(groupnum); +		ulog_send(ulog, groupnum);  		if (!(ub->skb = ulog_alloc_skb(size)))  			goto alloc_failure; @@ -198,12 +211,16 @@ static void ipt_ulog_packet(unsigned int hooknum,  	pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); -	/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ -	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, -			sizeof(*pm)+copy_len); +	nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, +			sizeof(*pm)+copy_len, 0); +	if (!nlh) { +		pr_debug("error during nlmsg_put\n"); +		goto out_unlock; +	}  	ub->qlen++; -	pm = NLMSG_DATA(nlh); +	pm = nlmsg_data(nlh); +	memset(pm, 0, sizeof(*pm));  	/* We might not have a timestamp, get one */  	if (skb->tstamp.tv64 == 0) @@ -216,12 +233,12 @@ static void ipt_ulog_packet(unsigned int hooknum,  	put_unaligned(tv.tv_usec, &pm->timestamp_usec);  	put_unaligned(skb->mark, &pm->mark);  	pm->hook = hooknum; -	if (prefix != NULL) -		strncpy(pm->prefix, prefix, sizeof(pm->prefix)); +	if (prefix != NULL) { +		strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); +		pm->prefix[sizeof(pm->prefix) - 1] = '\0'; +	}  	else if (loginfo->prefix[0] != '\0')  		strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); -	else -		*(pm->prefix) = '\0';  	if (in && in->hard_header_len > 0 &&  	    skb->mac_header != skb->network_header && @@ -233,13 +250,9 @@ static void ipt_ulog_packet(unsigned int hooknum,  	if (in)  		strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); -	else -		pm->indev_name[0] = '\0';  	if (out)  		strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); -	else -		pm->outdev_name[0] = '\0';  	/* copy_len <= skb->len, so can't fail. */  	if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) @@ -261,29 +274,30 @@ static void ipt_ulog_packet(unsigned int hooknum,  	if (ub->qlen >= loginfo->qthreshold) {  		if (loginfo->qthreshold > 1)  			nlh->nlmsg_type = NLMSG_DONE; -		ulog_send(groupnum); +		ulog_send(ulog, groupnum);  	} - -	spin_unlock_bh(&ulog_lock); +out_unlock: +	spin_unlock_bh(&ulog->lock);  	return; -nlmsg_failure: -	pr_debug("error during NLMSG_PUT\n");  alloc_failure:  	pr_debug("Error building netlink message\n"); -	spin_unlock_bh(&ulog_lock); +	spin_unlock_bh(&ulog->lock);  }  static unsigned int  ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)  { -	ipt_ulog_packet(par->hooknum, skb, par->in, par->out, +	struct net *net = dev_net(par->in ? 
par->in : par->out); + +	ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,  	                par->targinfo, NULL);  	return XT_CONTINUE;  } -static void ipt_logfn(u_int8_t pf, +static void ipt_logfn(struct net *net, +		      u_int8_t pf,  		      unsigned int hooknum,  		      const struct sk_buff *skb,  		      const struct net_device *in, @@ -305,13 +319,19 @@ static void ipt_logfn(u_int8_t pf,  		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));  	} -	ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +	ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);  }  static int ulog_tg_check(const struct xt_tgchk_param *par)  {  	const struct ipt_ulog_info *loginfo = par->targinfo; +	if (!par->net->xt.ulog_warn_deprecated) { +		pr_info("ULOG is deprecated and it will be removed soon, " +			"use NFLOG instead\n"); +		par->net->xt.ulog_warn_deprecated = true; +	} +  	if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {  		pr_debug("prefix not null-terminated\n");  		return -EINVAL; @@ -379,57 +399,48 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {  	.me		= THIS_MODULE,  }; -static int __init ulog_tg_init(void) +static int __net_init ulog_tg_net_init(struct net *net)  { -	int ret, i; - -	pr_debug("init module\n"); - -	if (nlbufsiz > 128*1024) { -		pr_warning("Netlink buffer has to be <= 128kB\n"); -		return -EINVAL; -	} +	int i; +	struct ulog_net *ulog = ulog_pernet(net); +	struct netlink_kernel_cfg cfg = { +		.groups	= ULOG_MAXNLGROUPS, +	}; +	spin_lock_init(&ulog->lock);  	/* initialize ulog_buffers */ -	for (i = 0; i < ULOG_MAXNLGROUPS; i++) -		setup_timer(&ulog_buffers[i].timer, ulog_timer, i); +	for (i = 0; i < ULOG_MAXNLGROUPS; i++) { +		ulog->nlgroup[i] = i; +		setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer, +			    (unsigned long)&ulog->nlgroup[i]); +	} -	nflognl = netlink_kernel_create(&init_net, -					NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, -					NULL, THIS_MODULE); -	if (!nflognl) +	ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg); +	if (!ulog->nflognl)  		return -ENOMEM; -	ret = xt_register_target(&ulog_tg_reg); -	if (ret < 0) { -		netlink_kernel_release(nflognl); -		return ret; -	}  	if (nflog) -		nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); +		nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);  	return 0;  } -static void __exit ulog_tg_exit(void) +static void __net_exit ulog_tg_net_exit(struct net *net)  {  	ulog_buff_t *ub;  	int i; - -	pr_debug("cleanup_module\n"); +	struct ulog_net *ulog = ulog_pernet(net);  	if (nflog) -		nf_log_unregister(&ipt_ulog_logger); -	xt_unregister_target(&ulog_tg_reg); -	netlink_kernel_release(nflognl); +		nf_log_unset(net, &ipt_ulog_logger); + +	netlink_kernel_release(ulog->nflognl);  	/* remove pending timers and free allocated skb's */  	for (i = 0; i < ULOG_MAXNLGROUPS; i++) { -		ub = &ulog_buffers[i]; -		if (timer_pending(&ub->timer)) { -			pr_debug("timer was pending, deleting\n"); -			del_timer(&ub->timer); -		} +		ub = &ulog->ulog_buffers[i]; +		pr_debug("timer is deleting\n"); +		del_timer(&ub->timer);  		if (ub->skb) {  			kfree_skb(ub->skb); @@ -438,5 +449,50 @@ static void __exit ulog_tg_exit(void)  	}  } +static struct pernet_operations ulog_tg_net_ops = { +	.init = ulog_tg_net_init, +	.exit = ulog_tg_net_exit, +	.id   = &ulog_net_id, +	.size = sizeof(struct ulog_net), +}; + +static int __init ulog_tg_init(void) +{ +	int ret; +	pr_debug("init module\n"); + +	if (nlbufsiz > 128*1024) { +		pr_warn("Netlink buffer has to be <= 128kB\n"); +		return -EINVAL; +	} + +	ret = 
register_pernet_subsys(&ulog_tg_net_ops); +	if (ret) +		goto out_pernet; + +	ret = xt_register_target(&ulog_tg_reg); +	if (ret < 0) +		goto out_target; + +	if (nflog) +		nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); + +	return 0; + +out_target: +	unregister_pernet_subsys(&ulog_tg_net_ops); +out_pernet: +	return ret; +} + +static void __exit ulog_tg_exit(void) +{ +	pr_debug("cleanup_module\n"); +	if (nflog) +		nf_log_unregister(&ipt_ulog_logger); +	xt_unregister_target(&ulog_tg_reg); +	unregister_pernet_subsys(&ulog_tg_net_ops); +} +  module_init(ulog_tg_init);  module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c deleted file mode 100644 index db8bff0fb86..00000000000 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - *  iptables module to match inet_addr_type() of an ip. - * - *  Copyright (c) 2004 Patrick McHardy <kaber@trash.net> - *  (C) 2007 Laszlo Attila Toth <panther@balabit.hu> - * - *  This program is free software; you can redistribute it and/or modify - *  it under the terms of the GNU General Public License version 2 as - *  published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4/ipt_addrtype.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables: address type match for IPv4"); - -static inline bool match_type(struct net *net, const struct net_device *dev, -			      __be32 addr, u_int16_t mask) -{ -	return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); -} - -static bool -addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) -{ -	struct net *net = dev_net(par->in ? par->in : par->out); -	const struct ipt_addrtype_info *info = par->matchinfo; -	const struct iphdr *iph = ip_hdr(skb); -	bool ret = true; - -	if (info->source) -		ret &= match_type(net, NULL, iph->saddr, info->source) ^ -		       info->invert_source; -	if (info->dest) -		ret &= match_type(net, NULL, iph->daddr, info->dest) ^ -		       info->invert_dest; - -	return ret; -} - -static bool -addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) -{ -	struct net *net = dev_net(par->in ? 
par->in : par->out); -	const struct ipt_addrtype_info_v1 *info = par->matchinfo; -	const struct iphdr *iph = ip_hdr(skb); -	const struct net_device *dev = NULL; -	bool ret = true; - -	if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) -		dev = par->in; -	else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) -		dev = par->out; - -	if (info->source) -		ret &= match_type(net, dev, iph->saddr, info->source) ^ -		       (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); -	if (ret && info->dest) -		ret &= match_type(net, dev, iph->daddr, info->dest) ^ -		       !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); -	return ret; -} - -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) -{ -	struct ipt_addrtype_info_v1 *info = par->matchinfo; - -	if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && -	    info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { -		pr_info("both incoming and outgoing " -			"interface limitation cannot be selected\n"); -		return -EINVAL; -	} - -	if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | -	    (1 << NF_INET_LOCAL_IN)) && -	    info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { -		pr_info("output interface limitation " -			"not valid in PREROUTING and INPUT\n"); -		return -EINVAL; -	} - -	if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | -	    (1 << NF_INET_LOCAL_OUT)) && -	    info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { -		pr_info("input interface limitation " -			"not valid in POSTROUTING and OUTPUT\n"); -		return -EINVAL; -	} - -	return 0; -} - -static struct xt_match addrtype_mt_reg[] __read_mostly = { -	{ -		.name		= "addrtype", -		.family		= NFPROTO_IPV4, -		.match		= addrtype_mt_v0, -		.matchsize	= sizeof(struct ipt_addrtype_info), -		.me		= THIS_MODULE -	}, -	{ -		.name		= "addrtype", -		.family		= NFPROTO_IPV4, -		.revision	= 1, -		.match		= addrtype_mt_v1, -		.checkentry	= addrtype_mt_checkentry_v1, -		.matchsize	= sizeof(struct ipt_addrtype_info_v1), -		.me		= THIS_MODULE -	} -}; - -static int __init addrtype_mt_init(void) -{ -	return xt_register_matches(addrtype_mt_reg, -				   ARRAY_SIZE(addrtype_mt_reg)); -} - -static void __exit addrtype_mt_exit(void) -{ -	xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); -} - -module_init(addrtype_mt_init); -module_exit(addrtype_mt_exit); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c deleted file mode 100644 index af6e9c77834..00000000000 --- a/net/ipv4/netfilter/ipt_ecn.c +++ /dev/null @@ -1,128 +0,0 @@ -/* IP tables module for matching the value of the IPv4 and TCP ECN bits - * - * (C) 2002 by Harald Welte <laforge@gnumonks.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/in.h> -#include <linux/ip.h> -#include <net/ip.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/tcp.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ipt_ecn.h> - -MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4"); -MODULE_LICENSE("GPL"); - -static inline bool match_ip(const struct sk_buff *skb, -			    const struct ipt_ecn_info *einfo) -{ -	return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; -} - -static inline bool match_tcp(const struct sk_buff *skb, -			     const struct ipt_ecn_info *einfo, -			     bool *hotdrop) -{ -	struct tcphdr _tcph; -	const struct tcphdr *th; - -	/* In practice, TCP match does this, so can't fail.  But let's -	 * be good citizens. -	 */ -	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); -	if (th == NULL) { -		*hotdrop = false; -		return false; -	} - -	if (einfo->operation & IPT_ECN_OP_MATCH_ECE) { -		if (einfo->invert & IPT_ECN_OP_MATCH_ECE) { -			if (th->ece == 1) -				return false; -		} else { -			if (th->ece == 0) -				return false; -		} -	} - -	if (einfo->operation & IPT_ECN_OP_MATCH_CWR) { -		if (einfo->invert & IPT_ECN_OP_MATCH_CWR) { -			if (th->cwr == 1) -				return false; -		} else { -			if (th->cwr == 0) -				return false; -		} -	} - -	return true; -} - -static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par) -{ -	const struct ipt_ecn_info *info = par->matchinfo; - -	if (info->operation & IPT_ECN_OP_MATCH_IP) -		if (!match_ip(skb, info)) -			return false; - -	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { -		if (ip_hdr(skb)->protocol != IPPROTO_TCP) -			return false; -		if (!match_tcp(skb, info, &par->hotdrop)) -			return false; -	} - -	return true; -} - -static int ecn_mt_check(const struct xt_mtchk_param *par) -{ -	const struct ipt_ecn_info *info = par->matchinfo; -	const struct ipt_ip *ip = par->entryinfo; - -	if (info->operation & IPT_ECN_OP_MATCH_MASK) -		return -EINVAL; - -	if (info->invert & IPT_ECN_OP_MATCH_MASK) -		return -EINVAL; - -	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && -	    ip->proto != IPPROTO_TCP) { -		pr_info("cannot match TCP bits in rule for non-tcp packets\n"); -		return -EINVAL; -	} - -	return 0; -} - -static struct xt_match ecn_mt_reg __read_mostly = { -	.name		= "ecn", -	.family		= NFPROTO_IPV4, -	.match		= ecn_mt, -	.matchsize	= sizeof(struct ipt_ecn_info), -	.checkentry	= ecn_mt_check, -	.me		= THIS_MODULE, -}; - -static int __init ecn_mt_init(void) -{ -	return xt_register_match(&ecn_mt_reg); -} - -static void __exit ecn_mt_exit(void) -{ -	xt_unregister_match(&ecn_mt_reg); -} - -module_init(ecn_mt_init); -module_exit(ecn_mt_exit); diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c new file mode 100644 index 00000000000..4bfaedf9b34 --- /dev/null +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2011 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <net/ip_fib.h> +#include <net/route.h> + +#include <linux/netfilter/xt_rpfilter.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match"); + +/* don't try to find route from mcast/bcast/zeronet */ +static __be32 rpfilter_get_saddr(__be32 addr) +{ +	if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || +	    ipv4_is_zeronet(addr)) +		return 0; +	return addr; +} + +static bool rpfilter_lookup_reverse(struct flowi4 *fl4, +				const struct net_device *dev, u8 flags) +{ +	struct fib_result res; +	bool dev_match; +	struct net *net = dev_net(dev); +	int ret __maybe_unused; + +	if (fib_lookup(net, fl4, &res)) +		return false; + +	if (res.type != RTN_UNICAST) { +		if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL)) +			return false; +	} +	dev_match = false; +#ifdef CONFIG_IP_ROUTE_MULTIPATH +	for (ret = 0; ret < res.fi->fib_nhs; ret++) { +		struct fib_nh *nh = &res.fi->fib_nh[ret]; + +		if (nh->nh_dev == dev) { +			dev_match = true; +			break; +		} +	} +#else +	if (FIB_RES_DEV(res) == dev) +		dev_match = true; +#endif +	if (dev_match || flags & XT_RPFILTER_LOOSE) +		return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; +	return dev_match; +} + +static bool rpfilter_is_local(const struct sk_buff *skb) +{ +	const struct rtable *rt = skb_rtable(skb); +	return rt && (rt->rt_flags & RTCF_LOCAL); +} + +static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ +	const struct xt_rpfilter_info *info; +	const struct iphdr *iph; +	struct flowi4 flow; +	bool invert; + +	info = par->matchinfo; +	invert = info->flags & XT_RPFILTER_INVERT; + +	if (rpfilter_is_local(skb)) +		return true ^ invert; + +	iph = ip_hdr(skb); +	if (ipv4_is_multicast(iph->daddr)) { +		if (ipv4_is_zeronet(iph->saddr)) +			return ipv4_is_local_multicast(iph->daddr) ^ invert; +	} +	flow.flowi4_iif = LOOPBACK_IFINDEX; +	flow.daddr = iph->saddr; +	flow.saddr = rpfilter_get_saddr(iph->daddr); +	flow.flowi4_oif = 0; +	flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; +	flow.flowi4_tos = RT_TOS(iph->tos); +	flow.flowi4_scope = RT_SCOPE_UNIVERSE; + +	return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert; +} + +static int rpfilter_check(const struct xt_mtchk_param *par) +{ +	const struct xt_rpfilter_info *info = par->matchinfo; +	unsigned int options = ~XT_RPFILTER_OPTION_MASK; +	if (info->flags & options) { +		pr_info("unknown options encountered"); +		return -EINVAL; +	} + +	if (strcmp(par->table, "mangle") != 0 && +	    strcmp(par->table, "raw") != 0) { +		pr_info("match only valid in the \'raw\' " +			"or \'mangle\' tables, not \'%s\'.\n", par->table); +		return -EINVAL; +	} + +	return 0; +} + +static struct xt_match rpfilter_mt_reg __read_mostly = { +	.name		= "rpfilter", +	.family		= NFPROTO_IPV4, +	.checkentry	= rpfilter_check, +	.match		= rpfilter_mt, +	.matchsize	= sizeof(struct xt_rpfilter_info), +	.hooks		= (1 << NF_INET_PRE_ROUTING), +	.me		= THIS_MODULE +}; + +static int __init rpfilter_mt_init(void) +{ +	return xt_register_match(&rpfilter_mt_reg); +} + +static void __exit rpfilter_mt_exit(void) +{ +	xt_unregister_match(&rpfilter_mt_reg); +} + +module_init(rpfilter_mt_init); +module_exit(rpfilter_mt_exit); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index c37641e819f..e08a74a243a 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -33,26 +33,27 @@ static const struct xt_table packet_filter = {  };  static unsigned int -iptable_filter_hook(unsigned int hook, struct sk_buff *skb, +iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		    const struct net_device *in, const struct net_device *out,  		    int (*okfn)(struct sk_buff *))  {  	const struct net *net; -	if (hook == NF_INET_LOCAL_OUT && +	if (ops->hooknum == NF_INET_LOCAL_OUT &&  	    (skb->len < sizeof(struct iphdr) ||  	     ip_hdrlen(skb) < sizeof(struct iphdr)))  		/* root is playing with raw sockets. */  		return NF_ACCEPT;  	net = dev_net((in != NULL) ? in : out); -	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); +	return ipt_do_table(skb, ops->hooknum, in, out, +			    net->ipv4.iptable_filter);  }  static struct nf_hook_ops *filter_ops __read_mostly;  /* Default to forward because I got too much mail already. */ -static int forward = NF_ACCEPT; +static bool forward = true;  module_param(forward, bool, 0000);  static int __net_init iptable_filter_net_init(struct net *net) @@ -64,14 +65,12 @@ static int __net_init iptable_filter_net_init(struct net *net)  		return -ENOMEM;  	/* Entry 1 is the FORWARD hook */  	((struct ipt_standard *)repl->entries)[1].target.verdict = -		-forward - 1; +		forward ? 
-NF_ACCEPT - 1 : -NF_DROP - 1;  	net->ipv4.iptable_filter =  		ipt_register_table(net, &packet_filter, repl);  	kfree(repl); -	if (IS_ERR(net->ipv4.iptable_filter)) -		return PTR_ERR(net->ipv4.iptable_filter); -	return 0; +	return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);  }  static void __net_exit iptable_filter_net_exit(struct net *net) @@ -88,11 +87,6 @@ static int __init iptable_filter_init(void)  {  	int ret; -	if (forward < 0 || forward > NF_MAX_VERDICT) { -		pr_err("iptables forward must be 0 or 1\n"); -		return -EINVAL; -	} -  	ret = register_pernet_subsys(&iptable_filter_net_ops);  	if (ret < 0)  		return ret; @@ -101,14 +95,10 @@ static int __init iptable_filter_init(void)  	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);  	if (IS_ERR(filter_ops)) {  		ret = PTR_ERR(filter_ops); -		goto cleanup_table; +		unregister_pernet_subsys(&iptable_filter_net_ops);  	}  	return ret; - - cleanup_table: -	unregister_pernet_subsys(&iptable_filter_net_ops); -	return ret;  }  static void __exit iptable_filter_fini(void) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 294a2a32f29..6a5079c34bb 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -44,6 +44,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)  	u_int8_t tos;  	__be32 saddr, daddr;  	u_int32_t mark; +	int err;  	/* root is playing with raw sockets. */  	if (skb->len < sizeof(struct iphdr) || @@ -60,15 +61,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)  	ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,  			   dev_net(out)->ipv4.iptable_mangle);  	/* Reroute for ANY change. */ -	if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { +	if (ret != NF_DROP && ret != NF_STOLEN) {  		iph = ip_hdr(skb);  		if (iph->saddr != saddr ||  		    iph->daddr != daddr ||  		    skb->mark != mark || -		    iph->tos != tos) -			if (ip_route_me_harder(skb, RTN_UNSPEC)) -				ret = NF_DROP; +		    iph->tos != tos) { +			err = ip_route_me_harder(skb, RTN_UNSPEC); +			if (err < 0) +				ret = NF_DROP_ERR(err); +		}  	}  	return ret; @@ -76,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)  /* The work comes in here from netfilter.c. 
*/  static unsigned int -iptable_mangle_hook(unsigned int hook, +iptable_mangle_hook(const struct nf_hook_ops *ops,  		     struct sk_buff *skb,  		     const struct net_device *in,  		     const struct net_device *out,  		     int (*okfn)(struct sk_buff *))  { -	if (hook == NF_INET_LOCAL_OUT) +	if (ops->hooknum == NF_INET_LOCAL_OUT)  		return ipt_mangle_out(skb, out); -	if (hook == NF_INET_POST_ROUTING) -		return ipt_do_table(skb, hook, in, out, +	if (ops->hooknum == NF_INET_POST_ROUTING) +		return ipt_do_table(skb, ops->hooknum, in, out,  				    dev_net(out)->ipv4.iptable_mangle);  	/* PREROUTING/INPUT/FORWARD: */ -	return ipt_do_table(skb, hook, in, out, +	return ipt_do_table(skb, ops->hooknum, in, out,  			    dev_net(in)->ipv4.iptable_mangle);  } @@ -104,9 +107,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)  	net->ipv4.iptable_mangle =  		ipt_register_table(net, &packet_mangler, repl);  	kfree(repl); -	if (IS_ERR(net->ipv4.iptable_mangle)) -		return PTR_ERR(net->ipv4.iptable_mangle); -	return 0; +	return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);  }  static void __net_exit iptable_mangle_net_exit(struct net *net) @@ -131,14 +132,10 @@ static int __init iptable_mangle_init(void)  	mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);  	if (IS_ERR(mangle_ops)) {  		ret = PTR_ERR(mangle_ops); -		goto cleanup_table; +		unregister_pernet_subsys(&iptable_mangle_net_ops);  	}  	return ret; - - cleanup_table: -	unregister_pernet_subsys(&iptable_mangle_net_ops); -	return ret;  }  static void __exit iptable_mangle_fini(void) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c new file mode 100644 index 00000000000..f1787c04a4d --- /dev/null +++ b/net/ipv4/netfilter/iptable_nat.c @@ -0,0 +1,328 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/ip.h> +#include <net/ip.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> + +static const struct xt_table nf_nat_ipv4_table = { +	.name		= "nat", +	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) | +			  (1 << NF_INET_POST_ROUTING) | +			  (1 << NF_INET_LOCAL_OUT) | +			  (1 << NF_INET_LOCAL_IN), +	.me		= THIS_MODULE, +	.af		= NFPROTO_IPV4, +}; + +static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ +	/* Force range to this IP; let proto decide mapping for +	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). +	 */ +	struct nf_nat_range range; + +	range.flags = 0; +	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, +		 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? 
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : +		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); + +	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); +} + +static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, +				     const struct net_device *in, +				     const struct net_device *out, +				     struct nf_conn *ct) +{ +	struct net *net = nf_ct_net(ct); +	unsigned int ret; + +	ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); +	if (ret == NF_ACCEPT) { +		if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) +			ret = alloc_null_binding(ct, hooknum); +	} +	return ret; +} + +static unsigned int +nf_nat_ipv4_fn(const struct nf_hook_ops *ops, +	       struct sk_buff *skb, +	       const struct net_device *in, +	       const struct net_device *out, +	       int (*okfn)(struct sk_buff *)) +{ +	struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; +	struct nf_conn_nat *nat; +	/* maniptype == SRC for postrouting. */ +	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + +	/* We never see fragments: conntrack defrags on pre-routing +	 * and local-out, and nf_nat_out protects post-routing. +	 */ +	NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb))); + +	ct = nf_ct_get(skb, &ctinfo); +	/* Can't track?  It's not due to stress, or conntrack would +	 * have dropped it.  Hence it's the user's responsibilty to +	 * packet filter it out, or implement conntrack/NAT for that +	 * protocol. 8) --RR +	 */ +	if (!ct) +		return NF_ACCEPT; + +	/* Don't try to NAT if this packet is not conntracked */ +	if (nf_ct_is_untracked(ct)) +		return NF_ACCEPT; + +	nat = nf_ct_nat_ext_add(ct); +	if (nat == NULL) +		return NF_ACCEPT; + +	switch (ctinfo) { +	case IP_CT_RELATED: +	case IP_CT_RELATED_REPLY: +		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { +			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, +							   ops->hooknum)) +				return NF_DROP; +			else +				return NF_ACCEPT; +		} +		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ +	case IP_CT_NEW: +		/* Seen it before?  This can happen for loopback, retrans, +		 * or local packets. +		 */ +		if (!nf_nat_initialized(ct, maniptype)) { +			unsigned int ret; + +			ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); +			if (ret != NF_ACCEPT) +				return ret; +		} else { +			pr_debug("Already setup manip %s for ct %p\n", +				 maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", +				 ct); +			if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) +				goto oif_changed; +		} +		break; + +	default: +		/* ESTABLISHED */ +		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || +			     ctinfo == IP_CT_ESTABLISHED_REPLY); +		if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) +			goto oif_changed; +	} + +	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: +	nf_ct_kill_acct(ct, ctinfo, skb); +	return NF_DROP; +} + +static unsigned int +nf_nat_ipv4_in(const struct nf_hook_ops *ops, +	       struct sk_buff *skb, +	       const struct net_device *in, +	       const struct net_device *out, +	       int (*okfn)(struct sk_buff *)) +{ +	unsigned int ret; +	__be32 daddr = ip_hdr(skb)->daddr; + +	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); +	if (ret != NF_DROP && ret != NF_STOLEN && +	    daddr != ip_hdr(skb)->daddr) +		skb_dst_drop(skb); + +	return ret; +} + +static unsigned int +nf_nat_ipv4_out(const struct nf_hook_ops *ops, +		struct sk_buff *skb, +		const struct net_device *in, +		const struct net_device *out, +		int (*okfn)(struct sk_buff *)) +{ +#ifdef CONFIG_XFRM +	const struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; +	int err; +#endif +	unsigned int ret; + +	/* root is playing with raw sockets. */ +	if (skb->len < sizeof(struct iphdr) || +	    ip_hdrlen(skb) < sizeof(struct iphdr)) +		return NF_ACCEPT; + +	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM +	if (ret != NF_DROP && ret != NF_STOLEN && +	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && +	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { +		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + +		if ((ct->tuplehash[dir].tuple.src.u3.ip != +		     ct->tuplehash[!dir].tuple.dst.u3.ip) || +		    (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && +		     ct->tuplehash[dir].tuple.src.u.all != +		     ct->tuplehash[!dir].tuple.dst.u.all)) { +			err = nf_xfrm_me_harder(skb, AF_INET); +			if (err < 0) +				ret = NF_DROP_ERR(err); +		} +	} +#endif +	return ret; +} + +static unsigned int +nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, +		     struct sk_buff *skb, +		     const struct net_device *in, +		     const struct net_device *out, +		     int (*okfn)(struct sk_buff *)) +{ +	const struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; +	unsigned int ret; +	int err; + +	/* root is playing with raw sockets. 
*/ +	if (skb->len < sizeof(struct iphdr) || +	    ip_hdrlen(skb) < sizeof(struct iphdr)) +		return NF_ACCEPT; + +	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn); +	if (ret != NF_DROP && ret != NF_STOLEN && +	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { +		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + +		if (ct->tuplehash[dir].tuple.dst.u3.ip != +		    ct->tuplehash[!dir].tuple.src.u3.ip) { +			err = ip_route_me_harder(skb, RTN_UNSPEC); +			if (err < 0) +				ret = NF_DROP_ERR(err); +		} +#ifdef CONFIG_XFRM +		else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && +			 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && +			 ct->tuplehash[dir].tuple.dst.u.all != +			 ct->tuplehash[!dir].tuple.src.u.all) { +			err = nf_xfrm_me_harder(skb, AF_INET); +			if (err < 0) +				ret = NF_DROP_ERR(err); +		} +#endif +	} +	return ret; +} + +static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { +	/* Before packet filtering, change destination */ +	{ +		.hook		= nf_nat_ipv4_in, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_PRE_ROUTING, +		.priority	= NF_IP_PRI_NAT_DST, +	}, +	/* After packet filtering, change source */ +	{ +		.hook		= nf_nat_ipv4_out, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_POST_ROUTING, +		.priority	= NF_IP_PRI_NAT_SRC, +	}, +	/* Before packet filtering, change destination */ +	{ +		.hook		= nf_nat_ipv4_local_fn, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_LOCAL_OUT, +		.priority	= NF_IP_PRI_NAT_DST, +	}, +	/* After packet filtering, change source */ +	{ +		.hook		= nf_nat_ipv4_fn, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_LOCAL_IN, +		.priority	= NF_IP_PRI_NAT_SRC, +	}, +}; + +static int __net_init iptable_nat_net_init(struct net *net) +{ +	struct ipt_replace *repl; + +	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); +	if (repl == NULL) +		return -ENOMEM; +	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); +	kfree(repl); +	return PTR_ERR_OR_ZERO(net->ipv4.nat_table); +} + +static void __net_exit iptable_nat_net_exit(struct net *net) +{ +	ipt_unregister_table(net, net->ipv4.nat_table); +} + +static struct pernet_operations iptable_nat_net_ops = { +	.init	= iptable_nat_net_init, +	.exit	= iptable_nat_net_exit, +}; + +static int __init iptable_nat_init(void) +{ +	int err; + +	err = register_pernet_subsys(&iptable_nat_net_ops); +	if (err < 0) +		goto err1; + +	err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); +	if (err < 0) +		goto err2; +	return 0; + +err2: +	unregister_pernet_subsys(&iptable_nat_net_ops); +err1: +	return err; +} + +static void __exit iptable_nat_exit(void) +{ +	nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); +	unregister_pernet_subsys(&iptable_nat_net_ops); +} + +module_init(iptable_nat_init); +module_exit(iptable_nat_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 07fb710cd72..b2f7e8f9831 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -20,20 +20,20 @@ static const struct xt_table packet_raw = {  /* The work comes in here from netfilter.c. 
*/  static unsigned int -iptable_raw_hook(unsigned int hook, struct sk_buff *skb, +iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		 const struct net_device *in, const struct net_device *out,  		 int (*okfn)(struct sk_buff *))  {  	const struct net *net; -	if (hook == NF_INET_LOCAL_OUT &&  +	if (ops->hooknum == NF_INET_LOCAL_OUT &&  	    (skb->len < sizeof(struct iphdr) ||  	     ip_hdrlen(skb) < sizeof(struct iphdr)))  		/* root is playing with raw sockets. */  		return NF_ACCEPT;  	net = dev_net((in != NULL) ? in : out); -	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); +	return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);  }  static struct nf_hook_ops *rawtable_ops __read_mostly; @@ -48,9 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)  	net->ipv4.iptable_raw =  		ipt_register_table(net, &packet_raw, repl);  	kfree(repl); -	if (IS_ERR(net->ipv4.iptable_raw)) -		return PTR_ERR(net->ipv4.iptable_raw); -	return 0; +	return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);  }  static void __net_exit iptable_raw_net_exit(struct net *net) @@ -75,14 +73,10 @@ static int __init iptable_raw_init(void)  	rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);  	if (IS_ERR(rawtable_ops)) {  		ret = PTR_ERR(rawtable_ops); -		goto cleanup_table; +		unregister_pernet_subsys(&iptable_raw_net_ops);  	}  	return ret; - - cleanup_table: -	unregister_pernet_subsys(&iptable_raw_net_ops); -	return ret;  }  static void __exit iptable_raw_fini(void) diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index be45bdc4c60..c86647ed207 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -37,21 +37,22 @@ static const struct xt_table security_table = {  };  static unsigned int -iptable_security_hook(unsigned int hook, struct sk_buff *skb, +iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,  		      const struct net_device *in,  		      const struct net_device *out,  		      int (*okfn)(struct sk_buff *))  {  	const struct net *net; -	if (hook == NF_INET_LOCAL_OUT && +	if (ops->hooknum == NF_INET_LOCAL_OUT &&  	    (skb->len < sizeof(struct iphdr) ||  	     ip_hdrlen(skb) < sizeof(struct iphdr)))  		/* Somebody is playing with raw sockets. */  		return NF_ACCEPT;  	net = dev_net((in != NULL) ? 
in : out); -	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); +	return ipt_do_table(skb, ops->hooknum, in, out, +			    net->ipv4.iptable_security);  }  static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -66,10 +67,7 @@ static int __net_init iptable_security_net_init(struct net *net)  	net->ipv4.iptable_security =  		ipt_register_table(net, &security_table, repl);  	kfree(repl); -	if (IS_ERR(net->ipv4.iptable_security)) -		return PTR_ERR(net->ipv4.iptable_security); - -	return 0; +	return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);  }  static void __net_exit iptable_security_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a03c02af99..8127dc80286 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -1,6 +1,7 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -24,16 +25,12 @@  #include <net/netfilter/nf_conntrack_l3proto.h>  #include <net/netfilter/nf_conntrack_zones.h>  #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_seqadj.h>  #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>  #include <net/netfilter/nf_nat_helper.h>  #include <net/netfilter/ipv4/nf_defrag_ipv4.h>  #include <net/netfilter/nf_log.h> -int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, -			      struct nf_conn *ct, -			      enum ip_conntrack_info ctinfo); -EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook); -  static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,  			      struct nf_conntrack_tuple *tuple)  { @@ -74,58 +71,73 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,  	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);  	if (iph == NULL) -		return -NF_DROP; +		return -NF_ACCEPT;  	/* Conntrack defragments packets, we might still see fragments  	 * inside ICMP packets though. */  	if (iph->frag_off & htons(IP_OFFSET)) -		return -NF_DROP; +		return -NF_ACCEPT;  	*dataoff = nhoff + (iph->ihl << 2);  	*protonum = iph->protocol; +	/* Check bogus IP headers */ +	if (*dataoff > skb->len) { +		pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " +			 "nhoff %u, ihl %u, skblen %u\n", +			 nhoff, iph->ihl << 2, skb->len); +		return -NF_ACCEPT; +	} +  	return NF_ACCEPT;  } -static unsigned int ipv4_confirm(unsigned int hooknum, -				 struct sk_buff *skb, -				 const struct net_device *in, -				 const struct net_device *out, -				 int (*okfn)(struct sk_buff *)) +static unsigned int ipv4_helper(const struct nf_hook_ops *ops, +				struct sk_buff *skb, +				const struct net_device *in, +				const struct net_device *out, +				int (*okfn)(struct sk_buff *))  {  	struct nf_conn *ct;  	enum ip_conntrack_info ctinfo;  	const struct nf_conn_help *help;  	const struct nf_conntrack_helper *helper; -	unsigned int ret;  	/* This is where we call the helper: as the packet goes out. 
*/  	ct = nf_ct_get(skb, &ctinfo); -	if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) -		goto out; +	if (!ct || ctinfo == IP_CT_RELATED_REPLY) +		return NF_ACCEPT;  	help = nfct_help(ct);  	if (!help) -		goto out; +		return NF_ACCEPT;  	/* rcu_read_lock()ed by nf_hook_slow */  	helper = rcu_dereference(help->helper);  	if (!helper) -		goto out; +		return NF_ACCEPT; -	ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), -			   ct, ctinfo); -	if (ret != NF_ACCEPT) { -		nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, -			      "nf_ct_%s: dropping packet", helper->name); -		return ret; -	} +	return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), +			    ct, ctinfo); +} + +static unsigned int ipv4_confirm(const struct nf_hook_ops *ops, +				 struct sk_buff *skb, +				 const struct net_device *in, +				 const struct net_device *out, +				 int (*okfn)(struct sk_buff *)) +{ +	struct nf_conn *ct; +	enum ip_conntrack_info ctinfo; -	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { -		typeof(nf_nat_seq_adjust_hook) seq_adjust; +	ct = nf_ct_get(skb, &ctinfo); +	if (!ct || ctinfo == IP_CT_RELATED_REPLY) +		goto out; -		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); -		if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) { +	/* adjust seqs for loopback traffic only in outgoing direction */ +	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && +	    !nf_is_loopback_packet(skb)) { +		if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {  			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);  			return NF_DROP;  		} @@ -135,16 +147,16 @@ out:  	return nf_conntrack_confirm(skb);  } -static unsigned int ipv4_conntrack_in(unsigned int hooknum, +static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,  				      struct sk_buff *skb,  				      const struct net_device *in,  				      const struct net_device *out,  				      int (*okfn)(struct sk_buff *))  { -	return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); +	return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);  } -static unsigned int ipv4_conntrack_local(unsigned int hooknum, +static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,  					 struct sk_buff *skb,  					 const struct net_device *in,  					 const struct net_device *out, @@ -154,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,  	if (skb->len < sizeof(struct iphdr) ||  	    ip_hdrlen(skb) < sizeof(struct iphdr))  		return NF_ACCEPT; -	return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); +	return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);  }  /* Connection tracking may drop packets, but never alters them, so @@ -175,6 +187,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {  		.priority	= NF_IP_PRI_CONNTRACK,  	},  	{ +		.hook		= ipv4_helper, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_POST_ROUTING, +		.priority	= NF_IP_PRI_CONNTRACK_HELPER, +	}, +	{  		.hook		= ipv4_confirm,  		.owner		= THIS_MODULE,  		.pf		= NFPROTO_IPV4, @@ -182,6 +201,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {  		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,  	},  	{ +		.hook		= ipv4_helper, +		.owner		= THIS_MODULE, +		.pf		= NFPROTO_IPV4, +		.hooknum	= NF_INET_LOCAL_IN, +		.priority	= NF_IP_PRI_CONNTRACK_HELPER, +	}, +	{  		.hook		= ipv4_confirm,  		.owner		= THIS_MODULE,  		.pf		= NFPROTO_IPV4, @@ -194,38 +220,33 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {  static int log_invalid_proto_min = 0;  static 
int log_invalid_proto_max = 255; -static ctl_table ip_ct_sysctl_table[] = { +static struct ctl_table ip_ct_sysctl_table[] = {  	{  		.procname	= "ip_conntrack_max", -		.data		= &nf_conntrack_max,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "ip_conntrack_count", -		.data		= &init_net.ct.count,  		.maxlen		= sizeof(int),  		.mode		= 0444,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "ip_conntrack_buckets", -		.data		= &init_net.ct.htable_size,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0444,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "ip_conntrack_checksum", -		.data		= &init_net.ct.sysctl_checksum,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "ip_conntrack_log_invalid", -		.data		= &init_net.ct.sysctl_log_invalid,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax, @@ -301,8 +322,9 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)  static int ipv4_tuple_to_nlattr(struct sk_buff *skb,  				const struct nf_conntrack_tuple *tuple)  { -	NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip); -	NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip); +	if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || +	    nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -340,6 +362,25 @@ static struct nf_sockopt_ops so_getorigdst = {  	.owner		= THIS_MODULE,  }; +static int ipv4_init_net(struct net *net) +{ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +	struct nf_ip_net *in = &net->ct.nf_ct_proto; +	in->ctl_table = kmemdup(ip_ct_sysctl_table, +				sizeof(ip_ct_sysctl_table), +				GFP_KERNEL); +	if (!in->ctl_table) +		return -ENOMEM; + +	in->ctl_table[0].data = &nf_conntrack_max; +	in->ctl_table[1].data = &net->ct.count; +	in->ctl_table[2].data = &net->ct.htable_size; +	in->ctl_table[3].data = &net->ct.sysctl_checksum; +	in->ctl_table[4].data = &net->ct.sysctl_log_invalid; +#endif +	return 0; +} +  struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {  	.l3proto	 = PF_INET,  	.name		 = "ipv4", @@ -354,9 +395,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {  	.nla_policy	 = ipv4_nla_policy,  #endif  #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) -	.ctl_table_path  = nf_net_ipv4_netfilter_sysctl_path, -	.ctl_table	 = ip_ct_sysctl_table, +	.ctl_table_path  = "net/ipv4/netfilter",  #endif +	.init_net	 = ipv4_init_net,  	.me		 = THIS_MODULE,  }; @@ -367,6 +408,54 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));  MODULE_ALIAS("ip_conntrack");  MODULE_LICENSE("GPL"); +static int ipv4_net_init(struct net *net) +{ +	int ret = 0; + +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4); +	if (ret < 0) { +		pr_err("nf_conntrack_tcp4: pernet registration failed\n"); +		goto out_tcp; +	} +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4); +	if (ret < 0) { +		pr_err("nf_conntrack_udp4: pernet registration failed\n"); +		goto out_udp; +	} +	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp); +	if (ret < 0) { +		pr_err("nf_conntrack_icmp4: pernet registration failed\n"); +		goto out_icmp; +	} +	ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); +	if (ret < 0) { +		pr_err("nf_conntrack_ipv4: pernet registration failed\n"); +		goto out_ipv4; +	} +	return 0; +out_ipv4: +	
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); +out_icmp: +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); +out_udp: +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); +out_tcp: +	return ret; +} + +static void ipv4_net_exit(struct net *net) +{ +	nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp); +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4); +	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4); +} + +static struct pernet_operations ipv4_net_ops = { +	.init = ipv4_net_init, +	.exit = ipv4_net_exit, +}; +  static int __init nf_conntrack_l3proto_ipv4_init(void)  {  	int ret = 0; @@ -380,54 +469,63 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)  		return ret;  	} -	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); +	ret = register_pernet_subsys(&ipv4_net_ops);  	if (ret < 0) { -		pr_err("nf_conntrack_ipv4: can't register tcp.\n"); +		pr_err("nf_conntrack_ipv4: can't register pernet ops\n");  		goto cleanup_sockopt;  	} -	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); +	ret = nf_register_hooks(ipv4_conntrack_ops, +				ARRAY_SIZE(ipv4_conntrack_ops));  	if (ret < 0) { -		pr_err("nf_conntrack_ipv4: can't register udp.\n"); -		goto cleanup_tcp; +		pr_err("nf_conntrack_ipv4: can't register hooks.\n"); +		goto cleanup_pernet;  	} -	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);  	if (ret < 0) { -		pr_err("nf_conntrack_ipv4: can't register icmp.\n"); -		goto cleanup_udp; +		pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n"); +		goto cleanup_hooks;  	} -	ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);  	if (ret < 0) { -		pr_err("nf_conntrack_ipv4: can't register ipv4\n"); -		goto cleanup_icmp; +		pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n"); +		goto cleanup_tcp4;  	} -	ret = nf_register_hooks(ipv4_conntrack_ops, -				ARRAY_SIZE(ipv4_conntrack_ops)); +	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);  	if (ret < 0) { -		pr_err("nf_conntrack_ipv4: can't register hooks.\n"); -		goto cleanup_ipv4; +		pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n"); +		goto cleanup_udp4;  	} + +	ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); +	if (ret < 0) { +		pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); +		goto cleanup_icmpv4; +	} +  #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)  	ret = nf_conntrack_ipv4_compat_init();  	if (ret < 0) -		goto cleanup_hooks; +		goto cleanup_proto;  #endif  	return ret;  #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + cleanup_proto: +	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); +#endif + cleanup_icmpv4: +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); + cleanup_udp4: +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); + cleanup_tcp4: +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);   cleanup_hooks:  	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); -#endif - cleanup_ipv4: -	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - cleanup_icmp: -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); - cleanup_udp: -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); - cleanup_tcp: -	
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); + cleanup_pernet: +	unregister_pernet_subsys(&ipv4_net_ops);   cleanup_sockopt:  	nf_unregister_sockopt(&so_getorigdst);  	return ret; @@ -439,19 +537,14 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)  #if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)  	nf_conntrack_ipv4_compat_fini();  #endif +	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4); +	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);  	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); -	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); -	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); +	unregister_pernet_subsys(&ipv4_net_ops);  	nf_unregister_sockopt(&so_getorigdst);  }  module_init(nf_conntrack_l3proto_ipv4_init);  module_exit(nf_conntrack_l3proto_ipv4_fini); - -void need_ipv4_conntrack(void) -{ -	return; -} -EXPORT_SYMBOL_GPL(need_ipv4_conntrack); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 37f8adb68c7..4c48e434bb1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -2,6 +2,7 @@   *   * (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2010 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -20,6 +21,8 @@  #include <net/netfilter/nf_conntrack_l4proto.h>  #include <net/netfilter/nf_conntrack_expect.h>  #include <net/netfilter/nf_conntrack_acct.h> +#include <linux/rculist_nulls.h> +#include <linux/export.h>  struct ct_iter_state {  	struct seq_net_private p; @@ -35,7 +38,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)  	for (st->bucket = 0;  	     st->bucket < net->ct.htable_size;  	     st->bucket++) { -		n = rcu_dereference(net->ct.hash[st->bucket].first); +		n = rcu_dereference( +			hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));  		if (!is_a_nulls(n))  			return n;  	} @@ -48,13 +52,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,  	struct net *net = seq_file_net(seq);  	struct ct_iter_state *st = seq->private; -	head = rcu_dereference(head->next); +	head = rcu_dereference(hlist_nulls_next_rcu(head));  	while (is_a_nulls(head)) {  		if (likely(get_nulls_value(head) == st->bucket)) {  			if (++st->bucket >= net->ct.htable_size)  				return NULL;  		} -		head = rcu_dereference(net->ct.hash[st->bucket].first); +		head = rcu_dereference( +			hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));  	}  	return head;  } @@ -97,7 +102,7 @@ static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)  	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);  	if (ret) -		return ret; +		return 0;  	ret = seq_printf(s, "secctx=%s ", secctx); @@ -217,7 +222,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)  	struct hlist_node *n;  	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { -		n = rcu_dereference(net->ct.expect_hash[st->bucket].first); +		n = rcu_dereference( +			
hlist_first_rcu(&net->ct.expect_hash[st->bucket]));  		if (n)  			return n;  	} @@ -230,11 +236,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,  	struct net *net = seq_file_net(seq);  	struct ct_expect_iter_state *st = seq->private; -	head = rcu_dereference(head->next); +	head = rcu_dereference(hlist_next_rcu(head));  	while (head == NULL) {  		if (++st->bucket >= nf_ct_expect_hsize)  			return NULL; -		head = rcu_dereference(net->ct.expect_hash[st->bucket].first); +		head = rcu_dereference( +			hlist_first_rcu(&net->ct.expect_hash[st->bucket]));  	}  	return head;  } @@ -411,12 +418,12 @@ static int __net_init ip_conntrack_net_init(struct net *net)  {  	struct proc_dir_entry *proc, *proc_exp, *proc_stat; -	proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); +	proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);  	if (!proc)  		goto err1; -	proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, -					&ip_exp_file_ops); +	proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net, +			       &ip_exp_file_ops);  	if (!proc_exp)  		goto err2; @@ -427,9 +434,9 @@ static int __net_init ip_conntrack_net_init(struct net *net)  	return 0;  err3: -	proc_net_remove(net, "ip_conntrack_expect"); +	remove_proc_entry("ip_conntrack_expect", net->proc_net);  err2: -	proc_net_remove(net, "ip_conntrack"); +	remove_proc_entry("ip_conntrack", net->proc_net);  err1:  	return -ENOMEM;  } @@ -437,8 +444,8 @@ err1:  static void __net_exit ip_conntrack_net_exit(struct net *net)  {  	remove_proc_entry("ip_conntrack", net->proc_net_stat); -	proc_net_remove(net, "ip_conntrack_expect"); -	proc_net_remove(net, "ip_conntrack"); +	remove_proc_entry("ip_conntrack_expect", net->proc_net); +	remove_proc_entry("ip_conntrack", net->proc_net);  }  static struct pernet_operations ip_conntrack_net_ops = { diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7404bde9599..a338dad41b7 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -1,5 +1,6 @@  /* (C) 1999-2001 Paul `Rusty' Russell   * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2006-2010 Patrick McHardy <kaber@trash.net>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -23,6 +24,11 @@  static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; +static inline struct nf_icmp_net *icmp_pernet(struct net *net) +{ +	return &net->ct.nf_ct_proto.icmp; +} +  static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,  			      struct nf_conntrack_tuple *tuple)  { @@ -75,25 +81,31 @@ static int icmp_print_tuple(struct seq_file *s,  			  ntohs(tuple->src.u.icmp.id));  } +static unsigned int *icmp_get_timeouts(struct net *net) +{ +	return &icmp_pernet(net)->timeout; +} +  /* Returns verdict for packet, or -1 for invalid. */  static int icmp_packet(struct nf_conn *ct,  		       const struct sk_buff *skb,  		       unsigned int dataoff,  		       enum ip_conntrack_info ctinfo,  		       u_int8_t pf, -		       unsigned int hooknum) +		       unsigned int hooknum, +		       unsigned int *timeout)  {  	/* Do not immediately delete the connection after the first  	   successful reply to avoid excessive conntrackd traffic  	   and also to handle correctly ICMP echo reply duplicates. 
*/ -	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); +	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);  	return NF_ACCEPT;  }  /* Called when a new connection for this protocol found. */  static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, -		     unsigned int dataoff) +		     unsigned int dataoff, unsigned int *timeouts)  {  	static const u_int8_t valid_new[] = {  		[ICMP_ECHO] = 1, @@ -160,7 +172,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,  	/* Update skb to refer to this connection */  	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;  	skb->nfctinfo = *ctinfo; -	return -NF_ACCEPT; +	return NF_ACCEPT;  }  /* Small and modified version of icmp_rcv */ @@ -176,8 +188,8 @@ icmp_error(struct net *net, struct nf_conn *tmpl,  	icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);  	if (icmph == NULL) {  		if (LOG_INVALID(net, IPPROTO_ICMP)) -			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, -				      "nf_ct_icmp: short packet "); +			nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, +				      NULL, "nf_ct_icmp: short packet ");  		return -NF_ACCEPT;  	} @@ -185,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,  	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&  	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {  		if (LOG_INVALID(net, IPPROTO_ICMP)) -			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,  				      "nf_ct_icmp: bad HW ICMP checksum ");  		return -NF_ACCEPT;  	} @@ -198,7 +210,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,  	 */  	if (icmph->type > NR_ICMP_TYPES) {  		if (LOG_INVALID(net, IPPROTO_ICMP)) -			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, +			nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,  				      "nf_ct_icmp: invalid ICMP type ");  		return -NF_ACCEPT;  	} @@ -222,10 +234,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl,  static int icmp_tuple_to_nlattr(struct sk_buff *skb,  				const struct nf_conntrack_tuple *t)  { -	NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id); -	NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type); -	NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code); - +	if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || +	    nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || +	    nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) +		goto nla_put_failure;  	return 0;  nla_put_failure: @@ -263,12 +275,50 @@ static int icmp_nlattr_tuple_size(void)  }  #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], +				      struct net *net, void *data) +{ +	unsigned int *timeout = data; +	struct nf_icmp_net *in = icmp_pernet(net); + +	if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { +		*timeout = +			ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; +	} else { +		/* Set default ICMP timeout. 
*/ +		*timeout = in->timeout; +	} +	return 0; +} + +static int +icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ +	const unsigned int *timeout = data; + +	if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) +		goto nla_put_failure; +	return 0; + +nla_put_failure: +	return -ENOSPC; +} + +static const struct nla_policy +icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { +	[CTA_TIMEOUT_ICMP_TIMEOUT]	= { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +  #ifdef CONFIG_SYSCTL -static struct ctl_table_header *icmp_sysctl_header;  static struct ctl_table icmp_sysctl_table[] = {  	{  		.procname	= "nf_conntrack_icmp_timeout", -		.data		= &nf_ct_icmp_timeout,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -279,7 +329,6 @@ static struct ctl_table icmp_sysctl_table[] = {  static struct ctl_table icmp_compat_sysctl_table[] = {  	{  		.procname	= "ip_conntrack_icmp_timeout", -		.data		= &nf_ct_icmp_timeout,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_jiffies, @@ -289,6 +338,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = {  #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */  #endif /* CONFIG_SYSCTL */ +static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, +				     struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL +	pn->ctl_table = kmemdup(icmp_sysctl_table, +				sizeof(icmp_sysctl_table), +				GFP_KERNEL); +	if (!pn->ctl_table) +		return -ENOMEM; + +	pn->ctl_table[0].data = &in->timeout; +#endif +	return 0; +} + +static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, +					    struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +	pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, +				       sizeof(icmp_compat_sysctl_table), +				       GFP_KERNEL); +	if (!pn->ctl_compat_table) +		return -ENOMEM; + +	pn->ctl_compat_table[0].data = &in->timeout; +#endif +#endif +	return 0; +} + +static int icmp_init_net(struct net *net, u_int16_t proto) +{ +	int ret; +	struct nf_icmp_net *in = icmp_pernet(net); +	struct nf_proto_net *pn = &in->pn; + +	in->timeout = nf_ct_icmp_timeout; + +	ret = icmp_kmemdup_compat_sysctl_table(pn, in); +	if (ret < 0) +		return ret; + +	ret = icmp_kmemdup_sysctl_table(pn, in); +	if (ret < 0) +		nf_ct_kfree_compat_sysctl_table(pn); + +	return ret; +} + +static struct nf_proto_net *icmp_get_net_proto(struct net *net) +{ +	return &net->ct.nf_ct_proto.icmp.pn; +} +  struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =  {  	.l3proto		= PF_INET, @@ -298,6 +403,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =  	.invert_tuple		= icmp_invert_tuple,  	.print_tuple		= icmp_print_tuple,  	.packet			= icmp_packet, +	.get_timeouts		= icmp_get_timeouts,  	.new			= icmp_new,  	.error			= icmp_error,  	.destroy		= NULL, @@ -308,11 +414,15 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =  	.nlattr_to_tuple	= icmp_nlattr_to_tuple,  	.nla_policy		= icmp_nla_policy,  #endif -#ifdef CONFIG_SYSCTL -	.ctl_table_header	= &icmp_sysctl_header, -	.ctl_table		= icmp_sysctl_table, -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -	.ctl_compat_table	= icmp_compat_sysctl_table, -#endif -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) +	.ctnl_timeout		= { +		.nlattr_to_obj	= icmp_timeout_nlattr_to_obj, +		.obj_to_nlattr	= icmp_timeout_obj_to_nlattr, +		.nlattr_max	= CTA_TIMEOUT_ICMP_MAX, +		.obj_size	= sizeof(unsigned int), +		.nla_policy	
= icmp_timeout_nla_policy, +	}, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +	.init_net		= icmp_init_net, +	.get_net_proto		= icmp_get_net_proto,  }; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index f3a9b42b16c..b8f6381c7d0 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -22,7 +22,6 @@  #endif  #include <net/netfilter/nf_conntrack_zones.h> -/* Returns new sk_buff, or NULL */  static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)  {  	int err; @@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)  	err = ip_defrag(skb, user);  	local_bh_enable(); -	if (!err) +	if (!err) {  		ip_send_check(ip_hdr(skb)); +		skb->ignore_df = 1; +	}  	return err;  } @@ -60,7 +61,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,  		return IP_DEFRAG_CONNTRACK_OUT + zone;  } -static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, +static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,  					  struct sk_buff *skb,  					  const struct net_device *in,  					  const struct net_device *out, @@ -82,8 +83,10 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,  #endif  #endif  	/* Gather fragments. */ -	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { -		enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); +	if (ip_is_fragment(ip_hdr(skb))) { +		enum ip_defrag_users user = +			nf_ct_defrag_user(ops->hooknum, skb); +  		if (nf_ct_ipv4_gather_frags(skb, user))  			return NF_STOLEN;  	} @@ -94,14 +97,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {  	{  		.hook		= ipv4_conntrack_defrag,  		.owner		= THIS_MODULE, -		.pf		= PF_INET, +		.pf		= NFPROTO_IPV4,  		.hooknum	= NF_INET_PRE_ROUTING,  		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,  	},  	{  		.hook           = ipv4_conntrack_defrag,  		.owner          = THIS_MODULE, -		.pf             = PF_INET, +		.pf             = NFPROTO_IPV4,  		.hooknum        = NF_INET_LOCAL_OUT,  		.priority       = NF_IP_PRI_CONNTRACK_DEFRAG,  	}, diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c deleted file mode 100644 index 0f23b3f06df..00000000000 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ /dev/null @@ -1,85 +0,0 @@ -/* Amanda extension for TCP NAT alteration. - * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> - * based on a copy of HW's ip_nat_irc.c as well as other modules - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/udp.h> - -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_amanda.h> - -MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); -MODULE_DESCRIPTION("Amanda NAT helper"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_amanda"); - -static unsigned int help(struct sk_buff *skb, -			 enum ip_conntrack_info ctinfo, -			 unsigned int matchoff, -			 unsigned int matchlen, -			 struct nf_conntrack_expect *exp) -{ -	char buffer[sizeof("65535")]; -	u_int16_t port; -	unsigned int ret; - -	/* Connection comes from client. 
*/ -	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; -	exp->dir = IP_CT_DIR_ORIGINAL; - -	/* When you see the packet, we need to NAT it the same as the -	 * this one (ie. same IP: it will be TCP and master is UDP). */ -	exp->expectfn = nf_nat_follow_master; - -	/* Try to get same port: if not, try to change it. */ -	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { -		int ret; - -		exp->tuple.dst.u.tcp.port = htons(port); -		ret = nf_ct_expect_related(exp); -		if (ret == 0) -			break; -		else if (ret != -EBUSY) { -			port = 0; -			break; -		} -	} - -	if (port == 0) -		return NF_DROP; - -	sprintf(buffer, "%u", port); -	ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, -				       matchoff, matchlen, -				       buffer, strlen(buffer)); -	if (ret != NF_ACCEPT) -		nf_ct_unexpect_related(exp); -	return ret; -} - -static void __exit nf_nat_amanda_fini(void) -{ -	rcu_assign_pointer(nf_nat_amanda_hook, NULL); -	synchronize_rcu(); -} - -static int __init nf_nat_amanda_init(void) -{ -	BUG_ON(nf_nat_amanda_hook != NULL); -	rcu_assign_pointer(nf_nat_amanda_hook, help); -	return 0; -} - -module_init(nf_nat_amanda_init); -module_exit(nf_nat_amanda_fini); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c deleted file mode 100644 index c04787ce1a7..00000000000 --- a/net/ipv4/netfilter/nf_nat_core.c +++ /dev/null @@ -1,774 +0,0 @@ -/* NAT for netfilter; shared with compatibility layer. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/gfp.h> -#include <net/checksum.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/tcp.h>  /* For tcp_prot in getorigdst */ -#include <linux/icmp.h> -#include <linux/udp.h> -#include <linux/jhash.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_l3proto.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_zones.h> - -static DEFINE_SPINLOCK(nf_nat_lock); - -static struct nf_conntrack_l3proto *l3proto __read_mostly; - -#define MAX_IP_NAT_PROTO 256 -static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO] -						__read_mostly; - -static inline const struct nf_nat_protocol * -__nf_nat_proto_find(u_int8_t protonum) -{ -	return rcu_dereference(nf_nat_protos[protonum]); -} - -/* We keep an extra hash for each conntrack, for fast searching. */ -static inline unsigned int -hash_by_src(const struct net *net, u16 zone, -	    const struct nf_conntrack_tuple *tuple) -{ -	unsigned int hash; - -	/* Original src, to ensure we map it consistently if poss. */ -	hash = jhash_3words((__force u32)tuple->src.u3.ip, -			    (__force u32)tuple->src.u.all ^ zone, -			    tuple->dst.protonum, 0); -	return ((u64)hash * net->ipv4.nat_htable_size) >> 32; -} - -/* Is this tuple already taken? 
(not by us) */ -int -nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, -		  const struct nf_conn *ignored_conntrack) -{ -	/* Conntrack tracking doesn't keep track of outgoing tuples; only -	   incoming ones.  NAT means they don't have a fixed mapping, -	   so we invert the tuple and look for the incoming reply. - -	   We could keep a separate hash if this proves too slow. */ -	struct nf_conntrack_tuple reply; - -	nf_ct_invert_tuplepr(&reply, tuple); -	return nf_conntrack_tuple_taken(&reply, ignored_conntrack); -} -EXPORT_SYMBOL(nf_nat_used_tuple); - -/* If we source map this tuple so reply looks like reply_tuple, will - * that meet the constraints of range. */ -static int -in_range(const struct nf_conntrack_tuple *tuple, -	 const struct nf_nat_range *range) -{ -	const struct nf_nat_protocol *proto; -	int ret = 0; - -	/* If we are supposed to map IPs, then we must be in the -	   range specified, otherwise let this drag us onto a new src IP. */ -	if (range->flags & IP_NAT_RANGE_MAP_IPS) { -		if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) || -		    ntohl(tuple->src.u3.ip) > ntohl(range->max_ip)) -			return 0; -	} - -	rcu_read_lock(); -	proto = __nf_nat_proto_find(tuple->dst.protonum); -	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || -	    proto->in_range(tuple, IP_NAT_MANIP_SRC, -			    &range->min, &range->max)) -		ret = 1; -	rcu_read_unlock(); - -	return ret; -} - -static inline int -same_src(const struct nf_conn *ct, -	 const struct nf_conntrack_tuple *tuple) -{ -	const struct nf_conntrack_tuple *t; - -	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; -	return (t->dst.protonum == tuple->dst.protonum && -		t->src.u3.ip == tuple->src.u3.ip && -		t->src.u.all == tuple->src.u.all); -} - -/* Only called for SRC manip */ -static int -find_appropriate_src(struct net *net, u16 zone, -		     const struct nf_conntrack_tuple *tuple, -		     struct nf_conntrack_tuple *result, -		     const struct nf_nat_range *range) -{ -	unsigned int h = hash_by_src(net, zone, tuple); -	const struct nf_conn_nat *nat; -	const struct nf_conn *ct; -	const struct hlist_node *n; - -	rcu_read_lock(); -	hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { -		ct = nat->ct; -		if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { -			/* Copy source part from reply tuple. */ -			nf_ct_invert_tuplepr(result, -				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple); -			result->dst = tuple->dst; - -			if (in_range(result, range)) { -				rcu_read_unlock(); -				return 1; -			} -		} -	} -	rcu_read_unlock(); -	return 0; -} - -/* For [FUTURE] fragmentation handling, we want the least-used -   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus -   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports -   1-65535, we don't do pro-rata allocation based on ports; we choose -   the ip with the lowest src-ip/dst-ip/proto usage. -*/ -static void -find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, -		    const struct nf_nat_range *range, -		    const struct nf_conn *ct, -		    enum nf_nat_manip_type maniptype) -{ -	__be32 *var_ipp; -	/* Host order */ -	u_int32_t minip, maxip, j; - -	/* No IP mapping?  Do nothing. */ -	if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) -		return; - -	if (maniptype == IP_NAT_MANIP_SRC) -		var_ipp = &tuple->src.u3.ip; -	else -		var_ipp = &tuple->dst.u3.ip; - -	/* Fast path: only one choice. 
*/ -	if (range->min_ip == range->max_ip) { -		*var_ipp = range->min_ip; -		return; -	} - -	/* Hashing source and destination IPs gives a fairly even -	 * spread in practice (if there are a small number of IPs -	 * involved, there usually aren't that many connections -	 * anyway).  The consistency means that servers see the same -	 * client coming from the same IP (some Internet Banking sites -	 * like this), even across reboots. */ -	minip = ntohl(range->min_ip); -	maxip = ntohl(range->max_ip); -	j = jhash_2words((__force u32)tuple->src.u3.ip, -			 range->flags & IP_NAT_RANGE_PERSISTENT ? -				0 : (__force u32)tuple->dst.u3.ip ^ zone, 0); -	j = ((u64)j * (maxip - minip + 1)) >> 32; -	*var_ipp = htonl(minip + j); -} - -/* Manipulate the tuple into the range given.  For NF_INET_POST_ROUTING, - * we change the source to map into the range.  For NF_INET_PRE_ROUTING - * and NF_INET_LOCAL_OUT, we change the destination to map into the - * range.  It might not be possible to get a unique tuple, but we try. - * At worst (or if we race), we will end up with a final duplicate in - * __ip_conntrack_confirm and drop the packet. */ -static void -get_unique_tuple(struct nf_conntrack_tuple *tuple, -		 const struct nf_conntrack_tuple *orig_tuple, -		 const struct nf_nat_range *range, -		 struct nf_conn *ct, -		 enum nf_nat_manip_type maniptype) -{ -	struct net *net = nf_ct_net(ct); -	const struct nf_nat_protocol *proto; -	u16 zone = nf_ct_zone(ct); - -	/* 1) If this srcip/proto/src-proto-part is currently mapped, -	   and that same mapping gives a unique tuple within the given -	   range, use that. - -	   This is only required for source (ie. NAT/masq) mappings. -	   So far, we don't do local source mappings, so multiple -	   manips not an issue.  */ -	if (maniptype == IP_NAT_MANIP_SRC && -	    !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { -		if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { -			pr_debug("get_unique_tuple: Found current src map\n"); -			if (!nf_nat_used_tuple(tuple, ct)) -				return; -		} -	} - -	/* 2) Select the least-used IP/proto combination in the given -	   range. */ -	*tuple = *orig_tuple; -	find_best_ips_proto(zone, tuple, range, ct, maniptype); - -	/* 3) The per-protocol part of the manip is made to map into -	   the range to make a unique tuple. */ - -	rcu_read_lock(); -	proto = __nf_nat_proto_find(orig_tuple->dst.protonum); - -	/* Only bother mapping if it's not already in range and unique */ -	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { -		if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) { -			if (proto->in_range(tuple, maniptype, &range->min, -					    &range->max) && -			    (range->min.all == range->max.all || -			     !nf_nat_used_tuple(tuple, ct))) -				goto out; -		} else if (!nf_nat_used_tuple(tuple, ct)) { -			goto out; -		} -	} - -	/* Last change: get protocol to try to obtain unique tuple. 
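find_best_ips_proto() above chooses an address from [min_ip, max_ip] by hashing the client's source address (and, unless IP_NAT_RANGE_PERSISTENT is set, the destination XORed with the zone) and scaling the result into the pool, so a given client keeps mapping to the same SNAT address. A rough userspace sketch of the scaling step, with illustrative names and host-byte-order values:

    #include <stdint.h>
    #include <stdio.h>

    /* Pick one address from a contiguous SNAT pool (host byte order)
     * by scaling a per-flow hash into the pool - a stand-in for the
     * jhash_2words()-based selection above. */
    static uint32_t pick_snat_ip(uint32_t flow_hash, uint32_t minip, uint32_t maxip)
    {
        uint32_t span = maxip - minip + 1;

        return minip + (uint32_t)(((uint64_t)flow_hash * span) >> 32);
    }

    int main(void)
    {
        uint32_t ip = pick_snat_ip(0x12345678u, 0xc0a80001u, 0xc0a80004u);

        printf("%u.%u.%u.%u\n", (unsigned)(ip >> 24),
               (unsigned)((ip >> 16) & 0xff), (unsigned)((ip >> 8) & 0xff),
               (unsigned)(ip & 0xff));          /* one of 192.168.0.1-4 */
        return 0;
    }

Because the hash is deterministic, new connections from the same client keep landing on the same pool member, which is exactly the consistency the comment above describes.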
*/ -	proto->unique_tuple(tuple, range, maniptype, ct); -out: -	rcu_read_unlock(); -} - -unsigned int -nf_nat_setup_info(struct nf_conn *ct, -		  const struct nf_nat_range *range, -		  enum nf_nat_manip_type maniptype) -{ -	struct net *net = nf_ct_net(ct); -	struct nf_conntrack_tuple curr_tuple, new_tuple; -	struct nf_conn_nat *nat; -	int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); - -	/* nat helper or nfctnetlink also setup binding */ -	nat = nfct_nat(ct); -	if (!nat) { -		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); -		if (nat == NULL) { -			pr_debug("failed to add NAT extension\n"); -			return NF_ACCEPT; -		} -	} - -	NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC || -		     maniptype == IP_NAT_MANIP_DST); -	BUG_ON(nf_nat_initialized(ct, maniptype)); - -	/* What we've got will look like inverse of reply. Normally -	   this is what is in the conntrack, except for prior -	   manipulations (future optimization: if num_manips == 0, -	   orig_tp = -	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ -	nf_ct_invert_tuplepr(&curr_tuple, -			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - -	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); - -	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { -		struct nf_conntrack_tuple reply; - -		/* Alter conntrack table so will recognize replies. */ -		nf_ct_invert_tuplepr(&reply, &new_tuple); -		nf_conntrack_alter_reply(ct, &reply); - -		/* Non-atomic: we own this at the moment. */ -		if (maniptype == IP_NAT_MANIP_SRC) -			ct->status |= IPS_SRC_NAT; -		else -			ct->status |= IPS_DST_NAT; -	} - -	/* Place in source hash if this is the first time. */ -	if (have_to_hash) { -		unsigned int srchash; - -		srchash = hash_by_src(net, nf_ct_zone(ct), -				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); -		spin_lock_bh(&nf_nat_lock); -		/* nf_conntrack_alter_reply might re-allocate exntension aera */ -		nat = nfct_nat(ct); -		nat->ct = ct; -		hlist_add_head_rcu(&nat->bysource, -				   &net->ipv4.nat_bysource[srchash]); -		spin_unlock_bh(&nf_nat_lock); -	} - -	/* It's done. */ -	if (maniptype == IP_NAT_MANIP_DST) -		set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); -	else -		set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - -	return NF_ACCEPT; -} -EXPORT_SYMBOL(nf_nat_setup_info); - -/* Returns true if succeeded. */ -static bool -manip_pkt(u_int16_t proto, -	  struct sk_buff *skb, -	  unsigned int iphdroff, -	  const struct nf_conntrack_tuple *target, -	  enum nf_nat_manip_type maniptype) -{ -	struct iphdr *iph; -	const struct nf_nat_protocol *p; - -	if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) -		return false; - -	iph = (void *)skb->data + iphdroff; - -	/* Manipulate protcol part. */ - -	/* rcu_read_lock()ed by nf_hook_slow */ -	p = __nf_nat_proto_find(proto); -	if (!p->manip_pkt(skb, iphdroff, target, maniptype)) -		return false; - -	iph = (void *)skb->data + iphdroff; - -	if (maniptype == IP_NAT_MANIP_SRC) { -		csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); -		iph->saddr = target->src.u3.ip; -	} else { -		csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); -		iph->daddr = target->dst.u3.ip; -	} -	return true; -} - -/* Do packet manipulations according to nf_nat_setup_info. 
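manip_pkt() above rewrites the IPv4 source or destination address and patches the header checksum incrementally with csum_replace4() instead of recomputing it from scratch. A self-contained sketch of such an incremental update in the spirit of RFC 1624 (plain C, not the kernel helper):

    #include <stdint.h>
    #include <stdio.h>

    /* Incrementally update a 16-bit Internet checksum when one 32-bit
     * field changes from 'from' to 'to' (RFC 1624, eqn. 3). Host byte
     * order is used throughout to keep the example short. */
    static uint16_t csum_update32(uint16_t check, uint32_t from, uint32_t to)
    {
        uint32_t sum = (uint16_t)~check;

        sum += (uint16_t)~(from >> 16);
        sum += (uint16_t)~(from & 0xffff);
        sum += to >> 16;
        sum += to & 0xffff;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }

    int main(void)
    {
        /* e.g. rewriting 10.0.0.1 -> 192.168.1.1 under some old checksum */
        printf("0x%04x\n",
               (unsigned)csum_update32(0x1c46, 0x0a000001u, 0xc0a80101u));
        return 0;
    }

Only the words that actually changed enter the calculation, which is why the rest of the packet never has to be summed again when just an address moves.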
*/ -unsigned int nf_nat_packet(struct nf_conn *ct, -			   enum ip_conntrack_info ctinfo, -			   unsigned int hooknum, -			   struct sk_buff *skb) -{ -	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); -	unsigned long statusbit; -	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); - -	if (mtype == IP_NAT_MANIP_SRC) -		statusbit = IPS_SRC_NAT; -	else -		statusbit = IPS_DST_NAT; - -	/* Invert if this is reply dir. */ -	if (dir == IP_CT_DIR_REPLY) -		statusbit ^= IPS_NAT_MASK; - -	/* Non-atomic: these bits don't change. */ -	if (ct->status & statusbit) { -		struct nf_conntrack_tuple target; - -		/* We are aiming to look like inverse of other direction. */ -		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - -		if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype)) -			return NF_DROP; -	} -	return NF_ACCEPT; -} -EXPORT_SYMBOL_GPL(nf_nat_packet); - -/* Dir is direction ICMP is coming from (opposite to packet it contains) */ -int nf_nat_icmp_reply_translation(struct nf_conn *ct, -				  enum ip_conntrack_info ctinfo, -				  unsigned int hooknum, -				  struct sk_buff *skb) -{ -	struct { -		struct icmphdr icmp; -		struct iphdr ip; -	} *inside; -	const struct nf_conntrack_l4proto *l4proto; -	struct nf_conntrack_tuple inner, target; -	int hdrlen = ip_hdrlen(skb); -	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); -	unsigned long statusbit; -	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); - -	if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) -		return 0; - -	inside = (void *)skb->data + hdrlen; - -	/* We're actually going to mangle it beyond trivial checksum -	   adjustment, so make sure the current checksum is correct. */ -	if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) -		return 0; - -	/* Must be RELATED */ -	NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || -		     skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); - -	/* Redirects on non-null nats must be dropped, else they'll -	   start talking to each other without our translation, and be -	   confused... --RR */ -	if (inside->icmp.type == ICMP_REDIRECT) { -		/* If NAT isn't finished, assume it and drop. */ -		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) -			return 0; - -		if (ct->status & IPS_NAT_MASK) -			return 0; -	} - -	if (manip == IP_NAT_MANIP_SRC) -		statusbit = IPS_SRC_NAT; -	else -		statusbit = IPS_DST_NAT; - -	/* Invert if this is reply dir. */ -	if (dir == IP_CT_DIR_REPLY) -		statusbit ^= IPS_NAT_MASK; - -	if (!(ct->status & statusbit)) -		return 1; - -	pr_debug("icmp_reply_translation: translating error %p manip %u " -		 "dir %s\n", skb, manip, -		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); - -	/* rcu_read_lock()ed by nf_hook_slow */ -	l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); - -	if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr), -			     (hdrlen + -			      sizeof(struct icmphdr) + inside->ip.ihl * 4), -			     (u_int16_t)AF_INET, inside->ip.protocol, -			     &inner, l3proto, l4proto)) -		return 0; - -	/* Change inner back to look like incoming packet.  We do the -	   opposite manip on this hook to normal, because it might not -	   pass all hooks (locally-generated ICMP).  Consider incoming -	   packet: PREROUTING (DST manip), routing produces ICMP, goes -	   through POSTROUTING (which must correct the DST manip). */ -	if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp), -		       &ct->tuplehash[!dir].tuple, !manip)) -		return 0; - -	if (skb->ip_summed != CHECKSUM_PARTIAL) { -		/* Reloading "inside" here since manip_pkt inner. 
*/ -		inside = (void *)skb->data + hdrlen; -		inside->icmp.checksum = 0; -		inside->icmp.checksum = -			csum_fold(skb_checksum(skb, hdrlen, -					       skb->len - hdrlen, 0)); -	} - -	/* Change outer to look the reply to an incoming packet -	 * (proto 0 means don't invert per-proto part). */ -	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); -	if (!manip_pkt(0, skb, 0, &target, manip)) -		return 0; - -	return 1; -} -EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); - -/* Protocol registration. */ -int nf_nat_protocol_register(const struct nf_nat_protocol *proto) -{ -	int ret = 0; - -	spin_lock_bh(&nf_nat_lock); -	if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { -		ret = -EBUSY; -		goto out; -	} -	rcu_assign_pointer(nf_nat_protos[proto->protonum], proto); - out: -	spin_unlock_bh(&nf_nat_lock); -	return ret; -} -EXPORT_SYMBOL(nf_nat_protocol_register); - -/* Noone stores the protocol anywhere; simply delete it. */ -void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) -{ -	spin_lock_bh(&nf_nat_lock); -	rcu_assign_pointer(nf_nat_protos[proto->protonum], -			   &nf_nat_unknown_protocol); -	spin_unlock_bh(&nf_nat_lock); -	synchronize_rcu(); -} -EXPORT_SYMBOL(nf_nat_protocol_unregister); - -/* Noone using conntrack by the time this called. */ -static void nf_nat_cleanup_conntrack(struct nf_conn *ct) -{ -	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); - -	if (nat == NULL || nat->ct == NULL) -		return; - -	NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); - -	spin_lock_bh(&nf_nat_lock); -	hlist_del_rcu(&nat->bysource); -	spin_unlock_bh(&nf_nat_lock); -} - -static void nf_nat_move_storage(void *new, void *old) -{ -	struct nf_conn_nat *new_nat = new; -	struct nf_conn_nat *old_nat = old; -	struct nf_conn *ct = old_nat->ct; - -	if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) -		return; - -	spin_lock_bh(&nf_nat_lock); -	new_nat->ct = ct; -	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); -	spin_unlock_bh(&nf_nat_lock); -} - -static struct nf_ct_ext_type nat_extend __read_mostly = { -	.len		= sizeof(struct nf_conn_nat), -	.align		= __alignof__(struct nf_conn_nat), -	.destroy	= nf_nat_cleanup_conntrack, -	.move		= nf_nat_move_storage, -	.id		= NF_CT_EXT_NAT, -	.flags		= NF_CT_EXT_F_PREALLOC, -}; - -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_conntrack.h> - -static const struct nf_nat_protocol * -nf_nat_proto_find_get(u_int8_t protonum) -{ -	const struct nf_nat_protocol *p; - -	rcu_read_lock(); -	p = __nf_nat_proto_find(protonum); -	if (!try_module_get(p->me)) -		p = &nf_nat_unknown_protocol; -	rcu_read_unlock(); - -	return p; -} - -static void -nf_nat_proto_put(const struct nf_nat_protocol *p) -{ -	module_put(p->me); -} - -static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { -	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 }, -	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 }, -}; - -static int nfnetlink_parse_nat_proto(struct nlattr *attr, -				     const struct nf_conn *ct, -				     struct nf_nat_range *range) -{ -	struct nlattr *tb[CTA_PROTONAT_MAX+1]; -	const struct nf_nat_protocol *npt; -	int err; - -	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); -	if (err < 0) -		return err; - -	npt = nf_nat_proto_find_get(nf_ct_protonum(ct)); -	if (npt->nlattr_to_range) -		err = npt->nlattr_to_range(tb, range); -	nf_nat_proto_put(npt); -	return err; -} - -static const struct nla_policy 
nat_nla_policy[CTA_NAT_MAX+1] = { -	[CTA_NAT_MINIP]		= { .type = NLA_U32 }, -	[CTA_NAT_MAXIP]		= { .type = NLA_U32 }, -}; - -static int -nfnetlink_parse_nat(const struct nlattr *nat, -		    const struct nf_conn *ct, struct nf_nat_range *range) -{ -	struct nlattr *tb[CTA_NAT_MAX+1]; -	int err; - -	memset(range, 0, sizeof(*range)); - -	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); -	if (err < 0) -		return err; - -	if (tb[CTA_NAT_MINIP]) -		range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]); - -	if (!tb[CTA_NAT_MAXIP]) -		range->max_ip = range->min_ip; -	else -		range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]); - -	if (range->min_ip) -		range->flags |= IP_NAT_RANGE_MAP_IPS; - -	if (!tb[CTA_NAT_PROTO]) -		return 0; - -	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); -	if (err < 0) -		return err; - -	return 0; -} - -static int -nfnetlink_parse_nat_setup(struct nf_conn *ct, -			  enum nf_nat_manip_type manip, -			  const struct nlattr *attr) -{ -	struct nf_nat_range range; - -	if (nfnetlink_parse_nat(attr, ct, &range) < 0) -		return -EINVAL; -	if (nf_nat_initialized(ct, manip)) -		return -EEXIST; - -	return nf_nat_setup_info(ct, &range, manip); -} -#else -static int -nfnetlink_parse_nat_setup(struct nf_conn *ct, -			  enum nf_nat_manip_type manip, -			  const struct nlattr *attr) -{ -	return -EOPNOTSUPP; -} -#endif - -static int __net_init nf_nat_net_init(struct net *net) -{ -	/* Leave them the same for the moment. */ -	net->ipv4.nat_htable_size = net->ct.htable_size; -	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, -						       &net->ipv4.nat_vmalloced, 0); -	if (!net->ipv4.nat_bysource) -		return -ENOMEM; -	return 0; -} - -/* Clear NAT section of all conntracks, in case we're loaded again. */ -static int clean_nat(struct nf_conn *i, void *data) -{ -	struct nf_conn_nat *nat = nfct_nat(i); - -	if (!nat) -		return 0; -	memset(nat, 0, sizeof(*nat)); -	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); -	return 0; -} - -static void __net_exit nf_nat_net_exit(struct net *net) -{ -	nf_ct_iterate_cleanup(net, &clean_nat, NULL); -	synchronize_rcu(); -	nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, -			     net->ipv4.nat_htable_size); -} - -static struct pernet_operations nf_nat_net_ops = { -	.init = nf_nat_net_init, -	.exit = nf_nat_net_exit, -}; - -static int __init nf_nat_init(void) -{ -	size_t i; -	int ret; - -	need_ipv4_conntrack(); - -	ret = nf_ct_extend_register(&nat_extend); -	if (ret < 0) { -		printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); -		return ret; -	} - -	ret = register_pernet_subsys(&nf_nat_net_ops); -	if (ret < 0) -		goto cleanup_extend; - -	/* Sew in builtin protocols. 
*/ -	spin_lock_bh(&nf_nat_lock); -	for (i = 0; i < MAX_IP_NAT_PROTO; i++) -		rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol); -	rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); -	rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); -	rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); -	spin_unlock_bh(&nf_nat_lock); - -	/* Initialize fake conntrack so that NAT will skip it */ -	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); - -	l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); - -	BUG_ON(nf_nat_seq_adjust_hook != NULL); -	rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); -	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); -	rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, -			   nfnetlink_parse_nat_setup); -	BUG_ON(nf_ct_nat_offset != NULL); -	rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset); -	return 0; - - cleanup_extend: -	nf_ct_extend_unregister(&nat_extend); -	return ret; -} - -static void __exit nf_nat_cleanup(void) -{ -	unregister_pernet_subsys(&nf_nat_net_ops); -	nf_ct_l3proto_put(l3proto); -	nf_ct_extend_unregister(&nat_extend); -	rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); -	rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); -	rcu_assign_pointer(nf_ct_nat_offset, NULL); -	synchronize_net(); -} - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("nf-nat-ipv4"); - -module_init(nf_nat_init); -module_exit(nf_nat_cleanup); diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c deleted file mode 100644 index dc73abb3fe2..00000000000 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ /dev/null @@ -1,137 +0,0 @@ -/* FTP extension for TCP NAT alteration. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_ftp.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); -MODULE_DESCRIPTION("ftp NAT helper"); -MODULE_ALIAS("ip_nat_ftp"); - -/* FIXME: Time out? --RR */ - -static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type, -			      char *buffer, size_t buflen, -			      __be32 addr, u16 port) -{ -	switch (type) { -	case NF_CT_FTP_PORT: -	case NF_CT_FTP_PASV: -		return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u", -				((unsigned char *)&addr)[0], -				((unsigned char *)&addr)[1], -				((unsigned char *)&addr)[2], -				((unsigned char *)&addr)[3], -				port >> 8, -				port & 0xFF); -	case NF_CT_FTP_EPRT: -		return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port); -	case NF_CT_FTP_EPSV: -		return snprintf(buffer, buflen, "|||%u|", port); -	} - -	return 0; -} - -/* So, this packet has hit the connection tracking matching code. -   Mangle it, and change the expectation to match the new version. 
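nf_nat_ftp_fmt_cmd() above re-encodes the address and port carried inside FTP commands: PORT/PASV use the classic a1,a2,a3,a4,p1,p2 form with the port split into high and low bytes, while EPRT/EPSV use the delimited forms. A small userspace sketch of the PORT-style encoding (ftp_fmt_port is an illustrative name):

    #include <stdint.h>
    #include <stdio.h>

    /* Encode an IPv4 address and TCP port the way FTP PORT/PASV expect:
     * four address bytes, then the port split into high and low byte. */
    static int ftp_fmt_port(char *buf, size_t len, uint32_t addr, uint16_t port)
    {
        return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
                        (unsigned)((addr >> 24) & 0xff),
                        (unsigned)((addr >> 16) & 0xff),
                        (unsigned)((addr >> 8) & 0xff),
                        (unsigned)(addr & 0xff),
                        (unsigned)(port >> 8), (unsigned)(port & 0xff));
    }

    int main(void)
    {
        char buf[sizeof("255,255,255,255,255,255")];

        ftp_fmt_port(buf, sizeof(buf), 0xc0a80001u, 50000);
        puts(buf);                      /* 192,168,0,1,195,80 */
        return 0;
    }

The helper then substitutes this string for the original argument in the TCP payload, which is why the sequence-number adjustment machinery further down is needed whenever the replacement changes the segment length.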
*/ -static unsigned int nf_nat_ftp(struct sk_buff *skb, -			       enum ip_conntrack_info ctinfo, -			       enum nf_ct_ftp_type type, -			       unsigned int matchoff, -			       unsigned int matchlen, -			       struct nf_conntrack_expect *exp) -{ -	__be32 newip; -	u_int16_t port; -	int dir = CTINFO2DIR(ctinfo); -	struct nf_conn *ct = exp->master; -	char buffer[sizeof("|1|255.255.255.255|65535|")]; -	unsigned int buflen; - -	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); - -	/* Connection will come from wherever this packet goes, hence !dir */ -	newip = ct->tuplehash[!dir].tuple.dst.u3.ip; -	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; -	exp->dir = !dir; - -	/* When you see the packet, we need to NAT it the same as the -	 * this one. */ -	exp->expectfn = nf_nat_follow_master; - -	/* Try to get same port: if not, try to change it. */ -	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { -		int ret; - -		exp->tuple.dst.u.tcp.port = htons(port); -		ret = nf_ct_expect_related(exp); -		if (ret == 0) -			break; -		else if (ret != -EBUSY) { -			port = 0; -			break; -		} -	} - -	if (port == 0) -		return NF_DROP; - -	buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port); -	if (!buflen) -		goto out; - -	pr_debug("calling nf_nat_mangle_tcp_packet\n"); - -	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, -				      matchlen, buffer, buflen)) -		goto out; - -	return NF_ACCEPT; - -out: -	nf_ct_unexpect_related(exp); -	return NF_DROP; -} - -static void __exit nf_nat_ftp_fini(void) -{ -	rcu_assign_pointer(nf_nat_ftp_hook, NULL); -	synchronize_rcu(); -} - -static int __init nf_nat_ftp_init(void) -{ -	BUG_ON(nf_nat_ftp_hook != NULL); -	rcu_assign_pointer(nf_nat_ftp_hook, nf_nat_ftp); -	return 0; -} - -/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) -{ -	printk(KERN_INFO KBUILD_MODNAME -	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); -	return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(nf_nat_ftp_init); -module_exit(nf_nat_ftp_fini); diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 790f3160e01..574f7ebba0b 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -2,6 +2,7 @@   * H.323 extension for NAT alteration.   *   * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>   *   * This source code is licensed under General Public License version 2.   
* @@ -15,13 +16,12 @@  #include <net/netfilter/nf_nat.h>  #include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h>  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_conntrack_expect.h>  #include <linux/netfilter/nf_conntrack_h323.h>  /****************************************************************************/ -static int set_addr(struct sk_buff *skb, +static int set_addr(struct sk_buff *skb, unsigned int protoff,  		    unsigned char **data, int dataoff,  		    unsigned int addroff, __be32 ip, __be16 port)  { @@ -40,11 +40,9 @@ static int set_addr(struct sk_buff *skb,  	if (ip_hdr(skb)->protocol == IPPROTO_TCP) {  		if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, -					      addroff, sizeof(buf), +					      protoff, addroff, sizeof(buf),  					      (char *) &buf, sizeof(buf))) { -			if (net_ratelimit()) -				pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet" -				       " error\n"); +			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");  			return -1;  		} @@ -56,11 +54,9 @@ static int set_addr(struct sk_buff *skb,  		*data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;  	} else {  		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, -					      addroff, sizeof(buf), +					      protoff, addroff, sizeof(buf),  					      (char *) &buf, sizeof(buf))) { -			if (net_ratelimit()) -				pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet" -				       " error\n"); +			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");  			return -1;  		}  		/* nf_nat_mangle_udp_packet uses skb_make_writable() to copy @@ -73,22 +69,22 @@ static int set_addr(struct sk_buff *skb,  }  /****************************************************************************/ -static int set_h225_addr(struct sk_buff *skb, +static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,  			 unsigned char **data, int dataoff,  			 TransportAddress *taddr,  			 union nf_inet_addr *addr, __be16 port)  { -	return set_addr(skb, data, dataoff, taddr->ipAddress.ip, +	return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,  			addr->ip, port);  }  /****************************************************************************/ -static int set_h245_addr(struct sk_buff *skb, +static int set_h245_addr(struct sk_buff *skb, unsigned protoff,  			 unsigned char **data, int dataoff,  			 H245_TransportAddress *taddr,  			 union nf_inet_addr *addr, __be16 port)  { -	return set_addr(skb, data, dataoff, +	return set_addr(skb, protoff, data, dataoff,  			taddr->unicastAddress.iPAddress.network,  			addr->ip, port);  } @@ -96,10 +92,10 @@ static int set_h245_addr(struct sk_buff *skb,  /****************************************************************************/  static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,  			enum ip_conntrack_info ctinfo, -			unsigned char **data, +			unsigned int protoff, unsigned char **data,  			TransportAddress *taddr, int count)  { -	const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	const struct nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	int i;  	__be16 port; @@ -122,7 +118,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,  					 &addr.ip, port,  					 &ct->tuplehash[!dir].tuple.dst.u3.ip,  					 info->sig_port[!dir]); -				return set_h225_addr(skb, data, 0, &taddr[i], +				return set_h225_addr(skb, protoff, data, 0, +						     &taddr[i],  						     &ct->tuplehash[!dir].  						     
tuple.dst.u3,  						     info->sig_port[!dir]); @@ -133,7 +130,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,  					 &addr.ip, port,  					 &ct->tuplehash[!dir].tuple.src.u3.ip,  					 info->sig_port[!dir]); -				return set_h225_addr(skb, data, 0, &taddr[i], +				return set_h225_addr(skb, protoff, data, 0, +						     &taddr[i],  						     &ct->tuplehash[!dir].  						     tuple.src.u3,  						     info->sig_port[!dir]); @@ -147,7 +145,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,  			enum ip_conntrack_info ctinfo, -			unsigned char **data, +			unsigned int protoff, unsigned char **data,  			TransportAddress *taddr, int count)  {  	int dir = CTINFO2DIR(ctinfo); @@ -163,7 +161,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,  				 &addr.ip, ntohs(port),  				 &ct->tuplehash[!dir].tuple.dst.u3.ip,  				 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); -			return set_h225_addr(skb, data, 0, &taddr[i], +			return set_h225_addr(skb, protoff, data, 0, &taddr[i],  					     &ct->tuplehash[!dir].tuple.dst.u3,  					     ct->tuplehash[!dir].tuple.  								dst.u.udp.port); @@ -176,13 +174,13 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  			enum ip_conntrack_info ctinfo, -			unsigned char **data, int dataoff, +			unsigned int protoff, unsigned char **data, int dataoff,  			H245_TransportAddress *taddr,  			__be16 port, __be16 rtp_port,  			struct nf_conntrack_expect *rtp_exp,  			struct nf_conntrack_expect *rtcp_exp)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	int i;  	u_int16_t nated_port; @@ -214,8 +212,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  	/* Run out of expectations */  	if (i >= H323_RTP_CHANNEL_MAX) { -		if (net_ratelimit()) -			pr_notice("nf_nat_h323: out of expectations\n"); +		net_notice_ratelimited("nf_nat_h323: out of expectations\n");  		return 0;  	} @@ -232,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  			ret = nf_ct_expect_related(rtcp_exp);  			if (ret == 0)  				break; -			else if (ret != -EBUSY) { +			else if (ret == -EBUSY) { +				nf_ct_unexpect_related(rtp_exp); +				continue; +			} else if (ret < 0) {  				nf_ct_unexpect_related(rtp_exp);  				nated_port = 0;  				break; @@ -244,13 +244,12 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  	}  	if (nated_port == 0) {	/* No port available */ -		if (net_ratelimit()) -			pr_notice("nf_nat_h323: out of RTP ports\n"); +		net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");  		return 0;  	}  	/* Modify signal */ -	if (set_h245_addr(skb, data, dataoff, taddr, +	if (set_h245_addr(skb, protoff, data, dataoff, taddr,  			  &ct->tuplehash[!dir].tuple.dst.u3,  			  htons((port & htons(1)) ? 
nated_port + 1 :  						    nated_port)) == 0) { @@ -281,7 +280,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,  		    enum ip_conntrack_info ctinfo, -		    unsigned char **data, int dataoff, +		    unsigned int protoff, unsigned char **data, int dataoff,  		    H245_TransportAddress *taddr, __be16 port,  		    struct nf_conntrack_expect *exp)  { @@ -308,13 +307,12 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,  	}  	if (nated_port == 0) {	/* No port available */ -		if (net_ratelimit()) -			pr_notice("nf_nat_h323: out of TCP ports\n"); +		net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");  		return 0;  	}  	/* Modify signal */ -	if (set_h245_addr(skb, data, dataoff, taddr, +	if (set_h245_addr(skb, protoff, data, dataoff, taddr,  			  &ct->tuplehash[!dir].tuple.dst.u3,  			  htons(nated_port)) < 0) {  		nf_ct_unexpect_related(exp); @@ -333,11 +331,11 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,  /****************************************************************************/  static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,  		    enum ip_conntrack_info ctinfo, -		    unsigned char **data, int dataoff, +		    unsigned int protoff, unsigned char **data, int dataoff,  		    TransportAddress *taddr, __be16 port,  		    struct nf_conntrack_expect *exp)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	u_int16_t nated_port = ntohs(port); @@ -365,13 +363,12 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,  	}  	if (nated_port == 0) {	/* No port available */ -		if (net_ratelimit()) -			pr_notice("nf_nat_q931: out of TCP ports\n"); +		net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");  		return 0;  	}  	/* Modify signal */ -	if (set_h225_addr(skb, data, dataoff, taddr, +	if (set_h225_addr(skb, protoff, data, dataoff, taddr,  			  &ct->tuplehash[!dir].tuple.dst.u3,  			  htons(nated_port)) == 0) {  		/* Save ports */ @@ -409,25 +406,27 @@ static void ip_nat_q931_expect(struct nf_conn *new,  	BUG_ON(new->status & IPS_NAT_DONE_MASK);  	/* Change src to where master sends to */ -	range.flags = IP_NAT_RANGE_MAP_IPS; -	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; -	nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); +	range.flags = NF_NAT_RANGE_MAP_IPS; +	range.min_addr = range.max_addr = +	    new->tuplehash[!this->dir].tuple.src.u3; +	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);  	/* For DST manip, map port here to where it's expected. 
*/ -	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); -	range.min = range.max = this->saved_proto; -	range.min_ip = range.max_ip = -	    new->master->tuplehash[!this->dir].tuple.src.u3.ip; -	nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); +	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); +	range.min_proto = range.max_proto = this->saved_proto; +	range.min_addr = range.max_addr = +	    new->master->tuplehash[!this->dir].tuple.src.u3; +	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);  }  /****************************************************************************/  static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,  		    enum ip_conntrack_info ctinfo, -		    unsigned char **data, TransportAddress *taddr, int idx, +		    unsigned int protoff, unsigned char **data, +		    TransportAddress *taddr, int idx,  		    __be16 port, struct nf_conntrack_expect *exp)  { -	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; +	struct nf_ct_h323_master *info = nfct_help_data(ct);  	int dir = CTINFO2DIR(ctinfo);  	u_int16_t nated_port = ntohs(port);  	union nf_inet_addr addr; @@ -456,13 +455,12 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,  	}  	if (nated_port == 0) {	/* No port available */ -		if (net_ratelimit()) -			pr_notice("nf_nat_ras: out of TCP ports\n"); +		net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");  		return 0;  	}  	/* Modify signal */ -	if (set_h225_addr(skb, data, 0, &taddr[idx], +	if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],  			  &ct->tuplehash[!dir].tuple.dst.u3,  			  htons(nated_port)) == 0) {  		/* Save ports */ @@ -473,7 +471,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,  		if (idx > 0 &&  		    get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&  		    (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { -			set_h225_addr(skb, data, 0, &taddr[0], +			set_h225_addr(skb, protoff, data, 0, &taddr[0],  				      &ct->tuplehash[!dir].tuple.dst.u3,  				      info->sig_port[!dir]);  		} @@ -502,20 +500,22 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new,  	BUG_ON(new->status & IPS_NAT_DONE_MASK);  	/* Change src to where master sends to */ -	range.flags = IP_NAT_RANGE_MAP_IPS; -	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; -	nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); +	range.flags = NF_NAT_RANGE_MAP_IPS; +	range.min_addr = range.max_addr = +	    new->tuplehash[!this->dir].tuple.src.u3; +	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);  	/* For DST manip, map port here to where it's expected. 
*/ -	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); -	range.min = range.max = this->saved_proto; -	range.min_ip = range.max_ip = this->saved_ip; -	nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); +	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); +	range.min_proto = range.max_proto = this->saved_proto; +	range.min_addr = range.max_addr = this->saved_addr; +	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);  }  /****************************************************************************/  static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,  			      enum ip_conntrack_info ctinfo, +			      unsigned int protoff,  			      unsigned char **data, int dataoff,  			      TransportAddress *taddr, __be16 port,  			      struct nf_conntrack_expect *exp) @@ -524,7 +524,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,  	u_int16_t nated_port;  	/* Set expectations for NAT */ -	exp->saved_ip = exp->tuple.dst.u3.ip; +	exp->saved_addr = exp->tuple.dst.u3;  	exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;  	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;  	exp->expectfn = ip_nat_callforwarding_expect; @@ -545,13 +545,12 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,  	}  	if (nated_port == 0) {	/* No port available */ -		if (net_ratelimit()) -			pr_notice("nf_nat_q931: out of TCP ports\n"); +		net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");  		return 0;  	}  	/* Modify signal */ -	if (!set_h225_addr(skb, data, dataoff, taddr, +	if (!set_h225_addr(skb, protoff, data, dataoff, taddr,  			   &ct->tuplehash[!dir].tuple.dst.u3,  			   htons(nated_port)) == 0) {  		nf_ct_unexpect_related(exp); @@ -568,6 +567,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,  	return 0;  } +static struct nf_ct_helper_expectfn q931_nat = { +	.name		= "Q.931", +	.expectfn	= ip_nat_q931_expect, +}; + +static struct nf_ct_helper_expectfn callforwarding_nat = { +	.name		= "callforwarding", +	.expectfn	= ip_nat_callforwarding_expect, +}; +  /****************************************************************************/  static int __init init(void)  { @@ -581,30 +590,34 @@ static int __init init(void)  	BUG_ON(nat_callforwarding_hook != NULL);  	BUG_ON(nat_q931_hook != NULL); -	rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); -	rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); -	rcu_assign_pointer(set_sig_addr_hook, set_sig_addr); -	rcu_assign_pointer(set_ras_addr_hook, set_ras_addr); -	rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp); -	rcu_assign_pointer(nat_t120_hook, nat_t120); -	rcu_assign_pointer(nat_h245_hook, nat_h245); -	rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding); -	rcu_assign_pointer(nat_q931_hook, nat_q931); +	RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr); +	RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr); +	RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr); +	RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr); +	RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp); +	RCU_INIT_POINTER(nat_t120_hook, nat_t120); +	RCU_INIT_POINTER(nat_h245_hook, nat_h245); +	RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding); +	RCU_INIT_POINTER(nat_q931_hook, nat_q931); +	nf_ct_helper_expectfn_register(&q931_nat); +	nf_ct_helper_expectfn_register(&callforwarding_nat);  	return 0;  }  /****************************************************************************/  static void __exit fini(void)  { -	
rcu_assign_pointer(set_h245_addr_hook, NULL); -	rcu_assign_pointer(set_h225_addr_hook, NULL); -	rcu_assign_pointer(set_sig_addr_hook, NULL); -	rcu_assign_pointer(set_ras_addr_hook, NULL); -	rcu_assign_pointer(nat_rtp_rtcp_hook, NULL); -	rcu_assign_pointer(nat_t120_hook, NULL); -	rcu_assign_pointer(nat_h245_hook, NULL); -	rcu_assign_pointer(nat_callforwarding_hook, NULL); -	rcu_assign_pointer(nat_q931_hook, NULL); +	RCU_INIT_POINTER(set_h245_addr_hook, NULL); +	RCU_INIT_POINTER(set_h225_addr_hook, NULL); +	RCU_INIT_POINTER(set_sig_addr_hook, NULL); +	RCU_INIT_POINTER(set_ras_addr_hook, NULL); +	RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL); +	RCU_INIT_POINTER(nat_t120_hook, NULL); +	RCU_INIT_POINTER(nat_h245_hook, NULL); +	RCU_INIT_POINTER(nat_callforwarding_hook, NULL); +	RCU_INIT_POINTER(nat_q931_hook, NULL); +	nf_ct_helper_expectfn_unregister(&q931_nat); +	nf_ct_helper_expectfn_unregister(&callforwarding_nat);  	synchronize_rcu();  } diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c deleted file mode 100644 index 31427fb57aa..00000000000 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ /dev/null @@ -1,451 +0,0 @@ -/* ip_nat_helper.c - generic support functions for NAT helpers - * - * (C) 2000-2002 Harald Welte <laforge@netfilter.org> - * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/module.h> -#include <linux/gfp.h> -#include <linux/kmod.h> -#include <linux/types.h> -#include <linux/timer.h> -#include <linux/skbuff.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_ecache.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_helper.h> - -#define DUMP_OFFSET(x) \ -	pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \ -		 x->offset_before, x->offset_after, x->correction_pos); - -static DEFINE_SPINLOCK(nf_nat_seqofs_lock); - -/* Setup TCP sequence correction given this change at this sequence */ -static inline void -adjust_tcp_sequence(u32 seq, -		    int sizediff, -		    struct nf_conn *ct, -		    enum ip_conntrack_info ctinfo) -{ -	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); -	struct nf_conn_nat *nat = nfct_nat(ct); -	struct nf_nat_seq *this_way = &nat->seq[dir]; - -	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", -		 seq, sizediff); - -	pr_debug("adjust_tcp_sequence: Seq_offset before: "); -	DUMP_OFFSET(this_way); - -	spin_lock_bh(&nf_nat_seqofs_lock); - -	/* SYN adjust. 
If it's uninitialized, or this is after last -	 * correction, record it: we don't handle more than one -	 * adjustment in the window, but do deal with common case of a -	 * retransmit */ -	if (this_way->offset_before == this_way->offset_after || -	    before(this_way->correction_pos, seq)) { -		this_way->correction_pos = seq; -		this_way->offset_before = this_way->offset_after; -		this_way->offset_after += sizediff; -	} -	spin_unlock_bh(&nf_nat_seqofs_lock); - -	pr_debug("adjust_tcp_sequence: Seq_offset after: "); -	DUMP_OFFSET(this_way); -} - -/* Get the offset value, for conntrack */ -s16 nf_nat_get_offset(const struct nf_conn *ct, -		      enum ip_conntrack_dir dir, -		      u32 seq) -{ -	struct nf_conn_nat *nat = nfct_nat(ct); -	struct nf_nat_seq *this_way; -	s16 offset; - -	if (!nat) -		return 0; - -	this_way = &nat->seq[dir]; -	spin_lock_bh(&nf_nat_seqofs_lock); -	offset = after(seq, this_way->correction_pos) -		 ? this_way->offset_after : this_way->offset_before; -	spin_unlock_bh(&nf_nat_seqofs_lock); - -	return offset; -} -EXPORT_SYMBOL_GPL(nf_nat_get_offset); - -/* Frobs data inside this packet, which is linear. */ -static void mangle_contents(struct sk_buff *skb, -			    unsigned int dataoff, -			    unsigned int match_offset, -			    unsigned int match_len, -			    const char *rep_buffer, -			    unsigned int rep_len) -{ -	unsigned char *data; - -	BUG_ON(skb_is_nonlinear(skb)); -	data = skb_network_header(skb) + dataoff; - -	/* move post-replacement */ -	memmove(data + match_offset + rep_len, -		data + match_offset + match_len, -		skb->tail - (skb->network_header + dataoff + -			     match_offset + match_len)); - -	/* insert data from buffer */ -	memcpy(data + match_offset, rep_buffer, rep_len); - -	/* update skb info */ -	if (rep_len > match_len) { -		pr_debug("nf_nat_mangle_packet: Extending packet by " -			 "%u from %u bytes\n", rep_len - match_len, skb->len); -		skb_put(skb, rep_len - match_len); -	} else { -		pr_debug("nf_nat_mangle_packet: Shrinking packet from " -			 "%u from %u bytes\n", match_len - rep_len, skb->len); -		__skb_trim(skb, skb->len + rep_len - match_len); -	} - -	/* fix IP hdr checksum information */ -	ip_hdr(skb)->tot_len = htons(skb->len); -	ip_send_check(ip_hdr(skb)); -} - -/* Unusual, but possible case. 
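The sequence bookkeeping above stores, per direction, a correction position plus the offsets valid before and after it; nf_nat_get_offset() applies offset_after only to sequence numbers beyond that point. A compact userspace illustration of the selection, using modular 32-bit sequence comparison (the struct and names here are only for the example):

    #include <stdint.h>
    #include <stdio.h>

    struct seq_off {
        uint32_t correction_pos;    /* seq where the packet length changed */
        int16_t  offset_before;     /* delta for seqs at or before that point */
        int16_t  offset_after;      /* delta for seqs after it */
    };

    /* "after" in modular 32-bit sequence space, as TCP defines it. */
    static int seq_after(uint32_t a, uint32_t b)
    {
        return (int32_t)(a - b) > 0;
    }

    static int16_t offset_for(const struct seq_off *so, uint32_t seq)
    {
        return seq_after(seq, so->correction_pos) ? so->offset_after
                                                  : so->offset_before;
    }

    int main(void)
    {
        struct seq_off so = { .correction_pos = 1000,
                              .offset_before = 0, .offset_after = 5 };

        printf("%d %d\n", offset_for(&so, 900), offset_for(&so, 1500)); /* 0 5 */
        return 0;
    }

Retransmissions of an already-mangled segment fall at or before correction_pos and therefore get the old offset, which is the common case the comment above calls out.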
*/ -static int enlarge_skb(struct sk_buff *skb, unsigned int extra) -{ -	if (skb->len + extra > 65535) -		return 0; - -	if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC)) -		return 0; - -	return 1; -} - -void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, -			   __be32 seq, s16 off) -{ -	if (!off) -		return; -	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); -	adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); -	nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); -} -EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); - -static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data, -			int datalen, __sum16 *check, int oldlen) -{ -	struct rtable *rt = skb_rtable(skb); - -	if (skb->ip_summed != CHECKSUM_PARTIAL) { -		if (!(rt->rt_flags & RTCF_LOCAL) && -		    skb->dev->features & NETIF_F_V4_CSUM) { -			skb->ip_summed = CHECKSUM_PARTIAL; -			skb->csum_start = skb_headroom(skb) + -					  skb_network_offset(skb) + -					  iph->ihl * 4; -			skb->csum_offset = (void *)check - data; -			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, -						    datalen, iph->protocol, 0); -		} else { -			*check = 0; -			*check = csum_tcpudp_magic(iph->saddr, iph->daddr, -						   datalen, iph->protocol, -						   csum_partial(data, datalen, -								0)); -			if (iph->protocol == IPPROTO_UDP && !*check) -				*check = CSUM_MANGLED_0; -		} -	} else -		inet_proto_csum_replace2(check, skb, -					 htons(oldlen), htons(datalen), 1); -} - -/* Generic function for mangling variable-length address changes inside - * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX - * command in FTP). - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * */ -int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, -			       struct nf_conn *ct, -			       enum ip_conntrack_info ctinfo, -			       unsigned int match_offset, -			       unsigned int match_len, -			       const char *rep_buffer, -			       unsigned int rep_len, bool adjust) -{ -	struct iphdr *iph; -	struct tcphdr *tcph; -	int oldlen, datalen; - -	if (!skb_make_writable(skb, skb->len)) -		return 0; - -	if (rep_len > match_len && -	    rep_len - match_len > skb_tailroom(skb) && -	    !enlarge_skb(skb, rep_len - match_len)) -		return 0; - -	SKB_LINEAR_ASSERT(skb); - -	iph = ip_hdr(skb); -	tcph = (void *)iph + iph->ihl*4; - -	oldlen = skb->len - iph->ihl*4; -	mangle_contents(skb, iph->ihl*4 + tcph->doff*4, -			match_offset, match_len, rep_buffer, rep_len); - -	datalen = skb->len - iph->ihl*4; -	nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen); - -	if (adjust && rep_len != match_len) -		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, -				      (int)rep_len - (int)match_len); - -	return 1; -} -EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); - -/* Generic function for mangling variable-length address changes inside - * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX - * command in the Amanda protocol) - * - * Takes care about all the nasty sequence number changes, checksumming, - * skb enlargement, ... - * - * XXX - This function could be merged with nf_nat_mangle_tcp_packet which - *       should be fairly easy to do. 
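nf_nat_csum() above either arms checksum offload (CHECKSUM_PARTIAL) or recomputes the transport checksum over pseudo-header and payload, and for UDP it maps a computed checksum of zero to 0xFFFF, since zero means "no checksum" on the wire. A plain-C sketch of a one's-complement sum plus that UDP rule (csum16 is an illustrative helper, not the kernel's csum_partial):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* RFC 1071 one's-complement checksum over a byte buffer. */
    static uint16_t csum16(const uint8_t *data, size_t len)
    {
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
            sum += ((uint32_t)data[i] << 8) | data[i + 1];
        if (len & 1)
            sum += (uint32_t)data[len - 1] << 8;
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }

    int main(void)
    {
        uint8_t payload[] = { 0x45, 0x00, 0x00, 0x1c };
        uint16_t check = csum16(payload, sizeof(payload));

        if (check == 0)         /* UDP only: 0 means "no checksum" on the wire */
            check = 0xffff;
        printf("0x%04x\n", (unsigned)check);
        return 0;
    }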
- */ -int -nf_nat_mangle_udp_packet(struct sk_buff *skb, -			 struct nf_conn *ct, -			 enum ip_conntrack_info ctinfo, -			 unsigned int match_offset, -			 unsigned int match_len, -			 const char *rep_buffer, -			 unsigned int rep_len) -{ -	struct iphdr *iph; -	struct udphdr *udph; -	int datalen, oldlen; - -	/* UDP helpers might accidentally mangle the wrong packet */ -	iph = ip_hdr(skb); -	if (skb->len < iph->ihl*4 + sizeof(*udph) + -			       match_offset + match_len) -		return 0; - -	if (!skb_make_writable(skb, skb->len)) -		return 0; - -	if (rep_len > match_len && -	    rep_len - match_len > skb_tailroom(skb) && -	    !enlarge_skb(skb, rep_len - match_len)) -		return 0; - -	iph = ip_hdr(skb); -	udph = (void *)iph + iph->ihl*4; - -	oldlen = skb->len - iph->ihl*4; -	mangle_contents(skb, iph->ihl*4 + sizeof(*udph), -			match_offset, match_len, rep_buffer, rep_len); - -	/* update the length of the UDP packet */ -	datalen = skb->len - iph->ihl*4; -	udph->len = htons(datalen); - -	/* fix udp checksum if udp checksum was previously calculated */ -	if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) -		return 1; - -	nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen); - -	return 1; -} -EXPORT_SYMBOL(nf_nat_mangle_udp_packet); - -/* Adjust one found SACK option including checksum correction */ -static void -sack_adjust(struct sk_buff *skb, -	    struct tcphdr *tcph, -	    unsigned int sackoff, -	    unsigned int sackend, -	    struct nf_nat_seq *natseq) -{ -	while (sackoff < sackend) { -		struct tcp_sack_block_wire *sack; -		__be32 new_start_seq, new_end_seq; - -		sack = (void *)skb->data + sackoff; -		if (after(ntohl(sack->start_seq) - natseq->offset_before, -			  natseq->correction_pos)) -			new_start_seq = htonl(ntohl(sack->start_seq) -					- natseq->offset_after); -		else -			new_start_seq = htonl(ntohl(sack->start_seq) -					- natseq->offset_before); - -		if (after(ntohl(sack->end_seq) - natseq->offset_before, -			  natseq->correction_pos)) -			new_end_seq = htonl(ntohl(sack->end_seq) -				      - natseq->offset_after); -		else -			new_end_seq = htonl(ntohl(sack->end_seq) -				      - natseq->offset_before); - -		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", -			 ntohl(sack->start_seq), new_start_seq, -			 ntohl(sack->end_seq), new_end_seq); - -		inet_proto_csum_replace4(&tcph->check, skb, -					 sack->start_seq, new_start_seq, 0); -		inet_proto_csum_replace4(&tcph->check, skb, -					 sack->end_seq, new_end_seq, 0); -		sack->start_seq = new_start_seq; -		sack->end_seq = new_end_seq; -		sackoff += sizeof(*sack); -	} -} - -/* TCP SACK sequence number adjustment */ -static inline unsigned int -nf_nat_sack_adjust(struct sk_buff *skb, -		   struct tcphdr *tcph, -		   struct nf_conn *ct, -		   enum ip_conntrack_info ctinfo) -{ -	unsigned int dir, optoff, optend; -	struct nf_conn_nat *nat = nfct_nat(ct); - -	optoff = ip_hdrlen(skb) + sizeof(struct tcphdr); -	optend = ip_hdrlen(skb) + tcph->doff * 4; - -	if (!skb_make_writable(skb, optend)) -		return 0; - -	dir = CTINFO2DIR(ctinfo); - -	while (optoff < optend) { -		/* Usually: option, length. 
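The nf_nat_sack_adjust() walk begun above (and continued just below) steps through the TCP options as (kind, length) pairs, skipping NOP, stopping at EOL, and rejecting truncated options before it touches a SACK block. A userspace sketch of that defensive walk, with the option constants reduced to the few kinds the example needs:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    enum { TCPOPT_EOL = 0, TCPOPT_NOP = 1, TCPOPT_SACK = 5 };

    /* Walk a TCP option block as (kind, length, data...) entries and
     * report each SACK option, with the same bounds checks as above:
     * no partial options, no length < 2, no run past the end. */
    static void walk_options(const uint8_t *opt, size_t len)
    {
        size_t off = 0;

        while (off < len) {
            if (opt[off] == TCPOPT_EOL)
                return;
            if (opt[off] == TCPOPT_NOP) {
                off++;
                continue;
            }
            if (off + 1 >= len || opt[off + 1] < 2 || off + opt[off + 1] > len)
                return;
            if (opt[off] == TCPOPT_SACK)
                printf("SACK option at offset %zu, len %u\n",
                       off, (unsigned)opt[off + 1]);
            off += opt[off + 1];
        }
    }

    int main(void)
    {
        /* NOP, NOP, SACK (len 10: one left/right edge pair), EOL */
        uint8_t opts[] = { 1, 1, 5, 10, 0, 0, 0, 1, 0, 0, 0, 2, 0 };

        walk_options(opts, sizeof(opts));
        return 0;
    }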
*/ -		unsigned char *op = skb->data + optoff; - -		switch (op[0]) { -		case TCPOPT_EOL: -			return 1; -		case TCPOPT_NOP: -			optoff++; -			continue; -		default: -			/* no partial options */ -			if (optoff + 1 == optend || -			    optoff + op[1] > optend || -			    op[1] < 2) -				return 0; -			if (op[0] == TCPOPT_SACK && -			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK && -			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) -				sack_adjust(skb, tcph, optoff+2, -					    optoff+op[1], &nat->seq[!dir]); -			optoff += op[1]; -		} -	} -	return 1; -} - -/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */ -int -nf_nat_seq_adjust(struct sk_buff *skb, -		  struct nf_conn *ct, -		  enum ip_conntrack_info ctinfo) -{ -	struct tcphdr *tcph; -	int dir; -	__be32 newseq, newack; -	s16 seqoff, ackoff; -	struct nf_conn_nat *nat = nfct_nat(ct); -	struct nf_nat_seq *this_way, *other_way; - -	dir = CTINFO2DIR(ctinfo); - -	this_way = &nat->seq[dir]; -	other_way = &nat->seq[!dir]; - -	if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) -		return 0; - -	tcph = (void *)skb->data + ip_hdrlen(skb); -	if (after(ntohl(tcph->seq), this_way->correction_pos)) -		seqoff = this_way->offset_after; -	else -		seqoff = this_way->offset_before; - -	if (after(ntohl(tcph->ack_seq) - other_way->offset_before, -		  other_way->correction_pos)) -		ackoff = other_way->offset_after; -	else -		ackoff = other_way->offset_before; - -	newseq = htonl(ntohl(tcph->seq) + seqoff); -	newack = htonl(ntohl(tcph->ack_seq) - ackoff); - -	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); -	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); - -	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", -		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), -		 ntohl(newack)); - -	tcph->seq = newseq; -	tcph->ack_seq = newack; - -	return nf_nat_sack_adjust(skb, tcph, ct, ctinfo); -} - -/* Setup NAT on this expected conntrack so it follows master. */ -/* If we fail to get a free NAT slot, we'll get dropped on confirm */ -void nf_nat_follow_master(struct nf_conn *ct, -			  struct nf_conntrack_expect *exp) -{ -	struct nf_nat_range range; - -	/* This must be a fresh one. */ -	BUG_ON(ct->status & IPS_NAT_DONE_MASK); - -	/* Change src to where master sends to */ -	range.flags = IP_NAT_RANGE_MAP_IPS; -	range.min_ip = range.max_ip -		= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; -	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); - -	/* For DST manip, map port here to where it's expected. */ -	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); -	range.min = range.max = exp->saved_proto; -	range.min_ip = range.max_ip -		= ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; -	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); -} -EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c deleted file mode 100644 index 535e1a80235..00000000000 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ /dev/null @@ -1,99 +0,0 @@ -/* IRC extension for TCP NAT alteration. - * - * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> - * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation - * based on a copy of RR's ip_nat_ftp.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/tcp.h> -#include <linux/kernel.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_irc.h> - -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("IRC (DCC) NAT helper"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_irc"); - -static unsigned int help(struct sk_buff *skb, -			 enum ip_conntrack_info ctinfo, -			 unsigned int matchoff, -			 unsigned int matchlen, -			 struct nf_conntrack_expect *exp) -{ -	char buffer[sizeof("4294967296 65635")]; -	u_int32_t ip; -	u_int16_t port; -	unsigned int ret; - -	/* Reply comes from server. */ -	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; -	exp->dir = IP_CT_DIR_REPLY; -	exp->expectfn = nf_nat_follow_master; - -	/* Try to get same port: if not, try to change it. */ -	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { -		int ret; - -		exp->tuple.dst.u.tcp.port = htons(port); -		ret = nf_ct_expect_related(exp); -		if (ret == 0) -			break; -		else if (ret != -EBUSY) { -			port = 0; -			break; -		} -	} - -	if (port == 0) -		return NF_DROP; - -	ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip); -	sprintf(buffer, "%u %u", ip, port); -	pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", -		 buffer, &ip, port); - -	ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo, -				       matchoff, matchlen, buffer, -				       strlen(buffer)); -	if (ret != NF_ACCEPT) -		nf_ct_unexpect_related(exp); -	return ret; -} - -static void __exit nf_nat_irc_fini(void) -{ -	rcu_assign_pointer(nf_nat_irc_hook, NULL); -	synchronize_rcu(); -} - -static int __init nf_nat_irc_init(void) -{ -	BUG_ON(nf_nat_irc_hook != NULL); -	rcu_assign_pointer(nf_nat_irc_hook, help); -	return 0; -} - -/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */ -static int warn_set(const char *val, struct kernel_param *kp) -{ -	printk(KERN_INFO KBUILD_MODNAME -	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); -	return 0; -} -module_param_call(ports, warn_set, NULL, NULL, 0); - -module_init(nf_nat_irc_init); -module_exit(nf_nat_irc_fini); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c new file mode 100644 index 00000000000..d8b2e14efdd --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -0,0 +1,281 @@ +/* + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
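The IRC DCC helper above rewrites the address/port argument of DCC requests, which travel as a single decimal 32-bit address followed by the port ("%u %u"). A tiny userspace sketch of that encoding:

    #include <stdint.h>
    #include <stdio.h>

    /* DCC CHAT/SEND carry "<address as decimal u32> <port>"; build that
     * string from a host-order IPv4 address. */
    int main(void)
    {
        uint32_t ip = (192u << 24) | (168u << 16) | (0u << 8) | 1u;
        uint16_t port = 54321;
        char buffer[sizeof("4294967295 65535")];

        snprintf(buffer, sizeof(buffer), "%u %u",
                 (unsigned)ip, (unsigned)port);
        puts(buffer);                   /* 3232235521 54321 */
        return 0;
    }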
+ */ + +#include <linux/types.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/secure_seq.h> +#include <net/checksum.h> +#include <net/route.h> +#include <net/ip.h> + +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv4; + +#ifdef CONFIG_XFRM +static void nf_nat_ipv4_decode_session(struct sk_buff *skb, +				       const struct nf_conn *ct, +				       enum ip_conntrack_dir dir, +				       unsigned long statusbit, +				       struct flowi *fl) +{ +	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; +	struct flowi4 *fl4 = &fl->u.ip4; + +	if (ct->status & statusbit) { +		fl4->daddr = t->dst.u3.ip; +		if (t->dst.protonum == IPPROTO_TCP || +		    t->dst.protonum == IPPROTO_UDP || +		    t->dst.protonum == IPPROTO_UDPLITE || +		    t->dst.protonum == IPPROTO_DCCP || +		    t->dst.protonum == IPPROTO_SCTP) +			fl4->fl4_dport = t->dst.u.all; +	} + +	statusbit ^= IPS_NAT_MASK; + +	if (ct->status & statusbit) { +		fl4->saddr = t->src.u3.ip; +		if (t->dst.protonum == IPPROTO_TCP || +		    t->dst.protonum == IPPROTO_UDP || +		    t->dst.protonum == IPPROTO_UDPLITE || +		    t->dst.protonum == IPPROTO_DCCP || +		    t->dst.protonum == IPPROTO_SCTP) +			fl4->fl4_sport = t->src.u.all; +	} +} +#endif /* CONFIG_XFRM */ + +static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, +				 const struct nf_nat_range *range) +{ +	return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && +	       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); +} + +static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t, +				   __be16 dport) +{ +	return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport); +} + +static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, +				  unsigned int iphdroff, +				  const struct nf_nat_l4proto *l4proto, +				  const struct nf_conntrack_tuple *target, +				  enum nf_nat_manip_type maniptype) +{ +	struct iphdr *iph; +	unsigned int hdroff; + +	if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) +		return false; + +	iph = (void *)skb->data + iphdroff; +	hdroff = iphdroff + iph->ihl * 4; + +	if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff, +				target, maniptype)) +		return false; +	iph = (void *)skb->data + iphdroff; + +	if (maniptype == NF_NAT_MANIP_SRC) { +		csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); +		iph->saddr = target->src.u3.ip; +	} else { +		csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); +		iph->daddr = target->dst.u3.ip; +	} +	return true; +} + +static void nf_nat_ipv4_csum_update(struct sk_buff *skb, +				    unsigned int iphdroff, __sum16 *check, +				    const struct nf_conntrack_tuple *t, +				    enum nf_nat_manip_type maniptype) +{ +	struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); +	__be32 oldip, newip; + +	if (maniptype == NF_NAT_MANIP_SRC) { +		oldip = iph->saddr; +		newip = t->src.u3.ip; +	} else { +		oldip = iph->daddr; +		newip = t->dst.u3.ip; +	} +	inet_proto_csum_replace4(check, skb, oldip, newip, 1); +} + +static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, +				    u8 proto, void *data, __sum16 *check, +				    int datalen, int oldlen) +{ +	const struct iphdr *iph = ip_hdr(skb); +	struct rtable *rt = 
skb_rtable(skb); + +	if (skb->ip_summed != CHECKSUM_PARTIAL) { +		if (!(rt->rt_flags & RTCF_LOCAL) && +		    (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { +			skb->ip_summed = CHECKSUM_PARTIAL; +			skb->csum_start = skb_headroom(skb) + +					  skb_network_offset(skb) + +					  ip_hdrlen(skb); +			skb->csum_offset = (void *)check - data; +			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, +						    datalen, proto, 0); +		} else { +			*check = 0; +			*check = csum_tcpudp_magic(iph->saddr, iph->daddr, +						   datalen, proto, +						   csum_partial(data, datalen, +								0)); +			if (proto == IPPROTO_UDP && !*check) +				*check = CSUM_MANGLED_0; +		} +	} else +		inet_proto_csum_replace2(check, skb, +					 htons(oldlen), htons(datalen), 1); +} + +static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], +				       struct nf_nat_range *range) +{ +	if (tb[CTA_NAT_V4_MINIP]) { +		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); +		range->flags |= NF_NAT_RANGE_MAP_IPS; +	} + +	if (tb[CTA_NAT_V4_MAXIP]) +		range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); +	else +		range->max_addr.ip = range->min_addr.ip; + +	return 0; +} + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { +	.l3proto		= NFPROTO_IPV4, +	.in_range		= nf_nat_ipv4_in_range, +	.secure_port		= nf_nat_ipv4_secure_port, +	.manip_pkt		= nf_nat_ipv4_manip_pkt, +	.csum_update		= nf_nat_ipv4_csum_update, +	.csum_recalc		= nf_nat_ipv4_csum_recalc, +	.nlattr_to_range	= nf_nat_ipv4_nlattr_to_range, +#ifdef CONFIG_XFRM +	.decode_session		= nf_nat_ipv4_decode_session, +#endif +}; + +int nf_nat_icmp_reply_translation(struct sk_buff *skb, +				  struct nf_conn *ct, +				  enum ip_conntrack_info ctinfo, +				  unsigned int hooknum) +{ +	struct { +		struct icmphdr	icmp; +		struct iphdr	ip; +	} *inside; +	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); +	unsigned int hdrlen = ip_hdrlen(skb); +	const struct nf_nat_l4proto *l4proto; +	struct nf_conntrack_tuple target; +	unsigned long statusbit; + +	NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); + +	if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) +		return 0; +	if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) +		return 0; + +	inside = (void *)skb->data + hdrlen; +	if (inside->icmp.type == ICMP_REDIRECT) { +		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) +			return 0; +		if (ct->status & IPS_NAT_MASK) +			return 0; +	} + +	if (manip == NF_NAT_MANIP_SRC) +		statusbit = IPS_SRC_NAT; +	else +		statusbit = IPS_DST_NAT; + +	/* Invert if this is reply direction */ +	if (dir == IP_CT_DIR_REPLY) +		statusbit ^= IPS_NAT_MASK; + +	if (!(ct->status & statusbit)) +		return 1; + +	l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol); +	if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), +				   l4proto, &ct->tuplehash[!dir].tuple, !manip)) +		return 0; + +	if (skb->ip_summed != CHECKSUM_PARTIAL) { +		/* Reloading "inside" here since manip_pkt may reallocate */ +		inside = (void *)skb->data + hdrlen; +		inside->icmp.checksum = 0; +		inside->icmp.checksum = +			csum_fold(skb_checksum(skb, hdrlen, +					       skb->len - hdrlen, 0)); +	} + +	/* Change outer to look like the reply to an incoming packet */ +	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); +	l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); +	if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) +		return 0; + +	return 1; +} +EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); + 
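
Editor's note: the nf_nat_ipv4_manip_pkt() and nf_nat_ipv4_csum_update() functions added above never recompute a checksum from scratch; csum_replace4() and inet_proto_csum_replace4() patch the existing 16-bit Internet checksum incrementally when a 32-bit address is rewritten, following RFC 1624 (HC' = ~(~HC + ~m + m')). Below is a minimal standalone sketch of that update in plain userspace C, with host byte order and made-up values; it illustrates the arithmetic only and is not the kernel helper itself.

	#include <stdint.h>
	#include <stdio.h>

	/* Fold a 32-bit accumulator back into a 16-bit one's-complement sum. */
	static uint16_t csum_fold32(uint32_t sum)
	{
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)sum;
	}

	/* Update *check when a checksummed 32-bit word changes from old_word
	 * to new_word: add the complement of the old halves and the new halves
	 * to the complement of the old checksum, then fold and complement.
	 */
	static void csum_replace4_sketch(uint16_t *check,
					 uint32_t old_word, uint32_t new_word)
	{
		uint32_t sum = (uint16_t)~*check;	/* ~HC               */

		sum += (uint16_t)~(old_word >> 16);	/* ~m, high 16 bits  */
		sum += (uint16_t)~(old_word & 0xffff);	/* ~m, low 16 bits   */
		sum += new_word >> 16;			/*  m', high 16 bits */
		sum += new_word & 0xffff;		/*  m', low 16 bits  */

		*check = (uint16_t)~csum_fold32(sum);	/* HC'               */
	}

	int main(void)
	{
		uint16_t check = 0xb1e6;	/* hypothetical header checksum */

		/* pretend saddr changes from 192.168.0.1 to 10.0.0.1 */
		csum_replace4_sketch(&check, 0xc0a80001u, 0x0a000001u);
		printf("updated checksum: 0x%04x\n", check);
		return 0;
	}

Because only the old words' complements and the new words enter the sum, the cost of a NAT rewrite stays constant regardless of packet length, which is why the new l3proto code can keep the fast path cheap even for large segments.
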
+static int __init nf_nat_l3proto_ipv4_init(void) +{ +	int err; + +	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp); +	if (err < 0) +		goto err1; +	err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); +	if (err < 0) +		goto err2; +	return err; + +err2: +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); +err1: +	return err; +} + +static void __exit nf_nat_l3proto_ipv4_exit(void) +{ +	nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4); +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); +} + +MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf-nat-" __stringify(AF_INET)); + +module_init(nf_nat_l3proto_ipv4_init); +module_exit(nf_nat_l3proto_ipv4_exit); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 4c060038d29..657d2307f03 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -13,6 +13,8 @@   *   * Development of this code funded by Astaro AG (http://www.astaro.com/)   * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + *   * TODO: - NAT to a unique tuple, not to TCP source port   * 	   (needs netfilter tuple reservation)   */ @@ -22,7 +24,6 @@  #include <net/netfilter/nf_nat.h>  #include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h>  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_conntrack_expect.h>  #include <net/netfilter/nf_conntrack_zones.h> @@ -49,7 +50,7 @@ static void pptp_nat_expected(struct nf_conn *ct,  	const struct nf_nat_pptp *nat_pptp_info;  	struct nf_nat_range range; -	ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; +	ct_pptp_info = nfct_help_data(master);  	nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;  	/* And here goes the grand finale of corrosion... */ @@ -88,24 +89,24 @@ static void pptp_nat_expected(struct nf_conn *ct,  	BUG_ON(ct->status & IPS_NAT_DONE_MASK);  	/* Change src to where master sends to */ -	range.flags = IP_NAT_RANGE_MAP_IPS; -	range.min_ip = range.max_ip -		= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; +	range.flags = NF_NAT_RANGE_MAP_IPS; +	range.min_addr = range.max_addr +		= ct->master->tuplehash[!exp->dir].tuple.dst.u3;  	if (exp->dir == IP_CT_DIR_ORIGINAL) { -		range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; -		range.min = range.max = exp->saved_proto; +		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; +		range.min_proto = range.max_proto = exp->saved_proto;  	} -	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); +	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);  	/* For DST manip, map port here to where it's expected. 
*/ -	range.flags = IP_NAT_RANGE_MAP_IPS; -	range.min_ip = range.max_ip -		= ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; +	range.flags = NF_NAT_RANGE_MAP_IPS; +	range.min_addr = range.max_addr +		= ct->master->tuplehash[!exp->dir].tuple.src.u3;  	if (exp->dir == IP_CT_DIR_REPLY) { -		range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; -		range.min = range.max = exp->saved_proto; +		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; +		range.min_proto = range.max_proto = exp->saved_proto;  	} -	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); +	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);  }  /* outbound packets == from PNS to PAC */ @@ -113,6 +114,7 @@ static int  pptp_outbound_pkt(struct sk_buff *skb,  		  struct nf_conn *ct,  		  enum ip_conntrack_info ctinfo, +		  unsigned int protoff,  		  struct PptpControlHeader *ctlh,  		  union pptp_ctrl_union *pptpReq) @@ -123,7 +125,7 @@ pptp_outbound_pkt(struct sk_buff *skb,  	__be16 new_callid;  	unsigned int cid_off; -	ct_pptp_info  = &nfct_help(ct)->help.ct_pptp_info; +	ct_pptp_info = nfct_help_data(ct);  	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;  	new_callid = ct_pptp_info->pns_call_id; @@ -175,7 +177,7 @@ pptp_outbound_pkt(struct sk_buff *skb,  		 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));  	/* mangle packet */ -	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, +	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,  				     cid_off + sizeof(struct pptp_pkt_hdr) +  				     sizeof(struct PptpControlHeader),  				     sizeof(new_callid), (char *)&new_callid, @@ -192,7 +194,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,  	struct nf_ct_pptp_master *ct_pptp_info;  	struct nf_nat_pptp *nat_pptp_info; -	ct_pptp_info  = &nfct_help(ct)->help.ct_pptp_info; +	ct_pptp_info = nfct_help_data(ct);  	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;  	/* save original PAC call ID in nat_info */ @@ -216,6 +218,7 @@ static int  pptp_inbound_pkt(struct sk_buff *skb,  		 struct nf_conn *ct,  		 enum ip_conntrack_info ctinfo, +		 unsigned int protoff,  		 struct PptpControlHeader *ctlh,  		 union pptp_ctrl_union *pptpReq)  { @@ -268,7 +271,7 @@ pptp_inbound_pkt(struct sk_buff *skb,  	pr_debug("altering peer call id from 0x%04x to 0x%04x\n",  		 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); -	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, +	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,  				     pcid_off + sizeof(struct pptp_pkt_hdr) +  				     sizeof(struct PptpControlHeader),  				     sizeof(new_pcid), (char *)&new_pcid, @@ -282,25 +285,25 @@ static int __init nf_nat_helper_pptp_init(void)  	nf_nat_need_gre();  	BUG_ON(nf_nat_pptp_hook_outbound != NULL); -	rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); +	RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);  	BUG_ON(nf_nat_pptp_hook_inbound != NULL); -	rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); +	RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);  	BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); -	rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); +	RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);  	BUG_ON(nf_nat_pptp_hook_expectfn != NULL); -	rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected); +	RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);  	return 0;  }  static void __exit nf_nat_helper_pptp_fini(void)  { -	rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL); -	rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL); -	rcu_assign_pointer(nf_nat_pptp_hook_inbound, 
NULL); -	rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL); +	RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL); +	RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL); +	RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL); +	RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);  	synchronize_rcu();  } diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c deleted file mode 100644 index 3e61faf23a9..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_common.c +++ /dev/null @@ -1,124 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/random.h> -#include <linux/ip.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> - -bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple, -			   enum nf_nat_manip_type maniptype, -			   const union nf_conntrack_man_proto *min, -			   const union nf_conntrack_man_proto *max) -{ -	__be16 port; - -	if (maniptype == IP_NAT_MANIP_SRC) -		port = tuple->src.u.all; -	else -		port = tuple->dst.u.all; - -	return ntohs(port) >= ntohs(min->all) && -	       ntohs(port) <= ntohs(max->all); -} -EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); - -void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, -			       const struct nf_nat_range *range, -			       enum nf_nat_manip_type maniptype, -			       const struct nf_conn *ct, -			       u_int16_t *rover) -{ -	unsigned int range_size, min, i; -	__be16 *portptr; -	u_int16_t off; - -	if (maniptype == IP_NAT_MANIP_SRC) -		portptr = &tuple->src.u.all; -	else -		portptr = &tuple->dst.u.all; - -	/* If no range specified... */ -	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { -		/* If it's dst rewrite, can't change port */ -		if (maniptype == IP_NAT_MANIP_DST) -			return; - -		if (ntohs(*portptr) < 1024) { -			/* Loose convention: >> 512 is credential passing */ -			if (ntohs(*portptr) < 512) { -				min = 1; -				range_size = 511 - min + 1; -			} else { -				min = 600; -				range_size = 1023 - min + 1; -			} -		} else { -			min = 1024; -			range_size = 65535 - 1024 + 1; -		} -	} else { -		min = ntohs(range->min.all); -		range_size = ntohs(range->max.all) - min + 1; -	} - -	if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) -		off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip, -						 maniptype == IP_NAT_MANIP_SRC -						 ? 
tuple->dst.u.all -						 : tuple->src.u.all); -	else -		off = *rover; - -	for (i = 0; ; ++off) { -		*portptr = htons(min + off % range_size); -		if (++i != range_size && nf_nat_used_tuple(tuple, ct)) -			continue; -		if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) -			*rover = off; -		return; -	} -	return; -} -EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); - -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -int nf_nat_proto_range_to_nlattr(struct sk_buff *skb, -				 const struct nf_nat_range *range) -{ -	NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.all); -	NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.all); -	return 0; - -nla_put_failure: -	return -1; -} -EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range); - -int nf_nat_proto_nlattr_to_range(struct nlattr *tb[], -				 struct nf_nat_range *range) -{ -	if (tb[CTA_PROTONAT_PORT_MIN]) { -		range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); -		range->max.all = range->min.tcp.port; -		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; -	} -	if (tb[CTA_PROTONAT_PORT_MAX]) { -		range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); -		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; -	} -	return 0; -} -EXPORT_SYMBOL_GPL(nf_nat_proto_range_to_nlattr); -#endif diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c deleted file mode 100644 index 570faf2667b..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_dccp.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * DCCP NAT protocol helper - * - * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/dccp.h> - -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t dccp_port_rover; - -static void -dccp_unique_tuple(struct nf_conntrack_tuple *tuple, -		  const struct nf_nat_range *range, -		  enum nf_nat_manip_type maniptype, -		  const struct nf_conn *ct) -{ -	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, -				  &dccp_port_rover); -} - -static bool -dccp_manip_pkt(struct sk_buff *skb, -	       unsigned int iphdroff, -	       const struct nf_conntrack_tuple *tuple, -	       enum nf_nat_manip_type maniptype) -{ -	const struct iphdr *iph = (const void *)(skb->data + iphdroff); -	struct dccp_hdr *hdr; -	unsigned int hdroff = iphdroff + iph->ihl * 4; -	__be32 oldip, newip; -	__be16 *portptr, oldport, newport; -	int hdrsize = 8; /* DCCP connection tracking guarantees this much */ - -	if (skb->len >= hdroff + sizeof(struct dccp_hdr)) -		hdrsize = sizeof(struct dccp_hdr); - -	if (!skb_make_writable(skb, hdroff + hdrsize)) -		return false; - -	iph = (struct iphdr *)(skb->data + iphdroff); -	hdr = (struct dccp_hdr *)(skb->data + hdroff); - -	if (maniptype == IP_NAT_MANIP_SRC) { -		oldip = iph->saddr; -		newip = tuple->src.u3.ip; -		newport = tuple->src.u.dccp.port; -		portptr = &hdr->dccph_sport; -	} else { -		oldip = iph->daddr; -		newip = tuple->dst.u3.ip; -		newport = tuple->dst.u.dccp.port; -		portptr = &hdr->dccph_dport; -	} - -	oldport = *portptr; -	*portptr = newport; - -	if (hdrsize < sizeof(*hdr)) -		return true; - -	inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1); -	inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, -				 0); -	return true; -} - -static const struct nf_nat_protocol nf_nat_protocol_dccp = { -	.protonum		= IPPROTO_DCCP, -	.me			= THIS_MODULE, -	.manip_pkt		= dccp_manip_pkt, -	.in_range		= nf_nat_proto_in_range, -	.unique_tuple		= dccp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -	.range_to_nlattr	= nf_nat_proto_range_to_nlattr, -	.nlattr_to_range	= nf_nat_proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_dccp_init(void) -{ -	return nf_nat_protocol_register(&nf_nat_protocol_dccp); -} - -static void __exit nf_nat_proto_dccp_fini(void) -{ -	nf_nat_protocol_unregister(&nf_nat_protocol_dccp); -} - -module_init(nf_nat_proto_dccp_init); -module_exit(nf_nat_proto_dccp_fini); - -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("DCCP NAT protocol helper"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index bc8d83a31c7..690d890111b 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -21,6 +21,8 @@   *   * Development of this code funded by Astaro AG (http://www.astaro.com/)   * + * (C) 2006-2012 Patrick McHardy <kaber@trash.net> + *   */  #include <linux/module.h> @@ -28,8 +30,7 @@  #include <linux/ip.h>  #include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> +#include <net/netfilter/nf_nat_l4proto.h>  #include <linux/netfilter/nf_conntrack_proto_gre.h>  MODULE_LICENSE("GPL"); @@ -38,7 +39,8 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");  /* generate unique tuple ... 
*/  static void -gre_unique_tuple(struct nf_conntrack_tuple *tuple, +gre_unique_tuple(const struct nf_nat_l3proto *l3proto, +		 struct nf_conntrack_tuple *tuple,  		 const struct nf_nat_range *range,  		 enum nf_nat_manip_type maniptype,  		 const struct nf_conn *ct) @@ -52,18 +54,18 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,  	if (!ct->master)  		return; -	if (maniptype == IP_NAT_MANIP_SRC) +	if (maniptype == NF_NAT_MANIP_SRC)  		keyptr = &tuple->src.u.gre.key;  	else  		keyptr = &tuple->dst.u.gre.key; -	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { +	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {  		pr_debug("%p: NATing GRE PPTP\n", ct);  		min = 1;  		range_size = 0xffff;  	} else { -		min = ntohs(range->min.gre.key); -		range_size = ntohs(range->max.gre.key) - min + 1; +		min = ntohs(range->min_proto.gre.key); +		range_size = ntohs(range->max_proto.gre.key) - min + 1;  	}  	pr_debug("min = %u, range_size = %u\n", min, range_size); @@ -80,14 +82,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,  /* manipulate a GRE packet according to maniptype */  static bool -gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, +gre_manip_pkt(struct sk_buff *skb, +	      const struct nf_nat_l3proto *l3proto, +	      unsigned int iphdroff, unsigned int hdroff,  	      const struct nf_conntrack_tuple *tuple,  	      enum nf_nat_manip_type maniptype)  {  	const struct gre_hdr *greh;  	struct gre_hdr_pptp *pgreh; -	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); -	unsigned int hdroff = iphdroff + iph->ihl * 4;  	/* pgreh includes two optional 32bit fields which are not required  	 * to be there.  That's where the magic '8' comes from */ @@ -99,7 +101,7 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,  	/* we only have destination manip of a packet, since 'source key'  	 * is not present in the packet itself */ -	if (maniptype != IP_NAT_MANIP_DST) +	if (maniptype != NF_NAT_MANIP_DST)  		return true;  	switch (greh->version) {  	case GRE_VERSION_1701: @@ -117,26 +119,24 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,  	return true;  } -static const struct nf_nat_protocol gre = { -	.protonum		= IPPROTO_GRE, -	.me			= THIS_MODULE, +static const struct nf_nat_l4proto gre = { +	.l4proto		= IPPROTO_GRE,  	.manip_pkt		= gre_manip_pkt, -	.in_range		= nf_nat_proto_in_range, +	.in_range		= nf_nat_l4proto_in_range,  	.unique_tuple		= gre_unique_tuple,  #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -	.range_to_nlattr	= nf_nat_proto_range_to_nlattr, -	.nlattr_to_range	= nf_nat_proto_nlattr_to_range, +	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,  #endif  };  static int __init nf_nat_proto_gre_init(void)  { -	return nf_nat_protocol_register(&gre); +	return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);  }  static void __exit nf_nat_proto_gre_fini(void)  { -	nf_nat_protocol_unregister(&gre); +	nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);  }  module_init(nf_nat_proto_gre_init); diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 5744c3ec847..eb303471bcf 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -8,14 +8,14 @@  #include <linux/types.h>  #include <linux/init.h> +#include <linux/export.h>  #include <linux/ip.h>  #include <linux/icmp.h>  #include <linux/netfilter.h>  #include <net/netfilter/nf_nat.h>  #include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> -#include 
<net/netfilter/nf_nat_protocol.h> +#include <net/netfilter/nf_nat_l4proto.h>  static bool  icmp_in_range(const struct nf_conntrack_tuple *tuple, @@ -28,7 +28,8 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,  }  static void -icmp_unique_tuple(struct nf_conntrack_tuple *tuple, +icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, +		  struct nf_conntrack_tuple *tuple,  		  const struct nf_nat_range *range,  		  enum nf_nat_manip_type maniptype,  		  const struct nf_conn *ct) @@ -37,13 +38,14 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,  	unsigned int range_size;  	unsigned int i; -	range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1; +	range_size = ntohs(range->max_proto.icmp.id) - +		     ntohs(range->min_proto.icmp.id) + 1;  	/* If no range specified... */ -	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) +	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))  		range_size = 0xFFFF;  	for (i = 0; ; ++id) { -		tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + +		tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +  					     (id % range_size));  		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))  			return; @@ -53,13 +55,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,  static bool  icmp_manip_pkt(struct sk_buff *skb, -	       unsigned int iphdroff, +	       const struct nf_nat_l3proto *l3proto, +	       unsigned int iphdroff, unsigned int hdroff,  	       const struct nf_conntrack_tuple *tuple,  	       enum nf_nat_manip_type maniptype)  { -	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);  	struct icmphdr *hdr; -	unsigned int hdroff = iphdroff + iph->ihl*4;  	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))  		return false; @@ -71,14 +72,12 @@ icmp_manip_pkt(struct sk_buff *skb,  	return true;  } -const struct nf_nat_protocol nf_nat_protocol_icmp = { -	.protonum		= IPPROTO_ICMP, -	.me			= THIS_MODULE, +const struct nf_nat_l4proto nf_nat_l4proto_icmp = { +	.l4proto		= IPPROTO_ICMP,  	.manip_pkt		= icmp_manip_pkt,  	.in_range		= icmp_in_range,  	.unique_tuple		= icmp_unique_tuple,  #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -	.range_to_nlattr	= nf_nat_proto_range_to_nlattr, -	.nlattr_to_range	= nf_nat_proto_nlattr_to_range, +	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,  #endif  }; diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c deleted file mode 100644 index 756331d4266..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/sctp.h> -#include <net/sctp/checksum.h> - -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t nf_sctp_port_rover; - -static void -sctp_unique_tuple(struct nf_conntrack_tuple *tuple, -		  const struct nf_nat_range *range, -		  enum nf_nat_manip_type maniptype, -		  const struct nf_conn *ct) -{ -	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, -				  &nf_sctp_port_rover); -} - -static bool -sctp_manip_pkt(struct sk_buff *skb, -	       unsigned int iphdroff, -	       const struct nf_conntrack_tuple *tuple, -	       enum nf_nat_manip_type maniptype) -{ -	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); -	struct sk_buff *frag; -	sctp_sctphdr_t *hdr; -	unsigned int hdroff = iphdroff + iph->ihl*4; -	__be32 oldip, newip; -	__be32 crc32; - -	if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) -		return false; - -	iph = (struct iphdr *)(skb->data + iphdroff); -	hdr = (struct sctphdr *)(skb->data + hdroff); - -	if (maniptype == IP_NAT_MANIP_SRC) { -		/* Get rid of src ip and src pt */ -		oldip = iph->saddr; -		newip = tuple->src.u3.ip; -		hdr->source = tuple->src.u.sctp.port; -	} else { -		/* Get rid of dst ip and dst pt */ -		oldip = iph->daddr; -		newip = tuple->dst.u3.ip; -		hdr->dest = tuple->dst.u.sctp.port; -	} - -	crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff); -	skb_walk_frags(skb, frag) -		crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag), -					  crc32); -	crc32 = sctp_end_cksum(crc32); -	hdr->checksum = crc32; - -	return true; -} - -static const struct nf_nat_protocol nf_nat_protocol_sctp = { -	.protonum		= IPPROTO_SCTP, -	.me			= THIS_MODULE, -	.manip_pkt		= sctp_manip_pkt, -	.in_range		= nf_nat_proto_in_range, -	.unique_tuple		= sctp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -	.range_to_nlattr	= nf_nat_proto_range_to_nlattr, -	.nlattr_to_range	= nf_nat_proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_sctp_init(void) -{ -	return nf_nat_protocol_register(&nf_nat_protocol_sctp); -} - -static void __exit nf_nat_proto_sctp_exit(void) -{ -	nf_nat_protocol_unregister(&nf_nat_protocol_sctp); -} - -module_init(nf_nat_proto_sctp_init); -module_exit(nf_nat_proto_sctp_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SCTP NAT protocol helper"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c deleted file mode 100644 index aa460a595d5..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ /dev/null @@ -1,92 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/tcp.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> - -static u_int16_t tcp_port_rover; - -static void -tcp_unique_tuple(struct nf_conntrack_tuple *tuple, -		 const struct nf_nat_range *range, -		 enum nf_nat_manip_type maniptype, -		 const struct nf_conn *ct) -{ -	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover); -} - -static bool -tcp_manip_pkt(struct sk_buff *skb, -	      unsigned int iphdroff, -	      const struct nf_conntrack_tuple *tuple, -	      enum nf_nat_manip_type maniptype) -{ -	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); -	struct tcphdr *hdr; -	unsigned int hdroff = iphdroff + iph->ihl*4; -	__be32 oldip, newip; -	__be16 *portptr, newport, oldport; -	int hdrsize = 8; /* TCP connection tracking guarantees this much */ - -	/* this could be a inner header returned in icmp packet; in such -	   cases we cannot update the checksum field since it is outside of -	   the 8 bytes of transport layer headers we are guaranteed */ -	if (skb->len >= hdroff + sizeof(struct tcphdr)) -		hdrsize = sizeof(struct tcphdr); - -	if (!skb_make_writable(skb, hdroff + hdrsize)) -		return false; - -	iph = (struct iphdr *)(skb->data + iphdroff); -	hdr = (struct tcphdr *)(skb->data + hdroff); - -	if (maniptype == IP_NAT_MANIP_SRC) { -		/* Get rid of src ip and src pt */ -		oldip = iph->saddr; -		newip = tuple->src.u3.ip; -		newport = tuple->src.u.tcp.port; -		portptr = &hdr->source; -	} else { -		/* Get rid of dst ip and dst pt */ -		oldip = iph->daddr; -		newip = tuple->dst.u3.ip; -		newport = tuple->dst.u.tcp.port; -		portptr = &hdr->dest; -	} - -	oldport = *portptr; -	*portptr = newport; - -	if (hdrsize < sizeof(*hdr)) -		return true; - -	inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); -	inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); -	return true; -} - -const struct nf_nat_protocol nf_nat_protocol_tcp = { -	.protonum		= IPPROTO_TCP, -	.me			= THIS_MODULE, -	.manip_pkt		= tcp_manip_pkt, -	.in_range		= nf_nat_proto_in_range, -	.unique_tuple		= tcp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -	.range_to_nlattr	= nf_nat_proto_range_to_nlattr, -	.nlattr_to_range	= nf_nat_proto_nlattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c deleted file mode 100644 index dfe65c7e292..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ /dev/null @@ -1,83 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t udp_port_rover; - -static void -udp_unique_tuple(struct nf_conntrack_tuple *tuple, -		 const struct nf_nat_range *range, -		 enum nf_nat_manip_type maniptype, -		 const struct nf_conn *ct) -{ -	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover); -} - -static bool -udp_manip_pkt(struct sk_buff *skb, -	      unsigned int iphdroff, -	      const struct nf_conntrack_tuple *tuple, -	      enum nf_nat_manip_type maniptype) -{ -	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); -	struct udphdr *hdr; -	unsigned int hdroff = iphdroff + iph->ihl*4; -	__be32 oldip, newip; -	__be16 *portptr, newport; - -	if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) -		return false; - -	iph = (struct iphdr *)(skb->data + iphdroff); -	hdr = (struct udphdr *)(skb->data + hdroff); - -	if (maniptype == IP_NAT_MANIP_SRC) { -		/* Get rid of src ip and src pt */ -		oldip = iph->saddr; -		newip = tuple->src.u3.ip; -		newport = tuple->src.u.udp.port; -		portptr = &hdr->source; -	} else { -		/* Get rid of dst ip and dst pt */ -		oldip = iph->daddr; -		newip = tuple->dst.u3.ip; -		newport = tuple->dst.u.udp.port; -		portptr = &hdr->dest; -	} -	if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { -		inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); -		inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, -					 0); -		if (!hdr->check) -			hdr->check = CSUM_MANGLED_0; -	} -	*portptr = newport; -	return true; -} - -const struct nf_nat_protocol nf_nat_protocol_udp = { -	.protonum		= IPPROTO_UDP, -	.me			= THIS_MODULE, -	.manip_pkt		= udp_manip_pkt, -	.in_range		= nf_nat_proto_in_range, -	.unique_tuple		= udp_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -	.range_to_nlattr	= nf_nat_proto_range_to_nlattr, -	.nlattr_to_range	= nf_nat_proto_nlattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c deleted file mode 100644 index 3cc8c8af39e..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_udplite.c +++ /dev/null @@ -1,99 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * (C) 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/ip.h> -#include <linux/udp.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_protocol.h> - -static u_int16_t udplite_port_rover; - -static void -udplite_unique_tuple(struct nf_conntrack_tuple *tuple, -		     const struct nf_nat_range *range, -		     enum nf_nat_manip_type maniptype, -		     const struct nf_conn *ct) -{ -	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, -				  &udplite_port_rover); -} - -static bool -udplite_manip_pkt(struct sk_buff *skb, -		  unsigned int iphdroff, -		  const struct nf_conntrack_tuple *tuple, -		  enum nf_nat_manip_type maniptype) -{ -	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); -	struct udphdr *hdr; -	unsigned int hdroff = iphdroff + iph->ihl*4; -	__be32 oldip, newip; -	__be16 *portptr, newport; - -	if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) -		return false; - -	iph = (struct iphdr *)(skb->data + iphdroff); -	hdr = (struct udphdr *)(skb->data + hdroff); - -	if (maniptype == IP_NAT_MANIP_SRC) { -		/* Get rid of src ip and src pt */ -		oldip = iph->saddr; -		newip = tuple->src.u3.ip; -		newport = tuple->src.u.udp.port; -		portptr = &hdr->source; -	} else { -		/* Get rid of dst ip and dst pt */ -		oldip = iph->daddr; -		newip = tuple->dst.u3.ip; -		newport = tuple->dst.u.udp.port; -		portptr = &hdr->dest; -	} - -	inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); -	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); -	if (!hdr->check) -		hdr->check = CSUM_MANGLED_0; - -	*portptr = newport; -	return true; -} - -static const struct nf_nat_protocol nf_nat_protocol_udplite = { -	.protonum		= IPPROTO_UDPLITE, -	.me			= THIS_MODULE, -	.manip_pkt		= udplite_manip_pkt, -	.in_range		= nf_nat_proto_in_range, -	.unique_tuple		= udplite_unique_tuple, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) -	.range_to_nlattr	= nf_nat_proto_range_to_nlattr, -	.nlattr_to_range	= nf_nat_proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_udplite_init(void) -{ -	return nf_nat_protocol_register(&nf_nat_protocol_udplite); -} - -static void __exit nf_nat_proto_udplite_fini(void) -{ -	nf_nat_protocol_unregister(&nf_nat_protocol_udplite); -} - -module_init(nf_nat_proto_udplite_init); -module_exit(nf_nat_proto_udplite_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("UDP-Lite NAT protocol helper"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c deleted file mode 100644 index a50f2bc1c73..00000000000 --- a/net/ipv4/netfilter/nf_nat_proto_unknown.c +++ /dev/null @@ -1,53 +0,0 @@ -/* The "unknown" protocol.  This is what is used for protocols we - * don't understand.  It's returned by ip_ct_find_proto(). - */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/types.h> -#include <linux/init.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> - -static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, -			     enum nf_nat_manip_type manip_type, -			     const union nf_conntrack_man_proto *min, -			     const union nf_conntrack_man_proto *max) -{ -	return true; -} - -static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple, -				 const struct nf_nat_range *range, -				 enum nf_nat_manip_type maniptype, -				 const struct nf_conn *ct) -{ -	/* Sorry: we can't help you; if it's not unique, we can't frob -	   anything. */ -	return; -} - -static bool -unknown_manip_pkt(struct sk_buff *skb, -		  unsigned int iphdroff, -		  const struct nf_conntrack_tuple *tuple, -		  enum nf_nat_manip_type maniptype) -{ -	return true; -} - -const struct nf_nat_protocol nf_nat_unknown_protocol = { -	/* .me isn't set: getting a ref to this cannot fail. */ -	.manip_pkt		= unknown_manip_pkt, -	.in_range		= unknown_in_range, -	.unique_tuple		= unknown_unique_tuple, -}; diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c deleted file mode 100644 index 21c30426480..00000000000 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ /dev/null @@ -1,214 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* Everything about the rules for NAT. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/slab.h> -#include <net/checksum.h> -#include <net/route.h> -#include <linux/bitops.h> - -#include <linux/netfilter_ipv4/ip_tables.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_rule.h> - -#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ -			 (1 << NF_INET_POST_ROUTING) | \ -			 (1 << NF_INET_LOCAL_OUT) | \ -			 (1 << NF_INET_LOCAL_IN)) - -static const struct xt_table nat_table = { -	.name		= "nat", -	.valid_hooks	= NAT_VALID_HOOKS, -	.me		= THIS_MODULE, -	.af		= NFPROTO_IPV4, -}; - -/* Source NAT */ -static unsigned int -ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) -{ -	struct nf_conn *ct; -	enum ip_conntrack_info ctinfo; -	const struct nf_nat_multi_range_compat *mr = par->targinfo; - -	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING || -		     par->hooknum == NF_INET_LOCAL_IN); - -	ct = nf_ct_get(skb, &ctinfo); - -	/* Connection must be valid and new. 
*/ -	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || -			    ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); -	NF_CT_ASSERT(par->out != NULL); - -	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); -} - -static unsigned int -ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par) -{ -	struct nf_conn *ct; -	enum ip_conntrack_info ctinfo; -	const struct nf_nat_multi_range_compat *mr = par->targinfo; - -	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || -		     par->hooknum == NF_INET_LOCAL_OUT); - -	ct = nf_ct_get(skb, &ctinfo); - -	/* Connection must be valid and new. */ -	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - -	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); -} - -static int ipt_snat_checkentry(const struct xt_tgchk_param *par) -{ -	const struct nf_nat_multi_range_compat *mr = par->targinfo; - -	/* Must be a valid range */ -	if (mr->rangesize != 1) { -		pr_info("SNAT: multiple ranges no longer supported\n"); -		return -EINVAL; -	} -	return 0; -} - -static int ipt_dnat_checkentry(const struct xt_tgchk_param *par) -{ -	const struct nf_nat_multi_range_compat *mr = par->targinfo; - -	/* Must be a valid range */ -	if (mr->rangesize != 1) { -		pr_info("DNAT: multiple ranges no longer supported\n"); -		return -EINVAL; -	} -	return 0; -} - -static unsigned int -alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ -	/* Force range to this IP; let proto decide mapping for -	   per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). -	*/ -	struct nf_nat_range range; - -	range.flags = 0; -	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, -		 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? -		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : -		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - -	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -int nf_nat_rule_find(struct sk_buff *skb, -		     unsigned int hooknum, -		     const struct net_device *in, -		     const struct net_device *out, -		     struct nf_conn *ct) -{ -	struct net *net = nf_ct_net(ct); -	int ret; - -	ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); - -	if (ret == NF_ACCEPT) { -		if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) -			/* NUL mapping */ -			ret = alloc_null_binding(ct, hooknum); -	} -	return ret; -} - -static struct xt_target ipt_snat_reg __read_mostly = { -	.name		= "SNAT", -	.target		= ipt_snat_target, -	.targetsize	= sizeof(struct nf_nat_multi_range_compat), -	.table		= "nat", -	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), -	.checkentry	= ipt_snat_checkentry, -	.family		= AF_INET, -}; - -static struct xt_target ipt_dnat_reg __read_mostly = { -	.name		= "DNAT", -	.target		= ipt_dnat_target, -	.targetsize	= sizeof(struct nf_nat_multi_range_compat), -	.table		= "nat", -	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), -	.checkentry	= ipt_dnat_checkentry, -	.family		= AF_INET, -}; - -static int __net_init nf_nat_rule_net_init(struct net *net) -{ -	struct ipt_replace *repl; - -	repl = ipt_alloc_initial_table(&nat_table); -	if (repl == NULL) -		return -ENOMEM; -	net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl); -	kfree(repl); -	if (IS_ERR(net->ipv4.nat_table)) -		return PTR_ERR(net->ipv4.nat_table); -	return 0; -} - -static void __net_exit nf_nat_rule_net_exit(struct net *net) -{ -	ipt_unregister_table(net, net->ipv4.nat_table); -} - -static struct pernet_operations nf_nat_rule_net_ops = { -	.init = nf_nat_rule_net_init, -	.exit = 
nf_nat_rule_net_exit, -}; - -int __init nf_nat_rule_init(void) -{ -	int ret; - -	ret = register_pernet_subsys(&nf_nat_rule_net_ops); -	if (ret != 0) -		goto out; -	ret = xt_register_target(&ipt_snat_reg); -	if (ret != 0) -		goto unregister_table; - -	ret = xt_register_target(&ipt_dnat_reg); -	if (ret != 0) -		goto unregister_snat; - -	return ret; - - unregister_snat: -	xt_unregister_target(&ipt_snat_reg); - unregister_table: -	unregister_pernet_subsys(&nf_nat_rule_net_ops); - out: -	return ret; -} - -void nf_nat_rule_cleanup(void) -{ -	xt_unregister_target(&ipt_dnat_reg); -	xt_unregister_target(&ipt_snat_reg); -	unregister_pernet_subsys(&nf_nat_rule_net_ops); -} diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c deleted file mode 100644 index e40cf7816fd..00000000000 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ /dev/null @@ -1,561 +0,0 @@ -/* SIP extension for NAT alteration. - * - * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> - * based on RR's ip_nat_ftp.c and other modules. - * (C) 2007 United Security Providers - * (C) 2007, 2008 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/ip.h> -#include <linux/udp.h> -#include <linux/tcp.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_sip.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); -MODULE_DESCRIPTION("SIP NAT helper"); -MODULE_ALIAS("ip_nat_sip"); - - -static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff, -				  const char **dptr, unsigned int *datalen, -				  unsigned int matchoff, unsigned int matchlen, -				  const char *buffer, unsigned int buflen) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	struct tcphdr *th; -	unsigned int baseoff; - -	if (nf_ct_protonum(ct) == IPPROTO_TCP) { -		th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); -		baseoff = ip_hdrlen(skb) + th->doff * 4; -		matchoff += dataoff - baseoff; - -		if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, -						matchoff, matchlen, -						buffer, buflen, false)) -			return 0; -	} else { -		baseoff = ip_hdrlen(skb) + sizeof(struct udphdr); -		matchoff += dataoff - baseoff; - -		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, -					      matchoff, matchlen, -					      buffer, buflen)) -			return 0; -	} - -	/* Reload data pointer and adjust datalen value */ -	*dptr = skb->data + dataoff; -	*datalen += buflen - matchlen; -	return 1; -} - -static int map_addr(struct sk_buff *skb, unsigned int dataoff, -		    const char **dptr, unsigned int *datalen, -		    unsigned int matchoff, unsigned int matchlen, -		    union nf_inet_addr *addr, __be16 port) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); -	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; -	unsigned int buflen; -	__be32 newaddr; -	__be16 newport; - -	if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip && -	    ct->tuplehash[dir].tuple.src.u.udp.port == port) { -		newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip; -		newport = 
ct->tuplehash[!dir].tuple.dst.u.udp.port; -	} else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip && -		   ct->tuplehash[dir].tuple.dst.u.udp.port == port) { -		newaddr = ct->tuplehash[!dir].tuple.src.u3.ip; -		newport = ct->tuplehash[!dir].tuple.src.u.udp.port; -	} else -		return 1; - -	if (newaddr == addr->ip && newport == port) -		return 1; - -	buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport)); - -	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, -			     buffer, buflen); -} - -static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff, -			const char **dptr, unsigned int *datalen, -			enum sip_header_types type) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	unsigned int matchlen, matchoff; -	union nf_inet_addr addr; -	__be16 port; - -	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, -				    &matchoff, &matchlen, &addr, &port) <= 0) -		return 1; -	return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, -			&addr, port); -} - -static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, -			       const char **dptr, unsigned int *datalen) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); -	unsigned int coff, matchoff, matchlen; -	enum sip_header_types hdr; -	union nf_inet_addr addr; -	__be16 port; -	int request, in_header; - -	/* Basic rules: requests and responses. */ -	if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) { -		if (ct_sip_parse_request(ct, *dptr, *datalen, -					 &matchoff, &matchlen, -					 &addr, &port) > 0 && -		    !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, -			      &addr, port)) -			return NF_DROP; -		request = 1; -	} else -		request = 0; - -	if (nf_ct_protonum(ct) == IPPROTO_TCP) -		hdr = SIP_HDR_VIA_TCP; -	else -		hdr = SIP_HDR_VIA_UDP; - -	/* Translate topmost Via header and parameters */ -	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, -				    hdr, NULL, &matchoff, &matchlen, -				    &addr, &port) > 0) { -		unsigned int matchend, poff, plen, buflen, n; -		char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; - -		/* We're only interested in headers related to this -		 * connection */ -		if (request) { -			if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip || -			    port != ct->tuplehash[dir].tuple.src.u.udp.port) -				goto next; -		} else { -			if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip || -			    port != ct->tuplehash[dir].tuple.dst.u.udp.port) -				goto next; -		} - -		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, -			      &addr, port)) -			return NF_DROP; - -		matchend = matchoff + matchlen; - -		/* The maddr= parameter (RFC 2361) specifies where to send -		 * the reply. */ -		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, -					       "maddr=", &poff, &plen, -					       &addr) > 0 && -		    addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && -		    addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { -			buflen = sprintf(buffer, "%pI4", -					&ct->tuplehash[!dir].tuple.dst.u3.ip); -			if (!mangle_packet(skb, dataoff, dptr, datalen, -					   poff, plen, buffer, buflen)) -				return NF_DROP; -		} - -		/* The received= parameter (RFC 2361) contains the address -		 * from which the server received the request. 
*/ -		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, -					       "received=", &poff, &plen, -					       &addr) > 0 && -		    addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && -		    addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { -			buflen = sprintf(buffer, "%pI4", -					&ct->tuplehash[!dir].tuple.src.u3.ip); -			if (!mangle_packet(skb, dataoff, dptr, datalen, -					   poff, plen, buffer, buflen)) -				return NF_DROP; -		} - -		/* The rport= parameter (RFC 3581) contains the port number -		 * from which the server received the request. */ -		if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen, -						 "rport=", &poff, &plen, -						 &n) > 0 && -		    htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && -		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { -			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; -			buflen = sprintf(buffer, "%u", ntohs(p)); -			if (!mangle_packet(skb, dataoff, dptr, datalen, -					   poff, plen, buffer, buflen)) -				return NF_DROP; -		} -	} - -next: -	/* Translate Contact headers */ -	coff = 0; -	in_header = 0; -	while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, -				       SIP_HDR_CONTACT, &in_header, -				       &matchoff, &matchlen, -				       &addr, &port) > 0) { -		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, -			      &addr, port)) -			return NF_DROP; -	} - -	if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) || -	    !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO)) -		return NF_DROP; - -	return NF_ACCEPT; -} - -static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	const struct tcphdr *th; - -	if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0) -		return; - -	th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); -	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); -} - -/* Handles expected signalling connections and media streams */ -static void ip_nat_sip_expected(struct nf_conn *ct, -				struct nf_conntrack_expect *exp) -{ -	struct nf_nat_range range; - -	/* This must be a fresh one. */ -	BUG_ON(ct->status & IPS_NAT_DONE_MASK); - -	/* For DST manip, map port here to where it's expected. */ -	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); -	range.min = range.max = exp->saved_proto; -	range.min_ip = range.max_ip = exp->saved_ip; -	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); - -	/* Change src to where master sends to, but only if the connection -	 * actually came from the same source. 
*/ -	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == -	    ct->master->tuplehash[exp->dir].tuple.src.u3.ip) { -		range.flags = IP_NAT_RANGE_MAP_IPS; -		range.min_ip = range.max_ip -			= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; -		nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); -	} -} - -static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, -				      const char **dptr, unsigned int *datalen, -				      struct nf_conntrack_expect *exp, -				      unsigned int matchoff, -				      unsigned int matchlen) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); -	__be32 newip; -	u_int16_t port; -	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; -	unsigned buflen; - -	/* Connection will come from reply */ -	if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip) -		newip = exp->tuple.dst.u3.ip; -	else -		newip = ct->tuplehash[!dir].tuple.dst.u3.ip; - -	/* If the signalling port matches the connection's source port in the -	 * original direction, try to use the destination port in the opposite -	 * direction. */ -	if (exp->tuple.dst.u.udp.port == -	    ct->tuplehash[dir].tuple.src.u.udp.port) -		port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port); -	else -		port = ntohs(exp->tuple.dst.u.udp.port); - -	exp->saved_ip = exp->tuple.dst.u3.ip; -	exp->tuple.dst.u3.ip = newip; -	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; -	exp->dir = !dir; -	exp->expectfn = ip_nat_sip_expected; - -	for (; port != 0; port++) { -		int ret; - -		exp->tuple.dst.u.udp.port = htons(port); -		ret = nf_ct_expect_related(exp); -		if (ret == 0) -			break; -		else if (ret != -EBUSY) { -			port = 0; -			break; -		} -	} - -	if (port == 0) -		return NF_DROP; - -	if (exp->tuple.dst.u3.ip != exp->saved_ip || -	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { -		buflen = sprintf(buffer, "%pI4:%u", &newip, port); -		if (!mangle_packet(skb, dataoff, dptr, datalen, -				   matchoff, matchlen, buffer, buflen)) -			goto err; -	} -	return NF_ACCEPT; - -err: -	nf_ct_unexpect_related(exp); -	return NF_DROP; -} - -static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff, -			      const char **dptr, unsigned int *datalen) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	unsigned int matchoff, matchlen; -	char buffer[sizeof("65536")]; -	int buflen, c_len; - -	/* Get actual SDP length */ -	if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, -				  SDP_HDR_VERSION, SDP_HDR_UNSPEC, -				  &matchoff, &matchlen) <= 0) -		return 0; -	c_len = *datalen - matchoff + strlen("v="); - -	/* Now, update SDP length */ -	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH, -			      &matchoff, &matchlen) <= 0) -		return 0; - -	buflen = sprintf(buffer, "%u", c_len); -	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, -			     buffer, buflen); -} - -static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff, -			     const char **dptr, unsigned int *datalen, -			     unsigned int sdpoff, -			     enum sdp_header_types type, -			     enum sdp_header_types term, -			     char *buffer, int buflen) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	unsigned int matchlen, matchoff; - -	if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, -				  &matchoff, &matchlen) <= 0) -		return -ENOENT; -	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, 
matchlen, -			     buffer, buflen) ? 0 : -EINVAL; -} - -static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff, -				    const char **dptr, unsigned int *datalen, -				    unsigned int sdpoff, -				    enum sdp_header_types type, -				    enum sdp_header_types term, -				    const union nf_inet_addr *addr) -{ -	char buffer[sizeof("nnn.nnn.nnn.nnn")]; -	unsigned int buflen; - -	buflen = sprintf(buffer, "%pI4", &addr->ip); -	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term, -			      buffer, buflen)) -		return 0; - -	return mangle_content_len(skb, dataoff, dptr, datalen); -} - -static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff, -				    const char **dptr, unsigned int *datalen, -				    unsigned int matchoff, -				    unsigned int matchlen, -				    u_int16_t port) -{ -	char buffer[sizeof("nnnnn")]; -	unsigned int buflen; - -	buflen = sprintf(buffer, "%u", port); -	if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, -			   buffer, buflen)) -		return 0; - -	return mangle_content_len(skb, dataoff, dptr, datalen); -} - -static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff, -				       const char **dptr, unsigned int *datalen, -				       unsigned int sdpoff, -				       const union nf_inet_addr *addr) -{ -	char buffer[sizeof("nnn.nnn.nnn.nnn")]; -	unsigned int buflen; - -	/* Mangle session description owner and contact addresses */ -	buflen = sprintf(buffer, "%pI4", &addr->ip); -	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, -			       SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, -			       buffer, buflen)) -		return 0; - -	switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, -				  SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, -				  buffer, buflen)) { -	case 0: -	/* -	 * RFC 2327: -	 * -	 * Session description -	 * -	 * c=* (connection information - not required if included in all media) -	 */ -	case -ENOENT: -		break; -	default: -		return 0; -	} - -	return mangle_content_len(skb, dataoff, dptr, datalen); -} - -/* So, this packet has hit the connection tracking matching code. -   Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, -				     const char **dptr, unsigned int *datalen, -				     struct nf_conntrack_expect *rtp_exp, -				     struct nf_conntrack_expect *rtcp_exp, -				     unsigned int mediaoff, -				     unsigned int medialen, -				     union nf_inet_addr *rtp_addr) -{ -	enum ip_conntrack_info ctinfo; -	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); -	u_int16_t port; - -	/* Connection will come from reply */ -	if (ct->tuplehash[dir].tuple.src.u3.ip == -	    ct->tuplehash[!dir].tuple.dst.u3.ip) -		rtp_addr->ip = rtp_exp->tuple.dst.u3.ip; -	else -		rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip; - -	rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip; -	rtp_exp->tuple.dst.u3.ip = rtp_addr->ip; -	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port; -	rtp_exp->dir = !dir; -	rtp_exp->expectfn = ip_nat_sip_expected; - -	rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip; -	rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip; -	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port; -	rtcp_exp->dir = !dir; -	rtcp_exp->expectfn = ip_nat_sip_expected; - -	/* Try to get same pair of ports: if not, try to change them. 
*/ -	for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); -	     port != 0; port += 2) { -		int ret; - -		rtp_exp->tuple.dst.u.udp.port = htons(port); -		ret = nf_ct_expect_related(rtp_exp); -		if (ret == -EBUSY) -			continue; -		else if (ret < 0) { -			port = 0; -			break; -		} -		rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); -		ret = nf_ct_expect_related(rtcp_exp); -		if (ret == 0) -			break; -		else if (ret != -EBUSY) { -			nf_ct_unexpect_related(rtp_exp); -			port = 0; -			break; -		} -	} - -	if (port == 0) -		goto err1; - -	/* Update media port. */ -	if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && -	    !ip_nat_sdp_port(skb, dataoff, dptr, datalen, -			     mediaoff, medialen, port)) -		goto err2; - -	return NF_ACCEPT; - -err2: -	nf_ct_unexpect_related(rtp_exp); -	nf_ct_unexpect_related(rtcp_exp); -err1: -	return NF_DROP; -} - -static void __exit nf_nat_sip_fini(void) -{ -	rcu_assign_pointer(nf_nat_sip_hook, NULL); -	rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL); -	rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); -	rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); -	rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); -	rcu_assign_pointer(nf_nat_sdp_session_hook, NULL); -	rcu_assign_pointer(nf_nat_sdp_media_hook, NULL); -	synchronize_rcu(); -} - -static int __init nf_nat_sip_init(void) -{ -	BUG_ON(nf_nat_sip_hook != NULL); -	BUG_ON(nf_nat_sip_seq_adjust_hook != NULL); -	BUG_ON(nf_nat_sip_expect_hook != NULL); -	BUG_ON(nf_nat_sdp_addr_hook != NULL); -	BUG_ON(nf_nat_sdp_port_hook != NULL); -	BUG_ON(nf_nat_sdp_session_hook != NULL); -	BUG_ON(nf_nat_sdp_media_hook != NULL); -	rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); -	rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust); -	rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); -	rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); -	rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); -	rcu_assign_pointer(nf_nat_sdp_session_hook, ip_nat_sdp_session); -	rcu_assign_pointer(nf_nat_sdp_media_hook, ip_nat_sdp_media); -	return 0; -} - -module_init(nf_nat_sip_init); -module_exit(nf_nat_sip_fini); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419d0a5..7c676671329 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -34,10 +34,11 @@   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   * GNU General Public License for more details.   * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>.   
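
The removed ip_nat_sdp_media() above reserves the RTP/RTCP expectations as an even/odd port pair, retrying the next even port on -EBUSY and giving up on any other error. A minimal userspace sketch of that allocation walk, assuming an even starting port; reserve_port() and release_port() are hypothetical stand-ins for nf_ct_expect_related()/nf_ct_unexpect_related(), and the release-before-retry step is slightly simplified relative to the code above:

#include <errno.h>
#include <stdio.h>

/* Hypothetical reservation helpers: 0 on success, -EBUSY when the port is
 * already taken. Ports 10000-10003 are pretended to be in use. */
static int reserve_port(unsigned short port)
{
	return (port >= 10000 && port <= 10003) ? -EBUSY : 0;
}
static void release_port(unsigned short port) { (void)port; }

/* Walk even/odd pairs starting at an even 'base'; returns the RTP port or 0
 * on failure (the unsigned short wrap-around ends the loop, as above). */
static unsigned short alloc_rtp_rtcp_pair(unsigned short base)
{
	unsigned short port;

	for (port = base; port != 0; port += 2) {
		int ret = reserve_port(port);          /* RTP slot */
		if (ret == -EBUSY)
			continue;
		else if (ret < 0)
			return 0;
		ret = reserve_port(port + 1);          /* RTCP slot */
		if (ret == 0)
			return port;                   /* got an even/odd pair */
		release_port(port);
		if (ret != -EBUSY)
			return 0;
	}
	return 0;
}

int main(void)
{
	printf("RTP/RTCP pair starts at %u\n", alloc_rtp_rtcp_pair(10000)); /* 10004 */
	return 0;
}
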
*   * Author: James Morris <jmorris@intercode.com.au> + * + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>   */  #include <linux/module.h>  #include <linux/moduleparam.h> @@ -54,6 +55,7 @@  #include <net/netfilter/nf_conntrack_expect.h>  #include <net/netfilter/nf_conntrack_helper.h>  #include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h>  MODULE_LICENSE("GPL");  MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); @@ -399,15 +401,12 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,  	*len = 0;  	*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); -	if (*octets == NULL) { -		if (net_ratelimit()) -			pr_notice("OOM in bsalg (%d)\n", __LINE__); +	if (*octets == NULL)  		return 0; -	}  	ptr = *octets;  	while (ctx->pointer < eoc) { -		if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { +		if (!asn1_octet_decode(ctx, ptr++)) {  			kfree(*octets);  			*octets = NULL;  			return 0; @@ -450,11 +449,8 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,  		return 0;  	*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); -	if (*oid == NULL) { -		if (net_ratelimit()) -			pr_notice("OOM in bsalg (%d)\n", __LINE__); +	if (*oid == NULL)  		return 0; -	}  	optr = *oid; @@ -465,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,  	}  	if (subid < 40) { -		optr [0] = 0; -		optr [1] = subid; +		optr[0] = 0; +		optr[1] = subid;  	} else if (subid < 80) { -		optr [0] = 1; -		optr [1] = subid - 40; +		optr[0] = 1; +		optr[1] = subid - 40;  	} else { -		optr [0] = 2; -		optr [1] = subid - 80; +		optr[0] = 2; +		optr[1] = subid - 80;  	}  	*len = 2; @@ -718,117 +714,103 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,  	l = 0;  	switch (type) { -		case SNMP_INTEGER: -			len = sizeof(long); -			if (!asn1_long_decode(ctx, end, &l)) { -				kfree(id); -				return 0; -			} -			*obj = kmalloc(sizeof(struct snmp_object) + len, -				       GFP_ATOMIC); -			if (*obj == NULL) { -				kfree(id); -				if (net_ratelimit()) -					pr_notice("OOM in bsalg (%d)\n", __LINE__); -				return 0; -			} -			(*obj)->syntax.l[0] = l; -			break; -		case SNMP_OCTETSTR: -		case SNMP_OPAQUE: -			if (!asn1_octets_decode(ctx, end, &p, &len)) { -				kfree(id); -				return 0; -			} -			*obj = kmalloc(sizeof(struct snmp_object) + len, -				       GFP_ATOMIC); -			if (*obj == NULL) { -				kfree(p); -				kfree(id); -				if (net_ratelimit()) -					pr_notice("OOM in bsalg (%d)\n", __LINE__); -				return 0; -			} -			memcpy((*obj)->syntax.c, p, len); +	case SNMP_INTEGER: +		len = sizeof(long); +		if (!asn1_long_decode(ctx, end, &l)) { +			kfree(id); +			return 0; +		} +		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); +		if (*obj == NULL) { +			kfree(id); +			return 0; +		} +		(*obj)->syntax.l[0] = l; +		break; +	case SNMP_OCTETSTR: +	case SNMP_OPAQUE: +		if (!asn1_octets_decode(ctx, end, &p, &len)) { +			kfree(id); +			return 0; +		} +		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); +		if (*obj == NULL) {  			kfree(p); -			break; -		case SNMP_NULL: -		case SNMP_NOSUCHOBJECT: -		case SNMP_NOSUCHINSTANCE: -		case SNMP_ENDOFMIBVIEW: -			len = 0; -			*obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); -			if (*obj == NULL) { -				kfree(id); -				if (net_ratelimit()) -					pr_notice("OOM in bsalg (%d)\n", __LINE__); -				return 0; -			} -			if (!asn1_null_decode(ctx, end)) { -				kfree(id); -				kfree(*obj); -				*obj = NULL; -				return 0; -			} -			break; -		case SNMP_OBJECTID: -			if 
(!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { -				kfree(id); -				return 0; -			} -			len *= sizeof(unsigned long); -			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); -			if (*obj == NULL) { -				kfree(lp); -				kfree(id); -				if (net_ratelimit()) -					pr_notice("OOM in bsalg (%d)\n", __LINE__); -				return 0; -			} -			memcpy((*obj)->syntax.ul, lp, len); +			kfree(id); +			return 0; +		} +		memcpy((*obj)->syntax.c, p, len); +		kfree(p); +		break; +	case SNMP_NULL: +	case SNMP_NOSUCHOBJECT: +	case SNMP_NOSUCHINSTANCE: +	case SNMP_ENDOFMIBVIEW: +		len = 0; +		*obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); +		if (*obj == NULL) { +			kfree(id); +			return 0; +		} +		if (!asn1_null_decode(ctx, end)) { +			kfree(id); +			kfree(*obj); +			*obj = NULL; +			return 0; +		} +		break; +	case SNMP_OBJECTID: +		if (!asn1_oid_decode(ctx, end, &lp, &len)) { +			kfree(id); +			return 0; +		} +		len *= sizeof(unsigned long); +		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); +		if (*obj == NULL) {  			kfree(lp); -			break; -		case SNMP_IPADDR: -			if (!asn1_octets_decode(ctx, end, &p, &len)) { -				kfree(id); -				return 0; -			} -			if (len != 4) { -				kfree(p); -				kfree(id); -				return 0; -			} -			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); -			if (*obj == NULL) { -				kfree(p); -				kfree(id); -				if (net_ratelimit()) -					pr_notice("OOM in bsalg (%d)\n", __LINE__); -				return 0; -			} -			memcpy((*obj)->syntax.uc, p, len); +			kfree(id); +			return 0; +		} +		memcpy((*obj)->syntax.ul, lp, len); +		kfree(lp); +		break; +	case SNMP_IPADDR: +		if (!asn1_octets_decode(ctx, end, &p, &len)) { +			kfree(id); +			return 0; +		} +		if (len != 4) {  			kfree(p); -			break; -		case SNMP_COUNTER: -		case SNMP_GAUGE: -		case SNMP_TIMETICKS: -			len = sizeof(unsigned long); -			if (!asn1_ulong_decode(ctx, end, &ul)) { -				kfree(id); -				return 0; -			} -			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); -			if (*obj == NULL) { -				kfree(id); -				if (net_ratelimit()) -					pr_notice("OOM in bsalg (%d)\n", __LINE__); -				return 0; -			} -			(*obj)->syntax.ul[0] = ul; -			break; -		default:  			kfree(id);  			return 0; +		} +		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); +		if (*obj == NULL) { +			kfree(p); +			kfree(id); +			return 0; +		} +		memcpy((*obj)->syntax.uc, p, len); +		kfree(p); +		break; +	case SNMP_COUNTER: +	case SNMP_GAUGE: +	case SNMP_TIMETICKS: +		len = sizeof(unsigned long); +		if (!asn1_ulong_decode(ctx, end, &ul)) { +			kfree(id); +			return 0; +		} +		*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); +		if (*obj == NULL) { +			kfree(id); +			return 0; +		} +		(*obj)->syntax.ul[0] = ul; +		break; +	default: +		kfree(id); +		return 0;  	}  	(*obj)->syntax_len = len; @@ -1216,8 +1198,8 @@ static int snmp_translate(struct nf_conn *ct,  		map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);  	} else {  		/* DNAT replies */ -		map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); -		map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); +		map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); +		map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip);  	}  	if (map.from == map.to) @@ -1225,8 +1207,7 @@ static int snmp_translate(struct nf_conn *ct,  	if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),  			       paylen, &map, &udph->check)) { -		if (net_ratelimit()) -			printk(KERN_WARNING "bsalg: parser failed\n"); +		net_warn_ratelimited("bsalg: parser 
failed\n");  		return NF_DROP;  	}  	return NF_ACCEPT; @@ -1260,9 +1241,8 @@ static int help(struct sk_buff *skb, unsigned int protoff,  	 * can mess around with the payload.  	 */  	if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { -		 if (net_ratelimit()) -			 printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", -				&iph->saddr, &iph->daddr); +		net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", +				     &iph->saddr, &iph->daddr);  		 return NF_DROP;  	} @@ -1310,9 +1290,9 @@ static int __init nf_nat_snmp_basic_init(void)  {  	int ret = 0; -	ret = nf_conntrack_helper_register(&snmp_helper); -	if (ret < 0) -		return ret; +	BUG_ON(nf_nat_snmp_hook != NULL); +	RCU_INIT_POINTER(nf_nat_snmp_hook, help); +  	ret = nf_conntrack_helper_register(&snmp_trap_helper);  	if (ret < 0) {  		nf_conntrack_helper_unregister(&snmp_helper); @@ -1323,7 +1303,7 @@ static int __init nf_nat_snmp_basic_init(void)  static void __exit nf_nat_snmp_basic_fini(void)  { -	nf_conntrack_helper_unregister(&snmp_helper); +	RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);  	nf_conntrack_helper_unregister(&snmp_trap_helper);  } diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c deleted file mode 100644 index 95481fee8bd..00000000000 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ /dev/null @@ -1,325 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/types.h> -#include <linux/icmp.h> -#include <linux/gfp.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <linux/spinlock.h> - -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_extend.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_nat_protocol.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_helper.h> -#include <linux/netfilter_ipv4/ip_tables.h> - -#ifdef CONFIG_XFRM -static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) -{ -	const struct nf_conn *ct; -	const struct nf_conntrack_tuple *t; -	enum ip_conntrack_info ctinfo; -	enum ip_conntrack_dir dir; -	unsigned long statusbit; - -	ct = nf_ct_get(skb, &ctinfo); -	if (ct == NULL) -		return; -	dir = CTINFO2DIR(ctinfo); -	t = &ct->tuplehash[dir].tuple; - -	if (dir == IP_CT_DIR_ORIGINAL) -		statusbit = IPS_DST_NAT; -	else -		statusbit = IPS_SRC_NAT; - -	if (ct->status & statusbit) { -		fl->fl4_dst = t->dst.u3.ip; -		if (t->dst.protonum == IPPROTO_TCP || -		    t->dst.protonum == IPPROTO_UDP || -		    t->dst.protonum == IPPROTO_UDPLITE || -		    t->dst.protonum == IPPROTO_DCCP || -		    t->dst.protonum == IPPROTO_SCTP) -			fl->fl_ip_dport = t->dst.u.tcp.port; -	} - -	statusbit ^= IPS_NAT_MASK; - -	if (ct->status & statusbit) { -		fl->fl4_src = t->src.u3.ip; -		if (t->dst.protonum == IPPROTO_TCP || -		    t->dst.protonum == IPPROTO_UDP || -		    t->dst.protonum == IPPROTO_UDPLITE || -		    t->dst.protonum == IPPROTO_DCCP || -		    t->dst.protonum == IPPROTO_SCTP) -			fl->fl_ip_sport = t->src.u.tcp.port; -	} -} -#endif - -static 
unsigned int -nf_nat_fn(unsigned int hooknum, -	  struct sk_buff *skb, -	  const struct net_device *in, -	  const struct net_device *out, -	  int (*okfn)(struct sk_buff *)) -{ -	struct nf_conn *ct; -	enum ip_conntrack_info ctinfo; -	struct nf_conn_nat *nat; -	/* maniptype == SRC for postrouting. */ -	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); - -	/* We never see fragments: conntrack defrags on pre-routing -	   and local-out, and nf_nat_out protects post-routing. */ -	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); - -	ct = nf_ct_get(skb, &ctinfo); -	/* Can't track?  It's not due to stress, or conntrack would -	   have dropped it.  Hence it's the user's responsibilty to -	   packet filter it out, or implement conntrack/NAT for that -	   protocol. 8) --RR */ -	if (!ct) -		return NF_ACCEPT; - -	/* Don't try to NAT if this packet is not conntracked */ -	if (nf_ct_is_untracked(ct)) -		return NF_ACCEPT; - -	nat = nfct_nat(ct); -	if (!nat) { -		/* NAT module was loaded late. */ -		if (nf_ct_is_confirmed(ct)) -			return NF_ACCEPT; -		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); -		if (nat == NULL) { -			pr_debug("failed to add NAT extension\n"); -			return NF_ACCEPT; -		} -	} - -	switch (ctinfo) { -	case IP_CT_RELATED: -	case IP_CT_RELATED+IP_CT_IS_REPLY: -		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { -			if (!nf_nat_icmp_reply_translation(ct, ctinfo, -							   hooknum, skb)) -				return NF_DROP; -			else -				return NF_ACCEPT; -		} -		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ -	case IP_CT_NEW: - -		/* Seen it before?  This can happen for loopback, retrans, -		   or local packets.. */ -		if (!nf_nat_initialized(ct, maniptype)) { -			unsigned int ret; - -			ret = nf_nat_rule_find(skb, hooknum, in, out, ct); -			if (ret != NF_ACCEPT) -				return ret; -		} else -			pr_debug("Already setup manip %s for ct %p\n", -				 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", -				 ct); -		break; - -	default: -		/* ESTABLISHED */ -		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || -			     ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); -	} - -	return nf_nat_packet(ct, ctinfo, hooknum, skb); -} - -static unsigned int -nf_nat_in(unsigned int hooknum, -	  struct sk_buff *skb, -	  const struct net_device *in, -	  const struct net_device *out, -	  int (*okfn)(struct sk_buff *)) -{ -	unsigned int ret; -	__be32 daddr = ip_hdr(skb)->daddr; - -	ret = nf_nat_fn(hooknum, skb, in, out, okfn); -	if (ret != NF_DROP && ret != NF_STOLEN && -	    daddr != ip_hdr(skb)->daddr) -		skb_dst_drop(skb); - -	return ret; -} - -static unsigned int -nf_nat_out(unsigned int hooknum, -	   struct sk_buff *skb, -	   const struct net_device *in, -	   const struct net_device *out, -	   int (*okfn)(struct sk_buff *)) -{ -#ifdef CONFIG_XFRM -	const struct nf_conn *ct; -	enum ip_conntrack_info ctinfo; -#endif -	unsigned int ret; - -	/* root is playing with raw sockets. */ -	if (skb->len < sizeof(struct iphdr) || -	    ip_hdrlen(skb) < sizeof(struct iphdr)) -		return NF_ACCEPT; - -	ret = nf_nat_fn(hooknum, skb, in, out, okfn); -#ifdef CONFIG_XFRM -	if (ret != NF_DROP && ret != NF_STOLEN && -	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { -		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - -		if ((ct->tuplehash[dir].tuple.src.u3.ip != -		     ct->tuplehash[!dir].tuple.dst.u3.ip) || -		    (ct->tuplehash[dir].tuple.src.u.all != -		     ct->tuplehash[!dir].tuple.dst.u.all) -		   ) -			return ip_xfrm_me_harder(skb) == 0 ? 
ret : NF_DROP; -	} -#endif -	return ret; -} - -static unsigned int -nf_nat_local_fn(unsigned int hooknum, -		struct sk_buff *skb, -		const struct net_device *in, -		const struct net_device *out, -		int (*okfn)(struct sk_buff *)) -{ -	const struct nf_conn *ct; -	enum ip_conntrack_info ctinfo; -	unsigned int ret; - -	/* root is playing with raw sockets. */ -	if (skb->len < sizeof(struct iphdr) || -	    ip_hdrlen(skb) < sizeof(struct iphdr)) -		return NF_ACCEPT; - -	ret = nf_nat_fn(hooknum, skb, in, out, okfn); -	if (ret != NF_DROP && ret != NF_STOLEN && -	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { -		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - -		if (ct->tuplehash[dir].tuple.dst.u3.ip != -		    ct->tuplehash[!dir].tuple.src.u3.ip) { -			if (ip_route_me_harder(skb, RTN_UNSPEC)) -				ret = NF_DROP; -		} -#ifdef CONFIG_XFRM -		else if (ct->tuplehash[dir].tuple.dst.u.all != -			 ct->tuplehash[!dir].tuple.src.u.all) -			if (ip_xfrm_me_harder(skb)) -				ret = NF_DROP; -#endif -	} -	return ret; -} - -/* We must be after connection tracking and before packet filtering. */ - -static struct nf_hook_ops nf_nat_ops[] __read_mostly = { -	/* Before packet filtering, change destination */ -	{ -		.hook		= nf_nat_in, -		.owner		= THIS_MODULE, -		.pf		= NFPROTO_IPV4, -		.hooknum	= NF_INET_PRE_ROUTING, -		.priority	= NF_IP_PRI_NAT_DST, -	}, -	/* After packet filtering, change source */ -	{ -		.hook		= nf_nat_out, -		.owner		= THIS_MODULE, -		.pf		= NFPROTO_IPV4, -		.hooknum	= NF_INET_POST_ROUTING, -		.priority	= NF_IP_PRI_NAT_SRC, -	}, -	/* Before packet filtering, change destination */ -	{ -		.hook		= nf_nat_local_fn, -		.owner		= THIS_MODULE, -		.pf		= NFPROTO_IPV4, -		.hooknum	= NF_INET_LOCAL_OUT, -		.priority	= NF_IP_PRI_NAT_DST, -	}, -	/* After packet filtering, change source */ -	{ -		.hook		= nf_nat_fn, -		.owner		= THIS_MODULE, -		.pf		= NFPROTO_IPV4, -		.hooknum	= NF_INET_LOCAL_IN, -		.priority	= NF_IP_PRI_NAT_SRC, -	}, -}; - -static int __init nf_nat_standalone_init(void) -{ -	int ret = 0; - -	need_ipv4_conntrack(); - -#ifdef CONFIG_XFRM -	BUG_ON(ip_nat_decode_session != NULL); -	rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); -#endif -	ret = nf_nat_rule_init(); -	if (ret < 0) { -		pr_err("nf_nat_init: can't setup rules.\n"); -		goto cleanup_decode_session; -	} -	ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); -	if (ret < 0) { -		pr_err("nf_nat_init: can't register hooks.\n"); -		goto cleanup_rule_init; -	} -	return ret; - - cleanup_rule_init: -	nf_nat_rule_cleanup(); - cleanup_decode_session: -#ifdef CONFIG_XFRM -	rcu_assign_pointer(ip_nat_decode_session, NULL); -	synchronize_net(); -#endif -	return ret; -} - -static void __exit nf_nat_standalone_fini(void) -{ -	nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); -	nf_nat_rule_cleanup(); -#ifdef CONFIG_XFRM -	rcu_assign_pointer(ip_nat_decode_session, NULL); -	synchronize_net(); -#endif -	/* Conntrack caches are unregistered in nf_conntrack_cleanup */ -} - -module_init(nf_nat_standalone_init); -module_exit(nf_nat_standalone_fini); - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat"); diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c deleted file mode 100644 index 7274a43c7a1..00000000000 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ /dev/null @@ -1,51 +0,0 @@ -/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free 
Software Foundation. - */ - -#include <linux/module.h> -#include <linux/udp.h> - -#include <net/netfilter/nf_nat_helper.h> -#include <net/netfilter/nf_nat_rule.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <linux/netfilter/nf_conntrack_tftp.h> - -MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); -MODULE_DESCRIPTION("TFTP NAT helper"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_tftp"); - -static unsigned int help(struct sk_buff *skb, -			 enum ip_conntrack_info ctinfo, -			 struct nf_conntrack_expect *exp) -{ -	const struct nf_conn *ct = exp->master; - -	exp->saved_proto.udp.port -		= ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; -	exp->dir = IP_CT_DIR_REPLY; -	exp->expectfn = nf_nat_follow_master; -	if (nf_ct_expect_related(exp) != 0) -		return NF_DROP; -	return NF_ACCEPT; -} - -static void __exit nf_nat_tftp_fini(void) -{ -	rcu_assign_pointer(nf_nat_tftp_hook, NULL); -	synchronize_rcu(); -} - -static int __init nf_nat_tftp_init(void) -{ -	BUG_ON(nf_nat_tftp_hook != NULL); -	rcu_assign_pointer(nf_nat_tftp_hook, help); -	return 0; -} - -module_init(nf_nat_tftp_init); -module_exit(nf_nat_tftp_fini); diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c new file mode 100644 index 00000000000..19412a4063f --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netfilter_arp.h> +#include <net/netfilter/nf_tables.h> + +static unsigned int +nft_do_chain_arp(const struct nf_hook_ops *ops, +		  struct sk_buff *skb, +		  const struct net_device *in, +		  const struct net_device *out, +		  int (*okfn)(struct sk_buff *)) +{ +	struct nft_pktinfo pkt; + +	nft_set_pktinfo(&pkt, ops, skb, in, out); + +	return nft_do_chain(&pkt, ops); +} + +static struct nft_af_info nft_af_arp __read_mostly = { +	.family		= NFPROTO_ARP, +	.nhooks		= NF_ARP_NUMHOOKS, +	.owner		= THIS_MODULE, +	.nops		= 1, +	.hooks		= { +		[NF_ARP_IN]		= nft_do_chain_arp, +		[NF_ARP_OUT]		= nft_do_chain_arp, +		[NF_ARP_FORWARD]	= nft_do_chain_arp, +	}, +}; + +static int nf_tables_arp_init_net(struct net *net) +{ +	net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); +	if (net->nft.arp== NULL) +		return -ENOMEM; + +	memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); + +	if (nft_register_afinfo(net, net->nft.arp) < 0) +		goto err; + +	return 0; +err: +	kfree(net->nft.arp); +	return -ENOMEM; +} + +static void nf_tables_arp_exit_net(struct net *net) +{ +	nft_unregister_afinfo(net->nft.arp); +	kfree(net->nft.arp); +} + +static struct pernet_operations nf_tables_arp_net_ops = { +	.init   = nf_tables_arp_init_net, +	.exit   = nf_tables_arp_exit_net, +}; + +static const struct nf_chain_type filter_arp = { +	.name		= "filter", +	.type		= NFT_CHAIN_T_DEFAULT, +	.family		= NFPROTO_ARP, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_ARP_IN) | +			  (1 << NF_ARP_OUT) | +			  (1 << NF_ARP_FORWARD), +}; + +static int __init nf_tables_arp_init(void) +{ +	int ret; + +	nft_register_chain_type(&filter_arp); +	ret = 
register_pernet_subsys(&nf_tables_arp_net_ops); +	if (ret < 0) +		nft_unregister_chain_type(&filter_arp); + +	return ret; +} + +static void __exit nf_tables_arp_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_arp_net_ops); +	nft_unregister_chain_type(&filter_arp); +} + +module_init(nf_tables_arp_init); +module_exit(nf_tables_arp_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c new file mode 100644 index 00000000000..6820c8c4084 --- /dev/null +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_tables.h> +#include <net/net_namespace.h> +#include <net/ip.h> +#include <net/netfilter/nf_tables_ipv4.h> + +static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, +				      struct sk_buff *skb, +				      const struct net_device *in, +				      const struct net_device *out, +				      int (*okfn)(struct sk_buff *)) +{ +	struct nft_pktinfo pkt; + +	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + +	return nft_do_chain(&pkt, ops); +} + +static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, +				    struct sk_buff *skb, +				    const struct net_device *in, +				    const struct net_device *out, +				    int (*okfn)(struct sk_buff *)) +{ +	if (unlikely(skb->len < sizeof(struct iphdr) || +		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { +		if (net_ratelimit()) +			pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " +				"packet\n"); +		return NF_ACCEPT; +	} + +	return nft_do_chain_ipv4(ops, skb, in, out, okfn); +} + +struct nft_af_info nft_af_ipv4 __read_mostly = { +	.family		= NFPROTO_IPV4, +	.nhooks		= NF_INET_NUMHOOKS, +	.owner		= THIS_MODULE, +	.nops		= 1, +	.hooks		= { +		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4, +		[NF_INET_LOCAL_OUT]	= nft_ipv4_output, +		[NF_INET_FORWARD]	= nft_do_chain_ipv4, +		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4, +		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4, +	}, +}; +EXPORT_SYMBOL_GPL(nft_af_ipv4); + +static int nf_tables_ipv4_init_net(struct net *net) +{ +	net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); +	if (net->nft.ipv4 == NULL) +		return -ENOMEM; + +	memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); + +	if (nft_register_afinfo(net, net->nft.ipv4) < 0) +		goto err; + +	return 0; +err: +	kfree(net->nft.ipv4); +	return -ENOMEM; +} + +static void nf_tables_ipv4_exit_net(struct net *net) +{ +	nft_unregister_afinfo(net->nft.ipv4); +	kfree(net->nft.ipv4); +} + +static struct pernet_operations nf_tables_ipv4_net_ops = { +	.init	= nf_tables_ipv4_init_net, +	.exit	= nf_tables_ipv4_exit_net, +}; + +static const struct nf_chain_type filter_ipv4 = { +	.name		= "filter", +	.type		= NFT_CHAIN_T_DEFAULT, +	.family		= NFPROTO_IPV4, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_LOCAL_IN) | +			  (1 << NF_INET_LOCAL_OUT) | +			  (1 << NF_INET_FORWARD) | +			  (1 << NF_INET_PRE_ROUTING) | +			  (1 << 
NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_ipv4_init(void) +{ +	int ret; + +	nft_register_chain_type(&filter_ipv4); +	ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); +	if (ret < 0) +		nft_unregister_chain_type(&filter_ipv4); + +	return ret; +} + +static void __exit nf_tables_ipv4_exit(void) +{ +	unregister_pernet_subsys(&nf_tables_ipv4_net_ops); +	nft_unregister_chain_type(&filter_ipv4); +} + +module_init(nf_tables_ipv4_init); +module_exit(nf_tables_ipv4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(AF_INET); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c new file mode 100644 index 00000000000..3964157d826 --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ip.h> + +/* + * NAT chains + */ + +static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, +			      struct sk_buff *skb, +			      const struct net_device *in, +			      const struct net_device *out, +			      int (*okfn)(struct sk_buff *)) +{ +	enum ip_conntrack_info ctinfo; +	struct nf_conn *ct = nf_ct_get(skb, &ctinfo); +	struct nf_conn_nat *nat; +	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); +	struct nft_pktinfo pkt; +	unsigned int ret; + +	if (ct == NULL || nf_ct_is_untracked(ct)) +		return NF_ACCEPT; + +	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET))); + +	nat = nf_ct_nat_ext_add(ct); +	if (nat == NULL) +		return NF_ACCEPT; + +	switch (ctinfo) { +	case IP_CT_RELATED: +	case IP_CT_RELATED + IP_CT_IS_REPLY: +		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { +			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, +							   ops->hooknum)) +				return NF_DROP; +			else +				return NF_ACCEPT; +		} +		/* Fall through */ +	case IP_CT_NEW: +		if (nf_nat_initialized(ct, maniptype)) +			break; + +		nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + +		ret = nft_do_chain(&pkt, ops); +		if (ret != NF_ACCEPT) +			return ret; +		if (!nf_nat_initialized(ct, maniptype)) { +			ret = nf_nat_alloc_null_binding(ct, ops->hooknum); +			if (ret != NF_ACCEPT) +				return ret; +		} +	default: +		break; +	} + +	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); +} + +static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops, +				      struct sk_buff *skb, +				      const struct net_device *in, +				      const struct net_device *out, +				      int (*okfn)(struct sk_buff *)) +{ +	__be32 daddr = ip_hdr(skb)->daddr; +	unsigned int ret; + +	ret = nf_nat_fn(ops, skb, in, out, okfn); +	if (ret != NF_DROP && ret != NF_STOLEN && +	    
ip_hdr(skb)->daddr != daddr) { +		skb_dst_drop(skb); +	} +	return ret; +} + +static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops, +				       struct sk_buff *skb, +				       const struct net_device *in, +				       const struct net_device *out, +				       int (*okfn)(struct sk_buff *)) +{ +	enum ip_conntrack_info ctinfo __maybe_unused; +	const struct nf_conn *ct __maybe_unused; +	unsigned int ret; + +	ret = nf_nat_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM +	if (ret != NF_DROP && ret != NF_STOLEN && +	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { +		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + +		if (ct->tuplehash[dir].tuple.src.u3.ip != +		    ct->tuplehash[!dir].tuple.dst.u3.ip || +		    ct->tuplehash[dir].tuple.src.u.all != +		    ct->tuplehash[!dir].tuple.dst.u.all) +			return nf_xfrm_me_harder(skb, AF_INET) == 0 ? +								ret : NF_DROP; +	} +#endif +	return ret; +} + +static unsigned int nf_nat_output(const struct nf_hook_ops *ops, +				  struct sk_buff *skb, +				  const struct net_device *in, +				  const struct net_device *out, +				  int (*okfn)(struct sk_buff *)) +{ +	enum ip_conntrack_info ctinfo; +	const struct nf_conn *ct; +	unsigned int ret; + +	ret = nf_nat_fn(ops, skb, in, out, okfn); +	if (ret != NF_DROP && ret != NF_STOLEN && +	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) { +		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + +		if (ct->tuplehash[dir].tuple.dst.u3.ip != +		    ct->tuplehash[!dir].tuple.src.u3.ip) { +			if (ip_route_me_harder(skb, RTN_UNSPEC)) +				ret = NF_DROP; +		} +#ifdef CONFIG_XFRM +		else if (ct->tuplehash[dir].tuple.dst.u.all != +			 ct->tuplehash[!dir].tuple.src.u.all) +			if (nf_xfrm_me_harder(skb, AF_INET)) +				ret = NF_DROP; +#endif +	} +	return ret; +} + +static const struct nf_chain_type nft_chain_nat_ipv4 = { +	.name		= "nat", +	.type		= NFT_CHAIN_T_NAT, +	.family		= NFPROTO_IPV4, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_PRE_ROUTING) | +			  (1 << NF_INET_POST_ROUTING) | +			  (1 << NF_INET_LOCAL_OUT) | +			  (1 << NF_INET_LOCAL_IN), +	.hooks		= { +		[NF_INET_PRE_ROUTING]	= nf_nat_prerouting, +		[NF_INET_POST_ROUTING]	= nf_nat_postrouting, +		[NF_INET_LOCAL_OUT]	= nf_nat_output, +		[NF_INET_LOCAL_IN]	= nf_nat_fn, +	}, +}; + +static int __init nft_chain_nat_init(void) +{ +	int err; + +	err = nft_register_chain_type(&nft_chain_nat_ipv4); +	if (err < 0) +		return err; + +	return 0; +} + +static void __exit nft_chain_nat_exit(void) +{ +	nft_unregister_chain_type(&nft_chain_nat_ipv4); +} + +module_init(nft_chain_nat_init); +module_exit(nft_chain_nat_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c new file mode 100644 index 00000000000..125b66766c0 --- /dev/null +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
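
nf_nat_postrouting() above decides whether nf_xfrm_me_harder() is needed by checking whether the reply tuple's destination still mirrors the original tuple's source; any mismatch means SNAT rewrote the address or the port. A reduced sketch of that comparison, with toy structs standing in for struct nf_conntrack_tuple:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins carrying only the fields compared above. */
struct tuple { uint32_t ip; uint16_t port; };
struct conn {
	struct tuple orig_src;   /* source seen in the original direction */
	struct tuple reply_dst;  /* destination expected in the reply direction */
};

/* SNAT (address or port) happened iff the reply destination no longer
 * mirrors the original source - the test used before re-running XFRM. */
static bool needs_src_rewrite_fixup(const struct conn *ct)
{
	return ct->orig_src.ip != ct->reply_dst.ip ||
	       ct->orig_src.port != ct->reply_dst.port;
}

int main(void)
{
	struct conn ct = {
		.orig_src  = { 0x0a000001, 40000 },   /* 10.0.0.1:40000 */
		.reply_dst = { 0xc0a80001, 40000 },   /* 192.168.0.1 after SNAT */
	};

	printf("fixup needed: %d\n", needs_src_rewrite_fixup(&ct));   /* 1 */
	return 0;
}
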
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/route.h> +#include <net/ip.h> + +static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, +					struct sk_buff *skb, +					const struct net_device *in, +					const struct net_device *out, +					int (*okfn)(struct sk_buff *)) +{ +	unsigned int ret; +	struct nft_pktinfo pkt; +	u32 mark; +	__be32 saddr, daddr; +	u_int8_t tos; +	const struct iphdr *iph; + +	/* root is playing with raw sockets. */ +	if (skb->len < sizeof(struct iphdr) || +	    ip_hdrlen(skb) < sizeof(struct iphdr)) +		return NF_ACCEPT; + +	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + +	mark = skb->mark; +	iph = ip_hdr(skb); +	saddr = iph->saddr; +	daddr = iph->daddr; +	tos = iph->tos; + +	ret = nft_do_chain(&pkt, ops); +	if (ret != NF_DROP && ret != NF_QUEUE) { +		iph = ip_hdr(skb); + +		if (iph->saddr != saddr || +		    iph->daddr != daddr || +		    skb->mark != mark || +		    iph->tos != tos) +			if (ip_route_me_harder(skb, RTN_UNSPEC)) +				ret = NF_DROP; +	} +	return ret; +} + +static const struct nf_chain_type nft_chain_route_ipv4 = { +	.name		= "route", +	.type		= NFT_CHAIN_T_ROUTE, +	.family		= NFPROTO_IPV4, +	.owner		= THIS_MODULE, +	.hook_mask	= (1 << NF_INET_LOCAL_OUT), +	.hooks		= { +		[NF_INET_LOCAL_OUT]	= nf_route_table_hook, +	}, +}; + +static int __init nft_chain_route_init(void) +{ +	return nft_register_chain_type(&nft_chain_route_ipv4); +} + +static void __exit nft_chain_route_exit(void) +{ +	nft_unregister_chain_type(&nft_chain_route_ipv4); +} + +module_init(nft_chain_route_init); +module_exit(nft_chain_route_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c new file mode 100644 index 00000000000..e79718a382f --- /dev/null +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
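
nf_route_table_hook() above snapshots saddr, daddr, mark and tos before running the chain and only forces a fresh route lookup when one of them changed. The same snapshot-and-compare pattern as a standalone sketch; run_chain() and reroute() are hypothetical stand-ins for nft_do_chain() and ip_route_me_harder():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pkt { uint32_t saddr, daddr, mark; uint8_t tos; };

static void run_chain(struct pkt *p)
{
	p->mark = 0x42;            /* pretend a rule set a new fwmark */
}

static bool reroute(struct pkt *p)
{
	(void)p;                   /* stand-in for the new route lookup */
	return true;
}

int main(void)
{
	struct pkt p = { .saddr = 1, .daddr = 2, .mark = 0, .tos = 0 };
	struct pkt before = p;     /* snapshot of the routing-relevant fields */

	run_chain(&p);
	if (p.saddr != before.saddr || p.daddr != before.daddr ||
	    p.mark != before.mark || p.tos != before.tos)
		printf("re-route: %s\n", reroute(&p) ? "ok" : "drop");
	return 0;
}
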
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/icmp.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> + +void nft_reject_ipv4_eval(const struct nft_expr *expr, +			  struct nft_data data[NFT_REG_MAX + 1], +			  const struct nft_pktinfo *pkt) +{ +	struct nft_reject *priv = nft_expr_priv(expr); + +	switch (priv->type) { +	case NFT_REJECT_ICMP_UNREACH: +		nf_send_unreach(pkt->skb, priv->icmp_code); +		break; +	case NFT_REJECT_TCP_RST: +		nf_send_reset(pkt->skb, pkt->ops->hooknum); +		break; +	} + +	data[NFT_REG_VERDICT].verdict = NF_DROP; +} +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); + +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { +	.type		= &nft_reject_ipv4_type, +	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)), +	.eval		= nft_reject_ipv4_eval, +	.init		= nft_reject_init, +	.dump		= nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { +	.family		= NFPROTO_IPV4, +	.name		= "reject", +	.ops		= &nft_reject_ipv4_ops, +	.policy		= nft_reject_policy, +	.maxattr	= NFTA_REJECT_MAX, +	.owner		= THIS_MODULE, +}; + +static int __init nft_reject_ipv4_module_init(void) +{ +	return nft_register_expr(&nft_reject_ipv4_type); +} + +static void __exit nft_reject_ipv4_module_exit(void) +{ +	nft_unregister_expr(&nft_reject_ipv4_type); +} + +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c new file mode 100644 index 00000000000..044a0ddf6a7 --- /dev/null +++ b/net/ipv4/ping.c @@ -0,0 +1,1218 @@ +/* + * INET		An implementation of the TCP/IP protocol suite for the LINUX + *		operating system.  INET is implemented using the  BSD Socket + *		interface as the means of communication with the user level. + * + *		"Ping" sockets + * + *		This program is free software; you can redistribute it and/or + *		modify it under the terms of the GNU General Public License + *		as published by the Free Software Foundation; either version + *		2 of the License, or (at your option) any later version. + * + * Based on ipv4/udp.c code. + * + * Authors:	Vasiliy Kulikov / Openwall (for Linux 2.6), + *		Pavel Kankovsky (for Linux 2.4.32) + * + * Pavel gave all rights to bugs to Vasiliy, + * none of the bugs are Pavel's now. 
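
nft_reject_ipv4_eval() above always ends with an NF_DROP verdict: "reject" means notify the sender with an ICMP unreachable or a TCP RST and then drop the offending packet. A compressed userspace sketch of that dispatch, with the two send helpers stubbed out:

#include <stdio.h>

enum reject_type { REJECT_ICMP_UNREACH, REJECT_TCP_RST };
enum verdict { VERDICT_ACCEPT, VERDICT_DROP };

static void send_unreach(int code) { printf("icmp unreachable, code %d\n", code); }
static void send_reset(void)       { printf("tcp rst\n"); }

static enum verdict reject_eval(enum reject_type type, int icmp_code)
{
	switch (type) {
	case REJECT_ICMP_UNREACH:
		send_unreach(icmp_code);
		break;
	case REJECT_TCP_RST:
		send_reset();
		break;
	}
	return VERDICT_DROP;   /* the rejected packet itself is always dropped */
}

int main(void)
{
	return reject_eval(REJECT_ICMP_UNREACH, 3) == VERDICT_DROP ? 0 : 1;
}
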
+ * + */ + +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/export.h> +#include <net/sock.h> +#include <net/ping.h> +#include <net/udp.h> +#include <net/route.h> +#include <net/inet_common.h> +#include <net/checksum.h> + +#if IS_ENABLED(CONFIG_IPV6) +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/transp_v6.h> +#endif + +struct ping_table { +	struct hlist_nulls_head	hash[PING_HTABLE_SIZE]; +	rwlock_t		lock; +}; + +static struct ping_table ping_table; +struct pingv6_ops pingv6_ops; +EXPORT_SYMBOL_GPL(pingv6_ops); + +static u16 ping_port_rover; + +static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) +{ +	int res = (num + net_hash_mix(net)) & mask; + +	pr_debug("hash(%d) = %d\n", num, res); +	return res; +} +EXPORT_SYMBOL_GPL(ping_hash); + +static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, +					     struct net *net, unsigned int num) +{ +	return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; +} + +int ping_get_port(struct sock *sk, unsigned short ident) +{ +	struct hlist_nulls_node *node; +	struct hlist_nulls_head *hlist; +	struct inet_sock *isk, *isk2; +	struct sock *sk2 = NULL; + +	isk = inet_sk(sk); +	write_lock_bh(&ping_table.lock); +	if (ident == 0) { +		u32 i; +		u16 result = ping_port_rover + 1; + +		for (i = 0; i < (1L << 16); i++, result++) { +			if (!result) +				result++; /* avoid zero */ +			hlist = ping_hashslot(&ping_table, sock_net(sk), +					    result); +			ping_portaddr_for_each_entry(sk2, node, hlist) { +				isk2 = inet_sk(sk2); + +				if (isk2->inet_num == result) +					goto next_port; +			} + +			/* found */ +			ping_port_rover = ident = result; +			break; +next_port: +			; +		} +		if (i >= (1L << 16)) +			goto fail; +	} else { +		hlist = ping_hashslot(&ping_table, sock_net(sk), ident); +		ping_portaddr_for_each_entry(sk2, node, hlist) { +			isk2 = inet_sk(sk2); + +			/* BUG? Why is this reuse and not reuseaddr? ping.c +			 * doesn't turn off SO_REUSEADDR, and it doesn't expect +			 * that other ping processes can steal its packets. +			 */ +			if ((isk2->inet_num == ident) && +			    (sk2 != sk) && +			    (!sk2->sk_reuse || !sk->sk_reuse)) +				goto fail; +		} +	} + +	pr_debug("found port/ident = %d\n", ident); +	isk->inet_num = ident; +	if (sk_unhashed(sk)) { +		pr_debug("was not hashed\n"); +		sock_hold(sk); +		hlist_nulls_add_head(&sk->sk_nulls_node, hlist); +		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +	} +	write_unlock_bh(&ping_table.lock); +	return 0; + +fail: +	write_unlock_bh(&ping_table.lock); +	return 1; +} +EXPORT_SYMBOL_GPL(ping_get_port); + +void ping_hash(struct sock *sk) +{ +	pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); +	BUG(); /* "Please do not press this button again." 
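
ping_get_port() above hands out ICMP echo idents the way UDP hands out ports: starting at ping_port_rover + 1 it scans the full 16-bit space, skips ident 0 and anything already hashed, and remembers where it stopped. The same rover walk as a userspace sketch over a plain bitmap instead of the hash table:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool ident_in_use[1 << 16];     /* stand-in for the ping hash table */
static uint16_t port_rover;

/* Returns a free ident and marks it used, or 0 if every ident is taken. */
static uint16_t alloc_ident(void)
{
	uint32_t i;
	uint16_t result = port_rover + 1;

	for (i = 0; i < (1UL << 16); i++, result++) {
		if (!result)
			result++;              /* ident 0 is never handed out */
		if (ident_in_use[result])
			continue;              /* like the "goto next_port" above */
		port_rover = result;
		ident_in_use[result] = true;
		return result;
	}
	return 0;
}

int main(void)
{
	ident_in_use[1] = true;
	printf("allocated ident %u\n", alloc_ident());     /* prints 2 */
	return 0;
}
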
*/ +} + +void ping_unhash(struct sock *sk) +{ +	struct inet_sock *isk = inet_sk(sk); +	pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); +	if (sk_hashed(sk)) { +		write_lock_bh(&ping_table.lock); +		hlist_nulls_del(&sk->sk_nulls_node); +		sock_put(sk); +		isk->inet_num = 0; +		isk->inet_sport = 0; +		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +		write_unlock_bh(&ping_table.lock); +	} +} +EXPORT_SYMBOL_GPL(ping_unhash); + +static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) +{ +	struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); +	struct sock *sk = NULL; +	struct inet_sock *isk; +	struct hlist_nulls_node *hnode; +	int dif = skb->dev->ifindex; + +	if (skb->protocol == htons(ETH_P_IP)) { +		pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", +			 (int)ident, &ip_hdr(skb)->daddr, dif); +#if IS_ENABLED(CONFIG_IPV6) +	} else if (skb->protocol == htons(ETH_P_IPV6)) { +		pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", +			 (int)ident, &ipv6_hdr(skb)->daddr, dif); +#endif +	} + +	read_lock_bh(&ping_table.lock); + +	ping_portaddr_for_each_entry(sk, hnode, hslot) { +		isk = inet_sk(sk); + +		pr_debug("iterate\n"); +		if (isk->inet_num != ident) +			continue; + +		if (skb->protocol == htons(ETH_P_IP) && +		    sk->sk_family == AF_INET) { +			pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, +				 (int) isk->inet_num, &isk->inet_rcv_saddr, +				 sk->sk_bound_dev_if); + +			if (isk->inet_rcv_saddr && +			    isk->inet_rcv_saddr != ip_hdr(skb)->daddr) +				continue; +#if IS_ENABLED(CONFIG_IPV6) +		} else if (skb->protocol == htons(ETH_P_IPV6) && +			   sk->sk_family == AF_INET6) { + +			pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, +				 (int) isk->inet_num, +				 &sk->sk_v6_rcv_saddr, +				 sk->sk_bound_dev_if); + +			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && +			    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, +					     &ipv6_hdr(skb)->daddr)) +				continue; +#endif +		} + +		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) +			continue; + +		sock_hold(sk); +		goto exit; +	} + +	sk = NULL; +exit: +	read_unlock_bh(&ping_table.lock); + +	return sk; +} + +static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, +					  kgid_t *high) +{ +	kgid_t *data = net->ipv4.ping_group_range.range; +	unsigned int seq; + +	do { +		seq = read_seqbegin(&net->ipv4.ping_group_range.lock); + +		*low = data[0]; +		*high = data[1]; +	} while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); +} + + +int ping_init_sock(struct sock *sk) +{ +	struct net *net = sock_net(sk); +	kgid_t group = current_egid(); +	struct group_info *group_info; +	int i, j, count; +	kgid_t low, high; +	int ret = 0; + +	inet_get_ping_group_range_net(net, &low, &high); +	if (gid_lte(low, group) && gid_lte(group, high)) +		return 0; + +	group_info = get_current_groups(); +	count = group_info->ngroups; +	for (i = 0; i < group_info->nblocks; i++) { +		int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); +		for (j = 0; j < cp_count; j++) { +			kgid_t gid = group_info->blocks[i][j]; +			if (gid_lte(low, gid) && gid_lte(gid, high)) +				goto out_release_group; +		} + +		count -= cp_count; +	} + +	ret = -EACCES; + +out_release_group: +	put_group_info(group_info); +	return ret; +} +EXPORT_SYMBOL_GPL(ping_init_sock); + +void ping_close(struct sock *sk, long timeout) +{ +	pr_debug("ping_close(sk=%p,sk->num=%u)\n", +		 inet_sk(sk), inet_sk(sk)->inet_num); +	pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); + +	
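
ping_init_sock() above grants access when the caller's effective GID, or any supplementary GID, lies inside the net.ipv4.ping_group_range window read under the seqlock; everything else gets -EACCES. The core range test reduced to a userspace sketch with plain gid_t values (the sysctl snapshot here is made up):

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

/* Hypothetical snapshot of the "net.ipv4.ping_group_range" low/high pair. */
static gid_t range_low = 100, range_high = 200;

static bool gid_allowed(gid_t gid)
{
	return range_low <= gid && gid <= range_high;
}

/* Effective GID first, then every supplementary group, as above. */
static bool ping_allowed(gid_t egid, const gid_t *groups, int ngroups)
{
	int i;

	if (gid_allowed(egid))
		return true;
	for (i = 0; i < ngroups; i++)
		if (gid_allowed(groups[i]))
			return true;
	return false;
}

int main(void)
{
	gid_t sup[] = { 20, 150 };

	printf("allowed: %d\n", ping_allowed(1000, sup, 2));   /* 1: gid 150 matches */
	return 0;
}
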
sk_common_release(sk); +} +EXPORT_SYMBOL_GPL(ping_close); + +/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ +static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, +				struct sockaddr *uaddr, int addr_len) { +	struct net *net = sock_net(sk); +	if (sk->sk_family == AF_INET) { +		struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; +		int chk_addr_ret; + +		if (addr_len < sizeof(*addr)) +			return -EINVAL; + +		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", +			 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); + +		chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + +		if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) +			chk_addr_ret = RTN_LOCAL; + +		if ((sysctl_ip_nonlocal_bind == 0 && +		    isk->freebind == 0 && isk->transparent == 0 && +		     chk_addr_ret != RTN_LOCAL) || +		    chk_addr_ret == RTN_MULTICAST || +		    chk_addr_ret == RTN_BROADCAST) +			return -EADDRNOTAVAIL; + +#if IS_ENABLED(CONFIG_IPV6) +	} else if (sk->sk_family == AF_INET6) { +		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; +		int addr_type, scoped, has_addr; +		struct net_device *dev = NULL; + +		if (addr_len < sizeof(*addr)) +			return -EINVAL; + +		if (addr->sin6_family != AF_INET6) +			return -EINVAL; + +		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", +			 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); + +		addr_type = ipv6_addr_type(&addr->sin6_addr); +		scoped = __ipv6_addr_needs_scope_id(addr_type); +		if ((addr_type != IPV6_ADDR_ANY && +		     !(addr_type & IPV6_ADDR_UNICAST)) || +		    (scoped && !addr->sin6_scope_id)) +			return -EINVAL; + +		rcu_read_lock(); +		if (addr->sin6_scope_id) { +			dev = dev_get_by_index_rcu(net, addr->sin6_scope_id); +			if (!dev) { +				rcu_read_unlock(); +				return -ENODEV; +			} +		} +		has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, +						    scoped); +		rcu_read_unlock(); + +		if (!(isk->freebind || isk->transparent || has_addr || +		      addr_type == IPV6_ADDR_ANY)) +			return -EADDRNOTAVAIL; + +		if (scoped) +			sk->sk_bound_dev_if = addr->sin6_scope_id; +#endif +	} else { +		return -EAFNOSUPPORT; +	} +	return 0; +} + +static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) +{ +	if (saddr->sa_family == AF_INET) { +		struct inet_sock *isk = inet_sk(sk); +		struct sockaddr_in *addr = (struct sockaddr_in *) saddr; +		isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; +#if IS_ENABLED(CONFIG_IPV6) +	} else if (saddr->sa_family == AF_INET6) { +		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; +		struct ipv6_pinfo *np = inet6_sk(sk); +		sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr; +#endif +	} +} + +static void ping_clear_saddr(struct sock *sk, int dif) +{ +	sk->sk_bound_dev_if = dif; +	if (sk->sk_family == AF_INET) { +		struct inet_sock *isk = inet_sk(sk); +		isk->inet_rcv_saddr = isk->inet_saddr = 0; +#if IS_ENABLED(CONFIG_IPV6) +	} else if (sk->sk_family == AF_INET6) { +		struct ipv6_pinfo *np = inet6_sk(sk); +		memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); +		memset(&np->saddr, 0, sizeof(np->saddr)); +#endif +	} +} +/* + * We need our own bind because there are no privileged id's == local ports. + * Moreover, we don't allow binding to multi- and broadcast addresses. 
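
ping_check_bind_addr() above refuses binds to multicast and broadcast addresses and, unless nonlocal bind, freebind or transparent is in effect, to addresses that are not local; INADDR_ANY always passes. A simplified userspace sketch of the IPv4 branch; is_local_address() is a hypothetical stand-in for the inet_addr_type() lookup:

#include <arpa/inet.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical local-address check standing in for inet_addr_type(). */
static bool is_local_address(uint32_t addr_be)
{
	return addr_be == htonl(INADDR_LOOPBACK);  /* pretend only 127.0.0.1 is ours */
}

static int check_bind_addr(uint32_t addr_be, bool may_bind_nonlocal)
{
	uint32_t addr = ntohl(addr_be);

	if (IN_MULTICAST(addr) || addr_be == htonl(INADDR_BROADCAST))
		return -EADDRNOTAVAIL;
	if (addr_be == htonl(INADDR_ANY))
		return 0;                          /* the wildcard is always acceptable */
	if (!may_bind_nonlocal && !is_local_address(addr_be))
		return -EADDRNOTAVAIL;
	return 0;
}

int main(void)
{
	printf("%d\n", check_bind_addr(inet_addr("127.0.0.1"), false));   /* 0 */
	printf("%d\n", check_bind_addr(inet_addr("224.0.0.1"), false));   /* -EADDRNOTAVAIL */
	return 0;
}
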
+ */ + +int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ +	struct inet_sock *isk = inet_sk(sk); +	unsigned short snum; +	int err; +	int dif = sk->sk_bound_dev_if; + +	err = ping_check_bind_addr(sk, isk, uaddr, addr_len); +	if (err) +		return err; + +	lock_sock(sk); + +	err = -EINVAL; +	if (isk->inet_num != 0) +		goto out; + +	err = -EADDRINUSE; +	ping_set_saddr(sk, uaddr); +	snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); +	if (ping_get_port(sk, snum) != 0) { +		ping_clear_saddr(sk, dif); +		goto out; +	} + +	pr_debug("after bind(): num = %d, dif = %d\n", +		 (int)isk->inet_num, +		 (int)sk->sk_bound_dev_if); + +	err = 0; +	if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) +		sk->sk_userlocks |= SOCK_BINDADDR_LOCK; +#if IS_ENABLED(CONFIG_IPV6) +	if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) +		sk->sk_userlocks |= SOCK_BINDADDR_LOCK; +#endif + +	if (snum) +		sk->sk_userlocks |= SOCK_BINDPORT_LOCK; +	isk->inet_sport = htons(isk->inet_num); +	isk->inet_daddr = 0; +	isk->inet_dport = 0; + +#if IS_ENABLED(CONFIG_IPV6) +	if (sk->sk_family == AF_INET6) +		memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); +#endif + +	sk_dst_reset(sk); +out: +	release_sock(sk); +	pr_debug("ping_v4_bind -> %d\n", err); +	return err; +} +EXPORT_SYMBOL_GPL(ping_bind); + +/* + * Is this a supported type of ICMP message? + */ + +static inline int ping_supported(int family, int type, int code) +{ +	return (family == AF_INET && type == ICMP_ECHO && code == 0) || +	       (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0); +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. + */ + +void ping_err(struct sk_buff *skb, int offset, u32 info) +{ +	int family; +	struct icmphdr *icmph; +	struct inet_sock *inet_sock; +	int type; +	int code; +	struct net *net = dev_net(skb->dev); +	struct sock *sk; +	int harderr; +	int err; + +	if (skb->protocol == htons(ETH_P_IP)) { +		family = AF_INET; +		type = icmp_hdr(skb)->type; +		code = icmp_hdr(skb)->code; +		icmph = (struct icmphdr *)(skb->data + offset); +	} else if (skb->protocol == htons(ETH_P_IPV6)) { +		family = AF_INET6; +		type = icmp6_hdr(skb)->icmp6_type; +		code = icmp6_hdr(skb)->icmp6_code; +		icmph = (struct icmphdr *) (skb->data + offset); +	} else { +		BUG(); +	} + +	/* We assume the packet has already been checked by icmp_unreach */ + +	if (!ping_supported(family, icmph->type, icmph->code)) +		return; + +	pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n", +		 skb->protocol, type, code, ntohs(icmph->un.echo.id), +		 ntohs(icmph->un.echo.sequence)); + +	sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); +	if (sk == NULL) { +		pr_debug("no socket, dropping\n"); +		return;	/* No socket for error */ +	} +	pr_debug("err on socket %p\n", sk); + +	err = 0; +	harderr = 0; +	inet_sock = inet_sk(sk); + +	if (skb->protocol == htons(ETH_P_IP)) { +		switch (type) { +		default: +		case ICMP_TIME_EXCEEDED: +			err = EHOSTUNREACH; +			break; +		case ICMP_SOURCE_QUENCH: +			/* This is not a real error but ping wants to see it. +			 * Report it with some fake errno. 
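
ping_bind() above installs the source address first and only then tries to reserve the ident, clearing the address again via ping_clear_saddr() if the reservation fails. A compact sketch of that set-then-roll-back ordering; reserve_ident() is a hypothetical stand-in for ping_get_port():

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct psock { uint32_t saddr; uint16_t ident; };

/* Pretend ident 1024 is already taken by another ping socket. */
static bool reserve_ident(uint16_t ident) { return ident != 1024; }

static int ping_bind_sketch(struct psock *sk, uint32_t saddr, uint16_t ident)
{
	if (sk->ident != 0)
		return -EINVAL;          /* already bound */

	sk->saddr = saddr;               /* optimistically install the address */
	if (!reserve_ident(ident)) {
		sk->saddr = 0;           /* roll back, mirroring ping_clear_saddr() */
		return -EADDRINUSE;
	}
	sk->ident = ident;
	return 0;
}

int main(void)
{
	struct psock sk = { 0 };

	printf("%d\n", ping_bind_sketch(&sk, 0x7f000001, 1024));  /* -EADDRINUSE */
	printf("%d\n", ping_bind_sketch(&sk, 0x7f000001, 2048));  /* 0 */
	return 0;
}
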
+			 */ +			err = EREMOTEIO; +			break; +		case ICMP_PARAMETERPROB: +			err = EPROTO; +			harderr = 1; +			break; +		case ICMP_DEST_UNREACH: +			if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ +				ipv4_sk_update_pmtu(skb, sk, info); +				if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { +					err = EMSGSIZE; +					harderr = 1; +					break; +				} +				goto out; +			} +			err = EHOSTUNREACH; +			if (code <= NR_ICMP_UNREACH) { +				harderr = icmp_err_convert[code].fatal; +				err = icmp_err_convert[code].errno; +			} +			break; +		case ICMP_REDIRECT: +			/* See ICMP_SOURCE_QUENCH */ +			ipv4_sk_redirect(skb, sk); +			err = EREMOTEIO; +			break; +		} +#if IS_ENABLED(CONFIG_IPV6) +	} else if (skb->protocol == htons(ETH_P_IPV6)) { +		harderr = pingv6_ops.icmpv6_err_convert(type, code, &err); +#endif +	} + +	/* +	 *      RFC1122: OK.  Passes ICMP errors back to application, as per +	 *	4.1.3.3. +	 */ +	if ((family == AF_INET && !inet_sock->recverr) || +	    (family == AF_INET6 && !inet6_sk(sk)->recverr)) { +		if (!harderr || sk->sk_state != TCP_ESTABLISHED) +			goto out; +	} else { +		if (family == AF_INET) { +			ip_icmp_error(sk, skb, err, 0 /* no remote port */, +				      info, (u8 *)icmph); +#if IS_ENABLED(CONFIG_IPV6) +		} else if (family == AF_INET6) { +			pingv6_ops.ipv6_icmp_error(sk, skb, err, 0, +						   info, (u8 *)icmph); +#endif +		} +	} +	sk->sk_err = err; +	sk->sk_error_report(sk); +out: +	sock_put(sk); +} +EXPORT_SYMBOL_GPL(ping_err); + +/* + *	Copy and checksum an ICMP Echo packet from user space into a buffer + *	starting from the payload. + */ + +int ping_getfrag(void *from, char *to, +		 int offset, int fraglen, int odd, struct sk_buff *skb) +{ +	struct pingfakehdr *pfh = (struct pingfakehdr *)from; + +	if (offset == 0) { +		if (fraglen < sizeof(struct icmphdr)) +			BUG(); +		if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), +			    pfh->iov, 0, fraglen - sizeof(struct icmphdr), +			    &pfh->wcheck)) +			return -EFAULT; +	} else if (offset < sizeof(struct icmphdr)) { +			BUG(); +	} else { +		if (csum_partial_copy_fromiovecend +				(to, pfh->iov, offset - sizeof(struct icmphdr), +				 fraglen, &pfh->wcheck)) +			return -EFAULT; +	} + +#if IS_ENABLED(CONFIG_IPV6) +	/* For IPv6, checksum each skb as we go along, as expected by +	 * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in +	 * wcheck, it will be finalized in ping_v4_push_pending_frames. +	 */ +	if (pfh->family == AF_INET6) { +		skb->csum = pfh->wcheck; +		skb->ip_summed = CHECKSUM_NONE; +		pfh->wcheck = 0; +	} +#endif + +	return 0; +} +EXPORT_SYMBOL_GPL(ping_getfrag); + +static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, +				       struct flowi4 *fl4) +{ +	struct sk_buff *skb = skb_peek(&sk->sk_write_queue); + +	pfh->wcheck = csum_partial((char *)&pfh->icmph, +		sizeof(struct icmphdr), pfh->wcheck); +	pfh->icmph.checksum = csum_fold(pfh->wcheck); +	memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); +	skb->ip_summed = CHECKSUM_NONE; +	return ip_push_pending_frames(sk, fl4); +} + +int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, +			void *user_icmph, size_t icmph_len) { +	u8 type, code; + +	if (len > 0xFFFF) +		return -EMSGSIZE; + +	/* +	 *	Check the flags. +	 */ + +	/* Mirror BSD error message compatibility */ +	if (msg->msg_flags & MSG_OOB) +		return -EOPNOTSUPP; + +	/* +	 *	Fetch the ICMP header provided by the userland. +	 *	iovec is modified! The ICMP header is consumed. 
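
ping_v4_push_pending_frames() above finishes the ICMP checksum by running csum_partial() over the echo header seeded with the payload sum accumulated in wcheck, then folding the 32-bit result with csum_fold(). A portable userspace version of that one's-complement sum and fold (byte order handling simplified, data assumed to be in network order):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* 16-bit one's-complement sum over 'len' bytes, seeded with 'sum'
 * (the userspace counterpart of csum_partial()). */
static uint32_t csum_partial(uint32_t sum, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* Fold the carries back in and complement (csum_fold()). */
static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t icmp_echo[] = { 8, 0, 0, 0, 0x12, 0x34, 0, 1, 'h', 'i' };
	uint16_t csum = csum_fold(csum_partial(0, icmp_echo, sizeof(icmp_echo)));

	printf("checksum: 0x%04x\n", csum);
	return 0;
}
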
+	 */ +	if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len)) +		return -EFAULT; + +	if (family == AF_INET) { +		type = ((struct icmphdr *) user_icmph)->type; +		code = ((struct icmphdr *) user_icmph)->code; +#if IS_ENABLED(CONFIG_IPV6) +	} else if (family == AF_INET6) { +		type = ((struct icmp6hdr *) user_icmph)->icmp6_type; +		code = ((struct icmp6hdr *) user_icmph)->icmp6_code; +#endif +	} else { +		BUG(); +	} + +	if (!ping_supported(family, type, code)) +		return -EINVAL; + +	return 0; +} +EXPORT_SYMBOL_GPL(ping_common_sendmsg); + +static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +			   size_t len) +{ +	struct net *net = sock_net(sk); +	struct flowi4 fl4; +	struct inet_sock *inet = inet_sk(sk); +	struct ipcm_cookie ipc; +	struct icmphdr user_icmph; +	struct pingfakehdr pfh; +	struct rtable *rt = NULL; +	struct ip_options_data opt_copy; +	int free = 0; +	__be32 saddr, daddr, faddr; +	u8  tos; +	int err; + +	pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + +	err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph, +				  sizeof(user_icmph)); +	if (err) +		return err; + +	/* +	 *	Get and verify the address. +	 */ + +	if (msg->msg_name) { +		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); +		if (msg->msg_namelen < sizeof(*usin)) +			return -EINVAL; +		if (usin->sin_family != AF_INET) +			return -EINVAL; +		daddr = usin->sin_addr.s_addr; +		/* no remote port */ +	} else { +		if (sk->sk_state != TCP_ESTABLISHED) +			return -EDESTADDRREQ; +		daddr = inet->inet_daddr; +		/* no remote port */ +	} + +	ipc.addr = inet->inet_saddr; +	ipc.opt = NULL; +	ipc.oif = sk->sk_bound_dev_if; +	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1; + +	sock_tx_timestamp(sk, &ipc.tx_flags); + +	if (msg->msg_controllen) { +		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); +		if (err) +			return err; +		if (ipc.opt) +			free = 1; +	} +	if (!ipc.opt) { +		struct ip_options_rcu *inet_opt; + +		rcu_read_lock(); +		inet_opt = rcu_dereference(inet->inet_opt); +		if (inet_opt) { +			memcpy(&opt_copy, inet_opt, +			       sizeof(*inet_opt) + inet_opt->opt.optlen); +			ipc.opt = &opt_copy.opt; +		} +		rcu_read_unlock(); +	} + +	saddr = ipc.addr; +	ipc.addr = faddr = daddr; + +	if (ipc.opt && ipc.opt->opt.srr) { +		if (!daddr) +			return -EINVAL; +		faddr = ipc.opt->opt.faddr; +	} +	tos = get_rttos(&ipc, inet); +	if (sock_flag(sk, SOCK_LOCALROUTE) || +	    (msg->msg_flags & MSG_DONTROUTE) || +	    (ipc.opt && ipc.opt->opt.is_strictroute)) { +		tos |= RTO_ONLINK; +	} + +	if (ipv4_is_multicast(daddr)) { +		if (!ipc.oif) +			ipc.oif = inet->mc_index; +		if (!saddr) +			saddr = inet->mc_addr; +	} else if (!ipc.oif) +		ipc.oif = inet->uc_index; + +	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, +			   RT_SCOPE_UNIVERSE, sk->sk_protocol, +			   inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + +	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); +	rt = ip_route_output_flow(net, &fl4, sk); +	if (IS_ERR(rt)) { +		err = PTR_ERR(rt); +		rt = NULL; +		if (err == -ENETUNREACH) +			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); +		goto out; +	} + +	err = -EACCES; +	if ((rt->rt_flags & RTCF_BROADCAST) && +	    !sock_flag(sk, SOCK_BROADCAST)) +		goto out; + +	if (msg->msg_flags & MSG_CONFIRM) +		goto do_confirm; +back_from_confirm: + +	if (!ipc.addr) +		ipc.addr = fl4.daddr; + +	lock_sock(sk); + +	pfh.icmph.type = user_icmph.type; /* already checked */ +	pfh.icmph.code = user_icmph.code; /* ditto */ +	pfh.icmph.checksum = 0; +	pfh.icmph.un.echo.id = 
inet->inet_sport; +	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; +	pfh.iov = msg->msg_iov; +	pfh.wcheck = 0; +	pfh.family = AF_INET; + +	err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, +			0, &ipc, &rt, msg->msg_flags); +	if (err) +		ip_flush_pending_frames(sk); +	else +		err = ping_v4_push_pending_frames(sk, &pfh, &fl4); +	release_sock(sk); + +out: +	ip_rt_put(rt); +	if (free) +		kfree(ipc.opt); +	if (!err) { +		icmp_out_count(sock_net(sk), user_icmph.type); +		return len; +	} +	return err; + +do_confirm: +	dst_confirm(&rt->dst); +	if (!(msg->msg_flags & MSG_PROBE) || len) +		goto back_from_confirm; +	err = 0; +	goto out; +} + +int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +		 size_t len, int noblock, int flags, int *addr_len) +{ +	struct inet_sock *isk = inet_sk(sk); +	int family = sk->sk_family; +	struct sk_buff *skb; +	int copied, err; + +	pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + +	err = -EOPNOTSUPP; +	if (flags & MSG_OOB) +		goto out; + +	if (flags & MSG_ERRQUEUE) { +		if (family == AF_INET) { +			return ip_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_IPV6) +		} else if (family == AF_INET6) { +			return pingv6_ops.ipv6_recv_error(sk, msg, len, +							  addr_len); +#endif +		} +	} + +	skb = skb_recv_datagram(sk, flags, noblock, &err); +	if (!skb) +		goto out; + +	copied = skb->len; +	if (copied > len) { +		msg->msg_flags |= MSG_TRUNC; +		copied = len; +	} + +	/* Don't bother checking the checksum */ +	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); +	if (err) +		goto done; + +	sock_recv_timestamp(msg, sk, skb); + +	/* Copy the address and add cmsg data. */ +	if (family == AF_INET) { +		DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); + +		if (sin) { +			sin->sin_family = AF_INET; +			sin->sin_port = 0 /* skb->h.uh->source */; +			sin->sin_addr.s_addr = ip_hdr(skb)->saddr; +			memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +			*addr_len = sizeof(*sin); +		} + +		if (isk->cmsg_flags) +			ip_cmsg_recv(msg, skb); + +#if IS_ENABLED(CONFIG_IPV6) +	} else if (family == AF_INET6) { +		struct ipv6_pinfo *np = inet6_sk(sk); +		struct ipv6hdr *ip6 = ipv6_hdr(skb); +		DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); + +		if (sin6) { +			sin6->sin6_family = AF_INET6; +			sin6->sin6_port = 0; +			sin6->sin6_addr = ip6->saddr; +			sin6->sin6_flowinfo = 0; +			if (np->sndflow) +				sin6->sin6_flowinfo = ip6_flowinfo(ip6); +			sin6->sin6_scope_id = +				ipv6_iface_scope_id(&sin6->sin6_addr, +						    IP6CB(skb)->iif); +			*addr_len = sizeof(*sin6); +		} + +		if (inet6_sk(sk)->rxopt.all) +			pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); +		if (skb->protocol == htons(ETH_P_IPV6) && +		    inet6_sk(sk)->rxopt.all) +			pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); +		else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) +			ip_cmsg_recv(msg, skb); +#endif +	} else { +		BUG(); +	} + +	err = copied; + +done: +	skb_free_datagram(sk, skb); +out: +	pr_debug("ping_recvmsg -> %d\n", err); +	return err; +} +EXPORT_SYMBOL_GPL(ping_recvmsg); + +int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ +	pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", +		 inet_sk(sk), inet_sk(sk)->inet_num, skb); +	if (sock_queue_rcv_skb(sk, skb) < 0) { +		kfree_skb(skb); +		pr_debug("ping_queue_rcv_skb -> failed\n"); +		return -1; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); + + +/* + *	All we need to do is get the socket. 
+ */ + +void ping_rcv(struct sk_buff *skb) +{ +	struct sock *sk; +	struct net *net = dev_net(skb->dev); +	struct icmphdr *icmph = icmp_hdr(skb); + +	/* We assume the packet has already been checked by icmp_rcv */ + +	pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", +		 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + +	/* Push ICMP header back */ +	skb_push(skb, skb->data - (u8 *)icmph); + +	sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); +	if (sk != NULL) { +		pr_debug("rcv on socket %p\n", sk); +		ping_queue_rcv_skb(sk, skb_get(skb)); +		sock_put(sk); +		return; +	} +	pr_debug("no socket, dropping\n"); + +	/* We're called from icmp_rcv(). kfree_skb() is done there. */ +} +EXPORT_SYMBOL_GPL(ping_rcv); + +struct proto ping_prot = { +	.name =		"PING", +	.owner =	THIS_MODULE, +	.init =		ping_init_sock, +	.close =	ping_close, +	.connect =	ip4_datagram_connect, +	.disconnect =	udp_disconnect, +	.setsockopt =	ip_setsockopt, +	.getsockopt =	ip_getsockopt, +	.sendmsg =	ping_v4_sendmsg, +	.recvmsg =	ping_recvmsg, +	.bind =		ping_bind, +	.backlog_rcv =	ping_queue_rcv_skb, +	.release_cb =	ip4_datagram_release_cb, +	.hash =		ping_hash, +	.unhash =	ping_unhash, +	.get_port =	ping_get_port, +	.obj_size =	sizeof(struct inet_sock), +}; +EXPORT_SYMBOL(ping_prot); + +#ifdef CONFIG_PROC_FS + +static struct sock *ping_get_first(struct seq_file *seq, int start) +{ +	struct sock *sk; +	struct ping_iter_state *state = seq->private; +	struct net *net = seq_file_net(seq); + +	for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; +	     ++state->bucket) { +		struct hlist_nulls_node *node; +		struct hlist_nulls_head *hslot; + +		hslot = &ping_table.hash[state->bucket]; + +		if (hlist_nulls_empty(hslot)) +			continue; + +		sk_nulls_for_each(sk, node, hslot) { +			if (net_eq(sock_net(sk), net) && +			    sk->sk_family == state->family) +				goto found; +		} +	} +	sk = NULL; +found: +	return sk; +} + +static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) +{ +	struct ping_iter_state *state = seq->private; +	struct net *net = seq_file_net(seq); + +	do { +		sk = sk_nulls_next(sk); +	} while (sk && (!net_eq(sock_net(sk), net))); + +	if (!sk) +		return ping_get_first(seq, state->bucket + 1); +	return sk; +} + +static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) +{ +	struct sock *sk = ping_get_first(seq, 0); + +	if (sk) +		while (pos && (sk = ping_get_next(seq, sk)) != NULL) +			--pos; +	return pos ? NULL : sk; +} + +void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) +{ +	struct ping_iter_state *state = seq->private; +	state->bucket = 0; +	state->family = family; + +	read_lock_bh(&ping_table.lock); + +	return *pos ? 
ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL_GPL(ping_seq_start); + +static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) +{ +	return ping_seq_start(seq, pos, AF_INET); +} + +void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ +	struct sock *sk; + +	if (v == SEQ_START_TOKEN) +		sk = ping_get_idx(seq, 0); +	else +		sk = ping_get_next(seq, v); + +	++*pos; +	return sk; +} +EXPORT_SYMBOL_GPL(ping_seq_next); + +void ping_seq_stop(struct seq_file *seq, void *v) +{ +	read_unlock_bh(&ping_table.lock); +} +EXPORT_SYMBOL_GPL(ping_seq_stop); + +static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, +		int bucket) +{ +	struct inet_sock *inet = inet_sk(sp); +	__be32 dest = inet->inet_daddr; +	__be32 src = inet->inet_rcv_saddr; +	__u16 destp = ntohs(inet->inet_dport); +	__u16 srcp = ntohs(inet->inet_sport); + +	seq_printf(f, "%5d: %08X:%04X %08X:%04X" +		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", +		bucket, src, srcp, dest, destp, sp->sk_state, +		sk_wmem_alloc_get(sp), +		sk_rmem_alloc_get(sp), +		0, 0L, 0, +		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), +		0, sock_i_ino(sp), +		atomic_read(&sp->sk_refcnt), sp, +		atomic_read(&sp->sk_drops)); +} + +static int ping_v4_seq_show(struct seq_file *seq, void *v) +{ +	seq_setwidth(seq, 127); +	if (v == SEQ_START_TOKEN) +		seq_puts(seq, "  sl  local_address rem_address   st tx_queue " +			   "rx_queue tr tm->when retrnsmt   uid  timeout " +			   "inode ref pointer drops"); +	else { +		struct ping_iter_state *state = seq->private; + +		ping_v4_format_sock(v, seq, state->bucket); +	} +	seq_pad(seq, '\n'); +	return 0; +} + +static const struct seq_operations ping_v4_seq_ops = { +	.show		= ping_v4_seq_show, +	.start		= ping_v4_seq_start, +	.next		= ping_seq_next, +	.stop		= ping_seq_stop, +}; + +static int ping_seq_open(struct inode *inode, struct file *file) +{ +	struct ping_seq_afinfo *afinfo = PDE_DATA(inode); +	return seq_open_net(inode, file, &afinfo->seq_ops, +			   sizeof(struct ping_iter_state)); +} + +const struct file_operations ping_seq_fops = { +	.open		= ping_seq_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_net, +}; +EXPORT_SYMBOL_GPL(ping_seq_fops); + +static struct ping_seq_afinfo ping_v4_seq_afinfo = { +	.name		= "icmp", +	.family		= AF_INET, +	.seq_fops	= &ping_seq_fops, +	.seq_ops	= { +		.start		= ping_v4_seq_start, +		.show		= ping_v4_seq_show, +		.next		= ping_seq_next, +		.stop		= ping_seq_stop, +	}, +}; + +int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo) +{ +	struct proc_dir_entry *p; +	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, +			     afinfo->seq_fops, afinfo); +	if (!p) +		return -ENOMEM; +	return 0; +} +EXPORT_SYMBOL_GPL(ping_proc_register); + +void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo) +{ +	remove_proc_entry(afinfo->name, net->proc_net); +} +EXPORT_SYMBOL_GPL(ping_proc_unregister); + +static int __net_init ping_v4_proc_init_net(struct net *net) +{ +	return ping_proc_register(net, &ping_v4_seq_afinfo); +} + +static void __net_exit ping_v4_proc_exit_net(struct net *net) +{ +	ping_proc_unregister(net, &ping_v4_seq_afinfo); +} + +static struct pernet_operations ping_v4_net_ops = { +	.init = ping_v4_proc_init_net, +	.exit = ping_v4_proc_exit_net, +}; + +int __init ping_proc_init(void) +{ +	return register_pernet_subsys(&ping_v4_net_ops); +} + +void ping_proc_exit(void) +{ +	unregister_pernet_subsys(&ping_v4_net_ops); +} + +#endif + +void __init ping_init(void) +{ 
+	int i; + +	for (i = 0; i < PING_HTABLE_SIZE; i++) +		INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); +	rwlock_init(&ping_table.lock); +} diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1b48eb1ed45..ae0af9386f7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -42,6 +42,7 @@  #include <linux/inetdevice.h>  #include <linux/proc_fs.h>  #include <linux/seq_file.h> +#include <linux/export.h>  #include <net/sock.h>  #include <net/raw.h> @@ -55,17 +56,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)  	local_bh_disable();  	orphans = percpu_counter_sum_positive(&tcp_orphan_count); -	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); +	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);  	local_bh_enable();  	socket_seq_show(seq);  	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",  		   sock_prot_inuse_get(net, &tcp_prot), orphans,  		   tcp_death_row.tw_count, sockets, -		   atomic_long_read(&tcp_memory_allocated)); +		   proto_memory_allocated(&tcp_prot));  	seq_printf(seq, "UDP: inuse %d mem %ld\n",  		   sock_prot_inuse_get(net, &udp_prot), -		   atomic_long_read(&udp_memory_allocated)); +		   proto_memory_allocated(&udp_prot));  	seq_printf(seq, "UDPLITE: inuse %d\n",  		   sock_prot_inuse_get(net, &udplite_prot));  	seq_printf(seq, "RAW: inuse %d\n", @@ -110,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {  	SNMP_MIB_SENTINEL  }; -/* Following RFC4293 items are displayed in /proc/net/netstat */ +/* Following items are displayed in /proc/net/netstat */  static const struct snmp_mib snmp4_ipextstats_list[] = {  	SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),  	SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), @@ -124,6 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {  	SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),  	SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),  	SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), +	/* Non RFC4293 fields */ +	SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), +	SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), +	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), +	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), +	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),  	SNMP_MIB_SENTINEL  }; @@ -161,6 +168,7 @@ static const struct snmp_mib snmp4_tcp_list[] = {  	SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),  	SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),  	SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), +	SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),  	SNMP_MIB_SENTINEL  }; @@ -171,6 +179,7 @@ static const struct snmp_mib snmp4_udp_list[] = {  	SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),  	SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),  	SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), +	SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),  	SNMP_MIB_SENTINEL  }; @@ -215,7 +224,6 @@ static const struct snmp_mib snmp4_net_list[] = {  	SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),  	SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),  	SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), -	SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),  	SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),  	SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),  	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), @@ -224,6 +232,8 @@ static const struct snmp_mib snmp4_net_list[] = {  	SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),  	
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),  	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), +	SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), +	SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),  	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),  	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),  	SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), @@ -232,7 +242,6 @@ static const struct snmp_mib snmp4_net_list[] = {  	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),  	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),  	SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), -	SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),  	SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),  	SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),  	SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), @@ -253,6 +262,30 @@ static const struct snmp_mib snmp4_net_list[] = {  	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),  	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),  	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), +	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), +	SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), +	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), +	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), +	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), +	SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE), +	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP), +	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE), +	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), +	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), +	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), +	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL), +	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), +	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), +	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), +	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), +	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), +	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), +	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), +	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV), +	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV), +	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), +	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), +	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),  	SNMP_MIB_SENTINEL  }; @@ -284,7 +317,7 @@ static void icmpmsg_put(struct seq_file *seq)  	count = 0;  	for (i = 0; i < ICMPMSG_MIB_MAX; i++) { -		val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); +		val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);  		if (val) {  			type[count] = i;  			vals[count++] = val; @@ -303,27 +336,27 @@ static void icmp_put(struct seq_file *seq)  {  	int i;  	struct net *net = seq->private; +	atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; -	seq_puts(seq, "\nIcmp: InMsgs InErrors"); -	for (i=0; icmpmibmap[i].name != NULL; i++) +	seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		
seq_printf(seq, " In%s", icmpmibmap[i].name);  	seq_printf(seq, " OutMsgs OutErrors"); -	for (i=0; icmpmibmap[i].name != NULL; i++) +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		seq_printf(seq, " Out%s", icmpmibmap[i].name); -	seq_printf(seq, "\nIcmp: %lu %lu", -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); -	for (i=0; icmpmibmap[i].name != NULL; i++) +	seq_printf(seq, "\nIcmp: %lu %lu %lu", +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, -				icmpmibmap[i].index)); +			   atomic_long_read(ptr + icmpmibmap[i].index));  	seq_printf(seq, " %lu %lu", -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), -		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); -	for (i=0; icmpmibmap[i].name != NULL; i++) +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), +		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); +	for (i = 0; icmpmibmap[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, -				icmpmibmap[i].index | 0x100)); +			   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));  }  /* @@ -346,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);  	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)  		seq_printf(seq, " %llu", -			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics, +			   snmp_fold_field64(net->mib.ip_statistics,  					     snmp4_ipstats_list[i].entry,  					     offsetof(struct ipstats_mib, syncp))); @@ -362,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  		/* MaxConn field is signed, RFC 2012 */  		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)  			seq_printf(seq, " %ld", -				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics, +				   snmp_fold_field(net->mib.tcp_statistics,  						   snmp4_tcp_list[i].entry));  		else  			seq_printf(seq, " %lu", -				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics, +				   snmp_fold_field(net->mib.tcp_statistics,  						   snmp4_tcp_list[i].entry));  	} @@ -377,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nUdp:");  	for (i = 0; snmp4_udp_list[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			   snmp_fold_field((void __percpu **)net->mib.udp_statistics, +			   snmp_fold_field(net->mib.udp_statistics,  					   snmp4_udp_list[i].entry));  	/* the UDP and UDP-Lite MIBs are the same */ @@ -388,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nUdpLite:");  	for (i = 0; snmp4_udp_list[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			   snmp_fold_field((void __percpu **)net->mib.udplite_statistics, +			   snmp_fold_field(net->mib.udplite_statistics,  					   snmp4_udp_list[i].entry));  	seq_putc(seq, '\n'); @@ -425,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nTcpExt:");  	for (i = 0; snmp4_net_list[i].name != NULL; i++)  		seq_printf(seq, " %lu", -			   snmp_fold_field((void __percpu **)net->mib.net_statistics, +			   
snmp_fold_field(net->mib.net_statistics,  					   snmp4_net_list[i].entry));  	seq_puts(seq, "\nIpExt:"); @@ -435,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)  	seq_puts(seq, "\nIpExt:");  	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)  		seq_printf(seq, " %llu", -			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics, +			   snmp_fold_field64(net->mib.ip_statistics,  					     snmp4_ipextstats_list[i].entry,  					     offsetof(struct ipstats_mib, syncp))); @@ -458,28 +491,29 @@ static const struct file_operations netstat_seq_fops = {  static __net_init int ip_proc_init_net(struct net *net)  { -	if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) +	if (!proc_create("sockstat", S_IRUGO, net->proc_net, +			 &sockstat_seq_fops))  		goto out_sockstat; -	if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) +	if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))  		goto out_netstat; -	if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) +	if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))  		goto out_snmp;  	return 0;  out_snmp: -	proc_net_remove(net, "netstat"); +	remove_proc_entry("netstat", net->proc_net);  out_netstat: -	proc_net_remove(net, "sockstat"); +	remove_proc_entry("sockstat", net->proc_net);  out_sockstat:  	return -ENOMEM;  }  static __net_exit void ip_proc_exit_net(struct net *net)  { -	proc_net_remove(net, "snmp"); -	proc_net_remove(net, "netstat"); -	proc_net_remove(net, "sockstat"); +	remove_proc_entry("snmp", net->proc_net); +	remove_proc_entry("netstat", net->proc_net); +	remove_proc_entry("sockstat", net->proc_net);  }  static __net_initdata struct pernet_operations ip_proc_ops = { diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 9ae5c01cd0b..46d6a1c923a 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -29,29 +29,33 @@  #include <net/protocol.h>  const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; - -/* - *	Add a protocol handler to the hash tables - */ +const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;  int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)  { -	int hash = protocol & (MAX_INET_PROTOS - 1); +	if (!prot->netns_ok) { +		pr_err("Protocol %u is not namespace aware, cannot register.\n", +			protocol); +		return -EINVAL; +	} -	return !cmpxchg((const struct net_protocol **)&inet_protos[hash], +	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],  			NULL, prot) ? 0 : -1;  }  EXPORT_SYMBOL(inet_add_protocol); -/* - *	Remove a protocol from the hash tables. - */ +int inet_add_offload(const struct net_offload *prot, unsigned char protocol) +{ +	return !cmpxchg((const struct net_offload **)&inet_offloads[protocol], +			NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet_add_offload);  int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)  { -	int ret, hash = protocol & (MAX_INET_PROTOS - 1); +	int ret; -	ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], +	ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],  		       prot, NULL) == prot) ? 
0 : -1;  	synchronize_net(); @@ -59,3 +63,16 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)  	return ret;  }  EXPORT_SYMBOL(inet_del_protocol); + +int inet_del_offload(const struct net_offload *prot, unsigned char protocol) +{ +	int ret; + +	ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol], +		       prot, NULL) == prot) ? 0 : -1; + +	synchronize_net(); + +	return ret; +} +EXPORT_SYMBOL(inet_del_offload); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a3d5ab786e8..2c65160565e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -38,7 +38,7 @@   */  #include <linux/types.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <asm/byteorder.h>  #include <asm/current.h>  #include <asm/uaccess.h> @@ -48,6 +48,7 @@  #include <linux/errno.h>  #include <linux/aio.h>  #include <linux/kernel.h> +#include <linux/export.h>  #include <linux/spinlock.h>  #include <linux/sockios.h>  #include <linux/socket.h> @@ -76,6 +77,7 @@  #include <linux/seq_file.h>  #include <linux/netfilter.h>  #include <linux/netfilter_ipv4.h> +#include <linux/compat.h>  static struct raw_hashinfo raw_v4_hashinfo = {  	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), @@ -109,9 +111,7 @@ EXPORT_SYMBOL_GPL(raw_unhash_sk);  static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,  		unsigned short num, __be32 raddr, __be32 laddr, int dif)  { -	struct hlist_node *node; - -	sk_for_each_from(sk, node) { +	sk_for_each_from(sk) {  		struct inet_sock *inet = inet_sk(sk);  		if (net_eq(sock_net(sk), net) && inet->inet_num == num	&& @@ -129,18 +129,20 @@ found:   *	0 - deliver   *	1 - block   */ -static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)  { -	int type; +	struct icmphdr _hdr; +	const struct icmphdr *hdr; -	if (!pskb_may_pull(skb, sizeof(struct icmphdr))) +	hdr = skb_header_pointer(skb, skb_transport_offset(skb), +				 sizeof(_hdr), &_hdr); +	if (!hdr)  		return 1; -	type = icmp_hdr(skb)->type; -	if (type < 32) { +	if (hdr->type < 32) {  		__u32 data = raw_sk(sk)->filter.data; -		return ((1 << type) & data) != 0; +		return ((1U << hdr->type) & data) != 0;  	}  	/* Do not block unknown ICMP types */ @@ -153,7 +155,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)   * RFC 1122: SHOULD pass TOS value up to the transport layer.   * -> It does. And not only TOS, but all IP header.   */ -static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)  {  	struct sock *sk;  	struct hlist_head *head; @@ -214,6 +216,13 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)  	int err = 0;  	int harderr = 0; +	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) +		ipv4_sk_update_pmtu(skb, sk, info); +	else if (type == ICMP_REDIRECT) { +		ipv4_sk_redirect(skb, sk); +		return; +	} +  	/* Report error on raw socket, if:  	   1. User requested ip_recverr.  	   2. 
Socket is connected (otherwise the error indication @@ -246,7 +255,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)  	}  	if (inet->recverr) { -		struct iphdr *iph = (struct iphdr *)skb->data; +		const struct iphdr *iph = (const struct iphdr *)skb->data;  		u8 *payload = skb->data + (iph->ihl << 2);  		if (inet->hdrincl) @@ -264,7 +273,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)  {  	int hash;  	struct sock *raw_sk; -	struct iphdr *iph; +	const struct iphdr *iph;  	struct net *net;  	hash = protocol & (RAW_HTABLE_SIZE - 1); @@ -272,7 +281,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)  	read_lock(&raw_v4_hashinfo.lock);  	raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);  	if (raw_sk != NULL) { -		iph = (struct iphdr *)skb->data; +		iph = (const struct iphdr *)skb->data;  		net = dev_net(skb->dev);  		while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, @@ -280,17 +289,18 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)  						skb->dev->ifindex)) != NULL) {  			raw_err(raw_sk, skb, info);  			raw_sk = sk_next(raw_sk); -			iph = (struct iphdr *)skb->data; +			iph = (const struct iphdr *)skb->data;  		}  	}  	read_unlock(&raw_v4_hashinfo.lock);  } -static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) +static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)  {  	/* Charge it to the socket. */ -	if (ip_queue_rcv_skb(sk, skb) < 0) { +	ipv4_pktinfo_prepare(sk, skb); +	if (sock_queue_rcv_skb(sk, skb) < 0) {  		kfree_skb(skb);  		return NET_RX_DROP;  	} @@ -313,9 +323,10 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)  	return 0;  } -static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, -			struct rtable **rtp, -			unsigned int flags) +static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, +			   void *from, size_t length, +			   struct rtable **rtp, +			   unsigned int flags)  {  	struct inet_sock *inet = inet_sk(sk);  	struct net *net = sock_net(sk); @@ -324,21 +335,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,  	unsigned int iphlen;  	int err;  	struct rtable *rt = *rtp; +	int hlen, tlen;  	if (length > rt->dst.dev->mtu) { -		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, +		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,  			       rt->dst.dev->mtu);  		return -EMSGSIZE;  	}  	if (flags&MSG_PROBE)  		goto out; +	hlen = LL_RESERVED_SPACE(rt->dst.dev); +	tlen = rt->dst.dev->needed_tailroom;  	skb = sock_alloc_send_skb(sk, -				  length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, +				  length + hlen + tlen + 15,  				  flags & MSG_DONTWAIT, &err);  	if (skb == NULL)  		goto error; -	skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); +	skb_reserve(skb, hlen);  	skb->priority = sk->sk_priority;  	skb->mark = sk->sk_mark; @@ -371,11 +385,11 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,  	if (iphlen >= sizeof(*iph)) {  		if (!iph->saddr) -			iph->saddr = rt->rt_src; +			iph->saddr = fl4->saddr;  		iph->check   = 0;  		iph->tot_len = htons(length);  		if (!iph->id) -			ip_select_ident(iph, &rt->dst, NULL); +			ip_select_ident(skb, NULL);  		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);  	} @@ -401,7 +415,7 @@ error:  	return err;  } -static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)  {  	struct iovec *iov;  	u8 __user *type = NULL; @@ -417,7 +431,7 @@ static int raw_probe_proto_opt(struct 
flowi *fl, struct msghdr *msg)  		if (!iov)  			continue; -		switch (fl->proto) { +		switch (fl4->flowi4_proto) {  		case IPPROTO_ICMP:  			/* check if one-byte field is readable or not. */  			if (iov->iov_base && iov->iov_len < 1) @@ -432,8 +446,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)  				code = iov->iov_base;  			if (type && code) { -				if (get_user(fl->fl_icmp_type, type) || -				    get_user(fl->fl_icmp_code, code)) +				if (get_user(fl4->fl4_icmp_type, type) || +				    get_user(fl4->fl4_icmp_code, code))  					return -EFAULT;  				probed = 1;  			} @@ -454,11 +468,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	struct inet_sock *inet = inet_sk(sk);  	struct ipcm_cookie ipc;  	struct rtable *rt = NULL; +	struct flowi4 fl4;  	int free = 0;  	__be32 daddr;  	__be32 saddr;  	u8  tos;  	int err; +	struct ip_options_data opt_copy;  	err = -EMSGSIZE;  	if (len > 0xFFFF) @@ -477,16 +493,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	 */  	if (msg->msg_namelen) { -		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; +		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);  		err = -EINVAL;  		if (msg->msg_namelen < sizeof(*usin))  			goto out;  		if (usin->sin_family != AF_INET) { -			static int complained; -			if (!complained++) -				printk(KERN_INFO "%s forgot to set AF_INET in " -						 "raw sendmsg. Fix it!\n", -						 current->comm); +			pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", +				     __func__, current->comm);  			err = -EAFNOSUPPORT;  			if (usin->sin_family)  				goto out; @@ -506,10 +519,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	ipc.addr = inet->inet_saddr;  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1;  	ipc.oif = sk->sk_bound_dev_if;  	if (msg->msg_controllen) { -		err = ip_cmsg_send(sock_net(sk), msg, &ipc); +		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);  		if (err)  			goto out;  		if (ipc.opt) @@ -519,8 +534,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	saddr = ipc.addr;  	ipc.addr = daddr; -	if (!ipc.opt) -		ipc.opt = inet->opt; +	if (!ipc.opt) { +		struct ip_options_rcu *inet_opt; + +		rcu_read_lock(); +		inet_opt = rcu_dereference(inet->inet_opt); +		if (inet_opt) { +			memcpy(&opt_copy, inet_opt, +			       sizeof(*inet_opt) + inet_opt->opt.optlen); +			ipc.opt = &opt_copy.opt; +		} +		rcu_read_unlock(); +	}  	if (ipc.opt) {  		err = -EINVAL; @@ -529,13 +554,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		 */  		if (inet->hdrincl)  			goto done; -		if (ipc.opt->srr) { +		if (ipc.opt->opt.srr) {  			if (!daddr)  				goto done; -			daddr = ipc.opt->faddr; +			daddr = ipc.opt->opt.faddr;  		}  	} -	tos = RT_CONN_FLAGS(sk); +	tos = get_rtconn_flags(&ipc, sk);  	if (msg->msg_flags & MSG_DONTROUTE)  		tos |= RTO_ONLINK; @@ -544,28 +569,29 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  			ipc.oif = inet->mc_index;  		if (!saddr)  			saddr = inet->mc_addr; +	} else if (!ipc.oif) +		ipc.oif = inet->uc_index; + +	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, +			   RT_SCOPE_UNIVERSE, +			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, +			   inet_sk_flowi_flags(sk) | +			    (inet->hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0), +			   daddr, saddr, 0, 0); + +	if (!inet->hdrincl) { +		err = raw_probe_proto_opt(&fl4, msg); +		if (err) +			goto done;  	} -	{ -		struct flowi fl = { .oif = ipc.oif, -				    .mark = sk->sk_mark, -				    .fl4_dst = daddr, -				    .fl4_src = saddr, -				    .fl4_tos = tos, -				    .proto = inet->hdrincl ? IPPROTO_RAW : -							     sk->sk_protocol, -				  }; -		if (!inet->hdrincl) { -			err = raw_probe_proto_opt(&fl, msg); -			if (err) -				goto done; -		} - -		security_sk_classify_flow(sk, &fl); -		err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); -	} -	if (err) +	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); +	rt = ip_route_output_flow(sock_net(sk), &fl4, sk); +	if (IS_ERR(rt)) { +		err = PTR_ERR(rt); +		rt = NULL;  		goto done; +	}  	err = -EACCES;  	if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) @@ -576,19 +602,20 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  back_from_confirm:  	if (inet->hdrincl) -		err = raw_send_hdrinc(sk, msg->msg_iov, len, -					&rt, msg->msg_flags); +		err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, +				      &rt, msg->msg_flags);  	 else {  		if (!ipc.addr) -			ipc.addr = rt->rt_dst; +			ipc.addr = fl4.daddr;  		lock_sock(sk); -		err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, -					&ipc, &rt, msg->msg_flags); +		err = ip_append_data(sk, &fl4, ip_generic_getfrag, +				     msg->msg_iov, len, 0, +				     &ipc, &rt, msg->msg_flags);  		if (err)  			ip_flush_pending_frames(sk);  		else if (!(msg->msg_flags & MSG_MORE)) { -			err = ip_push_pending_frames(sk); +			err = ip_push_pending_frames(sk, &fl4);  			if (err == -ENOBUFS && !inet->recverr)  				err = 0;  		} @@ -615,7 +642,7 @@ do_confirm:  static void raw_close(struct sock *sk, long timeout)  {  	/* -	 * Raw sockets may have direct kernel refereneces. Kill them. +	 * Raw sockets may have direct kernel references. Kill them.  	 
*/  	ip_ra_control(sk, 0, NULL); @@ -663,17 +690,14 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	struct inet_sock *inet = inet_sk(sk);  	size_t copied = 0;  	int err = -EOPNOTSUPP; -	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; +	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);  	struct sk_buff *skb;  	if (flags & MSG_OOB)  		goto out; -	if (addr_len) -		*addr_len = sizeof(*sin); -  	if (flags & MSG_ERRQUEUE) { -		err = ip_recv_error(sk, msg, len); +		err = ip_recv_error(sk, msg, len, addr_len);  		goto out;  	} @@ -699,6 +723,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;  		sin->sin_port = 0;  		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); +		*addr_len = sizeof(*sin);  	}  	if (inet->cmsg_flags)  		ip_cmsg_recv(msg, skb); @@ -812,31 +837,48 @@ static int compat_raw_getsockopt(struct sock *sk, int level, int optname,  static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)  {  	switch (cmd) { -		case SIOCOUTQ: { -			int amount = sk_wmem_alloc_get(sk); +	case SIOCOUTQ: { +		int amount = sk_wmem_alloc_get(sk); -			return put_user(amount, (int __user *)arg); -		} -		case SIOCINQ: { -			struct sk_buff *skb; -			int amount = 0; - -			spin_lock_bh(&sk->sk_receive_queue.lock); -			skb = skb_peek(&sk->sk_receive_queue); -			if (skb != NULL) -				amount = skb->len; -			spin_unlock_bh(&sk->sk_receive_queue.lock); -			return put_user(amount, (int __user *)arg); -		} +		return put_user(amount, (int __user *)arg); +	} +	case SIOCINQ: { +		struct sk_buff *skb; +		int amount = 0; + +		spin_lock_bh(&sk->sk_receive_queue.lock); +		skb = skb_peek(&sk->sk_receive_queue); +		if (skb != NULL) +			amount = skb->len; +		spin_unlock_bh(&sk->sk_receive_queue.lock); +		return put_user(amount, (int __user *)arg); +	} -		default: +	default: +#ifdef CONFIG_IP_MROUTE +		return ipmr_ioctl(sk, cmd, (void __user *)arg); +#else +		return -ENOIOCTLCMD; +#endif +	} +} + +#ifdef CONFIG_COMPAT +static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ +	switch (cmd) { +	case SIOCOUTQ: +	case SIOCINQ: +		return -ENOIOCTLCMD; +	default:  #ifdef CONFIG_IP_MROUTE -			return ipmr_ioctl(sk, cmd, (void __user *)arg); +		return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));  #else -			return -ENOIOCTLCMD; +		return -ENOIOCTLCMD;  #endif  	}  } +#endif  struct proto raw_prot = {  	.name		   = "RAW", @@ -853,6 +895,7 @@ struct proto raw_prot = {  	.recvmsg	   = raw_recvmsg,  	.bind		   = raw_bind,  	.backlog_rcv	   = raw_rcv_skb, +	.release_cb	   = ip4_datagram_release_cb,  	.hash		   = raw_hash_sk,  	.unhash		   = raw_unhash_sk,  	.obj_size	   = sizeof(struct raw_sock), @@ -860,6 +903,7 @@ struct proto raw_prot = {  #ifdef CONFIG_COMPAT  	.compat_setsockopt = compat_raw_setsockopt,  	.compat_getsockopt = compat_raw_getsockopt, +	.compat_ioctl	   = compat_raw_ioctl,  #endif  }; @@ -871,9 +915,7 @@ static struct sock *raw_get_first(struct seq_file *seq)  	for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;  			++state->bucket) { -		struct hlist_node *node; - -		sk_for_each(sk, node, &state->h->ht[state->bucket]) +		sk_for_each(sk, &state->h->ht[state->bucket])  			if (sock_net(sk) == seq_file_net(seq))  				goto found;  	} @@ -948,11 +990,13 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)  	      srcp  = inet->inet_num;  	seq_printf(seq, "%4d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu 
%d %p %d\n", +		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",  		i, src, srcp, dest, destp, sp->sk_state,  		sk_wmem_alloc_get(sp),  		sk_rmem_alloc_get(sp), -		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), +		0, 0L, 0, +		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), +		0, sock_i_ino(sp),  		atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));  } @@ -1005,7 +1049,7 @@ static const struct file_operations raw_seq_fops = {  static __net_init int raw_init_net(struct net *net)  { -	if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) +	if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))  		return -ENOMEM;  	return 0; @@ -1013,7 +1057,7 @@ static __net_init int raw_init_net(struct net *net)  static __net_exit void raw_exit_net(struct net *net)  { -	proc_net_remove(net, "raw"); +	remove_proc_entry("raw", net->proc_net);  }  static __net_initdata struct pernet_operations raw_net_ops = { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3843c2dfde8..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -62,14 +62,14 @@   *		2 of the License, or (at your option) any later version.   */ +#define pr_fmt(fmt) "IPv4: " fmt +  #include <linux/module.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include <linux/bitops.h>  #include <linux/types.h>  #include <linux/kernel.h>  #include <linux/mm.h> -#include <linux/bootmem.h>  #include <linux/string.h>  #include <linux/socket.h>  #include <linux/sockios.h> @@ -79,7 +79,6 @@  #include <linux/netdevice.h>  #include <linux/proc_fs.h>  #include <linux/init.h> -#include <linux/workqueue.h>  #include <linux/skbuff.h>  #include <linux/inetdevice.h>  #include <linux/igmp.h> @@ -87,10 +86,10 @@  #include <linux/mroute.h>  #include <linux/netfilter_ipv4.h>  #include <linux/random.h> -#include <linux/jhash.h>  #include <linux/rcupdate.h>  #include <linux/times.h>  #include <linux/slab.h> +#include <linux/jhash.h>  #include <net/dst.h>  #include <net/net_namespace.h>  #include <net/protocol.h> @@ -107,67 +106,71 @@  #include <net/rtnetlink.h>  #ifdef CONFIG_SYSCTL  #include <linux/sysctl.h> +#include <linux/kmemleak.h>  #endif +#include <net/secure_seq.h> -#define RT_FL_TOS(oldflp) \ -    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) - -#define IP_MAX_MTU	0xFFF0 +#define RT_FL_TOS(oldflp4) \ +	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))  #define RT_GC_TIMEOUT (300*HZ)  static int ip_rt_max_size; -static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT; -static int ip_rt_gc_interval __read_mostly	= 60 * HZ; -static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;  static int ip_rt_redirect_number __read_mostly	= 9;  static int ip_rt_redirect_load __read_mostly	= HZ / 50;  static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));  static int ip_rt_error_cost __read_mostly	= HZ;  static int ip_rt_error_burst __read_mostly	= 5 * HZ; -static int ip_rt_gc_elasticity __read_mostly	= 8;  static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;  static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;  static int ip_rt_min_advmss __read_mostly	= 256; -static int rt_chain_length_max __read_mostly	= 20; - -static struct delayed_work expires_work; -static unsigned long expires_ljiffies;  /*   *	Interface to generic destination cache.   
*/  static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); -static void		 ipv4_dst_destroy(struct dst_entry *dst); +static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst); +static unsigned int	 ipv4_mtu(const struct dst_entry *dst);  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);  static void		 ipv4_link_failure(struct sk_buff *skb); -static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(struct dst_ops *ops); +static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, +					   struct sk_buff *skb, u32 mtu); +static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk, +					struct sk_buff *skb); +static void		ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, -			    int how) +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)  { +	WARN_ON(1); +	return NULL;  } +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, +					   struct sk_buff *skb, +					   const void *daddr); +  static struct dst_ops ipv4_dst_ops = {  	.family =		AF_INET,  	.protocol =		cpu_to_be16(ETH_P_IP), -	.gc =			rt_garbage_collect,  	.check =		ipv4_dst_check, +	.default_advmss =	ipv4_default_advmss, +	.mtu =			ipv4_mtu, +	.cow_metrics =		ipv4_cow_metrics,  	.destroy =		ipv4_dst_destroy, -	.ifdown =		ipv4_dst_ifdown,  	.negative_advice =	ipv4_negative_advice,  	.link_failure =		ipv4_link_failure,  	.update_pmtu =		ip_rt_update_pmtu, +	.redirect =		ip_do_redirect,  	.local_out =		__ip_local_out, +	.neigh_lookup =		ipv4_neigh_lookup,  };  #define ECN_OR_COST(class)	TC_PRIO_##class  const __u8 ip_tos2prio[16] = {  	TC_PRIO_BESTEFFORT, -	ECN_OR_COST(FILLER), +	ECN_OR_COST(BESTEFFORT),  	TC_PRIO_BESTEFFORT,  	ECN_OR_COST(BESTEFFORT),  	TC_PRIO_BULK, @@ -183,186 +186,27 @@ const __u8 ip_tos2prio[16] = {  	TC_PRIO_INTERACTIVE_BULK,  	ECN_OR_COST(INTERACTIVE_BULK)  }; - - -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. - * 2) Only writers remove entries, and they hold the lock - *    as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - *    they do so with atomic increments and with the - *    lock held. - */ - -struct rt_hash_bucket { -	struct rtable __rcu	*chain; -}; - -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ -	defined(CONFIG_PROVE_LOCKING) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. 
- * (on lockdep we have a quite big spinlock_t, so keep the size down there) - */ -#ifdef CONFIG_LOCKDEP -# define RT_HASH_LOCK_SZ	256 -#else -# if NR_CPUS >= 32 -#  define RT_HASH_LOCK_SZ	4096 -# elif NR_CPUS >= 16 -#  define RT_HASH_LOCK_SZ	2048 -# elif NR_CPUS >= 8 -#  define RT_HASH_LOCK_SZ	1024 -# elif NR_CPUS >= 4 -#  define RT_HASH_LOCK_SZ	512 -# else -#  define RT_HASH_LOCK_SZ	256 -# endif -#endif - -static spinlock_t	*rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] - -static __init void rt_hash_lock_init(void) -{ -	int i; - -	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, -			GFP_KERNEL); -	if (!rt_hash_locks) -		panic("IP: failed to allocate rt_hash_locks\n"); - -	for (i = 0; i < RT_HASH_LOCK_SZ; i++) -		spin_lock_init(&rt_hash_locks[i]); -} -#else -# define rt_hash_lock_addr(slot) NULL - -static inline void rt_hash_lock_init(void) -{ -} -#endif - -static struct rt_hash_bucket 	*rt_hash_table __read_mostly; -static unsigned			rt_hash_mask __read_mostly; -static unsigned int		rt_hash_log  __read_mostly; +EXPORT_SYMBOL(ip_tos2prio);  static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) - -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, -				   int genid) -{ -	return jhash_3words((__force u32)daddr, (__force u32)saddr, -			    idx, genid) -		& rt_hash_mask; -} - -static inline int rt_genid(struct net *net) -{ -	return atomic_read(&net->ipv4.rt_genid); -} +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)  #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { -	struct seq_net_private p; -	int bucket; -	int genid; -}; - -static struct rtable *rt_cache_get_first(struct seq_file *seq) -{ -	struct rt_cache_iter_state *st = seq->private; -	struct rtable *r = NULL; - -	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { -		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) -			continue; -		rcu_read_lock_bh(); -		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); -		while (r) { -			if (dev_net(r->dst.dev) == seq_file_net(seq) && -			    r->rt_genid == st->genid) -				return r; -			r = rcu_dereference_bh(r->dst.rt_next); -		} -		rcu_read_unlock_bh(); -	} -	return r; -} - -static struct rtable *__rt_cache_get_next(struct seq_file *seq, -					  struct rtable *r) -{ -	struct rt_cache_iter_state *st = seq->private; - -	r = rcu_dereference_bh(r->dst.rt_next); -	while (!r) { -		rcu_read_unlock_bh(); -		do { -			if (--st->bucket < 0) -				return NULL; -		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); -		rcu_read_lock_bh(); -		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); -	} -	return r; -} - -static struct rtable *rt_cache_get_next(struct seq_file *seq, -					struct rtable *r) -{ -	struct rt_cache_iter_state *st = seq->private; -	while ((r = __rt_cache_get_next(seq, r)) != NULL) { -		if (dev_net(r->dst.dev) != seq_file_net(seq)) -			continue; -		if (r->rt_genid == st->genid) -			break; -	} -	return r; -} - -static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) -{ -	struct rtable *r = rt_cache_get_first(seq); - -	if (r) -		while (pos && (r = rt_cache_get_next(seq, r))) -			--pos; -	return pos ? 
NULL : r; -} -  static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)  { -	struct rt_cache_iter_state *st = seq->private;  	if (*pos) -		return rt_cache_get_idx(seq, *pos - 1); -	st->genid = rt_genid(seq_file_net(seq)); +		return NULL;  	return SEQ_START_TOKEN;  }  static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)  { -	struct rtable *r; - -	if (v == SEQ_START_TOKEN) -		r = rt_cache_get_first(seq); -	else -		r = rt_cache_get_next(seq, v);  	++*pos; -	return r; +	return NULL;  }  static void rt_cache_seq_stop(struct seq_file *seq, void *v)  { -	if (v && v != SEQ_START_TOKEN) -		rcu_read_unlock_bh();  }  static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -372,30 +216,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)  			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"  			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"  			   "HHUptod\tSpecDst"); -	else { -		struct rtable *r = v; -		int len; - -		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" -			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", -			r->dst.dev ? r->dst.dev->name : "*", -			(__force u32)r->rt_dst, -			(__force u32)r->rt_gateway, -			r->rt_flags, atomic_read(&r->dst.__refcnt), -			r->dst.__use, 0, (__force u32)r->rt_src, -			(dst_metric(&r->dst, RTAX_ADVMSS) ? -			     (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0), -			dst_metric(&r->dst, RTAX_WINDOW), -			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + -			      dst_metric(&r->dst, RTAX_RTTVAR)), -			r->fl.fl4_tos, -			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, -			r->dst.hh ? (r->dst.hh->hh_output == -				       dev_queue_xmit) : 0, -			r->rt_spec_dst, &len); - -		seq_printf(seq, "%*s\n", 127 - len, ""); -	}  	return 0;  } @@ -408,8 +228,7 @@ static const struct seq_operations rt_cache_seq_ops = {  static int rt_cache_seq_open(struct inode *inode, struct file *file)  { -	return seq_open_net(inode, file, &rt_cache_seq_ops, -			sizeof(struct rt_cache_iter_state)); +	return seq_open(file, &rt_cache_seq_ops);  }  static const struct file_operations rt_cache_seq_fops = { @@ -417,7 +236,7 @@ static const struct file_operations rt_cache_seq_fops = {  	.open	 = rt_cache_seq_open,  	.read	 = seq_read,  	.llseek	 = seq_lseek, -	.release = seq_release_net, +	.release = seq_release,  }; @@ -468,7 +287,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)  	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "  		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",  		   dst_entries_get_slow(&ipv4_dst_ops), -		   st->in_hit, +		   0, /* st->in_hit */  		   st->in_slow_tot,  		   st->in_slow_mc,  		   st->in_no_route, @@ -476,16 +295,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)  		   st->in_martian_dst,  		   st->in_martian_src, -		   st->out_hit, +		   0, /* st->out_hit */  		   st->out_slow_tot,  		   st->out_slow_mc, -		   st->gc_total, -		   st->gc_ignored, -		   st->gc_goal_miss, -		   st->gc_dst_overflow, -		   st->in_hlist_search, -		   st->out_hlist_search +		   0, /* st->gc_total */ +		   0, /* st->gc_ignored */ +		   0, /* st->gc_goal_miss */ +		   0, /* st->gc_dst_overflow */ +		   0, /* st->in_hlist_search */ +		   0  /* st->out_hlist_search */  		);  	return 0;  } @@ -511,7 +330,7 @@ static const struct file_operations rt_cpu_seq_fops = {  	.release = seq_release,  }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  static int rt_acct_proc_show(struct seq_file *m, void *v)  {  	struct ip_rt_acct *dst, *src; @@ -554,8 +373,8 @@ static int 
__net_init ip_rt_do_proc_init(struct net *net)  {  	struct proc_dir_entry *pde; -	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, -			&rt_cache_seq_fops); +	pde = proc_create("rt_cache", S_IRUGO, net->proc_net, +			  &rt_cache_seq_fops);  	if (!pde)  		goto err1; @@ -564,14 +383,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)  	if (!pde)  		goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);  	if (!pde)  		goto err3;  #endif  	return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  err3:  	remove_proc_entry("rt_cache", net->proc_net_stat);  #endif @@ -585,7 +404,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)  {  	remove_proc_entry("rt_cache", net->proc_net_stat);  	remove_proc_entry("rt_cache", net->proc_net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	remove_proc_entry("rt_acct", net->proc_net);  #endif  } @@ -607,770 +426,306 @@ static inline int ip_rt_proc_init(void)  }  #endif /* CONFIG_PROC_FS */ -static inline void rt_free(struct rtable *rt) +static inline bool rt_is_expired(const struct rtable *rth)  { -	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); +	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));  } -static inline void rt_drop(struct rtable *rt) +void rt_cache_flush(struct net *net)  { -	ip_rt_put(rt); -	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); +	rt_genid_bump_ipv4(net);  } -static inline int rt_fast_clean(struct rtable *rth) +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, +					   struct sk_buff *skb, +					   const void *daddr)  { -	/* Kill broadcast/multicast entries very aggresively, if they -	   collide in hash table with more useful entries */ -	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && -		rt_is_input_route(rth) && rth->dst.rt_next; -} - -static inline int rt_valuable(struct rtable *rth) -{ -	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || -		rth->dst.expires; -} - -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) -{ -	unsigned long age; -	int ret = 0; +	struct net_device *dev = dst->dev; +	const __be32 *pkey = daddr; +	const struct rtable *rt; +	struct neighbour *n; -	if (atomic_read(&rth->dst.__refcnt)) -		goto out; +	rt = (const struct rtable *) dst; +	if (rt->rt_gateway) +		pkey = (const __be32 *) &rt->rt_gateway; +	else if (skb) +		pkey = &ip_hdr(skb)->daddr; -	ret = 1; -	if (rth->dst.expires && -	    time_after_eq(jiffies, rth->dst.expires)) -		goto out; - -	age = jiffies - rth->dst.lastuse; -	ret = 0; -	if ((age <= tmo1 && !rt_fast_clean(rth)) || -	    (age <= tmo2 && rt_valuable(rth))) -		goto out; -	ret = 1; -out:	return ret; +	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); +	if (n) +		return n; +	return neigh_create(&arp_tbl, pkey, dev);  } -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter - */ -static inline u32 rt_score(struct rtable *rt) -{ -	u32 score = jiffies - rt->dst.lastuse; - -	score = ~score & ~(3<<30); - -	if (rt_valuable(rt)) -		score |= (1<<31); - -	if (rt_is_output_route(rt) || -	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) -		score |= (1<<30); - -	return score; -} +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { +	atomic_t	id; +	u32		stamp32; +}; -static inline bool rt_caching(const struct net *net) -{ -	return net->ipv4.current_rt_cache_rebuild_count <= -		net->ipv4.sysctl_rt_cache_rebuild_count; -} 
+static struct ip_ident_bucket *ip_idents __read_mostly; -static inline bool compare_hash_inputs(const struct flowi *fl1, -					const struct flowi *fl2) +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. + */ +u32 ip_idents_reserve(u32 hash, int segs)  { -	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | -		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | -		(fl1->iif ^ fl2->iif)) == 0); -} +	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; +	u32 old = ACCESS_ONCE(bucket->stamp32); +	u32 now = (u32)jiffies; +	u32 delta = 0; -static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) -{ -	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | -		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | -		(fl1->mark ^ fl2->mark) | -		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) | -		(fl1->oif ^ fl2->oif) | -		(fl1->iif ^ fl2->iif)) == 0; -} +	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) +		delta = prandom_u32_max(now - old); -static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) -{ -	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); +	return atomic_add_return(segs + delta, &bucket->id) - segs;  } +EXPORT_SYMBOL(ip_idents_reserve); -static inline int rt_is_expired(struct rtable *rth) +void __ip_select_ident(struct iphdr *iph, int segs)  { -	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); -} +	static u32 ip_idents_hashrnd __read_mostly; +	u32 hash, id; -/* - * Perform a full scan of hash table and free all entries. - * Can be called by a softirq or a process. - * In the later case, we want to be reschedule if necessary - */ -static void rt_do_flush(int process_context) -{ -	unsigned int i; -	struct rtable *rth, *next; -	struct rtable * tail; - -	for (i = 0; i <= rt_hash_mask; i++) { -		if (process_context && need_resched()) -			cond_resched(); -		rth = rcu_dereference_raw(rt_hash_table[i].chain); -		if (!rth) -			continue; +	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); -		spin_lock_bh(rt_hash_lock_addr(i)); -#ifdef CONFIG_NET_NS -		{ -		struct rtable __rcu **prev; -		struct rtable *p; - -		rth = rcu_dereference_protected(rt_hash_table[i].chain, -			lockdep_is_held(rt_hash_lock_addr(i))); - -		/* defer releasing the head of the list after spin_unlock */ -		for (tail = rth; tail; -		     tail = rcu_dereference_protected(tail->dst.rt_next, -				lockdep_is_held(rt_hash_lock_addr(i)))) -			if (!rt_is_expired(tail)) -				break; -		if (rth != tail) -			rt_hash_table[i].chain = tail; - -		/* call rt_free on entries after the tail requiring flush */ -		prev = &rt_hash_table[i].chain; -		for (p = rcu_dereference_protected(*prev, -				lockdep_is_held(rt_hash_lock_addr(i))); -		     p != NULL; -		     p = next) { -			next = rcu_dereference_protected(p->dst.rt_next, -				lockdep_is_held(rt_hash_lock_addr(i))); -			if (!rt_is_expired(p)) { -				prev = &p->dst.rt_next; -			} else { -				*prev = next; -				rt_free(p); -			} -		} -		} -#else -		rth = rcu_dereference_protected(rt_hash_table[i].chain, -			lockdep_is_held(rt_hash_lock_addr(i))); -		rcu_assign_pointer(rt_hash_table[i].chain, NULL); -		tail = NULL; -#endif -		spin_unlock_bh(rt_hash_lock_addr(i)); - -		for (; rth != tail; rth = next) { -			next = rcu_dereference_protected(rth->dst.rt_next, 1); -			rt_free(rth); -		} -	} +	hash = jhash_3words((__force 
u32)iph->daddr, +			    (__force u32)iph->saddr, +			    iph->protocol, +			    ip_idents_hashrnd); +	id = ip_idents_reserve(hash, segs); +	iph->id = htons(id);  } +EXPORT_SYMBOL(__ip_select_ident); -/* - * While freeing expired entries, we compute average chain length - * and standard deviation, using fixed-point arithmetic. - * This to have an estimation of rt_chain_length_max - *  rt_chain_length_max = max(elasticity, AVG + 4*SD) - * We use 3 bits for frational part, and 29 (or 61) for magnitude. - */ - -#define FRACT_BITS 3 -#define ONE (1UL << FRACT_BITS) - -/* - * Given a hash chain and an item in this hash chain, - * find if a previous entry has the same hash_inputs - * (but differs on tos, mark or oif) - * Returns 0 if an alias is found. - * Returns ONE if rth has no alias before itself. - */ -static int has_noalias(const struct rtable *head, const struct rtable *rth) +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, +			     const struct iphdr *iph, +			     int oif, u8 tos, +			     u8 prot, u32 mark, int flow_flags)  { -	const struct rtable *aux = head; +	if (sk) { +		const struct inet_sock *inet = inet_sk(sk); -	while (aux != rth) { -		if (compare_hash_inputs(&aux->fl, &rth->fl)) -			return 0; -		aux = rcu_dereference_protected(aux->dst.rt_next, 1); +		oif = sk->sk_bound_dev_if; +		mark = sk->sk_mark; +		tos = RT_CONN_FLAGS(sk); +		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;  	} -	return ONE; +	flowi4_init_output(fl4, oif, mark, tos, +			   RT_SCOPE_UNIVERSE, prot, +			   flow_flags, +			   iph->daddr, iph->saddr, 0, 0);  } -static void rt_check_expire(void) +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, +			       const struct sock *sk)  { -	static unsigned int rover; -	unsigned int i = rover, goal; -	struct rtable *rth; -	struct rtable __rcu **rthp; -	unsigned long samples = 0; -	unsigned long sum = 0, sum2 = 0; -	unsigned long delta; -	u64 mult; - -	delta = jiffies - expires_ljiffies; -	expires_ljiffies = jiffies; -	mult = ((u64)delta) << rt_hash_log; -	if (ip_rt_gc_timeout > 1) -		do_div(mult, ip_rt_gc_timeout); -	goal = (unsigned int)mult; -	if (goal > rt_hash_mask) -		goal = rt_hash_mask + 1; -	for (; goal > 0; goal--) { -		unsigned long tmo = ip_rt_gc_timeout; -		unsigned long length; - -		i = (i + 1) & rt_hash_mask; -		rthp = &rt_hash_table[i].chain; - -		if (need_resched()) -			cond_resched(); - -		samples++; - -		if (rcu_dereference_raw(*rthp) == NULL) -			continue; -		length = 0; -		spin_lock_bh(rt_hash_lock_addr(i)); -		while ((rth = rcu_dereference_protected(*rthp, -					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { -			prefetch(rth->dst.rt_next); -			if (rt_is_expired(rth)) { -				*rthp = rth->dst.rt_next; -				rt_free(rth); -				continue; -			} -			if (rth->dst.expires) { -				/* Entry is expired even if it is in use */ -				if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: -					tmo >>= 1; -					rthp = &rth->dst.rt_next; -					/* -					 * We only count entries on -					 * a chain with equal hash inputs once -					 * so that entries for different QOS -					 * levels, and other non-hash input -					 * attributes don't unfairly skew -					 * the length computation -					 */ -					length += has_noalias(rt_hash_table[i].chain, rth); -					continue; -				} -			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) -				goto nofree; +	const struct iphdr *iph = ip_hdr(skb); +	int oif = skb->dev->ifindex; +	u8 tos = RT_TOS(iph->tos); +	u8 prot = iph->protocol; +	u32 mark = skb->mark; -			/* 
Cleanup aged off entries. */ -			*rthp = rth->dst.rt_next; -			rt_free(rth); -		} -		spin_unlock_bh(rt_hash_lock_addr(i)); -		sum += length; -		sum2 += length*length; -	} -	if (samples) { -		unsigned long avg = sum / samples; -		unsigned long sd = int_sqrt(sum2 / samples - avg*avg); -		rt_chain_length_max = max_t(unsigned long, -					ip_rt_gc_elasticity, -					(avg + 4*sd) >> FRACT_BITS); -	} -	rover = i; +	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);  } -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)  { -	rt_check_expire(); -	schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} +	const struct inet_sock *inet = inet_sk(sk); +	const struct ip_options_rcu *inet_opt; +	__be32 daddr = inet->inet_daddr; -/* - * Pertubation of rt_genid by a small quantity [1..256] - * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() - * many times (2^24) without giving recent rt_genid. - * Jenkins hash is strong enough that litle changes of rt_genid are OK. - */ -static void rt_cache_invalidate(struct net *net) -{ -	unsigned char shuffle; - -	get_random_bytes(&shuffle, sizeof(shuffle)); -	atomic_add(shuffle + 1U, &net->ipv4.rt_genid); -} - -/* - * delay < 0  : invalidate cache (fast : entries will be deleted later) - * delay >= 0 : invalidate & flush cache (can be long) - */ -void rt_cache_flush(struct net *net, int delay) -{ -	rt_cache_invalidate(net); -	if (delay >= 0) -		rt_do_flush(!in_softirq()); +	rcu_read_lock(); +	inet_opt = rcu_dereference(inet->inet_opt); +	if (inet_opt && inet_opt->opt.srr) +		daddr = inet_opt->opt.faddr; +	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, +			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, +			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, +			   inet_sk_flowi_flags(sk), +			   daddr, inet->inet_saddr, 0, 0); +	rcu_read_unlock();  } -/* Flush previous cache invalidated entries from the cache */ -void rt_cache_flush_batch(void) +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, +				 const struct sk_buff *skb)  { -	rt_do_flush(!in_softirq()); +	if (skb) +		build_skb_flow_key(fl4, skb, sk); +	else +		build_sk_flow_key(fl4, sk);  } -static void rt_emergency_hash_rebuild(struct net *net) +static inline void rt_free(struct rtable *rt)  { -	if (net_ratelimit()) -		printk(KERN_WARNING "Route hash chain too long!\n"); -	rt_cache_invalidate(net); +	call_rcu(&rt->dst.rcu_head, dst_rcu_free);  } -/* -   Short description of GC goals. - -   We want to build algorithm, which will keep routing cache -   at some equilibrium point, when number of aged off entries -   is kept approximately equal to newly generated ones. - -   Current expiration strength is variable "expire". -   We try to adjust it dynamically, so that if networking -   is idle expires is large enough to keep enough of warm entries, -   and when load increases it reduces to limit cache size. 
- */ +static DEFINE_SPINLOCK(fnhe_lock); -static int rt_garbage_collect(struct dst_ops *ops) +static void fnhe_flush_routes(struct fib_nh_exception *fnhe)  { -	static unsigned long expire = RT_GC_TIMEOUT; -	static unsigned long last_gc; -	static int rover; -	static int equilibrium; -	struct rtable *rth; -	struct rtable __rcu **rthp; -	unsigned long now = jiffies; -	int goal; -	int entries = dst_entries_get_fast(&ipv4_dst_ops); - -	/* -	 * Garbage collection is pretty expensive, -	 * do not make it too frequently. -	 */ - -	RT_CACHE_STAT_INC(gc_total); +	struct rtable *rt; -	if (now - last_gc < ip_rt_gc_min_interval && -	    entries < ip_rt_max_size) { -		RT_CACHE_STAT_INC(gc_ignored); -		goto out; +	rt = rcu_dereference(fnhe->fnhe_rth_input); +	if (rt) { +		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); +		rt_free(rt);  	} - -	entries = dst_entries_get_slow(&ipv4_dst_ops); -	/* Calculate number of entries, which we want to expire now. */ -	goal = entries - (ip_rt_gc_elasticity << rt_hash_log); -	if (goal <= 0) { -		if (equilibrium < ipv4_dst_ops.gc_thresh) -			equilibrium = ipv4_dst_ops.gc_thresh; -		goal = entries - equilibrium; -		if (goal > 0) { -			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); -			goal = entries - equilibrium; -		} -	} else { -		/* We are in dangerous area. Try to reduce cache really -		 * aggressively. -		 */ -		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); -		equilibrium = entries - goal; +	rt = rcu_dereference(fnhe->fnhe_rth_output); +	if (rt) { +		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); +		rt_free(rt);  	} +} -	if (now - last_gc >= ip_rt_gc_min_interval) -		last_gc = now; +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +{ +	struct fib_nh_exception *fnhe, *oldest; -	if (goal <= 0) { -		equilibrium += goal; -		goto work_done; +	oldest = rcu_dereference(hash->chain); +	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; +	     fnhe = rcu_dereference(fnhe->fnhe_next)) { +		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) +			oldest = fnhe;  	} +	fnhe_flush_routes(oldest); +	return oldest; +} -	do { -		int i, k; - -		for (i = rt_hash_mask, k = rover; i >= 0; i--) { -			unsigned long tmo = expire; - -			k = (k + 1) & rt_hash_mask; -			rthp = &rt_hash_table[k].chain; -			spin_lock_bh(rt_hash_lock_addr(k)); -			while ((rth = rcu_dereference_protected(*rthp, -					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { -				if (!rt_is_expired(rth) && -					!rt_may_expire(rth, tmo, expire)) { -					tmo >>= 1; -					rthp = &rth->dst.rt_next; -					continue; -				} -				*rthp = rth->dst.rt_next; -				rt_free(rth); -				goal--; -			} -			spin_unlock_bh(rt_hash_lock_addr(k)); -			if (goal <= 0) -				break; -		} -		rover = k; - -		if (goal <= 0) -			goto work_done; - -		/* Goal is not achieved. We stop process if: - -		   - if expire reduced to zero. Otherwise, expire is halfed. -		   - if table is not full. -		   - if we are called from interrupt. -		   - jiffies check is just fallback/debug loop breaker. -		     We will not spin here for long time in any case. 
-		 */ - -		RT_CACHE_STAT_INC(gc_goal_miss); - -		if (expire == 0) -			break; - -		expire >>= 1; -#if RT_CACHE_DEBUG >= 2 -		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, -				dst_entries_get_fast(&ipv4_dst_ops), goal, i); -#endif +static inline u32 fnhe_hashfun(__be32 daddr) +{ +	u32 hval; -		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) -			goto out; -	} while (!in_softirq() && time_before_eq(jiffies, now)); +	hval = (__force u32) daddr; +	hval ^= (hval >> 11) ^ (hval >> 22); -	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) -		goto out; -	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) -		goto out; -	if (net_ratelimit()) -		printk(KERN_WARNING "dst cache overflow\n"); -	RT_CACHE_STAT_INC(gc_dst_overflow); -	return 1; - -work_done: -	expire += ip_rt_gc_min_interval; -	if (expire > ip_rt_gc_timeout || -	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || -	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) -		expire = ip_rt_gc_timeout; -#if RT_CACHE_DEBUG >= 2 -	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, -			dst_entries_get_fast(&ipv4_dst_ops), goal, rover); -#endif -out:	return 0; +	return hval & (FNHE_HASH_SIZE - 1);  } -/* - * Returns number of entries in a hash chain that have different hash_inputs - */ -static int slow_chain_length(const struct rtable *head) +static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)  { -	int length = 0; -	const struct rtable *rth = head; +	rt->rt_pmtu = fnhe->fnhe_pmtu; +	rt->dst.expires = fnhe->fnhe_expires; -	while (rth) { -		length += has_noalias(head, rth); -		rth = rcu_dereference_protected(rth->dst.rt_next, 1); +	if (fnhe->fnhe_gw) { +		rt->rt_flags |= RTCF_REDIRECTED; +		rt->rt_gateway = fnhe->fnhe_gw; +		rt->rt_uses_gateway = 1;  	} -	return length >> FRACT_BITS;  } -static int rt_intern_hash(unsigned hash, struct rtable *rt, -			  struct rtable **rp, struct sk_buff *skb, int ifindex) +static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, +				  u32 pmtu, unsigned long expires)  { -	struct rtable	*rth, *cand; -	struct rtable __rcu **rthp, **candp; -	unsigned long	now; -	u32 		min_score; -	int		chain_length; -	int attempts = !in_softirq(); - -restart: -	chain_length = 0; -	min_score = ~(u32)0; -	cand = NULL; -	candp = NULL; -	now = jiffies; +	struct fnhe_hash_bucket *hash; +	struct fib_nh_exception *fnhe; +	struct rtable *rt; +	unsigned int i; +	int depth; +	u32 hval = fnhe_hashfun(daddr); -	if (!rt_caching(dev_net(rt->dst.dev))) { -		/* -		 * If we're not caching, just tell the caller we -		 * were successful and don't touch the route.  The -		 * caller hold the sole reference to the cache entry, and -		 * it will be released when the caller is done with it. -		 * If we drop it here, the callers have no way to resolve routes -		 * when we're not caching.  Instead, just point *rp at rt, so -		 * the caller gets a single use out of the route -		 * Note that we do rt_free on this new route entry, so that -		 * once its refcount hits zero, we are still able to reap it -		 * (Thanks Alexey) -		 * Note: To avoid expensive rcu stuff for this uncached dst, -		 * we set DST_NOCACHE so that dst_release() can free dst without -		 * waiting a grace period. 
-		 */ +	spin_lock_bh(&fnhe_lock); -		rt->dst.flags |= DST_NOCACHE; -		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { -			int err = arp_bind_neighbour(&rt->dst); -			if (err) { -				if (net_ratelimit()) -					printk(KERN_WARNING -					    "Neighbour table failure & not caching routes.\n"); -				ip_rt_put(rt); -				return err; -			} -		} - -		goto skip_hashing; +	hash = nh->nh_exceptions; +	if (!hash) { +		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); +		if (!hash) +			goto out_unlock; +		nh->nh_exceptions = hash;  	} -	rthp = &rt_hash_table[hash].chain; - -	spin_lock_bh(rt_hash_lock_addr(hash)); -	while ((rth = rcu_dereference_protected(*rthp, -			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { -		if (rt_is_expired(rth)) { -			*rthp = rth->dst.rt_next; -			rt_free(rth); -			continue; -		} -		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { -			/* Put it first */ -			*rthp = rth->dst.rt_next; -			/* -			 * Since lookup is lockfree, the deletion -			 * must be visible to another weakly ordered CPU before -			 * the insertion at the start of the hash chain. -			 */ -			rcu_assign_pointer(rth->dst.rt_next, -					   rt_hash_table[hash].chain); -			/* -			 * Since lookup is lockfree, the update writes -			 * must be ordered for consistency on SMP. -			 */ -			rcu_assign_pointer(rt_hash_table[hash].chain, rth); - -			dst_use(&rth->dst, now); -			spin_unlock_bh(rt_hash_lock_addr(hash)); - -			rt_drop(rt); -			if (rp) -				*rp = rth; -			else -				skb_dst_set(skb, &rth->dst); -			return 0; -		} +	hash += hval; -		if (!atomic_read(&rth->dst.__refcnt)) { -			u32 score = rt_score(rth); - -			if (score <= min_score) { -				cand = rth; -				candp = rthp; -				min_score = score; -			} -		} - -		chain_length++; - -		rthp = &rth->dst.rt_next; +	depth = 0; +	for (fnhe = rcu_dereference(hash->chain); fnhe; +	     fnhe = rcu_dereference(fnhe->fnhe_next)) { +		if (fnhe->fnhe_daddr == daddr) +			break; +		depth++;  	} -	if (cand) { -		/* ip_rt_gc_elasticity used to be average length of chain -		 * length, when exceeded gc becomes really aggressive. -		 * -		 * The second limit is less certain. At the moment it allows -		 * only 2 entries per bucket. We will see. 
-		 */ -		if (chain_length > ip_rt_gc_elasticity) { -			*candp = cand->dst.rt_next; -			rt_free(cand); +	if (fnhe) { +		if (gw) +			fnhe->fnhe_gw = gw; +		if (pmtu) { +			fnhe->fnhe_pmtu = pmtu; +			fnhe->fnhe_expires = max(1UL, expires);  		} +		/* Update all cached dsts too */ +		rt = rcu_dereference(fnhe->fnhe_rth_input); +		if (rt) +			fill_route_from_fnhe(rt, fnhe); +		rt = rcu_dereference(fnhe->fnhe_rth_output); +		if (rt) +			fill_route_from_fnhe(rt, fnhe);  	} else { -		if (chain_length > rt_chain_length_max && -		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { -			struct net *net = dev_net(rt->dst.dev); -			int num = ++net->ipv4.current_rt_cache_rebuild_count; -			if (!rt_caching(net)) { -				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", -					rt->dst.dev->name, num); -			} -			rt_emergency_hash_rebuild(net); -			spin_unlock_bh(rt_hash_lock_addr(hash)); - -			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, -					ifindex, rt_genid(net)); -			goto restart; +		if (depth > FNHE_RECLAIM_DEPTH) +			fnhe = fnhe_oldest(hash); +		else { +			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); +			if (!fnhe) +				goto out_unlock; + +			fnhe->fnhe_next = hash->chain; +			rcu_assign_pointer(hash->chain, fnhe);  		} -	} - -	/* Try to bind route to arp only if it is output -	   route or unicast forwarding path. -	 */ -	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { -		int err = arp_bind_neighbour(&rt->dst); -		if (err) { -			spin_unlock_bh(rt_hash_lock_addr(hash)); - -			if (err != -ENOBUFS) { -				rt_drop(rt); -				return err; -			} - -			/* Neighbour tables are full and nothing -			   can be released. Try to shrink route cache, -			   it is most likely it holds some neighbour records. -			 */ -			if (attempts-- > 0) { -				int saved_elasticity = ip_rt_gc_elasticity; -				int saved_int = ip_rt_gc_min_interval; -				ip_rt_gc_elasticity	= 1; -				ip_rt_gc_min_interval	= 0; -				rt_garbage_collect(&ipv4_dst_ops); -				ip_rt_gc_min_interval	= saved_int; -				ip_rt_gc_elasticity	= saved_elasticity; -				goto restart; -			} - -			if (net_ratelimit()) -				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); -			rt_drop(rt); -			return -ENOBUFS; +		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); +		fnhe->fnhe_daddr = daddr; +		fnhe->fnhe_gw = gw; +		fnhe->fnhe_pmtu = pmtu; +		fnhe->fnhe_expires = expires; + +		/* Exception created; mark the cached routes for the nexthop +		 * stale, so anyone caching it rechecks if this exception +		 * applies to them. +		 */ +		rt = rcu_dereference(nh->nh_rth_input); +		if (rt) +			rt->dst.obsolete = DST_OBSOLETE_KILL; + +		for_each_possible_cpu(i) { +			struct rtable __rcu **prt; +			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); +			rt = rcu_dereference(*prt); +			if (rt) +				rt->dst.obsolete = DST_OBSOLETE_KILL;  		}  	} -	rt->dst.rt_next = rt_hash_table[hash].chain; - -#if RT_CACHE_DEBUG >= 2 -	if (rt->dst.rt_next) { -		struct rtable *trt; -		printk(KERN_DEBUG "rt_cache @%02x: %pI4", -		       hash, &rt->rt_dst); -		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next) -			printk(" . %pI4", &trt->rt_dst); -		printk("\n"); -	} -#endif -	/* -	 * Since lookup is lockfree, we must make sure -	 * previous writes to rt are comitted to memory -	 * before making rt visible to other CPUS. 
-	 */ -	rcu_assign_pointer(rt_hash_table[hash].chain, rt); - -	spin_unlock_bh(rt_hash_lock_addr(hash)); - -skip_hashing: -	if (rp) -		*rp = rt; -	else -		skb_dst_set(skb, &rt->dst); -	return 0; -} - -void rt_bind_peer(struct rtable *rt, int create) -{ -	struct inet_peer *peer; - -	peer = inet_getpeer_v4(rt->rt_dst, create); +	fnhe->fnhe_stamp = jiffies; -	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) -		inet_putpeer(peer); +out_unlock: +	spin_unlock_bh(&fnhe_lock);  } -/* - * Peer allocation may fail only in serious out-of-memory conditions.  However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, +			     bool kill_route)  { -	static DEFINE_SPINLOCK(ip_fb_id_lock); -	static u32 ip_fallback_id; -	u32 salt; - -	spin_lock_bh(&ip_fb_id_lock); -	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); -	iph->id = htons(salt & 0xFFFF); -	ip_fallback_id = salt; -	spin_unlock_bh(&ip_fb_id_lock); -} - -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) -{ -	struct rtable *rt = (struct rtable *) dst; - -	if (rt) { -		if (rt->peer == NULL) -			rt_bind_peer(rt, 1); - -		/* If peer is attached to destination, it is never detached, -		   so that we need not to grab a lock to dereference it. -		 */ -		if (rt->peer) { -			iph->id = htons(inet_getid(rt->peer, more)); -			return; -		} -	} else -		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", -		       __builtin_return_address(0)); - -	ip_select_fb_ident(iph); -} -EXPORT_SYMBOL(__ip_select_ident); +	__be32 new_gw = icmp_hdr(skb)->un.gateway; +	__be32 old_gw = ip_hdr(skb)->saddr; +	struct net_device *dev = skb->dev; +	struct in_device *in_dev; +	struct fib_result res; +	struct neighbour *n; +	struct net *net; -static void rt_del(unsigned hash, struct rtable *rt) -{ -	struct rtable __rcu **rthp; -	struct rtable *aux; +	switch (icmp_hdr(skb)->code & 7) { +	case ICMP_REDIR_NET: +	case ICMP_REDIR_NETTOS: +	case ICMP_REDIR_HOST: +	case ICMP_REDIR_HOSTTOS: +		break; -	rthp = &rt_hash_table[hash].chain; -	spin_lock_bh(rt_hash_lock_addr(hash)); -	ip_rt_put(rt); -	while ((aux = rcu_dereference_protected(*rthp, -			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { -		if (aux == rt || rt_is_expired(aux)) { -			*rthp = aux->dst.rt_next; -			rt_free(aux); -			continue; -		} -		rthp = &aux->dst.rt_next; +	default: +		return;  	} -	spin_unlock_bh(rt_hash_lock_addr(hash)); -} -/* called in rcu_read_lock() section */ -void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, -		    __be32 saddr, struct net_device *dev) -{ -	int i, k; -	struct in_device *in_dev = __in_dev_get_rcu(dev); -	struct rtable *rth; -	struct rtable __rcu **rthp; -	__be32  skeys[2] = { saddr, 0 }; -	int  ikeys[2] = { dev->ifindex, 0 }; -	struct netevent_redirect netevent; -	struct net *net; +	if (rt->rt_gateway != old_gw) +		return; +	in_dev = __in_dev_get_rcu(dev);  	if (!in_dev)  		return; @@ -1380,9 +735,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,  	    ipv4_is_zeronet(new_gw))  		goto reject_redirect; -	if (!rt_caching(net)) -		goto reject_redirect; -  	if (!IN_DEV_SHARED_MEDIA(in_dev)) {  		if (!inet_addr_onlink(in_dev, new_gw, old_gw))  			goto reject_redirect; @@ -1393,105 +745,57 @@ void 
ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,  			goto reject_redirect;  	} -	for (i = 0; i < 2; i++) { -		for (k = 0; k < 2; k++) { -			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], -						rt_genid(net)); - -			rthp = &rt_hash_table[hash].chain; - -			while ((rth = rcu_dereference(*rthp)) != NULL) { -				struct rtable *rt; - -				if (rth->fl.fl4_dst != daddr || -				    rth->fl.fl4_src != skeys[i] || -				    rth->fl.oif != ikeys[k] || -				    rt_is_input_route(rth) || -				    rt_is_expired(rth) || -				    !net_eq(dev_net(rth->dst.dev), net)) { -					rthp = &rth->dst.rt_next; -					continue; -				} - -				if (rth->rt_dst != daddr || -				    rth->rt_src != saddr || -				    rth->dst.error || -				    rth->rt_gateway != old_gw || -				    rth->dst.dev != dev) -					break; - -				dst_hold(&rth->dst); - -				rt = dst_alloc(&ipv4_dst_ops); -				if (rt == NULL) { -					ip_rt_put(rth); -					return; -				} +	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); +	if (n) { +		if (!(n->nud_state & NUD_VALID)) { +			neigh_event_send(n, NULL); +		} else { +			if (fib_lookup(net, fl4, &res) == 0) { +				struct fib_nh *nh = &FIB_RES_NH(res); -				/* Copy all the information. */ -				*rt = *rth; -				rt->dst.__use		= 1; -				atomic_set(&rt->dst.__refcnt, 1); -				rt->dst.child		= NULL; -				if (rt->dst.dev) -					dev_hold(rt->dst.dev); -				rt->dst.obsolete	= -1; -				rt->dst.lastuse	= jiffies; -				rt->dst.path		= &rt->dst; -				rt->dst.neighbour	= NULL; -				rt->dst.hh		= NULL; -#ifdef CONFIG_XFRM -				rt->dst.xfrm		= NULL; -#endif -				rt->rt_genid		= rt_genid(net); -				rt->rt_flags		|= RTCF_REDIRECTED; - -				/* Gateway is different ... */ -				rt->rt_gateway		= new_gw; - -				/* Redirect received -> path was valid */ -				dst_confirm(&rth->dst); - -				if (rt->peer) -					atomic_inc(&rt->peer->refcnt); - -				if (arp_bind_neighbour(&rt->dst) || -				    !(rt->dst.neighbour->nud_state & -					    NUD_VALID)) { -					if (rt->dst.neighbour) -						neigh_event_send(rt->dst.neighbour, NULL); -					ip_rt_put(rth); -					rt_drop(rt); -					goto do_next; -				} - -				netevent.old = &rth->dst; -				netevent.new = &rt->dst; -				call_netevent_notifiers(NETEVENT_REDIRECT, -							&netevent); - -				rt_del(hash, rth); -				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) -					ip_rt_put(rt); -				goto do_next; +				update_or_create_fnhe(nh, fl4->daddr, new_gw, +						      0, 0);  			} -		do_next: -			; +			if (kill_route) +				rt->dst.obsolete = DST_OBSOLETE_KILL; +			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);  		} +		neigh_release(n);  	}  	return;  reject_redirect:  #ifdef CONFIG_IP_ROUTE_VERBOSE -	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) -		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" -			"  Advised path = %pI4 -> %pI4\n", -		       &old_gw, dev->name, &new_gw, -		       &saddr, &daddr); +	if (IN_DEV_LOG_MARTIANS(in_dev)) { +		const struct iphdr *iph = (const struct iphdr *) skb->data; +		__be32 daddr = iph->daddr; +		__be32 saddr = iph->saddr; + +		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" +				     "  Advised path = %pI4 -> %pI4\n", +				     &old_gw, dev->name, &new_gw, +				     &saddr, &daddr); +	}  #endif  	;  } +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) +{ +	struct rtable *rt; +	struct flowi4 fl4; +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	int oif = skb->dev->ifindex; +	u8 tos = RT_TOS(iph->tos); +	u8 prot = iph->protocol; +	u32 mark = 
skb->mark; + +	rt = (struct rtable *) dst; + +	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); +	__ip_do_redirect(rt, skb, &fl4, true); +} +  static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)  {  	struct rtable *rt = (struct rtable *)dst; @@ -1502,16 +806,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)  			ip_rt_put(rt);  			ret = NULL;  		} else if ((rt->rt_flags & RTCF_REDIRECTED) || -			   (rt->dst.expires && -			    time_after_eq(jiffies, rt->dst.expires))) { -			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, -						rt->fl.oif, -						rt_genid(dev_net(dst->dev))); -#if RT_CACHE_DEBUG >= 1 -			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", -				&rt->rt_dst, rt->fl.fl4_tos); -#endif -			rt_del(hash, rt); +			   rt->dst.expires) { +			ip_rt_put(rt);  			ret = NULL;  		}  	} @@ -1538,6 +834,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)  {  	struct rtable *rt = skb_rtable(skb);  	struct in_device *in_dev; +	struct inet_peer *peer; +	struct net *net;  	int log_martians;  	rcu_read_lock(); @@ -1549,192 +847,291 @@ void ip_rt_send_redirect(struct sk_buff *skb)  	log_martians = IN_DEV_LOG_MARTIANS(in_dev);  	rcu_read_unlock(); +	net = dev_net(rt->dst.dev); +	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); +	if (!peer) { +		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, +			  rt_nexthop(rt, ip_hdr(skb)->daddr)); +		return; +	} +  	/* No redirected packets during ip_rt_redirect_silence;  	 * reset the algorithm.  	 */ -	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) -		rt->dst.rate_tokens = 0; +	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) +		peer->rate_tokens = 0;  	/* Too many ignored redirects; do not send anything  	 * set dst.rate_last to the last seen redirected packet.  	 */ -	if (rt->dst.rate_tokens >= ip_rt_redirect_number) { -		rt->dst.rate_last = jiffies; -		return; +	if (peer->rate_tokens >= ip_rt_redirect_number) { +		peer->rate_last = jiffies; +		goto out_put_peer;  	}  	/* Check for load limit; set rate_last to the latest sent  	 * redirect.  	 
*/ -	if (rt->dst.rate_tokens == 0 || +	if (peer->rate_tokens == 0 ||  	    time_after(jiffies, -		       (rt->dst.rate_last + -			(ip_rt_redirect_load << rt->dst.rate_tokens)))) { -		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); -		rt->dst.rate_last = jiffies; -		++rt->dst.rate_tokens; +		       (peer->rate_last + +			(ip_rt_redirect_load << peer->rate_tokens)))) { +		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + +		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); +		peer->rate_last = jiffies; +		++peer->rate_tokens;  #ifdef CONFIG_IP_ROUTE_VERBOSE  		if (log_martians && -		    rt->dst.rate_tokens == ip_rt_redirect_number && -		    net_ratelimit()) -			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", -				&rt->rt_src, rt->rt_iif, -				&rt->rt_dst, &rt->rt_gateway); +		    peer->rate_tokens == ip_rt_redirect_number) +			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", +					     &ip_hdr(skb)->saddr, inet_iif(skb), +					     &ip_hdr(skb)->daddr, &gw);  #endif  	} +out_put_peer: +	inet_putpeer(peer);  }  static int ip_error(struct sk_buff *skb)  { +	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);  	struct rtable *rt = skb_rtable(skb); +	struct inet_peer *peer;  	unsigned long now; +	struct net *net; +	bool send;  	int code; -	switch (rt->dst.error) { -		case EINVAL: -		default: -			goto out; +	net = dev_net(rt->dst.dev); +	if (!IN_DEV_FORWARD(in_dev)) { +		switch (rt->dst.error) {  		case EHOSTUNREACH: -			code = ICMP_HOST_UNREACH; +			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);  			break; +  		case ENETUNREACH: -			code = ICMP_NET_UNREACH; -			IP_INC_STATS_BH(dev_net(rt->dst.dev), -					IPSTATS_MIB_INNOROUTES); -			break; -		case EACCES: -			code = ICMP_PKT_FILTERED; +			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);  			break; +		} +		goto out;  	} -	now = jiffies; -	rt->dst.rate_tokens += now - rt->dst.rate_last; -	if (rt->dst.rate_tokens > ip_rt_error_burst) -		rt->dst.rate_tokens = ip_rt_error_burst; -	rt->dst.rate_last = now; -	if (rt->dst.rate_tokens >= ip_rt_error_cost) { -		rt->dst.rate_tokens -= ip_rt_error_cost; -		icmp_send(skb, ICMP_DEST_UNREACH, code, 0); +	switch (rt->dst.error) { +	case EINVAL: +	default: +		goto out; +	case EHOSTUNREACH: +		code = ICMP_HOST_UNREACH; +		break; +	case ENETUNREACH: +		code = ICMP_NET_UNREACH; +		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); +		break; +	case EACCES: +		code = ICMP_PKT_FILTERED; +		break; +	} + +	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + +	send = true; +	if (peer) { +		now = jiffies; +		peer->rate_tokens += now - peer->rate_last; +		if (peer->rate_tokens > ip_rt_error_burst) +			peer->rate_tokens = ip_rt_error_burst; +		peer->rate_last = now; +		if (peer->rate_tokens >= ip_rt_error_cost) +			peer->rate_tokens -= ip_rt_error_cost; +		else +			send = false; +		inet_putpeer(peer);  	} +	if (send) +		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);  out:	kfree_skb(skb);  	return 0;  } -/* - *	The last two values are not from the RFC but - *	are needed for AMPRnet AX.25 paths. 
- */ +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +{ +	struct dst_entry *dst = &rt->dst; +	struct fib_result res; + +	if (dst_metric_locked(dst, RTAX_MTU)) +		return; + +	if (dst->dev->mtu < mtu) +		return; -static const unsigned short mtu_plateau[] = -{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; +	if (mtu < ip_rt_min_pmtu) +		mtu = ip_rt_min_pmtu; -static inline unsigned short guess_mtu(unsigned short old_mtu) +	if (rt->rt_pmtu == mtu && +	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) +		return; + +	rcu_read_lock(); +	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { +		struct fib_nh *nh = &FIB_RES_NH(res); + +		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, +				      jiffies + ip_rt_mtu_expires); +	} +	rcu_read_unlock(); +} + +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, +			      struct sk_buff *skb, u32 mtu)  { -	int i; +	struct rtable *rt = (struct rtable *) dst; +	struct flowi4 fl4; -	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) -		if (old_mtu > mtu_plateau[i]) -			return mtu_plateau[i]; -	return 68; +	ip_rt_build_flow_key(&fl4, sk, skb); +	__ip_rt_update_pmtu(rt, &fl4, mtu);  } -unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, -				 unsigned short new_mtu, -				 struct net_device *dev) +void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, +		      int oif, u32 mark, u8 protocol, int flow_flags)  { -	int i, k; -	unsigned short old_mtu = ntohs(iph->tot_len); -	struct rtable *rth; -	int  ikeys[2] = { dev->ifindex, 0 }; -	__be32  skeys[2] = { iph->saddr, 0, }; -	__be32  daddr = iph->daddr; -	unsigned short est_mtu = 0; - -	for (k = 0; k < 2; k++) { -		for (i = 0; i < 2; i++) { -			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], -						rt_genid(net)); - -			rcu_read_lock(); -			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; -			     rth = rcu_dereference(rth->dst.rt_next)) { -				unsigned short mtu = new_mtu; - -				if (rth->fl.fl4_dst != daddr || -				    rth->fl.fl4_src != skeys[i] || -				    rth->rt_dst != daddr || -				    rth->rt_src != iph->saddr || -				    rth->fl.oif != ikeys[k] || -				    rt_is_input_route(rth) || -				    dst_metric_locked(&rth->dst, RTAX_MTU) || -				    !net_eq(dev_net(rth->dst.dev), net) || -				    rt_is_expired(rth)) -					continue; - -				if (new_mtu < 68 || new_mtu >= old_mtu) { - -					/* BSD 4.2 compatibility hack :-( */ -					if (mtu == 0 && -					    old_mtu >= dst_mtu(&rth->dst) && -					    old_mtu >= 68 + (iph->ihl << 2)) -						old_mtu -= iph->ihl << 2; - -					mtu = guess_mtu(old_mtu); -				} -				if (mtu <= dst_mtu(&rth->dst)) { -					if (mtu < dst_mtu(&rth->dst)) { -						dst_confirm(&rth->dst); -						if (mtu < ip_rt_min_pmtu) { -							mtu = ip_rt_min_pmtu; -							rth->dst.metrics[RTAX_LOCK-1] |= -								(1 << RTAX_MTU); -						} -						rth->dst.metrics[RTAX_MTU-1] = mtu; -						dst_set_expires(&rth->dst, -							ip_rt_mtu_expires); -					} -					est_mtu = mtu; -				} -			} -			rcu_read_unlock(); -		} +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; + +	if (!mark) +		mark = IP4_REPLY_MARK(net, skb->mark); + +	__build_flow_key(&fl4, NULL, iph, oif, +			 RT_TOS(iph->tos), protocol, mark, flow_flags); +	rt = __ip_route_output_key(net, &fl4); +	if (!IS_ERR(rt)) { +		__ip_rt_update_pmtu(rt, &fl4, mtu); +		ip_rt_put(rt);  	} -	return est_mtu ? 
: new_mtu;  } +EXPORT_SYMBOL_GPL(ipv4_update_pmtu); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)  { -	if (dst_mtu(dst) > mtu && mtu >= 68 && -	    !(dst_metric_locked(dst, RTAX_MTU))) { -		if (mtu < ip_rt_min_pmtu) { -			mtu = ip_rt_min_pmtu; -			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); -		} -		dst->metrics[RTAX_MTU-1] = mtu; -		dst_set_expires(dst, ip_rt_mtu_expires); -		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; + +	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + +	if (!fl4.flowi4_mark) +		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + +	rt = __ip_route_output_key(sock_net(sk), &fl4); +	if (!IS_ERR(rt)) { +		__ip_rt_update_pmtu(rt, &fl4, mtu); +		ip_rt_put(rt);  	}  } -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)  { -	if (rt_is_expired((struct rtable *)dst)) -		return NULL; -	return dst; +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; +	struct dst_entry *odst = NULL; +	bool new = false; + +	bh_lock_sock(sk); + +	if (!ip_sk_accept_pmtu(sk)) +		goto out; + +	odst = sk_dst_get(sk); + +	if (sock_owned_by_user(sk) || !odst) { +		__ipv4_sk_update_pmtu(skb, sk, mtu); +		goto out; +	} + +	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + +	rt = (struct rtable *)odst; +	if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { +		rt = ip_route_output_flow(sock_net(sk), &fl4, sk); +		if (IS_ERR(rt)) +			goto out; + +		new = true; +	} + +	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + +	if (!dst_check(&rt->dst, 0)) { +		if (new) +			dst_release(&rt->dst); + +		rt = ip_route_output_flow(sock_net(sk), &fl4, sk); +		if (IS_ERR(rt)) +			goto out; + +		new = true; +	} + +	if (new) +		sk_dst_set(sk, &rt->dst); + +out: +	bh_unlock_sock(sk); +	dst_release(odst);  } +EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -static void ipv4_dst_destroy(struct dst_entry *dst) +void ipv4_redirect(struct sk_buff *skb, struct net *net, +		   int oif, u32 mark, u8 protocol, int flow_flags)  { -	struct rtable *rt = (struct rtable *) dst; -	struct inet_peer *peer = rt->peer; +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; -	if (peer) { -		rt->peer = NULL; -		inet_putpeer(peer); +	__build_flow_key(&fl4, NULL, iph, oif, +			 RT_TOS(iph->tos), protocol, mark, flow_flags); +	rt = __ip_route_output_key(net, &fl4); +	if (!IS_ERR(rt)) { +		__ip_do_redirect(rt, skb, &fl4, false); +		ip_rt_put(rt); +	} +} +EXPORT_SYMBOL_GPL(ipv4_redirect); + +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ +	const struct iphdr *iph = (const struct iphdr *) skb->data; +	struct flowi4 fl4; +	struct rtable *rt; + +	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); +	rt = __ip_route_output_key(sock_net(sk), &fl4); +	if (!IS_ERR(rt)) { +		__ip_do_redirect(rt, skb, &fl4, false); +		ip_rt_put(rt);  	}  } +EXPORT_SYMBOL_GPL(ipv4_sk_redirect); + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ +	struct rtable *rt = (struct rtable *) dst; +	/* All IPV4 dsts are created with ->obsolete set to the value +	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down +	 * into this function always. 
+	 * +	 * When a PMTU/redirect information update invalidates a route, +	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or +	 * DST_OBSOLETE_DEAD by dst_free(). +	 */ +	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) +		return NULL; +	return dst; +}  static void ipv4_link_failure(struct sk_buff *skb)  { @@ -1747,12 +1144,13 @@ static void ipv4_link_failure(struct sk_buff *skb)  		dst_set_expires(&rt->dst, 0);  } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)  { -	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", -		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, -		skb->dev ? skb->dev->name : "?"); +	pr_debug("%s: %pI4 -> %pI4, %s\n", +		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, +		 skb->dev ? skb->dev->name : "?");  	kfree_skb(skb); +	WARN_ON(1);  	return 0;  } @@ -1765,26 +1163,40 @@ static int ip_rt_bug(struct sk_buff *skb)     in IP options!   */ -void ip_rt_get_source(u8 *addr, struct rtable *rt) +void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)  {  	__be32 src; -	struct fib_result res;  	if (rt_is_output_route(rt)) -		src = rt->rt_src; +		src = ip_hdr(skb)->saddr;  	else { +		struct fib_result res; +		struct flowi4 fl4; +		struct iphdr *iph; + +		iph = ip_hdr(skb); + +		memset(&fl4, 0, sizeof(fl4)); +		fl4.daddr = iph->daddr; +		fl4.saddr = iph->saddr; +		fl4.flowi4_tos = RT_TOS(iph->tos); +		fl4.flowi4_oif = rt->dst.dev->ifindex; +		fl4.flowi4_iif = skb->dev->ifindex; +		fl4.flowi4_mark = skb->mark; +  		rcu_read_lock(); -		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) -			src = FIB_RES_PREFSRC(res); +		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) +			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);  		else -			src = inet_select_addr(rt->dst.dev, rt->rt_gateway, -					RT_SCOPE_UNIVERSE); +			src = inet_select_addr(rt->dst.dev, +					       rt_nexthop(rt, iph->daddr), +					       RT_SCOPE_UNIVERSE);  		rcu_read_unlock();  	}  	memcpy(addr, &src, 4);  } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  static void set_class_tag(struct rtable *rt, u32 tag)  {  	if (!(rt->dst.tclassid & 0xFFFF)) @@ -1794,55 +1206,229 @@ static void set_class_tag(struct rtable *rt, u32 tag)  }  #endif -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static unsigned int ipv4_default_advmss(const struct dst_entry *dst)  { -	struct fib_info *fi = res->fi; +	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + +	if (advmss == 0) { +		advmss = max_t(unsigned int, dst->dev->mtu - 40, +			       ip_rt_min_advmss); +		if (advmss > 65535 - 40) +			advmss = 65535 - 40; +	} +	return advmss; +} + +static unsigned int ipv4_mtu(const struct dst_entry *dst) +{ +	const struct rtable *rt = (const struct rtable *) dst; +	unsigned int mtu = rt->rt_pmtu; + +	if (!mtu || time_after_eq(jiffies, rt->dst.expires)) +		mtu = dst_metric_raw(dst, RTAX_MTU); + +	if (mtu) +		return mtu; + +	mtu = dst->dev->mtu; + +	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { +		if (rt->rt_uses_gateway && mtu > 576) +			mtu = 576; +	} + +	return min_t(unsigned int, mtu, IP_MAX_MTU); +} + +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +{ +	struct fnhe_hash_bucket *hash = nh->nh_exceptions; +	struct fib_nh_exception *fnhe; +	u32 hval; + +	if (!hash) +		return NULL; + +	hval = fnhe_hashfun(daddr); + +	for (fnhe = rcu_dereference(hash[hval].chain); fnhe; +	     fnhe = rcu_dereference(fnhe->fnhe_next)) { +		if (fnhe->fnhe_daddr 
== daddr) +			return fnhe; +	} +	return NULL; +} + +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, +			      __be32 daddr) +{ +	bool ret = false; + +	spin_lock_bh(&fnhe_lock); + +	if (daddr == fnhe->fnhe_daddr) { +		struct rtable __rcu **porig; +		struct rtable *orig; +		int genid = fnhe_genid(dev_net(rt->dst.dev)); + +		if (rt_is_input_route(rt)) +			porig = &fnhe->fnhe_rth_input; +		else +			porig = &fnhe->fnhe_rth_output; +		orig = rcu_dereference(*porig); + +		if (fnhe->fnhe_genid != genid) { +			fnhe->fnhe_genid = genid; +			fnhe->fnhe_gw = 0; +			fnhe->fnhe_pmtu = 0; +			fnhe->fnhe_expires = 0; +			fnhe_flush_routes(fnhe); +			orig = NULL; +		} +		fill_route_from_fnhe(rt, fnhe); +		if (!rt->rt_gateway) +			rt->rt_gateway = daddr; + +		if (!(rt->dst.flags & DST_NOCACHE)) { +			rcu_assign_pointer(*porig, rt); +			if (orig) +				rt_free(orig); +			ret = true; +		} + +		fnhe->fnhe_stamp = jiffies; +	} +	spin_unlock_bh(&fnhe_lock); + +	return ret; +} + +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ +	struct rtable *orig, *prev, **p; +	bool ret = true; + +	if (rt_is_input_route(rt)) { +		p = (struct rtable **)&nh->nh_rth_input; +	} else { +		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); +	} +	orig = *p; + +	prev = cmpxchg(p, orig, rt); +	if (prev == orig) { +		if (orig) +			rt_free(orig); +	} else +		ret = false; + +	return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ +	spin_lock_bh(&rt_uncached_lock); +	list_add_tail(&rt->rt_uncached, &rt_uncached_list); +	spin_unlock_bh(&rt_uncached_lock); +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ +	struct rtable *rt = (struct rtable *) dst; + +	if (!list_empty(&rt->rt_uncached)) { +		spin_lock_bh(&rt_uncached_lock); +		list_del(&rt->rt_uncached); +		spin_unlock_bh(&rt_uncached_lock); +	} +} + +void rt_flush_dev(struct net_device *dev) +{ +	if (!list_empty(&rt_uncached_list)) { +		struct net *net = dev_net(dev); +		struct rtable *rt; + +		spin_lock_bh(&rt_uncached_lock); +		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { +			if (rt->dst.dev != dev) +				continue; +			rt->dst.dev = net->loopback_dev; +			dev_hold(rt->dst.dev); +			dev_put(dev); +		} +		spin_unlock_bh(&rt_uncached_lock); +	} +} + +static bool rt_cache_valid(const struct rtable *rt) +{ +	return	rt && +		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && +		!rt_is_expired(rt); +} + +static void rt_set_nexthop(struct rtable *rt, __be32 daddr, +			   const struct fib_result *res, +			   struct fib_nh_exception *fnhe, +			   struct fib_info *fi, u16 type, u32 itag) +{ +	bool cached = false;  	if (fi) { -		if (FIB_RES_GW(*res) && -		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) -			rt->rt_gateway = FIB_RES_GW(*res); -		memcpy(rt->dst.metrics, fi->fib_metrics, -		       sizeof(rt->dst.metrics)); -		if (fi->fib_mtu == 0) { -			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu; -			if (dst_metric_locked(&rt->dst, RTAX_MTU) && -			    rt->rt_gateway != rt->rt_dst && -			    rt->dst.dev->mtu > 576) -				rt->dst.metrics[RTAX_MTU-1] = 576; +		struct fib_nh *nh = &FIB_RES_NH(*res); + +		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { +			rt->rt_gateway = nh->nh_gw; +			rt->rt_uses_gateway = 1;  		} -#ifdef CONFIG_NET_CLS_ROUTE -		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; +		dst_init_metrics(&rt->dst, fi->fib_metrics, true); +#ifdef CONFIG_IP_ROUTE_CLASSID +		rt->dst.tclassid = nh->nh_tclassid;  #endif 
+		if (unlikely(fnhe)) +			cached = rt_bind_exception(rt, fnhe, daddr); +		else if (!(rt->dst.flags & DST_NOCACHE)) +			cached = rt_cache_route(nh, rt); +		if (unlikely(!cached)) { +			/* Routes we intend to cache in nexthop exception or +			 * FIB nexthop have the DST_NOCACHE bit clear. +			 * However, if we are unsuccessful at storing this +			 * route into the cache we really need to set it. +			 */ +			rt->dst.flags |= DST_NOCACHE; +			if (!rt->rt_gateway) +				rt->rt_gateway = daddr; +			rt_add_uncached_list(rt); +		}  	} else -		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; - -	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) -		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; -	if (dst_mtu(&rt->dst) > IP_MAX_MTU) -		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; -	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) -		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40, -				       ip_rt_min_advmss); -	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40) -		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; - -#ifdef CONFIG_NET_CLS_ROUTE +		rt_add_uncached_list(rt); + +#ifdef CONFIG_IP_ROUTE_CLASSID  #ifdef CONFIG_IP_MULTIPLE_TABLES -	set_class_tag(rt, fib_rules_tclass(res)); +	set_class_tag(rt, res->tclassid);  #endif  	set_class_tag(rt, itag);  #endif -	rt->rt_type = res->type; +} + +static struct rtable *rt_dst_alloc(struct net_device *dev, +				   bool nopolicy, bool noxfrm, bool will_cache) +{ +	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, +			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | +			 (nopolicy ? DST_NOPOLICY : 0) | +			 (noxfrm ? DST_NOXFRM : 0));  }  /* called in rcu_read_lock() section */  static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,  				u8 tos, struct net_device *dev, int our)  { -	unsigned int hash;  	struct rtable *rth; -	__be32 spec_dst;  	struct in_device *in_dev = __in_dev_get_rcu(dev);  	u32 itag = 0;  	int err; @@ -1853,49 +1439,41 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,  		return -EINVAL;  	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || -	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) +	    skb->protocol != htons(ETH_P_IP))  		goto e_inval; +	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) +		if (ipv4_is_loopback(saddr)) +			goto e_inval; +  	if (ipv4_is_zeronet(saddr)) {  		if (!ipv4_is_local_multicast(daddr))  			goto e_inval; -		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);  	} else { -		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, -					  &itag, 0); +		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, +					  in_dev, &itag);  		if (err < 0)  			goto e_err;  	} -	rth = dst_alloc(&ipv4_dst_ops); +	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, +			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);  	if (!rth)  		goto e_nobufs; -	rth->dst.output = ip_rt_bug; -	rth->dst.obsolete = -1; - -	atomic_set(&rth->dst.__refcnt, 1); -	rth->dst.flags= DST_HOST; -	if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) -		rth->dst.flags |= DST_NOPOLICY; -	rth->fl.fl4_dst	= daddr; -	rth->rt_dst	= daddr; -	rth->fl.fl4_tos	= tos; -	rth->fl.mark    = skb->mark; -	rth->fl.fl4_src	= saddr; -	rth->rt_src	= saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	rth->dst.tclassid = itag;  #endif -	rth->rt_iif	= -	rth->fl.iif	= dev->ifindex; -	rth->dst.dev	= init_net.loopback_dev; -	dev_hold(rth->dst.dev); -	rth->fl.oif	= 0; -	rth->rt_gateway	= daddr; -	rth->rt_spec_dst= spec_dst; -	rth->rt_genid	= rt_genid(dev_net(dev)); +	
rth->dst.output = ip_rt_bug; + +	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));  	rth->rt_flags	= RTCF_MULTICAST;  	rth->rt_type	= RTN_MULTICAST; +	rth->rt_is_input= 1; +	rth->rt_iif	= 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway	= 0; +	rth->rt_uses_gateway = 0; +	INIT_LIST_HEAD(&rth->rt_uncached);  	if (our) {  		rth->dst.input= ip_local_deliver;  		rth->rt_flags |= RTCF_LOCAL; @@ -1907,8 +1485,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,  #endif  	RT_CACHE_STAT_INC(in_slow_mc); -	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); -	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); +	skb_dst_set(skb, &rth->dst); +	return 0;  e_nobufs:  	return -ENOBUFS; @@ -1932,18 +1510,13 @@ static void ip_handle_martian_source(struct net_device *dev,  		 *	RFC1812 recommendation, if source is martian,  		 *	the only hint is MAC header.  		 */ -		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", +		pr_warn("martian source %pI4 from %pI4, on dev %s\n",  			&daddr, &saddr, dev->name);  		if (dev->hard_header_len && skb_mac_header_was_set(skb)) { -			int i; -			const unsigned char *p = skb_mac_header(skb); -			printk(KERN_WARNING "ll header: "); -			for (i = 0; i < dev->hard_header_len; i++, p++) { -				printk("%02x", *p); -				if (i < (dev->hard_header_len - 1)) -					printk(":"); -			} -			printk("\n"); +			print_hex_dump(KERN_WARNING, "ll header: ", +				       DUMP_PREFIX_OFFSET, 16, 1, +				       skb_mac_header(skb), +				       dev->hard_header_len, true);  		}  	}  #endif @@ -1951,30 +1524,27 @@ static void ip_handle_martian_source(struct net_device *dev,  /* called in rcu_read_lock() section */  static int __mkroute_input(struct sk_buff *skb, -			   struct fib_result *res, +			   const struct fib_result *res,  			   struct in_device *in_dev, -			   __be32 daddr, __be32 saddr, u32 tos, -			   struct rtable **result) +			   __be32 daddr, __be32 saddr, u32 tos)  { +	struct fib_nh_exception *fnhe;  	struct rtable *rth;  	int err;  	struct in_device *out_dev;  	unsigned int flags = 0; -	__be32 spec_dst; -	u32 itag; +	bool do_cache; +	u32 itag = 0;  	/* get a working reference to the output device */  	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));  	if (out_dev == NULL) { -		if (net_ratelimit()) -			printk(KERN_CRIT "Bug in ip_route_input" \ -			       "_slow(). Please, report\n"); +		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");  		return -EINVAL;  	} - -	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), -				  in_dev->dev, &spec_dst, &itag, skb->mark); +	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), +				  in_dev->dev, in_dev, &itag);  	if (err < 0) {  		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,  					 saddr); @@ -1982,13 +1552,13 @@ static int __mkroute_input(struct sk_buff *skb,  		goto cleanup;  	} -	if (err) -		flags |= RTCF_DIRECTSRC; - -	if (out_dev == in_dev && err && +	do_cache = res->fi && !itag; +	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&  	    (IN_DEV_SHARED_MEDIA(out_dev) || -	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) +	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {  		flags |= RTCF_DOREDIRECT; +		do_cache = false; +	}  	if (skb->protocol != htons(ETH_P_IP)) {  		/* Not IP (i.e. ARP). 
Do not create route, if it is @@ -2005,43 +1575,44 @@ static int __mkroute_input(struct sk_buff *skb,  		}  	} +	fnhe = find_exception(&FIB_RES_NH(*res), daddr); +	if (do_cache) { +		if (fnhe != NULL) +			rth = rcu_dereference(fnhe->fnhe_rth_input); +		else +			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + +		if (rt_cache_valid(rth)) { +			skb_dst_set_noref(skb, &rth->dst); +			goto out; +		} +	} -	rth = dst_alloc(&ipv4_dst_ops); +	rth = rt_dst_alloc(out_dev->dev, +			   IN_DEV_CONF_GET(in_dev, NOPOLICY), +			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);  	if (!rth) {  		err = -ENOBUFS;  		goto cleanup;  	} -	atomic_set(&rth->dst.__refcnt, 1); -	rth->dst.flags= DST_HOST; -	if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) -		rth->dst.flags |= DST_NOPOLICY; -	if (IN_DEV_CONF_GET(out_dev, NOXFRM)) -		rth->dst.flags |= DST_NOXFRM; -	rth->fl.fl4_dst	= daddr; -	rth->rt_dst	= daddr; -	rth->fl.fl4_tos	= tos; -	rth->fl.mark    = skb->mark; -	rth->fl.fl4_src	= saddr; -	rth->rt_src	= saddr; -	rth->rt_gateway	= daddr; -	rth->rt_iif 	= -		rth->fl.iif	= in_dev->dev->ifindex; -	rth->dst.dev	= (out_dev)->dev; -	dev_hold(rth->dst.dev); -	rth->fl.oif 	= 0; -	rth->rt_spec_dst= spec_dst; - -	rth->dst.obsolete = -1; +	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); +	rth->rt_flags = flags; +	rth->rt_type = res->type; +	rth->rt_is_input = 1; +	rth->rt_iif 	= 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway	= 0; +	rth->rt_uses_gateway = 0; +	INIT_LIST_HEAD(&rth->rt_uncached); +	RT_CACHE_STAT_INC(in_slow_tot); +  	rth->dst.input = ip_forward;  	rth->dst.output = ip_output; -	rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); -	rt_set_nexthop(rth, res, itag); - -	rth->rt_flags = flags; - -	*result = rth; +	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); +	skb_dst_set(skb, &rth->dst); +out:  	err = 0;   cleanup:  	return err; @@ -2049,28 +1620,17 @@ static int __mkroute_input(struct sk_buff *skb,  static int ip_mkroute_input(struct sk_buff *skb,  			    struct fib_result *res, -			    const struct flowi *fl, +			    const struct flowi4 *fl4,  			    struct in_device *in_dev,  			    __be32 daddr, __be32 saddr, u32 tos)  { -	struct rtable* rth = NULL; -	int err; -	unsigned hash; -  #ifdef CONFIG_IP_ROUTE_MULTIPATH -	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) -		fib_select_multipath(fl, res); +	if (res->fi && res->fi->fib_nhs > 1) +		fib_select_multipath(res);  #endif  	/* create a routing cache entry */ -	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); -	if (err) -		return err; - -	/* put it into the cache */ -	hash = rt_hash(daddr, saddr, fl->iif, -		       rt_genid(dev_net(rth->dst.dev))); -	return rt_intern_hash(hash, rth, NULL, skb, fl->iif); +	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);  }  /* @@ -2089,19 +1649,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  {  	struct fib_result res;  	struct in_device *in_dev = __in_dev_get_rcu(dev); -	struct flowi fl = { .fl4_dst	= daddr, -			    .fl4_src	= saddr, -			    .fl4_tos	= tos, -			    .fl4_scope	= RT_SCOPE_UNIVERSE, -			    .mark = skb->mark, -			    .iif = dev->ifindex }; -	unsigned	flags = 0; +	struct flowi4	fl4; +	unsigned int	flags = 0;  	u32		itag = 0; -	struct rtable * rth; -	unsigned	hash; -	__be32		spec_dst; +	struct rtable	*rth;  	int		err = -EINVAL; -	struct net    * net = dev_net(dev); +	struct net    *net = dev_net(dev); +	bool do_cache;  	/* IP on this device is disabled. 
*/ @@ -2112,10 +1666,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	   by fib_lookup.  	 */ -	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || -	    ipv4_is_loopback(saddr)) +	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))  		goto martian_source; +	res.fi = NULL;  	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))  		goto brd_input; @@ -2125,105 +1679,124 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	if (ipv4_is_zeronet(saddr))  		goto martian_source; -	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) +	if (ipv4_is_zeronet(daddr))  		goto martian_destination; +	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), +	 * and call it once if daddr or/and saddr are loopback addresses +	 */ +	if (ipv4_is_loopback(daddr)) { +		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) +			goto martian_destination; +	} else if (ipv4_is_loopback(saddr)) { +		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) +			goto martian_source; +	} +  	/*  	 *	Now we are ready to route packet.  	 */ -	err = fib_lookup(net, &fl, &res); +	fl4.flowi4_oif = 0; +	fl4.flowi4_iif = dev->ifindex; +	fl4.flowi4_mark = skb->mark; +	fl4.flowi4_tos = tos; +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE; +	fl4.daddr = daddr; +	fl4.saddr = saddr; +	err = fib_lookup(net, &fl4, &res);  	if (err != 0) {  		if (!IN_DEV_FORWARD(in_dev)) -			goto e_hostunreach; +			err = -EHOSTUNREACH;  		goto no_route;  	} -	RT_CACHE_STAT_INC(in_slow_tot); -  	if (res.type == RTN_BROADCAST)  		goto brd_input;  	if (res.type == RTN_LOCAL) { -		err = fib_validate_source(saddr, daddr, tos, -					  net->loopback_dev->ifindex, -					  dev, &spec_dst, &itag, skb->mark); +		err = fib_validate_source(skb, saddr, daddr, tos, +					  0, dev, in_dev, &itag);  		if (err < 0)  			goto martian_source_keep_err; -		if (err) -			flags |= RTCF_DIRECTSRC; -		spec_dst = daddr;  		goto local_input;  	} -	if (!IN_DEV_FORWARD(in_dev)) -		goto e_hostunreach; +	if (!IN_DEV_FORWARD(in_dev)) { +		err = -EHOSTUNREACH; +		goto no_route; +	}  	if (res.type != RTN_UNICAST)  		goto martian_destination; -	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); +	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);  out:	return err;  brd_input:  	if (skb->protocol != htons(ETH_P_IP))  		goto e_inval; -	if (ipv4_is_zeronet(saddr)) -		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); -	else { -		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, -					  &itag, skb->mark); +	if (!ipv4_is_zeronet(saddr)) { +		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, +					  in_dev, &itag);  		if (err < 0)  			goto martian_source_keep_err; -		if (err) -			flags |= RTCF_DIRECTSRC;  	}  	flags |= RTCF_BROADCAST;  	res.type = RTN_BROADCAST;  	RT_CACHE_STAT_INC(in_brd);  local_input: -	rth = dst_alloc(&ipv4_dst_ops); +	do_cache = false; +	if (res.fi) { +		if (!itag) { +			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); +			if (rt_cache_valid(rth)) { +				skb_dst_set_noref(skb, &rth->dst); +				err = 0; +				goto out; +			} +			do_cache = true; +		} +	} + +	rth = rt_dst_alloc(net->loopback_dev, +			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);  	if (!rth)  		goto e_nobufs; +	rth->dst.input= ip_local_deliver;  	rth->dst.output= ip_rt_bug; -	rth->dst.obsolete = -1; -	rth->rt_genid = rt_genid(net); - -	atomic_set(&rth->dst.__refcnt, 1); -	rth->dst.flags= DST_HOST; -	if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) -		rth->dst.flags |= DST_NOPOLICY; -	
rth->fl.fl4_dst	= daddr; -	rth->rt_dst	= daddr; -	rth->fl.fl4_tos	= tos; -	rth->fl.mark    = skb->mark; -	rth->fl.fl4_src	= saddr; -	rth->rt_src	= saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID  	rth->dst.tclassid = itag;  #endif -	rth->rt_iif	= -	rth->fl.iif	= dev->ifindex; -	rth->dst.dev	= net->loopback_dev; -	dev_hold(rth->dst.dev); -	rth->rt_gateway	= daddr; -	rth->rt_spec_dst= spec_dst; -	rth->dst.input= ip_local_deliver; + +	rth->rt_genid = rt_genid_ipv4(net);  	rth->rt_flags 	= flags|RTCF_LOCAL; +	rth->rt_type	= res.type; +	rth->rt_is_input = 1; +	rth->rt_iif	= 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway	= 0; +	rth->rt_uses_gateway = 0; +	INIT_LIST_HEAD(&rth->rt_uncached); +	RT_CACHE_STAT_INC(in_slow_tot);  	if (res.type == RTN_UNREACHABLE) {  		rth->dst.input= ip_error;  		rth->dst.error= -err;  		rth->rt_flags 	&= ~RTCF_LOCAL;  	} -	rth->rt_type	= res.type; -	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); -	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); +	if (do_cache) { +		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { +			rth->dst.flags |= DST_NOCACHE; +			rt_add_uncached_list(rth); +		} +	} +	skb_dst_set(skb, &rth->dst); +	err = 0;  	goto out;  no_route:  	RT_CACHE_STAT_INC(in_no_route); -	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);  	res.type = RTN_UNREACHABLE;  	if (err == -ESRCH)  		err = -ENETUNREACH; @@ -2235,15 +1808,11 @@ no_route:  martian_destination:  	RT_CACHE_STAT_INC(in_martian_dst);  #ifdef CONFIG_IP_ROUTE_VERBOSE -	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) -		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", -			&daddr, &saddr, dev->name); +	if (IN_DEV_LOG_MARTIANS(in_dev)) +		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", +				     &daddr, &saddr, dev->name);  #endif -e_hostunreach: -	err = -EHOSTUNREACH; -	goto out; -  e_inval:  	err = -EINVAL;  	goto out; @@ -2259,50 +1828,13 @@ martian_source_keep_err:  	goto out;  } -int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, -			   u8 tos, struct net_device *dev, bool noref) +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, +			 u8 tos, struct net_device *dev)  { -	struct rtable * rth; -	unsigned	hash; -	int iif = dev->ifindex; -	struct net *net;  	int res; -	net = dev_net(dev); -  	rcu_read_lock(); -	if (!rt_caching(net)) -		goto skip_cache; - -	tos &= IPTOS_RT_MASK; -	hash = rt_hash(daddr, saddr, iif, rt_genid(net)); - -	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; -	     rth = rcu_dereference(rth->dst.rt_next)) { -		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | -		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | -		     (rth->fl.iif ^ iif) | -		     rth->fl.oif | -		     (rth->fl.fl4_tos ^ tos)) == 0 && -		    rth->fl.mark == skb->mark && -		    net_eq(dev_net(rth->dst.dev), net) && -		    !rt_is_expired(rth)) { -			if (noref) { -				dst_use_noref(&rth->dst, jiffies); -				skb_dst_set_noref(skb, &rth->dst); -			} else { -				dst_use(&rth->dst, jiffies); -				skb_dst_set(skb, &rth->dst); -			} -			RT_CACHE_STAT_INC(in_hit); -			rcu_read_unlock(); -			return 0; -		} -		RT_CACHE_STAT_INC(in_hlist_search); -	} - -skip_cache:  	/* Multicast recognition logic is moved from route cache to here.  	   
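
With the global route hash gone, ip_route_input_common() and its cache walk collapse into ip_route_input_noref(), which attaches a reference-less dst to the skb under RCU. A sketch of a receive-path caller; the wrapper function is hypothetical, the helpers it calls are real.

static int rx_route_sketch(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, dev);
	if (err)
		return err;			/* e.g. -EHOSTUNREACH or -EINVAL */

	/* The dst attached here is not reference counted, so it is only
	 * valid within this RCU-protected receive path.
	 */
	return dst_input(skb);			/* dispatch via skb_dst(skb)->input() */
}
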
The problem was that too many Ethernet cards have broken/missing  	   hardware multicast filters :-( As result the host on multicasting @@ -2318,8 +1850,8 @@ skip_cache:  		struct in_device *in_dev = __in_dev_get_rcu(dev);  		if (in_dev) { -			int our = ip_check_mc(in_dev, daddr, saddr, -					      ip_hdr(skb)->protocol); +			int our = ip_check_mc_rcu(in_dev, daddr, saddr, +						  ip_hdr(skb)->protocol);  			if (our  #ifdef CONFIG_IP_MROUTE  				|| @@ -2340,101 +1872,118 @@ skip_cache:  	rcu_read_unlock();  	return res;  } -EXPORT_SYMBOL(ip_route_input_common); +EXPORT_SYMBOL(ip_route_input_noref);  /* called with rcu_read_lock() */ -static int __mkroute_output(struct rtable **result, -			    struct fib_result *res, -			    const struct flowi *fl, -			    const struct flowi *oldflp, -			    struct net_device *dev_out, -			    unsigned flags) +static struct rtable *__mkroute_output(const struct fib_result *res, +				       const struct flowi4 *fl4, int orig_oif, +				       struct net_device *dev_out, +				       unsigned int flags)  { -	struct rtable *rth; +	struct fib_info *fi = res->fi; +	struct fib_nh_exception *fnhe;  	struct in_device *in_dev; -	u32 tos = RT_FL_TOS(oldflp); +	u16 type = res->type; +	struct rtable *rth; +	bool do_cache; -	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) -		return -EINVAL; +	in_dev = __in_dev_get_rcu(dev_out); +	if (!in_dev) +		return ERR_PTR(-EINVAL); -	if (ipv4_is_lbcast(fl->fl4_dst)) -		res->type = RTN_BROADCAST; -	else if (ipv4_is_multicast(fl->fl4_dst)) -		res->type = RTN_MULTICAST; -	else if (ipv4_is_zeronet(fl->fl4_dst)) -		return -EINVAL; +	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) +		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) +			return ERR_PTR(-EINVAL); + +	if (ipv4_is_lbcast(fl4->daddr)) +		type = RTN_BROADCAST; +	else if (ipv4_is_multicast(fl4->daddr)) +		type = RTN_MULTICAST; +	else if (ipv4_is_zeronet(fl4->daddr)) +		return ERR_PTR(-EINVAL);  	if (dev_out->flags & IFF_LOOPBACK)  		flags |= RTCF_LOCAL; -	in_dev = __in_dev_get_rcu(dev_out); -	if (!in_dev) -		return -EINVAL; - -	if (res->type == RTN_BROADCAST) { +	do_cache = true; +	if (type == RTN_BROADCAST) {  		flags |= RTCF_BROADCAST | RTCF_LOCAL; -		res->fi = NULL; -	} else if (res->type == RTN_MULTICAST) { +		fi = NULL; +	} else if (type == RTN_MULTICAST) {  		flags |= RTCF_MULTICAST | RTCF_LOCAL; -		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, -				 oldflp->proto)) +		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, +				     fl4->flowi4_proto))  			flags &= ~RTCF_LOCAL; +		else +			do_cache = false;  		/* If multicast route do not exist use  		 * default one, but do not gateway in this case.  		 * Yes, it is hack.  		 
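
__mkroute_output() now hands back the struct rtable itself, with failures encoded as ERR_PTR() values instead of an int plus a **result out-parameter. A producer-side sketch of that convention; the function below is hypothetical and only mirrors the early-exit shape of the hunk.

static struct rtable *make_route_sketch(struct net_device *dev_out,
					struct in_device *in_dev)
{
	struct rtable *rth;

	if (!in_dev)
		return ERR_PTR(-EINVAL);	/* previously: return -EINVAL */

	rth = rt_dst_alloc(dev_out, false, false, false);
	if (!rth)
		return ERR_PTR(-ENOBUFS);	/* previously: return -ENOBUFS */

	return rth;				/* success is just a plain pointer */
}

Callers unwrap the result with IS_ERR()/PTR_ERR(), as the output-path hunks further down show.
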
*/ -		if (res->fi && res->prefixlen < 4) -			res->fi = NULL; +		if (fi && res->prefixlen < 4) +			fi = NULL; +	} + +	fnhe = NULL; +	do_cache &= fi != NULL; +	if (do_cache) { +		struct rtable __rcu **prth; +		struct fib_nh *nh = &FIB_RES_NH(*res); + +		fnhe = find_exception(nh, fl4->daddr); +		if (fnhe) +			prth = &fnhe->fnhe_rth_output; +		else { +			if (unlikely(fl4->flowi4_flags & +				     FLOWI_FLAG_KNOWN_NH && +				     !(nh->nh_gw && +				       nh->nh_scope == RT_SCOPE_LINK))) { +				do_cache = false; +				goto add; +			} +			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); +		} +		rth = rcu_dereference(*prth); +		if (rt_cache_valid(rth)) { +			dst_hold(&rth->dst); +			return rth; +		}  	} - -	rth = dst_alloc(&ipv4_dst_ops); +add: +	rth = rt_dst_alloc(dev_out, +			   IN_DEV_CONF_GET(in_dev, NOPOLICY), +			   IN_DEV_CONF_GET(in_dev, NOXFRM), +			   do_cache);  	if (!rth) -		return -ENOBUFS; - -	atomic_set(&rth->dst.__refcnt, 1); -	rth->dst.flags= DST_HOST; -	if (IN_DEV_CONF_GET(in_dev, NOXFRM)) -		rth->dst.flags |= DST_NOXFRM; -	if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) -		rth->dst.flags |= DST_NOPOLICY; - -	rth->fl.fl4_dst	= oldflp->fl4_dst; -	rth->fl.fl4_tos	= tos; -	rth->fl.fl4_src	= oldflp->fl4_src; -	rth->fl.oif	= oldflp->oif; -	rth->fl.mark    = oldflp->mark; -	rth->rt_dst	= fl->fl4_dst; -	rth->rt_src	= fl->fl4_src; -	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex; -	/* get references to the devices that are to be hold by the routing -	   cache entry */ -	rth->dst.dev	= dev_out; -	dev_hold(dev_out); -	rth->rt_gateway = fl->fl4_dst; -	rth->rt_spec_dst= fl->fl4_src; - -	rth->dst.output=ip_output; -	rth->dst.obsolete = -1; -	rth->rt_genid = rt_genid(dev_net(dev_out)); +		return ERR_PTR(-ENOBUFS); + +	rth->dst.output = ip_output; + +	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); +	rth->rt_flags	= flags; +	rth->rt_type	= type; +	rth->rt_is_input = 0; +	rth->rt_iif	= orig_oif ? 
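
On the output side the cache moves into each next hop: a per-CPU rtable pointer (nh_pcpu_rth_output) plus an exception slot for destinations that received a redirect or PMTU update. A sketch of the probe order implemented in the hunk; helper and field names come from the hunk, the wrapper is illustrative.

static struct rtable *output_cache_probe_sketch(struct fib_result *res,
						__be32 daddr)
{
	struct fib_nh *nh = &FIB_RES_NH(*res);
	struct fib_nh_exception *fnhe = find_exception(nh, daddr);
	struct rtable __rcu **prth;
	struct rtable *rth;

	if (fnhe)					/* per-destination exception wins */
		prth = &fnhe->fnhe_rth_output;
	else						/* otherwise this CPU's cached dst */
		prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);

	rth = rcu_dereference(*prth);
	if (rt_cache_valid(rth)) {
		dst_hold(&rth->dst);			/* cached dst is shared: take a ref */
		return rth;
	}
	return NULL;			/* miss: caller falls through to rt_dst_alloc() */
}
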
: 0; +	rth->rt_pmtu	= 0; +	rth->rt_gateway = 0; +	rth->rt_uses_gateway = 0; +	INIT_LIST_HEAD(&rth->rt_uncached);  	RT_CACHE_STAT_INC(out_slow_tot); -	if (flags & RTCF_LOCAL) { +	if (flags & RTCF_LOCAL)  		rth->dst.input = ip_local_deliver; -		rth->rt_spec_dst = fl->fl4_dst; -	}  	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { -		rth->rt_spec_dst = fl->fl4_src;  		if (flags & RTCF_LOCAL &&  		    !(dev_out->flags & IFF_LOOPBACK)) {  			rth->dst.output = ip_mc_output;  			RT_CACHE_STAT_INC(out_slow_mc);  		}  #ifdef CONFIG_IP_MROUTE -		if (res->type == RTN_MULTICAST) { +		if (type == RTN_MULTICAST) {  			if (IN_DEV_MFORWARD(in_dev) && -			    !ipv4_is_local_multicast(oldflp->fl4_dst)) { +			    !ipv4_is_local_multicast(fl4->daddr)) {  				rth->dst.input = ip_mr_input;  				rth->dst.output = ip_mc_output;  			} @@ -2442,66 +1991,41 @@ static int __mkroute_output(struct rtable **result,  #endif  	} -	rt_set_nexthop(rth, res, 0); +	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); -	rth->rt_flags = flags; -	*result = rth; -	return 0; -} - -/* called with rcu_read_lock() */ -static int ip_mkroute_output(struct rtable **rp, -			     struct fib_result *res, -			     const struct flowi *fl, -			     const struct flowi *oldflp, -			     struct net_device *dev_out, -			     unsigned flags) -{ -	struct rtable *rth = NULL; -	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); -	unsigned hash; -	if (err == 0) { -		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, -			       rt_genid(dev_net(dev_out))); -		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); -	} - -	return err; +	return rth;  }  /*   * Major route resolver routine. - * called with rcu_read_lock();   */ -static int ip_route_output_slow(struct net *net, struct rtable **rp, -				const struct flowi *oldflp) -{ -	u32 tos	= RT_FL_TOS(oldflp); -	struct flowi fl = { .fl4_dst = oldflp->fl4_dst, -			    .fl4_src = oldflp->fl4_src, -			    .fl4_tos = tos & IPTOS_RT_MASK, -			    .fl4_scope = ((tos & RTO_ONLINK) ? -					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE), -			    .mark = oldflp->mark, -			    .iif = net->loopback_dev->ifindex, -			    .oif = oldflp->oif }; -	struct fib_result res; -	unsigned int flags = 0; +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +{  	struct net_device *dev_out = NULL; -	int err; - +	__u8 tos = RT_FL_TOS(fl4); +	unsigned int flags = 0; +	struct fib_result res; +	struct rtable *rth; +	int orig_oif; +	res.tclassid	= 0;  	res.fi		= NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES -	res.r		= NULL; -#endif +	res.table	= NULL; + +	orig_oif = fl4->flowi4_oif; + +	fl4->flowi4_iif = LOOPBACK_IFINDEX; +	fl4->flowi4_tos = tos & IPTOS_RT_MASK; +	fl4->flowi4_scope = ((tos & RTO_ONLINK) ? +			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); -	if (oldflp->fl4_src) { -		err = -EINVAL; -		if (ipv4_is_multicast(oldflp->fl4_src) || -		    ipv4_is_lbcast(oldflp->fl4_src) || -		    ipv4_is_zeronet(oldflp->fl4_src)) +	rcu_read_lock(); +	if (fl4->saddr) { +		rth = ERR_PTR(-EINVAL); +		if (ipv4_is_multicast(fl4->saddr) || +		    ipv4_is_lbcast(fl4->saddr) || +		    ipv4_is_zeronet(fl4->saddr))  			goto out;  		/* I removed check for oif == dev_out->oif here. @@ -2512,11 +2036,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,  		      of another iface. 
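
The consumer side of the same change: ip_route_output_key() becomes a thin wrapper that returns the rtable (or an ERR_PTR) instead of filling a pointer argument, as the inet_rtm_getroute() hunk below also shows. A minimal caller sketch, assuming a simple "which device would reach daddr" query; the function and the pr_debug message are illustrative.

static int where_to_sketch(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
		.saddr = saddr,
	};
	struct rtable *rt = ip_route_output_key(net, &fl4);

	if (IS_ERR(rt))
		return PTR_ERR(rt);		/* e.g. -ENETUNREACH */

	pr_debug("%pI4 via dev %s\n", &daddr, rt->dst.dev->name);
	ip_rt_put(rt);				/* release the lookup's reference */
	return 0;
}
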
--ANK  		 */ -		if (oldflp->oif == 0 && -		    (ipv4_is_multicast(oldflp->fl4_dst) || -		     ipv4_is_lbcast(oldflp->fl4_dst))) { +		if (fl4->flowi4_oif == 0 && +		    (ipv4_is_multicast(fl4->daddr) || +		     ipv4_is_lbcast(fl4->daddr))) {  			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ -			dev_out = __ip_dev_find(net, oldflp->fl4_src, false); +			dev_out = __ip_dev_find(net, fl4->saddr, false);  			if (dev_out == NULL)  				goto out; @@ -2535,59 +2059,61 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,  			   Luckily, this hack is good workaround.  			 */ -			fl.oif = dev_out->ifindex; +			fl4->flowi4_oif = dev_out->ifindex;  			goto make_route;  		} -		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { +		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {  			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ -			if (!__ip_dev_find(net, oldflp->fl4_src, false)) +			if (!__ip_dev_find(net, fl4->saddr, false))  				goto out;  		}  	} -	if (oldflp->oif) { -		dev_out = dev_get_by_index_rcu(net, oldflp->oif); -		err = -ENODEV; +	if (fl4->flowi4_oif) { +		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); +		rth = ERR_PTR(-ENODEV);  		if (dev_out == NULL)  			goto out;  		/* RACE: Check return value of inet_select_addr instead. */ -		if (rcu_dereference(dev_out->ip_ptr) == NULL) -			goto out;	/* Wrong error code */ - -		if (ipv4_is_local_multicast(oldflp->fl4_dst) || -		    ipv4_is_lbcast(oldflp->fl4_dst)) { -			if (!fl.fl4_src) -				fl.fl4_src = inet_select_addr(dev_out, 0, +		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { +			rth = ERR_PTR(-ENETUNREACH); +			goto out; +		} +		if (ipv4_is_local_multicast(fl4->daddr) || +		    ipv4_is_lbcast(fl4->daddr)) { +			if (!fl4->saddr) +				fl4->saddr = inet_select_addr(dev_out, 0,  							      RT_SCOPE_LINK);  			goto make_route;  		} -		if (!fl.fl4_src) { -			if (ipv4_is_multicast(oldflp->fl4_dst)) -				fl.fl4_src = inet_select_addr(dev_out, 0, -							      fl.fl4_scope); -			else if (!oldflp->fl4_dst) -				fl.fl4_src = inet_select_addr(dev_out, 0, +		if (!fl4->saddr) { +			if (ipv4_is_multicast(fl4->daddr)) +				fl4->saddr = inet_select_addr(dev_out, 0, +							      fl4->flowi4_scope); +			else if (!fl4->daddr) +				fl4->saddr = inet_select_addr(dev_out, 0,  							      RT_SCOPE_HOST);  		}  	} -	if (!fl.fl4_dst) { -		fl.fl4_dst = fl.fl4_src; -		if (!fl.fl4_dst) -			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); +	if (!fl4->daddr) { +		fl4->daddr = fl4->saddr; +		if (!fl4->daddr) +			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);  		dev_out = net->loopback_dev; -		fl.oif = net->loopback_dev->ifindex; +		fl4->flowi4_oif = LOOPBACK_IFINDEX;  		res.type = RTN_LOCAL;  		flags |= RTCF_LOCAL;  		goto make_route;  	} -	if (fib_lookup(net, &fl, &res)) { +	if (fib_lookup(net, fl4, &res)) {  		res.fi = NULL; -		if (oldflp->oif) { +		res.table = NULL; +		if (fl4->flowi4_oif) {  			/* Apparently, routing tables are wrong. Assume,  			   that the destination is on link. @@ -2606,190 +2132,161 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,  			   likely IPv6, but we do not.  			 
*/ -			if (fl.fl4_src == 0) -				fl.fl4_src = inet_select_addr(dev_out, 0, +			if (fl4->saddr == 0) +				fl4->saddr = inet_select_addr(dev_out, 0,  							      RT_SCOPE_LINK);  			res.type = RTN_UNICAST;  			goto make_route;  		} -		err = -ENETUNREACH; +		rth = ERR_PTR(-ENETUNREACH);  		goto out;  	}  	if (res.type == RTN_LOCAL) { -		if (!fl.fl4_src) -			fl.fl4_src = fl.fl4_dst; +		if (!fl4->saddr) { +			if (res.fi->fib_prefsrc) +				fl4->saddr = res.fi->fib_prefsrc; +			else +				fl4->saddr = fl4->daddr; +		}  		dev_out = net->loopback_dev; -		fl.oif = dev_out->ifindex; -		res.fi = NULL; +		fl4->flowi4_oif = dev_out->ifindex;  		flags |= RTCF_LOCAL;  		goto make_route;  	}  #ifdef CONFIG_IP_ROUTE_MULTIPATH -	if (res.fi->fib_nhs > 1 && fl.oif == 0) -		fib_select_multipath(&fl, &res); +	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) +		fib_select_multipath(&res);  	else  #endif -	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) -		fib_select_default(net, &fl, &res); +	if (!res.prefixlen && +	    res.table->tb_num_default > 1 && +	    res.type == RTN_UNICAST && !fl4->flowi4_oif) +		fib_select_default(&res); -	if (!fl.fl4_src) -		fl.fl4_src = FIB_RES_PREFSRC(res); +	if (!fl4->saddr) +		fl4->saddr = FIB_RES_PREFSRC(net, res);  	dev_out = FIB_RES_DEV(res); -	fl.oif = dev_out->ifindex; +	fl4->flowi4_oif = dev_out->ifindex;  make_route: -	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); +	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); -out:	return err; +out: +	rcu_read_unlock(); +	return rth;  } +EXPORT_SYMBOL_GPL(__ip_route_output_key); -int __ip_route_output_key(struct net *net, struct rtable **rp, -			  const struct flowi *flp) +static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)  { -	unsigned int hash; -	int res; -	struct rtable *rth; +	return NULL; +} -	if (!rt_caching(net)) -		goto slow_output; - -	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); - -	rcu_read_lock_bh(); -	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; -		rth = rcu_dereference_bh(rth->dst.rt_next)) { -		if (rth->fl.fl4_dst == flp->fl4_dst && -		    rth->fl.fl4_src == flp->fl4_src && -		    rt_is_output_route(rth) && -		    rth->fl.oif == flp->oif && -		    rth->fl.mark == flp->mark && -		    !((rth->fl.fl4_tos ^ flp->fl4_tos) & -			    (IPTOS_RT_MASK | RTO_ONLINK)) && -		    net_eq(dev_net(rth->dst.dev), net) && -		    !rt_is_expired(rth)) { -			dst_use(&rth->dst, jiffies); -			RT_CACHE_STAT_INC(out_hit); -			rcu_read_unlock_bh(); -			*rp = rth; -			return 0; -		} -		RT_CACHE_STAT_INC(out_hlist_search); -	} -	rcu_read_unlock_bh(); +static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) +{ +	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); -slow_output: -	rcu_read_lock(); -	res = ip_route_output_slow(net, rp, flp); -	rcu_read_unlock(); -	return res; +	return mtu ? 
: dst->dev->mtu;  } -EXPORT_SYMBOL_GPL(__ip_route_output_key); -static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, +					  struct sk_buff *skb, u32 mtu)  { -	return NULL;  } -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, +				       struct sk_buff *skb)  {  } +static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, +					  unsigned long old) +{ +	return NULL; +} +  static struct dst_ops ipv4_dst_blackhole_ops = {  	.family			=	AF_INET,  	.protocol		=	cpu_to_be16(ETH_P_IP), -	.destroy		=	ipv4_dst_destroy,  	.check			=	ipv4_blackhole_dst_check, +	.mtu			=	ipv4_blackhole_mtu, +	.default_advmss		=	ipv4_default_advmss,  	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu, +	.redirect		=	ipv4_rt_blackhole_redirect, +	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics, +	.neigh_lookup		=	ipv4_neigh_lookup,  }; - -static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)  { -	struct rtable *ort = *rp; -	struct rtable *rt = (struct rtable *) -		dst_alloc(&ipv4_dst_blackhole_ops); +	struct rtable *ort = (struct rtable *) dst_orig; +	struct rtable *rt; +	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);  	if (rt) {  		struct dst_entry *new = &rt->dst; -		atomic_set(&new->__refcnt, 1);  		new->__use = 1;  		new->input = dst_discard; -		new->output = dst_discard; -		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); +		new->output = dst_discard_sk;  		new->dev = ort->dst.dev;  		if (new->dev)  			dev_hold(new->dev); -		rt->fl = ort->fl; +		rt->rt_is_input = ort->rt_is_input; +		rt->rt_iif = ort->rt_iif; +		rt->rt_pmtu = ort->rt_pmtu; -		rt->rt_genid = rt_genid(net); +		rt->rt_genid = rt_genid_ipv4(net);  		rt->rt_flags = ort->rt_flags;  		rt->rt_type = ort->rt_type; -		rt->rt_dst = ort->rt_dst; -		rt->rt_src = ort->rt_src; -		rt->rt_iif = ort->rt_iif;  		rt->rt_gateway = ort->rt_gateway; -		rt->rt_spec_dst = ort->rt_spec_dst; -		rt->peer = ort->peer; -		if (rt->peer) -			atomic_inc(&rt->peer->refcnt); +		rt->rt_uses_gateway = ort->rt_uses_gateway; + +		INIT_LIST_HEAD(&rt->rt_uncached);  		dst_free(new);  	} -	dst_release(&(*rp)->dst); -	*rp = rt; -	return rt ? 0 : -ENOMEM; +	dst_release(dst_orig); + +	return rt ? &rt->dst : ERR_PTR(-ENOMEM);  } -int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, -			 struct sock *sk, int flags) +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, +				    struct sock *sk)  { -	int err; - -	if ((err = __ip_route_output_key(net, rp, flp)) != 0) -		return err; +	struct rtable *rt = __ip_route_output_key(net, flp4); -	if (flp->proto) { -		if (!flp->fl4_src) -			flp->fl4_src = (*rp)->rt_src; -		if (!flp->fl4_dst) -			flp->fl4_dst = (*rp)->rt_dst; -		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, -				    flags ? 
XFRM_LOOKUP_WAIT : 0); -		if (err == -EREMOTE) -			err = ipv4_dst_blackhole(net, rp, flp); +	if (IS_ERR(rt)) +		return rt; -		return err; -	} +	if (flp4->flowi4_proto) +		rt = (struct rtable *) xfrm_lookup(net, &rt->dst, +						   flowi4_to_flowi(flp4), +						   sk, 0); -	return 0; +	return rt;  }  EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) -{ -	return ip_route_output_flow(net, rp, flp, NULL, 0); -} -EXPORT_SYMBOL(ip_route_output_key); - -static int rt_fill_info(struct net *net, -			struct sk_buff *skb, u32 pid, u32 seq, int event, -			int nowait, unsigned int flags) +static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, +			struct flowi4 *fl4, struct sk_buff *skb, u32 portid, +			u32 seq, int event, int nowait, unsigned int flags)  {  	struct rtable *rt = skb_rtable(skb);  	struct rtmsg *r;  	struct nlmsghdr *nlh; -	long expires; -	u32 id = 0, ts = 0, tsage = 0, error; +	unsigned long expires = 0; +	u32 error; +	u32 metrics[RTAX_MAX]; -	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); +	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);  	if (nlh == NULL)  		return -EMSGSIZE; @@ -2797,9 +2294,10 @@ static int rt_fill_info(struct net *net,  	r->rtm_family	 = AF_INET;  	r->rtm_dst_len	= 32;  	r->rtm_src_len	= 0; -	r->rtm_tos	= rt->fl.fl4_tos; +	r->rtm_tos	= fl4->flowi4_tos;  	r->rtm_table	= RT_TABLE_MAIN; -	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); +	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) +		goto nla_put_failure;  	r->rtm_type	= rt->rt_type;  	r->rtm_scope	= RT_SCOPE_UNIVERSE;  	r->rtm_protocol = RTPROT_UNSPEC; @@ -2807,50 +2305,59 @@ static int rt_fill_info(struct net *net,  	if (rt->rt_flags & RTCF_NOTIFY)  		r->rtm_flags |= RTM_F_NOTIFY; -	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); - -	if (rt->fl.fl4_src) { +	if (nla_put_be32(skb, RTA_DST, dst)) +		goto nla_put_failure; +	if (src) {  		r->rtm_src_len = 32; -		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); +		if (nla_put_be32(skb, RTA_SRC, src)) +			goto nla_put_failure;  	} -	if (rt->dst.dev) -		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE -	if (rt->dst.tclassid) -		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); +	if (rt->dst.dev && +	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) +		goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID +	if (rt->dst.tclassid && +	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) +		goto nla_put_failure;  #endif -	if (rt_is_input_route(rt)) -		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); -	else if (rt->rt_src != rt->fl.fl4_src) -		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); +	if (!rt_is_input_route(rt) && +	    fl4->saddr != src) { +		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) +			goto nla_put_failure; +	} +	if (rt->rt_uses_gateway && +	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) +		goto nla_put_failure; -	if (rt->rt_dst != rt->rt_gateway) -		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); +	expires = rt->dst.expires; +	if (expires) { +		unsigned long now = jiffies; -	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) +		if (time_before(now, expires)) +			expires -= now; +		else +			expires = 0; +	} + +	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); +	if (rt->rt_pmtu && expires) +		metrics[RTAX_MTU - 1] = rt->rt_pmtu; +	if (rtnetlink_put_metrics(skb, metrics) < 0)  		goto nla_put_failure; -	if (rt->fl.mark) -		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); +	if (fl4->flowi4_mark && +	    nla_put_u32(skb, RTA_MARK, 
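
ip_route_output_flow() keeps its role of chaining the plain route lookup into xfrm_lookup() when a protocol is set, but now returns the rtable or an ERR_PTR. A hedged sketch of a socket-side caller, modelled on the flowi4_init_output() call that appears in the syncookies hunk further down; the wrapper function itself is hypothetical.

static struct rtable *connect_route_sketch(struct sock *sk, __be32 daddr,
					   __be16 dport, __be16 sport)
{
	struct flowi4 fl4;

	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
			   inet_sk_flowi_flags(sk),
			   daddr, inet_sk(sk)->inet_saddr, dport, sport);
	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));

	/* Returns the route (possibly wrapped in an xfrm bundle) or ERR_PTR. */
	return ip_route_output_flow(sock_net(sk), &fl4, sk);
}
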
fl4->flowi4_mark)) +		goto nla_put_failure;  	error = rt->dst.error; -	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; -	if (rt->peer) { -		inet_peer_refcheck(rt->peer); -		id = atomic_read(&rt->peer->ip_id_count) & 0xffff; -		if (rt->peer->tcp_ts_stamp) { -			ts = rt->peer->tcp_ts; -			tsage = get_seconds() - rt->peer->tcp_ts_stamp; -		} -	}  	if (rt_is_input_route(rt)) {  #ifdef CONFIG_IP_MROUTE -		__be32 dst = rt->rt_dst; -  		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&  		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { -			int err = ipmr_get_route(net, skb, r, nowait); +			int err = ipmr_get_route(net, skb, +						 fl4->saddr, fl4->daddr, +						 r, nowait);  			if (err <= 0) {  				if (!nowait) {  					if (err == 0) @@ -2864,11 +2371,11 @@ static int rt_fill_info(struct net *net,  			}  		} else  #endif -			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); +			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) +				goto nla_put_failure;  	} -	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, -			       expires, error) < 0) +	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)  		goto nla_put_failure;  	return nlmsg_end(skb, nlh); @@ -2878,12 +2385,13 @@ nla_put_failure:  	return -EMSGSIZE;  } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)  {  	struct net *net = sock_net(in_skb->sk);  	struct rtmsg *rtm;  	struct nlattr *tb[RTA_MAX+1];  	struct rtable *rt = NULL; +	struct flowi4 fl4;  	__be32 dst = 0;  	__be32 src = 0;  	u32 iif; @@ -2918,6 +2426,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void  	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;  	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; +	memset(&fl4, 0, sizeof(fl4)); +	fl4.daddr = dst; +	fl4.saddr = src; +	fl4.flowi4_tos = rtm->rtm_tos; +	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; +	fl4.flowi4_mark = mark; +  	if (iif) {  		struct net_device *dev; @@ -2938,14 +2453,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void  		if (err == 0 && rt->dst.error)  			err = -rt->dst.error;  	} else { -		struct flowi fl = { -			.fl4_dst = dst, -			.fl4_src = src, -			.fl4_tos = rtm->rtm_tos, -			.oif = tb[RTA_OIF] ? 
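
rt_fill_info() also switches from the NLA_PUT_* macros, which hid a goto inside, to explicit "if (nla_put_*(...)) goto nla_put_failure;" checks. A stripped-down fill-function sketch of that style; the attributes and values below are placeholders.

static int fill_route_sketch(struct sk_buff *skb, u32 portid, u32 seq,
			     __be32 dst, int oif)
{
	struct nlmsghdr *nlh;
	struct rtmsg *r;

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family  = AF_INET;
	r->rtm_dst_len = 32;

	/* Each nla_put_*() returns non-zero when the skb runs out of tail room. */
	if (nla_put_be32(skb, RTA_DST, dst) ||
	    nla_put_u32(skb, RTA_OIF, oif))
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
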
nla_get_u32(tb[RTA_OIF]) : 0, -			.mark = mark, -		}; -		err = ip_route_output_key(net, &rt, &fl); +		rt = ip_route_output_key(net, &fl4); + +		err = 0; +		if (IS_ERR(rt)) +			err = PTR_ERR(rt);  	}  	if (err) @@ -2955,12 +2467,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void  	if (rtm->rtm_flags & RTM_F_NOTIFY)  		rt->rt_flags |= RTCF_NOTIFY; -	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, +	err = rt_fill_info(net, dst, src, &fl4, skb, +			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,  			   RTM_NEWROUTE, 0, 0);  	if (err <= 0)  		goto errout_free; -	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);  errout:  	return err; @@ -2969,76 +2482,33 @@ errout_free:  	goto errout;  } -int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb) -{ -	struct rtable *rt; -	int h, s_h; -	int idx, s_idx; -	struct net *net; - -	net = sock_net(skb->sk); - -	s_h = cb->args[0]; -	if (s_h < 0) -		s_h = 0; -	s_idx = idx = cb->args[1]; -	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { -		if (!rt_hash_table[h].chain) -			continue; -		rcu_read_lock_bh(); -		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; -		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { -			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) -				continue; -			if (rt_is_expired(rt)) -				continue; -			skb_dst_set_noref(skb, &rt->dst); -			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, -					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, -					 1, NLM_F_MULTI) <= 0) { -				skb_dst_drop(skb); -				rcu_read_unlock_bh(); -				goto done; -			} -			skb_dst_drop(skb); -		} -		rcu_read_unlock_bh(); -	} - -done: -	cb->args[0] = h; -	cb->args[1] = idx; -	return skb->len; -} -  void ip_rt_multicast_event(struct in_device *in_dev)  { -	rt_cache_flush(dev_net(in_dev->dev), 0); +	rt_cache_flush(dev_net(in_dev->dev));  }  #ifdef CONFIG_SYSCTL -static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, +static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly  = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly	= HZ / 2; +static int ip_rt_gc_elasticity __read_mostly	= 8; + +static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,  					void __user *buffer,  					size_t *lenp, loff_t *ppos)  { -	if (write) { -		int flush_delay; -		ctl_table ctl; -		struct net *net; - -		memcpy(&ctl, __ctl, sizeof(ctl)); -		ctl.data = &flush_delay; -		proc_dointvec(&ctl, write, buffer, lenp, ppos); +	struct net *net = (struct net *)__ctl->extra1; -		net = (struct net *)__ctl->extra1; -		rt_cache_flush(net, flush_delay); +	if (write) { +		rt_cache_flush(net); +		fnhe_genid_bump(net);  		return 0;  	}  	return -EINVAL;  } -static ctl_table ipv4_route_table[] = { +static struct ctl_table ipv4_route_table[] = {  	{  		.procname	= "gc_thresh",  		.data		= &ipv4_dst_ops.gc_thresh, @@ -3149,23 +2619,6 @@ static ctl_table ipv4_route_table[] = {  	{ }  }; -static struct ctl_table empty[1]; - -static struct ctl_table ipv4_skeleton[] = -{ -	{ .procname = "route",  -	  .mode = 0555, .child = ipv4_route_table}, -	{ .procname = "neigh",  -	  .mode = 0555, .child = empty}, -	{ } -}; - -static __net_initdata struct ctl_path ipv4_path[] = { -	{ .procname = "net", }, -	{ .procname = "ipv4", }, -	{ }, -}; -  static struct ctl_table ipv4_route_flush_table[] = {  	{  		.procname	= "flush", @@ -3176,13 +2629,6 @@ static struct ctl_table ipv4_route_flush_table[] = {  	{ },  
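
With the hash table gone, flushing the "cache" is a matter of advancing the per-namespace generation counters: every cached rtable stamps rt_genid at creation and is treated as stale once the counter has moved on. The sketch below shows that idea; the claim that rt_cache_flush() reduces to bumping net->ipv4.rt_genid is an assumption, inferred from the calls visible in this diff rather than shown by it.

/* Generation-count invalidation, sketched.  rt->rt_genid and
 * net->ipv4.rt_genid are the real fields; the helpers are illustrative.
 */
static inline bool route_is_stale_sketch(const struct rtable *rt,
					 const struct net *net)
{
	return rt->rt_genid != atomic_read(&net->ipv4.rt_genid);
}

static inline void flush_all_cached_routes_sketch(struct net *net)
{
	atomic_inc(&net->ipv4.rt_genid);	/* every cached route now reads stale */
}
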
}; -static __net_initdata struct ctl_path ipv4_route_path[] = { -	{ .procname = "net", }, -	{ .procname = "ipv4", }, -	{ .procname = "route", }, -	{ }, -}; -  static __net_init int sysctl_route_net_init(struct net *net)  {  	struct ctl_table *tbl; @@ -3192,11 +2638,14 @@ static __net_init int sysctl_route_net_init(struct net *net)  		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);  		if (tbl == NULL)  			goto err_dup; + +		/* Don't export sysctls to unprivileged users */ +		if (net->user_ns != &init_user_ns) +			tbl[0].procname = NULL;  	}  	tbl[0].extra1 = net; -	net->ipv4.route_hdr = -		register_net_sysctl_table(net, ipv4_route_path, tbl); +	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);  	if (net->ipv4.route_hdr == NULL)  		goto err_reg;  	return 0; @@ -3226,8 +2675,10 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {  static __net_init int rt_genid_init(struct net *net)  { -	get_random_bytes(&net->ipv4.rt_genid, -			 sizeof(net->ipv4.rt_genid)); +	atomic_set(&net->ipv4.rt_genid, 0); +	atomic_set(&net->fnhe_genid, 0); +	get_random_bytes(&net->ipv4.dev_addr_genid, +			 sizeof(net->ipv4.dev_addr_genid));  	return 0;  } @@ -3235,26 +2686,46 @@ static __net_initdata struct pernet_operations rt_genid_ops = {  	.init = rt_genid_init,  }; +static int __net_init ipv4_inetpeer_init(struct net *net) +{ +	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); -#ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +	if (!bp) +		return -ENOMEM; +	inet_peer_base_init(bp); +	net->ipv4.peers = bp; +	return 0; +} -static __initdata unsigned long rhash_entries; -static int __init set_rhash_entries(char *str) +static void __net_exit ipv4_inetpeer_exit(struct net *net)  { -	if (!str) -		return 0; -	rhash_entries = simple_strtoul(str, &str, 0); -	return 1; +	struct inet_peer_base *bp = net->ipv4.peers; + +	net->ipv4.peers = NULL; +	inetpeer_invalidate_tree(bp); +	kfree(bp);  } -__setup("rhash_entries=", set_rhash_entries); + +static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { +	.init	=	ipv4_inetpeer_init, +	.exit	=	ipv4_inetpeer_exit, +}; + +#ifdef CONFIG_IP_ROUTE_CLASSID +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; +#endif /* CONFIG_IP_ROUTE_CLASSID */  int __init ip_rt_init(void)  {  	int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE +	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); +	if (!ip_idents) +		panic("IP: failed to allocate ip_idents\n"); + +	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + +#ifdef CONFIG_IP_ROUTE_CLASSID  	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));  	if (!ip_rt_acct)  		panic("IP: failed to allocate ip_rt_acct\n"); @@ -3272,45 +2743,25 @@ int __init ip_rt_init(void)  	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)  		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); -	rt_hash_table = (struct rt_hash_bucket *) -		alloc_large_system_hash("IP route cache", -					sizeof(struct rt_hash_bucket), -					rhash_entries, -					(totalram_pages >= 128 * 1024) ? -					15 : 17, -					0, -					&rt_hash_log, -					&rt_hash_mask, -					rhash_entries ? 
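
The sysctl registration hunks drop the ctl_path arrays in favour of register_net_sysctl() with a plain path string. A minimal per-netns registration sketch; the table, value and path below are placeholders, not part of the patch.

static int demo_value_sketch;

static struct ctl_table demo_table_sketch[] = {
	{
		.procname	= "demo_value",
		.data		= &demo_value_sketch,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static int __net_init demo_sysctl_init_sketch(struct net *net)
{
	struct ctl_table_header *hdr;

	/* One path string instead of the old { "net" }, { "ipv4" }, ... array. */
	hdr = register_net_sysctl(net, "net/ipv4/demo", demo_table_sketch);
	return hdr ? 0 : -ENOMEM;
}

As sysctl_route_net_init() above shows, a real per-netns table is usually kmemdup()ed first so each namespace gets its own .data pointers; the sketch skips that for brevity.
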
0 : 512 * 1024); -	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); -	rt_hash_lock_init(); - -	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); -	ip_rt_max_size = (rt_hash_mask + 1) * 16; +	ipv4_dst_ops.gc_thresh = ~0; +	ip_rt_max_size = INT_MAX;  	devinet_init();  	ip_fib_init(); -	/* All the timers, started at system startup tend -	   to synchronize. Perturb it a bit. -	 */ -	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); -	expires_ljiffies = jiffies; -	schedule_delayed_work(&expires_work, -		net_random() % ip_rt_gc_interval + ip_rt_gc_interval); -  	if (ip_rt_proc_init()) -		printk(KERN_ERR "Unable to create route proc files\n"); +		pr_err("Unable to create route proc files\n");  #ifdef CONFIG_XFRM  	xfrm_init(); -	xfrm4_init(ip_rt_max_size); +	xfrm4_init();  #endif -	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); +	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);  #ifdef CONFIG_SYSCTL  	register_pernet_subsys(&sysctl_route_ops);  #endif  	register_pernet_subsys(&rt_genid_ops); +	register_pernet_subsys(&ipv4_inetpeer_ops);  	return rc;  } @@ -3321,6 +2772,6 @@ int __init ip_rt_init(void)   */  void __init ip_static_sysctl_init(void)  { -	register_sysctl_paths(ipv4_path, ipv4_skeleton); +	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);  }  #endif diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 47519205a01..c86624b36a6 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -15,6 +15,7 @@  #include <linux/random.h>  #include <linux/cryptohash.h>  #include <linux/kernel.h> +#include <linux/export.h>  #include <net/tcp.h>  #include <net/route.h> @@ -24,15 +25,7 @@  extern int sysctl_tcp_syncookies; -__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; -EXPORT_SYMBOL(syncookie_secret); - -static __init int init_syncookies(void) -{ -	get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); -	return 0; -} -__initcall(init_syncookies); +static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];  #define COOKIEBITS 24	/* Upper bits store count */  #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -43,8 +36,11 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],  static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,  		       u32 count, int c)  { -	__u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); +	__u32 *tmp; + +	net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); +	tmp  = __get_cpu_var(ipv4_cookie_scratch);  	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));  	tmp[0] = (__force u32)saddr;  	tmp[1] = (__force u32)daddr; @@ -88,8 +84,7 @@ __u32 cookie_init_timestamp(struct request_sock *req)  static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, -				   __be16 dport, __u32 sseq, __u32 count, -				   __u32 data) +				   __be16 dport, __u32 sseq, __u32 data)  {  	/*  	 * Compute the secure sequence number. @@ -101,7 +96,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,  	 * As an extra hack, we add a small "data" value that encodes the  	 * MSS into the second hash value.  	 */ - +	u32 count = tcp_cookie_time();  	return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +  		sseq + (count << COOKIEBITS) +  		((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) @@ -113,22 +108,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,   * If the syncookie is bad, the data returned will be out of   * range.  
This must be checked by the caller.   * - * The count value used to generate the cookie must be within - * "maxdiff" if the current (passed-in) "count".  The return value - * is (__u32)-1 if this test fails. + * The count value used to generate the cookie must be less than + * MAX_SYNCOOKIE_AGE minutes in the past. + * The return value (__u32)-1 if this test fails.   */  static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, -				  __be16 sport, __be16 dport, __u32 sseq, -				  __u32 count, __u32 maxdiff) +				  __be16 sport, __be16 dport, __u32 sseq)  { -	__u32 diff; +	u32 diff, count = tcp_cookie_time();  	/* Strip away the layers from the cookie */  	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;  	/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ -	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); -	if (diff >= maxdiff) +	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); +	if (diff >= MAX_SYNCOOKIE_AGE)  		return (__u32)-1;  	return (cookie - @@ -137,72 +131,70 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,  }  /* - * MSS Values are taken from the 2009 paper - * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: - *  - values 1440 to 1460 accounted for 80% of observed mss values - *  - values outside the 536-1460 range are rare (<0.2%). + * MSS Values are chosen based on the 2011 paper + * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. + * Values .. + *  .. lower than 536 are rare (< 0.2%) + *  .. between 537 and 1299 account for less than < 1.5% of observed values + *  .. in the 1300-1349 range account for about 15 to 20% of observed mss values + *  .. exceeding 1460 are very rare (< 0.04%)   * - * Table must be sorted. + *  1460 is the single most frequently announced mss value (30 to 46% depending + *  on monitor location).  Table must be sorted.   */  static __u16 const msstab[] = { -	64, -	512,  	536, -	1024, -	1440, +	1300, +	1440,	/* 1440, 1452: PPPoE */  	1460, -	4312, -	8960,  };  /*   * Generate a syncookie.  mssp points to the mss, which is returned   * rounded down to the value encoded in the cookie.   */ -__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, +			      u16 *mssp)  { -	const struct iphdr *iph = ip_hdr(skb); -	const struct tcphdr *th = tcp_hdr(skb);  	int mssind;  	const __u16 mss = *mssp; -	tcp_synq_overflow(sk); -  	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)  		if (mss >= msstab[mssind])  			break;  	*mssp = msstab[mssind]; -	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); -  	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,  				     th->source, th->dest, ntohl(th->seq), -				     jiffies / (HZ * 60), mssind); +				     mssind); +} +EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); + +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ +	const struct iphdr *iph = ip_hdr(skb); +	const struct tcphdr *th = tcp_hdr(skb); + +	tcp_synq_overflow(sk); +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + +	return __cookie_v4_init_sequence(iph, th, mssp);  } -/* - * This (misnamed) value is the age of syncookie which is permitted. - * Its ideal value should be dependent on TCP_TIMEOUT_INIT and - * sysctl_tcp_retries1. It's a rather complicated formula (exponential - * backoff) to compute at runtime so it's currently hardcoded here. 
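
The cookie stores only an index into msstab, so the MSS a client advertised is rounded down to the nearest table entry when the cookie is later decoded (for example 1452 encodes as index 2 and comes back as 1440). A small sketch of the index selection used by __cookie_v4_init_sequence(); the table is the new four-entry one from the hunk.

static const u16 msstab_sketch[] = { 536, 1300, 1440, 1460 };

static int mss_to_index_sketch(u16 mss)
{
	int mssind;

	/* Walk down to the largest entry that does not exceed the advertised MSS. */
	for (mssind = ARRAY_SIZE(msstab_sketch) - 1; mssind; mssind--)
		if (mss >= msstab_sketch[mssind])
			break;

	return mssind;	/* 1452 -> 2 (decodes as 1440); anything below 536 -> 0 */
}
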
- */ -#define COUNTER_TRIES 4  /*   * Check if a ack sequence number is a valid syncookie.   * Return the decoded mss if it is, or 0 if not.   */ -static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, +		      u32 cookie)  { -	const struct iphdr *iph = ip_hdr(skb); -	const struct tcphdr *th = tcp_hdr(skb);  	__u32 seq = ntohl(th->seq) - 1;  	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, -					    th->source, th->dest, seq, -					    jiffies / (HZ * 60), -					    COUNTER_TRIES); +					    th->source, th->dest, seq);  	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;  } +EXPORT_SYMBOL_GPL(__cookie_v4_check);  static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,  					   struct request_sock *req, @@ -231,7 +223,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,   *   * return false if we decode an option that should not be.   */ -bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) +bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, +			struct net *net, bool *ecn_ok)  {  	/* echoed timestamp, lowest bits contain options */  	u32 options = tcp_opt->rcv_tsecr & TSMASK; @@ -244,9 +237,9 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)  	if (!sysctl_tcp_timestamps)  		return false; -	tcp_opt->sack_ok = (options >> 4) & 0x1; +	tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;  	*ecn_ok = (options >> 5) & 1; -	if (*ecn_ok && !sysctl_tcp_ecn) +	if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)  		return false;  	if (tcp_opt->sack_ok && !sysctl_tcp_sack) @@ -265,7 +258,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  			     struct ip_options *opt)  {  	struct tcp_options_received tcp_opt; -	u8 *hash_location;  	struct inet_request_sock *ireq;  	struct tcp_request_sock *treq;  	struct tcp_sock *tp = tcp_sk(sk); @@ -276,13 +268,14 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	int mss;  	struct rtable *rt;  	__u8 rcv_wscale; -	bool ecn_ok; +	bool ecn_ok = false; +	struct flowi4 fl4;  	if (!sysctl_tcp_syncookies || !th->ack || th->rst)  		goto out;  	if (tcp_synq_no_recent_overflow(sk) || -	    (mss = cookie_check(skb, cookie)) == 0) { +	    (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) {  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);  		goto out;  	} @@ -291,9 +284,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	/* check for timestamp cookie support */  	memset(&tcp_opt, 0, sizeof(tcp_opt)); -	tcp_parse_options(skb, &tcp_opt, &hash_location, 0); +	tcp_parse_options(skb, &tcp_opt, 0, NULL); -	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) +	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))  		goto out;  	ret = NULL; @@ -306,25 +299,28 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	treq->rcv_isn		= ntohl(th->seq) - 1;  	treq->snt_isn		= cookie;  	req->mss		= mss; -	ireq->loc_port		= th->dest; -	ireq->rmt_port		= th->source; -	ireq->loc_addr		= ip_hdr(skb)->daddr; -	ireq->rmt_addr		= ip_hdr(skb)->saddr; +	ireq->ir_num		= ntohs(th->dest); +	ireq->ir_rmt_port	= th->source; +	ireq->ir_loc_addr	= ip_hdr(skb)->daddr; +	ireq->ir_rmt_addr	= ip_hdr(skb)->saddr; +	ireq->ir_mark		= inet_request_mark(sk, skb);  	ireq->ecn_ok		= ecn_ok;  	ireq->snd_wscale	= tcp_opt.snd_wscale;  	ireq->sack_ok		= tcp_opt.sack_ok;  	
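
Cookie ageing also changes: instead of the hard-coded COUNTER_TRIES window, a cookie is accepted while the minute counter it encodes is less than MAX_SYNCOOKIE_AGE behind the current tcp_cookie_time(). A sketch of that freshness test; COOKIEBITS matches the definition above, while the age limit's value is not shown in this hunk, so the constant below is a placeholder.

#define COOKIEBITS_SKETCH	24	/* matches COOKIEBITS above */
#define MAX_AGE_SKETCH		2	/* stands in for MAX_SYNCOOKIE_AGE */

static bool cookie_fresh_sketch(u32 minted_count, u32 now_count)
{
	/* Subtract modulo 2^(32 - COOKIEBITS): only that many counter bits
	 * survive in the cookie, exactly as in check_tcp_syn_cookie().
	 */
	u32 diff = (now_count - minted_count) & ((u32)-1 >> COOKIEBITS_SKETCH);

	return diff < MAX_AGE_SKETCH;
}
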
ireq->wscale_ok		= tcp_opt.wscale_ok;  	ireq->tstamp_ok		= tcp_opt.saw_tstamp;  	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; +	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; +	treq->listener		= NULL;  	/* We throwed the options of the initial SYN away, so we hope  	 * the ACK carries the same options again (see RFC1122 4.2.3.8)  	 */  	if (opt && opt->optlen) { -		int opt_size = sizeof(struct ip_options) + opt->optlen; +		int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;  		ireq->opt = kmalloc(opt_size, GFP_ATOMIC); -		if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) { +		if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {  			kfree(ireq->opt);  			ireq->opt = NULL;  		} @@ -336,7 +332,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	}  	req->expires	= 0UL; -	req->retrans	= 0; +	req->num_retrans = 0;  	/*  	 * We need to lookup the route here to get at the correct @@ -344,21 +340,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	 * hasn't changed since we received the original syn, but I see  	 * no easy way to do this.  	 */ -	{ -		struct flowi fl = { .mark = sk->sk_mark, -				    .fl4_dst = ((opt && opt->srr) ? -						opt->faddr : ireq->rmt_addr), -				    .fl4_src = ireq->loc_addr, -				    .fl4_tos = RT_CONN_FLAGS(sk), -				    .proto = IPPROTO_TCP, -				    .flags = inet_sk_flowi_flags(sk), -				    .fl_ip_sport = th->dest, -				    .fl_ip_dport = th->source }; -		security_req_classify_flow(req, &fl); -		if (ip_route_output_key(sock_net(sk), &rt, &fl)) { -			reqsk_free(req); -			goto out; -		} +	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, +			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, +			   inet_sk_flowi_flags(sk), +			   (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr, +			   ireq->ir_loc_addr, th->source, th->dest); +	security_req_classify_flow(req, flowi4_to_flowi(&fl4)); +	rt = ip_route_output_key(sock_net(sk), &fl4); +	if (IS_ERR(rt)) { +		reqsk_free(req); +		goto out;  	}  	/* Try to redo what tcp_v4_send_synack did. 
*/ @@ -372,5 +363,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,  	ireq->rcv_wscale  = rcv_wscale;  	ret = get_cookie_sock(sk, skb, req, &rt->dst); +	/* ip_queue_xmit() depends on our flow being setup +	 * Normal sockets get it right from inet_csk_route_child_sock() +	 */ +	if (ret) +		inet_sk(ret)->cork.fl.u.ip4 = fl4;  out:	return ret;  } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e91911d7aae..79a007c5255 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -13,6 +13,8 @@  #include <linux/seqlock.h>  #include <linux/init.h>  #include <linux/slab.h> +#include <linux/nsproxy.h> +#include <linux/swap.h>  #include <net/snmp.h>  #include <net/icmp.h>  #include <net/ip.h> @@ -21,29 +23,44 @@  #include <net/udp.h>  #include <net/cipso_ipv4.h>  #include <net/inet_frag.h> +#include <net/ping.h> +#include <net/tcp_memcontrol.h>  static int zero; +static int one = 1; +static int four = 4; +static int gso_max_segs = GSO_MAX_SEGS;  static int tcp_retr1_max = 255;  static int ip_local_port_range_min[] = { 1, 1 };  static int ip_local_port_range_max[] = { 65535, 65535 }; +static int tcp_adv_win_scale_min = -31; +static int tcp_adv_win_scale_max = 31; +static int ip_ttl_min = 1; +static int ip_ttl_max = 255; +static int tcp_syn_retries_min = 1; +static int tcp_syn_retries_max = MAX_TCP_SYNCNT; +static int ip_ping_group_range_min[] = { 0, 0 }; +static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };  /* Update system visible IP port range */ -static void set_local_port_range(int range[2]) +static void set_local_port_range(struct net *net, int range[2])  { -	write_seqlock(&sysctl_local_ports.lock); -	sysctl_local_ports.range[0] = range[0]; -	sysctl_local_ports.range[1] = range[1]; -	write_sequnlock(&sysctl_local_ports.lock); +	write_seqlock(&net->ipv4.ip_local_ports.lock); +	net->ipv4.ip_local_ports.range[0] = range[0]; +	net->ipv4.ip_local_ports.range[1] = range[1]; +	write_sequnlock(&net->ipv4.ip_local_ports.lock);  }  /* Validate changes from /proc interface. 
*/ -static int ipv4_local_port_range(ctl_table *table, int write, +static int ipv4_local_port_range(struct ctl_table *table, int write,  				 void __user *buffer,  				 size_t *lenp, loff_t *ppos)  { +	struct net *net = +		container_of(table->data, struct net, ipv4.ip_local_ports.range);  	int ret;  	int range[2]; -	ctl_table tmp = { +	struct ctl_table tmp = {  		.data = &range,  		.maxlen = sizeof(range),  		.mode = table->mode, @@ -51,24 +68,88 @@ static int ipv4_local_port_range(ctl_table *table, int write,  		.extra2 = &ip_local_port_range_max,  	}; -	inet_get_local_port_range(range, range + 1); +	inet_get_local_port_range(net, &range[0], &range[1]); +  	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);  	if (write && ret == 0) {  		if (range[1] < range[0])  			ret = -EINVAL;  		else -			set_local_port_range(range); +			set_local_port_range(net, range); +	} + +	return ret; +} + + +static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) +{ +	kgid_t *data = table->data; +	struct net *net = +		container_of(table->data, struct net, ipv4.ping_group_range.range); +	unsigned int seq; +	do { +		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); + +		*low = data[0]; +		*high = data[1]; +	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); +} + +/* Update system visible IP port range */ +static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) +{ +	kgid_t *data = table->data; +	struct net *net = +		container_of(table->data, struct net, ipv4.ping_group_range.range); +	write_seqlock(&net->ipv4.ip_local_ports.lock); +	data[0] = low; +	data[1] = high; +	write_sequnlock(&net->ipv4.ip_local_ports.lock); +} + +/* Validate changes from /proc interface. */ +static int ipv4_ping_group_range(struct ctl_table *table, int write, +				 void __user *buffer, +				 size_t *lenp, loff_t *ppos) +{ +	struct user_namespace *user_ns = current_user_ns(); +	int ret; +	gid_t urange[2]; +	kgid_t low, high; +	struct ctl_table tmp = { +		.data = &urange, +		.maxlen = sizeof(urange), +		.mode = table->mode, +		.extra1 = &ip_ping_group_range_min, +		.extra2 = &ip_ping_group_range_max, +	}; + +	inet_get_ping_group_range_table(table, &low, &high); +	urange[0] = from_kgid_munged(user_ns, low); +	urange[1] = from_kgid_munged(user_ns, high); +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + +	if (write && ret == 0) { +		low = make_kgid(user_ns, urange[0]); +		high = make_kgid(user_ns, urange[1]); +		if (!gid_valid(low) || !gid_valid(high) || +		    (urange[1] < urange[0]) || gid_lt(high, low)) { +			low = make_kgid(&init_user_ns, 1); +			high = make_kgid(&init_user_ns, 0); +		} +		set_ping_group_range(table, low, high);  	}  	return ret;  } -static int proc_tcp_congestion_control(ctl_table *ctl, int write, +static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,  				       void __user *buffer, size_t *lenp, loff_t *ppos)  {  	char val[TCP_CA_NAME_MAX]; -	ctl_table tbl = { +	struct ctl_table tbl = {  		.data = val,  		.maxlen = TCP_CA_NAME_MAX,  	}; @@ -82,12 +163,12 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write,  	return ret;  } -static int proc_tcp_available_congestion_control(ctl_table *ctl, +static int proc_tcp_available_congestion_control(struct ctl_table *ctl,  						 int write,  						 void __user *buffer, size_t *lenp,  						 loff_t *ppos)  { -	ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; +	struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };  	int ret;  	tbl.data = 
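
Both the local port range and the new ping group range are per-namespace values guarded by a seqlock, so readers spin on a sequence number instead of taking a lock. A compact reader/writer sketch of the pattern used by set_local_port_range() and inet_get_ping_group_range_table() in these hunks; the wrapper names are illustrative, the lock and range fields are the real ones.

static void read_port_range_sketch(struct net *net, int *low, int *high)
{
	unsigned int seq;

	do {
		seq   = read_seqbegin(&net->ipv4.ip_local_ports.lock);
		*low  = net->ipv4.ip_local_ports.range[0];
		*high = net->ipv4.ip_local_ports.range[1];
	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}

static void write_port_range_sketch(struct net *net, int low, int high)
{
	write_seqlock(&net->ipv4.ip_local_ports.lock);
	net->ipv4.ip_local_ports.range[0] = low;
	net->ipv4.ip_local_ports.range[1] = high;
	write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
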
kmalloc(tbl.maxlen, GFP_USER); @@ -99,12 +180,12 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,  	return ret;  } -static int proc_allowed_congestion_control(ctl_table *ctl, +static int proc_allowed_congestion_control(struct ctl_table *ctl,  					   int write,  					   void __user *buffer, size_t *lenp,  					   loff_t *ppos)  { -	ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; +	struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };  	int ret;  	tbl.data = kmalloc(tbl.maxlen, GFP_USER); @@ -119,6 +200,53 @@ static int proc_allowed_congestion_control(ctl_table *ctl,  	return ret;  } +static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +				 void __user *buffer, size_t *lenp, +				 loff_t *ppos) +{ +	struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; +	struct tcp_fastopen_context *ctxt; +	int ret; +	u32  user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ + +	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); +	if (!tbl.data) +		return -ENOMEM; + +	rcu_read_lock(); +	ctxt = rcu_dereference(tcp_fastopen_ctx); +	if (ctxt) +		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); +	else +		memset(user_key, 0, sizeof(user_key)); +	rcu_read_unlock(); + +	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", +		user_key[0], user_key[1], user_key[2], user_key[3]); +	ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + +	if (write && ret == 0) { +		if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1, +			   user_key + 2, user_key + 3) != 4) { +			ret = -EINVAL; +			goto bad_key; +		} +		/* Generate a dummy secret but don't publish it. This +		 * is needed so we don't regenerate a new key on the +		 * first invocation of tcp_fastopen_cookie_gen +		 */ +		tcp_fastopen_init_key_once(false); +		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); +	} + +bad_key: +	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", +	       user_key[0], user_key[1], user_key[2], user_key[3], +	       (char *)tbl.data, ret); +	kfree(tbl.data); +	return ret; +} +  static struct ctl_table ipv4_table[] = {  	{  		.procname	= "tcp_timestamps", @@ -153,15 +281,9 @@ static struct ctl_table ipv4_table[] = {  		.data		= &sysctl_ip_default_ttl,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= ipv4_doint_and_flush, -		.extra2		= &init_net, -	}, -	{ -		.procname	= "ip_no_pmtu_disc", -		.data		= &ipv4_config.no_pmtu_disc, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &ip_ttl_min, +		.extra2		= &ip_ttl_max,  	},  	{  		.procname	= "ip_nonlocal_bind", @@ -175,7 +297,9 @@ static struct ctl_table ipv4_table[] = {  		.data		= &sysctl_tcp_syn_retries,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &tcp_syn_retries_min, +		.extra2		= &tcp_syn_retries_max  	},  	{  		.procname	= "tcp_synack_retries", @@ -199,6 +323,13 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ +		.procname	= "ip_early_demux", +		.data		= &sysctl_ip_early_demux, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{  		.procname	= "ip_dynaddr",  		.data		= &sysctl_ip_dynaddr,  		.maxlen		= sizeof(int), @@ -258,6 +389,19 @@ static struct ctl_table ipv4_table[] = {  	},  #endif  	{ +		.procname	= "tcp_fastopen", +		.data		= &sysctl_tcp_fastopen, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		
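
Several of these handlers follow the same idiom: build a throwaway struct ctl_table whose .data points at a local scratch buffer, let proc_dostring()/proc_dointvec() perform the user-space copy, then validate and commit the value. A hypothetical handler sketching that shape; the value name and the validation rule are made up.

static int demo_string_handler_sketch(struct ctl_table *ctl, int write,
				      void __user *buffer, size_t *lenp,
				      loff_t *ppos)
{
	char val[16];
	struct ctl_table tbl = {
		.data	= val,
		.maxlen	= sizeof(val),
	};
	int ret;

	snprintf(val, sizeof(val), "current");	/* what a reader should see */

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0) {
		/* Validate the parsed string before committing it anywhere. */
		if (strcmp(val, "allowed") != 0)
			ret = -EINVAL;
	}
	return ret;
}
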
.procname	= "tcp_fastopen_key", +		.mode		= 0600, +		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), +		.proc_handler	= proc_tcp_fastopen_key, +	}, +	{  		.procname	= "tcp_tw_recycle",  		.data		= &tcp_death_row.sysctl_tw_recycle,  		.maxlen		= sizeof(int), @@ -293,29 +437,12 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "ip_local_port_range", -		.data		= &sysctl_local_ports.range, -		.maxlen		= sizeof(sysctl_local_ports.range), -		.mode		= 0644, -		.proc_handler	= ipv4_local_port_range, -	}, -	{ -		.procname	= "ip_local_reserved_ports", -		.data		= NULL, /* initialized in sysctl_ipv4_init */ -		.maxlen		= 65536, -		.mode		= 0644, -		.proc_handler	= proc_do_large_bitmap, -	}, -#ifdef CONFIG_IP_MULTICAST -	{  		.procname	= "igmp_max_memberships",  		.data		= &sysctl_igmp_max_memberships,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec  	}, - -#endif  	{  		.procname	= "igmp_max_msf",  		.data		= &sysctl_igmp_max_msf, @@ -345,20 +472,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec_jiffies,  	},  	{ -		.procname	= "inet_peer_gc_mintime", -		.data		= &inet_peer_gc_mintime, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_jiffies, -	}, -	{ -		.procname	= "inet_peer_gc_maxtime", -		.data		= &inet_peer_gc_maxtime, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_jiffies, -	}, -	{  		.procname	= "tcp_orphan_retries",  		.data		= &sysctl_tcp_orphan_retries,  		.maxlen		= sizeof(int), @@ -380,13 +493,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "tcp_ecn", -		.data		= &sysctl_tcp_ecn, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{  		.procname	= "tcp_dsack",  		.data		= &sysctl_tcp_dsack,  		.maxlen		= sizeof(int), @@ -395,24 +501,33 @@ static struct ctl_table ipv4_table[] = {  	},  	{  		.procname	= "tcp_mem", -		.data		= &sysctl_tcp_mem,  		.maxlen		= sizeof(sysctl_tcp_mem), +		.data		= &sysctl_tcp_mem,  		.mode		= 0644, -		.proc_handler	= proc_doulongvec_minmax +		.proc_handler	= proc_doulongvec_minmax,  	},  	{  		.procname	= "tcp_wmem",  		.data		= &sysctl_tcp_wmem,  		.maxlen		= sizeof(sysctl_tcp_wmem),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one, +	}, +	{ +		.procname	= "tcp_notsent_lowat", +		.data		= &sysctl_tcp_notsent_lowat, +		.maxlen		= sizeof(sysctl_tcp_notsent_lowat), +		.mode		= 0644, +		.proc_handler	= proc_dointvec,  	},  	{  		.procname	= "tcp_rmem",  		.data		= &sysctl_tcp_rmem,  		.maxlen		= sizeof(sysctl_tcp_rmem),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &one,  	},  	{  		.procname	= "tcp_app_win", @@ -426,7 +541,9 @@ static struct ctl_table ipv4_table[] = {  		.data		= &sysctl_tcp_adv_win_scale,  		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_dointvec +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &tcp_adv_win_scale_min, +		.extra2		= &tcp_adv_win_scale_max,  	},  	{  		.procname	= "tcp_tw_reuse", @@ -443,13 +560,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "tcp_frto_response", -		.data		= &sysctl_tcp_frto_response, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{  		.procname	= "tcp_low_latency",  		.data		= &sysctl_tcp_low_latency,  		.maxlen		= 
sizeof(int), @@ -484,13 +594,6 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler	= proc_tcp_congestion_control,  	},  	{ -		.procname	= "tcp_abc", -		.data		= &sysctl_tcp_abc, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{  		.procname	= "tcp_mtu_probing",  		.data		= &sysctl_tcp_mtu_probing,  		.maxlen		= sizeof(int), @@ -511,6 +614,20 @@ static struct ctl_table ipv4_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec  	}, +	{ +		.procname	= "tcp_limit_output_bytes", +		.data		= &sysctl_tcp_limit_output_bytes, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= "tcp_challenge_ack_limit", +		.data		= &sysctl_tcp_challenge_ack_limit, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	},  #ifdef CONFIG_NET_DMA  	{  		.procname	= "tcp_dma_copybreak", @@ -570,27 +687,13 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler   = proc_allowed_congestion_control,  	},  	{ -		.procname	= "tcp_max_ssthresh", -		.data		= &sysctl_tcp_max_ssthresh, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{ -		.procname	= "tcp_cookie_size", -		.data		= &sysctl_tcp_cookie_size, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec -	}, -	{  		.procname       = "tcp_thin_linear_timeouts",  		.data           = &sysctl_tcp_thin_linear_timeouts,  		.maxlen         = sizeof(int),  		.mode           = 0644,  		.proc_handler   = proc_dointvec  	}, -        { +	{  		.procname       = "tcp_thin_dupack",  		.data           = &sysctl_tcp_thin_dupack,  		.maxlen         = sizeof(int), @@ -598,6 +701,33 @@ static struct ctl_table ipv4_table[] = {  		.proc_handler   = proc_dointvec  	},  	{ +		.procname	= "tcp_early_retrans", +		.data		= &sysctl_tcp_early_retrans, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &four, +	}, +	{ +		.procname	= "tcp_min_tso_segs", +		.data		= &sysctl_tcp_min_tso_segs, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &gso_max_segs, +	}, +	{ +		.procname	= "tcp_autocorking", +		.data		= &sysctl_tcp_autocorking, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +	{  		.procname	= "udp_mem",  		.data		= &sysctl_udp_mem,  		.maxlen		= sizeof(sysctl_udp_mem), @@ -610,7 +740,7 @@ static struct ctl_table ipv4_table[] = {  		.maxlen		= sizeof(sysctl_udp_rmem_min),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &zero +		.extra1		= &one  	},  	{  		.procname	= "udp_wmem_min", @@ -618,7 +748,7 @@ static struct ctl_table ipv4_table[] = {  		.maxlen		= sizeof(sysctl_udp_wmem_min),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &zero +		.extra1		= &one  	},  	{ }  }; @@ -667,57 +797,93 @@ static struct ctl_table ipv4_net_table[] = {  		.proc_handler	= proc_dointvec  	},  	{ -		.procname	= "rt_cache_rebuild_count", -		.data		= &init_net.ipv4.sysctl_rt_cache_rebuild_count, +		.procname	= "ping_group_range", +		.data		= &init_net.ipv4.ping_group_range.range, +		.maxlen		= sizeof(gid_t)*2, +		.mode		= 0644, +		.proc_handler	= ipv4_ping_group_range, +	}, +	{ +		.procname	= "tcp_ecn", +		.data		= &init_net.ipv4.sysctl_tcp_ecn, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec +	}, +	{ +		.procname	= 
"ip_local_port_range", +		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range), +		.data		= &init_net.ipv4.ip_local_ports.range, +		.mode		= 0644, +		.proc_handler	= ipv4_local_port_range, +	}, +	{ +		.procname	= "ip_local_reserved_ports", +		.data		= &init_net.ipv4.sysctl_local_reserved_ports, +		.maxlen		= 65536, +		.mode		= 0644, +		.proc_handler	= proc_do_large_bitmap, +	}, +	{ +		.procname	= "ip_no_pmtu_disc", +		.data		= &init_net.ipv4.sysctl_ip_no_pmtu_disc,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec  	}, +	{ +		.procname	= "ip_forward_use_pmtu", +		.data		= &init_net.ipv4.sysctl_ip_fwd_use_pmtu, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "fwmark_reflect", +		.data		= &init_net.ipv4.sysctl_fwmark_reflect, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +	{ +		.procname	= "tcp_fwmark_accept", +		.data		= &init_net.ipv4.sysctl_tcp_fwmark_accept, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	},  	{ }  }; -struct ctl_path net_ipv4_ctl_path[] = { -	{ .procname = "net", }, -	{ .procname = "ipv4", }, -	{ }, -}; -EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); -  static __net_init int ipv4_sysctl_init_net(struct net *net)  {  	struct ctl_table *table;  	table = ipv4_net_table;  	if (!net_eq(net, &init_net)) { +		int i; +  		table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);  		if (table == NULL)  			goto err_alloc; -		table[0].data = -			&net->ipv4.sysctl_icmp_echo_ignore_all; -		table[1].data = -			&net->ipv4.sysctl_icmp_echo_ignore_broadcasts; -		table[2].data = -			&net->ipv4.sysctl_icmp_ignore_bogus_error_responses; -		table[3].data = -			&net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr; -		table[4].data = -			&net->ipv4.sysctl_icmp_ratelimit; -		table[5].data = -			&net->ipv4.sysctl_icmp_ratemask; -		table[6].data = -			&net->ipv4.sysctl_rt_cache_rebuild_count; +		/* Update the variables to point into the current struct net */ +		for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) +			table[i].data += (void *)net - (void *)&init_net;  	} -	net->ipv4.sysctl_rt_cache_rebuild_count = 4; - -	net->ipv4.ipv4_hdr = register_net_sysctl_table(net, -			net_ipv4_ctl_path, table); +	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);  	if (net->ipv4.ipv4_hdr == NULL)  		goto err_reg; +	net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); +	if (!net->ipv4.sysctl_local_reserved_ports) +		goto err_ports; +  	return 0; +err_ports: +	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);  err_reg:  	if (!net_eq(net, &init_net))  		kfree(table); @@ -729,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)  {  	struct ctl_table *table; +	kfree(net->ipv4.sysctl_local_reserved_ports);  	table = net->ipv4.ipv4_hdr->ctl_table_arg;  	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);  	kfree(table); @@ -742,23 +909,13 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {  static __init int sysctl_ipv4_init(void)  {  	struct ctl_table_header *hdr; -	struct ctl_table *i; - -	for (i = ipv4_table; i->procname; i++) { -		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { -			i->data = sysctl_local_reserved_ports; -			break; -		} -	} -	if (!i->procname) -		return -EINVAL; -	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); +	hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);  	if (hdr == NULL)  		return -ENOMEM;  	if (register_pernet_subsys(&ipv4_sysctl_ops)) { -	
	unregister_sysctl_table(hdr); +		unregister_net_sysctl_table(hdr);  		return -ENOMEM;  	} diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2bb46d55f40..9d2118e5fbc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -245,6 +245,8 @@   *	TCP_CLOSE		socket is finished   */ +#define pr_fmt(fmt) "TCP: " fmt +  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/types.h> @@ -268,6 +270,7 @@  #include <linux/slab.h>  #include <net/icmp.h> +#include <net/inet_common.h>  #include <net/tcp.h>  #include <net/xfrm.h>  #include <net/ip.h> @@ -276,9 +279,14 @@  #include <asm/uaccess.h>  #include <asm/ioctls.h> +#include <net/busy_poll.h>  int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; +int sysctl_tcp_min_tso_segs __read_mostly = 2; + +int sysctl_tcp_autocorking __read_mostly = 1; +  struct percpu_counter tcp_orphan_count;  EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -363,6 +371,61 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)  	return period;  } +/* Address-family independent initialization for a tcp_sock. + * + * NOTE: A lot of things set to zero explicitly by call to + *       sk_alloc() so need not be done here. + */ +void tcp_init_sock(struct sock *sk) +{ +	struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk); + +	__skb_queue_head_init(&tp->out_of_order_queue); +	tcp_init_xmit_timers(sk); +	tcp_prequeue_init(tp); +	INIT_LIST_HEAD(&tp->tsq_node); + +	icsk->icsk_rto = TCP_TIMEOUT_INIT; +	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + +	/* So many TCP implementations out there (incorrectly) count the +	 * initial SYN frame in their delayed-ACK and congestion control +	 * algorithms that we must have the following bandaid to talk +	 * efficiently to them.  -DaveM +	 */ +	tp->snd_cwnd = TCP_INIT_CWND; + +	/* See draft-stevens-tcpca-spec-01 for discussion of the +	 * initialization of these values. +	 */ +	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +	tp->snd_cwnd_clamp = ~0; +	tp->mss_cache = TCP_MSS_DEFAULT; + +	tp->reordering = sysctl_tcp_reordering; +	tcp_enable_early_retrans(tp); +	icsk->icsk_ca_ops = &tcp_init_congestion_ops; + +	tp->tsoffset = 0; + +	sk->sk_state = TCP_CLOSE; + +	sk->sk_write_space = sk_stream_write_space; +	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + +	icsk->icsk_sync_mss = tcp_sync_mss; + +	sk->sk_sndbuf = sysctl_tcp_wmem[1]; +	sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + +	local_bh_disable(); +	sock_update_memcg(sk); +	sk_sockets_allocated_inc(sk); +	local_bh_enable(); +} +EXPORT_SYMBOL(tcp_init_sock); +  /*   *	Wait for a TCP event.   * @@ -374,7 +437,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  {  	unsigned int mask;  	struct sock *sk = sock->sk; -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk); + +	sock_rps_record_flow(sk);  	sock_poll_wait(file, sk_sleep(sk), wait);  	if (sk->sk_state == TCP_LISTEN) @@ -419,8 +484,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  	if (sk->sk_shutdown & RCV_SHUTDOWN)  		mask |= POLLIN | POLLRDNORM | POLLRDHUP; -	/* Connected? */ -	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { +	/* Connected or passive Fast Open socket? 
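The new "#define pr_fmt(fmt) "TCP: " fmt" at the top of tcp.c makes every pr_*() call in the file (for example the pr_alert() for a bad dma_cookie further down) print with a "TCP: " prefix, because the pr_* helpers expand pr_fmt() around their format string at compile time. A tiny userspace mock of that convention, assuming nothing beyond printf():

	#include <stdio.h>

	#define pr_fmt(fmt) "TCP: " fmt
	#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

	int main(void)
	{
		pr_info("out of memory -- consider tuning tcp_mem\n");
		/* prints: TCP: out of memory -- consider tuning tcp_mem */
		return 0;
	}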
*/ +	if (sk->sk_state != TCP_SYN_SENT && +	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {  		int target = sock_rcvlowat(sk, 0, INT_MAX);  		if (tp->urg_seq == tp->copied_seq && @@ -435,7 +501,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  			mask |= POLLIN | POLLRDNORM;  		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { -			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { +			if (sk_stream_is_writeable(sk)) {  				mask |= POLLOUT | POLLWRNORM;  			} else {  /* send SIGIO later */  				set_bit(SOCK_ASYNC_NOSPACE, @@ -446,7 +512,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)  				 * wspace test but before the flags are set,  				 * IO signal will be lost.  				 */ -				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) +				if (sk_stream_is_writeable(sk))  					mask |= POLLOUT | POLLWRNORM;  			}  		} else @@ -468,30 +534,29 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int answ; +	bool slow;  	switch (cmd) {  	case SIOCINQ:  		if (sk->sk_state == TCP_LISTEN)  			return -EINVAL; -		lock_sock(sk); +		slow = lock_sock_fast(sk);  		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))  			answ = 0;  		else if (sock_flag(sk, SOCK_URGINLINE) ||  			 !tp->urg_data ||  			 before(tp->urg_seq, tp->copied_seq) ||  			 !before(tp->urg_seq, tp->rcv_nxt)) { -			struct sk_buff *skb;  			answ = tp->rcv_nxt - tp->copied_seq; -			/* Subtract 1, if FIN is in queue. */ -			skb = skb_peek_tail(&sk->sk_receive_queue); -			if (answ && skb) -				answ -= tcp_hdr(skb)->fin; +			/* Subtract 1, if FIN was received */ +			if (answ && sock_flag(sk, SOCK_DONE)) +				answ--;  		} else  			answ = tp->urg_seq - tp->copied_seq; -		release_sock(sk); +		unlock_sock_fast(sk, slow);  		break;  	case SIOCATMARK:  		answ = tp->urg_data && tp->urg_seq == tp->copied_seq; @@ -505,6 +570,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)  		else  			answ = tp->write_seq - tp->snd_una;  		break; +	case SIOCOUTQNSD: +		if (sk->sk_state == TCP_LISTEN) +			return -EINVAL; + +		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) +			answ = 0; +		else +			answ = tp->write_seq - tp->snd_nxt; +		break;  	default:  		return -ENOIOCTLCMD;  	} @@ -515,11 +589,11 @@ EXPORT_SYMBOL(tcp_ioctl);  static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)  { -	TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; +	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;  	tp->pushed_seq = tp->write_seq;  } -static inline int forced_push(struct tcp_sock *tp) +static inline bool forced_push(const struct tcp_sock *tp)  {  	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));  } @@ -531,7 +605,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)  	skb->csum    = 0;  	tcb->seq     = tcb->end_seq = tp->write_seq; -	tcb->flags   = TCPHDR_ACK; +	tcb->tcp_flags = TCPHDR_ACK;  	tcb->sacked  = 0;  	skb_header_release(skb);  	tcp_add_write_queue_tail(sk, skb); @@ -547,19 +621,58 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)  		tp->snd_up = tp->write_seq;  } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, -			    int nonagle) +/* If a not yet filled skb is pushed, do not send it if + * we have data packets in Qdisc or NIC queues : + * Because TX completion will happen shortly, it gives a chance + * to coalesce future sendmsg() payload into this skb, without + * need for a timer, and with no latency trade off. 
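The new SIOCOUTQNSD case reports write_seq - snd_nxt, i.e. bytes queued but not yet sent, whereas the existing SIOCOUTQ reports write_seq - snd_una, i.e. everything not yet acknowledged. A hedged usage sketch from userspace; on an unconnected socket both simply read back zero:

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <linux/sockios.h>

	static void report_queues(int fd)
	{
		int unacked = 0, unsent = 0;

		if (ioctl(fd, SIOCOUTQ, &unacked) == 0 &&
		    ioctl(fd, SIOCOUTQNSD, &unsent) == 0)
			printf("unacked=%d not-yet-sent=%d\n", unacked, unsent);
	}

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		/* in real use, connect() and queue some data first */
		report_queues(fd);
		return 0;
	}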
+ * As packets containing data payload have a bigger truesize + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. + */ +static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, +				int size_goal) +{ +	return skb->len < size_goal && +	       sysctl_tcp_autocorking && +	       skb != tcp_write_queue_head(sk) && +	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize; +} + +static void tcp_push(struct sock *sk, int flags, int mss_now, +		     int nonagle, int size_goal)  { -	if (tcp_send_head(sk)) { -		struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *skb; -		if (!(flags & MSG_MORE) || forced_push(tp)) -			tcp_mark_push(tp, tcp_write_queue_tail(sk)); +	if (!tcp_send_head(sk)) +		return; -		tcp_mark_urg(tp, flags); -		__tcp_push_pending_frames(sk, mss_now, -					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); +	skb = tcp_write_queue_tail(sk); +	if (!(flags & MSG_MORE) || forced_push(tp)) +		tcp_mark_push(tp, skb); + +	tcp_mark_urg(tp, flags); + +	if (tcp_should_autocork(sk, skb, size_goal)) { + +		/* avoid atomic op if TSQ_THROTTLED bit is already set */ +		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { +			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); +			set_bit(TSQ_THROTTLED, &tp->tsq_flags); +		} +		/* It is possible TX completion already happened +		 * before we set TSQ_THROTTLED. +		 */ +		if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) +			return;  	} + +	if (flags & MSG_MORE) +		nonagle = TCP_NAGLE_CORK; + +	__tcp_push_pending_frames(sk, mss_now, nonagle);  }  static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, @@ -692,11 +805,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)  	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);  	if (skb) {  		if (sk_wmem_schedule(sk, skb->truesize)) { +			skb_reserve(skb, sk->sk_prot->max_header);  			/*  			 * Make sure that we have exactly size bytes  			 * available to the caller, no more, no less.  			 */ -			skb_reserve(skb, skb_tailroom(skb) - size); +			skb->reserved_tailroom = skb->end - skb->tail - size;  			return skb;  		}  		__kfree_skb(skb); @@ -716,10 +830,24 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,  	xmit_size_goal = mss_now;  	if (large_allowed && sk_can_gso(sk)) { -		xmit_size_goal = ((sk->sk_gso_max_size - 1) - -				  inet_csk(sk)->icsk_af_ops->net_header_len - -				  inet_csk(sk)->icsk_ext_hdr_len - -				  tp->tcp_header_len); +		u32 gso_size, hlen; + +		/* Maybe we should/could use sk->sk_prot->max_header here ? */ +		hlen = inet_csk(sk)->icsk_af_ops->net_header_len + +		       inet_csk(sk)->icsk_ext_hdr_len + +		       tp->tcp_header_len; + +		/* Goal is to send at least one packet per ms, +		 * not one big TSO packet every 100 ms. +		 * This preserves ACK clocking and is consistent +		 * with tcp_tso_should_defer() heuristic. 
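Restated as a pure function (hedged, userspace) the autocork test above holds back a not-yet-full skb only when it is not the head of the write queue and older payload is still charged to the socket, i.e. sk_wmem_alloc exceeds this skb's own truesize:

	#include <stdbool.h>

	struct autocork_in {
		unsigned int skb_len;        /* bytes already in the tail skb */
		unsigned int size_goal;      /* target skb size for current MSS/GSO */
		bool         skb_is_head;    /* tail skb == head of write queue? */
		unsigned int wmem_alloc;     /* total bytes charged to the socket */
		unsigned int skb_truesize;   /* memory charged for the tail skb */
		int          sysctl_autocork;
	};

	static bool should_autocork(const struct autocork_in *in)
	{
		return in->skb_len < in->size_goal &&
		       in->sysctl_autocork &&
		       !in->skb_is_head &&
		       in->wmem_alloc > in->skb_truesize;
	}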
+		 */ +		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); +		gso_size = max_t(u32, gso_size, +				 sysctl_tcp_min_tso_segs * mss_now); + +		xmit_size_goal = min_t(u32, gso_size, +				       sk->sk_gso_max_size - 1 - hlen);  		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); @@ -730,7 +858,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,  			   old_size_goal + mss_now > xmit_size_goal)) {  			xmit_size_goal = old_size_goal;  		} else { -			tp->xmit_size_goal_segs = xmit_size_goal / mss_now; +			tp->xmit_size_goal_segs = +				min_t(u16, xmit_size_goal / mss_now, +				      sk->sk_gso_max_segs);  			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;  		}  	} @@ -748,8 +878,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)  	return mss_now;  } -static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, -			 size_t psize, int flags) +static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, +				size_t size, int flags)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int mss_now, size_goal; @@ -757,10 +887,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse  	ssize_t copied;  	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); -	/* Wait for a connection to finish. */ -	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) +	/* Wait for a connection to finish. One exception is TCP Fast Open +	 * (passive side) where data is allowed to be sent before a connection +	 * is fully established. +	 */ +	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && +	    !tcp_passive_fastopen(sk)) {  		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)  			goto out_err; +	}  	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); @@ -771,12 +906,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse  	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))  		goto out_err; -	while (psize > 0) { +	while (size > 0) {  		struct sk_buff *skb = tcp_write_queue_tail(sk); -		struct page *page = pages[poffset / PAGE_SIZE]; -		int copy, i, can_coalesce; -		int offset = poffset % PAGE_SIZE; -		int size = min_t(size_t, psize, PAGE_SIZE - offset); +		int copy, i; +		bool can_coalesce;  		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {  new_segment: @@ -804,11 +937,12 @@ new_segment:  			goto wait_for_memory;  		if (can_coalesce) { -			skb_shinfo(skb)->frags[i - 1].size += copy; +			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);  		} else {  			get_page(page);  			skb_fill_page_desc(skb, i, page, offset, copy);  		} +		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;  		skb->len += copy;  		skb->data_len += copy; @@ -821,11 +955,11 @@ new_segment:  		skb_shinfo(skb)->gso_segs = 0;  		if (!copied) -			TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; +			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;  		copied += copy; -		poffset += copy; -		if (!(psize -= copy)) +		offset += copy; +		if (!(size -= copy))  			goto out;  		if (skb->len < size_goal || (flags & MSG_OOB)) @@ -841,8 +975,8 @@ new_segment:  wait_for_sndbuf:  		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  wait_for_memory: -		if (copied) -			tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); +		tcp_push(sk, flags & ~MSG_MORE, mss_now, +			 TCP_NAGLE_PUSH, size_goal);  		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)  			goto do_error; @@ -851,8 +985,8 @@ wait_for_memory:  	}  out: -	if (copied) -		tcp_push(sk, flags, mss_now, tp->nonagle); +	if 
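A worked example of the pacing-based sizing above (hedged, arbitrary numbers): a socket pacing at 20 MB/s with a 1448-byte MSS gets roughly 10 kB chunks, about one TSO burst every half millisecond instead of a full 64 kB blob, floored at sysctl_tcp_min_tso_segs (default 2, per the initializer earlier in this file) times the MSS:

	#include <stdio.h>

	int main(void)
	{
		unsigned int pacing_rate = 20 * 1000 * 1000; /* bytes per second */
		unsigned int mss = 1448;
		unsigned int min_tso_segs = 2;
		unsigned int gso;

		gso = pacing_rate / (2 * 1000);		/* bytes per 0.5 ms */
		if (gso < min_tso_segs * mss)
			gso = min_tso_segs * mss;

		printf("size goal ~ %u bytes (%u segments)\n", gso, gso / mss);
		return 0;
	}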
(copied && !(flags & MSG_SENDPAGE_NOTLAST)) +		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);  	return copied;  do_error: @@ -873,26 +1007,24 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,  					flags);  	lock_sock(sk); -	TCP_CHECK_TIMER(sk); -	res = do_tcp_sendpages(sk, &page, offset, size, flags); -	TCP_CHECK_TIMER(sk); +	res = do_tcp_sendpages(sk, page, offset, size, flags);  	release_sock(sk);  	return res;  }  EXPORT_SYMBOL(tcp_sendpage); -#define TCP_PAGE(sk)	(sk->sk_sndmsg_page) -#define TCP_OFF(sk)	(sk->sk_sndmsg_off) - -static inline int select_size(struct sock *sk, int sg) +static inline int select_size(const struct sock *sk, bool sg)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	int tmp = tp->mss_cache;  	if (sg) { -		if (sk_can_gso(sk)) -			tmp = 0; -		else { +		if (sk_can_gso(sk)) { +			/* Small frames wont use a full page: +			 * Payload will immediately follow tcp header. +			 */ +			tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); +		} else {  			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);  			if (tmp >= pgbreak && @@ -904,28 +1036,88 @@ static inline int select_size(struct sock *sk, int sg)  	return tmp;  } +void tcp_free_fastopen_req(struct tcp_sock *tp) +{ +	if (tp->fastopen_req != NULL) { +		kfree(tp->fastopen_req); +		tp->fastopen_req = NULL; +	} +} + +static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, +				int *copied, size_t size) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	int err, flags; + +	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) +		return -EOPNOTSUPP; +	if (tp->fastopen_req != NULL) +		return -EALREADY; /* Another Fast Open is in progress */ + +	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), +				   sk->sk_allocation); +	if (unlikely(tp->fastopen_req == NULL)) +		return -ENOBUFS; +	tp->fastopen_req->data = msg; +	tp->fastopen_req->size = size; + +	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; +	err = __inet_stream_connect(sk->sk_socket, msg->msg_name, +				    msg->msg_namelen, flags); +	*copied = tp->fastopen_req->copied; +	tcp_free_fastopen_req(tp); +	return err; +} +  int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		size_t size)  {  	struct iovec *iov;  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb; -	int iovlen, flags; -	int mss_now, size_goal; -	int sg, err, copied; +	int iovlen, flags, err, copied = 0; +	int mss_now = 0, size_goal, copied_syn = 0, offset = 0; +	bool sg;  	long timeo;  	lock_sock(sk); -	TCP_CHECK_TIMER(sk);  	flags = msg->msg_flags; +	if (flags & MSG_FASTOPEN) { +		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); +		if (err == -EINPROGRESS && copied_syn > 0) +			goto out; +		else if (err) +			goto out_err; +		offset = copied_syn; +	} +  	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); -	/* Wait for a connection to finish. */ -	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) +	/* Wait for a connection to finish. One exception is TCP Fast Open +	 * (passive side) where data is allowed to be sent before a connection +	 * is fully established. 
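tcp_sendmsg_fastopen() below is what backs the client side of TCP Fast Open: a sendto() carrying MSG_FASTOPEN both connects and queues data, so the payload can ride on the SYN when a valid cookie is cached (and otherwise goes out after the handshake). A hedged usage sketch; the address and port are placeholders, and net.ipv4.tcp_fastopen must have the client bit set:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>

	#ifndef MSG_FASTOPEN
	#define MSG_FASTOPEN 0x20000000
	#endif

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, 0);
		struct sockaddr_in dst = { .sin_family = AF_INET,
					   .sin_port = htons(80) };
		const char req[] = "GET / HTTP/1.0\r\n\r\n";

		inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

		/* no separate connect(): data may be sent with the SYN */
		sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		       (struct sockaddr *)&dst, sizeof(dst));
		return 0;
	}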
+	 */ +	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && +	    !tcp_passive_fastopen(sk)) {  		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) +			goto do_error; +	} + +	if (unlikely(tp->repair)) { +		if (tp->repair_queue == TCP_RECV_QUEUE) { +			copied = tcp_send_rcvq(sk, msg, size); +			goto out_nopush; +		} + +		err = -EINVAL; +		if (tp->repair_queue == TCP_NO_QUEUE)  			goto out_err; +		/* 'common' sending to sendq */ +	} +  	/* This should be in poll */  	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); @@ -940,13 +1132,22 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))  		goto out_err; -	sg = sk->sk_route_caps & NETIF_F_SG; +	sg = !!(sk->sk_route_caps & NETIF_F_SG);  	while (--iovlen >= 0) {  		size_t seglen = iov->iov_len;  		unsigned char __user *from = iov->iov_base;  		iov++; +		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */ +			if (offset >= seglen) { +				offset -= seglen; +				continue; +			} +			seglen -= offset; +			from += offset; +			offset = 0; +		}  		while (seglen > 0) {  			int copy = 0; @@ -974,6 +1175,13 @@ new_segment:  					goto wait_for_memory;  				/* +				 * All packets are restored as if they have +				 * already been sent. +				 */ +				if (tp->repair) +					TCP_SKB_CB(skb)->when = tcp_time_stamp; + +				/*  				 * Check whether we can use HW checksum.  				 */  				if (sk->sk_route_caps & NETIF_F_ALL_CSUM) @@ -989,85 +1197,54 @@ new_segment:  				copy = seglen;  			/* Where to copy to? */ -			if (skb_tailroom(skb) > 0) { +			if (skb_availroom(skb) > 0) {  				/* We have some space in skb head. Superb! */ -				if (copy > skb_tailroom(skb)) -					copy = skb_tailroom(skb); -				if ((err = skb_add_data(skb, from, copy)) != 0) +				copy = min_t(int, copy, skb_availroom(skb)); +				err = skb_add_data_nocache(sk, skb, from, copy); +				if (err)  					goto do_fault;  			} else { -				int merge = 0; +				bool merge = true;  				int i = skb_shinfo(skb)->nr_frags; -				struct page *page = TCP_PAGE(sk); -				int off = TCP_OFF(sk); - -				if (skb_can_coalesce(skb, i, page, off) && -				    off != PAGE_SIZE) { -					/* We can extend the last page -					 * fragment. */ -					merge = 1; -				} else if (i == MAX_SKB_FRAGS || !sg) { -					/* Need to add new fragment and cannot -					 * do this because interface is non-SG, -					 * or because all the page slots are -					 * busy. */ -					tcp_mark_push(tp, skb); -					goto new_segment; -				} else if (page) { -					if (off == PAGE_SIZE) { -						put_page(page); -						TCP_PAGE(sk) = page = NULL; -						off = 0; +				struct page_frag *pfrag = sk_page_frag(sk); + +				if (!sk_page_frag_refill(sk, pfrag)) +					goto wait_for_memory; + +				if (!skb_can_coalesce(skb, i, pfrag->page, +						      pfrag->offset)) { +					if (i == MAX_SKB_FRAGS || !sg) { +						tcp_mark_push(tp, skb); +						goto new_segment;  					} -				} else -					off = 0; +					merge = false; +				} -				if (copy > PAGE_SIZE - off) -					copy = PAGE_SIZE - off; +				copy = min_t(int, copy, pfrag->size - pfrag->offset);  				if (!sk_wmem_schedule(sk, copy))  					goto wait_for_memory; -				if (!page) { -					/* Allocate new cache page. */ -					if (!(page = sk_stream_alloc_page(sk))) -						goto wait_for_memory; -				} - -				/* Time to copy data. We are close to -				 * the end! 
*/ -				err = skb_copy_to_page(sk, from, skb, page, -						       off, copy); -				if (err) { -					/* If this page was new, give it to the -					 * socket so it does not get leaked. -					 */ -					if (!TCP_PAGE(sk)) { -						TCP_PAGE(sk) = page; -						TCP_OFF(sk) = 0; -					} +				err = skb_copy_to_page_nocache(sk, from, skb, +							       pfrag->page, +							       pfrag->offset, +							       copy); +				if (err)  					goto do_error; -				}  				/* Update the skb. */  				if (merge) { -					skb_shinfo(skb)->frags[i - 1].size += -									copy; +					skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);  				} else { -					skb_fill_page_desc(skb, i, page, off, copy); -					if (TCP_PAGE(sk)) { -						get_page(page); -					} else if (off + copy < PAGE_SIZE) { -						get_page(page); -						TCP_PAGE(sk) = page; -					} +					skb_fill_page_desc(skb, i, pfrag->page, +							   pfrag->offset, copy); +					get_page(pfrag->page);  				} - -				TCP_OFF(sk) = off + copy; +				pfrag->offset += copy;  			}  			if (!copied) -				TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; +				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;  			tp->write_seq += copy;  			TCP_SKB_CB(skb)->end_seq += copy; @@ -1078,7 +1255,7 @@ new_segment:  			if ((seglen -= copy) == 0 && iovlen == 0)  				goto out; -			if (skb->len < max || (flags & MSG_OOB)) +			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))  				continue;  			if (forced_push(tp)) { @@ -1092,7 +1269,8 @@ wait_for_sndbuf:  			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  wait_for_memory:  			if (copied) -				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); +				tcp_push(sk, flags & ~MSG_MORE, mss_now, +					 TCP_NAGLE_PUSH, size_goal);  			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)  				goto do_error; @@ -1103,10 +1281,10 @@ wait_for_memory:  out:  	if (copied) -		tcp_push(sk, flags, mss_now, tp->nonagle); -	TCP_CHECK_TIMER(sk); +		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); +out_nopush:  	release_sock(sk); -	return copied; +	return copied + copied_syn;  do_fault:  	if (!skb->len) { @@ -1119,11 +1297,10 @@ do_fault:  	}  do_error: -	if (copied) +	if (copied + copied_syn)  		goto out;  out_err:  	err = sk_stream_error(sk, flags, err); -	TCP_CHECK_TIMER(sk);  	release_sock(sk);  	return err;  } @@ -1178,6 +1355,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)  	return -EAGAIN;  } +static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) +{ +	struct sk_buff *skb; +	int copied = 0, err = 0; + +	/* XXX -- need to support SO_PEEK_OFF */ + +	skb_queue_walk(&sk->sk_write_queue, skb) { +		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); +		if (err) +			break; + +		copied += skb->len; +	} + +	return err ?: copied; +} +  /* Clean up the receive buffer for full frames taken by the user,   * then send an ACK if necessary.  
COPIED is the number of bytes   * tcp_recvmsg has given to the user so far, it speeds up the @@ -1187,15 +1382,13 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)  void tcp_cleanup_rbuf(struct sock *sk, int copied)  {  	struct tcp_sock *tp = tcp_sk(sk); -	int time_to_ack = 0; +	bool time_to_ack = false; -#if TCP_DEBUG  	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);  	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),  	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",  	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); -#endif  	if (inet_csk_ack_scheduled(sk)) {  		const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1215,7 +1408,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)  		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&  		       !icsk->icsk_ack.pingpong)) &&  		      !atomic_read(&sk->sk_rmem_alloc))) -			time_to_ack = 1; +			time_to_ack = true;  	}  	/* We send an ACK if we can now advertise a non-zero window @@ -1237,7 +1430,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)  			 * "Lots" means "at least twice" here.  			 */  			if (new_window && new_window >= 2 * rcv_window_now) -				time_to_ack = 1; +				time_to_ack = true;  		}  	}  	if (time_to_ack) @@ -1273,12 +1466,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  		return;  	last_issued = tp->ucopy.dma_cookie; -	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +	dma_async_issue_pending(tp->ucopy.dma_chan);  	do { -		if (dma_async_memcpy_complete(tp->ucopy.dma_chan, +		if (dma_async_is_tx_complete(tp->ucopy.dma_chan,  					      last_issued, &done, -					      &used) == DMA_SUCCESS) { +					      &used) == DMA_COMPLETE) {  			/* Safe to free early-copied skbs now */  			__skb_queue_purge(&sk->sk_async_wait_queue);  			break; @@ -1286,7 +1479,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  			struct sk_buff *skb;  			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&  			       (dma_async_is_complete(skb->dma_cookie, done, -						      used) == DMA_SUCCESS)) { +						      used) == DMA_COMPLETE)) {  				__skb_dequeue(&sk->sk_async_wait_queue);  				kfree_skb(skb);  			} @@ -1295,12 +1488,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)  }  #endif -static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)  {  	struct sk_buff *skb;  	u32 offset; -	skb_queue_walk(&sk->sk_receive_queue, skb) { +	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {  		offset = seq - TCP_SKB_CB(skb)->seq;  		if (tcp_hdr(skb)->syn)  			offset--; @@ -1308,6 +1501,11 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)  			*off = offset;  			return skb;  		} +		/* This looks weird, but this can happen if TCP collapsing +		 * splitted a fat GRO packet, while we released socket lock +		 * in skb_splice_bits() +		 */ +		sk_eat_skb(sk, skb, false);  	}  	return NULL;  } @@ -1349,7 +1547,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,  					break;  			}  			used = recv_actor(desc, skb, offset, len); -			if (used < 0) { +			if (used <= 0) {  				if (!copied)  					copied = used;  				break; @@ -1358,22 +1556,26 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,  				copied += used;  				offset += used;  			} -			/* -			 * If recv_actor drops the lock (e.g. TCP splice +			/* If recv_actor drops the lock (e.g. 
TCP splice  			 * receive) the skb pointer might be invalid when  			 * getting here: tcp_collapse might have deleted it  			 * while aggregating skbs from the socket queue.  			 */ -			skb = tcp_recv_skb(sk, seq-1, &offset); -			if (!skb || (offset+1 != skb->len)) +			skb = tcp_recv_skb(sk, seq - 1, &offset); +			if (!skb)  				break; +			/* TCP coalescing might have appended data to the skb. +			 * Try to splice more frags +			 */ +			if (offset + 1 != skb->len) +				continue;  		}  		if (tcp_hdr(skb)->fin) { -			sk_eat_skb(sk, skb, 0); +			sk_eat_skb(sk, skb, false);  			++seq;  			break;  		} -		sk_eat_skb(sk, skb, 0); +		sk_eat_skb(sk, skb, false);  		if (!desc->count)  			break;  		tp->copied_seq = seq; @@ -1383,8 +1585,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,  	tcp_rcv_space_adjust(sk);  	/* Clean up data we have read: This will do ACK frames. */ -	if (copied > 0) +	if (copied > 0) { +		tcp_recv_skb(sk, seq, &offset);  		tcp_cleanup_rbuf(sk, copied); +	}  	return copied;  }  EXPORT_SYMBOL(tcp_read_sock); @@ -1409,13 +1613,15 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	int target;		/* Read at least this many bytes */  	long timeo;  	struct task_struct *user_recv = NULL; -	int copied_early = 0; +	bool copied_early = false;  	struct sk_buff *skb;  	u32 urg_hole = 0; -	lock_sock(sk); +	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && +	    (sk->sk_state == TCP_ESTABLISHED)) +		sk_busy_loop(sk, nonblock); -	TCP_CHECK_TIMER(sk); +	lock_sock(sk);  	err = -ENOTCONN;  	if (sk->sk_state == TCP_LISTEN) @@ -1427,6 +1633,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	if (flags & MSG_OOB)  		goto recv_urg; +	if (unlikely(tp->repair)) { +		err = -EPERM; +		if (!(flags & MSG_PEEK)) +			goto out; + +		if (tp->repair_queue == TCP_SEND_QUEUE) +			goto recv_sndq; + +		err = -EINVAL; +		if (tp->repair_queue == TCP_NO_QUEUE) +			goto out; + +		/* 'common' recv queue MSG_PEEK-ing */ +	} +  	seq = &tp->copied_seq;  	if (flags & MSG_PEEK) {  		peek_seq = tp->copied_seq; @@ -1447,12 +1668,12 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		if ((available < target) &&  		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&  		    !sysctl_tcp_low_latency && -		    dma_find_channel(DMA_MEMCPY)) { -			preempt_enable_no_resched(); +		    net_dma_find_channel()) { +			preempt_enable();  			tp->ucopy.pinned_list =  					dma_pin_iovec_pages(msg->msg_iov, len);  		} else { -			preempt_enable_no_resched(); +			preempt_enable();  		}  	}  #endif @@ -1588,8 +1809,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		}  #ifdef CONFIG_NET_DMA -		if (tp->ucopy.dma_chan) -			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +		if (tp->ucopy.dma_chan) { +			if (tp->rcv_wnd == 0 && +			    !skb_queue_empty(&sk->sk_async_wait_queue)) { +				tcp_service_net_dma(sk, true); +				tcp_cleanup_rbuf(sk, copied); +			} else +				dma_async_issue_pending(tp->ucopy.dma_chan); +		}  #endif  		if (copied >= target) {  			/* Do not sleep, just process backlog. 
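The sk_busy_loop() call added at the top of tcp_recvmsg() makes a blocking read on an empty ESTABLISHED socket spin on the device for a bounded time before sleeping. Whether it triggers is controlled per socket; a hedged sketch using SO_BUSY_POLL (value in microseconds), with the constant guarded in case the libc headers predate it:

	#include <sys/socket.h>

	#ifndef SO_BUSY_POLL
	#define SO_BUSY_POLL 46
	#endif

	static int enable_busy_poll(int fd, int usecs)
	{
		return setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
				  &usecs, sizeof(usecs));
	}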
*/ @@ -1628,9 +1855,9 @@ do_prequeue:  		}  		if ((flags & MSG_PEEK) &&  		    (peek_seq - copied - urg_hole != tp->copied_seq)) { -			if (net_ratelimit()) -				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", -				       current->comm, task_pid_nr(current)); +			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", +					    current->comm, +					    task_pid_nr(current));  			peek_seq = tp->copied_seq;  		}  		continue; @@ -1662,7 +1889,7 @@ do_prequeue:  		if (!(flags & MSG_TRUNC)) {  #ifdef CONFIG_NET_DMA  			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) -				tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); +				tp->ucopy.dma_chan = net_dma_find_channel();  			if (tp->ucopy.dma_chan) {  				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( @@ -1672,7 +1899,8 @@ do_prequeue:  				if (tp->ucopy.dma_cookie < 0) { -					printk(KERN_ALERT "dma_cookie < 0\n"); +					pr_alert("%s: dma_cookie < 0\n", +						 __func__);  					/* Exception. Bailout! */  					if (!copied) @@ -1680,10 +1908,10 @@ do_prequeue:  					break;  				} -				dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +				dma_async_issue_pending(tp->ucopy.dma_chan);  				if ((offset + used) == skb->len) -					copied_early = 1; +					copied_early = true;  			} else  #endif @@ -1717,7 +1945,7 @@ skip_copy:  			goto found_fin_ok;  		if (!(flags & MSG_PEEK)) {  			sk_eat_skb(sk, skb, copied_early); -			copied_early = 0; +			copied_early = false;  		}  		continue; @@ -1726,7 +1954,7 @@ skip_copy:  		++*seq;  		if (!(flags & MSG_PEEK)) {  			sk_eat_skb(sk, skb, copied_early); -			copied_early = 0; +			copied_early = false;  		}  		break;  	} while (len > 0); @@ -1767,18 +1995,20 @@ skip_copy:  	/* Clean up data we have read: This will do ACK frames. */  	tcp_cleanup_rbuf(sk, copied); -	TCP_CHECK_TIMER(sk);  	release_sock(sk);  	return copied;  out: -	TCP_CHECK_TIMER(sk);  	release_sock(sk);  	return err;  recv_urg:  	err = tcp_recv_urg(sk, msg, len, flags);  	goto out; + +recv_sndq: +	err = tcp_peek_sndq(sk, msg, len); +	goto out;  }  EXPORT_SYMBOL(tcp_recvmsg); @@ -1875,6 +2105,20 @@ void tcp_shutdown(struct sock *sk, int how)  }  EXPORT_SYMBOL(tcp_shutdown); +bool tcp_check_oom(struct sock *sk, int shift) +{ +	bool too_many_orphans, out_of_socket_memory; + +	too_many_orphans = tcp_too_many_orphans(sk, shift); +	out_of_socket_memory = tcp_out_of_memory(sk); + +	if (too_many_orphans) +		net_info_ratelimited("too many orphaned sockets\n"); +	if (out_of_socket_memory) +		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); +	return too_many_orphans || out_of_socket_memory; +} +  void tcp_close(struct sock *sk, long timeout)  {  	struct sk_buff *skb; @@ -1917,7 +2161,9 @@ void tcp_close(struct sock *sk, long timeout)  	 * advertise a zero window, then kill -9 the FTP client, wheee...  	 * Note: timeout is always zero in such a case.  	 */ -	if (data_was_unread) { +	if (unlikely(tcp_sk(sk)->repair)) { +		sk->sk_prot->disconnect(sk, 0); +	} else if (data_was_unread) {  		/* Unread data was tossed, zap the connection. */  		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);  		tcp_set_state(sk, TCP_CLOSE); @@ -1951,6 +2197,10 @@ void tcp_close(struct sock *sk, long timeout)  		 * they look as CLOSING or LAST_ACK for Linux)  		 * Probably, I missed some more holelets.  		 * 						--ANK +		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN +		 * in a single packet! 
(May consider it later but will +		 * probably need API support or TCP_CORK SYN-ACK until +		 * data is written and socket is closed.)  		 */  		tcp_send_fin(sk);  	} @@ -1982,7 +2232,7 @@ adjudge_to_death:  	/*	This is a (useful) BSD violating of the RFC. There is a  	 *	problem with TCP as specified in that the other end could  	 *	keep a socket open forever with no application left this end. -	 *	We use a 3 minute timeout (about the same as BSD) then kill +	 *	We use a 1 minute timeout (about the same as BSD) then kill  	 *	our end. If they send after that then tough - BUT: long enough  	 *	that we won't make the old 4*rto = almost no time - whoops  	 *	reset mistake. @@ -2014,10 +2264,7 @@ adjudge_to_death:  	}  	if (sk->sk_state != TCP_CLOSE) {  		sk_mem_reclaim(sk); -		if (tcp_too_many_orphans(sk, 0)) { -			if (net_ratelimit()) -				printk(KERN_INFO "TCP: too many of orphaned " -				       "sockets\n"); +		if (tcp_check_oom(sk, 0)) {  			tcp_set_state(sk, TCP_CLOSE);  			tcp_send_active_reset(sk, GFP_ATOMIC);  			NET_INC_STATS_BH(sock_net(sk), @@ -2025,8 +2272,16 @@ adjudge_to_death:  		}  	} -	if (sk->sk_state == TCP_CLOSE) +	if (sk->sk_state == TCP_CLOSE) { +		struct request_sock *req = tcp_sk(sk)->fastopen_rsk; +		/* We could get here with a non-NULL req if the socket is +		 * aborted (e.g., closed with unread data) before 3WHS +		 * finishes. +		 */ +		if (req != NULL) +			reqsk_fastopen_remove(sk, req, false);  		inet_csk_destroy_sock(sk); +	}  	/* Otherwise, socket is reprieved until protocol close. */  out: @@ -2038,7 +2293,7 @@ EXPORT_SYMBOL(tcp_close);  /* These states need RST on ABORT according to RFC793 */ -static inline int tcp_need_reset(int state) +static inline bool tcp_need_reset(int state)  {  	return (1 << state) &  	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | @@ -2059,6 +2314,8 @@ int tcp_disconnect(struct sock *sk, int flags)  	/* ABORT function of RFC793 */  	if (old_state == TCP_LISTEN) {  		inet_csk_listen_stop(sk); +	} else if (unlikely(tp->repair)) { +		sk->sk_err = ECONNABORTED;  	} else if (tcp_need_reset(old_state) ||  		   (tp->snd_nxt != tp->write_seq &&  		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { @@ -2085,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags)  	sk->sk_shutdown = 0;  	sock_reset_flag(sk, SOCK_DONE); -	tp->srtt = 0; +	tp->srtt_us = 0;  	if ((tp->write_seq += tp->max_window + 2) == 0)  		tp->write_seq = 1;  	icsk->icsk_backoff = 0; @@ -2094,7 +2351,6 @@ int tcp_disconnect(struct sock *sk, int flags)  	tp->packets_out = 0;  	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;  	tp->snd_cwnd_cnt = 0; -	tp->bytes_acked = 0;  	tp->window_clamp = 0;  	tcp_set_ca_state(sk, TCP_CA_Open);  	tcp_clear_retrans(tp); @@ -2110,6 +2366,68 @@ int tcp_disconnect(struct sock *sk, int flags)  }  EXPORT_SYMBOL(tcp_disconnect); +void tcp_sock_destruct(struct sock *sk) +{ +	inet_sock_destruct(sk); + +	kfree(inet_csk(sk)->icsk_accept_queue.fastopenq); +} + +static inline bool tcp_can_repair_sock(const struct sock *sk) +{ +	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && +		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); +} + +static int tcp_repair_options_est(struct tcp_sock *tp, +		struct tcp_repair_opt __user *optbuf, unsigned int len) +{ +	struct tcp_repair_opt opt; + +	while (len >= sizeof(opt)) { +		if (copy_from_user(&opt, optbuf, sizeof(opt))) +			return -EFAULT; + +		optbuf++; +		len -= sizeof(opt); + +		switch (opt.opt_code) { +		case TCPOPT_MSS: +			tp->rx_opt.mss_clamp = opt.opt_val; +			break; +		
case TCPOPT_WINDOW: +			{ +				u16 snd_wscale = opt.opt_val & 0xFFFF; +				u16 rcv_wscale = opt.opt_val >> 16; + +				if (snd_wscale > 14 || rcv_wscale > 14) +					return -EFBIG; + +				tp->rx_opt.snd_wscale = snd_wscale; +				tp->rx_opt.rcv_wscale = rcv_wscale; +				tp->rx_opt.wscale_ok = 1; +			} +			break; +		case TCPOPT_SACK_PERM: +			if (opt.opt_val != 0) +				return -EINVAL; + +			tp->rx_opt.sack_ok |= TCP_SACK_SEEN; +			if (sysctl_tcp_fack) +				tcp_enable_fack(tp); +			break; +		case TCPOPT_TIMESTAMP: +			if (opt.opt_val != 0) +				return -EINVAL; + +			tp->rx_opt.tstamp_ok = 1; +			break; +		} +	} + +	return 0; +} +  /*   *	Socket option code for TCP.   */ @@ -2140,92 +2458,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		release_sock(sk);  		return err;  	} -	case TCP_COOKIE_TRANSACTIONS: { -		struct tcp_cookie_transactions ctd; -		struct tcp_cookie_values *cvp = NULL; - -		if (sizeof(ctd) > optlen) -			return -EINVAL; -		if (copy_from_user(&ctd, optval, sizeof(ctd))) -			return -EFAULT; - -		if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || -		    ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) -			return -EINVAL; - -		if (ctd.tcpct_cookie_desired == 0) { -			/* default to global value */ -		} else if ((0x1 & ctd.tcpct_cookie_desired) || -			   ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || -			   ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { -			return -EINVAL; -		} - -		if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { -			/* Supercedes all other values */ -			lock_sock(sk); -			if (tp->cookie_values != NULL) { -				kref_put(&tp->cookie_values->kref, -					 tcp_cookie_values_release); -				tp->cookie_values = NULL; -			} -			tp->rx_opt.cookie_in_always = 0; /* false */ -			tp->rx_opt.cookie_out_never = 1; /* true */ -			release_sock(sk); -			return err; -		} - -		/* Allocate ancillary memory before locking. -		 */ -		if (ctd.tcpct_used > 0 || -		    (tp->cookie_values == NULL && -		     (sysctl_tcp_cookie_size > 0 || -		      ctd.tcpct_cookie_desired > 0 || -		      ctd.tcpct_s_data_desired > 0))) { -			cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, -				      GFP_KERNEL); -			if (cvp == NULL) -				return -ENOMEM; - -			kref_init(&cvp->kref); -		} -		lock_sock(sk); -		tp->rx_opt.cookie_in_always = -			(TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); -		tp->rx_opt.cookie_out_never = 0; /* false */ - -		if (tp->cookie_values != NULL) { -			if (cvp != NULL) { -				/* Changed values are recorded by a changed -				 * pointer, ensuring the cookie will differ, -				 * without separately hashing each value later. -				 */ -				kref_put(&tp->cookie_values->kref, -					 tcp_cookie_values_release); -			} else { -				cvp = tp->cookie_values; -			} -		} - -		if (cvp != NULL) { -			cvp->cookie_desired = ctd.tcpct_cookie_desired; - -			if (ctd.tcpct_used > 0) { -				memcpy(cvp->s_data_payload, ctd.tcpct_value, -				       ctd.tcpct_used); -				cvp->s_data_desired = ctd.tcpct_used; -				cvp->s_data_constant = 1; /* true */ -			} else { -				/* No constant payload data. */ -				cvp->s_data_desired = ctd.tcpct_s_data_desired; -				cvp->s_data_constant = 0; /* false */ -			} - -			tp->cookie_values = cvp; -		} -		release_sock(sk); -		return err; -	}  	default:  		/* fallthru */  		break; @@ -2244,7 +2476,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		/* Values greater than interface MTU won't take effect. 
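As the TCPOPT_WINDOW branch above shows, the repair interface packs both scale factors into one opt_val: snd_wscale in the low 16 bits, rcv_wscale in the high 16 bits, each capped at 14. A hedged userspace sketch of building that entry; the struct mirrors struct tcp_repair_opt from linux/tcp.h and the option-kind constant matches the kernel's TCPOPT_WINDOW:

	#include <stdint.h>

	struct repair_opt {		/* mirrors struct tcp_repair_opt */
		uint32_t opt_code;
		uint32_t opt_val;
	};

	#define EX_TCPOPT_WINDOW 3	/* window scale option kind */

	static struct repair_opt make_wscale_opt(uint16_t snd_wscale,
						 uint16_t rcv_wscale)
	{
		struct repair_opt o = {
			.opt_code = EX_TCPOPT_WINDOW,
			.opt_val  = (uint32_t)snd_wscale |
				    ((uint32_t)rcv_wscale << 16),
		};
		return o;
	}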
However  		 * at the point when this call is done we typically don't yet  		 * know which interface is going to be used */ -		if (val < 64 || val > MAX_TCP_WINDOW) { +		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {  			err = -EINVAL;  			break;  		} @@ -2278,8 +2510,58 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  	case TCP_THIN_DUPACK:  		if (val < 0 || val > 1)  			err = -EINVAL; -		else +		else {  			tp->thin_dupack = val; +			if (tp->thin_dupack) +				tcp_disable_early_retrans(tp); +		} +		break; + +	case TCP_REPAIR: +		if (!tcp_can_repair_sock(sk)) +			err = -EPERM; +		else if (val == 1) { +			tp->repair = 1; +			sk->sk_reuse = SK_FORCE_REUSE; +			tp->repair_queue = TCP_NO_QUEUE; +		} else if (val == 0) { +			tp->repair = 0; +			sk->sk_reuse = SK_NO_REUSE; +			tcp_send_window_probe(sk); +		} else +			err = -EINVAL; + +		break; + +	case TCP_REPAIR_QUEUE: +		if (!tp->repair) +			err = -EPERM; +		else if (val < TCP_QUEUES_NR) +			tp->repair_queue = val; +		else +			err = -EINVAL; +		break; + +	case TCP_QUEUE_SEQ: +		if (sk->sk_state != TCP_CLOSE) +			err = -EPERM; +		else if (tp->repair_queue == TCP_SEND_QUEUE) +			tp->write_seq = val; +		else if (tp->repair_queue == TCP_RECV_QUEUE) +			tp->rcv_nxt = val; +		else +			err = -EINVAL; +		break; + +	case TCP_REPAIR_OPTIONS: +		if (!tp->repair) +			err = -EINVAL; +		else if (sk->sk_state == TCP_ESTABLISHED) +			err = tcp_repair_options_est(tp, +					(struct tcp_repair_opt __user *)optval, +					optlen); +		else +			err = -EPERM;  		break;  	case TCP_CORK: @@ -2394,7 +2676,28 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  		/* Cap the max timeout in ms TCP will retry/retrans  		 * before giving up and aborting (ETIMEDOUT) a connection.  		 */ -		icsk->icsk_user_timeout = msecs_to_jiffies(val); +		if (val < 0) +			err = -EINVAL; +		else +			icsk->icsk_user_timeout = msecs_to_jiffies(val); +		break; + +	case TCP_FASTOPEN: +		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | +		    TCPF_LISTEN))) +			err = fastopen_init_queue(sk, val); +		else +			err = -EINVAL; +		break; +	case TCP_TIMESTAMP: +		if (!tp->repair) +			err = -EPERM; +		else +			tp->tsoffset = val - tcp_time_stamp; +		break; +	case TCP_NOTSENT_LOWAT: +		tp->notsent_lowat = val; +		sk->sk_write_space(sk);  		break;  	default:  		err = -ENOPROTOOPT; @@ -2408,7 +2711,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,  int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,  		   unsigned int optlen)  { -	struct inet_connection_sock *icsk = inet_csk(sk); +	const struct inet_connection_sock *icsk = inet_csk(sk);  	if (level != SOL_TCP)  		return icsk->icsk_af_ops->setsockopt(sk, level, optname, @@ -2430,9 +2733,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);  #endif  /* Return information about state of tcp endpoint in API format. 
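The TCP_REPAIR / TCP_REPAIR_QUEUE / TCP_QUEUE_SEQ options added below exist for checkpoint/restore: with CAP_NET_ADMIN, a closed socket can have its send and receive sequence numbers forced before connect(), and connect() in repair mode skips the handshake. A hedged restore-side sketch; constants are guarded in case the libc headers lack them, and error handling is abbreviated:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	#ifndef TCP_REPAIR
	#define TCP_REPAIR	 19
	#define TCP_REPAIR_QUEUE 20
	#define TCP_QUEUE_SEQ	 21
	#endif
	#ifndef TCP_SEND_QUEUE
	#define TCP_RECV_QUEUE	 1
	#define TCP_SEND_QUEUE	 2
	#endif

	static int restore_seqs(int fd, unsigned int snd_seq, unsigned int rcv_seq)
	{
		int on = 1, off = 0, q;

		if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)))
			return -1;	/* needs CAP_NET_ADMIN */

		q = TCP_SEND_QUEUE;	/* must still be in TCP_CLOSE here */
		setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
		setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ, &snd_seq, sizeof(snd_seq));

		q = TCP_RECV_QUEUE;
		setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
		setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ, &rcv_seq, sizeof(rcv_seq));

		/* ... bind()/connect() here while still in repair mode ... */

		return setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &off, sizeof(off));
	}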
*/ -void tcp_get_info(struct sock *sk, struct tcp_info *info) +void tcp_get_info(const struct sock *sk, struct tcp_info *info)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk);  	u32 now = tcp_time_stamp; @@ -2454,8 +2757,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;  	} -	if (tp->ecn_flags&TCP_ECN_OK) +	if (tp->ecn_flags & TCP_ECN_OK)  		info->tcpi_options |= TCPI_OPT_ECN; +	if (tp->ecn_flags & TCP_ECN_SEEN) +		info->tcpi_options |= TCPI_OPT_ECN_SEEN; +	if (tp->syn_data_acked) +		info->tcpi_options |= TCPI_OPT_SYN_DATA;  	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);  	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); @@ -2479,8 +2786,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;  	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; -	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; -	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; +	info->tcpi_rtt = tp->srtt_us >> 3; +	info->tcpi_rttvar = tp->mdev_us >> 2;  	info->tcpi_snd_ssthresh = tp->snd_ssthresh;  	info->tcpi_snd_cwnd = tp->snd_cwnd;  	info->tcpi_advmss = tp->advmss; @@ -2490,6 +2797,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)  	info->tcpi_rcv_space = tp->rcvq_space.space;  	info->tcpi_total_retrans = tp->total_retrans; + +	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? +					sk->sk_pacing_rate : ~0ULL; +	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? +					sk->sk_max_pacing_rate : ~0ULL;  }  EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2513,6 +2825,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  		val = tp->mss_cache;  		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))  			val = tp->rx_opt.user_mss; +		if (tp->repair) +			val = tp->rx_opt.mss_clamp;  		break;  	case TCP_NODELAY:  		val = !!(tp->nonagle&TCP_NAGLE_OFF); @@ -2573,41 +2887,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  			return -EFAULT;  		return 0; -	case TCP_COOKIE_TRANSACTIONS: { -		struct tcp_cookie_transactions ctd; -		struct tcp_cookie_values *cvp = tp->cookie_values; - -		if (get_user(len, optlen)) -			return -EFAULT; -		if (len < sizeof(ctd)) -			return -EINVAL; - -		memset(&ctd, 0, sizeof(ctd)); -		ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? -				   TCP_COOKIE_IN_ALWAYS : 0) -				| (tp->rx_opt.cookie_out_never ? -				   TCP_COOKIE_OUT_NEVER : 0); - -		if (cvp != NULL) { -			ctd.tcpct_flags |= (cvp->s_data_in ? -					    TCP_S_DATA_IN : 0) -					 | (cvp->s_data_out ? 
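tcp_get_info() backs the TCP_INFO socket option; with this change the RTT fields are derived directly from the microsecond-resolution srtt_us/mdev_us and are reported in microseconds. A hedged usage sketch reading a few of the fields filled in below:

	#include <stdio.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	static void dump_tcp_info(int fd)
	{
		struct tcp_info info;
		socklen_t len = sizeof(info);

		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
			printf("rtt=%uus rttvar=%uus cwnd=%u retrans=%u\n",
			       info.tcpi_rtt, info.tcpi_rttvar,
			       info.tcpi_snd_cwnd, info.tcpi_total_retrans);
	}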
-					    TCP_S_DATA_OUT : 0); - -			ctd.tcpct_cookie_desired = cvp->cookie_desired; -			ctd.tcpct_s_data_desired = cvp->s_data_desired; - -			memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], -			       cvp->cookie_pair_size); -			ctd.tcpct_used = cvp->cookie_pair_size; -		} - -		if (put_user(sizeof(ctd), optlen)) -			return -EFAULT; -		if (copy_to_user(optval, &ctd, sizeof(ctd))) -			return -EFAULT; -		return 0; -	}  	case TCP_THIN_LINEAR_TIMEOUTS:  		val = tp->thin_lto;  		break; @@ -2615,9 +2894,43 @@ static int do_tcp_getsockopt(struct sock *sk, int level,  		val = tp->thin_dupack;  		break; +	case TCP_REPAIR: +		val = tp->repair; +		break; + +	case TCP_REPAIR_QUEUE: +		if (tp->repair) +			val = tp->repair_queue; +		else +			return -EINVAL; +		break; + +	case TCP_QUEUE_SEQ: +		if (tp->repair_queue == TCP_SEND_QUEUE) +			val = tp->write_seq; +		else if (tp->repair_queue == TCP_RECV_QUEUE) +			val = tp->rcv_nxt; +		else +			return -EINVAL; +		break; +  	case TCP_USER_TIMEOUT:  		val = jiffies_to_msecs(icsk->icsk_user_timeout);  		break; + +	case TCP_FASTOPEN: +		if (icsk->icsk_accept_queue.fastopenq != NULL) +			val = icsk->icsk_accept_queue.fastopenq->max_qlen; +		else +			val = 0; +		break; + +	case TCP_TIMESTAMP: +		val = tcp_time_stamp + tp->tsoffset; +		break; +	case TCP_NOTSENT_LOWAT: +		val = tp->notsent_lowat; +		break;  	default:  		return -ENOPROTOOPT;  	} @@ -2653,313 +2966,62 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,  EXPORT_SYMBOL(compat_tcp_getsockopt);  #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) -{ -	struct sk_buff *segs = ERR_PTR(-EINVAL); -	struct tcphdr *th; -	unsigned thlen; -	unsigned int seq; -	__be32 delta; -	unsigned int oldlen; -	unsigned int mss; - -	if (!pskb_may_pull(skb, sizeof(*th))) -		goto out; - -	th = tcp_hdr(skb); -	thlen = th->doff * 4; -	if (thlen < sizeof(*th)) -		goto out; - -	if (!pskb_may_pull(skb, thlen)) -		goto out; - -	oldlen = (u16)~skb->len; -	__skb_pull(skb, thlen); - -	mss = skb_shinfo(skb)->gso_size; -	if (unlikely(skb->len <= mss)) -		goto out; - -	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { -		/* Packet is from an untrusted source, reset gso_segs. 
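TCP_NOTSENT_LOWAT, read back above and settable in the setsockopt path earlier, caps the not-yet-sent backlog a socket may hold: poll()/epoll only report the socket writable while fewer than that many unsent bytes sit in the write queue, which keeps latency-sensitive senders from over-buffering. A hedged sketch, with the constant guarded for older headers:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	#ifndef TCP_NOTSENT_LOWAT
	#define TCP_NOTSENT_LOWAT 25
	#endif

	static int cap_unsent(int fd, unsigned int bytes)
	{
		return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
				  &bytes, sizeof(bytes));
	}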
*/ -		int type = skb_shinfo(skb)->gso_type; - -		if (unlikely(type & -			     ~(SKB_GSO_TCPV4 | -			       SKB_GSO_DODGY | -			       SKB_GSO_TCP_ECN | -			       SKB_GSO_TCPV6 | -			       0) || -			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) -			goto out; - -		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - -		segs = NULL; -		goto out; -	} - -	segs = skb_segment(skb, features); -	if (IS_ERR(segs)) -		goto out; - -	delta = htonl(oldlen + (thlen + mss)); - -	skb = segs; -	th = tcp_hdr(skb); -	seq = ntohl(th->seq); - -	do { -		th->fin = th->psh = 0; - -		th->check = ~csum_fold((__force __wsum)((__force u32)th->check + -				       (__force u32)delta)); -		if (skb->ip_summed != CHECKSUM_PARTIAL) -			th->check = -			     csum_fold(csum_partial(skb_transport_header(skb), -						    thlen, skb->csum)); - -		seq += mss; -		skb = skb->next; -		th = tcp_hdr(skb); - -		th->seq = htonl(seq); -		th->cwr = 0; -	} while (skb->next); - -	delta = htonl(oldlen + (skb->tail - skb->transport_header) + -		      skb->data_len); -	th->check = ~csum_fold((__force __wsum)((__force u32)th->check + -				(__force u32)delta)); -	if (skb->ip_summed != CHECKSUM_PARTIAL) -		th->check = csum_fold(csum_partial(skb_transport_header(skb), -						   thlen, skb->csum)); - -out: -	return segs; -} -EXPORT_SYMBOL(tcp_tso_segment); - -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ -	struct sk_buff **pp = NULL; -	struct sk_buff *p; -	struct tcphdr *th; -	struct tcphdr *th2; -	unsigned int len; -	unsigned int thlen; -	__be32 flags; -	unsigned int mss = 1; -	unsigned int hlen; -	unsigned int off; -	int flush = 1; -	int i; - -	off = skb_gro_offset(skb); -	hlen = off + sizeof(*th); -	th = skb_gro_header_fast(skb, off); -	if (skb_gro_header_hard(skb, hlen)) { -		th = skb_gro_header_slow(skb, hlen, off); -		if (unlikely(!th)) -			goto out; -	} - -	thlen = th->doff * 4; -	if (thlen < sizeof(*th)) -		goto out; - -	hlen = off + thlen; -	if (skb_gro_header_hard(skb, hlen)) { -		th = skb_gro_header_slow(skb, hlen, off); -		if (unlikely(!th)) -			goto out; -	} - -	skb_gro_pull(skb, thlen); - -	len = skb_gro_len(skb); -	flags = tcp_flag_word(th); - -	for (; (p = *head); head = &p->next) { -		if (!NAPI_GRO_CB(p)->same_flow) -			continue; - -		th2 = tcp_hdr(p); - -		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { -			NAPI_GRO_CB(p)->same_flow = 0; -			continue; -		} - -		goto found; -	} - -	goto out_check_final; - -found: -	flush = NAPI_GRO_CB(p)->flush; -	flush |= (__force int)(flags & TCP_FLAG_CWR); -	flush |= (__force int)((flags ^ tcp_flag_word(th2)) & -		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); -	flush |= (__force int)(th->ack_seq ^ th2->ack_seq); -	for (i = sizeof(*th); i < thlen; i += 4) -		flush |= *(u32 *)((u8 *)th + i) ^ -			 *(u32 *)((u8 *)th2 + i); - -	mss = skb_shinfo(p)->gso_size; - -	flush |= (len - 1) >= mss; -	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - -	if (flush || skb_gro_receive(head, skb)) { -		mss = 1; -		goto out_check_final; -	} - -	p = *head; -	th2 = tcp_hdr(p); -	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); - -out_check_final: -	flush = len < mss; -	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | -					TCP_FLAG_RST | TCP_FLAG_SYN | -					TCP_FLAG_FIN)); - -	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) -		pp = head; - -out: -	NAPI_GRO_CB(skb)->flush |= flush; - -	return pp; -} -EXPORT_SYMBOL(tcp_gro_receive); - -int tcp_gro_complete(struct sk_buff *skb) -{ -	struct tcphdr *th = tcp_hdr(skb); - -	
skb->csum_start = skb_transport_header(skb) - skb->head; -	skb->csum_offset = offsetof(struct tcphdr, check); -	skb->ip_summed = CHECKSUM_PARTIAL; - -	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - -	if (th->cwr) -		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - -	return 0; -} -EXPORT_SYMBOL(tcp_gro_complete); -  #ifdef CONFIG_TCP_MD5SIG -static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool; -static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly; +static DEFINE_MUTEX(tcp_md5sig_mutex); -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool) +static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)  {  	int cpu; -	for_each_possible_cpu(cpu) { -		struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu); -		if (p) { -			if (p->md5_desc.tfm) -				crypto_free_hash(p->md5_desc.tfm); -			kfree(p); -		} -	} -	free_percpu(pool); -} -void tcp_free_md5sig_pool(void) -{ -	struct tcp_md5sig_pool * __percpu *pool = NULL; +	for_each_possible_cpu(cpu) { +		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); -	spin_lock_bh(&tcp_md5sig_pool_lock); -	if (--tcp_md5sig_users == 0) { -		pool = tcp_md5sig_pool; -		tcp_md5sig_pool = NULL; +		if (p->md5_desc.tfm) +			crypto_free_hash(p->md5_desc.tfm);  	} -	spin_unlock_bh(&tcp_md5sig_pool_lock); -	if (pool) -		__tcp_free_md5sig_pool(pool); +	free_percpu(pool);  } -EXPORT_SYMBOL(tcp_free_md5sig_pool); -static struct tcp_md5sig_pool * __percpu * -__tcp_alloc_md5sig_pool(struct sock *sk) +static void __tcp_alloc_md5sig_pool(void)  {  	int cpu; -	struct tcp_md5sig_pool * __percpu *pool; +	struct tcp_md5sig_pool __percpu *pool; -	pool = alloc_percpu(struct tcp_md5sig_pool *); +	pool = alloc_percpu(struct tcp_md5sig_pool);  	if (!pool) -		return NULL; +		return;  	for_each_possible_cpu(cpu) { -		struct tcp_md5sig_pool *p;  		struct crypto_hash *hash; -		p = kzalloc(sizeof(*p), sk->sk_allocation); -		if (!p) -			goto out_free; -		*per_cpu_ptr(pool, cpu) = p; -  		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); -		if (!hash || IS_ERR(hash)) +		if (IS_ERR_OR_NULL(hash))  			goto out_free; -		p->md5_desc.tfm = hash; +		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;  	} -	return pool; +	/* before setting tcp_md5sig_pool, we must commit all writes +	 * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool() +	 */ +	smp_wmb(); +	tcp_md5sig_pool = pool; +	return;  out_free:  	__tcp_free_md5sig_pool(pool); -	return NULL;  } -struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk) +bool tcp_alloc_md5sig_pool(void)  { -	struct tcp_md5sig_pool * __percpu *pool; -	int alloc = 0; - -retry: -	spin_lock_bh(&tcp_md5sig_pool_lock); -	pool = tcp_md5sig_pool; -	if (tcp_md5sig_users++ == 0) { -		alloc = 1; -		spin_unlock_bh(&tcp_md5sig_pool_lock); -	} else if (!pool) { -		tcp_md5sig_users--; -		spin_unlock_bh(&tcp_md5sig_pool_lock); -		cpu_relax(); -		goto retry; -	} else -		spin_unlock_bh(&tcp_md5sig_pool_lock); - -	if (alloc) { -		/* we cannot hold spinlock here because this may sleep. */ -		struct tcp_md5sig_pool * __percpu *p; - -		p = __tcp_alloc_md5sig_pool(sk); -		spin_lock_bh(&tcp_md5sig_pool_lock); -		if (!p) { -			tcp_md5sig_users--; -			spin_unlock_bh(&tcp_md5sig_pool_lock); -			return NULL; -		} -		pool = tcp_md5sig_pool; -		if (pool) { -			/* oops, it has already been assigned. 
*/ -			spin_unlock_bh(&tcp_md5sig_pool_lock); -			__tcp_free_md5sig_pool(p); -		} else { -			tcp_md5sig_pool = pool = p; -			spin_unlock_bh(&tcp_md5sig_pool_lock); -		} +	if (unlikely(!tcp_md5sig_pool)) { +		mutex_lock(&tcp_md5sig_mutex); + +		if (!tcp_md5sig_pool) +			__tcp_alloc_md5sig_pool(); + +		mutex_unlock(&tcp_md5sig_mutex);  	} -	return pool; +	return tcp_md5sig_pool != NULL;  }  EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -2973,56 +3035,45 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);   */  struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)  { -	struct tcp_md5sig_pool * __percpu *p; +	struct tcp_md5sig_pool __percpu *p;  	local_bh_disable(); - -	spin_lock(&tcp_md5sig_pool_lock); -	p = tcp_md5sig_pool; +	p = ACCESS_ONCE(tcp_md5sig_pool);  	if (p) -		tcp_md5sig_users++; -	spin_unlock(&tcp_md5sig_pool_lock); - -	if (p) -		return *this_cpu_ptr(p); +		return __this_cpu_ptr(p);  	local_bh_enable();  	return NULL;  }  EXPORT_SYMBOL(tcp_get_md5sig_pool); -void tcp_put_md5sig_pool(void) -{ -	local_bh_enable(); -	tcp_free_md5sig_pool(); -} -EXPORT_SYMBOL(tcp_put_md5sig_pool); -  int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, -			struct tcphdr *th) +			const struct tcphdr *th)  {  	struct scatterlist sg; +	struct tcphdr hdr;  	int err; -	__sum16 old_checksum = th->check; -	th->check = 0; +	/* We are not allowed to change tcphdr, make a local copy */ +	memcpy(&hdr, th, sizeof(hdr)); +	hdr.check = 0; +  	/* options aren't included in the hash */ -	sg_init_one(&sg, th, sizeof(struct tcphdr)); -	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); -	th->check = old_checksum; +	sg_init_one(&sg, &hdr, sizeof(hdr)); +	err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));  	return err;  }  EXPORT_SYMBOL(tcp_md5_hash_header);  int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, -			  struct sk_buff *skb, unsigned header_len) +			  const struct sk_buff *skb, unsigned int header_len)  {  	struct scatterlist sg;  	const struct tcphdr *tp = tcp_hdr(skb);  	struct hash_desc *desc = &hp->md5_desc; -	unsigned i; -	const unsigned head_data_len = skb_headlen(skb) > header_len ? -				       skb_headlen(skb) - header_len : 0; +	unsigned int i; +	const unsigned int head_data_len = skb_headlen(skb) > header_len ? +					   skb_headlen(skb) - header_len : 0;  	const struct skb_shared_info *shi = skb_shinfo(skb);  	struct sk_buff *frag_iter; @@ -3034,8 +3085,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,  	for (i = 0; i < shi->nr_frags; ++i) {  		const struct skb_frag_struct *f = &shi->frags[i]; -		sg_set_page(&sg, f->page, f->size, f->page_offset); -		if (crypto_hash_update(desc, &sg, f->size)) +		unsigned int offset = f->page_offset; +		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); + +		sg_set_page(&sg, page, skb_frag_size(f), +			    offset_in_page(offset)); +		if (crypto_hash_update(desc, &sg, skb_frag_size(f)))  			return 1;  	} @@ -3047,7 +3102,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,  }  EXPORT_SYMBOL(tcp_md5_hash_skb_data); -int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) +int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)  {  	struct scatterlist sg; @@ -3058,142 +3113,17 @@ EXPORT_SYMBOL(tcp_md5_hash_key);  #endif -/** - * Each Responder maintains up to two secret values concurrently for - * efficient secret rollover.  Each secret value has 4 states: - * - * Generating.  
(tcp_secret_generating != tcp_secret_primary) - *    Generates new Responder-Cookies, but not yet used for primary - *    verification.  This is a short-term state, typically lasting only - *    one round trip time (RTT). - * - * Primary.  (tcp_secret_generating == tcp_secret_primary) - *    Used both for generation and primary verification. - * - * Retiring.  (tcp_secret_retiring != tcp_secret_secondary) - *    Used for verification, until the first failure that can be - *    verified by the newer Generating secret.  At that time, this - *    cookie's state is changed to Secondary, and the Generating - *    cookie's state is changed to Primary.  This is a short-term state, - *    typically lasting only one round trip time (RTT). - * - * Secondary.  (tcp_secret_retiring == tcp_secret_secondary) - *    Used for secondary verification, after primary verification - *    failures.  This state lasts no more than twice the Maximum Segment - *    Lifetime (2MSL).  Then, the secret is discarded. - */ -struct tcp_cookie_secret { -	/* The secret is divided into two parts.  The digest part is the -	 * equivalent of previously hashing a secret and saving the state, -	 * and serves as an initialization vector (IV).  The message part -	 * serves as the trailing secret. -	 */ -	u32				secrets[COOKIE_WORKSPACE_WORDS]; -	unsigned long			expires; -}; - -#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) -#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) -#define TCP_SECRET_LIFE (HZ * 600) - -static struct tcp_cookie_secret tcp_secret_one; -static struct tcp_cookie_secret tcp_secret_two; - -/* Essentially a circular list, without dynamic allocation. */ -static struct tcp_cookie_secret *tcp_secret_generating; -static struct tcp_cookie_secret *tcp_secret_primary; -static struct tcp_cookie_secret *tcp_secret_retiring; -static struct tcp_cookie_secret *tcp_secret_secondary; - -static DEFINE_SPINLOCK(tcp_secret_locker); - -/* Select a pseudo-random word in the cookie workspace. - */ -static inline u32 tcp_cookie_work(const u32 *ws, const int n) -{ -	return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; -} - -/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. - * Called in softirq context. - * Returns: 0 for success. - */ -int tcp_cookie_generator(u32 *bakery) -{ -	unsigned long jiffy = jiffies; - -	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { -		spin_lock_bh(&tcp_secret_locker); -		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { -			/* refreshed by another */ -			memcpy(bakery, -			       &tcp_secret_generating->secrets[0], -			       COOKIE_WORKSPACE_WORDS); -		} else { -			/* still needs refreshing */ -			get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); - -			/* The first time, paranoia assumes that the -			 * randomization function isn't as strong.  But, -			 * this secret initialization is delayed until -			 * the last possible moment (packet arrival). -			 * Although that time is observable, it is -			 * unpredictably variable.  Mash in the most -			 * volatile clock bits available, and expire the -			 * secret extra quickly. 
-			 */ -			if (unlikely(tcp_secret_primary->expires == -				     tcp_secret_secondary->expires)) { -				struct timespec tv; - -				getnstimeofday(&tv); -				bakery[COOKIE_DIGEST_WORDS+0] ^= -					(u32)tv.tv_nsec; - -				tcp_secret_secondary->expires = jiffy -					+ TCP_SECRET_1MSL -					+ (0x0f & tcp_cookie_work(bakery, 0)); -			} else { -				tcp_secret_secondary->expires = jiffy -					+ TCP_SECRET_LIFE -					+ (0xff & tcp_cookie_work(bakery, 1)); -				tcp_secret_primary->expires = jiffy -					+ TCP_SECRET_2MSL -					+ (0x1f & tcp_cookie_work(bakery, 2)); -			} -			memcpy(&tcp_secret_secondary->secrets[0], -			       bakery, COOKIE_WORKSPACE_WORDS); - -			rcu_assign_pointer(tcp_secret_generating, -					   tcp_secret_secondary); -			rcu_assign_pointer(tcp_secret_retiring, -					   tcp_secret_primary); -			/* -			 * Neither call_rcu() nor synchronize_rcu() needed. -			 * Retiring data is not freed.  It is replaced after -			 * further (locked) pointer updates, and a quiet time -			 * (minimum 1MSL, maximum LIFE - 2MSL). -			 */ -		} -		spin_unlock_bh(&tcp_secret_locker); -	} else { -		rcu_read_lock_bh(); -		memcpy(bakery, -		       &rcu_dereference(tcp_secret_generating)->secrets[0], -		       COOKIE_WORKSPACE_WORDS); -		rcu_read_unlock_bh(); -	} -	return 0; -} -EXPORT_SYMBOL(tcp_cookie_generator); -  void tcp_done(struct sock *sk)  { +	struct request_sock *req = tcp_sk(sk)->fastopen_rsk; +  	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)  		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);  	tcp_set_state(sk, TCP_CLOSE);  	tcp_clear_xmit_timers(sk); +	if (req != NULL) +		reqsk_fastopen_remove(sk, req, false);  	sk->sk_shutdown = SHUTDOWN_MASK; @@ -3209,19 +3139,34 @@ extern struct tcp_congestion_ops tcp_reno;  static __initdata unsigned long thash_entries;  static int __init set_thash_entries(char *str)  { +	ssize_t ret; +  	if (!str)  		return 0; -	thash_entries = simple_strtoul(str, &str, 0); + +	ret = kstrtoul(str, 0, &thash_entries); +	if (ret) +		return 0; +  	return 1;  }  __setup("thash_entries=", set_thash_entries); +static void tcp_init_mem(void) +{ +	unsigned long limit = nr_free_buffer_pages() / 8; +	limit = max(limit, 128UL); +	sysctl_tcp_mem[0] = limit / 4 * 3; +	sysctl_tcp_mem[1] = limit; +	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; +} +  void __init tcp_init(void)  {  	struct sk_buff *skb = NULL; -	unsigned long nr_pages, limit; -	int i, max_share, cnt; -	unsigned long jiffy = jiffies; +	unsigned long limit; +	int max_rshare, max_wshare, cnt; +	unsigned int i;  	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3241,29 +3186,28 @@ void __init tcp_init(void)  		alloc_large_system_hash("TCP established",  					sizeof(struct inet_ehash_bucket),  					thash_entries, -					(totalram_pages >= 128 * 1024) ? -					13 : 15, +					17, /* one slot per 128 KB of memory */  					0,  					NULL,  					&tcp_hashinfo.ehash_mask, +					0,  					thash_entries ? 0 : 512 * 1024); -	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { +	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)  		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); -		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); -	} +  	if (inet_ehash_locks_alloc(&tcp_hashinfo))  		panic("TCP: failed to alloc ehash_locks");  	tcp_hashinfo.bhash =  		alloc_large_system_hash("TCP bind",  					sizeof(struct inet_bind_hashbucket),  					tcp_hashinfo.ehash_mask + 1, -					(totalram_pages >= 128 * 1024) ? 
-					13 : 15, +					17, /* one slot per 128 KB of memory */  					0,  					&tcp_hashinfo.bhash_size,  					NULL, +					0,  					64 * 1024); -	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; +	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;  	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {  		spin_lock_init(&tcp_hashinfo.bhash[i].lock);  		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); @@ -3276,42 +3220,26 @@ void __init tcp_init(void)  	sysctl_tcp_max_orphans = cnt / 2;  	sysctl_max_syn_backlog = max(128, cnt / 256); -	/* Set the pressure threshold to be a fraction of global memory that -	 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of -	 * memory, with a floor of 128 pages. -	 */ -	nr_pages = totalram_pages - totalhigh_pages; -	limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); -	limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); -	limit = max(limit, 128UL); -	sysctl_tcp_mem[0] = limit / 4 * 3; -	sysctl_tcp_mem[1] = limit; -	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; - +	tcp_init_mem();  	/* Set per-socket limits to no more than 1/128 the pressure threshold */ -	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); -	max_share = min(4UL*1024*1024, limit); +	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +	max_wshare = min(4UL*1024*1024, limit); +	max_rshare = min(6UL*1024*1024, limit);  	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;  	sysctl_tcp_wmem[1] = 16*1024; -	sysctl_tcp_wmem[2] = max(64*1024, max_share); +	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);  	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;  	sysctl_tcp_rmem[1] = 87380; -	sysctl_tcp_rmem[2] = max(87380, max_share); +	sysctl_tcp_rmem[2] = max(87380, max_rshare); + +	pr_info("Hash tables configured (established %u bind %u)\n", +		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); -	printk(KERN_INFO "TCP: Hash tables configured " -	       "(established %u bind %u)\n", -	       tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); +	tcp_metrics_init();  	tcp_register_congestion_control(&tcp_reno); -	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); -	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); -	tcp_secret_one.expires = jiffy; /* past due */ -	tcp_secret_two.expires = jiffy; /* past due */ -	tcp_secret_generating = &tcp_secret_one; -	tcp_secret_primary = &tcp_secret_one; -	tcp_secret_retiring = &tcp_secret_two; -	tcp_secret_secondary = &tcp_secret_two; +	tcp_tasklet_init();  } diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 3b53fd1af23..d5de69bc04f 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -63,7 +63,6 @@ static inline void bictcp_reset(struct bictcp *ca)  {  	ca->cnt = 0;  	ca->last_max_cwnd = 0; -	ca->loss_cwnd = 0;  	ca->last_cwnd = 0;  	ca->last_time = 0;  	ca->epoch_start = 0; @@ -72,7 +71,11 @@ static inline void bictcp_reset(struct bictcp *ca)  static void bictcp_init(struct sock *sk)  { -	bictcp_reset(inet_csk_ca(sk)); +	struct bictcp *ca = inet_csk_ca(sk); + +	bictcp_reset(ca); +	ca->loss_cwnd = 0; +  	if (initial_ssthresh)  		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;  } @@ -127,7 +130,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  	}  	/* if in slow start or link utilization is very low */ -	if (ca->loss_cwnd == 0) { +	if (ca->last_max_cwnd == 0) {  		if (ca->cnt > 20) /* increase cwnd 5% per RTT */  			ca->cnt = 20;  	} @@ -137,16 +140,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  		ca->cnt = 1;  } -static void 
bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct bictcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		bictcp_update(ca, tp->snd_cwnd);  		tcp_cong_avoid_ai(tp, ca->cnt); @@ -185,7 +188,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)  {  	const struct tcp_sock *tp = tcp_sk(sk);  	const struct bictcp *ca = inet_csk_ca(sk); -	return max(tp->snd_cwnd, ca->last_max_cwnd); +	return max(tp->snd_cwnd, ca->loss_cwnd);  }  static void bictcp_state(struct sock *sk, u8 new_state) @@ -209,7 +212,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)  } -static struct tcp_congestion_ops bictcp = { +static struct tcp_congestion_ops bictcp __read_mostly = {  	.init		= bictcp_init,  	.ssthresh	= bictcp_recalc_ssthresh,  	.cong_avoid	= bictcp_cong_avoid, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 850c737e08e..7b09d8b49fa 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,11 +1,13 @@  /*   * Plugable TCP congestion control support and newReno   * congestion control. - * Based on ideas from I/O scheduler suport and Web100. + * Based on ideas from I/O scheduler support and Web100.   *   * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>   */ +#define pr_fmt(fmt) "TCP: " fmt +  #include <linux/module.h>  #include <linux/mm.h>  #include <linux/types.h> @@ -13,8 +15,6 @@  #include <linux/gfp.h>  #include <net/tcp.h> -int sysctl_tcp_max_ssthresh = 0; -  static DEFINE_SPINLOCK(tcp_cong_list_lock);  static LIST_HEAD(tcp_cong_list); @@ -41,18 +41,17 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)  	/* all algorithms must implement ssthresh and cong_avoid ops */  	if (!ca->ssthresh || !ca->cong_avoid) { -		printk(KERN_ERR "TCP %s does not implement required ops\n", -		       ca->name); +		pr_err("%s does not implement required ops\n", ca->name);  		return -EINVAL;  	}  	spin_lock(&tcp_cong_list_lock);  	if (tcp_ca_find(ca->name)) { -		printk(KERN_NOTICE "TCP %s already registered\n", ca->name); +		pr_notice("%s already registered\n", ca->name);  		ret = -EEXIST;  	} else {  		list_add_tail_rcu(&ca->list, &tcp_cong_list); -		printk(KERN_INFO "TCP %s registered\n", ca->name); +		pr_info("%s registered\n", ca->name);  	}  	spin_unlock(&tcp_cong_list_lock); @@ -258,7 +257,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)  	if (!ca)  		err = -ENOENT; -	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) +	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || +		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))  		err = -EPERM;  	else if (!try_module_get(ca->owner)) @@ -276,65 +276,24 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)  	return err;  } -/* RFC2861 Check whether we are limited by application or congestion window - * This is the inverse of cwnd check in tcp_tso_should_defer - */ -int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) -{ -	const struct tcp_sock *tp = tcp_sk(sk); -	u32 left; - -	if (in_flight >= tp->snd_cwnd) -		return 1; - -	left = tp->snd_cwnd - in_flight; -	if (sk_can_gso(sk) && -	    left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && -	    left * tp->mss_cache < sk->sk_gso_max_size) -		return 1; -	return left <= tcp_max_burst(tp); -} 
-EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); - -/* - * Slow start is used when congestion window is less than slow start - * threshold. This version implements the basic RFC2581 version - * and optionally supports: - * 	RFC3742 Limited Slow Start  	  - growth limited to max_ssthresh - *	RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged +/* Slow start is used when congestion window is no greater than the slow start + * threshold. We base on RFC2581 and also handle stretch ACKs properly. + * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but + * something better;) a packet is only considered (s)acked in its entirety to + * defend the ACK attacks described in the RFC. Slow start processes a stretch + * ACK of degree N as if N acks of degree 1 are received back to back except + * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and + * returns the leftover acks to adjust cwnd in congestion avoidance mode.   */ -void tcp_slow_start(struct tcp_sock *tp) +int tcp_slow_start(struct tcp_sock *tp, u32 acked)  { -	int cnt; /* increase in packets */ - -	/* RFC3465: ABC Slow start -	 * Increase only after a full MSS of bytes is acked -	 * -	 * TCP sender SHOULD increase cwnd by the number of -	 * previously unacknowledged bytes ACKed by each incoming -	 * acknowledgment, provided the increase is not more than L -	 */ -	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache) -		return; +	u32 cwnd = tp->snd_cwnd + acked; -	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) -		cnt = sysctl_tcp_max_ssthresh >> 1;	/* limited slow start */ -	else -		cnt = tp->snd_cwnd;			/* exponential increase */ - -	/* RFC3465: ABC -	 * We MAY increase by 2 if discovered delayed ack -	 */ -	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) -		cnt <<= 1; -	tp->bytes_acked = 0; - -	tp->snd_cwnd_cnt += cnt; -	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { -		tp->snd_cwnd_cnt -= tp->snd_cwnd; -		if (tp->snd_cwnd < tp->snd_cwnd_clamp) -			tp->snd_cwnd++; -	} +	if (cwnd > tp->snd_ssthresh) +		cwnd = tp->snd_ssthresh + 1; +	acked -= cwnd - tp->snd_cwnd; +	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); +	return acked;  }  EXPORT_SYMBOL_GPL(tcp_slow_start); @@ -358,30 +317,19 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);  /* This is Jacobson's slow start and congestion avoidance.   * SIGCOMM '88, p. 328.   */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	/* In "safe" area, increase. */  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); - +		tcp_slow_start(tp, acked);  	/* In dangerous area, increase slowly. */ -	else if (sysctl_tcp_abc) { -		/* RFC3465: Appropriate Byte Count -		 * increase once for each full cwnd acked -		 */ -		if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { -			tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; -			if (tp->snd_cwnd < tp->snd_cwnd_clamp) -				tp->snd_cwnd++; -		} -	} else { +	else  		tcp_cong_avoid_ai(tp, tp->snd_cwnd); -	}  }  EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); @@ -393,21 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk)  }  EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window with halving. 
*/ -u32 tcp_reno_min_cwnd(const struct sock *sk) -{ -	const struct tcp_sock *tp = tcp_sk(sk); -	return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); -  struct tcp_congestion_ops tcp_reno = {  	.flags		= TCP_CONG_NON_RESTRICTED,  	.name		= "reno",  	.owner		= THIS_MODULE,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_reno_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  };  /* Initial congestion control used (until SYN) @@ -419,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops  = {  	.owner		= THIS_MODULE,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_reno_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  };  EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 71d5f2f29fa..a9bd8a4828a 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -39,7 +39,7 @@  /* Number of delay samples for detecting the increase of delay */  #define HYSTART_MIN_SAMPLES	8 -#define HYSTART_DELAY_MIN	(2U<<3) +#define HYSTART_DELAY_MIN	(4U<<3)  #define HYSTART_DELAY_MAX	(16U<<3)  #define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) @@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1;  static int hystart __read_mostly = 1;  static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;  static int hystart_low_window __read_mostly = 16; +static int hystart_ack_delta __read_mostly = 2;  static u32 cube_rtt_scale __read_mostly;  static u32 beta_scale __read_mostly; @@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"  		 " 1: packet-train 2: delay 3: both packet-train and delay");  module_param(hystart_low_window, int, 0644);  MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); +module_param(hystart_ack_delta, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");  /* BIC TCP Parameters */  struct bictcp { @@ -85,17 +88,18 @@ struct bictcp {  	u32	last_time;	/* time when updated last_cwnd */  	u32	bic_origin_point;/* origin point of bic function */  	u32	bic_K;		/* time to origin point from the beginning of the current epoch */ -	u32	delay_min;	/* min delay */ +	u32	delay_min;	/* min delay (msec << 3) */  	u32	epoch_start;	/* beginning of an epoch */  	u32	ack_cnt;	/* number of acks */  	u32	tcp_cwnd;	/* estimated tcp cwnd */  #define ACK_RATIO_SHIFT	4 +#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)  	u16	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */  	u8	sample_cnt;	/* number of samples to decide curr_rtt */  	u8	found;		/* the exit point is found? 
*/  	u32	round_start;	/* beginning of each round */  	u32	end_seq;	/* end_seq of the round */ -	u32	last_jiffies;	/* last time when the ACK spacing is close */ +	u32	last_ack;	/* last time when the ACK spacing is close */  	u32	curr_rtt;	/* the minimum rtt of current round */  }; @@ -103,7 +107,6 @@ static inline void bictcp_reset(struct bictcp *ca)  {  	ca->cnt = 0;  	ca->last_max_cwnd = 0; -	ca->loss_cwnd = 0;  	ca->last_cwnd = 0;  	ca->last_time = 0;  	ca->bic_origin_point = 0; @@ -116,12 +119,21 @@ static inline void bictcp_reset(struct bictcp *ca)  	ca->found = 0;  } +static inline u32 bictcp_clock(void) +{ +#if HZ < 1000 +	return ktime_to_ms(ktime_get_real()); +#else +	return jiffies_to_msecs(jiffies); +#endif +} +  static inline void bictcp_hystart_reset(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct bictcp *ca = inet_csk_ca(sk); -	ca->round_start = ca->last_jiffies = jiffies; +	ca->round_start = ca->last_ack = bictcp_clock();  	ca->end_seq = tp->snd_nxt;  	ca->curr_rtt = 0;  	ca->sample_cnt = 0; @@ -129,7 +141,10 @@ static inline void bictcp_hystart_reset(struct sock *sk)  static void bictcp_init(struct sock *sk)  { -	bictcp_reset(inet_csk_ca(sk)); +	struct bictcp *ca = inet_csk_ca(sk); + +	bictcp_reset(ca); +	ca->loss_cwnd = 0;  	if (hystart)  		bictcp_hystart_reset(sk); @@ -191,8 +206,8 @@ static u32 cubic_root(u64 a)   */  static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  { -	u64 offs; -	u32 delta, t, bic_target, max_cnt; +	u32 delta, bic_target, max_cnt; +	u64 offs, t;  	ca->ack_cnt++;	/* count the number of ACKs */ @@ -235,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  	 * if the cwnd < 1 million packets !!!  	 */ +	t = (s32)(tcp_time_stamp - ca->epoch_start); +	t += msecs_to_jiffies(ca->delay_min >> 3);  	/* change the unit from HZ to bictcp_HZ */ -	t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) -	     << BICTCP_HZ) / HZ; +	t <<= BICTCP_HZ; +	do_div(t, HZ);  	if (t < ca->bic_K)		/* t - K */  		offs = ca->bic_K - t; @@ -258,6 +275,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  		ca->cnt = 100 * cwnd;              /* very small increment*/  	} +	/* +	 * The initial growth of cubic function may be too conservative +	 * when the available bandwidth is still unknown. 
+	 */ +	if (ca->last_max_cwnd == 0 && ca->cnt > 20) +		ca->cnt = 20;	/* increase cwnd 5% per RTT */ +  	/* TCP Friendly */  	if (tcp_friendliness) {  		u32 scale = beta_scale; @@ -280,18 +304,18 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)  		ca->cnt = 1;  } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct bictcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) {  		if (hystart && after(ack, ca->end_seq))  			bictcp_hystart_reset(sk); -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	} else {  		bictcp_update(ca, tp->snd_cwnd);  		tcp_cong_avoid_ai(tp, ca->cnt); @@ -322,7 +346,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)  {  	struct bictcp *ca = inet_csk_ca(sk); -	return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); +	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);  }  static void bictcp_state(struct sock *sk, u8 new_state) @@ -339,12 +363,12 @@ static void hystart_update(struct sock *sk, u32 delay)  	struct bictcp *ca = inet_csk_ca(sk);  	if (!(ca->found & hystart_detect)) { -		u32 curr_jiffies = jiffies; +		u32 now = bictcp_clock();  		/* first detection parameter - ack-train detection */ -		if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { -			ca->last_jiffies = curr_jiffies; -			if (curr_jiffies - ca->round_start >= ca->delay_min>>4) +		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { +			ca->last_ack = now; +			if ((s32)(now - ca->round_start) > ca->delay_min >> 4)  				ca->found |= HYSTART_ACK_TRAIN;  		} @@ -379,8 +403,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)  	u32 delay;  	if (icsk->icsk_ca_state == TCP_CA_Open) { -		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; -		ca->delayed_ack += cnt; +		u32 ratio = ca->delayed_ack; + +		ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; +		ratio += cnt; + +		ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);  	}  	/* Some calls are for duplicates without timetamps */ @@ -388,10 +416,10 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)  		return;  	/* Discard delay samples right after fast recovery */ -	if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) +	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)  		return; -	delay = usecs_to_jiffies(rtt_us) << 3; +	delay = (rtt_us << 3) / USEC_PER_MSEC;  	if (delay == 0)  		delay = 1; @@ -405,7 +433,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)  		hystart_update(sk, delay);  } -static struct tcp_congestion_ops cubictcp = { +static struct tcp_congestion_ops cubictcp __read_mostly = {  	.init		= bictcp_init,  	.ssthresh	= bictcp_recalc_ssthresh,  	.cong_avoid	= bictcp_cong_avoid, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 939edb3b8e4..ed3f2ad42e0 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -34,11 +34,23 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,  		tcp_get_info(sk, info);  } +static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, +		struct inet_diag_req_v2 *r, struct nlattr *bc) +{ +	inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); +} + +static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +		struct inet_diag_req_v2 *req) +{ +	return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); +} +  static const struct 
inet_diag_handler tcp_diag_handler = { -	.idiag_hashinfo	 = &tcp_hashinfo, +	.dump		 = tcp_diag_dump, +	.dump_one	 = tcp_diag_dump_one,  	.idiag_get_info	 = tcp_diag_get_info, -	.idiag_type	 = TCPDIAG_GETSOCK, -	.idiag_info_size = sizeof(struct tcp_info), +	.idiag_type	 = IPPROTO_TCP,  };  static int __init tcp_diag_init(void) @@ -54,4 +66,4 @@ static void __exit tcp_diag_exit(void)  module_init(tcp_diag_init);  module_exit(tcp_diag_exit);  MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c new file mode 100644 index 00000000000..9771563ab56 --- /dev/null +++ b/net/ipv4/tcp_fastopen.c @@ -0,0 +1,295 @@ +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/tcp.h> +#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <net/inetpeer.h> +#include <net/tcp.h> + +int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; + +struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; + +static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); + +void tcp_fastopen_init_key_once(bool publish) +{ +	static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + +	/* tcp_fastopen_reset_cipher publishes the new context +	 * atomically, so we allow this race happening here. +	 * +	 * All call sites of tcp_fastopen_cookie_gen also check +	 * for a valid cookie, so this is an acceptable risk. +	 */ +	if (net_get_random_once(key, sizeof(key)) && publish) +		tcp_fastopen_reset_cipher(key, sizeof(key)); +} + +static void tcp_fastopen_ctx_free(struct rcu_head *head) +{ +	struct tcp_fastopen_context *ctx = +	    container_of(head, struct tcp_fastopen_context, rcu); +	crypto_free_cipher(ctx->tfm); +	kfree(ctx); +} + +int tcp_fastopen_reset_cipher(void *key, unsigned int len) +{ +	int err; +	struct tcp_fastopen_context *ctx, *octx; + +	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); +	if (!ctx) +		return -ENOMEM; +	ctx->tfm = crypto_alloc_cipher("aes", 0, 0); + +	if (IS_ERR(ctx->tfm)) { +		err = PTR_ERR(ctx->tfm); +error:		kfree(ctx); +		pr_err("TCP: TFO aes cipher alloc error: %d\n", err); +		return err; +	} +	err = crypto_cipher_setkey(ctx->tfm, key, len); +	if (err) { +		pr_err("TCP: TFO cipher key error: %d\n", err); +		crypto_free_cipher(ctx->tfm); +		goto error; +	} +	memcpy(ctx->key, key, len); + +	spin_lock(&tcp_fastopen_ctx_lock); + +	octx = rcu_dereference_protected(tcp_fastopen_ctx, +				lockdep_is_held(&tcp_fastopen_ctx_lock)); +	rcu_assign_pointer(tcp_fastopen_ctx, ctx); +	spin_unlock(&tcp_fastopen_ctx_lock); + +	if (octx) +		call_rcu(&octx->rcu, tcp_fastopen_ctx_free); +	return err; +} + +static bool __tcp_fastopen_cookie_gen(const void *path, +				      struct tcp_fastopen_cookie *foc) +{ +	struct tcp_fastopen_context *ctx; +	bool ok = false; + +	tcp_fastopen_init_key_once(true); + +	rcu_read_lock(); +	ctx = rcu_dereference(tcp_fastopen_ctx); +	if (ctx) { +		crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); +		foc->len = TCP_FASTOPEN_COOKIE_SIZE; +		ok = true; +	} +	rcu_read_unlock(); +	return ok; +} + +/* Generate the fastopen cookie by doing aes128 encryption on both + * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 + * addresses. For the longer IPv6 addresses use CBC-MAC. + * + * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
+ */ +static bool tcp_fastopen_cookie_gen(struct request_sock *req, +				    struct sk_buff *syn, +				    struct tcp_fastopen_cookie *foc) +{ +	if (req->rsk_ops->family == AF_INET) { +		const struct iphdr *iph = ip_hdr(syn); + +		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; +		return __tcp_fastopen_cookie_gen(path, foc); +	} + +#if IS_ENABLED(CONFIG_IPV6) +	if (req->rsk_ops->family == AF_INET6) { +		const struct ipv6hdr *ip6h = ipv6_hdr(syn); +		struct tcp_fastopen_cookie tmp; + +		if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { +			struct in6_addr *buf = (struct in6_addr *) tmp.val; +			int i = 4; + +			for (i = 0; i < 4; i++) +				buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; +			return __tcp_fastopen_cookie_gen(buf, foc); +		} +	} +#endif +	return false; +} + +static bool tcp_fastopen_create_child(struct sock *sk, +				      struct sk_buff *skb, +				      struct dst_entry *dst, +				      struct request_sock *req) +{ +	struct tcp_sock *tp; +	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; +	struct sock *child; + +	req->num_retrans = 0; +	req->num_timeout = 0; +	req->sk = NULL; + +	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); +	if (child == NULL) +		return false; + +	spin_lock(&queue->fastopenq->lock); +	queue->fastopenq->qlen++; +	spin_unlock(&queue->fastopenq->lock); + +	/* Initialize the child socket. Have to fix some values to take +	 * into account the child is a Fast Open socket and is created +	 * only out of the bits carried in the SYN packet. +	 */ +	tp = tcp_sk(child); + +	tp->fastopen_rsk = req; +	/* Do a hold on the listner sk so that if the listener is being +	 * closed, the child that has been accepted can live on and still +	 * access listen_lock. +	 */ +	sock_hold(sk); +	tcp_rsk(req)->listener = sk; + +	/* RFC1323: The window in SYN & SYN/ACK segments is never +	 * scaled. So correct it appropriately. +	 */ +	tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + +	/* Activate the retrans timer so that SYNACK can be retransmitted. +	 * The request socket is not added to the SYN table of the parent +	 * because it's been added to the accept queue directly. +	 */ +	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, +				  TCP_TIMEOUT_INIT, TCP_RTO_MAX); + +	/* Add the child socket directly into the accept queue */ +	inet_csk_reqsk_queue_add(sk, req, child); + +	/* Now finish processing the fastopen child socket. */ +	inet_csk(child)->icsk_af_ops->rebuild_header(child); +	tcp_init_congestion_control(child); +	tcp_mtup_init(child); +	tcp_init_metrics(child); +	tcp_init_buffer_space(child); + +	/* Queue the data carried in the SYN packet. We need to first +	 * bump skb's refcnt because the caller will attempt to free it. +	 * +	 * XXX (TFO) - we honor a zero-payload TFO request for now, +	 * (any reason not to?) but no need to queue the skb since +	 * there is no data. How about SYN+FIN? 
+	 */ +	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { +		skb = skb_get(skb); +		skb_dst_drop(skb); +		__skb_pull(skb, tcp_hdr(skb)->doff * 4); +		skb_set_owner_r(skb, child); +		__skb_queue_tail(&child->sk_receive_queue, skb); +		tp->syn_data_acked = 1; +	} +	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; +	sk->sk_data_ready(sk); +	bh_unlock_sock(child); +	sock_put(child); +	WARN_ON(req->sk == NULL); +	return true; +} +EXPORT_SYMBOL(tcp_fastopen_create_child); + +static bool tcp_fastopen_queue_check(struct sock *sk) +{ +	struct fastopen_queue *fastopenq; + +	/* Make sure the listener has enabled fastopen, and we don't +	 * exceed the max # of pending TFO requests allowed before trying +	 * to validating the cookie in order to avoid burning CPU cycles +	 * unnecessarily. +	 * +	 * XXX (TFO) - The implication of checking the max_qlen before +	 * processing a cookie request is that clients can't differentiate +	 * between qlen overflow causing Fast Open to be disabled +	 * temporarily vs a server not supporting Fast Open at all. +	 */ +	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; +	if (fastopenq == NULL || fastopenq->max_qlen == 0) +		return false; + +	if (fastopenq->qlen >= fastopenq->max_qlen) { +		struct request_sock *req1; +		spin_lock(&fastopenq->lock); +		req1 = fastopenq->rskq_rst_head; +		if ((req1 == NULL) || time_after(req1->expires, jiffies)) { +			spin_unlock(&fastopenq->lock); +			NET_INC_STATS_BH(sock_net(sk), +					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); +			return false; +		} +		fastopenq->rskq_rst_head = req1->dl_next; +		fastopenq->qlen--; +		spin_unlock(&fastopenq->lock); +		reqsk_free(req1); +	} +	return true; +} + +/* Returns true if we should perform Fast Open on the SYN. The cookie (foc) + * may be updated and return the client in the SYN-ACK later. E.g., Fast Open + * cookie request (foc->len == 0). + */ +bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, +		      struct request_sock *req, +		      struct tcp_fastopen_cookie *foc, +		      struct dst_entry *dst) +{ +	struct tcp_fastopen_cookie valid_foc = { .len = -1 }; +	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + +	if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && +	      (syn_data || foc->len >= 0) && +	      tcp_fastopen_queue_check(sk))) { +		foc->len = -1; +		return false; +	} + +	if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) +		goto fastopen; + +	if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && +	    foc->len == TCP_FASTOPEN_COOKIE_SIZE && +	    foc->len == valid_foc.len && +	    !memcmp(foc->val, valid_foc.val, foc->len)) { +		/* Cookie is valid. Create a (full) child socket to accept +		 * the data in SYN before returning a SYN-ACK to ack the +		 * data. If we fail to create the socket, fall back and +		 * ack the ISN only but includes the same cookie. +		 * +		 * Note: Data-less SYN with valid cookie is allowed to send +		 * data in SYN_RECV state. +		 */ +fastopen: +		if (tcp_fastopen_create_child(sk, skb, dst, req)) { +			foc->len = -1; +			NET_INC_STATS_BH(sock_net(sk), +					 LINUX_MIB_TCPFASTOPENPASSIVE); +			return true; +		} +	} + +	NET_INC_STATS_BH(sock_net(sk), foc->len ? 
+			 LINUX_MIB_TCPFASTOPENPASSIVEFAIL : +			 LINUX_MIB_TCPFASTOPENCOOKIEREQD); +	*foc = valid_foc; +	return false; +} +EXPORT_SYMBOL(tcp_try_fastopen); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8b6caaf75bb..1c4908280d9 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,16 +109,16 @@ static void hstcp_init(struct sock *sk)  	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);  } -static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight) +static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct hstcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		/* Update AIMD parameters.  		 * @@ -158,11 +158,10 @@ static u32 hstcp_ssthresh(struct sock *sk)  } -static struct tcp_congestion_ops tcp_highspeed = { +static struct tcp_congestion_ops tcp_highspeed __read_mostly = {  	.init		= hstcp_init,  	.ssthresh	= hstcp_ssthresh,  	.cong_avoid	= hstcp_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.owner		= THIS_MODULE,  	.name		= "highspeed" diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 7c94a495541..031361311a8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -227,16 +227,16 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)  	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);  } -static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct htcp *ca = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		/* In dangerous area, increase slowly.  		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd @@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)  	}  } -static struct tcp_congestion_ops htcp = { +static struct tcp_congestion_ops htcp __read_mostly = {  	.init		= htcp_init,  	.ssthresh	= htcp_recalc_ssthresh,  	.cong_avoid	= htcp_cong_avoid, diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 377bc934937..d8f8f05a495 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -15,17 +15,16 @@  /* Tcp Hybla structure. 
*/  struct hybla { -	u8    hybla_en; +	bool  hybla_en;  	u32   snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */  	u32   rho;	      /* Rho parameter, integer part  */  	u32   rho2;	      /* Rho * Rho, integer part */  	u32   rho_3ls;	      /* Rho parameter, <<3 */  	u32   rho2_7ls;	      /* Rho^2, <<7	*/ -	u32   minrtt;	      /* Minimum smoothed round trip time value seen */ +	u32   minrtt_us;      /* Minimum smoothed round trip time value seen */  }; -/* Hybla reference round trip time (default= 1/40 sec = 25 ms), -   expressed in jiffies */ +/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */  static int rtt0 = 25;  module_param(rtt0, int, 0644);  MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); @@ -36,10 +35,12 @@ static inline void hybla_recalc_param (struct sock *sk)  {  	struct hybla *ca = inet_csk_ca(sk); -	ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); +	ca->rho_3ls = max_t(u32, +			    tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC), +			    8U);  	ca->rho = ca->rho_3ls >> 3;  	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; -	ca->rho2 = ca->rho2_7ls >>7; +	ca->rho2 = ca->rho2_7ls >> 7;  }  static void hybla_init(struct sock *sk) @@ -52,7 +53,7 @@ static void hybla_init(struct sock *sk)  	ca->rho_3ls = 0;  	ca->rho2_7ls = 0;  	ca->snd_cwnd_cents = 0; -	ca->hybla_en = 1; +	ca->hybla_en = true;  	tp->snd_cwnd = 2;  	tp->snd_cwnd_clamp = 65535; @@ -60,13 +61,14 @@ static void hybla_init(struct sock *sk)  	hybla_recalc_param(sk);  	/* set minimum rtt as this is the 1st ever seen */ -	ca->minrtt = tp->srtt; +	ca->minrtt_us = tp->srtt_us;  	tp->snd_cwnd = ca->rho;  }  static void hybla_state(struct sock *sk, u8 ca_state)  {  	struct hybla *ca = inet_csk_ca(sk); +  	ca->hybla_en = (ca_state == TCP_CA_Open);  } @@ -85,7 +87,7 @@ static inline u32 hybla_fraction(u32 odds)   *     o Give cwnd a new value based on the model proposed   *     o remember increments <1   */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct hybla *ca = inet_csk_ca(sk); @@ -93,16 +95,16 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  	int is_slowstart = 0;  	/*  Recalculate rho only if this srtt is the lowest */ -	if (tp->srtt < ca->minrtt){ +	if (tp->srtt_us < ca->minrtt_us) {  		hybla_recalc_param(sk); -		ca->minrtt = tp->srtt; +		ca->minrtt_us = tp->srtt_us;  	} -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (!ca->hybla_en) { -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  		return;  	} @@ -162,10 +164,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);  } -static struct tcp_congestion_ops tcp_hybla = { +static struct tcp_congestion_ops tcp_hybla __read_mostly = {  	.init		= hybla_init,  	.ssthresh	= tcp_reno_ssthresh, -	.min_cwnd	= tcp_reno_min_cwnd,  	.cong_avoid	= hybla_cong_avoid,  	.set_state	= hybla_state, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 00ca688d896..5999b3972e6 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -23,7 +23,6 @@  #define ALPHA_MIN	((3*ALPHA_SCALE)/10)	/* ~0.3 */  #define ALPHA_MAX	(10*ALPHA_SCALE)	/* 10.0 */  #define ALPHA_BASE	ALPHA_SCALE		/* 1.0 */ -#define U32_MAX		((u32)~0U)  #define RTT_MAX		(U32_MAX / ALPHA_MAX)	/* 3.3 secs */  #define 
BETA_SHIFT	6 @@ -256,7 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)  /*   * Increase window in response to successful acknowledgment.   */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct illinois *ca = inet_csk_ca(sk); @@ -265,12 +264,12 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  		update_params(sk);  	/* RFC2861 only increase cwnd if fully utilized */ -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	/* In slow start */  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else {  		u32 delta; @@ -313,20 +312,20 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,  			.tcpv_rttcnt = ca->cnt_rtt,  			.tcpv_minrtt = ca->base_rtt,  		}; -		u64 t = ca->sum_rtt; -		do_div(t, ca->cnt_rtt); -		info.tcpv_rtt = t; +		if (info.tcpv_rttcnt > 0) { +			u64 t = ca->sum_rtt; +			do_div(t, info.tcpv_rttcnt); +			info.tcpv_rtt = t; +		}  		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);  	}  } -static struct tcp_congestion_ops tcp_illinois = { -	.flags		= TCP_CONG_RTT_STAMP, +static struct tcp_congestion_ops tcp_illinois __read_mostly = {  	.init		= tcp_illinois_init,  	.ssthresh	= tcp_illinois_ssthresh, -	.min_cwnd	= tcp_reno_min_cwnd,  	.cong_avoid	= tcp_illinois_cong_avoid,  	.set_state	= tcp_illinois_state,  	.get_info	= tcp_illinois_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6d8ab1c4efc..40639c288dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -61,6 +61,8 @@   *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs   */ +#define pr_fmt(fmt) "TCP: " fmt +  #include <linux/mm.h>  #include <linux/slab.h>  #include <linux/module.h> @@ -79,24 +81,23 @@ int sysctl_tcp_sack __read_mostly = 1;  int sysctl_tcp_fack __read_mostly = 1;  int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;  EXPORT_SYMBOL(sysctl_tcp_reordering); -int sysctl_tcp_ecn __read_mostly = 2; -EXPORT_SYMBOL(sysctl_tcp_ecn);  int sysctl_tcp_dsack __read_mostly = 1;  int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 2; +int sysctl_tcp_adv_win_scale __read_mostly = 1;  EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +/* rfc5961 challenge ack rate limiting */ +int sysctl_tcp_challenge_ack_limit = 100; +  int sysctl_tcp_stdurg __read_mostly;  int sysctl_tcp_rfc1337 __read_mostly;  int sysctl_tcp_max_orphans __read_mostly = NR_FILE;  int sysctl_tcp_frto __read_mostly = 2; -int sysctl_tcp_frto_response __read_mostly; -int sysctl_tcp_nometrics_save __read_mostly;  int sysctl_tcp_thin_dupack __read_mostly;  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_abc __read_mostly; +int sysctl_tcp_early_retrans __read_mostly = 3;  #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/  #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/ @@ -105,19 +106,17 @@ int sysctl_tcp_abc __read_mostly;  #define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/  #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/  #define FLAG_ECE		0x40 /* ECE in this ACK				*/ -#define FLAG_DATA_LOST		0x80 /* SACK detected data lossage.		
*/  #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/ -#define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/  #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */  #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */ -#define FLAG_NONHEAD_RETRANS_ACKED	0x1000 /* Non-head rexmitted data was ACKed */  #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */  #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)  #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)  #define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)  #define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED) -#define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)  #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)  #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -174,7 +173,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)  static void tcp_incr_quickack(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk); -	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); +	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);  	if (quickacks == 0)  		quickacks = 2; @@ -194,9 +193,10 @@ static void tcp_enter_quickack_mode(struct sock *sk)   * and the session is not interactive.   */ -static inline int tcp_in_quickack_mode(const struct sock *sk) +static inline bool tcp_in_quickack_mode(const struct sock *sk)  {  	const struct inet_connection_sock *icsk = inet_csk(sk); +  	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;  } @@ -206,7 +206,7 @@ static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)  		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;  } -static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)  {  	if (tcp_hdr(skb)->cwr)  		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; @@ -217,36 +217,49 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)  	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;  } -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)  { -	if (tp->ecn_flags & TCP_ECN_OK) { -		if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) -			tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +	if (!(tp->ecn_flags & TCP_ECN_OK)) +		return; + +	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { +	case INET_ECN_NOT_ECT:  		/* Funny extension: if ECT is not set on a segment, -		 * it is surely retransmit. It is not in ECN RFC, -		 * but Linux follows this rule. */ -		else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) +		 * and we already seen ECT on a previous segment, +		 * it is probably a retransmit. 
+		 */ +		if (tp->ecn_flags & TCP_ECN_SEEN) +			tcp_enter_quickack_mode((struct sock *)tp); +		break; +	case INET_ECN_CE: +		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +			/* Better not delay acks, sender can have a very low cwnd */  			tcp_enter_quickack_mode((struct sock *)tp); +			tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +		} +		/* fallinto */ +	default: +		tp->ecn_flags |= TCP_ECN_SEEN;  	}  } -static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) +static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)  {  	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))  		tp->ecn_flags &= ~TCP_ECN_OK;  } -static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) +static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)  {  	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))  		tp->ecn_flags &= ~TCP_ECN_OK;  } -static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) +static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)  {  	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) -		return 1; -	return 0; +		return true; +	return false;  }  /* Buffer size and advertised window tuning. @@ -254,16 +267,33 @@ static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)   * 1. Tuning sk->sk_sndbuf, when connection enters established state.   */ -static void tcp_fixup_sndbuf(struct sock *sk) +static void tcp_sndbuf_expand(struct sock *sk)  { -	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + -		     sizeof(struct sk_buff); +	const struct tcp_sock *tp = tcp_sk(sk); +	int sndmem, per_mss; +	u32 nr_segs; -	if (sk->sk_sndbuf < 3 * sndmem) { -		sk->sk_sndbuf = 3 * sndmem; -		if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) -			sk->sk_sndbuf = sysctl_tcp_wmem[2]; -	} +	/* Worst case is non GSO/TSO : each frame consumes one skb +	 * and skb->head is kmalloced using power of two area of memory +	 */ +	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + +		  MAX_TCP_HEADER + +		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + +	per_mss = roundup_pow_of_two(per_mss) + +		  SKB_DATA_ALIGN(sizeof(struct sk_buff)); + +	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); +	nr_segs = max_t(u32, nr_segs, tp->reordering + 1); + +	/* Fast Recovery (RFC 5681 3.2) : +	 * Cubic needs 1.7 factor, rounded to 2 to include +	 * extra cushion (application might react slowly to POLLOUT) +	 */ +	sndmem = 2 * nr_segs * per_mss; + +	if (sk->sk_sndbuf < sndmem) +		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);  }  /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -309,14 +339,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)  	return 0;  } -static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)  {  	struct tcp_sock *tp = tcp_sk(sk);  	/* Check #1 */  	if (tp->rcv_ssthresh < tp->window_clamp &&  	    (int)tp->rcv_ssthresh < tcp_space(sk) && -	    !tcp_memory_pressure) { +	    !sk_under_memory_pressure(sk)) {  		int incr;  		/* Check #2. 
Increase window, if skb with such overhead @@ -328,6 +358,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)  			incr = __tcp_grow_window(sk, skb);  		if (incr) { +			incr = max_t(int, incr, 2 * skb->len);  			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,  					       tp->window_clamp);  			inet_csk(sk)->icsk_ack.quick |= 1; @@ -336,26 +367,28 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)  }  /* 3. Tuning rcvbuf, when connection enters established state. */ -  static void tcp_fixup_rcvbuf(struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); -	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); +	u32 mss = tcp_sk(sk)->advmss; +	int rcvmem; + +	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * +		 tcp_default_init_rwnd(mss); -	/* Try to select rcvbuf so that 4 mss-sized segments -	 * will fit to window and corresponding skbs will fit to our rcvbuf. -	 * (was 3; 4 is minimum to allow fast retransmit to work.) +	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency +	 * Allow enough cushion so that sender is not limited by our window  	 */ -	while (tcp_win_from_space(rcvmem) < tp->advmss) -		rcvmem += 128; -	if (sk->sk_rcvbuf < 4 * rcvmem) -		sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); +	if (sysctl_tcp_moderate_rcvbuf) +		rcvmem <<= 2; + +	if (sk->sk_rcvbuf < rcvmem) +		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);  }  /* 4. Try to fixup all. It is made immediately after connection enters   *    established state.   */ -static void tcp_init_buffer_space(struct sock *sk) +void tcp_init_buffer_space(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int maxwin; @@ -363,9 +396,11 @@ static void tcp_init_buffer_space(struct sock *sk)  	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))  		tcp_fixup_rcvbuf(sk);  	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) -		tcp_fixup_sndbuf(sk); +		tcp_sndbuf_expand(sk);  	tp->rcvq_space.space = tp->rcv_wnd; +	tp->rcvq_space.time = tcp_time_stamp; +	tp->rcvq_space.seq = tp->copied_seq;  	maxwin = tcp_full_space(sk); @@ -398,8 +433,8 @@ static void tcp_clamp_window(struct sock *sk)  	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&  	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && -	    !tcp_memory_pressure && -	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { +	    !sk_under_memory_pressure(sk) && +	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {  		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),  				    sysctl_tcp_rmem[2]);  	} @@ -416,7 +451,7 @@ static void tcp_clamp_window(struct sock *sk)   */  void tcp_initialize_rcv_mss(struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);  	hint = min(hint, tp->rcv_wnd / 2); @@ -460,8 +495,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)  		if (!win_dep) {  			m -= (new_sample >> 3);  			new_sample += m; -		} else if (m < new_sample) -			new_sample = m << 3; +		} else { +			m <<= 3; +			if (m < new_sample) +				new_sample = m; +		}  	} else {  		/* No previous measure. 
*/  		new_sample = m << 3; @@ -477,7 +515,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)  		goto new_measure;  	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))  		return; -	tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); +	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);  new_measure:  	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; @@ -502,49 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int time; -	int space; - -	if (tp->rcvq_space.time == 0) -		goto new_measure; +	int copied;  	time = tcp_time_stamp - tp->rcvq_space.time;  	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)  		return; -	space = 2 * (tp->copied_seq - tp->rcvq_space.seq); +	/* Number of bytes copied to user in last RTT */ +	copied = tp->copied_seq - tp->rcvq_space.seq; +	if (copied <= tp->rcvq_space.space) +		goto new_measure; + +	/* A bit of theory : +	 * copied = bytes received in previous RTT, our base window +	 * To cope with packet losses, we need a 2x factor +	 * To cope with slow start, and sender growing its cwin by 100 % +	 * every RTT, we need a 4x factor, because the ACK we are sending +	 * now is for the next RTT, not the current one : +	 * <prev RTT . ><current RTT .. ><next RTT .... > +	 */ + +	if (sysctl_tcp_moderate_rcvbuf && +	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { +		int rcvwin, rcvmem, rcvbuf; -	space = max(tp->rcvq_space.space, space); +		/* minimal window to cope with packet losses, assuming +		 * steady state. Add some cushion because of small variations. +		 */ +		rcvwin = (copied << 1) + 16 * tp->advmss; -	if (tp->rcvq_space.space != space) { -		int rcvmem; +		/* If rate increased by 25%, +		 *	assume slow start, rcvwin = 3 * copied +		 * If rate increased by 50%, +		 *	assume sender can use 2x growth, rcvwin = 4 * copied +		 */ +		if (copied >= +		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { +			if (copied >= +			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) +				rcvwin <<= 1; +			else +				rcvwin += (rcvwin >> 1); +		} -		tp->rcvq_space.space = space; +		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); +		while (tcp_win_from_space(rcvmem) < tp->advmss) +			rcvmem += 128; -		if (sysctl_tcp_moderate_rcvbuf && -		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { -			int new_clamp = space; +		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); +		if (rcvbuf > sk->sk_rcvbuf) { +			sk->sk_rcvbuf = rcvbuf; -			/* Receive space grows, normalize in order to -			 * take into account packet headers and sk_buff -			 * structure overhead. -			 */ -			space /= tp->advmss; -			if (!space) -				space = 1; -			rcvmem = (tp->advmss + MAX_TCP_HEADER + -				  16 + sizeof(struct sk_buff)); -			while (tcp_win_from_space(rcvmem) < tp->advmss) -				rcvmem += 128; -			space *= rcvmem; -			space = min(space, sysctl_tcp_rmem[2]); -			if (space > sk->sk_rcvbuf) { -				sk->sk_rcvbuf = space; - -				/* Make the window clamp follow along.  */ -				tp->window_clamp = new_clamp; -			} +			/* Make the window clamp follow along.  */ +			tp->window_clamp = rcvwin;  		}  	} +	tp->rcvq_space.space = copied;  new_measure:  	tp->rcvq_space.seq = tp->copied_seq; @@ -616,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)   * To save cycles in the RFC 1323 implementation it was better to break   * it up into three procedures. 
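(Aside: an unscaled, floating-point restatement of the estimator implemented in the hunk below can help when reading the fixed-point version. The kernel keeps srtt scaled by 8 and mdev by 4 to avoid divisions, and additionally tracks mdev_max/rttvar per window; this sketch deliberately omits all of that and only shows the classic 7/8 and 3/4 gains. Sample values are illustrative.)

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double srtt = 0, rttvar = 0;
		double samples[] = { 100, 120, 80, 300, 110 };	/* RTT samples, ms */
		int i;

		for (i = 0; i < 5; i++) {
			double m = samples[i];

			if (srtt == 0) {		/* first measurement */
				srtt = m;
				rttvar = m / 2;
			} else {
				rttvar = 0.75 * rttvar + 0.25 * fabs(m - srtt);
				srtt = 0.875 * srtt + 0.125 * m;
			}
			printf("sample %3.0f ms -> srtt %.1f ms, rttvar %.1f ms, rto ~ %.1f ms\n",
			       m, srtt, rttvar, srtt + 4 * rttvar);
		}
		return 0;
	}

Note how the single 300 ms outlier inflates rttvar (and hence the RTO) far more than it moves srtt, which is the behaviour the comment above is describing.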
-- erics   */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)  {  	struct tcp_sock *tp = tcp_sk(sk); -	long m = mrtt; /* RTT */ +	long m = mrtt_us; /* RTT */ +	u32 srtt = tp->srtt_us;  	/*	The following amusing code comes from Jacobson's  	 *	article in SIGCOMM '88.  Note that rtt and mdev @@ -637,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)  	 * does not matter how to _calculate_ it. Seems, it was trap  	 * that VJ failed to avoid. 8)  	 */ -	if (m == 0) -		m = 1; -	if (tp->srtt != 0) { -		m -= (tp->srtt >> 3);	/* m is now error in rtt est */ -		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */ +	if (srtt != 0) { +		m -= (srtt >> 3);	/* m is now error in rtt est */ +		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */  		if (m < 0) {  			m = -m;		/* m is now abs(error) */ -			m -= (tp->mdev >> 2);   /* similar update on mdev */ +			m -= (tp->mdev_us >> 2);   /* similar update on mdev */  			/* This is similar to one of Eifel findings.  			 * Eifel blocks mdev updates when rtt decreases.  			 * This solution is a bit different: we use finer gain @@ -656,33 +706,62 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)  			if (m > 0)  				m >>= 3;  		} else { -			m -= (tp->mdev >> 2);   /* similar update on mdev */ +			m -= (tp->mdev_us >> 2);   /* similar update on mdev */  		} -		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */ -		if (tp->mdev > tp->mdev_max) { -			tp->mdev_max = tp->mdev; -			if (tp->mdev_max > tp->rttvar) -				tp->rttvar = tp->mdev_max; +		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */ +		if (tp->mdev_us > tp->mdev_max_us) { +			tp->mdev_max_us = tp->mdev_us; +			if (tp->mdev_max_us > tp->rttvar_us) +				tp->rttvar_us = tp->mdev_max_us;  		}  		if (after(tp->snd_una, tp->rtt_seq)) { -			if (tp->mdev_max < tp->rttvar) -				tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; +			if (tp->mdev_max_us < tp->rttvar_us) +				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;  			tp->rtt_seq = tp->snd_nxt; -			tp->mdev_max = tcp_rto_min(sk); +			tp->mdev_max_us = tcp_rto_min_us(sk);  		}  	} else {  		/* no previous measure. */ -		tp->srtt = m << 3;	/* take the measured time to be rtt */ -		tp->mdev = m << 1;	/* make sure rto = 3*rtt */ -		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); +		srtt = m << 3;		/* take the measured time to be rtt */ +		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */ +		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); +		tp->mdev_max_us = tp->rttvar_us;  		tp->rtt_seq = tp->snd_nxt;  	} +	tp->srtt_us = max(1U, srtt); +} + +/* Set the sk_pacing_rate to allow proper sizing of TSO packets. + * Note: TCP stack does not yet implement pacing. + * FQ packet scheduler can be used to implement cheap but effective + * TCP pacing, to smooth the burst on large writes when packets + * in flight is significantly lower than cwnd (or rwin) + */ +static void tcp_update_pacing_rate(struct sock *sk) +{ +	const struct tcp_sock *tp = tcp_sk(sk); +	u64 rate; + +	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ +	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); + +	rate *= max(tp->snd_cwnd, tp->packets_out); + +	if (likely(tp->srtt_us)) +		do_div(rate, tp->srtt_us); + +	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate +	 * without any lock. We want to make sure compiler wont store +	 * intermediate values in this location. 
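(Aside: in plain numbers, the pacing formula above amounts to roughly 2 * mss * cwnd / srtt, i.e. twice the estimated delivery rate. A simplified floating-point illustration follows; the real code keeps everything in scaled u64 arithmetic and divides by the srtt_us value that is itself scaled by 8, which is why the constant carries the extra "<< 3". The numbers below are made up.)

	#include <stdio.h>

	int main(void)
	{
		double mss = 1448;	/* bytes per segment */
		double cwnd = 10;	/* max(snd_cwnd, packets_out) */
		double srtt = 0.050;	/* 50 ms smoothed RTT, in seconds */
		double rate = 2.0 * mss * cwnd / srtt;	/* bytes per second */

		printf("sk_pacing_rate ~ %.0f B/s (~%.1f Mbit/s)\n", rate, rate * 8 / 1e6);
		return 0;
	}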
+	 */ +	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, +						sk->sk_max_pacing_rate);  }  /* Calculate rto without backoff.  This is the second half of Van Jacobson's   * routine referred to above.   */ -static inline void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk)  {  	const struct tcp_sock *tp = tcp_sk(sk);  	/* Old crap is replaced with new one. 8) @@ -709,228 +788,31 @@ static inline void tcp_set_rto(struct sock *sk)  	tcp_bound_rto(sk);  } -/* Save metrics learned by this TCP session. -   This function is called only, when TCP finishes successfully -   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. - */ -void tcp_update_metrics(struct sock *sk) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	struct dst_entry *dst = __sk_dst_get(sk); - -	if (sysctl_tcp_nometrics_save) -		return; - -	dst_confirm(dst); - -	if (dst && (dst->flags & DST_HOST)) { -		const struct inet_connection_sock *icsk = inet_csk(sk); -		int m; -		unsigned long rtt; - -		if (icsk->icsk_backoff || !tp->srtt) { -			/* This session failed to estimate rtt. Why? -			 * Probably, no packets returned in time. -			 * Reset our results. -			 */ -			if (!(dst_metric_locked(dst, RTAX_RTT))) -				dst->metrics[RTAX_RTT - 1] = 0; -			return; -		} - -		rtt = dst_metric_rtt(dst, RTAX_RTT); -		m = rtt - tp->srtt; - -		/* If newly calculated rtt larger than stored one, -		 * store new one. Otherwise, use EWMA. Remember, -		 * rtt overestimation is always better than underestimation. -		 */ -		if (!(dst_metric_locked(dst, RTAX_RTT))) { -			if (m <= 0) -				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); -			else -				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); -		} - -		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { -			unsigned long var; -			if (m < 0) -				m = -m; - -			/* Scale deviation to rttvar fixed point */ -			m >>= 1; -			if (m < tp->mdev) -				m = tp->mdev; - -			var = dst_metric_rtt(dst, RTAX_RTTVAR); -			if (m >= var) -				var = m; -			else -				var -= (var - m) >> 2; - -			set_dst_metric_rtt(dst, RTAX_RTTVAR, var); -		} - -		if (tcp_in_initial_slowstart(tp)) { -			/* Slow start still did not finish. */ -			if (dst_metric(dst, RTAX_SSTHRESH) && -			    !dst_metric_locked(dst, RTAX_SSTHRESH) && -			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) -				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; -			if (!dst_metric_locked(dst, RTAX_CWND) && -			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) -				dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; -		} else if (tp->snd_cwnd > tp->snd_ssthresh && -			   icsk->icsk_ca_state == TCP_CA_Open) { -			/* Cong. avoidance phase, cwnd is reliable. */ -			if (!dst_metric_locked(dst, RTAX_SSTHRESH)) -				dst->metrics[RTAX_SSTHRESH-1] = -					max(tp->snd_cwnd >> 1, tp->snd_ssthresh); -			if (!dst_metric_locked(dst, RTAX_CWND)) -				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; -		} else { -			/* Else slow start did not finish, cwnd is non-sense, -			   ssthresh may be also invalid. 
-			 */ -			if (!dst_metric_locked(dst, RTAX_CWND)) -				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; -			if (dst_metric(dst, RTAX_SSTHRESH) && -			    !dst_metric_locked(dst, RTAX_SSTHRESH) && -			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) -				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; -		} - -		if (!dst_metric_locked(dst, RTAX_REORDERING)) { -			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && -			    tp->reordering != sysctl_tcp_reordering) -				dst->metrics[RTAX_REORDERING-1] = tp->reordering; -		} -	} -} - -__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) +__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)  {  	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);  	if (!cwnd) -		cwnd = rfc3390_bytes_to_packets(tp->mss_cache); +		cwnd = TCP_INIT_CWND;  	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);  } -/* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	const struct inet_connection_sock *icsk = inet_csk(sk); - -	tp->prior_ssthresh = 0; -	tp->bytes_acked = 0; -	if (icsk->icsk_ca_state < TCP_CA_CWR) { -		tp->undo_marker = 0; -		if (set_ssthresh) -			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); -		tp->snd_cwnd = min(tp->snd_cwnd, -				   tcp_packets_in_flight(tp) + 1U); -		tp->snd_cwnd_cnt = 0; -		tp->high_seq = tp->snd_nxt; -		tp->snd_cwnd_stamp = tcp_time_stamp; -		TCP_ECN_queue_cwr(tp); - -		tcp_set_ca_state(sk, TCP_CA_CWR); -	} -} -  /*   * Packet counting of FACK is based on in-order assumptions, therefore TCP   * disables it when reordering is detected   */ -static void tcp_disable_fack(struct tcp_sock *tp) +void tcp_disable_fack(struct tcp_sock *tp)  {  	/* RFC3517 uses different metric in lost marker => reset on change */  	if (tcp_is_fack(tp))  		tp->lost_skb_hint = NULL; -	tp->rx_opt.sack_ok &= ~2; +	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;  }  /* Take a notice that peer is sending D-SACKs */  static void tcp_dsack_seen(struct tcp_sock *tp)  { -	tp->rx_opt.sack_ok |= 4; -} - -/* Initialize metrics on socket. */ - -static void tcp_init_metrics(struct sock *sk) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	struct dst_entry *dst = __sk_dst_get(sk); - -	if (dst == NULL) -		goto reset; - -	dst_confirm(dst); - -	if (dst_metric_locked(dst, RTAX_CWND)) -		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); -	if (dst_metric(dst, RTAX_SSTHRESH)) { -		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); -		if (tp->snd_ssthresh > tp->snd_cwnd_clamp) -			tp->snd_ssthresh = tp->snd_cwnd_clamp; -	} -	if (dst_metric(dst, RTAX_REORDERING) && -	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) { -		tcp_disable_fack(tp); -		tp->reordering = dst_metric(dst, RTAX_REORDERING); -	} - -	if (dst_metric(dst, RTAX_RTT) == 0) -		goto reset; - -	if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) -		goto reset; - -	/* Initial rtt is determined from SYN,SYN-ACK. -	 * The segment is small and rtt may appear much -	 * less than real one. Use per-dst memory -	 * to make it more realistic. -	 * -	 * A bit of theory. RTT is time passed after "normal" sized packet -	 * is sent until it is ACKed. In normal circumstances sending small -	 * packets force peer to delay ACKs and calculation is correct too. -	 * The algorithm is adaptive and, provided we follow specs, it -	 * NEVER underestimate RTT. BUT! 
If peer tries to make some clever -	 * tricks sort of "quick acks" for time long enough to decrease RTT -	 * to low value, and then abruptly stops to do it and starts to delay -	 * ACKs, wait for troubles. -	 */ -	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { -		tp->srtt = dst_metric_rtt(dst, RTAX_RTT); -		tp->rtt_seq = tp->snd_nxt; -	} -	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { -		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); -		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); -	} -	tcp_set_rto(sk); -	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) -		goto reset; - -cwnd: -	tp->snd_cwnd = tcp_init_cwnd(tp, dst); -	tp->snd_cwnd_stamp = tcp_time_stamp; -	return; - -reset: -	/* Play conservative. If timestamps are not -	 * supported, TCP will fail to recalculate correct -	 * rtt, if initial rto is too small. FORGET ALL AND RESET! -	 */ -	if (!tp->rx_opt.saw_tstamp && tp->srtt) { -		tp->srtt = 0; -		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; -		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; -	} -	goto cwnd; +	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;  }  static void tcp_update_reordering(struct sock *sk, const int metric, @@ -954,15 +836,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,  		NET_INC_STATS_BH(sock_net(sk), mib_idx);  #if FASTRETRANS_DEBUG > 1 -		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", -		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, -		       tp->reordering, -		       tp->fackets_out, -		       tp->sacked_out, -		       tp->undo_marker ? tp->undo_retrans : 0); +		pr_debug("Disorder%d %d %u f%u s%u rr%d\n", +			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, +			 tp->reordering, +			 tp->fackets_out, +			 tp->sacked_out, +			 tp->undo_marker ? tp->undo_retrans : 0);  #endif  		tcp_disable_fack(tp);  	} + +	if (metric > 0) +		tcp_disable_early_retrans(tp);  }  /* This must be called before lost_out is incremented */ @@ -1020,13 +905,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,   * These 6 states form finite state machine, controlled by the following events:   * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())   * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) - * 3. Loss detection event of one of three flavors: + * 3. Loss detection event of two flavors:   *	A. Scoreboard estimator decided the packet is lost.   *	   A'. Reno "three dupacks" marks head of queue lost. - *	   A''. Its FACK modfication, head until snd.fack is lost. - *	B. SACK arrives sacking data transmitted after never retransmitted - *	   hole was sent out. - *	C. SACK arrives sacking SND.NXT at the moment, when the + *	   A''. Its FACK modification, head until snd.fack is lost. + *	B. SACK arrives sacking SND.NXT at the moment, when the   *	   segment was retransmitted.   * 4. D-SACK added new rule: D-SACK changes any tag to S.   * @@ -1095,36 +978,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,   * the exact amount is rather hard to quantify. However, tp->max_window can   * be used as an exaggerated estimate.   
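(Aside: every check in the function below leans on the kernel's wrap-safe sequence comparisons, essentially the before()/after() helpers from include/net/tcp.h. A minimal standalone restatement, for reference while reading the validity tests:)

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* true if seq1 precedes seq2, modulo 2^32 */
	static bool before(uint32_t seq1, uint32_t seq2)
	{
		return (int32_t)(seq1 - seq2) < 0;
	}
	#define after(seq2, seq1)	before(seq1, seq2)

	int main(void)
	{
		/* holds even across a sequence-number wrap */
		printf("%d %d\n", before(0xfffffff0u, 0x10u), after(0x10u, 0xfffffff0u));
		return 0;
	}

Both comparisons print 1: the signed 32-bit difference makes 0xfffffff0 count as "before" 0x10, which is exactly the property the start_seq/end_seq checks depend on.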
*/ -static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, -				  u32 start_seq, u32 end_seq) +static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, +				   u32 start_seq, u32 end_seq)  {  	/* Too far in future, or reversed (interpretation is ambiguous) */  	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) -		return 0; +		return false;  	/* Nasty start_seq wrap-around check (see comments above) */  	if (!before(start_seq, tp->snd_nxt)) -		return 0; +		return false;  	/* In outstanding window? ...This is valid exit for D-SACKs too.  	 * start_seq == snd_una is non-sensical (see comments above)  	 */  	if (after(start_seq, tp->snd_una)) -		return 1; +		return true;  	if (!is_dsack || !tp->undo_marker) -		return 0; +		return false;  	/* ...Then it's D-SACK, and must reside below snd_una completely */ -	if (!after(end_seq, tp->snd_una)) -		return 0; +	if (after(end_seq, tp->snd_una)) +		return false;  	if (!before(start_seq, tp->undo_marker)) -		return 1; +		return true;  	/* Too old */  	if (!after(end_seq, tp->undo_marker)) -		return 0; +		return false;  	/* Undo_marker boundary crossing (overestimates a lot). Known already:  	 *   start_seq < undo_marker and end_seq >= undo_marker. @@ -1133,7 +1016,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,  }  /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "C". Later note: FACK people cheated me again 8), we have to account + * Event "B". Later note: FACK people cheated me again 8), we have to account   * for reordering! Ugly, but should help.   *   * Search retransmitted skbs from write_queue that were sent when snd_nxt was @@ -1196,17 +1079,17 @@ static void tcp_mark_lost_retrans(struct sock *sk)  		tp->lost_retrans_low = new_low_seq;  } -static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, -			   struct tcp_sack_block_wire *sp, int num_sacks, -			   u32 prior_snd_una) +static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, +			    struct tcp_sack_block_wire *sp, int num_sacks, +			    u32 prior_snd_una)  {  	struct tcp_sock *tp = tcp_sk(sk);  	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);  	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); -	int dup_sack = 0; +	bool dup_sack = false;  	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { -		dup_sack = 1; +		dup_sack = true;  		tcp_dsack_seen(tp);  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);  	} else if (num_sacks > 1) { @@ -1215,7 +1098,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,  		if (!after(end_seq_0, end_seq_1) &&  		    !before(start_seq_0, start_seq_1)) { -			dup_sack = 1; +			dup_sack = true;  			tcp_dsack_seen(tp);  			NET_INC_STATS_BH(sock_net(sk),  					LINUX_MIB_TCPDSACKOFORECV); @@ -1223,7 +1106,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,  	}  	/* D-SACK for already forgotten data... Do dumb counting. */ -	if (dup_sack && +	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&  	    !after(end_seq_0, prior_snd_una) &&  	    after(end_seq_0, tp->undo_marker))  		tp->undo_retrans--; @@ -1232,9 +1115,10 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,  }  struct tcp_sacktag_state { -	int reord; -	int fack_count; -	int flag; +	int	reord; +	int	fack_count; +	long	rtt_us; /* RTT measured by SACKing never-retransmitted data */ +	int	flag;  };  /* Check if skb is fully within the SACK block. 
In presence of GSO skbs, @@ -1246,9 +1130,10 @@ struct tcp_sacktag_state {   * FIXME: this could be merged to shift decision code   */  static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, -				 u32 start_seq, u32 end_seq) +				  u32 start_seq, u32 end_seq)  { -	int in_sack, err; +	int err; +	bool in_sack;  	unsigned int pkt_len;  	unsigned int mss; @@ -1277,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,  			unsigned int new_len = (pkt_len / mss) * mss;  			if (!in_sack && new_len < pkt_len) {  				new_len += mss; -				if (new_len > skb->len) +				if (new_len >= skb->len)  					return 0;  			}  			pkt_len = new_len;  		} -		err = tcp_fragment(sk, skb, pkt_len, mss); +		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);  		if (err < 0)  			return err;  	} @@ -1290,24 +1175,27 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,  	return in_sack;  } -static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, -			  struct tcp_sacktag_state *state, -			  int dup_sack, int pcount) +/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ +static u8 tcp_sacktag_one(struct sock *sk, +			  struct tcp_sacktag_state *state, u8 sacked, +			  u32 start_seq, u32 end_seq, +			  int dup_sack, int pcount, +			  const struct skb_mstamp *xmit_time)  {  	struct tcp_sock *tp = tcp_sk(sk); -	u8 sacked = TCP_SKB_CB(skb)->sacked;  	int fack_count = state->fack_count;  	/* Account D-SACK for retransmitted packet. */  	if (dup_sack && (sacked & TCPCB_RETRANS)) { -		if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) +		if (tp->undo_marker && tp->undo_retrans > 0 && +		    after(end_seq, tp->undo_marker))  			tp->undo_retrans--;  		if (sacked & TCPCB_SACKED_ACKED)  			state->reord = min(fack_count, state->reord);  	}  	/* Nothing to do; acked frame is about to be dropped (was ACKed). */ -	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) +	if (!after(end_seq, tp->snd_una))  		return sacked;  	if (!(sacked & TCPCB_SACKED_ACKED)) { @@ -1326,14 +1214,20 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,  				/* New sack for not retransmitted frame,  				 * which was in hole. It is reordering.  				 */ -				if (before(TCP_SKB_CB(skb)->seq, +				if (before(start_seq,  					   tcp_highest_sack_seq(tp)))  					state->reord = min(fack_count,  							   state->reord); - -				/* SACK enhanced F-RTO (RFC4138; Appendix B) */ -				if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) -					state->flag |= FLAG_ONLY_ORIG_SACKED; +				if (!after(end_seq, tp->high_seq)) +					state->flag |= FLAG_ORIG_SACK_ACKED; +				/* Pick the earliest sequence sacked for RTT */ +				if (state->rtt_us < 0) { +					struct skb_mstamp now; + +					skb_mstamp_get(&now); +					state->rtt_us = skb_mstamp_us_delta(&now, +								xmit_time); +				}  			}  			if (sacked & TCPCB_LOST) { @@ -1350,8 +1244,7 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,  		/* Lost marker hint past SACKed? 
Tweak RFC3517 cnt */  		if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && -		    before(TCP_SKB_CB(skb)->seq, -			   TCP_SKB_CB(tp->lost_skb_hint)->seq)) +		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))  			tp->lost_cnt_hint += pcount;  		if (fack_count > tp->fackets_out) @@ -1370,19 +1263,32 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,  	return sacked;  } -static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, -			   struct tcp_sacktag_state *state, -			   unsigned int pcount, int shifted, int mss, -			   int dup_sack) +/* Shift newly-SACKed bytes from this skb to the immediately previous + * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. + */ +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +			    struct tcp_sacktag_state *state, +			    unsigned int pcount, int shifted, int mss, +			    bool dup_sack)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *prev = tcp_write_queue_prev(sk, skb); +	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */ +	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */  	BUG_ON(!pcount); -	/* Tweak before seqno plays */ -	if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint && -	    !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) +	/* Adjust counters and hints for the newly sacked sequence +	 * range but discard the return value since prev is already +	 * marked. We must tag the range first because the seq +	 * advancement below implicitly advances +	 * tcp_highest_sack_seq() when skb is highest_sack. +	 */ +	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, +			start_seq, end_seq, dup_sack, pcount, +			&skb->skb_mstamp); + +	if (skb == tp->lost_skb_hint)  		tp->lost_cnt_hint += pcount;  	TCP_SKB_CB(prev)->end_seq += shifted; @@ -1408,30 +1314,28 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,  		skb_shinfo(skb)->gso_type = 0;  	} -	/* We discard results */ -	tcp_sacktag_one(skb, sk, state, dup_sack, pcount); -  	/* Difference in this won't matter, both ACKed by the same cumul. ACK */  	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);  	if (skb->len > 0) {  		BUG_ON(!tcp_skb_pcount(skb));  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); -		return 0; +		return false;  	}  	/* Whole SKB was eaten :-) */  	if (skb == tp->retransmit_skb_hint)  		tp->retransmit_skb_hint = prev; -	if (skb == tp->scoreboard_skb_hint) -		tp->scoreboard_skb_hint = prev;  	if (skb == tp->lost_skb_hint) {  		tp->lost_skb_hint = prev;  		tp->lost_cnt_hint -= tcp_skb_pcount(prev);  	} -	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags; +	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; +	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) +		TCP_SKB_CB(prev)->end_seq++; +  	if (skb == tcp_highest_sack(sk))  		tcp_advance_highest_sack(sk, skb); @@ -1440,19 +1344,19 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); -	return 1; +	return true;  }  /* I wish gso_size would have a bit more sane initialization than   * something-or-zero which complicates things   */ -static int tcp_skb_seglen(struct sk_buff *skb) +static int tcp_skb_seglen(const struct sk_buff *skb)  {  	return tcp_skb_pcount(skb) == 1 ? 
skb->len : tcp_skb_mss(skb);  }  /* Shifting pages past head area doesn't work */ -static int skb_can_shift(struct sk_buff *skb) +static int skb_can_shift(const struct sk_buff *skb)  {  	return !skb_headlen(skb) && skb_is_nonlinear(skb);  } @@ -1463,7 +1367,7 @@ static int skb_can_shift(struct sk_buff *skb)  static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,  					  struct tcp_sacktag_state *state,  					  u32 start_seq, u32 end_seq, -					  int dup_sack) +					  bool dup_sack)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *prev; @@ -1558,6 +1462,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,  		}  	} +	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ +	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) +		goto fallback; +  	if (!skb_shift(prev, skb, len))  		goto fallback;  	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) @@ -1598,14 +1506,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,  					struct tcp_sack_block *next_dup,  					struct tcp_sacktag_state *state,  					u32 start_seq, u32 end_seq, -					int dup_sack_in) +					bool dup_sack_in)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *tmp;  	tcp_for_write_queue_from(skb, sk) {  		int in_sack = 0; -		int dup_sack = dup_sack_in; +		bool dup_sack = dup_sack_in;  		if (skb == tcp_send_head(sk))  			break; @@ -1620,7 +1528,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,  							next_dup->start_seq,  							next_dup->end_seq);  			if (in_sack > 0) -				dup_sack = 1; +				dup_sack = true;  		}  		/* skb reference here is a bit tricky to get right, since @@ -1648,10 +1556,15 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,  			break;  		if (in_sack) { -			TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, -								  state, -								  dup_sack, -								  tcp_skb_pcount(skb)); +			TCP_SKB_CB(skb)->sacked = +				tcp_sacktag_one(sk, +						state, +						TCP_SKB_CB(skb)->sacked, +						TCP_SKB_CB(skb)->seq, +						TCP_SKB_CB(skb)->end_seq, +						dup_sack, +						tcp_skb_pcount(skb), +						&skb->skb_mstamp);  			if (!before(TCP_SKB_CB(skb)->seq,  				    tcp_highest_sack_seq(tp))) @@ -1701,19 +1614,18 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,  	return skb;  } -static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) +static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)  {  	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);  }  static int -tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, -			u32 prior_snd_una) +tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, +			u32 prior_snd_una, long *sack_rtt_us)  { -	const struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); -	unsigned char *ptr = (skb_transport_header(ack_skb) + -			      TCP_SKB_CB(ack_skb)->sacked); +	const unsigned char *ptr = (skb_transport_header(ack_skb) + +				    TCP_SKB_CB(ack_skb)->sacked);  	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);  	struct tcp_sack_block sp[TCP_NUM_SACKS];  	struct tcp_sack_block *cache; @@ -1721,12 +1633,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  	struct sk_buff *skb;  	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);  	int used_sacks; -	int found_dup_sack = 0; +	bool 
found_dup_sack = false;  	int i, j;  	int first_sack_index;  	state.flag = 0;  	state.reord = tp->packets_out; +	state.rtt_us = -1L;  	if (!tp->sacked_out) {  		if (WARN_ON(tp->fackets_out)) @@ -1752,7 +1665,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  	used_sacks = 0;  	first_sack_index = 0;  	for (i = 0; i < num_sacks; i++) { -		int dup_sack = !i && found_dup_sack; +		bool dup_sack = !i && found_dup_sack;  		sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);  		sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); @@ -1819,16 +1732,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,  	while (i < used_sacks) {  		u32 start_seq = sp[i].start_seq;  		u32 end_seq = sp[i].end_seq; -		int dup_sack = (found_dup_sack && (i == first_sack_index)); +		bool dup_sack = (found_dup_sack && (i == first_sack_index));  		struct tcp_sack_block *next_dup = NULL;  		if (found_dup_sack && ((i + 1) == first_sack_index))  			next_dup = &sp[i + 1]; -		/* Event "B" in the comment above. */ -		if (after(end_seq, tp->high_seq)) -			state.flag |= FLAG_DATA_LOST; -  		/* Skip too early cached blocks */  		while (tcp_sack_cache_ok(tp, cache) &&  		       !before(start_seq, cache->end_seq)) @@ -1887,12 +1796,6 @@ walk:  				       start_seq, end_seq, dup_sack);  advance_sp: -		/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct -		 * due to in-order walk -		 */ -		if (after(end_seq, tp->frto_highmark)) -			state.flag &= ~FLAG_ONLY_ORIG_SACKED; -  		i++;  	} @@ -1909,8 +1812,7 @@ advance_sp:  	tcp_verify_left_out(tp);  	if ((state.reord < tp->fackets_out) && -	    ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && -	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) +	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))  		tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);  out: @@ -1921,13 +1823,14 @@ out:  	WARN_ON((int)tp->retrans_out < 0);  	WARN_ON((int)tcp_packets_in_flight(tp) < 0);  #endif +	*sack_rtt_us = state.rtt_us;  	return state.flag;  }  /* Limits sacked_out so that sum with lost_out isn't ever larger than - * packets_out. Returns zero if sacked_out adjustement wasn't necessary. + * packets_out. Returns false if sacked_out adjustement wasn't necessary.   
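(Aside: a toy illustration of the clamp performed below. The hole count is taken here as a given input, since its derivation sits in unchanged context that this hunk does not show; the values are illustrative only.)

	#include <stdio.h>

	int main(void)
	{
		unsigned int packets_out = 10, sacked_out = 9, holes = 3;

		/* dupacks may not account for more than packets_out minus the presumed holes */
		if (sacked_out + holes > packets_out)
			sacked_out = packets_out - holes;
		printf("sacked_out clamped to %u\n", sacked_out);	/* prints 7 */
		return 0;
	}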
*/ -static int tcp_limit_reno_sacked(struct tcp_sock *tp) +static bool tcp_limit_reno_sacked(struct tcp_sock *tp)  {  	u32 holes; @@ -1936,9 +1839,9 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp)  	if ((tp->sacked_out + holes) > tp->packets_out) {  		tp->sacked_out = tp->packets_out - holes; -		return 1; +		return true;  	} -	return 0; +	return false;  }  /* If we receive more dupacks than we expected counting segments @@ -1984,205 +1887,13 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)  	tp->sacked_out = 0;  } -static int tcp_is_sackfrto(const struct tcp_sock *tp) -{ -	return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); -} - -/* F-RTO can only be used if TCP has never retransmitted anything other than - * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) - */ -int tcp_use_frto(struct sock *sk) -{ -	const struct tcp_sock *tp = tcp_sk(sk); -	const struct inet_connection_sock *icsk = inet_csk(sk); -	struct sk_buff *skb; - -	if (!sysctl_tcp_frto) -		return 0; - -	/* MTU probe and F-RTO won't really play nicely along currently */ -	if (icsk->icsk_mtup.probe_size) -		return 0; - -	if (tcp_is_sackfrto(tp)) -		return 1; - -	/* Avoid expensive walking of rexmit queue if possible */ -	if (tp->retrans_out > 1) -		return 0; - -	skb = tcp_write_queue_head(sk); -	if (tcp_skb_is_last(sk, skb)) -		return 1; -	skb = tcp_write_queue_next(sk, skb);	/* Skips head */ -	tcp_for_write_queue_from(skb, sk) { -		if (skb == tcp_send_head(sk)) -			break; -		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) -			return 0; -		/* Short-circuit when first non-SACKed skb has been checked */ -		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) -			break; -	} -	return 1; -} - -/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO - * recovery a bit and use heuristics in tcp_process_frto() to detect if - * the RTO was spurious. Only clear SACKED_RETRANS of the head here to - * keep retrans_out counting accurate (with SACK F-RTO, other than head - * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS - * bits are handled if the Loss state is really to be entered (in - * tcp_enter_frto_loss). - * - * Do like tcp_enter_loss() would; when RTO expires the second time it - * does: - *  "Reduce ssthresh if it has not yet been made inside this window." - */ -void tcp_enter_frto(struct sock *sk) -{ -	const struct inet_connection_sock *icsk = inet_csk(sk); -	struct tcp_sock *tp = tcp_sk(sk); -	struct sk_buff *skb; - -	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || -	    tp->snd_una == tp->high_seq || -	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && -	     !icsk->icsk_retransmits)) { -		tp->prior_ssthresh = tcp_current_ssthresh(sk); -		/* Our state is too optimistic in ssthresh() call because cwnd -		 * is not reduced until tcp_enter_frto_loss() when previous F-RTO -		 * recovery has not yet completed. Pattern would be this: RTO, -		 * Cumulative ACK, RTO (2xRTO for the same segment does not end -		 * up here twice). -		 * RFC4138 should be more specific on what to do, even though -		 * RTO is quite unlikely to occur after the first Cumulative ACK -		 * due to back-off and complexity of triggering events ... -		 */ -		if (tp->frto_counter) { -			u32 stored_cwnd; -			stored_cwnd = tp->snd_cwnd; -			tp->snd_cwnd = 2; -			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); -			tp->snd_cwnd = stored_cwnd; -		} else { -			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); -		} -		/* ... 
in theory, cong.control module could do "any tricks" in -		 * ssthresh(), which means that ca_state, lost bits and lost_out -		 * counter would have to be faked before the call occurs. We -		 * consider that too expensive, unlikely and hacky, so modules -		 * using these in ssthresh() must deal these incompatibility -		 * issues if they receives CA_EVENT_FRTO and frto_counter != 0 -		 */ -		tcp_ca_event(sk, CA_EVENT_FRTO); -	} - -	tp->undo_marker = tp->snd_una; -	tp->undo_retrans = 0; - -	skb = tcp_write_queue_head(sk); -	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) -		tp->undo_marker = 0; -	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { -		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; -		tp->retrans_out -= tcp_skb_pcount(skb); -	} -	tcp_verify_left_out(tp); - -	/* Too bad if TCP was application limited */ -	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); - -	/* Earlier loss recovery underway (see RFC4138; Appendix B). -	 * The last condition is necessary at least in tp->frto_counter case. -	 */ -	if (tcp_is_sackfrto(tp) && (tp->frto_counter || -	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && -	    after(tp->high_seq, tp->snd_una)) { -		tp->frto_highmark = tp->high_seq; -	} else { -		tp->frto_highmark = tp->snd_nxt; -	} -	tcp_set_ca_state(sk, TCP_CA_Disorder); -	tp->high_seq = tp->snd_nxt; -	tp->frto_counter = 1; -} - -/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, - * which indicates that we should follow the traditional RTO recovery, - * i.e. mark everything lost and do go-back-N retransmission. - */ -static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	struct sk_buff *skb; - -	tp->lost_out = 0; -	tp->retrans_out = 0; -	if (tcp_is_reno(tp)) -		tcp_reset_reno_sack(tp); - -	tcp_for_write_queue(skb, sk) { -		if (skb == tcp_send_head(sk)) -			break; - -		TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; -		/* -		 * Count the retransmission made on RTO correctly (only when -		 * waiting for the first ACK and did not get it)... -		 */ -		if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { -			/* For some reason this R-bit might get cleared? */ -			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) -				tp->retrans_out += tcp_skb_pcount(skb); -			/* ...enter this if branch just for the first segment */ -			flag |= FLAG_DATA_ACKED; -		} else { -			if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) -				tp->undo_marker = 0; -			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; -		} - -		/* Marking forward transmissions that were made after RTO lost -		 * can cause unnecessary retransmissions in some scenarios, -		 * SACK blocks will mitigate that in some but not in all cases. -		 * We used to not mark them but it was causing break-ups with -		 * receivers that do only in-order receival. -		 * -		 * TODO: we could detect presence of such receiver and select -		 * different behavior per flow. 
-		 */ -		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { -			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; -			tp->lost_out += tcp_skb_pcount(skb); -			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; -		} -	} -	tcp_verify_left_out(tp); - -	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; -	tp->snd_cwnd_cnt = 0; -	tp->snd_cwnd_stamp = tcp_time_stamp; -	tp->frto_counter = 0; -	tp->bytes_acked = 0; - -	tp->reordering = min_t(unsigned int, tp->reordering, -			       sysctl_tcp_reordering); -	tcp_set_ca_state(sk, TCP_CA_Loss); -	tp->high_seq = tp->snd_nxt; -	TCP_ECN_queue_cwr(tp); - -	tcp_clear_all_retrans_hints(tp); -} -  static void tcp_clear_retrans_partial(struct tcp_sock *tp)  {  	tp->retrans_out = 0;  	tp->lost_out = 0;  	tp->undo_marker = 0; -	tp->undo_retrans = 0; +	tp->undo_retrans = -1;  }  void tcp_clear_retrans(struct tcp_sock *tp) @@ -2202,10 +1913,13 @@ void tcp_enter_loss(struct sock *sk, int how)  	const struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb; +	bool new_recovery = false;  	/* Reduce ssthresh if it has not yet been made inside this window. */ -	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || +	if (icsk->icsk_ca_state <= TCP_CA_Disorder || +	    !after(tp->high_seq, tp->snd_una) ||  	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { +		new_recovery = true;  		tp->prior_ssthresh = tcp_current_ssthresh(sk);  		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);  		tcp_ca_event(sk, CA_EVENT_LOSS); @@ -2214,17 +1928,13 @@ void tcp_enter_loss(struct sock *sk, int how)  	tp->snd_cwnd_cnt   = 0;  	tp->snd_cwnd_stamp = tcp_time_stamp; -	tp->bytes_acked = 0;  	tcp_clear_retrans_partial(tp);  	if (tcp_is_reno(tp))  		tcp_reset_reno_sack(tp); -	if (!how) { -		/* Push undo marker, if it was plain RTO and nothing -		 * was retransmitted. */ -		tp->undo_marker = tp->snd_una; -	} else { +	tp->undo_marker = tp->snd_una; +	if (how) {  		tp->sacked_out = 0;  		tp->fackets_out = 0;  	} @@ -2234,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how)  		if (skb == tcp_send_head(sk))  			break; -		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) +		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)  			tp->undo_marker = 0; +  		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;  		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {  			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; @@ -2246,13 +1957,24 @@ void tcp_enter_loss(struct sock *sk, int how)  	}  	tcp_verify_left_out(tp); -	tp->reordering = min_t(unsigned int, tp->reordering, -			       sysctl_tcp_reordering); +	/* Timeout in disordered state after receiving substantial DUPACKs +	 * suggests that the degree of reordering is over-estimated. +	 */ +	if (icsk->icsk_ca_state <= TCP_CA_Disorder && +	    tp->sacked_out >= sysctl_tcp_reordering) +		tp->reordering = min_t(unsigned int, tp->reordering, +				       sysctl_tcp_reordering);  	tcp_set_ca_state(sk, TCP_CA_Loss);  	tp->high_seq = tp->snd_nxt;  	TCP_ECN_queue_cwr(tp); -	/* Abort F-RTO algorithm if one is in progress */ -	tp->frto_counter = 0; + +	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous +	 * loss recovery is underway except recurring timeout(s) on +	 * the same SND.UNA (sec 3.2). 
Disable F-RTO on path MTU probing +	 */ +	tp->frto = sysctl_tcp_frto && +		   (new_recovery || icsk->icsk_retransmits) && +		   !inet_csk(sk)->icsk_mtup.probe_size;  }  /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2261,7 +1983,7 @@ void tcp_enter_loss(struct sock *sk, int how)   *   * Do processing similar to RTO timeout.   */ -static int tcp_check_sack_reneging(struct sock *sk, int flag) +static bool tcp_check_sack_reneging(struct sock *sk, int flag)  {  	if (flag & FLAG_SACK_RENEGING) {  		struct inet_connection_sock *icsk = inet_csk(sk); @@ -2272,12 +1994,12 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag)  		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));  		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,  					  icsk->icsk_rto, TCP_RTO_MAX); -		return 1; +		return true;  	} -	return 0; +	return false;  } -static inline int tcp_fackets_out(struct tcp_sock *tp) +static inline int tcp_fackets_out(const struct tcp_sock *tp)  {  	return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;  } @@ -2297,22 +2019,33 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)   * they differ. Since neither occurs due to loss, TCP should really   * ignore them.   */ -static inline int tcp_dupack_heuristics(struct tcp_sock *tp) +static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)  {  	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;  } -static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) -{ -	return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; -} - -static inline int tcp_head_timedout(struct sock *sk) +static bool tcp_pause_early_retransmit(struct sock *sk, int flag)  {  	struct tcp_sock *tp = tcp_sk(sk); +	unsigned long delay; + +	/* Delay early retransmit and entering fast recovery for +	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples +	 * available, or RTO is scheduled to fire first. +	 */ +	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || +	    (flag & FLAG_ECE) || !tp->srtt_us) +		return false; + +	delay = max(usecs_to_jiffies(tp->srtt_us >> 5), +		    msecs_to_jiffies(2)); -	return tp->packets_out && -	       tcp_skb_timedout(sk, tcp_write_queue_head(sk)); +	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) +		return false; + +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, +				  TCP_RTO_MAX); +	return true;  }  /* Linux NewReno/SACK/FACK/ECN state machine. @@ -2408,28 +2141,18 @@ static inline int tcp_head_timedout(struct sock *sk)   * Main question: may we further continue forward transmission   * with the same cwnd?   */ -static int tcp_time_to_recover(struct sock *sk) +static bool tcp_time_to_recover(struct sock *sk, int flag)  {  	struct tcp_sock *tp = tcp_sk(sk);  	__u32 packets_out; -	/* Do not perform any recovery during F-RTO algorithm */ -	if (tp->frto_counter) -		return 0; -  	/* Trick#1: The loss is proven. */  	if (tp->lost_out) -		return 1; +		return true;  	/* Not-A-Trick#2 : Classic rule... */  	if (tcp_dupack_heuristics(tp) > tp->reordering) -		return 1; - -	/* Trick#3 : when we use RFC2988 timer restart, fast -	 * retransmit can be triggered by timeout of queue head. -	 */ -	if (tcp_is_fack(tp) && tcp_head_timedout(sk)) -		return 1; +		return true;  	/* Trick#4: It is still not OK... But will it be useful to delay  	 * recovery more? @@ -2441,7 +2164,7 @@ static int tcp_time_to_recover(struct sock *sk)  		/* We have nothing to send. 
This connection is limited  		 * either by receiver window or by application.  		 */ -		return 1; +		return true;  	}  	/* If a thin stream is detected, retransmit after first @@ -2452,51 +2175,26 @@ static int tcp_time_to_recover(struct sock *sk)  	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&  	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&  	    tcp_is_sack(tp) && !tcp_send_head(sk)) -		return 1; - -	return 0; -} - -/* New heuristics: it is possible only after we switched to restart timer - * each time when something is ACKed. Hence, we can detect timed out packets - * during fast retransmit without falling to slow start. - * - * Usefulness of this as is very questionable, since we should know which of - * the segments is the next to timeout which is relatively expensive to find - * in general case unless we add some data structure just for that. The - * current approach certainly won't find the right one too often and when it - * finally does find _something_ it usually marks large part of the window - * right away (because a retransmission with a larger timestamp blocks the - * loop from advancing). -ij - */ -static void tcp_timeout_skbs(struct sock *sk) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	struct sk_buff *skb; - -	if (!tcp_is_fack(tp) || !tcp_head_timedout(sk)) -		return; - -	skb = tp->scoreboard_skb_hint; -	if (tp->scoreboard_skb_hint == NULL) -		skb = tcp_write_queue_head(sk); - -	tcp_for_write_queue_from(skb, sk) { -		if (skb == tcp_send_head(sk)) -			break; -		if (!tcp_skb_timedout(sk, skb)) -			break; - -		tcp_skb_mark_lost(tp, skb); -	} +		return true; -	tp->scoreboard_skb_hint = skb; +	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious +	 * retransmissions due to small network reorderings, we implement +	 * Mitigation A.3 in the RFC and delay the retransmission for a short +	 * interval if appropriate. +	 */ +	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && +	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && +	    !tcp_may_send_now(sk)) +		return !tcp_pause_early_retransmit(sk, flag); -	tcp_verify_left_out(tp); +	return false;  } -/* Mark head of queue up as lost. With RFC3517 SACK, the packets is - * is against sacked "cnt", otherwise it's against facked "cnt" +/* Detect loss in event "A" above by marking head of queue up as lost. + * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * has at least tp->reordering SACKed seqments above it; "packets" refers to + * the maximum SACKed segments to pass before reaching this limit.   */  static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)  { @@ -2505,6 +2203,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)  	int cnt, oldcnt;  	int err;  	unsigned int mss; +	/* Use SACK to deduce losses of new sequences sent during recovery */ +	const u32 loss_high = tcp_is_sack(tp) ?  
tp->snd_nxt : tp->high_seq;  	WARN_ON(packets > tp->packets_out);  	if (tp->lost_skb_hint) { @@ -2526,7 +2226,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)  		tp->lost_skb_hint = skb;  		tp->lost_cnt_hint = cnt; -		if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) +		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))  			break;  		oldcnt = cnt; @@ -2536,11 +2236,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)  		if (cnt > packets) {  			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || +			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||  			    (oldcnt >= packets))  				break;  			mss = skb_shinfo(skb)->gso_size; -			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); +			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, +					   mss, GFP_ATOMIC);  			if (err < 0)  				break;  			cnt = packets; @@ -2574,8 +2276,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)  		else if (fast_rexmit)  			tcp_mark_head_lost(sk, 1, 1);  	} - -	tcp_timeout_skbs(sk);  }  /* CWND moderation, preventing bursts due to too big ACKs @@ -2588,39 +2288,10 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)  	tp->snd_cwnd_stamp = tcp_time_stamp;  } -/* Lower bound on congestion window is slow start threshold - * unless congestion avoidance choice decides to overide it. - */ -static inline u32 tcp_cwnd_min(const struct sock *sk) -{ -	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - -	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; -} - -/* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct sock *sk, int flag) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	int decr = tp->snd_cwnd_cnt + 1; - -	if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || -	    (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { -		tp->snd_cwnd_cnt = decr & 1; -		decr >>= 1; - -		if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) -			tp->snd_cwnd -= decr; - -		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); -		tp->snd_cwnd_stamp = tcp_time_stamp; -	} -} -  /* Nothing was retransmitted or returned timestamp is less   * than timestamp of the first retransmission.   
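(Aside: a compact restatement of that timestamp test, roughly what tcp_packet_delayed() below evaluates. If the peer echoes a timestamp taken before our first retransmission, the ACK must have been triggered by the original transmission, so the loss assumption can be undone. Field names mirror tcp_sock, but this is a simplified standalone sketch, not the kernel code.)

	#include <stdbool.h>
	#include <stdint.h>

	static bool packet_delayed(uint32_t retrans_stamp,	/* time of first retransmission, 0 if none */
				   uint32_t rcv_tsecr,		/* timestamp echoed by the peer */
				   bool saw_tstamp)
	{
		if (!retrans_stamp)				/* nothing was retransmitted */
			return true;
		/* echoed timestamp predates the retransmission => original segment arrived */
		return saw_tstamp && rcv_tsecr &&
		       (int32_t)(rcv_tsecr - retrans_stamp) < 0;
	}

	int main(void)
	{
		/* retransmitted at t=1000, peer echoes t=990 taken before it: spurious */
		return packet_delayed(1000, 990, true) ? 0 : 1;
	}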
*/ -static inline int tcp_packet_delayed(struct tcp_sock *tp) +static inline bool tcp_packet_delayed(const struct tcp_sock *tp)  {  	return !tp->retrans_stamp ||  		(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2636,22 +2307,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)  	struct inet_sock *inet = inet_sk(sk);  	if (sk->sk_family == AF_INET) { -		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", -		       msg, -		       &inet->inet_daddr, ntohs(inet->inet_dport), -		       tp->snd_cwnd, tcp_left_out(tp), -		       tp->snd_ssthresh, tp->prior_ssthresh, -		       tp->packets_out); -	} -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +		pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", +			 msg, +			 &inet->inet_daddr, ntohs(inet->inet_dport), +			 tp->snd_cwnd, tcp_left_out(tp), +			 tp->snd_ssthresh, tp->prior_ssthresh, +			 tp->packets_out); +	} +#if IS_ENABLED(CONFIG_IPV6)  	else if (sk->sk_family == AF_INET6) {  		struct ipv6_pinfo *np = inet6_sk(sk); -		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", -		       msg, -		       &np->daddr, ntohs(inet->inet_dport), -		       tp->snd_cwnd, tcp_left_out(tp), -		       tp->snd_ssthresh, tp->prior_ssthresh, -		       tp->packets_out); +		pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", +			 msg, +			 &np->daddr, ntohs(inet->inet_dport), +			 tp->snd_cwnd, tcp_left_out(tp), +			 tp->snd_ssthresh, tp->prior_ssthresh, +			 tp->packets_out);  	}  #endif  } @@ -2659,10 +2330,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)  #define DBGUNDO(x...) do { } while (0)  #endif -static void tcp_undo_cwr(struct sock *sk, const int undo) +static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)  {  	struct tcp_sock *tp = tcp_sk(sk); +	if (unmark_loss) { +		struct sk_buff *skb; + +		tcp_for_write_queue(skb, sk) { +			if (skb == tcp_send_head(sk)) +				break; +			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; +		} +		tp->lost_out = 0; +		tcp_clear_all_retrans_hints(tp); +	} +  	if (tp->prior_ssthresh) {  		const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2671,24 +2354,24 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)  		else  			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); -		if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { +		if (tp->prior_ssthresh > tp->snd_ssthresh) {  			tp->snd_ssthresh = tp->prior_ssthresh;  			TCP_ECN_withdraw_cwr(tp);  		}  	} else {  		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);  	} -	tcp_moderate_cwnd(tp);  	tp->snd_cwnd_stamp = tcp_time_stamp; +	tp->undo_marker = 0;  } -static inline int tcp_may_undo(struct tcp_sock *tp) +static inline bool tcp_may_undo(const struct tcp_sock *tp)  {  	return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));  }  /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk) +static bool tcp_try_undo_recovery(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -2699,37 +2382,37 @@ static int tcp_try_undo_recovery(struct sock *sk)  		 * or our original transmission succeeded.  		 */  		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? 
"loss" : "retrans"); -		tcp_undo_cwr(sk, 1); +		tcp_undo_cwnd_reduction(sk, false);  		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)  			mib_idx = LINUX_MIB_TCPLOSSUNDO;  		else  			mib_idx = LINUX_MIB_TCPFULLUNDO;  		NET_INC_STATS_BH(sock_net(sk), mib_idx); -		tp->undo_marker = 0;  	}  	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {  		/* Hold old state until something *above* high_seq  		 * is ACKed. For Reno it is MUST to prevent false  		 * fast retransmits (RFC2582). SACK TCP is safe. */  		tcp_moderate_cwnd(tp); -		return 1; +		return true;  	}  	tcp_set_ca_state(sk, TCP_CA_Open); -	return 0; +	return false;  }  /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk) +static bool tcp_try_undo_dsack(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk);  	if (tp->undo_marker && !tp->undo_retrans) {  		DBGUNDO(sk, "D-SACK"); -		tcp_undo_cwr(sk, 1); -		tp->undo_marker = 0; +		tcp_undo_cwnd_reduction(sk, false);  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); +		return true;  	} +	return false;  }  /* We can clear retrans_stamp when there are no retransmissions in the @@ -2746,85 +2429,115 @@ static void tcp_try_undo_dsack(struct sock *sk)   * that successive retransmissions of a segment must not advance   * retrans_stamp under any conditions.   */ -static int tcp_any_retrans_done(struct sock *sk) +static bool tcp_any_retrans_done(const struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb;  	if (tp->retrans_out) -		return 1; +		return true;  	skb = tcp_write_queue_head(sk);  	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) -		return 1; +		return true; -	return 0; +	return false;  } -/* Undo during fast recovery after partial ACK. */ - -static int tcp_try_undo_partial(struct sock *sk, int acked) +/* Undo during loss recovery after partial ACK or using F-RTO. */ +static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)  {  	struct tcp_sock *tp = tcp_sk(sk); -	/* Partial ACK arrived. Force Hoe's retransmit. */ -	int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); -	if (tcp_may_undo(tp)) { -		/* Plain luck! Hole if filled with delayed -		 * packet, rather than with a retransmit. -		 */ -		if (!tcp_any_retrans_done(sk)) -			tp->retrans_stamp = 0; +	if (frto_undo || tcp_may_undo(tp)) { +		tcp_undo_cwnd_reduction(sk, true); -		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); +		DBGUNDO(sk, "partial loss"); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); +		if (frto_undo) +			NET_INC_STATS_BH(sock_net(sk), +					 LINUX_MIB_TCPSPURIOUSRTOS); +		inet_csk(sk)->icsk_retransmits = 0; +		if (frto_undo || tcp_is_sack(tp)) +			tcp_set_ca_state(sk, TCP_CA_Open); +		return true; +	} +	return false; +} -		DBGUNDO(sk, "Hoe"); -		tcp_undo_cwr(sk, 0); -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); +/* The cwnd reduction in CWR and Recovery use the PRR algorithm + * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ + * It computes the number of packets to send (sndcnt) based on packets newly + * delivered: + *   1) If the packets in flight is larger than ssthresh, PRR spreads the + *	cwnd reductions across a full RTT. + *   2) If packets in flight is lower than ssthresh (such as due to excess + *	losses and/or application stalls), do not perform any further cwnd + *	reductions, but instead slow start up to ssthresh. 
+ */ +static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) +{ +	struct tcp_sock *tp = tcp_sk(sk); -		/* So... Do not make Hoe's retransmit yet. -		 * If the first packet was delayed, the rest -		 * ones are most probably delayed as well. -		 */ -		failed = 0; -	} -	return failed; +	tp->high_seq = tp->snd_nxt; +	tp->tlp_high_seq = 0; +	tp->snd_cwnd_cnt = 0; +	tp->prior_cwnd = tp->snd_cwnd; +	tp->prr_delivered = 0; +	tp->prr_out = 0; +	if (set_ssthresh) +		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); +	TCP_ECN_queue_cwr(tp);  } -/* Undo during loss recovery after partial ACK. */ -static int tcp_try_undo_loss(struct sock *sk) +static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, +			       int fast_rexmit)  {  	struct tcp_sock *tp = tcp_sk(sk); +	int sndcnt = 0; +	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); +	int newly_acked_sacked = prior_unsacked - +				 (tp->packets_out - tp->sacked_out); + +	tp->prr_delivered += newly_acked_sacked; +	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { +		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + +			       tp->prior_cwnd - 1; +		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; +	} else { +		sndcnt = min_t(int, delta, +			       max_t(int, tp->prr_delivered - tp->prr_out, +				     newly_acked_sacked) + 1); +	} -	if (tcp_may_undo(tp)) { -		struct sk_buff *skb; -		tcp_for_write_queue(skb, sk) { -			if (skb == tcp_send_head(sk)) -				break; -			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; -		} +	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); +	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; +} -		tcp_clear_all_retrans_hints(tp); +static inline void tcp_end_cwnd_reduction(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); -		DBGUNDO(sk, "partial loss"); -		tp->lost_out = 0; -		tcp_undo_cwr(sk, 1); -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); -		inet_csk(sk)->icsk_retransmits = 0; -		tp->undo_marker = 0; -		if (tcp_is_sack(tp)) -			tcp_set_ca_state(sk, TCP_CA_Open); -		return 1; +	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ +	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || +	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { +		tp->snd_cwnd = tp->snd_ssthresh; +		tp->snd_cwnd_stamp = tcp_time_stamp;  	} -	return 0; +	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);  } -static inline void tcp_complete_cwr(struct sock *sk) +/* Enter CWR state. 
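The PRR comment and the tcp_cwnd_reduction() hunk above pack the draft's two cases into a few lines of fixed-point arithmetic. As a rough, userspace-only illustration (this is not kernel code; the struct and field names below merely mirror the tcp_sock members used in the hunk), the per-ACK send quota can be worked out like this:

#include <stdint.h>
#include <stdio.h>

/* Illustrative PRR state, loosely mirroring the fields used above. */
struct prr_state {
	uint32_t ssthresh;      /* snd_ssthresh set when the reduction started */
	uint32_t prior_cwnd;    /* cwnd at the start of the reduction */
	uint32_t prr_delivered; /* packets newly delivered since the start */
	uint32_t prr_out;       /* packets sent out since the start */
};

/* Sketch of the sndcnt computation: proportional reduction while the
 * flight size is above ssthresh, otherwise slow-start back up to it.
 */
static uint32_t prr_sndcnt(struct prr_state *st, uint32_t in_flight,
			   uint32_t newly_acked_sacked, int fast_rexmit)
{
	int64_t sndcnt;

	st->prr_delivered += newly_acked_sacked;
	if (in_flight > st->ssthresh) {
		/* sndcnt = ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out */
		uint64_t dividend = (uint64_t)st->ssthresh * st->prr_delivered +
				    st->prior_cwnd - 1;
		sndcnt = (int64_t)(dividend / st->prior_cwnd) - st->prr_out;
	} else {
		/* Slow start back toward ssthresh, never beyond the deficit. */
		int64_t delta = (int64_t)st->ssthresh - in_flight;
		int64_t limit = (int64_t)st->prr_delivered - st->prr_out;

		if (limit < (int64_t)newly_acked_sacked)
			limit = newly_acked_sacked;
		sndcnt = delta < limit + 1 ? delta : limit + 1;
	}
	if (sndcnt < (fast_rexmit ? 1 : 0))
		sndcnt = fast_rexmit ? 1 : 0;
	return (uint32_t)sndcnt;	/* the new cwnd is in_flight + sndcnt */
}

int main(void)
{
	/* Example: cwnd 10 halved to ssthresh 5, one packet newly delivered. */
	struct prr_state st = { .ssthresh = 5, .prior_cwnd = 10 };

	printf("sndcnt = %u\n", prr_sndcnt(&st, 9, 1, 0));
	return 0;
}

With those numbers the proportional branch grants roughly half a segment per delivered segment (rounded up), so cwnd drains from prior_cwnd toward ssthresh over about one round trip rather than in a single step.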
Disable cwnd undo since congestion is proven with ECN */ +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)  {  	struct tcp_sock *tp = tcp_sk(sk); -	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); -	tp->snd_cwnd_stamp = tcp_time_stamp; -	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); + +	tp->prior_ssthresh = 0; +	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { +		tp->undo_marker = 0; +		tcp_init_cwnd_reduction(sk, set_ssthresh); +		tcp_set_ca_state(sk, TCP_CA_CWR); +	}  }  static void tcp_try_keep_open(struct sock *sk) @@ -2832,7 +2545,7 @@ static void tcp_try_keep_open(struct sock *sk)  	struct tcp_sock *tp = tcp_sk(sk);  	int state = TCP_CA_Open; -	if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) +	if (tcp_left_out(tp) || tcp_any_retrans_done(sk))  		state = TCP_CA_Disorder;  	if (inet_csk(sk)->icsk_ca_state != state) { @@ -2841,13 +2554,13 @@ static void tcp_try_keep_open(struct sock *sk)  	}  } -static void tcp_try_to_open(struct sock *sk, int flag) +static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	tcp_verify_left_out(tp); -	if (!tp->frto_counter && !tcp_any_retrans_done(sk)) +	if (!tcp_any_retrans_done(sk))  		tp->retrans_stamp = 0;  	if (flag & FLAG_ECE) @@ -2855,9 +2568,8 @@ static void tcp_try_to_open(struct sock *sk, int flag)  	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {  		tcp_try_keep_open(sk); -		tcp_moderate_cwnd(tp);  	} else { -		tcp_cwnd_down(sk, flag); +		tcp_cwnd_reduction(sk, prior_unsacked, 0);  	}  } @@ -2939,6 +2651,115 @@ void tcp_simple_retransmit(struct sock *sk)  }  EXPORT_SYMBOL(tcp_simple_retransmit); +static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	int mib_idx; + +	if (tcp_is_reno(tp)) +		mib_idx = LINUX_MIB_TCPRENORECOVERY; +	else +		mib_idx = LINUX_MIB_TCPSACKRECOVERY; + +	NET_INC_STATS_BH(sock_net(sk), mib_idx); + +	tp->prior_ssthresh = 0; +	tp->undo_marker = tp->snd_una; +	tp->undo_retrans = tp->retrans_out ? : -1; + +	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { +		if (!ece_ack) +			tp->prior_ssthresh = tcp_current_ssthresh(sk); +		tcp_init_cwnd_reduction(sk, true); +	} +	tcp_set_ca_state(sk, TCP_CA_Recovery); +} + +/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are + * recovered or spurious. Otherwise retransmits more on partial ACKs. + */ +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +{ +	struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	bool recovered = !before(tp->snd_una, tp->high_seq); + +	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ +		/* Step 3.b. A timeout is spurious if not all data are +		 * lost, i.e., never-retransmitted data are (s)acked. 
+		 */ +		if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) +			return; + +		if (after(tp->snd_nxt, tp->high_seq) && +		    (flag & FLAG_DATA_SACKED || is_dupack)) { +			tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ +		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { +			tp->high_seq = tp->snd_nxt; +			__tcp_push_pending_frames(sk, tcp_current_mss(sk), +						  TCP_NAGLE_OFF); +			if (after(tp->snd_nxt, tp->high_seq)) +				return; /* Step 2.b */ +			tp->frto = 0; +		} +	} + +	if (recovered) { +		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ +		icsk->icsk_retransmits = 0; +		tcp_try_undo_recovery(sk); +		return; +	} +	if (flag & FLAG_DATA_ACKED) +		icsk->icsk_retransmits = 0; +	if (tcp_is_reno(tp)) { +		/* A Reno DUPACK means new data in F-RTO step 2.b above are +		 * delivered. Lower inflight to clock out (re)tranmissions. +		 */ +		if (after(tp->snd_nxt, tp->high_seq) && is_dupack) +			tcp_add_reno_sack(sk); +		else if (flag & FLAG_SND_UNA_ADVANCED) +			tcp_reset_reno_sack(tp); +	} +	if (tcp_try_undo_loss(sk, false)) +		return; +	tcp_xmit_retransmit_queue(sk); +} + +/* Undo during fast recovery after partial ACK. */ +static bool tcp_try_undo_partial(struct sock *sk, const int acked, +				 const int prior_unsacked) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	if (tp->undo_marker && tcp_packet_delayed(tp)) { +		/* Plain luck! Hole if filled with delayed +		 * packet, rather than with a retransmit. +		 */ +		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); + +		/* We are getting evidence that the reordering degree is higher +		 * than we realized. If there are no retransmits out then we +		 * can undo. Otherwise we clock out new packets but do not +		 * mark more packets lost or retransmit more. +		 */ +		if (tp->retrans_out) { +			tcp_cwnd_reduction(sk, prior_unsacked, 0); +			return true; +		} + +		if (!tcp_any_retrans_done(sk)) +			tp->retrans_stamp = 0; + +		DBGUNDO(sk, "partial recovery"); +		tcp_undo_cwnd_reduction(sk, true); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); +		tcp_try_keep_open(sk); +		return true; +	} +	return false; +} +  /* Process an event, which can update packets-in-flight not trivially.   * Main goal of this function is to calculate new estimate for left_out,   * taking into account both packets sitting in receiver's buffer and @@ -2950,14 +2771,15 @@ EXPORT_SYMBOL(tcp_simple_retransmit);   * It does _not_ decide what to send, it is made in function   * tcp_xmit_retransmit_queue().   */ -static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) +static void tcp_fastretrans_alert(struct sock *sk, const int acked, +				  const int prior_unsacked, +				  bool is_dupack, int flag)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); -	int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); -	int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && +	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&  				    (tcp_fackets_out(tp) > tp->reordering)); -	int fast_rexmit = 0, mib_idx; +	int fast_rexmit = 0;  	if (WARN_ON(!tp->packets_out && tp->sacked_out))  		tp->sacked_out = 0; @@ -2973,47 +2795,21 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)  	if (tcp_check_sack_reneging(sk, flag))  		return; -	/* C. Process data loss notification, provided it is valid. 
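tcp_process_loss() above follows the SACK-enhanced F-RTO of RFC 5682 section 3. A deliberately simplified, userspace-only sketch of the decision order for one ACK received in CA_Loss (the flag values and the enum are invented for illustration, and almost all of the real bookkeeping is omitted):

#include <stdbool.h>
#include <stdio.h>

/* Flag bits for this sketch only; they are not the kernel's FLAG_* values. */
#define SK_FLAG_ORIG_SACK_ACKED   0x01	/* never-retransmitted data (s)acked */
#define SK_FLAG_DATA_SACKED       0x02
#define SK_FLAG_SND_UNA_ADVANCED  0x04

enum frto_outcome {
	FRTO_UNDO,		/* timeout judged spurious, undo the reduction */
	FRTO_REAL_LOSS,		/* original data really was lost, keep recovering */
	FRTO_SEND_NEW_DATA,	/* step 2.b: transmit new data, wait for the next ACK */
	FRTO_KEEP_WAITING
};

/* Mirrors the ordering of checks in tcp_process_loss() above. */
static enum frto_outcome frto_step(int flag, bool is_dupack,
				   bool sent_beyond_high_seq, bool recovered)
{
	if (flag & SK_FLAG_ORIG_SACK_ACKED)
		return FRTO_UNDO;		/* step 3.b */
	if (sent_beyond_high_seq &&
	    ((flag & SK_FLAG_DATA_SACKED) || is_dupack))
		return FRTO_REAL_LOSS;		/* 2nd part of step 3.a */
	if ((flag & SK_FLAG_SND_UNA_ADVANCED) && !recovered)
		return FRTO_SEND_NEW_DATA;	/* step 2.b */
	return FRTO_KEEP_WAITING;
}

int main(void)
{
	/* Never-retransmitted data was SACKed: the RTO was spurious. */
	printf("%d\n", frto_step(SK_FLAG_ORIG_SACK_ACKED, false, false, false));
	return 0;
}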
*/ -	if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && -	    before(tp->snd_una, tp->high_seq) && -	    icsk->icsk_ca_state != TCP_CA_Open && -	    tp->fackets_out > tp->reordering) { -		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); -	} - -	/* D. Check consistency of the current state. */ +	/* C. Check consistency of the current state. */  	tcp_verify_left_out(tp); -	/* E. Check state exit conditions. State can be terminated +	/* D. Check state exit conditions. State can be terminated  	 *    when high_seq is ACKed. */  	if (icsk->icsk_ca_state == TCP_CA_Open) {  		WARN_ON(tp->retrans_out != 0);  		tp->retrans_stamp = 0;  	} else if (!before(tp->snd_una, tp->high_seq)) {  		switch (icsk->icsk_ca_state) { -		case TCP_CA_Loss: -			icsk->icsk_retransmits = 0; -			if (tcp_try_undo_recovery(sk)) -				return; -			break; -  		case TCP_CA_CWR:  			/* CWR is to be held something *above* high_seq  			 * is ACKed for CWR bit to reach receiver. */  			if (tp->snd_una != tp->high_seq) { -				tcp_complete_cwr(sk); -				tcp_set_ca_state(sk, TCP_CA_Open); -			} -			break; - -		case TCP_CA_Disorder: -			tcp_try_undo_dsack(sk); -			if (!tp->undo_marker || -			    /* For SACK case do not Open to allow to undo -			     * catching for all duplicate ACKs. */ -			    tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { -				tp->undo_marker = 0; +				tcp_end_cwnd_reduction(sk);  				tcp_set_ca_state(sk, TCP_CA_Open);  			}  			break; @@ -3023,33 +2819,34 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)  				tcp_reset_reno_sack(tp);  			if (tcp_try_undo_recovery(sk))  				return; -			tcp_complete_cwr(sk); +			tcp_end_cwnd_reduction(sk);  			break;  		}  	} -	/* F. Process state. */ +	/* E. Process state. */  	switch (icsk->icsk_ca_state) {  	case TCP_CA_Recovery:  		if (!(flag & FLAG_SND_UNA_ADVANCED)) {  			if (tcp_is_reno(tp) && is_dupack)  				tcp_add_reno_sack(sk); -		} else -			do_lost = tcp_try_undo_partial(sk, pkts_acked); -		break; -	case TCP_CA_Loss: -		if (flag & FLAG_DATA_ACKED) -			icsk->icsk_retransmits = 0; -		if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) -			tcp_reset_reno_sack(tp); -		if (!tcp_try_undo_loss(sk)) { -			tcp_moderate_cwnd(tp); -			tcp_xmit_retransmit_queue(sk); +		} else { +			if (tcp_try_undo_partial(sk, acked, prior_unsacked)) +				return; +			/* Partial ACK arrived. Force fast retransmit. */ +			do_lost = tcp_is_reno(tp) || +				  tcp_fackets_out(tp) > tp->reordering; +		} +		if (tcp_try_undo_dsack(sk)) { +			tcp_try_keep_open(sk);  			return;  		} +		break; +	case TCP_CA_Loss: +		tcp_process_loss(sk, flag, is_dupack);  		if (icsk->icsk_ca_state != TCP_CA_Open)  			return; -		/* Loss is undone; fall through to processing in Open state. */ +		/* Fall through to processing in Open state. 
*/  	default:  		if (tcp_is_reno(tp)) {  			if (flag & FLAG_SND_UNA_ADVANCED) @@ -3058,11 +2855,11 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)  				tcp_add_reno_sack(sk);  		} -		if (icsk->icsk_ca_state == TCP_CA_Disorder) +		if (icsk->icsk_ca_state <= TCP_CA_Disorder)  			tcp_try_undo_dsack(sk); -		if (!tcp_time_to_recover(sk)) { -			tcp_try_to_open(sk, flag); +		if (!tcp_time_to_recover(sk, flag)) { +			tcp_try_to_open(sk, flag, prior_unsacked);  			return;  		} @@ -3078,120 +2875,130 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)  		}  		/* Otherwise enter Recovery state */ - -		if (tcp_is_reno(tp)) -			mib_idx = LINUX_MIB_TCPRENORECOVERY; -		else -			mib_idx = LINUX_MIB_TCPSACKRECOVERY; - -		NET_INC_STATS_BH(sock_net(sk), mib_idx); - -		tp->high_seq = tp->snd_nxt; -		tp->prior_ssthresh = 0; -		tp->undo_marker = tp->snd_una; -		tp->undo_retrans = tp->retrans_out; - -		if (icsk->icsk_ca_state < TCP_CA_CWR) { -			if (!(flag & FLAG_ECE)) -				tp->prior_ssthresh = tcp_current_ssthresh(sk); -			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); -			TCP_ECN_queue_cwr(tp); -		} - -		tp->bytes_acked = 0; -		tp->snd_cwnd_cnt = 0; -		tcp_set_ca_state(sk, TCP_CA_Recovery); +		tcp_enter_recovery(sk, (flag & FLAG_ECE));  		fast_rexmit = 1;  	} -	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) +	if (do_lost)  		tcp_update_scoreboard(sk, fast_rexmit); -	tcp_cwnd_down(sk, flag); +	tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);  	tcp_xmit_retransmit_queue(sk);  } -static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) +static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, +				      long seq_rtt_us, long sack_rtt_us)  { -	tcp_rtt_estimator(sk, seq_rtt); -	tcp_set_rto(sk); -	inet_csk(sk)->icsk_backoff = 0; -} +	const struct tcp_sock *tp = tcp_sk(sk); + +	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because +	 * broken middle-boxes or peers may corrupt TS-ECR fields. But +	 * Karn's algorithm forbids taking RTT if some retransmitted data +	 * is acked (RFC6298). +	 */ +	if (flag & FLAG_RETRANS_DATA_ACKED) +		seq_rtt_us = -1L; + +	if (seq_rtt_us < 0) +		seq_rtt_us = sack_rtt_us; -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Supersedes RFC1323) - */ -static void tcp_ack_saw_tstamp(struct sock *sk, int flag) -{  	/* RTTM Rule: A TSecr value received in a segment is used to  	 * update the averaged RTT measurement only if the segment  	 * acknowledges some new data, i.e., only if it advances the  	 * left edge of the send window. -	 *  	 * See draft-ietf-tcplw-high-performance-00, section 3.3. -	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> -	 * -	 * Changed: reset backoff as soon as we see the first valid sample. -	 * If we do not, we get strongly overestimated rto. With timestamps -	 * samples are accepted even from very old segments: f.e., when rtt=1 -	 * increases to 8, we retransmit 5 times and after 8 seconds delayed -	 * answer arrives rto becomes 120 seconds! If at least one of segments -	 * in window is lost... Voila.	 			--ANK (010210)  	 */ -	struct tcp_sock *tp = tcp_sk(sk); - -	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); -} +	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && +	    flag & FLAG_ACKED) +		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) -{ -	/* We don't have a timestamp. 
Can only use -	 * packets that are not retransmitted to determine -	 * rtt estimates. Also, we must not reset the -	 * backoff for rto until we get a non-retransmitted -	 * packet. This allows us to deal with a situation -	 * where the network delay has increased suddenly. -	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.) -	 */ +	if (seq_rtt_us < 0) +		return false; -	if (flag & FLAG_RETRANS_DATA_ACKED) -		return; +	tcp_rtt_estimator(sk, seq_rtt_us); +	tcp_set_rto(sk); -	tcp_valid_rtt_meas(sk, seq_rtt); +	/* RFC6298: only reset backoff on valid RTT measurement. */ +	inet_csk(sk)->icsk_backoff = 0; +	return true;  } -static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, -				      const s32 seq_rtt) +/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ +static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)  { -	const struct tcp_sock *tp = tcp_sk(sk); -	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ -	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) -		tcp_ack_saw_tstamp(sk, flag); -	else if (seq_rtt >= 0) -		tcp_ack_no_tstamp(sk, seq_rtt, flag); +	struct tcp_sock *tp = tcp_sk(sk); +	long seq_rtt_us = -1L; + +	if (synack_stamp && !tp->total_retrans) +		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); + +	/* If the ACK acks both the SYNACK and the (Fast Open'd) data packets +	 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() +	 */ +	if (!tp->srtt_us) +		tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);  } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	const struct inet_connection_sock *icsk = inet_csk(sk); -	icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); + +	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);  	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;  }  /* Restart timer after forward progress on connection.   * RFC2988 recommends to restart timer to now+rto.   */ -static void tcp_rearm_rto(struct sock *sk) +void tcp_rearm_rto(struct sock *sk)  { +	const struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); +	/* If the retrans timer is currently being used by Fast Open +	 * for SYN-ACK retrans purpose, stay put. +	 */ +	if (tp->fastopen_rsk) +		return; +  	if (!tp->packets_out) {  		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);  	} else { -		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, -					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +		u32 rto = inet_csk(sk)->icsk_rto; +		/* Offset the time elapsed after installing regular RTO */ +		if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || +		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { +			struct sk_buff *skb = tcp_write_queue_head(sk); +			const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; +			s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); +			/* delta may not be positive if the socket is locked +			 * when the retrans timer fires and is rescheduled. +			 */ +			if (delta > 0) +				rto = delta; +		} +		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, +					  TCP_RTO_MAX);  	}  } +/* This function is called when the delayed ER timer fires. TCP enters + * fast recovery and performs fast-retransmit. 
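tcp_ack_update_rtt() above imposes a strict preference order on RTT samples: the RTT measured from the ACKed skb's own transmit timestamp, then a SACK-derived RTT, and only then the TS-ECR echo, with Karn's rule discarding the first when retransmitted data is ACKed. A standalone sketch of just that selection (flag values are illustrative; times are in microseconds as in the hunk):

#include <stdbool.h>
#include <stdio.h>

#define SK_FLAG_RETRANS_DATA_ACKED 0x01	/* sketch-local flag bits */
#define SK_FLAG_ACKED              0x02

static long pick_rtt_us(int flag, long seq_rtt_us, long sack_rtt_us,
			bool saw_tsecr, long tsecr_rtt_us)
{
	if (flag & SK_FLAG_RETRANS_DATA_ACKED)
		seq_rtt_us = -1;		/* ambiguous sample: Karn's algorithm */
	if (seq_rtt_us < 0)
		seq_rtt_us = sack_rtt_us;
	if (seq_rtt_us < 0 && saw_tsecr && (flag & SK_FLAG_ACKED))
		seq_rtt_us = tsecr_rtt_us;	/* last resort: echoed timestamp */
	return seq_rtt_us;			/* negative means no usable sample */
}

int main(void)
{
	/* Retransmitted data was ACKed, but a 30 ms SACK-based sample exists. */
	printf("%ld\n", pick_rtt_us(SK_FLAG_RETRANS_DATA_ACKED,
				    25000, 30000, true, 40000));
	return 0;
}

Only when a sample survives this filter does the hunk feed it to tcp_rtt_estimator() and reset the RTO backoff, which is why an ACK covering only retransmitted data no longer clears icsk_backoff.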
+ */ +void tcp_resume_early_retransmit(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	tcp_rearm_rto(sk); + +	/* Stop if ER is disabled after the delayed ER timer is scheduled */ +	if (!tp->do_early_retrans) +		return; + +	tcp_enter_recovery(sk, false); +	tcp_update_scoreboard(sk, 1); +	tcp_xmit_retransmit_queue(sk); +} +  /* If we get here, the whole TSO packet has not been acked. */  static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)  { @@ -3218,25 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)   * arrived at the other end.   */  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, -			       u32 prior_snd_una) +			       u32 prior_snd_una, long sack_rtt_us)  { -	struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk); +	struct skb_mstamp first_ackt, last_ackt, now; +	struct tcp_sock *tp = tcp_sk(sk); +	u32 prior_sacked = tp->sacked_out; +	u32 reord = tp->packets_out; +	bool fully_acked = true; +	long ca_seq_rtt_us = -1L; +	long seq_rtt_us = -1L;  	struct sk_buff *skb; -	u32 now = tcp_time_stamp; -	int fully_acked = 1; -	int flag = 0;  	u32 pkts_acked = 0; -	u32 reord = tp->packets_out; -	u32 prior_sacked = tp->sacked_out; -	s32 seq_rtt = -1; -	s32 ca_seq_rtt = -1; -	ktime_t last_ackt = net_invalid_timestamp(); +	bool rtt_update; +	int flag = 0; + +	first_ackt.v64 = 0;  	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {  		struct tcp_skb_cb *scb = TCP_SKB_CB(skb); -		u32 acked_pcount;  		u8 sacked = scb->sacked; +		u32 acked_pcount;  		/* Determine how many packets and what bytes were acked, tso and else */  		if (after(scb->end_seq, tp->snd_una)) { @@ -3248,7 +3057,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  			if (!acked_pcount)  				break; -			fully_acked = 0; +			fully_acked = false;  		} else {  			acked_pcount = tcp_skb_pcount(skb);  		} @@ -3257,18 +3066,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  			if (sacked & TCPCB_SACKED_RETRANS)  				tp->retrans_out -= acked_pcount;  			flag |= FLAG_RETRANS_DATA_ACKED; -			ca_seq_rtt = -1; -			seq_rtt = -1; -			if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) -				flag |= FLAG_NONHEAD_RETRANS_ACKED;  		} else { -			ca_seq_rtt = now - scb->when; -			last_ackt = skb->tstamp; -			if (seq_rtt < 0) { -				seq_rtt = ca_seq_rtt; -			} +			last_ackt = skb->skb_mstamp; +			WARN_ON_ONCE(last_ackt.v64 == 0); +			if (!first_ackt.v64) +				first_ackt = last_ackt; +  			if (!(sacked & TCPCB_SACKED_ACKED))  				reord = min(pkts_acked, reord); +			if (!after(scb->end_seq, tp->high_seq)) +				flag |= FLAG_ORIG_SACK_ACKED;  		}  		if (sacked & TCPCB_SACKED_ACKED) @@ -3286,7 +3093,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  		 * connection startup slow start one packet too  		 * quickly.  This is severely frowned upon behavior.  		 
*/ -		if (!(scb->flags & TCPHDR_SYN)) { +		if (!(scb->tcp_flags & TCPHDR_SYN)) {  			flag |= FLAG_DATA_ACKED;  		} else {  			flag |= FLAG_SYN_ACKED; @@ -3298,7 +3105,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  		tcp_unlink_write_queue(skb, sk);  		sk_wmem_free_skb(sk, skb); -		tp->scoreboard_skb_hint = NULL;  		if (skb == tp->retransmit_skb_hint)  			tp->retransmit_skb_hint = NULL;  		if (skb == tp->lost_skb_hint) @@ -3311,18 +3117,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))  		flag |= FLAG_SACK_RENEGING; +	skb_mstamp_get(&now); +	if (first_ackt.v64) { +		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); +		ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); +	} + +	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); +  	if (flag & FLAG_ACKED) {  		const struct tcp_congestion_ops *ca_ops  			= inet_csk(sk)->icsk_ca_ops; +		tcp_rearm_rto(sk);  		if (unlikely(icsk->icsk_mtup.probe_size &&  			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {  			tcp_mtup_probe_success(sk);  		} -		tcp_ack_update_rtt(sk, flag, seq_rtt); -		tcp_rearm_rto(sk); -  		if (tcp_is_reno(tp)) {  			tcp_remove_reno_sacks(sk, pkts_acked);  		} else { @@ -3339,23 +3151,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  		tp->fackets_out -= min(pkts_acked, tp->fackets_out); -		if (ca_ops->pkts_acked) { -			s32 rtt_us = -1; - -			/* Is the ACK triggering packet unambiguous? */ -			if (!(flag & FLAG_RETRANS_DATA_ACKED)) { -				/* High resolution needed and available? */ -				if (ca_ops->flags & TCP_CONG_RTT_STAMP && -				    !ktime_equal(last_ackt, -						 net_invalid_timestamp())) -					rtt_us = ktime_us_delta(ktime_get_real(), -								last_ackt); -				else if (ca_seq_rtt > 0) -					rtt_us = jiffies_to_usecs(ca_seq_rtt); -			} +		if (ca_ops->pkts_acked) +			ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); -			ca_ops->pkts_acked(sk, pkts_acked, rtt_us); -		} +	} else if (skb && rtt_update && sack_rtt_us >= 0 && +		   sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { +		/* Do not re-arm RTO if the sack RTT is measured from data sent +		 * after when the head was last (re)transmitted. Otherwise the +		 * timeout may continue to extend in loss recovery. 
+		 */ +		tcp_rearm_rto(sk);  	}  #if FASTRETRANS_DEBUG > 0 @@ -3365,18 +3170,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,  	if (!tp->packets_out && tcp_is_sack(tp)) {  		icsk = inet_csk(sk);  		if (tp->lost_out) { -			printk(KERN_DEBUG "Leak l=%u %d\n", -			       tp->lost_out, icsk->icsk_ca_state); +			pr_debug("Leak l=%u %d\n", +				 tp->lost_out, icsk->icsk_ca_state);  			tp->lost_out = 0;  		}  		if (tp->sacked_out) { -			printk(KERN_DEBUG "Leak s=%u %d\n", -			       tp->sacked_out, icsk->icsk_ca_state); +			pr_debug("Leak s=%u %d\n", +				 tp->sacked_out, icsk->icsk_ca_state);  			tp->sacked_out = 0;  		}  		if (tp->retrans_out) { -			printk(KERN_DEBUG "Leak r=%u %d\n", -			       tp->retrans_out, icsk->icsk_ca_state); +			pr_debug("Leak r=%u %d\n", +				 tp->retrans_out, icsk->icsk_ca_state);  			tp->retrans_out = 0;  		}  	} @@ -3404,23 +3209,34 @@ static void tcp_ack_probe(struct sock *sk)  	}  } -static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) +static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)  {  	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||  		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;  } -static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) +/* Decide wheather to run the increase function of congestion control. */ +static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)  { -	const struct tcp_sock *tp = tcp_sk(sk); -	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && -		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); +	if (tcp_in_cwnd_reduction(sk)) +		return false; + +	/* If reordering is high then always grow cwnd whenever data is +	 * delivered regardless of its ordering. Otherwise stay conservative +	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ +	 * new SACK or ECE mark may first advance cwnd here and later reduce +	 * cwnd in tcp_fastretrans_alert() based on more states. +	 */ +	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) +		return flag & FLAG_FORWARD_PROGRESS; + +	return flag & FLAG_DATA_ACKED;  }  /* Check that window update is acceptable.   * The function assumes that snd_una<=ack<=snd_next.   */ -static inline int tcp_may_update_window(const struct tcp_sock *tp, +static inline bool tcp_may_update_window(const struct tcp_sock *tp,  					const u32 ack, const u32 ack_seq,  					const u32 nwin)  { @@ -3434,7 +3250,7 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,   * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2   * and in FreeBSD. NetBSD's one is even worse.) is wrong.   */ -static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, +static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,  				 u32 ack_seq)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -3469,164 +3285,103 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,  	return flag;  } -/* A very conservative spurious RTO response algorithm: reduce cwnd and - * continue in congestion avoidance. 
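tcp_may_raise_cwnd() above decides whether the congestion-avoidance increase may run for a given ACK: never during a PRR-style reduction, on any forward progress when measured reordering is high, otherwise only on in-order cumulative ACKs. The same decision reduced to a pure function (parameter names are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

static bool may_raise_cwnd(bool in_cwnd_reduction, unsigned int reordering,
			   unsigned int reordering_limit, bool forward_progress,
			   bool data_acked)
{
	if (in_cwnd_reduction)
		return false;			/* CWR/Recovery: PRR owns cwnd */
	if (reordering > reordering_limit)
		return forward_progress;	/* grow on any delivery */
	return data_acked;			/* conservative RFC 5681 behaviour */
}

int main(void)
{
	/* Stretched SACK during heavy reordering: still allowed to grow. */
	printf("%d\n", may_raise_cwnd(false, 10, 3, true, false));
	return 0;
}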
- */ -static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +/* RFC 5961 7 [ACK Throttling] */ +static void tcp_send_challenge_ack(struct sock *sk)  { -	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); -	tp->snd_cwnd_cnt = 0; -	tp->bytes_acked = 0; -	TCP_ECN_queue_cwr(tp); -	tcp_moderate_cwnd(tp); +	/* unprotected vars, we dont care of overwrites */ +	static u32 challenge_timestamp; +	static unsigned int challenge_count; +	u32 now = jiffies / HZ; + +	if (now != challenge_timestamp) { +		challenge_timestamp = now; +		challenge_count = 0; +	} +	if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); +		tcp_send_ack(sk); +	}  } -/* A conservative spurious RTO response algorithm: reduce cwnd using - * rate halving and continue in congestion avoidance. - */ -static void tcp_ratehalving_spur_to_response(struct sock *sk) +static void tcp_store_ts_recent(struct tcp_sock *tp)  { -	tcp_enter_cwr(sk, 0); +	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; +	tp->rx_opt.ts_recent_stamp = get_seconds();  } -static void tcp_undo_spur_to_response(struct sock *sk, int flag) +static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)  { -	if (flag & FLAG_ECE) -		tcp_ratehalving_spur_to_response(sk); -	else -		tcp_undo_cwr(sk, 1); +	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { +		/* PAWS bug workaround wrt. ACK frames, the PAWS discard +		 * extra check below makes sure this can only happen +		 * for pure ACK frames.  -DaveM +		 * +		 * Not only, also it occurs for expired timestamps. +		 */ + +		if (tcp_paws_check(&tp->rx_opt, 0)) +			tcp_store_ts_recent(tp); +	}  } -/* F-RTO spurious RTO detection algorithm (RFC4138) - * - * F-RTO affects during two new ACKs following RTO (well, almost, see inline - * comments). State (ACK number) is kept in frto_counter. When ACK advances - * window (but not to or beyond highest sequence sent before RTO): - *   On First ACK,  send two new segments out. - *   On Second ACK, RTO was likely spurious. Do spurious response (response - *                  algorithm is not part of the F-RTO detection algorithm - *                  given in RFC4138 but can be selected separately). - * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. F-RTO allows overriding - * of Nagle, this is done using frto_counter states 2 and 3, when a new data - * segment of any size sent during F-RTO, state 2 is upgraded to 3. - * - * Rationale: if the RTO was spurious, new ACKs should arrive from the - * original window even after we transmit two new data segments. - * - * SACK version: - *   on first step, wait until first cumulative ACK arrives, then move to - *   the second step. In second step, the next ACK decides. - * - * F-RTO is implemented (mainly) in four functions: - *   - tcp_use_frto() is used to determine if TCP is can use F-RTO - *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is - *     called when tcp_use_frto() showed green light - *   - tcp_process_frto() handles incoming ACKs during F-RTO algorithm - *   - tcp_enter_frto_loss() is called if there is not enough evidence - *     to prove that the RTO is indeed spurious. It transfers the control - *     from F-RTO to the conventional RTO recovery +/* This routine deals with acks during a TLP episode. + * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.   
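tcp_send_challenge_ack() above adds the RFC 5961 ACK throttling: at most sysctl_tcp_challenge_ack_limit challenge ACKs per second, tracked in two deliberately unlocked static counters. A minimal userspace sketch of that bucket, with the limit passed in as a parameter:

#include <stdbool.h>
#include <stdio.h>

/* Returns true if a challenge ACK may be sent during the second 'now_sec'. */
static bool challenge_ack_allowed(unsigned long now_sec, unsigned int limit)
{
	static unsigned long bucket_second;
	static unsigned int bucket_count;

	if (now_sec != bucket_second) {	/* new second: reset the bucket */
		bucket_second = now_sec;
		bucket_count = 0;
	}
	return ++bucket_count <= limit;
}

int main(void)
{
	for (int i = 0; i < 4; i++)
		printf("ack %d: %s\n", i,
		       challenge_ack_allowed(100, 2) ? "sent" : "dropped");
	return 0;
}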
*/ -static int tcp_process_frto(struct sock *sk, int flag) +static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)  {  	struct tcp_sock *tp = tcp_sk(sk); +	bool is_tlp_dupack = (ack == tp->tlp_high_seq) && +			     !(flag & (FLAG_SND_UNA_ADVANCED | +				       FLAG_NOT_DUP | FLAG_DATA_SACKED)); -	tcp_verify_left_out(tp); - -	/* Duplicate the behavior from Loss state (fastretrans_alert) */ -	if (flag & FLAG_DATA_ACKED) -		inet_csk(sk)->icsk_retransmits = 0; - -	if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || -	    ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) -		tp->undo_marker = 0; - -	if (!before(tp->snd_una, tp->frto_highmark)) { -		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); -		return 1; -	} - -	if (!tcp_is_sackfrto(tp)) { -		/* RFC4138 shortcoming in step 2; should also have case c): -		 * ACK isn't duplicate nor advances window, e.g., opposite dir -		 * data, winupdate -		 */ -		if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) -			return 1; - -		if (!(flag & FLAG_DATA_ACKED)) { -			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), -					    flag); -			return 1; -		} -	} else { -		if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { -			/* Prevent sending of new data. */ -			tp->snd_cwnd = min(tp->snd_cwnd, -					   tcp_packets_in_flight(tp)); -			return 1; -		} - -		if ((tp->frto_counter >= 2) && -		    (!(flag & FLAG_FORWARD_PROGRESS) || -		     ((flag & FLAG_DATA_SACKED) && -		      !(flag & FLAG_ONLY_ORIG_SACKED)))) { -			/* RFC4138 shortcoming (see comment above) */ -			if (!(flag & FLAG_FORWARD_PROGRESS) && -			    (flag & FLAG_NOT_DUP)) -				return 1; - -			tcp_enter_frto_loss(sk, 3, flag); -			return 1; -		} +	/* Mark the end of TLP episode on receiving TLP dupack or when +	 * ack is after tlp_high_seq. +	 */ +	if (is_tlp_dupack) { +		tp->tlp_high_seq = 0; +		return;  	} -	if (tp->frto_counter == 1) { -		/* tcp_may_send_now needs to see updated state */ -		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; -		tp->frto_counter = 2; - -		if (!tcp_may_send_now(sk)) -			tcp_enter_frto_loss(sk, 2, flag); - -		return 1; -	} else { -		switch (sysctl_tcp_frto_response) { -		case 2: -			tcp_undo_spur_to_response(sk, flag); -			break; -		case 1: -			tcp_conservative_spur_to_response(tp); -			break; -		default: -			tcp_ratehalving_spur_to_response(sk); -			break; +	if (after(ack, tp->tlp_high_seq)) { +		tp->tlp_high_seq = 0; +		/* Don't reduce cwnd if DSACK arrives for TLP retrans. */ +		if (!(flag & FLAG_DSACKING_ACK)) { +			tcp_init_cwnd_reduction(sk, true); +			tcp_set_ca_state(sk, TCP_CA_CWR); +			tcp_end_cwnd_reduction(sk); +			tcp_try_keep_open(sk); +			NET_INC_STATS_BH(sock_net(sk), +					 LINUX_MIB_TCPLOSSPROBERECOVERY);  		} -		tp->frto_counter = 0; -		tp->undo_marker = 0; -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);  	} -	return 0;  }  /* This routine deals with incoming acks, but not outgoing ones. 
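tcp_process_tlp_ack() above closes a tail-loss-probe episode in one of two ways: a pure duplicate ACK for exactly tlp_high_seq means the probe itself was acknowledged and nothing more needs to happen, while an ACK beyond tlp_high_seq means the probe repaired a genuine loss, so cwnd is reduced once unless a DSACK shows the probe was merely a duplicate. A compressed sketch of those outcomes (the flag tests are folded into two booleans, so this is only an approximation of the real conditions):

#include <stdbool.h>
#include <stdio.h>

enum tlp_action { TLP_NONE, TLP_END_EPISODE, TLP_END_AND_REDUCE };

static enum tlp_action tlp_ack(unsigned int ack, unsigned int tlp_high_seq,
			       bool pure_dupack, bool dsack_for_probe)
{
	if (!tlp_high_seq)
		return TLP_NONE;		/* no TLP episode in progress */
	if (ack == tlp_high_seq && pure_dupack)
		return TLP_END_EPISODE;		/* probe ACKed, nothing was lost */
	if ((int)(ack - tlp_high_seq) > 0)	/* emulates after(ack, tlp_high_seq) */
		return dsack_for_probe ? TLP_END_EPISODE : TLP_END_AND_REDUCE;
	return TLP_NONE;
}

int main(void)
{
	/* ACK beyond the probe with no DSACK: the probe repaired a real loss. */
	printf("%d\n", tlp_ack(2001, 2000, false, false));
	return 0;
}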
*/ -static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) +static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	u32 prior_snd_una = tp->snd_una;  	u32 ack_seq = TCP_SKB_CB(skb)->seq;  	u32 ack = TCP_SKB_CB(skb)->ack_seq; -	u32 prior_in_flight; +	bool is_dupack = false;  	u32 prior_fackets; -	int prior_packets; -	int frto_cwnd = 0; +	int prior_packets = tp->packets_out; +	const int prior_unsacked = tp->packets_out - tp->sacked_out; +	int acked = 0; /* Number of packets newly acked */ +	long sack_rtt_us = -1L;  	/* If the ack is older than previous acks  	 * then we can probably ignore it.  	 */ -	if (before(ack, prior_snd_una)) +	if (before(ack, prior_snd_una)) { +		/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ +		if (before(ack, prior_snd_una - tp->max_window)) { +			tcp_send_challenge_ack(sk); +			return -1; +		}  		goto old_ack; +	}  	/* If the ack includes data we haven't sent yet, discard  	 * this segment (RFC793 Section 3.9). @@ -3634,20 +3389,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)  	if (after(ack, tp->snd_nxt))  		goto invalid_ack; +	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || +	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) +		tcp_rearm_rto(sk); +  	if (after(ack, prior_snd_una))  		flag |= FLAG_SND_UNA_ADVANCED; -	if (sysctl_tcp_abc) { -		if (icsk->icsk_ca_state < TCP_CA_CWR) -			tp->bytes_acked += ack - prior_snd_una; -		else if (icsk->icsk_ca_state == TCP_CA_Loss) -			/* we assume just one segment left network */ -			tp->bytes_acked += min(ack - prior_snd_una, -					       tp->mss_cache); -	} -  	prior_fackets = tp->fackets_out; -	prior_in_flight = tcp_packets_in_flight(tp); + +	/* ts_recent update must be made after we are sure that the packet +	 * is in window. +	 */ +	if (flag & FLAG_UPDATE_TS_RECENT) +		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);  	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {  		/* Window is constant, pure forward advance. @@ -3670,7 +3425,8 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)  		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);  		if (TCP_SKB_CB(skb)->sacked) -			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); +			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, +							&sack_rtt_us);  		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))  			flag |= FLAG_ECE; @@ -3684,43 +3440,52 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)  	sk->sk_err_soft = 0;  	icsk->icsk_probes_out = 0;  	tp->rcv_tstamp = tcp_time_stamp; -	prior_packets = tp->packets_out;  	if (!prior_packets)  		goto no_queue;  	/* See if we can take anything off of the retransmit queue. */ -	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); +	acked = tp->packets_out; +	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, +				    sack_rtt_us); +	acked -= tp->packets_out; -	if (tp->frto_counter) -		frto_cwnd = tcp_process_frto(sk, flag); -	/* Guarantee sacktag reordering detection against wrap-arounds */ -	if (before(tp->frto_highmark, tp->snd_una)) -		tp->frto_highmark = 0; +	/* Advance cwnd if state allows */ +	if (tcp_may_raise_cwnd(sk, flag)) +		tcp_cong_avoid(sk, ack, acked);  	if (tcp_ack_is_dubious(sk, flag)) { -		/* Advance CWND, if state allows this. 
*/ -		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && -		    tcp_may_raise_cwnd(sk, flag)) -			tcp_cong_avoid(sk, ack, prior_in_flight); -		tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, -				      flag); -	} else { -		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) -			tcp_cong_avoid(sk, ack, prior_in_flight); +		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); +		tcp_fastretrans_alert(sk, acked, prior_unsacked, +				      is_dupack, flag);  	} +	if (tp->tlp_high_seq) +		tcp_process_tlp_ack(sk, ack, flag); -	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) -		dst_confirm(__sk_dst_get(sk)); +	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { +		struct dst_entry *dst = __sk_dst_get(sk); +		if (dst) +			dst_confirm(dst); +	} +	if (icsk->icsk_pending == ICSK_TIME_RETRANS) +		tcp_schedule_loss_probe(sk); +	tcp_update_pacing_rate(sk);  	return 1;  no_queue: +	/* If data was DSACKed, see if we can undo a cwnd reduction. */ +	if (flag & FLAG_DSACKING_ACK) +		tcp_fastretrans_alert(sk, acked, prior_unsacked, +				      is_dupack, flag);  	/* If this ack opens up a zero window, clear backoff.  It was  	 * being used to time the probes, and is probably far higher than  	 * it needs to be for normal retransmission.  	 */  	if (tcp_send_head(sk))  		tcp_ack_probe(sk); + +	if (tp->tlp_high_seq) +		tcp_process_tlp_ack(sk, ack, flag);  	return 1;  invalid_ack: @@ -3728,10 +3493,14 @@ invalid_ack:  	return -1;  old_ack: +	/* If data was SACKed, tag it and see if we should send more data. +	 * If data was DSACKed, see if we can undo a cwnd reduction. +	 */  	if (TCP_SKB_CB(skb)->sacked) { -		tcp_sacktag_write_queue(sk, skb, prior_snd_una); -		if (icsk->icsk_ca_state == TCP_CA_Open) -			tcp_try_keep_open(sk); +		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, +						&sack_rtt_us); +		tcp_fastretrans_alert(sk, acked, prior_unsacked, +				      is_dupack, flag);  	}  	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -3742,14 +3511,15 @@ old_ack:   * But, this can also be called on packets in the established flow when   * the fast version below fails.   
*/ -void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, -		       u8 **hvpp, int estab) +void tcp_parse_options(const struct sk_buff *skb, +		       struct tcp_options_received *opt_rx, int estab, +		       struct tcp_fastopen_cookie *foc)  { -	unsigned char *ptr; -	struct tcphdr *th = tcp_hdr(skb); +	const unsigned char *ptr; +	const struct tcphdr *th = tcp_hdr(skb);  	int length = (th->doff * 4) - sizeof(struct tcphdr); -	ptr = (unsigned char *)(th + 1); +	ptr = (const unsigned char *)(th + 1);  	opt_rx->saw_tstamp = 0;  	while (length > 0) { @@ -3786,10 +3556,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,  					__u8 snd_wscale = *(__u8 *)ptr;  					opt_rx->wscale_ok = 1;  					if (snd_wscale > 14) { -						if (net_ratelimit()) -							printk(KERN_INFO "tcp_parse_options: Illegal window " -							       "scaling value %d >14 received.\n", -							       snd_wscale); +						net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", +								     __func__, +								     snd_wscale);  						snd_wscale = 14;  					}  					opt_rx->snd_wscale = snd_wscale; @@ -3807,7 +3576,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,  			case TCPOPT_SACK_PERM:  				if (opsize == TCPOLEN_SACK_PERM && th->syn &&  				    !estab && sysctl_tcp_sack) { -					opt_rx->sack_ok = 1; +					opt_rx->sack_ok = TCP_SACK_SEEN;  					tcp_sack_reset(opt_rx);  				}  				break; @@ -3827,32 +3596,24 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,  				 */  				break;  #endif -			case TCPOPT_COOKIE: -				/* This option is variable length. +			case TCPOPT_EXP: +				/* Fast Open option shares code 254 using a +				 * 16 bits magic number. It's valid only in +				 * SYN or SYN-ACK with an even size.  				 
*/ -				switch (opsize) { -				case TCPOLEN_COOKIE_BASE: -					/* not yet implemented */ -					break; -				case TCPOLEN_COOKIE_PAIR: -					/* not yet implemented */ -					break; -				case TCPOLEN_COOKIE_MIN+0: -				case TCPOLEN_COOKIE_MIN+2: -				case TCPOLEN_COOKIE_MIN+4: -				case TCPOLEN_COOKIE_MIN+6: -				case TCPOLEN_COOKIE_MAX: -					/* 16-bit multiple */ -					opt_rx->cookie_plus = opsize; -					*hvpp = ptr; -					break; -				default: -					/* ignore option */ +				if (opsize < TCPOLEN_EXP_FASTOPEN_BASE || +				    get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC || +				    foc == NULL || !th->syn || (opsize & 1))  					break; -				} +				foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE; +				if (foc->len >= TCP_FASTOPEN_COOKIE_MIN && +				    foc->len <= TCP_FASTOPEN_COOKIE_MAX) +					memcpy(foc->val, ptr + 2, foc->len); +				else if (foc->len != 0) +					foc->len = -1;  				break; -			} +			}  			ptr += opsize-2;  			length -= opsize;  		} @@ -3860,9 +3621,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,  }  EXPORT_SYMBOL(tcp_parse_options); -static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) +static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)  { -	__be32 *ptr = (__be32 *)(th + 1); +	const __be32 *ptr = (const __be32 *)(th + 1);  	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)  			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { @@ -3870,41 +3631,48 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)  		++ptr;  		tp->rx_opt.rcv_tsval = ntohl(*ptr);  		++ptr; -		tp->rx_opt.rcv_tsecr = ntohl(*ptr); -		return 1; +		if (*ptr) +			tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; +		else +			tp->rx_opt.rcv_tsecr = 0; +		return true;  	} -	return 0; +	return false;  }  /* Fast parse options. This hopes to only see timestamps.   * If it is wrong it falls back on tcp_parse_options().   */ -static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, -				  struct tcp_sock *tp, u8 **hvpp) +static bool tcp_fast_parse_options(const struct sk_buff *skb, +				   const struct tcphdr *th, struct tcp_sock *tp)  {  	/* In the spirit of fast parsing, compare doff directly to constant  	 * values.  Because equality is used, short doff can be ignored here.  	 
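The new TCPOPT_EXP branch in tcp_parse_options() above recognises the experimental Fast Open option: kind 254, a 16-bit magic, then an optional cookie of 4 to 16 bytes, with odd or undersized encodings rejected. A standalone parser over one already-isolated option is sketched below; the constants are written out the way the Fast Open draft defines them, but treat the exact values as illustrative rather than authoritative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OPT_EXP			254	/* experimental option kind */
#define EXP_FASTOPEN_BASE	4	/* kind + len + 16-bit magic */
#define FASTOPEN_MAGIC		0xF989
#define FO_COOKIE_MIN		4
#define FO_COOKIE_MAX		16

/* Returns the cookie length, 0 when no cookie is present (or this is not a
 * Fast Open option), and -1 for a present but malformed cookie, mirroring
 * the foc->len handling in the hunk above.
 */
static int parse_fastopen_exp(const uint8_t *opt, int opsize, int is_syn,
			      uint8_t cookie[FO_COOKIE_MAX])
{
	if (opt[0] != OPT_EXP || !is_syn || (opsize & 1) ||
	    opsize < EXP_FASTOPEN_BASE)
		return 0;
	if (((opt[2] << 8) | opt[3]) != FASTOPEN_MAGIC)
		return 0;

	int len = opsize - EXP_FASTOPEN_BASE;

	if (len < FO_COOKIE_MIN || len > FO_COOKIE_MAX)
		return len ? -1 : 0;	/* len 0 is a bare cookie request */
	memcpy(cookie, opt + EXP_FASTOPEN_BASE, len);
	return len;
}

int main(void)
{
	/* kind 254, len 12, magic, then an 8-byte cookie */
	uint8_t opt[] = { 254, 12, 0xF9, 0x89, 1, 2, 3, 4, 5, 6, 7, 8 };
	uint8_t cookie[FO_COOKIE_MAX];

	printf("cookie len = %d\n",
	       parse_fastopen_exp(opt, sizeof(opt), 1, cookie));
	return 0;
}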
*/  	if (th->doff == (sizeof(*th) / 4)) {  		tp->rx_opt.saw_tstamp = 0; -		return 0; +		return false;  	} else if (tp->rx_opt.tstamp_ok &&  		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {  		if (tcp_parse_aligned_timestamp(tp, th)) -			return 1; +			return true;  	} -	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); -	return 1; + +	tcp_parse_options(skb, &tp->rx_opt, 1, NULL); +	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) +		tp->rx_opt.rcv_tsecr -= tp->tsoffset; + +	return true;  }  #ifdef CONFIG_TCP_MD5SIG  /*   * Parse MD5 Signature option   */ -u8 *tcp_parse_md5sig_option(struct tcphdr *th) +const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)  { -	int length = (th->doff << 2) - sizeof (*th); -	u8 *ptr = (u8*)(th + 1); +	int length = (th->doff << 2) - sizeof(*th); +	const u8 *ptr = (const u8 *)(th + 1);  	/* If the TCP option is too short, we can short cut */  	if (length < TCPOLEN_MD5SIG) @@ -3914,7 +3682,7 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)  		int opcode = *ptr++;  		int opsize; -		switch(opcode) { +		switch (opcode) {  		case TCPOPT_EOL:  			return NULL;  		case TCPOPT_NOP: @@ -3935,27 +3703,6 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)  EXPORT_SYMBOL(tcp_parse_md5sig_option);  #endif -static inline void tcp_store_ts_recent(struct tcp_sock *tp) -{ -	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; -	tp->rx_opt.ts_recent_stamp = get_seconds(); -} - -static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) -{ -	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { -		/* PAWS bug workaround wrt. ACK frames, the PAWS discard -		 * extra check below makes sure this can only happen -		 * for pure ACK frames.  -DaveM -		 * -		 * Not only, also it occurs for expired timestamps. -		 */ - -		if (tcp_paws_check(&tp->rx_opt, 0)) -			tcp_store_ts_recent(tp); -	} -} -  /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM   *   * It is not fatal. If this ACK does _not_ change critical state (seqs, window) @@ -3981,8 +3728,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)  static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)  { -	struct tcp_sock *tp = tcp_sk(sk); -	struct tcphdr *th = tcp_hdr(skb); +	const struct tcp_sock *tp = tcp_sk(sk); +	const struct tcphdr *th = tcp_hdr(skb);  	u32 seq = TCP_SKB_CB(skb)->seq;  	u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -3999,7 +3746,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)  		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);  } -static inline int tcp_paws_discard(const struct sock *sk, +static inline bool tcp_paws_discard(const struct sock *sk,  				   const struct sk_buff *skb)  {  	const struct tcp_sock *tp = tcp_sk(sk); @@ -4021,14 +3768,14 @@ static inline int tcp_paws_discard(const struct sock *sk,   * (borrowed from freebsd)   */ -static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) +static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)  {  	return	!before(end_seq, tp->rcv_wup) &&  		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));  }  /* When we get a reset we do this. */ -static void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk)  {  	/* We want the right error as BSD sees it (and indeed as we do). */  	switch (sk->sk_state) { @@ -4066,9 +3813,10 @@ static void tcp_reset(struct sock *sk)   *   *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.   
*/ -static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +static void tcp_fin(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); +	const struct dst_entry *dst;  	inet_csk_schedule_ack(sk); @@ -4080,7 +3828,9 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)  	case TCP_ESTABLISHED:  		/* Move to CLOSE_WAIT */  		tcp_set_state(sk, TCP_CLOSE_WAIT); -		inet_csk(sk)->icsk_ack.pingpong = 1; +		dst = __sk_dst_get(sk); +		if (!dst || !dst_metric(dst, RTAX_QUICKACK)) +			inet_csk(sk)->icsk_ack.pingpong = 1;  		break;  	case TCP_CLOSE_WAIT: @@ -4110,7 +3860,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)  		/* Only TCP_LISTEN and TCP_CLOSE are left, in these  		 * cases we should never reach this piece of code.  		 */ -		printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", +		pr_err("%s: Impossible, sk->sk_state=%d\n",  		       __func__, sk->sk_state);  		break;  	} @@ -4135,7 +3885,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)  	}  } -static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, +static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,  				  u32 end_seq)  {  	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { @@ -4143,9 +3893,9 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,  			sp->start_seq = seq;  		if (after(end_seq, sp->end_seq))  			sp->end_seq = end_seq; -		return 1; +		return true;  	} -	return 0; +	return false;  }  static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) @@ -4178,7 +3928,7 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)  		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);  } -static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) +static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -4292,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)  			WARN_ON(before(tp->rcv_nxt, sp->end_seq));  			/* Zap this SACK, by moving forward any other SACKS. 
*/ -			for (i=this_sack+1; i < num_sacks; i++) +			for (i = this_sack+1; i < num_sacks; i++)  				tp->selective_acks[i-1] = tp->selective_acks[i];  			num_sacks--;  			continue; @@ -4337,37 +4087,261 @@ static void tcp_ofo_queue(struct sock *sk)  		__skb_queue_tail(&sk->sk_receive_queue, skb);  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;  		if (tcp_hdr(skb)->fin) -			tcp_fin(skb, sk, tcp_hdr(skb)); +			tcp_fin(sk);  	}  } -static int tcp_prune_ofo_queue(struct sock *sk); +static bool tcp_prune_ofo_queue(struct sock *sk);  static int tcp_prune_queue(struct sock *sk); -static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, +				 unsigned int size)  {  	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || -	    !sk_rmem_schedule(sk, size)) { +	    !sk_rmem_schedule(sk, skb, size)) {  		if (tcp_prune_queue(sk) < 0)  			return -1; -		if (!sk_rmem_schedule(sk, size)) { +		if (!sk_rmem_schedule(sk, skb, size)) {  			if (!tcp_prune_ofo_queue(sk))  				return -1; -			if (!sk_rmem_schedule(sk, size)) +			if (!sk_rmem_schedule(sk, skb, size))  				return -1;  		}  	}  	return 0;  } +/** + * tcp_try_coalesce - try to merge skb to prior one + * @sk: socket + * @to: prior buffer + * @from: buffer to add in queue + * @fragstolen: pointer to boolean + * + * Before queueing skb @from after @to, try to merge them + * to reduce overall memory use and queue lengths, if cost is small. + * Packets in ofo or receive queues can stay a long time. + * Better try to coalesce them right now to avoid future collapses. + * Returns true if caller should free @from instead of queueing it + */ +static bool tcp_try_coalesce(struct sock *sk, +			     struct sk_buff *to, +			     struct sk_buff *from, +			     bool *fragstolen) +{ +	int delta; + +	*fragstolen = false; + +	if (tcp_hdr(from)->fin) +		return false; + +	/* Its possible this segment overlaps with prior segment in queue */ +	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) +		return false; + +	if (!skb_try_coalesce(to, from, fragstolen, &delta)) +		return false; + +	atomic_add(delta, &sk->sk_rmem_alloc); +	sk_mem_charge(sk, delta); +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); +	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; +	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; +	return true; +} + +static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *skb1; +	u32 seq, end_seq; + +	TCP_ECN_check_ce(tp, skb); + +	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); +		__kfree_skb(skb); +		return; +	} + +	/* Disable header prediction. */ +	tp->pred_flags = 0; +	inet_csk_schedule_ack(sk); + +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); +	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", +		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + +	skb1 = skb_peek_tail(&tp->out_of_order_queue); +	if (!skb1) { +		/* Initial out of order segment, build 1 SACK. 
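tcp_try_coalesce() above only attempts a merge under narrow conditions; the actual data merge and the truesize accounting are delegated to skb_try_coalesce(). The eligibility test alone, as a userspace sketch over a minimal stand-in segment structure:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seg {
	uint32_t seq;
	uint32_t end_seq;
	bool fin;
};

/* A new segment may be glued onto the previous one only if it carries no
 * FIN and starts exactly where the previous segment ends.
 */
static bool can_coalesce(const struct seg *to, const struct seg *from)
{
	if (from->fin)
		return false;
	return from->seq == to->end_seq;
}

int main(void)
{
	struct seg a = { .seq = 1000, .end_seq = 1500, .fin = false };
	struct seg b = { .seq = 1500, .end_seq = 2000, .fin = false };

	printf("%s\n", can_coalesce(&a, &b) ? "coalesce" : "queue separately");
	return 0;
}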
*/ +		if (tcp_is_sack(tp)) { +			tp->rx_opt.num_sacks = 1; +			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; +			tp->selective_acks[0].end_seq = +						TCP_SKB_CB(skb)->end_seq; +		} +		__skb_queue_head(&tp->out_of_order_queue, skb); +		goto end; +	} + +	seq = TCP_SKB_CB(skb)->seq; +	end_seq = TCP_SKB_CB(skb)->end_seq; + +	if (seq == TCP_SKB_CB(skb1)->end_seq) { +		bool fragstolen; + +		if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { +			__skb_queue_after(&tp->out_of_order_queue, skb1, skb); +		} else { +			tcp_grow_window(sk, skb); +			kfree_skb_partial(skb, fragstolen); +			skb = NULL; +		} + +		if (!tp->rx_opt.num_sacks || +		    tp->selective_acks[0].end_seq != seq) +			goto add_sack; + +		/* Common case: data arrive in order after hole. */ +		tp->selective_acks[0].end_seq = end_seq; +		goto end; +	} + +	/* Find place to insert this segment. */ +	while (1) { +		if (!after(TCP_SKB_CB(skb1)->seq, seq)) +			break; +		if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { +			skb1 = NULL; +			break; +		} +		skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); +	} + +	/* Do skb overlap to previous one? */ +	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { +		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { +			/* All the bits are present. Drop. */ +			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +			__kfree_skb(skb); +			skb = NULL; +			tcp_dsack_set(sk, seq, end_seq); +			goto add_sack; +		} +		if (after(seq, TCP_SKB_CB(skb1)->seq)) { +			/* Partial overlap. */ +			tcp_dsack_set(sk, seq, +				      TCP_SKB_CB(skb1)->end_seq); +		} else { +			if (skb_queue_is_first(&tp->out_of_order_queue, +					       skb1)) +				skb1 = NULL; +			else +				skb1 = skb_queue_prev( +					&tp->out_of_order_queue, +					skb1); +		} +	} +	if (!skb1) +		__skb_queue_head(&tp->out_of_order_queue, skb); +	else +		__skb_queue_after(&tp->out_of_order_queue, skb1, skb); + +	/* And clean segments covered by new one as whole. */ +	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { +		skb1 = skb_queue_next(&tp->out_of_order_queue, skb); + +		if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) +			break; +		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { +			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, +					 end_seq); +			break; +		} +		__skb_unlink(skb1, &tp->out_of_order_queue); +		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, +				 TCP_SKB_CB(skb1)->end_seq); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); +		__kfree_skb(skb1); +	} + +add_sack: +	if (tcp_is_sack(tp)) +		tcp_sack_new_ofo_skb(sk, seq, end_seq); +end: +	if (skb) { +		tcp_grow_window(sk, skb); +		skb_set_owner_r(skb, sk); +	} +} + +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, +		  bool *fragstolen) +{ +	int eaten; +	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); + +	__skb_pull(skb, hdrlen); +	eaten = (tail && +		 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 
1 : 0; +	tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; +	if (!eaten) { +		__skb_queue_tail(&sk->sk_receive_queue, skb); +		skb_set_owner_r(skb, sk); +	} +	return eaten; +} + +int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) +{ +	struct sk_buff *skb = NULL; +	struct tcphdr *th; +	bool fragstolen; + +	if (size == 0) +		return 0; + +	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); +	if (!skb) +		goto err; + +	if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) +		goto err_free; + +	th = (struct tcphdr *)skb_put(skb, sizeof(*th)); +	skb_reset_transport_header(skb); +	memset(th, 0, sizeof(*th)); + +	if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) +		goto err_free; + +	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; +	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; +	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; + +	if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { +		WARN_ON_ONCE(fragstolen); /* should not happen */ +		__kfree_skb(skb); +	} +	return size; + +err_free: +	kfree_skb(skb); +err: +	return -ENOMEM; +} +  static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)  { -	struct tcphdr *th = tcp_hdr(skb); +	const struct tcphdr *th = tcp_hdr(skb);  	struct tcp_sock *tp = tcp_sk(sk);  	int eaten = -1; +	bool fragstolen = false;  	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)  		goto drop; @@ -4400,7 +4374,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)  			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {  				tp->ucopy.len -= chunk;  				tp->copied_seq += chunk; -				eaten = (chunk == skb->len && !th->fin); +				eaten = (chunk == skb->len);  				tcp_rcv_space_adjust(sk);  			}  			local_bh_disable(); @@ -4409,17 +4383,16 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)  		if (eaten <= 0) {  queue_and_out:  			if (eaten < 0 && -			    tcp_try_rmem_schedule(sk, skb->truesize)) +			    tcp_try_rmem_schedule(sk, skb, skb->truesize))  				goto drop; -			skb_set_owner_r(skb, sk); -			__skb_queue_tail(&sk->sk_receive_queue, skb); +			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);  		}  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;  		if (skb->len)  			tcp_event_data_recv(sk, skb);  		if (th->fin) -			tcp_fin(skb, sk, th); +			tcp_fin(sk);  		if (!skb_queue_empty(&tp->out_of_order_queue)) {  			tcp_ofo_queue(sk); @@ -4437,9 +4410,9 @@ queue_and_out:  		tcp_fast_path_check(sk);  		if (eaten > 0) -			__kfree_skb(skb); -		else if (!sock_flag(sk, SOCK_DEAD)) -			sk->sk_data_ready(sk, 0); +			kfree_skb_partial(skb, fragstolen); +		if (!sock_flag(sk, SOCK_DEAD)) +			sk->sk_data_ready(sk);  		return;  	} @@ -4478,105 +4451,7 @@ drop:  		goto queue_and_out;  	} -	TCP_ECN_check_ce(tp, skb); - -	if (tcp_try_rmem_schedule(sk, skb->truesize)) -		goto drop; - -	/* Disable header prediction. */ -	tp->pred_flags = 0; -	inet_csk_schedule_ack(sk); - -	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", -		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - -	skb_set_owner_r(skb, sk); - -	if (!skb_peek(&tp->out_of_order_queue)) { -		/* Initial out of order segment, build 1 SACK. 
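tcp_send_rcvq() above takes data handed in through sendmsg() and queues it directly on the receive queue, which is what the TCP repair machinery relies on when restoring a connection. A hedged user-space sketch of how that path is typically driven; the TCP_REPAIR constants mirror linux/tcp.h and are defined as fallbacks here, and inject_rcv_data() is an illustrative helper (CAP_NET_ADMIN is required).

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_REPAIR
#define TCP_REPAIR		19
#define TCP_REPAIR_QUEUE	20
#endif
#ifndef TCP_RECV_QUEUE
#define TCP_RECV_QUEUE		1
#endif

/* Make @len bytes of @buf appear as if they had arrived from the peer. */
static int inject_rcv_data(int fd, const void *buf, size_t len)
{
	int on = 1, queue = TCP_RECV_QUEUE;

	if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)) < 0)
		return -1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0)
		return -1;
	/* send() on a socket in repair mode ends up in tcp_send_rcvq(). */
	return send(fd, buf, len, 0) == (ssize_t)len ? 0 : -1;
}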
*/ -		if (tcp_is_sack(tp)) { -			tp->rx_opt.num_sacks = 1; -			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; -			tp->selective_acks[0].end_seq = -						TCP_SKB_CB(skb)->end_seq; -		} -		__skb_queue_head(&tp->out_of_order_queue, skb); -	} else { -		struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue); -		u32 seq = TCP_SKB_CB(skb)->seq; -		u32 end_seq = TCP_SKB_CB(skb)->end_seq; - -		if (seq == TCP_SKB_CB(skb1)->end_seq) { -			__skb_queue_after(&tp->out_of_order_queue, skb1, skb); - -			if (!tp->rx_opt.num_sacks || -			    tp->selective_acks[0].end_seq != seq) -				goto add_sack; - -			/* Common case: data arrive in order after hole. */ -			tp->selective_acks[0].end_seq = end_seq; -			return; -		} - -		/* Find place to insert this segment. */ -		while (1) { -			if (!after(TCP_SKB_CB(skb1)->seq, seq)) -				break; -			if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { -				skb1 = NULL; -				break; -			} -			skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); -		} - -		/* Do skb overlap to previous one? */ -		if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { -			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { -				/* All the bits are present. Drop. */ -				__kfree_skb(skb); -				tcp_dsack_set(sk, seq, end_seq); -				goto add_sack; -			} -			if (after(seq, TCP_SKB_CB(skb1)->seq)) { -				/* Partial overlap. */ -				tcp_dsack_set(sk, seq, -					      TCP_SKB_CB(skb1)->end_seq); -			} else { -				if (skb_queue_is_first(&tp->out_of_order_queue, -						       skb1)) -					skb1 = NULL; -				else -					skb1 = skb_queue_prev( -						&tp->out_of_order_queue, -						skb1); -			} -		} -		if (!skb1) -			__skb_queue_head(&tp->out_of_order_queue, skb); -		else -			__skb_queue_after(&tp->out_of_order_queue, skb1, skb); - -		/* And clean segments covered by new one as whole. */ -		while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { -			skb1 = skb_queue_next(&tp->out_of_order_queue, skb); - -			if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) -				break; -			if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { -				tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, -						 end_seq); -				break; -			} -			__skb_unlink(skb1, &tp->out_of_order_queue); -			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, -					 TCP_SKB_CB(skb1)->end_seq); -			__kfree_skb(skb1); -		} - -add_sack: -		if (tcp_is_sack(tp)) -			tcp_sack_new_ofo_skb(sk, seq, end_seq); -	} +	tcp_data_queue_ofo(sk, skb);  }  static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4755,10 +4630,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk)   * Purge the out-of-order queue.   * Return true if queue was pruned.   */ -static int tcp_prune_ofo_queue(struct sock *sk) +static bool tcp_prune_ofo_queue(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); -	int res = 0; +	bool res = false;  	if (!skb_queue_empty(&tp->out_of_order_queue)) {  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); @@ -4772,7 +4647,7 @@ static int tcp_prune_ofo_queue(struct sock *sk)  		if (tp->rx_opt.sack_ok)  			tcp_sack_reset(&tp->rx_opt);  		sk_mem_reclaim(sk); -		res = 1; +		res = true;  	}  	return res;  } @@ -4794,7 +4669,7 @@ static int tcp_prune_queue(struct sock *sk)  	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)  		tcp_clamp_window(sk); -	else if (tcp_memory_pressure) +	else if (sk_under_memory_pressure(sk))  		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);  	tcp_collapse_ofo_queue(sk); @@ -4827,51 +4702,29 @@ static int tcp_prune_queue(struct sock *sk)  	return -1;  } -/* RFC2861, slow part. 
Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) +static bool tcp_should_expand_sndbuf(const struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); - -	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && -	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { -		/* Limited by application or receiver window. */ -		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); -		u32 win_used = max(tp->snd_cwnd_used, init_win); -		if (win_used < tp->snd_cwnd) { -			tp->snd_ssthresh = tcp_current_ssthresh(sk); -			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; -		} -		tp->snd_cwnd_used = 0; -	} -	tp->snd_cwnd_stamp = tcp_time_stamp; -} - -static int tcp_should_expand_sndbuf(struct sock *sk) -{ -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	/* If the user specified a specific send buffer setting, do  	 * not modify it.  	 */  	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) -		return 0; +		return false;  	/* If we are under global TCP memory pressure, do not expand.  */ -	if (tcp_memory_pressure) -		return 0; +	if (sk_under_memory_pressure(sk)) +		return false;  	/* If we are under soft global TCP memory pressure, do not expand.  */ -	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) -		return 0; +	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) +		return false;  	/* If we filled the congestion window, do not expand.  */  	if (tp->packets_out >= tp->snd_cwnd) -		return 0; +		return false; -	return 1; +	return true;  }  /* When incoming ACK allowed to free some skb from write_queue, @@ -4885,13 +4738,7 @@ static void tcp_new_space(struct sock *sk)  	struct tcp_sock *tp = tcp_sk(sk);  	if (tcp_should_expand_sndbuf(sk)) { -		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + -			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); -		int demanded = max_t(unsigned int, tp->snd_cwnd, -				     tp->reordering + 1); -		sndmem *= 2 * demanded; -		if (sndmem > sk->sk_sndbuf) -			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); +		tcp_sndbuf_expand(sk);  		tp->snd_cwnd_stamp = tcp_time_stamp;  	} @@ -4958,7 +4805,7 @@ static inline void tcp_ack_snd_check(struct sock *sk)   *	either form (or just set the sysctl tcp_stdurg).   */ -static void tcp_check_urg(struct sock *sk, struct tcphdr *th) +static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)  {  	struct tcp_sock *tp = tcp_sk(sk);  	u32 ptr = ntohs(th->urg_ptr); @@ -5024,7 +4871,7 @@ static void tcp_check_urg(struct sock *sk, struct tcphdr *th)  }  /* This is the 'fast' part of urgent handling. 
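tcp_should_expand_sndbuf() above now returns bool and consults the generic per-protocol memory-accounting helpers (sk_under_memory_pressure(), sk_memory_allocated()) before letting the send buffer grow. A simplified user-space model of that decision; struct sock_state and its fields are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

struct sock_state {
	bool sndbuf_locked;		/* user pinned SO_SNDBUF */
	bool mem_pressure;		/* protocol under memory pressure */
	long allocated, soft_limit;	/* pages charged vs. soft limit */
	unsigned int packets_out, snd_cwnd;
};

static bool should_expand_sndbuf(const struct sock_state *s)
{
	if (s->sndbuf_locked)
		return false;		/* honour an explicit user setting */
	if (s->mem_pressure)
		return false;		/* hard pressure: never grow */
	if (s->allocated >= s->soft_limit)
		return false;		/* soft pressure: do not grow */
	if (s->packets_out >= s->snd_cwnd)
		return false;		/* cwnd is the bottleneck, not the buffer */
	return true;
}

int main(void)
{
	struct sock_state s = { .allocated = 10, .soft_limit = 100,
				.packets_out = 3, .snd_cwnd = 10 };

	printf("expand sndbuf? %s\n", should_expand_sndbuf(&s) ? "yes" : "no");
	return 0;
}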
*/ -static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) +static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -5044,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)  				BUG();  			tp->urg_data = TCP_URG_VALID | tmp;  			if (!sock_flag(sk, SOCK_DEAD)) -				sk->sk_data_ready(sk, 0); +				sk->sk_data_ready(sk);  		}  	}  } @@ -5087,7 +4934,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,  	return result;  } -static inline int tcp_checksum_complete_user(struct sock *sk, +static inline bool tcp_checksum_complete_user(struct sock *sk,  					     struct sk_buff *skb)  {  	return !skb_csum_unnecessary(skb) && @@ -5095,19 +4942,19 @@ static inline int tcp_checksum_complete_user(struct sock *sk,  }  #ifdef CONFIG_NET_DMA -static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, +static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,  				  int hlen)  {  	struct tcp_sock *tp = tcp_sk(sk);  	int chunk = skb->len - hlen;  	int dma_cookie; -	int copied_early = 0; +	bool copied_early = false;  	if (tp->ucopy.wakeup) -		return 0; +		return false;  	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) -		tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); +		tp->ucopy.dma_chan = net_dma_find_channel();  	if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { @@ -5120,7 +4967,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,  			goto out;  		tp->ucopy.dma_cookie = dma_cookie; -		copied_early = 1; +		copied_early = true;  		tp->ucopy.len -= chunk;  		tp->copied_seq += chunk; @@ -5130,11 +4977,11 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,  		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||  		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {  			tp->ucopy.wakeup = 1; -			sk->sk_data_ready(sk, 0); +			sk->sk_data_ready(sk);  		}  	} else if (chunk > 0) {  		tp->ucopy.wakeup = 1; -		sk->sk_data_ready(sk, 0); +		sk->sk_data_ready(sk);  	}  out:  	return copied_early; @@ -5144,15 +4991,13 @@ out:  /* Does PAWS and seqno based validation of an incoming segment, flags will   * play significant role here.   */ -static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, -			      struct tcphdr *th, int syn_inerr) +static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, +				  const struct tcphdr *th, int syn_inerr)  { -	u8 *hash_location;  	struct tcp_sock *tp = tcp_sk(sk);  	/* RFC1323: H1. Apply PAWS check first. */ -	if (tcp_fast_parse_options(skb, th, tp, &hash_location) && -	    tp->rx_opt.saw_tstamp && +	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&  	    tcp_paws_discard(sk, skb)) {  		if (!th->rst) {  			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5170,38 +5015,48 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,  		 * an acknowledgment should be sent in reply (unless the RST  		 * bit is set, if so drop the segment and return)".  		 
*/ -		if (!th->rst) +		if (!th->rst) { +			if (th->syn) +				goto syn_challenge;  			tcp_send_dupack(sk, skb); +		}  		goto discard;  	}  	/* Step 2: check RST bit */  	if (th->rst) { -		tcp_reset(sk); +		/* RFC 5961 3.2 : +		 * If sequence number exactly matches RCV.NXT, then +		 *     RESET the connection +		 * else +		 *     Send a challenge ACK +		 */ +		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) +			tcp_reset(sk); +		else +			tcp_send_challenge_ack(sk);  		goto discard;  	} -	/* ts_recent update must be made after we are sure that the packet -	 * is in window. -	 */ -	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); -  	/* step 3: check security and precedence [ignored] */ -	/* step 4: Check for a SYN in window. */ -	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { +	/* step 4: Check for a SYN +	 * RFC 5691 4.2 : Send a challenge ack +	 */ +	if (th->syn) { +syn_challenge:  		if (syn_inerr)  			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); -		tcp_reset(sk); -		return -1; +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); +		tcp_send_challenge_ack(sk); +		goto discard;  	} -	return 1; +	return true;  discard:  	__kfree_skb(skb); -	return 0; +	return false;  }  /* @@ -5227,12 +5082,13 @@ discard:   *	the rest is checked inline. Fast processing is turned on in   *	tcp_data_queue when everything is OK.   */ -int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, -			struct tcphdr *th, unsigned len) +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, +			 const struct tcphdr *th, unsigned int len)  {  	struct tcp_sock *tp = tcp_sk(sk); -	int res; +	if (unlikely(sk->sk_rx_dst == NULL)) +		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);  	/*  	 *	Header prediction.  	 *	The code loosely follows the one in the famous @@ -5304,7 +5160,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,  				tcp_ack(sk, skb, 0);  				__kfree_skb(skb);  				tcp_data_snd_check(sk); -				return 0; +				return;  			} else { /* Header too small */  				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);  				goto discard; @@ -5312,11 +5168,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,  		} else {  			int eaten = 0;  			int copied_early = 0; +			bool fragstolen = false;  			if (tp->copied_seq == tp->rcv_nxt &&  			    len - tcp_header_len <= tp->ucopy.len) {  #ifdef CONFIG_NET_DMA -				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { +				if (tp->ucopy.task == current && +				    sock_owned_by_user(sk) && +				    tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {  					copied_early = 1;  					eaten = 1;  				} @@ -5352,6 +5211,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,  				if (tcp_checksum_complete_user(sk, skb))  					goto csum_error; +				if ((int)skb->truesize > sk->sk_forward_alloc) +					goto step5; +  				/* Predicted packet is in window by definition.  				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.  				 
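tcp_validate_incoming() above adopts RFC 5961: a RST only tears the connection down when its sequence number equals RCV.NXT, and an in-window SYN now draws a challenge ACK rather than a reset. A small illustrative model of those two decisions (plain C, not the kernel code):

#include <stdio.h>

enum verdict { RESET_CONN, SEND_CHALLENGE_ACK };

/* RFC 5961 3.2: only an exact RCV.NXT match resets the connection;
 * any other acceptable sequence number gets a challenge ACK. */
static enum verdict handle_rst(unsigned int seg_seq, unsigned int rcv_nxt)
{
	return seg_seq == rcv_nxt ? RESET_CONN : SEND_CHALLENGE_ACK;
}

/* RFC 5961 4.2: never reset on an in-window SYN, challenge it instead. */
static enum verdict handle_in_window_syn(void)
{
	return SEND_CHALLENGE_ACK;
}

int main(void)
{
	printf("rst@rcv_nxt -> %d, rst elsewhere -> %d, syn -> %d\n",
	       handle_rst(1000, 1000), handle_rst(999, 1000),
	       handle_in_window_syn());
	return 0;
}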
* Hence, check seq<=rcv_wup reduces to: @@ -5363,16 +5225,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,  				tcp_rcv_rtt_measure_ts(sk, skb); -				if ((int)skb->truesize > sk->sk_forward_alloc) -					goto step5; -  				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);  				/* Bulk data transfer: receiver */ -				__skb_pull(skb, tcp_header_len); -				__skb_queue_tail(&sk->sk_receive_queue, skb); -				skb_set_owner_r(skb, sk); -				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; +				eaten = tcp_queue_rcv(sk, skb, tcp_header_len, +						      &fragstolen);  			}  			tcp_event_data_recv(sk, skb); @@ -5394,10 +5251,9 @@ no_ack:  			else  #endif  			if (eaten) -				__kfree_skb(skb); -			else -				sk->sk_data_ready(sk, 0); -			return 0; +				kfree_skb_partial(skb, fragstolen); +			sk->sk_data_ready(sk); +			return;  		}  	} @@ -5405,16 +5261,18 @@ slow_path:  	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))  		goto csum_error; +	if (!th->ack && !th->rst) +		goto discard; +  	/*  	 *	Standard slow path.  	 */ -	res = tcp_validate_incoming(sk, skb, th, 1); -	if (res <= 0) -		return -res; +	if (!tcp_validate_incoming(sk, skb, th, 1)) +		return;  step5: -	if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) +	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)  		goto discard;  	tcp_rcv_rtt_measure_ts(sk, skb); @@ -5427,27 +5285,113 @@ step5:  	tcp_data_snd_check(sk);  	tcp_ack_snd_check(sk); -	return 0; +	return;  csum_error: +	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);  discard:  	__kfree_skb(skb); -	return 0;  }  EXPORT_SYMBOL(tcp_rcv_established); +void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct inet_connection_sock *icsk = inet_csk(sk); + +	tcp_set_state(sk, TCP_ESTABLISHED); + +	if (skb != NULL) { +		icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); +		security_inet_conn_established(sk, skb); +	} + +	/* Make sure socket is routed, for correct metrics.  */ +	icsk->icsk_af_ops->rebuild_header(sk); + +	tcp_init_metrics(sk); + +	tcp_init_congestion_control(sk); + +	/* Prevent spurious tcp_cwnd_restart() on first data +	 * packet. +	 */ +	tp->lsndtime = tcp_time_stamp; + +	tcp_init_buffer_space(sk); + +	if (sock_flag(sk, SOCK_KEEPOPEN)) +		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + +	if (!tp->rx_opt.snd_wscale) +		__tcp_fast_path_on(tp, tp->snd_wnd); +	else +		tp->pred_flags = 0; + +	if (!sock_flag(sk, SOCK_DEAD)) { +		sk->sk_state_change(sk); +		sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); +	} +} + +static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, +				    struct tcp_fastopen_cookie *cookie) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; +	u16 mss = tp->rx_opt.mss_clamp; +	bool syn_drop; + +	if (mss == tp->rx_opt.user_mss) { +		struct tcp_options_received opt; + +		/* Get original SYNACK MSS value if user MSS sets mss_clamp */ +		tcp_clear_options(&opt); +		opt.user_mss = opt.mss_clamp = 0; +		tcp_parse_options(synack, &opt, 0, NULL); +		mss = opt.mss_clamp; +	} + +	if (!tp->syn_fastopen)  /* Ignore an unsolicited cookie */ +		cookie->len = -1; + +	/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably +	 * the remote receives only the retransmitted (regular) SYNs: either +	 * the original SYN-data or the corresponding SYN-ACK is lost. 
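tcp_rcv_fastopen_synack() above implements the client half of TCP Fast Open: it checks whether the SYN-ACK acknowledged the data sent with the SYN and retransmits that data if not. A hedged sketch of how the user-facing side of the feature is normally exercised; MSG_FASTOPEN and connect_with_data() are assumptions about the surrounding API, not part of this diff.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN	0x20000000
#endif

/* Carry @len bytes of @buf in the SYN (or right after the handshake
 * when no Fast Open cookie for this peer is cached yet). */
static ssize_t connect_with_data(int fd, const struct sockaddr_in *dst,
				 const void *buf, size_t len)
{
	return sendto(fd, buf, len, MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}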
+	 */ +	syn_drop = (cookie->len <= 0 && data && tp->total_retrans); + +	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop); + +	if (data) { /* Retransmit unacked data in SYN */ +		tcp_for_write_queue_from(data, sk) { +			if (data == tcp_send_head(sk) || +			    __tcp_retransmit_skb(sk, data)) +				break; +		} +		tcp_rearm_rto(sk); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); +		return true; +	} +	tp->syn_data_acked = tp->syn_data; +	if (tp->syn_data_acked) +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); +	return false; +} +  static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, -					 struct tcphdr *th, unsigned len) +					 const struct tcphdr *th, unsigned int len)  { -	u8 *hash_location;  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk); -	struct tcp_cookie_values *cvp = tp->cookie_values; +	struct tcp_fastopen_cookie foc = { .len = -1 };  	int saved_clamp = tp->rx_opt.mss_clamp; -	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); +	tcp_parse_options(skb, &tp->rx_opt, 0, &foc); +	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) +		tp->rx_opt.rcv_tsecr -= tp->tsoffset;  	if (th->ack) {  		/* rfc793: @@ -5457,11 +5401,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,  		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send  		 *        a reset (unless the RST bit is set, if so drop  		 *        the segment and return)" -		 * -		 *  We do not send data with SYN, so that RFC-correct -		 *  test reduces to:  		 */ -		if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) +		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || +		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))  			goto reset_and_undo;  		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -5503,7 +5445,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,  		TCP_ECN_rcv_synack(tp, th); -		tp->snd_wl1 = TCP_SKB_CB(skb)->seq; +		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);  		tcp_ack(sk, skb, FLAG_SLOWPATH);  		/* Ok.. it's good. Set up sequence numbers and @@ -5516,7 +5458,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,  		 * never scaled.  		 */  		tp->snd_wnd = ntohs(th->window); -		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);  		if (!tp->rx_opt.wscale_ok) {  			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; @@ -5545,61 +5486,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,  		 * is initialized. */  		tp->copied_seq = tp->rcv_nxt; -		if (cvp != NULL && -		    cvp->cookie_pair_size > 0 && -		    tp->rx_opt.cookie_plus > 0) { -			int cookie_size = tp->rx_opt.cookie_plus -					- TCPOLEN_COOKIE_BASE; -			int cookie_pair_size = cookie_size -					     + cvp->cookie_desired; - -			/* A cookie extension option was sent and returned. -			 * Note that each incoming SYNACK replaces the -			 * Responder cookie.  The initial exchange is most -			 * fragile, as protection against spoofing relies -			 * entirely upon the sequence and timestamp (above). -			 * This replacement strategy allows the correct pair to -			 * pass through, while any others will be filtered via -			 * Responder verification later. 
-			 */ -			if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { -				memcpy(&cvp->cookie_pair[cvp->cookie_desired], -				       hash_location, cookie_size); -				cvp->cookie_pair_size = cookie_pair_size; -			} -		} -  		smp_mb(); -		tcp_set_state(sk, TCP_ESTABLISHED); -		security_inet_conn_established(sk, skb); - -		/* Make sure socket is routed, for correct metrics.  */ -		icsk->icsk_af_ops->rebuild_header(sk); - -		tcp_init_metrics(sk); - -		tcp_init_congestion_control(sk); - -		/* Prevent spurious tcp_cwnd_restart() on first data -		 * packet. -		 */ -		tp->lsndtime = tcp_time_stamp; - -		tcp_init_buffer_space(sk); +		tcp_finish_connect(sk, skb); -		if (sock_flag(sk, SOCK_KEEPOPEN)) -			inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); - -		if (!tp->rx_opt.snd_wscale) -			__tcp_fast_path_on(tp, tp->snd_wnd); -		else -			tp->pred_flags = 0; - -		if (!sock_flag(sk, SOCK_DEAD)) { -			sk->sk_state_change(sk); -			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); -		} +		if ((tp->syn_fastopen || tp->syn_data) && +		    tcp_rcv_fastopen_synack(sk, skb, &foc)) +			return -1;  		if (sk->sk_write_pending ||  		    icsk->icsk_accept_queue.rskq_defer_accept || @@ -5613,8 +5506,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,  			 */  			inet_csk_schedule_ack(sk);  			icsk->icsk_ack.lrcvtime = tcp_time_stamp; -			icsk->icsk_ack.ato	 = TCP_ATO_MIN; -			tcp_incr_quickack(sk);  			tcp_enter_quickack_mode(sk);  			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,  						  TCP_DELACK_MAX, TCP_RTO_MAX); @@ -5680,7 +5571,9 @@ discard:  		tcp_send_synack(sk);  #if 0  		/* Note, we could accept data and URG from this segment. -		 * There are no obstacles to make this. +		 * There are no obstacles to make this (except that we must +		 * either change tcp_recvmsg() to prevent it from returning data +		 * before 3WHS completes per RFC793, or employ TCP Fast Open).  		 *  		 * However, if we ignore data in ACKless segments sometimes,  		 * we have no reasons to accept it sometimes. 
@@ -5716,12 +5609,14 @@ reset_and_undo:   */  int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, -			  struct tcphdr *th, unsigned len) +			  const struct tcphdr *th, unsigned int len)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct inet_connection_sock *icsk = inet_csk(sk); +	struct request_sock *req;  	int queued = 0; -	int res; +	bool acceptable; +	u32 synack_stamp;  	tp->rx_opt.saw_tstamp = 0; @@ -5737,6 +5632,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,  			goto discard;  		if (th->syn) { +			if (th->fin) +				goto discard;  			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)  				return 1; @@ -5774,124 +5671,167 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,  		return 0;  	} -	res = tcp_validate_incoming(sk, skb, th, 0); -	if (res <= 0) -		return -res; +	req = tp->fastopen_rsk; +	if (req != NULL) { +		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && +		    sk->sk_state != TCP_FIN_WAIT1); + +		if (tcp_check_req(sk, skb, req, NULL, true) == NULL) +			goto discard; +	} + +	if (!th->ack && !th->rst) +		goto discard; + +	if (!tcp_validate_incoming(sk, skb, th, 0)) +		return 0;  	/* step 5: check the ACK field */ -	if (th->ack) { -		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; - -		switch (sk->sk_state) { -		case TCP_SYN_RECV: -			if (acceptable) { -				tp->copied_seq = tp->rcv_nxt; -				smp_mb(); -				tcp_set_state(sk, TCP_ESTABLISHED); -				sk->sk_state_change(sk); - -				/* Note, that this wakeup is only for marginal -				 * crossed SYN case. Passively open sockets -				 * are not waked up, because sk->sk_sleep == -				 * NULL and sk->sk_socket == NULL. -				 */ -				if (sk->sk_socket) -					sk_wake_async(sk, -						      SOCK_WAKE_IO, POLL_OUT); - -				tp->snd_una = TCP_SKB_CB(skb)->ack_seq; -				tp->snd_wnd = ntohs(th->window) << -					      tp->rx_opt.snd_wscale; -				tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - -				/* tcp_ack considers this ACK as duplicate -				 * and does not calculate rtt. -				 * Force it here. -				 */ -				tcp_ack_update_rtt(sk, 0, 0); +	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | +				      FLAG_UPDATE_TS_RECENT) > 0; -				if (tp->rx_opt.tstamp_ok) -					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; +	switch (sk->sk_state) { +	case TCP_SYN_RECV: +		if (!acceptable) +			return 1; -				/* Make sure socket is routed, for -				 * correct metrics. -				 */ -				icsk->icsk_af_ops->rebuild_header(sk); +		/* Once we leave TCP_SYN_RECV, we no longer need req +		 * so release it. +		 */ +		if (req) { +			synack_stamp = tcp_rsk(req)->snt_synack; +			tp->total_retrans = req->num_retrans; +			reqsk_fastopen_remove(sk, req, false); +		} else { +			synack_stamp = tp->lsndtime; +			/* Make sure socket is routed, for correct metrics. */ +			icsk->icsk_af_ops->rebuild_header(sk); +			tcp_init_congestion_control(sk); + +			tcp_mtup_init(sk); +			tp->copied_seq = tp->rcv_nxt; +			tcp_init_buffer_space(sk); +		} +		smp_mb(); +		tcp_set_state(sk, TCP_ESTABLISHED); +		sk->sk_state_change(sk); -				tcp_init_metrics(sk); +		/* Note, that this wakeup is only for marginal crossed SYN case. +		 * Passively open sockets are not waked up, because +		 * sk->sk_sleep == NULL and sk->sk_socket == NULL. 
+		 */ +		if (sk->sk_socket) +			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); -				tcp_init_congestion_control(sk); +		tp->snd_una = TCP_SKB_CB(skb)->ack_seq; +		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; +		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); +		tcp_synack_rtt_meas(sk, synack_stamp); -				/* Prevent spurious tcp_cwnd_restart() on -				 * first data packet. -				 */ -				tp->lsndtime = tcp_time_stamp; +		if (tp->rx_opt.tstamp_ok) +			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; -				tcp_mtup_init(sk); -				tcp_initialize_rcv_mss(sk); -				tcp_init_buffer_space(sk); -				tcp_fast_path_on(tp); -			} else { +		if (req) { +			/* Re-arm the timer because data may have been sent out. +			 * This is similar to the regular data transmission case +			 * when new data has just been ack'ed. +			 * +			 * (TFO) - we could try to be more aggressive and +			 * retransmitting any data sooner based on when they +			 * are sent out. +			 */ +			tcp_rearm_rto(sk); +		} else +			tcp_init_metrics(sk); + +		tcp_update_pacing_rate(sk); + +		/* Prevent spurious tcp_cwnd_restart() on first data packet */ +		tp->lsndtime = tcp_time_stamp; + +		tcp_initialize_rcv_mss(sk); +		tcp_fast_path_on(tp); +		break; + +	case TCP_FIN_WAIT1: { +		struct dst_entry *dst; +		int tmo; + +		/* If we enter the TCP_FIN_WAIT1 state and we are a +		 * Fast Open socket and this is the first acceptable +		 * ACK we have received, this would have acknowledged +		 * our SYNACK so stop the SYNACK timer. +		 */ +		if (req != NULL) { +			/* Return RST if ack_seq is invalid. +			 * Note that RFC793 only says to generate a +			 * DUPACK for it but for TCP Fast Open it seems +			 * better to treat this case like TCP_SYN_RECV +			 * above. +			 */ +			if (!acceptable)  				return 1; -			} +			/* We no longer need the request sock. */ +			reqsk_fastopen_remove(sk, req, false); +			tcp_rearm_rto(sk); +		} +		if (tp->snd_una != tp->write_seq)  			break; -		case TCP_FIN_WAIT1: -			if (tp->snd_una == tp->write_seq) { -				tcp_set_state(sk, TCP_FIN_WAIT2); -				sk->sk_shutdown |= SEND_SHUTDOWN; -				dst_confirm(__sk_dst_get(sk)); - -				if (!sock_flag(sk, SOCK_DEAD)) -					/* Wake up lingering close() */ -					sk->sk_state_change(sk); -				else { -					int tmo; - -					if (tp->linger2 < 0 || -					    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && -					     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { -						tcp_done(sk); -						NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); -						return 1; -					} +		tcp_set_state(sk, TCP_FIN_WAIT2); +		sk->sk_shutdown |= SEND_SHUTDOWN; -					tmo = tcp_fin_time(sk); -					if (tmo > TCP_TIMEWAIT_LEN) { -						inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); -					} else if (th->fin || sock_owned_by_user(sk)) { -						/* Bad case. We could lose such FIN otherwise. -						 * It is not a big problem, but it looks confusing -						 * and not so rare event. We still can lose it now, -						 * if it spins in bh_lock_sock(), but it is really -						 * marginal case. 
-						 */ -						inet_csk_reset_keepalive_timer(sk, tmo); -					} else { -						tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); -						goto discard; -					} -				} -			} -			break; +		dst = __sk_dst_get(sk); +		if (dst) +			dst_confirm(dst); -		case TCP_CLOSING: -			if (tp->snd_una == tp->write_seq) { -				tcp_time_wait(sk, TCP_TIME_WAIT, 0); -				goto discard; -			} +		if (!sock_flag(sk, SOCK_DEAD)) { +			/* Wake up lingering close() */ +			sk->sk_state_change(sk);  			break; +		} -		case TCP_LAST_ACK: -			if (tp->snd_una == tp->write_seq) { -				tcp_update_metrics(sk); -				tcp_done(sk); -				goto discard; -			} -			break; +		if (tp->linger2 < 0 || +		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && +		     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { +			tcp_done(sk); +			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); +			return 1;  		} -	} else -		goto discard; + +		tmo = tcp_fin_time(sk); +		if (tmo > TCP_TIMEWAIT_LEN) { +			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); +		} else if (th->fin || sock_owned_by_user(sk)) { +			/* Bad case. We could lose such FIN otherwise. +			 * It is not a big problem, but it looks confusing +			 * and not so rare event. We still can lose it now, +			 * if it spins in bh_lock_sock(), but it is really +			 * marginal case. +			 */ +			inet_csk_reset_keepalive_timer(sk, tmo); +		} else { +			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); +			goto discard; +		} +		break; +	} + +	case TCP_CLOSING: +		if (tp->snd_una == tp->write_seq) { +			tcp_time_wait(sk, TCP_TIME_WAIT, 0); +			goto discard; +		} +		break; + +	case TCP_LAST_ACK: +		if (tp->snd_una == tp->write_seq) { +			tcp_update_metrics(sk); +			tcp_done(sk); +			goto discard; +		} +		break; +	}  	/* step 6: check the URG bit */  	tcp_urg(sk, skb, th); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dd555051ec8..77cccda1ad0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -50,6 +50,7 @@   *					a single port at the same time.   */ +#define pr_fmt(fmt) "TCP: " fmt  #include <linux/bottom_half.h>  #include <linux/types.h> @@ -72,6 +73,9 @@  #include <net/timewait_sock.h>  #include <net/xfrm.h>  #include <net/netdma.h> +#include <net/secure_seq.h> +#include <net/tcp_memcontrol.h> +#include <net/busy_poll.h>  #include <linux/inet.h>  #include <linux/ipv6.h> @@ -88,22 +92,14 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency);  #ifdef CONFIG_TCP_MD5SIG -static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, -						   __be32 addr); -static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, -			       __be32 daddr, __be32 saddr, struct tcphdr *th); -#else -static inline -struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) -{ -	return NULL; -} +static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, +			       __be32 daddr, __be32 saddr, const struct tcphdr *th);  #endif  struct inet_hashinfo tcp_hashinfo;  EXPORT_SYMBOL(tcp_hashinfo); -static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) +static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)  {  	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,  					  ip_hdr(skb)->saddr, @@ -146,13 +142,15 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique);  /* This will initiate an outgoing connection. 
*/  int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  { +	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;  	struct inet_sock *inet = inet_sk(sk);  	struct tcp_sock *tp = tcp_sk(sk); -	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; -	struct rtable *rt; +	__be16 orig_sport, orig_dport;  	__be32 daddr, nexthop; -	int tmp; +	struct flowi4 *fl4; +	struct rtable *rt;  	int err; +	struct ip_options_rcu *inet_opt;  	if (addr_len < sizeof(struct sockaddr_in))  		return -EINVAL; @@ -161,20 +159,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  		return -EAFNOSUPPORT;  	nexthop = daddr = usin->sin_addr.s_addr; -	if (inet->opt && inet->opt->srr) { +	inet_opt = rcu_dereference_protected(inet->inet_opt, +					     sock_owned_by_user(sk)); +	if (inet_opt && inet_opt->opt.srr) {  		if (!daddr)  			return -EINVAL; -		nexthop = inet->opt->faddr; -	} - -	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, -			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, -			       IPPROTO_TCP, -			       inet->inet_sport, usin->sin_port, sk, 1); -	if (tmp < 0) { -		if (tmp == -ENETUNREACH) -			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); -		return tmp; +		nexthop = inet_opt->opt.faddr; +	} + +	orig_sport = inet->inet_sport; +	orig_dport = usin->sin_port; +	fl4 = &inet->cork.fl.u.ip4; +	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, +			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, +			      IPPROTO_TCP, +			      orig_sport, orig_dport, sk); +	if (IS_ERR(rt)) { +		err = PTR_ERR(rt); +		if (err == -ENETUNREACH) +			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); +		return err;  	}  	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { @@ -182,44 +186,31 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  		return -ENETUNREACH;  	} -	if (!inet->opt || !inet->opt->srr) -		daddr = rt->rt_dst; +	if (!inet_opt || !inet_opt->opt.srr) +		daddr = fl4->daddr;  	if (!inet->inet_saddr) -		inet->inet_saddr = rt->rt_src; +		inet->inet_saddr = fl4->saddr;  	inet->inet_rcv_saddr = inet->inet_saddr;  	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {  		/* Reset inherited state */  		tp->rx_opt.ts_recent	   = 0;  		tp->rx_opt.ts_recent_stamp = 0; -		tp->write_seq		   = 0; +		if (likely(!tp->repair)) +			tp->write_seq	   = 0;  	}  	if (tcp_death_row.sysctl_tw_recycle && -	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { -		struct inet_peer *peer = rt_get_peer(rt); -		/* -		 * VJ's idea. We save last timestamp seen from -		 * the destination in peer table, when entering state -		 * TIME-WAIT * and initialize rx_opt.ts_recent from it, -		 * when trying new connection. 
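The reworked tcp_v4_connect() above checks routing results with IS_ERR()/PTR_ERR(): ip_route_connect() and ip_route_newports() now return either a valid rtable or a small negative errno encoded in the pointer itself. A user-space model of that convention; err_ptr(), is_err() and ptr_err() are invented names mirroring the kernel helpers.

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *err_ptr(long err)		{ return (void *)err; }
static inline long ptr_err(const void *p)	{ return (long)p; }
static inline int is_err(const void *p)
{
	/* The top 4095 pointer values are reserved for errno codes. */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct route { int ifindex; };

static struct route *route_connect(int reachable)
{
	static struct route r = { .ifindex = 2 };

	return reachable ? &r : err_ptr(-ENETUNREACH);
}

int main(void)
{
	struct route *rt = route_connect(0);

	if (is_err(rt))
		printf("routing failed: %ld\n", ptr_err(rt));
	else
		printf("route via ifindex %d\n", rt->ifindex);
	return 0;
}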
-		 */ -		if (peer) { -			inet_peer_refcheck(peer); -			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { -				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; -				tp->rx_opt.ts_recent = peer->tcp_ts; -			} -		} -	} +	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) +		tcp_fetch_timewait_stamp(sk, &rt->dst);  	inet->inet_dport = usin->sin_port;  	inet->inet_daddr = daddr;  	inet_csk(sk)->icsk_ext_hdr_len = 0; -	if (inet->opt) -		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; +	if (inet_opt) +		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;  	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; @@ -233,16 +224,18 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  	if (err)  		goto failure; -	err = ip_route_newports(&rt, IPPROTO_TCP, -				inet->inet_sport, inet->inet_dport, sk); -	if (err) +	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, +			       inet->inet_sport, inet->inet_dport, sk); +	if (IS_ERR(rt)) { +		err = PTR_ERR(rt); +		rt = NULL;  		goto failure; - +	}  	/* OK, now commit destination to socket.  */  	sk->sk_gso_type = SKB_GSO_TCPV4;  	sk_setup_caps(sk, &rt->dst); -	if (!tp->write_seq) +	if (!tp->write_seq && likely(!tp->repair))  		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,  							   inet->inet_daddr,  							   inet->inet_sport, @@ -251,6 +244,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)  	inet->inet_id = tp->write_seq ^ jiffies;  	err = tcp_connect(sk); +  	rt = NULL;  	if (err)  		goto failure; @@ -271,31 +265,20 @@ failure:  EXPORT_SYMBOL(tcp_v4_connect);  /* - * This routine does path mtu discovery as defined in RFC1191. + * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. + * It can be called through tcp_release_cb() if socket was owned by user + * at the time tcp_v4_err() was called to handle ICMP message.   */ -static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) +static void tcp_v4_mtu_reduced(struct sock *sk)  {  	struct dst_entry *dst;  	struct inet_sock *inet = inet_sk(sk); +	u32 mtu = tcp_sk(sk)->mtu_info; -	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs -	 * send out by Linux are always <576bytes so they should go through -	 * unfragmented). -	 */ -	if (sk->sk_state == TCP_LISTEN) -		return; - -	/* We don't check in the destentry if pmtu discovery is forbidden -	 * on this route. We just assume that no packet_to_big packets -	 * are send back when pmtu discovery is not active. -	 * There is a small race when the user changes this flag in the -	 * route, but I think that's acceptable. -	 */ -	if ((dst = __sk_dst_check(sk, 0)) == NULL) +	dst = inet_csk_update_pmtu(sk, mtu); +	if (!dst)  		return; -	dst->ops->update_pmtu(dst, mtu); -  	/* Something is about to be wrong... Remember soft error  	 * for the case, if this connection will not able to recover.  	 
*/ @@ -305,6 +288,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)  	mtu = dst_mtu(dst);  	if (inet->pmtudisc != IP_PMTUDISC_DONT && +	    ip_sk_accept_pmtu(sk) &&  	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {  		tcp_sync_mss(sk, mtu); @@ -317,6 +301,14 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)  	} /* else let the usual retransmit timer handle it */  } +static void do_redirect(struct sk_buff *skb, struct sock *sk) +{ +	struct dst_entry *dst = __sk_dst_check(sk, 0); + +	if (dst) +		dst->ops->redirect(dst, sk, skb); +} +  /*   * This routine is called by the ICMP module when it gets some   * sort of error condition.  If err < 0 then the socket should @@ -335,7 +327,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)  void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  { -	struct iphdr *iph = (struct iphdr *)icmp_skb->data; +	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;  	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));  	struct inet_connection_sock *icsk;  	struct tcp_sock *tp; @@ -344,7 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  	const int code = icmp_hdr(icmp_skb)->code;  	struct sock *sk;  	struct sk_buff *skb; -	__u32 seq; +	struct request_sock *fastopen; +	__u32 seq, snd_una;  	__u32 remaining;  	int err;  	struct net *net = dev_net(icmp_skb->dev); @@ -368,10 +361,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  	bh_lock_sock(sk);  	/* If too many ICMPs get dropped on busy  	 * servers this needs to be solved differently. +	 * We do take care of PMTU discovery (RFC1191) special case : +	 * we can receive locally generated ICMP messages while socket is held.  	 */ -	if (sock_owned_by_user(sk)) -		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); - +	if (sock_owned_by_user(sk)) { +		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) +			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); +	}  	if (sk->sk_state == TCP_CLOSE)  		goto out; @@ -383,13 +379,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  	icsk = inet_csk(sk);  	tp = tcp_sk(sk);  	seq = ntohl(th->seq); +	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ +	fastopen = tp->fastopen_rsk; +	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;  	if (sk->sk_state != TCP_LISTEN && -	    !between(seq, tp->snd_una, tp->snd_nxt)) { +	    !between(seq, snd_una, tp->snd_nxt)) {  		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);  		goto out;  	}  	switch (type) { +	case ICMP_REDIRECT: +		do_redirect(icmp_skb, sk); +		goto out;  	case ICMP_SOURCE_QUENCH:  		/* Just silently ignore these. */  		goto out; @@ -401,8 +403,20 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  			goto out;  		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ -			if (!sock_owned_by_user(sk)) -				do_pmtu_discovery(sk, iph, info); +			/* We are not interested in TCP_LISTEN and open_requests +			 * (SYN-ACKs send out by Linux are always <576bytes so +			 * they should go through unfragmented). 
+			 */ +			if (sk->sk_state == TCP_LISTEN) +				goto out; + +			tp->mtu_info = info; +			if (!sock_owned_by_user(sk)) { +				tcp_v4_mtu_reduced(sk); +			} else { +				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) +					sock_hold(sk); +			}  			goto out;  		} @@ -412,15 +426,15 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)  			break;  		if (seq != tp->snd_una  || !icsk->icsk_retransmits || -		    !icsk->icsk_backoff) +		    !icsk->icsk_backoff || fastopen)  			break;  		if (sock_owned_by_user(sk))  			break;  		icsk->icsk_backoff--; -		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << -					 icsk->icsk_backoff; +		inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) : +			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;  		tcp_bound_rto(sk);  		skb = tcp_write_queue_head(sk); @@ -474,12 +488,17 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  		 * errors returned from accept().  		 */  		inet_csk_reqsk_queue_drop(sk, req, prev); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);  		goto out;  	case TCP_SYN_SENT: -	case TCP_SYN_RECV:  /* Cannot happen. -			       It can f.e. if SYNs crossed. -			     */ +	case TCP_SYN_RECV: +		/* Only in fast or simultaneous open. If a fast open socket is +		 * is already accepted it is treated as a connected one below. +		 */ +		if (fastopen && fastopen->sk == NULL) +			break; +  		if (!sock_owned_by_user(sk)) {  			sk->sk_err = err; @@ -521,8 +540,7 @@ out:  	sock_put(sk);  } -static void __tcp_v4_send_check(struct sk_buff *skb, -				__be32 saddr, __be32 daddr) +void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)  {  	struct tcphdr *th = tcp_hdr(skb); @@ -541,29 +559,12 @@ static void __tcp_v4_send_check(struct sk_buff *skb,  /* This routine computes an IPv4 TCP checksum. */  void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)  { -	struct inet_sock *inet = inet_sk(sk); +	const struct inet_sock *inet = inet_sk(sk);  	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);  }  EXPORT_SYMBOL(tcp_v4_send_check); -int tcp_v4_gso_send_check(struct sk_buff *skb) -{ -	const struct iphdr *iph; -	struct tcphdr *th; - -	if (!pskb_may_pull(skb, sizeof(*th))) -		return -EINVAL; - -	iph = ip_hdr(skb); -	th = tcp_hdr(skb); - -	th->check = 0; -	skb->ip_summed = CHECKSUM_PARTIAL; -	__tcp_v4_send_check(skb, iph->saddr, iph->daddr); -	return 0; -} -  /*   *	This routine will send an RST to the other tcp.   * @@ -579,7 +580,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)  static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)  { -	struct tcphdr *th = tcp_hdr(skb); +	const struct tcphdr *th = tcp_hdr(skb);  	struct {  		struct tcphdr th;  #ifdef CONFIG_TCP_MD5SIG @@ -589,6 +590,10 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)  	struct ip_reply_arg arg;  #ifdef CONFIG_TCP_MD5SIG  	struct tcp_md5sig_key *key; +	const __u8 *hash_location = NULL; +	unsigned char newhash[16]; +	int genhash; +	struct sock *sk1 = NULL;  #endif  	struct net *net; @@ -619,7 +624,37 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)  	arg.iov[0].iov_len  = sizeof(rep.th);  #ifdef CONFIG_TCP_MD5SIG -	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; +	hash_location = tcp_parse_md5sig_option(th); +	if (!sk && hash_location) { +		/* +		 * active side is lost. Try to find listening socket through +		 * source port, and then find md5 key through listening socket. 
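When the ICMP handler above finds the socket owned by user context, it stores the new MTU in tp->mtu_info and sets TCP_MTU_REDUCED_DEFERRED so the work runs later from tcp_release_cb() instead of racing with the lock owner. A simplified, single-threaded model of that defer-until-release pattern; the types and helpers below are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define MTU_REDUCED_DEFERRED	0x1u

struct fake_sock {
	bool owned_by_user;	/* stands in for sock_owned_by_user() */
	unsigned int mtu_info;	/* last MTU reported by ICMP */
	unsigned int deferred;	/* stands in for the tsq_flags bits */
};

static void mtu_reduced(struct fake_sock *sk)
{
	printf("syncing MSS to an MTU of %u\n", sk->mtu_info);
}

/* ICMP_FRAG_NEEDED handling: act now, or flag the work for later. */
static void frag_needed(struct fake_sock *sk, unsigned int mtu)
{
	sk->mtu_info = mtu;
	if (!sk->owned_by_user)
		mtu_reduced(sk);
	else
		sk->deferred |= MTU_REDUCED_DEFERRED;	/* kernel: test_and_set_bit() */
}

/* Runs once the lock owner releases the socket (cf. tcp_release_cb()). */
static void release_sock_cb(struct fake_sock *sk)
{
	if (sk->deferred & MTU_REDUCED_DEFERRED) {
		sk->deferred &= ~MTU_REDUCED_DEFERRED;
		mtu_reduced(sk);
	}
}

int main(void)
{
	struct fake_sock sk = { .owned_by_user = true };

	frag_needed(&sk, 1400);		/* deferred: user owns the socket */
	sk.owned_by_user = false;
	release_sock_cb(&sk);		/* the deferred work runs here */
	return 0;
}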
+		 * we are not loose security here: +		 * Incoming packet is checked with md5 hash with finding key, +		 * no RST generated if md5 hash doesn't match. +		 */ +		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), +					     &tcp_hashinfo, ip_hdr(skb)->saddr, +					     th->source, ip_hdr(skb)->daddr, +					     ntohs(th->source), inet_iif(skb)); +		/* don't send rst if it can't find key */ +		if (!sk1) +			return; +		rcu_read_lock(); +		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) +					&ip_hdr(skb)->saddr, AF_INET); +		if (!key) +			goto release_sk1; + +		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb); +		if (genhash || memcmp(hash_location, newhash, 16) != 0) +			goto release_sk1; +	} else { +		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *) +					     &ip_hdr(skb)->saddr, +					     AF_INET) : NULL; +	} +  	if (key) {  		rep.opt[0] = htonl((TCPOPT_NOP << 24) |  				   (TCPOPT_NOP << 16) | @@ -639,13 +674,28 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)  				      arg.iov[0].iov_len, IPPROTO_TCP, 0);  	arg.csumoffset = offsetof(struct tcphdr, check) / 2;  	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; +	/* When socket is gone, all binding information is lost. +	 * routing might fail in this case. No choice here, if we choose to force +	 * input interface, we will misroute in case of asymmetric route. +	 */ +	if (sk) +		arg.bound_dev_if = sk->sk_bound_dev_if;  	net = dev_net(skb_dst(skb)->dev); -	ip_send_reply(net->ipv4.tcp_sock, skb, -		      &arg, arg.iov[0].iov_len); +	arg.tos = ip_hdr(skb)->tos; +	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, +			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);  	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);  	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + +#ifdef CONFIG_TCP_MD5SIG +release_sk1: +	if (sk1) { +		rcu_read_unlock(); +		sock_put(sk1); +	} +#endif  }  /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states @@ -653,11 +703,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)   */  static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, -			    u32 win, u32 ts, int oif, +			    u32 win, u32 tsval, u32 tsecr, int oif,  			    struct tcp_md5sig_key *key, -			    int reply_flags) +			    int reply_flags, u8 tos)  { -	struct tcphdr *th = tcp_hdr(skb); +	const struct tcphdr *th = tcp_hdr(skb);  	struct {  		struct tcphdr th;  		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) @@ -674,12 +724,12 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,  	arg.iov[0].iov_base = (unsigned char *)&rep;  	arg.iov[0].iov_len  = sizeof(rep.th); -	if (ts) { +	if (tsecr) {  		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |  				   (TCPOPT_TIMESTAMP << 8) |  				   TCPOLEN_TIMESTAMP); -		rep.opt[1] = htonl(tcp_time_stamp); -		rep.opt[2] = htonl(ts); +		rep.opt[1] = htonl(tsval); +		rep.opt[2] = htonl(tsecr);  		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;  	} @@ -694,7 +744,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,  #ifdef CONFIG_TCP_MD5SIG  	if (key) { -		int offset = (ts) ? 3 : 0; +		int offset = (tsecr) ? 
3 : 0;  		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |  					  (TCPOPT_NOP << 16) | @@ -715,9 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,  	arg.csumoffset = offsetof(struct tcphdr, check) / 2;  	if (oif)  		arg.bound_dev_if = oif; - -	ip_send_reply(net->ipv4.tcp_sock, skb, -		      &arg, arg.iov[0].iov_len); +	arg.tos = tos; +	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, +			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);  	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);  } @@ -729,10 +779,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)  	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,  			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, +			tcp_time_stamp + tcptw->tw_ts_offset,  			tcptw->tw_ts_recent,  			tw->tw_bound_dev_if,  			tcp_twsk_md5_key(tcptw), -			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 +			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, +			tw->tw_tos  			);  	inet_twsk_put(tw); @@ -741,12 +793,19 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)  static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,  				  struct request_sock *req)  { -	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, -			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, +	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV +	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. +	 */ +	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? +			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, +			tcp_rsk(req)->rcv_nxt, req->rcv_wnd, +			tcp_time_stamp,  			req->ts_recent,  			0, -			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), -			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); +			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, +					  AF_INET), +			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, +			ip_hdr(skb)->tos);  }  /* @@ -756,36 +815,44 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,   */  static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,  			      struct request_sock *req, -			      struct request_values *rvp) +			      u16 queue_mapping, +			      struct tcp_fastopen_cookie *foc)  {  	const struct inet_request_sock *ireq = inet_rsk(req); +	struct flowi4 fl4;  	int err = -1; -	struct sk_buff * skb; +	struct sk_buff *skb;  	/* First, grab a route. 
*/ -	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) +	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)  		return -1; -	skb = tcp_make_synack(sk, dst, req, rvp); +	skb = tcp_make_synack(sk, dst, req, foc);  	if (skb) { -		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); +		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); -		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, -					    ireq->rmt_addr, +		skb_set_queue_mapping(skb, queue_mapping); +		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, +					    ireq->ir_rmt_addr,  					    ireq->opt);  		err = net_xmit_eval(err); +		if (!tcp_rsk(req)->snt_synack && !err) +			tcp_rsk(req)->snt_synack = tcp_time_stamp;  	} -	dst_release(dst);  	return err;  } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, -			      struct request_values *rvp) +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)  { -	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); -	return tcp_v4_send_synack(sk, NULL, req, rvp); +	int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL); + +	if (!res) { +		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); +	} +	return res;  }  /* @@ -796,35 +863,50 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)  	kfree(inet_rsk(req)->opt);  } -static void syn_flood_warning(const struct sk_buff *skb) +/* + * Return true if a syncookie should be sent + */ +bool tcp_syn_flood_action(struct sock *sk, +			 const struct sk_buff *skb, +			 const char *proto)  { -	const char *msg; +	const char *msg = "Dropping request"; +	bool want_cookie = false; +	struct listen_sock *lopt;  #ifdef CONFIG_SYN_COOKIES -	if (sysctl_tcp_syncookies) +	if (sysctl_tcp_syncookies) {  		msg = "Sending cookies"; -	else +		want_cookie = true; +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); +	} else  #endif -		msg = "Dropping request"; +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); -	pr_info("TCP: Possible SYN flooding on port %d. %s.\n", -				ntohs(tcp_hdr(skb)->dest), msg); +	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; +	if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { +		lopt->synflood_warned = 1; +		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n", +			proto, ntohs(tcp_hdr(skb)->dest), msg); +	} +	return want_cookie;  } +EXPORT_SYMBOL(tcp_syn_flood_action);  /*   * Save and compile IPv4 options into the request_sock if needed.   */ -static struct ip_options *tcp_v4_save_options(struct sock *sk, -					      struct sk_buff *skb) +static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)  { -	struct ip_options *opt = &(IPCB(skb)->opt); -	struct ip_options *dopt = NULL; +	const struct ip_options *opt = &(IPCB(skb)->opt); +	struct ip_options_rcu *dopt = NULL;  	if (opt && opt->optlen) { -		int opt_size = optlength(opt); +		int opt_size = sizeof(*dopt) + opt->optlen; +  		dopt = kmalloc(opt_size, GFP_ATOMIC);  		if (dopt) { -			if (ip_options_echo(dopt, skb)) { +			if (ip_options_echo(&dopt->opt, skb)) {  				kfree(dopt);  				dopt = NULL;  			} @@ -841,150 +923,129 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,   */  /* Find the Key structure for an address.  
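tcp_syn_flood_action() above only answers a full accept queue with SYN cookies when the tcp_syncookies sysctl allows it, and warns only once per listening socket. A small usage sketch that flips the knob through the usual procfs path; set_syncookies() is an illustrative helper.

#include <stdio.h>

static int set_syncookies(int val)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	return set_syncookies(1) ? 1 : 0;
}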
*/ -static struct tcp_md5sig_key * -			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) +struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +					 const union tcp_md5_addr *addr, +					 int family)  {  	struct tcp_sock *tp = tcp_sk(sk); -	int i; - -	if (!tp->md5sig_info || !tp->md5sig_info->entries4) +	struct tcp_md5sig_key *key; +	unsigned int size = sizeof(struct in_addr); +	struct tcp_md5sig_info *md5sig; + +	/* caller either holds rcu_read_lock() or socket lock */ +	md5sig = rcu_dereference_check(tp->md5sig_info, +				       sock_owned_by_user(sk) || +				       lockdep_is_held(&sk->sk_lock.slock)); +	if (!md5sig)  		return NULL; -	for (i = 0; i < tp->md5sig_info->entries4; i++) { -		if (tp->md5sig_info->keys4[i].addr == addr) -			return &tp->md5sig_info->keys4[i].base; +#if IS_ENABLED(CONFIG_IPV6) +	if (family == AF_INET6) +		size = sizeof(struct in6_addr); +#endif +	hlist_for_each_entry_rcu(key, &md5sig->head, node) { +		if (key->family != family) +			continue; +		if (!memcmp(&key->addr, addr, size)) +			return key;  	}  	return NULL;  } +EXPORT_SYMBOL(tcp_md5_do_lookup);  struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,  					 struct sock *addr_sk)  { -	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); +	union tcp_md5_addr *addr; + +	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr; +	return tcp_md5_do_lookup(sk, addr, AF_INET);  }  EXPORT_SYMBOL(tcp_v4_md5_lookup);  static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,  						      struct request_sock *req)  { -	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr); +	union tcp_md5_addr *addr; + +	addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr; +	return tcp_md5_do_lookup(sk, addr, AF_INET);  }  /* This can be called on a newly created socket, from other files */ -int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, -		      u8 *newkey, u8 newkeylen) +int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, +		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)  {  	/* Add Key to the list */  	struct tcp_md5sig_key *key;  	struct tcp_sock *tp = tcp_sk(sk); -	struct tcp4_md5sig_key *keys; +	struct tcp_md5sig_info *md5sig; -	key = tcp_v4_md5_do_lookup(sk, addr); +	key = tcp_md5_do_lookup(sk, addr, family);  	if (key) {  		/* Pre-existing entry - just update that one. 
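tcp_md5_do_add() above is the kernel side of the TCP_MD5SIG socket option (RFC 2385); it replaces the old per-family key arrays with a single RCU hlist keyed by address and family. A hedged usage sketch of installing a key from user space; install_md5_key() is illustrative and assumes struct tcp_md5sig as declared in linux/tcp.h.

#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int install_md5_key(int fd, const struct sockaddr_in *peer,
			   const void *key, size_t keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));	/* peer to protect */
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}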
*/ -		kfree(key->key); -		key->key = newkey; +		memcpy(key->key, newkey, newkeylen);  		key->keylen = newkeylen; -	} else { -		struct tcp_md5sig_info *md5sig; - -		if (!tp->md5sig_info) { -			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), -						  GFP_ATOMIC); -			if (!tp->md5sig_info) { -				kfree(newkey); -				return -ENOMEM; -			} -			sk_nocaps_add(sk, NETIF_F_GSO_MASK); -		} -		if (tcp_alloc_md5sig_pool(sk) == NULL) { -			kfree(newkey); +		return 0; +	} + +	md5sig = rcu_dereference_protected(tp->md5sig_info, +					   sock_owned_by_user(sk)); +	if (!md5sig) { +		md5sig = kmalloc(sizeof(*md5sig), gfp); +		if (!md5sig)  			return -ENOMEM; -		} -		md5sig = tp->md5sig_info; - -		if (md5sig->alloced4 == md5sig->entries4) { -			keys = kmalloc((sizeof(*keys) * -					(md5sig->entries4 + 1)), GFP_ATOMIC); -			if (!keys) { -				kfree(newkey); -				tcp_free_md5sig_pool(); -				return -ENOMEM; -			} -			if (md5sig->entries4) -				memcpy(keys, md5sig->keys4, -				       sizeof(*keys) * md5sig->entries4); +		sk_nocaps_add(sk, NETIF_F_GSO_MASK); +		INIT_HLIST_HEAD(&md5sig->head); +		rcu_assign_pointer(tp->md5sig_info, md5sig); +	} -			/* Free old key list, and reference new one */ -			kfree(md5sig->keys4); -			md5sig->keys4 = keys; -			md5sig->alloced4++; -		} -		md5sig->entries4++; -		md5sig->keys4[md5sig->entries4 - 1].addr        = addr; -		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey; -		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen; +	key = sock_kmalloc(sk, sizeof(*key), gfp); +	if (!key) +		return -ENOMEM; +	if (!tcp_alloc_md5sig_pool()) { +		sock_kfree_s(sk, key, sizeof(*key)); +		return -ENOMEM;  	} + +	memcpy(key->key, newkey, newkeylen); +	key->keylen = newkeylen; +	key->family = family; +	memcpy(&key->addr, addr, +	       (family == AF_INET6) ? 
sizeof(struct in6_addr) : +				      sizeof(struct in_addr)); +	hlist_add_head_rcu(&key->node, &md5sig->head);  	return 0;  } -EXPORT_SYMBOL(tcp_v4_md5_do_add); +EXPORT_SYMBOL(tcp_md5_do_add); -static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, -			       u8 *newkey, u8 newkeylen) +int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)  { -	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr, -				 newkey, newkeylen); -} +	struct tcp_md5sig_key *key; -int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) -{ -	struct tcp_sock *tp = tcp_sk(sk); -	int i; - -	for (i = 0; i < tp->md5sig_info->entries4; i++) { -		if (tp->md5sig_info->keys4[i].addr == addr) { -			/* Free the key */ -			kfree(tp->md5sig_info->keys4[i].base.key); -			tp->md5sig_info->entries4--; - -			if (tp->md5sig_info->entries4 == 0) { -				kfree(tp->md5sig_info->keys4); -				tp->md5sig_info->keys4 = NULL; -				tp->md5sig_info->alloced4 = 0; -			} else if (tp->md5sig_info->entries4 != i) { -				/* Need to do some manipulation */ -				memmove(&tp->md5sig_info->keys4[i], -					&tp->md5sig_info->keys4[i+1], -					(tp->md5sig_info->entries4 - i) * -					 sizeof(struct tcp4_md5sig_key)); -			} -			tcp_free_md5sig_pool(); -			return 0; -		} -	} -	return -ENOENT; +	key = tcp_md5_do_lookup(sk, addr, family); +	if (!key) +		return -ENOENT; +	hlist_del_rcu(&key->node); +	atomic_sub(sizeof(*key), &sk->sk_omem_alloc); +	kfree_rcu(key, rcu); +	return 0;  } -EXPORT_SYMBOL(tcp_v4_md5_do_del); +EXPORT_SYMBOL(tcp_md5_do_del); -static void tcp_v4_clear_md5_list(struct sock *sk) +static void tcp_clear_md5_list(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_md5sig_key *key; +	struct hlist_node *n; +	struct tcp_md5sig_info *md5sig; -	/* Free each key, then the set of key keys, -	 * the crypto element, and then decrement our -	 * hold on the last resort crypto. 
-	 */ -	if (tp->md5sig_info->entries4) { -		int i; -		for (i = 0; i < tp->md5sig_info->entries4; i++) -			kfree(tp->md5sig_info->keys4[i].base.key); -		tp->md5sig_info->entries4 = 0; -		tcp_free_md5sig_pool(); -	} -	if (tp->md5sig_info->keys4) { -		kfree(tp->md5sig_info->keys4); -		tp->md5sig_info->keys4 = NULL; -		tp->md5sig_info->alloced4  = 0; +	md5sig = rcu_dereference_protected(tp->md5sig_info, 1); + +	hlist_for_each_entry_safe(key, n, &md5sig->head, node) { +		hlist_del_rcu(&key->node); +		atomic_sub(sizeof(*key), &sk->sk_omem_alloc); +		kfree_rcu(key, rcu);  	}  } @@ -993,7 +1054,6 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,  {  	struct tcp_md5sig cmd;  	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; -	u8 *newkey;  	if (optlen < sizeof(cmd))  		return -EINVAL; @@ -1004,32 +1064,16 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,  	if (sin->sin_family != AF_INET)  		return -EINVAL; -	if (!cmd.tcpm_key || !cmd.tcpm_keylen) { -		if (!tcp_sk(sk)->md5sig_info) -			return -ENOENT; -		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr); -	} +	if (!cmd.tcpm_key || !cmd.tcpm_keylen) +		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, +				      AF_INET);  	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)  		return -EINVAL; -	if (!tcp_sk(sk)->md5sig_info) { -		struct tcp_sock *tp = tcp_sk(sk); -		struct tcp_md5sig_info *p; - -		p = kzalloc(sizeof(*p), sk->sk_allocation); -		if (!p) -			return -EINVAL; - -		tp->md5sig_info = p; -		sk_nocaps_add(sk, NETIF_F_GSO_MASK); -	} - -	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation); -	if (!newkey) -		return -ENOMEM; -	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr, -				 newkey, cmd.tcpm_keylen); +	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, +			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, +			      GFP_KERNEL);  }  static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, @@ -1055,8 +1099,8 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,  	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));  } -static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, -			       __be32 daddr, __be32 saddr, struct tcphdr *th) +static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, +			       __be32 daddr, __be32 saddr, const struct tcphdr *th)  {  	struct tcp_md5sig_pool *hp;  	struct hash_desc *desc; @@ -1088,20 +1132,20 @@ clear_hash_noput:  }  int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, -			struct sock *sk, struct request_sock *req, -			struct sk_buff *skb) +			const struct sock *sk, const struct request_sock *req, +			const struct sk_buff *skb)  {  	struct tcp_md5sig_pool *hp;  	struct hash_desc *desc; -	struct tcphdr *th = tcp_hdr(skb); +	const struct tcphdr *th = tcp_hdr(skb);  	__be32 saddr, daddr;  	if (sk) {  		saddr = inet_sk(sk)->inet_saddr;  		daddr = inet_sk(sk)->inet_daddr;  	} else if (req) { -		saddr = inet_rsk(req)->loc_addr; -		daddr = inet_rsk(req)->rmt_addr; +		saddr = inet_rsk(req)->ir_loc_addr; +		daddr = inet_rsk(req)->ir_rmt_addr;  	} else {  		const struct iphdr *iph = ip_hdr(skb);  		saddr = iph->saddr; @@ -1138,7 +1182,7 @@ clear_hash_noput:  }  EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) +static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)  {  	/*  	 * This gets called for each TCP segment that arrives @@ 
-1148,28 +1192,29 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)  	 * o MD5 hash and we're not expecting one.  	 * o MD5 hash and its wrong.  	 */ -	__u8 *hash_location = NULL; +	const __u8 *hash_location = NULL;  	struct tcp_md5sig_key *hash_expected;  	const struct iphdr *iph = ip_hdr(skb); -	struct tcphdr *th = tcp_hdr(skb); +	const struct tcphdr *th = tcp_hdr(skb);  	int genhash;  	unsigned char newhash[16]; -	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr); +	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, +					  AF_INET);  	hash_location = tcp_parse_md5sig_option(th);  	/* We've parsed the options - do we have a hash? */  	if (!hash_expected && !hash_location) -		return 0; +		return false;  	if (hash_expected && !hash_location) {  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); -		return 1; +		return true;  	}  	if (!hash_expected && hash_location) {  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); -		return 1; +		return true;  	}  	/* Okay, so this is hash_expected and hash_location - @@ -1180,15 +1225,14 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)  				      NULL, NULL, skb);  	if (genhash || memcmp(hash_location, newhash, 16) != 0) { -		if (net_ratelimit()) { -			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", -			       &iph->saddr, ntohs(th->source), -			       &iph->daddr, ntohs(th->dest), -			       genhash ? " tcp_v4_calc_md5_hash failed" : ""); -		} -		return 1; +		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", +				     &iph->saddr, ntohs(th->source), +				     &iph->daddr, ntohs(th->dest), +				     genhash ? " tcp_v4_calc_md5_hash failed" +				     : ""); +		return true;  	} -	return 0; +	return false;  }  #endif @@ -1212,9 +1256,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {  int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  { -	struct tcp_extend_values tmp_ext;  	struct tcp_options_received tmp_opt; -	u8 *hash_location;  	struct request_sock *req;  	struct inet_request_sock *ireq;  	struct tcp_sock *tp = tcp_sk(sk); @@ -1222,11 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  	__be32 saddr = ip_hdr(skb)->saddr;  	__be32 daddr = ip_hdr(skb)->daddr;  	__u32 isn = TCP_SKB_CB(skb)->when; -#ifdef CONFIG_SYN_COOKIES -	int want_cookie = 0; -#else -#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ -#endif +	bool want_cookie = false, fastopen; +	struct flowi4 fl4; +	struct tcp_fastopen_cookie foc = { .len = -1 }; +	int err;  	/* Never answer to SYNs send to broadcast or multicast */  	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1236,15 +1277,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  	 * limitations, they conserve resources and peer is  	 * evidently real one.  	 */ -	if (inet_csk_reqsk_queue_is_full(sk) && !isn) { -		if (net_ratelimit()) -			syn_flood_warning(skb); -#ifdef CONFIG_SYN_COOKIES -		if (sysctl_tcp_syncookies) { -			want_cookie = 1; -		} else -#endif -		goto drop; +	if ((sysctl_tcp_syncookies == 2 || +	     inet_csk_reqsk_queue_is_full(sk)) && !isn) { +		want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); +		if (!want_cookie) +			goto drop;  	}  	/* Accept backlog is full. If we have already queued enough @@ -1252,8 +1289,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  	 * clogging syn queue with openreqs with exponentially increasing  	 * timeout.  	 
*/ -	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) +	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);  		goto drop; +	}  	req = inet_reqsk_alloc(&tcp_request_sock_ops);  	if (!req) @@ -1266,43 +1305,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  	tcp_clear_options(&tmp_opt);  	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;  	tmp_opt.user_mss  = tp->rx_opt.user_mss; -	tcp_parse_options(skb, &tmp_opt, &hash_location, 0); - -	if (tmp_opt.cookie_plus > 0 && -	    tmp_opt.saw_tstamp && -	    !tp->rx_opt.cookie_out_never && -	    (sysctl_tcp_cookie_size > 0 || -	     (tp->cookie_values != NULL && -	      tp->cookie_values->cookie_desired > 0))) { -		u8 *c; -		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; -		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; - -		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) -			goto drop_and_release; - -		/* Secret recipe starts with IP addresses */ -		*mess++ ^= (__force u32)daddr; -		*mess++ ^= (__force u32)saddr; - -		/* plus variable length Initiator Cookie */ -		c = (u8 *)mess; -		while (l-- > 0) -			*c++ ^= *hash_location++; - -#ifdef CONFIG_SYN_COOKIES -		want_cookie = 0;	/* not our kind of cookie */ -#endif -		tmp_ext.cookie_out_never = 0; /* false */ -		tmp_ext.cookie_plus = tmp_opt.cookie_plus; -	} else if (!tp->rx_opt.cookie_in_always) { -		/* redundant indications, but ensure initialization. */ -		tmp_ext.cookie_out_never = 1; /* true */ -		tmp_ext.cookie_plus = 0; -	} else { -		goto drop_and_release; -	} -	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; +	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);  	if (want_cookie && !tmp_opt.saw_tstamp)  		tcp_clear_options(&tmp_opt); @@ -1311,23 +1314,22 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  	tcp_openreq_init(req, &tmp_opt, skb);  	ireq = inet_rsk(req); -	ireq->loc_addr = daddr; -	ireq->rmt_addr = saddr; +	ireq->ir_loc_addr = daddr; +	ireq->ir_rmt_addr = saddr;  	ireq->no_srccheck = inet_sk(sk)->transparent; -	ireq->opt = tcp_v4_save_options(sk, skb); +	ireq->opt = tcp_v4_save_options(skb); +	ireq->ir_mark = inet_request_mark(sk, skb);  	if (security_inet_conn_request(sk, skb, req))  		goto drop_and_free;  	if (!want_cookie || tmp_opt.tstamp_ok) -		TCP_ECN_create_request(req, tcp_hdr(skb)); +		TCP_ECN_create_request(req, skb, sock_net(sk));  	if (want_cookie) {  		isn = cookie_v4_init_sequence(sk, skb, &req->mss);  		req->cookie_ts = tmp_opt.tstamp_ok;  	} else if (!isn) { -		struct inet_peer *peer = NULL; -  		/* VJ's idea. 
We save last timestamp seen  		 * from the destination in peer table, when entering  		 * state TIME-WAIT, and check against it before @@ -1339,13 +1341,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  		 */  		if (tmp_opt.saw_tstamp &&  		    tcp_death_row.sysctl_tw_recycle && -		    (dst = inet_csk_route_req(sk, req)) != NULL && -		    (peer = rt_get_peer((struct rtable *)dst)) != NULL && -		    peer->daddr.a4 == saddr) { -			inet_peer_refcheck(peer); -			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && -			    (s32)(peer->tcp_ts - req->ts_recent) > -							TCP_PAWS_WINDOW) { +		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && +		    fl4.daddr == saddr) { +			if (!tcp_peer_is_proven(req, dst, true)) {  				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);  				goto drop_and_release;  			} @@ -1354,8 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  		else if (!sysctl_tcp_syncookies &&  			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <  			  (sysctl_max_syn_backlog >> 2)) && -			 (!peer || !peer->tcp_ts_stamp) && -			 (!dst || !dst_metric(dst, RTAX_RTT))) { +			 !tcp_peer_is_proven(req, dst, false)) {  			/* Without syncookies last quarter of  			 * backlog is filled with destinations,  			 * proven to be alive. @@ -1363,21 +1360,32 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)  			 * to destinations, already remembered  			 * to the moment of synflood.  			 */ -			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n", +			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),  				       &saddr, ntohs(tcp_hdr(skb)->source));  			goto drop_and_release;  		}  		isn = tcp_v4_init_sequence(skb);  	} +	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) +		goto drop_and_free; +  	tcp_rsk(req)->snt_isn = isn; +	tcp_rsk(req)->snt_synack = tcp_time_stamp; +	tcp_openreq_init_rwin(req, sk, dst); +	fastopen = !want_cookie && +		   tcp_try_fastopen(sk, skb, req, &foc, dst); +	err = tcp_v4_send_synack(sk, dst, req, +				 skb_get_queue_mapping(skb), &foc); +	if (!fastopen) { +		if (err || want_cookie) +			goto drop_and_free; -	if (tcp_v4_send_synack(sk, dst, req, -			       (struct request_values *)&tmp_ext) || -	    want_cookie) -		goto drop_and_free; +		tcp_rsk(req)->snt_synack = tcp_time_stamp; +		tcp_rsk(req)->listener = NULL; +		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); +	} -	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);  	return 0;  drop_and_release: @@ -1385,6 +1393,7 @@ drop_and_release:  drop_and_free:  	reqsk_free(req);  drop: +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);  	return 0;  }  EXPORT_SYMBOL(tcp_v4_conn_request); @@ -1405,38 +1414,46 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,  #ifdef CONFIG_TCP_MD5SIG  	struct tcp_md5sig_key *key;  #endif +	struct ip_options_rcu *inet_opt;  	if (sk_acceptq_is_full(sk))  		goto exit_overflow; -	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) -		goto exit; -  	newsk = tcp_create_openreq_child(sk, req, skb);  	if (!newsk)  		goto exit_nonewsk;  	newsk->sk_gso_type = SKB_GSO_TCPV4; -	sk_setup_caps(newsk, dst); +	inet_sk_rx_dst_set(newsk, skb);  	newtp		      = tcp_sk(newsk);  	newinet		      = inet_sk(newsk);  	ireq		      = inet_rsk(req); -	newinet->inet_daddr   = ireq->rmt_addr; -	newinet->inet_rcv_saddr = ireq->loc_addr; -	newinet->inet_saddr	      = ireq->loc_addr; -	newinet->opt	      = ireq->opt; +	newinet->inet_daddr   = 
ireq->ir_rmt_addr; +	newinet->inet_rcv_saddr = ireq->ir_loc_addr; +	newinet->inet_saddr	      = ireq->ir_loc_addr; +	inet_opt	      = ireq->opt; +	rcu_assign_pointer(newinet->inet_opt, inet_opt);  	ireq->opt	      = NULL;  	newinet->mc_index     = inet_iif(skb);  	newinet->mc_ttl	      = ip_hdr(skb)->ttl; +	newinet->rcv_tos      = ip_hdr(skb)->tos;  	inet_csk(newsk)->icsk_ext_hdr_len = 0; -	if (newinet->opt) -		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; +	if (inet_opt) +		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;  	newinet->inet_id = newtp->write_seq ^ jiffies; -	tcp_mtup_init(newsk); +	if (!dst) { +		dst = inet_csk_route_child_sock(sk, newsk, req); +		if (!dst) +			goto put_and_exit; +	} else { +		/* syncookie case : see end of cookie_v4_check() */ +	} +	sk_setup_caps(newsk, dst); +  	tcp_sync_mss(newsk, dst_mtu(dst)); -	newtp->advmss = dst_metric(dst, RTAX_ADVMSS); +	newtp->advmss = dst_metric_advmss(dst);  	if (tcp_sk(sk)->rx_opt.user_mss &&  	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)  		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; @@ -1445,7 +1462,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,  #ifdef CONFIG_TCP_MD5SIG  	/* Copy over the MD5 key from the original socket */ -	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr); +	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, +				AF_INET);  	if (key != NULL) {  		/*  		 * We're using one, so create a matching key @@ -1453,18 +1471,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,  		 * memory, then we end up not copying the key  		 * across. Shucks.  		 */ -		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); -		if (newkey != NULL) -			tcp_v4_md5_do_add(newsk, newinet->inet_daddr, -					  newkey, key->keylen); +		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, +			       AF_INET, key->key, key->keylen, GFP_ATOMIC);  		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);  	}  #endif -	if (__inet_inherit_port(sk, newsk) < 0) { -		sock_put(newsk); -		goto exit; -	} +	if (__inet_inherit_port(sk, newsk) < 0) +		goto put_and_exit;  	__inet_hash_nolisten(newsk, NULL);  	return newsk; @@ -1476,6 +1490,10 @@ exit_nonewsk:  exit:  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);  	return NULL; +put_and_exit: +	inet_csk_prepare_forced_close(newsk); +	tcp_done(newsk); +	goto exit;  }  EXPORT_SYMBOL(tcp_v4_syn_recv_sock); @@ -1489,7 +1507,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)  	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,  						       iph->saddr, iph->daddr);  	if (req) -		return tcp_check_req(sk, skb, req, prev); +		return tcp_check_req(sk, skb, req, prev, false);  	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,  			th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -1510,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)  	return sk;  } -static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) -{ -	const struct iphdr *iph = ip_hdr(skb); - -	if (skb->ip_summed == CHECKSUM_COMPLETE) { -		if (!tcp_v4_check(skb->len, iph->saddr, -				  iph->daddr, skb->csum)) { -			skb->ip_summed = CHECKSUM_UNNECESSARY; -			return 0; -		} -	} - -	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, -				       skb->len, IPPROTO_TCP, 0); - -	if (skb->len <= 76) { -		return __skb_checksum_complete(skb); -	} -	return 0; -} - -  /* The socket must have it's spinlock held when we get   * here.   
* @@ -1555,13 +1551,17 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)  #endif  	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ -		sock_rps_save_rxhash(sk, skb->rxhash); -		TCP_CHECK_TIMER(sk); -		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { -			rsk = sk; -			goto reset; +		struct dst_entry *dst = sk->sk_rx_dst; + +		sock_rps_save_rxhash(sk, skb); +		if (dst) { +			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || +			    dst->ops->check(dst, 0) == NULL) { +				dst_release(dst); +				sk->sk_rx_dst = NULL; +			}  		} -		TCP_CHECK_TIMER(sk); +		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);  		return 0;  	} @@ -1574,6 +1574,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)  			goto discard;  		if (nsk != sk) { +			sock_rps_save_rxhash(nsk, skb);  			if (tcp_child_process(sk, nsk, skb)) {  				rsk = nsk;  				goto reset; @@ -1581,15 +1582,12 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)  			return 0;  		}  	} else -		sock_rps_save_rxhash(sk, skb->rxhash); +		sock_rps_save_rxhash(sk, skb); - -	TCP_CHECK_TIMER(sk);  	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {  		rsk = sk;  		goto reset;  	} -	TCP_CHECK_TIMER(sk);  	return 0;  reset: @@ -1604,11 +1602,94 @@ discard:  	return 0;  csum_err: +	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);  	goto discard;  }  EXPORT_SYMBOL(tcp_v4_do_rcv); +void tcp_v4_early_demux(struct sk_buff *skb) +{ +	const struct iphdr *iph; +	const struct tcphdr *th; +	struct sock *sk; + +	if (skb->pkt_type != PACKET_HOST) +		return; + +	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) +		return; + +	iph = ip_hdr(skb); +	th = tcp_hdr(skb); + +	if (th->doff < sizeof(struct tcphdr) / 4) +		return; + +	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, +				       iph->saddr, th->source, +				       iph->daddr, ntohs(th->dest), +				       skb->skb_iif); +	if (sk) { +		skb->sk = sk; +		skb->destructor = sock_edemux; +		if (sk->sk_state != TCP_TIME_WAIT) { +			struct dst_entry *dst = sk->sk_rx_dst; + +			if (dst) +				dst = dst_check(dst, 0); +			if (dst && +			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) +				skb_dst_set_noref(skb, dst); +		} +	} +} + +/* Packet is added to VJ-style prequeue for processing in process + * context, if a reader task is waiting. Apparently, this exciting + * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) + * failed somewhere. Latency? Burstiness? Well, at least now we will + * see, why it failed. 
8)8)				  --ANK + * + */ +bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	if (sysctl_tcp_low_latency || !tp->ucopy.task) +		return false; + +	if (skb->len <= tcp_hdrlen(skb) && +	    skb_queue_len(&tp->ucopy.prequeue) == 0) +		return false; + +	skb_dst_force(skb); +	__skb_queue_tail(&tp->ucopy.prequeue, skb); +	tp->ucopy.memory += skb->truesize; +	if (tp->ucopy.memory > sk->sk_rcvbuf) { +		struct sk_buff *skb1; + +		BUG_ON(sock_owned_by_user(sk)); + +		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { +			sk_backlog_rcv(sk, skb1); +			NET_INC_STATS_BH(sock_net(sk), +					 LINUX_MIB_TCPPREQUEUEDROPPED); +		} + +		tp->ucopy.memory = 0; +	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { +		wake_up_interruptible_sync_poll(sk_sleep(sk), +					   POLLIN | POLLRDNORM | POLLRDBAND); +		if (!inet_csk_ack_scheduled(sk)) +			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, +						  (3 * tcp_rto_min(sk)) / 4, +						  TCP_RTO_MAX); +	} +	return true; +} +EXPORT_SYMBOL(tcp_prequeue); +  /*   *	From tcp_input.c   */ @@ -1616,7 +1697,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);  int tcp_v4_rcv(struct sk_buff *skb)  {  	const struct iphdr *iph; -	struct tcphdr *th; +	const struct tcphdr *th;  	struct sock *sk;  	int ret;  	struct net *net = dev_net(skb->dev); @@ -1641,8 +1722,9 @@ int tcp_v4_rcv(struct sk_buff *skb)  	 * Packet length and doff are validated by header prediction,  	 * provided case of th->doff==0 is eliminated.  	 * So, we defer the checks. */ -	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) -		goto bad_packet; + +	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) +		goto csum_error;  	th = tcp_hdr(skb);  	iph = ip_hdr(skb); @@ -1651,7 +1733,7 @@ int tcp_v4_rcv(struct sk_buff *skb)  				    skb->len - th->doff * 4);  	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);  	TCP_SKB_CB(skb)->when	 = 0; -	TCP_SKB_CB(skb)->flags	 = iph->tos; +	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);  	TCP_SKB_CB(skb)->sacked	 = 0;  	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); @@ -1674,6 +1756,7 @@ process:  	if (sk_filter(sk, skb))  		goto discard_and_relse; +	sk_mark_napi_id(sk, skb);  	skb->dev = NULL;  	bh_lock_sock_nested(sk); @@ -1682,7 +1765,7 @@ process:  #ifdef CONFIG_NET_DMA  		struct tcp_sock *tp = tcp_sk(sk);  		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) -			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); +			tp->ucopy.dma_chan = net_dma_find_channel();  		if (tp->ucopy.dma_chan)  			ret = tcp_v4_do_rcv(sk, skb);  		else @@ -1691,7 +1774,8 @@ process:  			if (!tcp_prequeue(sk, skb))  				ret = tcp_v4_do_rcv(sk, skb);  		} -	} else if (unlikely(sk_add_backlog(sk, skb))) { +	} else if (unlikely(sk_add_backlog(sk, skb, +					   sk->sk_rcvbuf + sk->sk_sndbuf))) {  		bh_unlock_sock(sk);  		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);  		goto discard_and_relse; @@ -1707,6 +1791,8 @@ no_tcp_socket:  		goto discard_it;  	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { +csum_error: +		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);  bad_packet:  		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);  	} else { @@ -1728,15 +1814,19 @@ do_time_wait:  		goto discard_it;  	} -	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { -		TCP_INC_STATS_BH(net, TCP_MIB_INERRS); +	if (skb->len < (th->doff << 2)) {  		inet_twsk_put(inet_twsk(sk)); -		goto discard_it; +		goto bad_packet; +	} +	if (tcp_checksum_complete(skb)) { +		inet_twsk_put(inet_twsk(sk)); +		goto csum_error;  	}  	switch 
(tcp_timewait_state_process(inet_twsk(sk), skb, th)) {  	case TCP_TW_SYN: {  		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),  							&tcp_hashinfo, +							iph->saddr, th->source,  							iph->daddr, th->dest,  							inet_iif(skb));  		if (sk2) { @@ -1757,48 +1847,29 @@ do_time_wait:  	goto discard_it;  } -struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) -{ -	struct rtable *rt = (struct rtable *) __sk_dst_get(sk); -	struct inet_sock *inet = inet_sk(sk); -	struct inet_peer *peer; - -	if (!rt || rt->rt_dst != inet->inet_daddr) { -		peer = inet_getpeer_v4(inet->inet_daddr, 1); -		*release_it = true; -	} else { -		if (!rt->peer) -			rt_bind_peer(rt, 1); -		peer = rt->peer; -		*release_it = false; -	} - -	return peer; -} -EXPORT_SYMBOL(tcp_v4_get_peer); - -void *tcp_v4_tw_get_peer(struct sock *sk) -{ -	struct inet_timewait_sock *tw = inet_twsk(sk); - -	return inet_getpeer_v4(tw->tw_daddr, 1); -} -EXPORT_SYMBOL(tcp_v4_tw_get_peer); -  static struct timewait_sock_ops tcp_timewait_sock_ops = {  	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),  	.twsk_unique	= tcp_twsk_unique,  	.twsk_destructor= tcp_twsk_destructor, -	.twsk_getpeer	= tcp_v4_tw_get_peer,  }; +void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ +	struct dst_entry *dst = skb_dst(skb); + +	dst_hold(dst); +	sk->sk_rx_dst = dst; +	inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; +} +EXPORT_SYMBOL(inet_sk_rx_dst_set); +  const struct inet_connection_sock_af_ops ipv4_specific = {  	.queue_xmit	   = ip_queue_xmit,  	.send_check	   = tcp_v4_send_check,  	.rebuild_header	   = inet_sk_rebuild_header, +	.sk_rx_dst_set	   = inet_sk_rx_dst_set,  	.conn_request	   = tcp_v4_conn_request,  	.syn_recv_sock	   = tcp_v4_syn_recv_sock, -	.get_peer	   = tcp_v4_get_peer,  	.net_header_len	   = sizeof(struct iphdr),  	.setsockopt	   = ip_setsockopt,  	.getsockopt	   = ip_getsockopt, @@ -1816,7 +1887,6 @@ EXPORT_SYMBOL(ipv4_specific);  static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {  	.md5_lookup		= tcp_v4_md5_lookup,  	.calc_md5_hash		= tcp_v4_md5_hash_skb, -	.md5_add		= tcp_v4_md5_add_func,  	.md5_parse		= tcp_v4_parse_md5_keys,  };  #endif @@ -1827,63 +1897,15 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {  static int tcp_v4_init_sock(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk); -	struct tcp_sock *tp = tcp_sk(sk); - -	skb_queue_head_init(&tp->out_of_order_queue); -	tcp_init_xmit_timers(sk); -	tcp_prequeue_init(tp); - -	icsk->icsk_rto = TCP_TIMEOUT_INIT; -	tp->mdev = TCP_TIMEOUT_INIT; - -	/* So many TCP implementations out there (incorrectly) count the -	 * initial SYN frame in their delayed-ACK and congestion control -	 * algorithms that we must have the following bandaid to talk -	 * efficiently to them.  -DaveM -	 */ -	tp->snd_cwnd = 2; - -	/* See draft-stevens-tcpca-spec-01 for discussion of the -	 * initialization of these values. 
-	 */ -	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -	tp->snd_cwnd_clamp = ~0; -	tp->mss_cache = TCP_MSS_DEFAULT; - -	tp->reordering = sysctl_tcp_reordering; -	icsk->icsk_ca_ops = &tcp_init_congestion_ops; -	sk->sk_state = TCP_CLOSE; - -	sk->sk_write_space = sk_stream_write_space; -	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); +	tcp_init_sock(sk);  	icsk->icsk_af_ops = &ipv4_specific; -	icsk->icsk_sync_mss = tcp_sync_mss; +  #ifdef CONFIG_TCP_MD5SIG -	tp->af_specific = &tcp_sock_ipv4_specific; +	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;  #endif -	/* TCP Cookie Transactions */ -	if (sysctl_tcp_cookie_size > 0) { -		/* Default, cookies without s_data_payload. */ -		tp->cookie_values = -			kzalloc(sizeof(*tp->cookie_values), -				sk->sk_allocation); -		if (tp->cookie_values != NULL) -			kref_init(&tp->cookie_values->kref); -	} -	/* Presumed zeroed, in order of appearance: -	 *	cookie_in_always, cookie_out_never, -	 *	s_data_constant, s_data_in, s_data_out -	 */ -	sk->sk_sndbuf = sysctl_tcp_wmem[1]; -	sk->sk_rcvbuf = sysctl_tcp_rmem[1]; - -	local_bh_disable(); -	percpu_counter_inc(&tcp_sockets_allocated); -	local_bh_enable(); -  	return 0;  } @@ -1904,8 +1926,8 @@ void tcp_v4_destroy_sock(struct sock *sk)  #ifdef CONFIG_TCP_MD5SIG  	/* Clean up the MD5 key list, if any */  	if (tp->md5sig_info) { -		tcp_v4_clear_md5_list(sk); -		kfree(tp->md5sig_info); +		tcp_clear_md5_list(sk); +		kfree_rcu(tp->md5sig_info, rcu);  		tp->md5sig_info = NULL;  	}  #endif @@ -1922,40 +1944,19 @@ void tcp_v4_destroy_sock(struct sock *sk)  	if (inet_csk(sk)->icsk_bind_hash)  		inet_put_port(sk); -	/* -	 * If sendmsg cached page exists, toss it. -	 */ -	if (sk->sk_sndmsg_page) { -		__free_page(sk->sk_sndmsg_page); -		sk->sk_sndmsg_page = NULL; -	} +	BUG_ON(tp->fastopen_rsk != NULL); -	/* TCP Cookie Transactions */ -	if (tp->cookie_values != NULL) { -		kref_put(&tp->cookie_values->kref, -			 tcp_cookie_values_release); -		tp->cookie_values = NULL; -	} +	/* If socket is aborted during connect operation */ +	tcp_free_fastopen_req(tp); -	percpu_counter_dec(&tcp_sockets_allocated); +	sk_sockets_allocated_dec(sk); +	sock_release_memcg(sk);  }  EXPORT_SYMBOL(tcp_v4_destroy_sock);  #ifdef CONFIG_PROC_FS  /* Proc filesystem TCP sock list dumping. */ -static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head) -{ -	return hlist_nulls_empty(head) ? NULL : -		list_entry(head->first, struct inet_timewait_sock, tw_node); -} - -static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) -{ -	return !is_a_nulls(tw->tw_node.next) ? -		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; -} -  /*   * Get next listener socket follow cur.  
If cur is NULL, get first socket   * starting from bucket given in st->bucket; when st->bucket is zero the @@ -1994,13 +1995,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur)  				}  				req = req->dl_next;  			} -			st->offset = 0;  			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)  				break;  get_req:  			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];  		} -		sk	  = sk_next(st->syn_wait_sk); +		sk	  = sk_nulls_next(st->syn_wait_sk);  		st->state = TCP_SEQ_STATE_LISTENING;  		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);  	} else { @@ -2009,11 +2009,13 @@ get_req:  		if (reqsk_queue_len(&icsk->icsk_accept_queue))  			goto start_req;  		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); -		sk = sk_next(sk); +		sk = sk_nulls_next(sk);  	}  get_sk:  	sk_nulls_for_each_from(sk, node) { -		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { +		if (!net_eq(sock_net(sk), net)) +			continue; +		if (sk->sk_family == st->family) {  			cur = sk;  			goto out;  		} @@ -2058,10 +2060,9 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)  	return rc;  } -static inline int empty_bucket(struct tcp_iter_state *st) +static inline bool empty_bucket(const struct tcp_iter_state *st)  { -	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && -		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); +	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);  }  /* @@ -2078,7 +2079,6 @@ static void *established_get_first(struct seq_file *seq)  	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {  		struct sock *sk;  		struct hlist_nulls_node *node; -		struct inet_timewait_sock *tw;  		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);  		/* Lockless fast path for the common case of empty buckets */ @@ -2094,18 +2094,7 @@ static void *established_get_first(struct seq_file *seq)  			rc = sk;  			goto out;  		} -		st->state = TCP_SEQ_STATE_TIME_WAIT; -		inet_twsk_for_each(tw, node, -				   &tcp_hashinfo.ehash[st->bucket].twchain) { -			if (tw->tw_family != st->family || -			    !net_eq(twsk_net(tw), net)) { -				continue; -			} -			rc = tw; -			goto out; -		}  		spin_unlock_bh(lock); -		st->state = TCP_SEQ_STATE_ESTABLISHED;  	}  out:  	return rc; @@ -2114,7 +2103,6 @@ out:  static void *established_get_next(struct seq_file *seq, void *cur)  {  	struct sock *sk = cur; -	struct inet_timewait_sock *tw;  	struct hlist_nulls_node *node;  	struct tcp_iter_state *st = seq->private;  	struct net *net = seq_file_net(seq); @@ -2122,45 +2110,16 @@ static void *established_get_next(struct seq_file *seq, void *cur)  	++st->num;  	++st->offset; -	if (st->state == TCP_SEQ_STATE_TIME_WAIT) { -		tw = cur; -		tw = tw_next(tw); -get_tw: -		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { -			tw = tw_next(tw); -		} -		if (tw) { -			cur = tw; -			goto out; -		} -		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); -		st->state = TCP_SEQ_STATE_ESTABLISHED; - -		/* Look for next non empty bucket */ -		st->offset = 0; -		while (++st->bucket <= tcp_hashinfo.ehash_mask && -				empty_bucket(st)) -			; -		if (st->bucket > tcp_hashinfo.ehash_mask) -			return NULL; - -		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); -		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain); -	} else -		sk = sk_nulls_next(sk); +	sk = sk_nulls_next(sk);  	sk_nulls_for_each_from(sk, node) {  		if (sk->sk_family == st->family && net_eq(sock_net(sk), 
net)) -			goto found; +			return sk;  	} -	st->state = TCP_SEQ_STATE_TIME_WAIT; -	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain); -	goto get_tw; -found: -	cur = sk; -out: -	return cur; +	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); +	++st->bucket; +	return established_get_first(seq);  }  static void *established_get_idx(struct seq_file *seq, loff_t pos) @@ -2213,10 +2172,9 @@ static void *tcp_seek_last_pos(struct seq_file *seq)  		if (rc)  			break;  		st->bucket = 0; +		st->state = TCP_SEQ_STATE_ESTABLISHED;  		/* Fallthrough */  	case TCP_SEQ_STATE_ESTABLISHED: -	case TCP_SEQ_STATE_TIME_WAIT: -		st->state = TCP_SEQ_STATE_ESTABLISHED;  		if (st->bucket > tcp_hashinfo.ehash_mask)  			break;  		rc = established_get_first(seq); @@ -2273,7 +2231,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)  		}  		break;  	case TCP_SEQ_STATE_ESTABLISHED: -	case TCP_SEQ_STATE_TIME_WAIT:  		rc = established_get_next(seq, v);  		break;  	} @@ -2297,7 +2254,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)  		if (v != SEQ_START_TOKEN)  			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);  		break; -	case TCP_SEQ_STATE_TIME_WAIT:  	case TCP_SEQ_STATE_ESTABLISHED:  		if (v)  			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); @@ -2305,9 +2261,9 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)  	}  } -static int tcp_seq_open(struct inode *inode, struct file *file) +int tcp_seq_open(struct inode *inode, struct file *file)  { -	struct tcp_seq_afinfo *afinfo = PDE(inode)->data; +	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);  	struct tcp_iter_state *s;  	int err; @@ -2321,23 +2277,19 @@ static int tcp_seq_open(struct inode *inode, struct file *file)  	s->last_pos 		= 0;  	return 0;  } +EXPORT_SYMBOL(tcp_seq_open);  int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)  {  	int rc = 0;  	struct proc_dir_entry *p; -	afinfo->seq_fops.open		= tcp_seq_open; -	afinfo->seq_fops.read		= seq_read; -	afinfo->seq_fops.llseek		= seq_lseek; -	afinfo->seq_fops.release	= seq_release_net; -  	afinfo->seq_ops.start		= tcp_seq_start;  	afinfo->seq_ops.next		= tcp_seq_next;  	afinfo->seq_ops.stop		= tcp_seq_stop;  	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, -			     &afinfo->seq_fops, afinfo); +			     afinfo->seq_fops, afinfo);  	if (!p)  		rc = -ENOMEM;  	return rc; @@ -2346,50 +2298,52 @@ EXPORT_SYMBOL(tcp_proc_register);  void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)  { -	proc_net_remove(net, afinfo->name); +	remove_proc_entry(afinfo->name, net->proc_net);  }  EXPORT_SYMBOL(tcp_proc_unregister); -static void get_openreq4(struct sock *sk, struct request_sock *req, -			 struct seq_file *f, int i, int uid, int *len) +static void get_openreq4(const struct sock *sk, const struct request_sock *req, +			 struct seq_file *f, int i, kuid_t uid)  {  	const struct inet_request_sock *ireq = inet_rsk(req); -	int ttd = req->expires - jiffies; +	long delta = req->expires - jiffies;  	seq_printf(f, "%4d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", +		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",  		i, -		ireq->loc_addr, +		ireq->ir_loc_addr,  		ntohs(inet_sk(sk)->inet_sport), -		ireq->rmt_addr, -		ntohs(ireq->rmt_port), +		ireq->ir_rmt_addr, +		ntohs(ireq->ir_rmt_port),  		TCP_SYN_RECV,  		0, 0, /* could print option size, but that is af dependent. 
*/  		1,    /* timers active (only the expire timer) */ -		jiffies_to_clock_t(ttd), -		req->retrans, -		uid, +		jiffies_delta_to_clock_t(delta), +		req->num_timeout, +		from_kuid_munged(seq_user_ns(f), uid),  		0,  /* non standard timer */  		0, /* open_requests have no inode */  		atomic_read(&sk->sk_refcnt), -		req, -		len); +		req);  } -static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) +static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)  {  	int timer_active;  	unsigned long timer_expires; -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk); -	struct inet_sock *inet = inet_sk(sk); +	const struct inet_sock *inet = inet_sk(sk); +	struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;  	__be32 dest = inet->inet_daddr;  	__be32 src = inet->inet_rcv_saddr;  	__u16 destp = ntohs(inet->inet_dport);  	__u16 srcp = ntohs(inet->inet_sport);  	int rx_queue; -	if (icsk->icsk_pending == ICSK_TIME_RETRANS) { +	if (icsk->icsk_pending == ICSK_TIME_RETRANS || +	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || +	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {  		timer_active	= 1;  		timer_expires	= icsk->icsk_timeout;  	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { @@ -2412,14 +2366,14 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)  		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);  	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " -			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", +			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",  		i, src, srcp, dest, destp, sk->sk_state,  		tp->write_seq - tp->snd_una,  		rx_queue,  		timer_active, -		jiffies_to_clock_t(timer_expires - jiffies), +		jiffies_delta_to_clock_t(timer_expires - jiffies),  		icsk->icsk_retransmits, -		sock_i_uid(sk), +		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),  		icsk->icsk_probes_out,  		sock_i_ino(sk),  		atomic_read(&sk->sk_refcnt), sk, @@ -2427,19 +2381,17 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)  		jiffies_to_clock_t(icsk->icsk_ack.ato),  		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,  		tp->snd_cwnd, -		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, -		len); +		sk->sk_state == TCP_LISTEN ? +		    (fastopenq ? fastopenq->max_qlen : 0) : +		    (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh));  } -static void get_timewait4_sock(struct inet_timewait_sock *tw, -			       struct seq_file *f, int i, int *len) +static void get_timewait4_sock(const struct inet_timewait_sock *tw, +			       struct seq_file *f, int i)  {  	__be32 dest, src;  	__u16 destp, srcp; -	int ttd = tw->tw_ttd - jiffies; - -	if (ttd < 0) -		ttd = 0; +	s32 delta = tw->tw_ttd - inet_tw_time_stamp();  	dest  = tw->tw_daddr;  	src   = tw->tw_rcv_saddr; @@ -2447,10 +2399,10 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw,  	srcp  = ntohs(tw->tw_sport);  	seq_printf(f, "%4d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", +		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",  		i, src, srcp, dest, destp, tw->tw_substate, 0, 0, -		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, -		atomic_read(&tw->tw_refcnt), tw, len); +		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, +		atomic_read(&tw->tw_refcnt), tw);  }  #define TMPSZ 150 @@ -2458,11 +2410,11 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw,  static int tcp4_seq_show(struct seq_file *seq, void *v)  {  	struct tcp_iter_state *st; -	int len; +	struct sock *sk = v; +	seq_setwidth(seq, TMPSZ - 1);  	if (v == SEQ_START_TOKEN) { -		seq_printf(seq, "%-*s\n", TMPSZ - 1, -			   "  sl  local_address rem_address   st tx_queue " +		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "  			   "rx_queue tr tm->when retrnsmt   uid  timeout "  			   "inode");  		goto out; @@ -2472,26 +2424,32 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)  	switch (st->state) {  	case TCP_SEQ_STATE_LISTENING:  	case TCP_SEQ_STATE_ESTABLISHED: -		get_tcp4_sock(v, seq, st->num, &len); +		if (sk->sk_state == TCP_TIME_WAIT) +			get_timewait4_sock(v, seq, st->num); +		else +			get_tcp4_sock(v, seq, st->num);  		break;  	case TCP_SEQ_STATE_OPENREQ: -		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len); -		break; -	case TCP_SEQ_STATE_TIME_WAIT: -		get_timewait4_sock(v, seq, st->num, &len); +		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);  		break;  	} -	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");  out: +	seq_pad(seq, '\n');  	return 0;  } +static const struct file_operations tcp_afinfo_seq_fops = { +	.owner   = THIS_MODULE, +	.open    = tcp_seq_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_net +}; +  static struct tcp_seq_afinfo tcp4_seq_afinfo = {  	.name		= "tcp",  	.family		= AF_INET, -	.seq_fops	= { -		.owner		= THIS_MODULE, -	}, +	.seq_fops	= &tcp_afinfo_seq_fops,  	.seq_ops	= {  		.show		= tcp4_seq_show,  	}, @@ -2523,39 +2481,6 @@ void tcp4_proc_exit(void)  }  #endif /* CONFIG_PROC_FS */ -struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) -{ -	struct iphdr *iph = skb_gro_network_header(skb); - -	switch (skb->ip_summed) { -	case CHECKSUM_COMPLETE: -		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, -				  skb->csum)) { -			skb->ip_summed = CHECKSUM_UNNECESSARY; -			break; -		} - -		/* fall through */ -	case CHECKSUM_NONE: -		NAPI_GRO_CB(skb)->flush = 1; -		return NULL; -	} - -	return tcp_gro_receive(head, skb); -} - -int tcp4_gro_complete(struct sk_buff *skb) -{ -	struct iphdr *iph = ip_hdr(skb); -	struct tcphdr *th = tcp_hdr(skb); - -	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), -				  iph->saddr, iph->daddr, 0); -	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - -	return tcp_gro_complete(skb); -} -  struct proto tcp_prot = {  	.name			= "TCP",  	.owner			= THIS_MODULE, @@ -2573,10 +2498,13 
@@ struct proto tcp_prot = {  	.sendmsg		= tcp_sendmsg,  	.sendpage		= tcp_sendpage,  	.backlog_rcv		= tcp_v4_do_rcv, +	.release_cb		= tcp_release_cb, +	.mtu_reduced		= tcp_v4_mtu_reduced,  	.hash			= inet_hash,  	.unhash			= inet_unhash,  	.get_port		= inet_csk_get_port,  	.enter_memory_pressure	= tcp_enter_memory_pressure, +	.stream_memory_free	= tcp_stream_memory_free,  	.sockets_allocated	= &tcp_sockets_allocated,  	.orphan_count		= &tcp_orphan_count,  	.memory_allocated	= &tcp_memory_allocated, @@ -2595,19 +2523,22 @@ struct proto tcp_prot = {  	.compat_setsockopt	= compat_tcp_setsockopt,  	.compat_getsockopt	= compat_tcp_getsockopt,  #endif +#ifdef CONFIG_MEMCG_KMEM +	.init_cgroup		= tcp_init_cgroup, +	.destroy_cgroup		= tcp_destroy_cgroup, +	.proto_cgroup		= tcp_proto_cgroup, +#endif  };  EXPORT_SYMBOL(tcp_prot); -  static int __net_init tcp_sk_init(struct net *net)  { -	return inet_ctl_sock_create(&net->ipv4.tcp_sock, -				    PF_INET, SOCK_RAW, IPPROTO_TCP, net); +	net->ipv4.sysctl_tcp_ecn = 2; +	return 0;  }  static void __net_exit tcp_sk_exit(struct net *net)  { -	inet_ctl_sock_destroy(net->ipv4.tcp_sock);  }  static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index de870377fbb..1e70fa8fa79 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -12,7 +12,7 @@   *     within cong_avoid.   *   o Error correcting in remote HZ, therefore remote HZ will be keeped   *     on checking and updating. - *   o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne + *   o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since   *     OWD have a similar meaning as RTT. Also correct the buggy formular.   *   o Handle reaction for Early Congestion Indication (ECI) within   *     pkts_acked, as mentioned within pseudo code. @@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk)   * Will only call newReno CA when away from inference.   * From TCP-LP's paper, this will be handled in additive increasement.   
*/ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct lp *lp = inet_csk_ca(sk);  	if (!(lp->flag & LP_WITHIN_INF)) -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  }  /** @@ -313,12 +313,10 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)  	lp->last_drop = tcp_time_stamp;  } -static struct tcp_congestion_ops tcp_lp = { -	.flags = TCP_CONG_RTT_STAMP, +static struct tcp_congestion_ops tcp_lp __read_mostly = {  	.init = tcp_lp_init,  	.ssthresh = tcp_reno_ssthresh,  	.cong_avoid = tcp_lp_cong_avoid, -	.min_cwnd = tcp_reno_min_cwnd,  	.pkts_acked = tcp_lp_pkts_acked,  	.owner = THIS_MODULE, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c new file mode 100644 index 00000000000..f7a2ec3ac58 --- /dev/null +++ b/net/ipv4/tcp_memcontrol.c @@ -0,0 +1,228 @@ +#include <net/tcp.h> +#include <net/tcp_memcontrol.h> +#include <net/sock.h> +#include <net/ip.h> +#include <linux/nsproxy.h> +#include <linux/memcontrol.h> +#include <linux/module.h> + +int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +{ +	/* +	 * The root cgroup does not use res_counters, but rather, +	 * rely on the data already collected by the network +	 * subsystem +	 */ +	struct res_counter *res_parent = NULL; +	struct cg_proto *cg_proto, *parent_cg; +	struct mem_cgroup *parent = parent_mem_cgroup(memcg); + +	cg_proto = tcp_prot.proto_cgroup(memcg); +	if (!cg_proto) +		return 0; + +	cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; +	cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; +	cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; +	cg_proto->memory_pressure = 0; +	cg_proto->memcg = memcg; + +	parent_cg = tcp_prot.proto_cgroup(parent); +	if (parent_cg) +		res_parent = &parent_cg->memory_allocated; + +	res_counter_init(&cg_proto->memory_allocated, res_parent); +	percpu_counter_init(&cg_proto->sockets_allocated, 0); + +	return 0; +} +EXPORT_SYMBOL(tcp_init_cgroup); + +void tcp_destroy_cgroup(struct mem_cgroup *memcg) +{ +	struct cg_proto *cg_proto; + +	cg_proto = tcp_prot.proto_cgroup(memcg); +	if (!cg_proto) +		return; + +	percpu_counter_destroy(&cg_proto->sockets_allocated); +} +EXPORT_SYMBOL(tcp_destroy_cgroup); + +static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) +{ +	struct cg_proto *cg_proto; +	int i; +	int ret; + +	cg_proto = tcp_prot.proto_cgroup(memcg); +	if (!cg_proto) +		return -EINVAL; + +	if (val > RES_COUNTER_MAX) +		val = RES_COUNTER_MAX; + +	ret = res_counter_set_limit(&cg_proto->memory_allocated, val); +	if (ret) +		return ret; + +	for (i = 0; i < 3; i++) +		cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, +						sysctl_tcp_mem[i]); + +	if (val == RES_COUNTER_MAX) +		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); +	else if (val != RES_COUNTER_MAX) { +		/* +		 * The active bit needs to be written after the static_key +		 * update. This is what guarantees that the socket activation +		 * function is the last one to run. See sock_update_memcg() for +		 * details, and note that we don't mark any socket as belonging +		 * to this memcg until that flag is up. +		 * +		 * We need to do this, because static_keys will span multiple +		 * sites, but we can't control their order. If we mark a socket +		 * as accounted, but the accounting functions are not patched in +		 * yet, we'll lose accounting. 
+		 * +		 * We never race with the readers in sock_update_memcg(), +		 * because when this value change, the code to process it is not +		 * patched in yet. +		 * +		 * The activated bit is used to guarantee that no two writers +		 * will do the update in the same memcg. Without that, we can't +		 * properly shutdown the static key. +		 */ +		if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) +			static_key_slow_inc(&memcg_socket_limit_enabled); +		set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); +	} + +	return 0; +} + +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, +				char *buf, size_t nbytes, loff_t off) +{ +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); +	unsigned long long val; +	int ret = 0; + +	buf = strstrip(buf); + +	switch (of_cft(of)->private) { +	case RES_LIMIT: +		/* see memcontrol.c */ +		ret = res_counter_memparse_write_strategy(buf, &val); +		if (ret) +			break; +		ret = tcp_update_limit(memcg, val); +		break; +	default: +		ret = -EINVAL; +		break; +	} +	return ret ?: nbytes; +} + +static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) +{ +	struct cg_proto *cg_proto; + +	cg_proto = tcp_prot.proto_cgroup(memcg); +	if (!cg_proto) +		return default_val; + +	return res_counter_read_u64(&cg_proto->memory_allocated, type); +} + +static u64 tcp_read_usage(struct mem_cgroup *memcg) +{ +	struct cg_proto *cg_proto; + +	cg_proto = tcp_prot.proto_cgroup(memcg); +	if (!cg_proto) +		return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; + +	return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); +} + +static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ +	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	u64 val; + +	switch (cft->private) { +	case RES_LIMIT: +		val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); +		break; +	case RES_USAGE: +		val = tcp_read_usage(memcg); +		break; +	case RES_FAILCNT: +	case RES_MAX_USAGE: +		val = tcp_read_stat(memcg, cft->private, 0); +		break; +	default: +		BUG(); +	} +	return val; +} + +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, +				char *buf, size_t nbytes, loff_t off) +{ +	struct mem_cgroup *memcg; +	struct cg_proto *cg_proto; + +	memcg = mem_cgroup_from_css(of_css(of)); +	cg_proto = tcp_prot.proto_cgroup(memcg); +	if (!cg_proto) +		return nbytes; + +	switch (of_cft(of)->private) { +	case RES_MAX_USAGE: +		res_counter_reset_max(&cg_proto->memory_allocated); +		break; +	case RES_FAILCNT: +		res_counter_reset_failcnt(&cg_proto->memory_allocated); +		break; +	} + +	return nbytes; +} + +static struct cftype tcp_files[] = { +	{ +		.name = "kmem.tcp.limit_in_bytes", +		.write = tcp_cgroup_write, +		.read_u64 = tcp_cgroup_read, +		.private = RES_LIMIT, +	}, +	{ +		.name = "kmem.tcp.usage_in_bytes", +		.read_u64 = tcp_cgroup_read, +		.private = RES_USAGE, +	}, +	{ +		.name = "kmem.tcp.failcnt", +		.private = RES_FAILCNT, +		.write = tcp_cgroup_reset, +		.read_u64 = tcp_cgroup_read, +	}, +	{ +		.name = "kmem.tcp.max_usage_in_bytes", +		.private = RES_MAX_USAGE, +		.write = tcp_cgroup_reset, +		.read_u64 = tcp_cgroup_read, +	}, +	{ }	/* terminate */ +}; + +static int __init tcp_memcontrol_init(void) +{ +	WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); +	return 0; +} +__initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c new file mode 100644 index 00000000000..4fe04180598 --- /dev/null +++ b/net/ipv4/tcp_metrics.c @@ -0,0 +1,1188 @@ +#include <linux/rcupdate.h> +#include 
<linux/spinlock.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/tcp.h> +#include <linux/hash.h> +#include <linux/tcp_metrics.h> +#include <linux/vmalloc.h> + +#include <net/inet_connection_sock.h> +#include <net/net_namespace.h> +#include <net/request_sock.h> +#include <net/inetpeer.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/dst.h> +#include <net/tcp.h> +#include <net/genetlink.h> + +int sysctl_tcp_nometrics_save __read_mostly; + +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, +						   const struct inetpeer_addr *daddr, +						   struct net *net, unsigned int hash); + +struct tcp_fastopen_metrics { +	u16	mss; +	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */ +	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */ +	struct	tcp_fastopen_cookie	cookie; +}; + +/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility + * Kernel only stores RTT and RTTVAR in usec resolution + */ +#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) + +struct tcp_metrics_block { +	struct tcp_metrics_block __rcu	*tcpm_next; +	struct inetpeer_addr		tcpm_saddr; +	struct inetpeer_addr		tcpm_daddr; +	unsigned long			tcpm_stamp; +	u32				tcpm_ts; +	u32				tcpm_ts_stamp; +	u32				tcpm_lock; +	u32				tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; +	struct tcp_fastopen_metrics	tcpm_fastopen; + +	struct rcu_head			rcu_head; +}; + +static bool tcp_metric_locked(struct tcp_metrics_block *tm, +			      enum tcp_metric_index idx) +{ +	return tm->tcpm_lock & (1 << idx); +} + +static u32 tcp_metric_get(struct tcp_metrics_block *tm, +			  enum tcp_metric_index idx) +{ +	return tm->tcpm_vals[idx]; +} + +static void tcp_metric_set(struct tcp_metrics_block *tm, +			   enum tcp_metric_index idx, +			   u32 val) +{ +	tm->tcpm_vals[idx] = val; +} + +static bool addr_same(const struct inetpeer_addr *a, +		      const struct inetpeer_addr *b) +{ +	const struct in6_addr *a6, *b6; + +	if (a->family != b->family) +		return false; +	if (a->family == AF_INET) +		return a->addr.a4 == b->addr.a4; + +	a6 = (const struct in6_addr *) &a->addr.a6[0]; +	b6 = (const struct in6_addr *) &b->addr.a6[0]; + +	return ipv6_addr_equal(a6, b6); +} + +struct tcpm_hash_bucket { +	struct tcp_metrics_block __rcu	*chain; +}; + +static DEFINE_SPINLOCK(tcp_metrics_lock); + +static void tcpm_suck_dst(struct tcp_metrics_block *tm, +			  const struct dst_entry *dst, +			  bool fastopen_clear) +{ +	u32 msval; +	u32 val; + +	tm->tcpm_stamp = jiffies; + +	val = 0; +	if (dst_metric_locked(dst, RTAX_RTT)) +		val |= 1 << TCP_METRIC_RTT; +	if (dst_metric_locked(dst, RTAX_RTTVAR)) +		val |= 1 << TCP_METRIC_RTTVAR; +	if (dst_metric_locked(dst, RTAX_SSTHRESH)) +		val |= 1 << TCP_METRIC_SSTHRESH; +	if (dst_metric_locked(dst, RTAX_CWND)) +		val |= 1 << TCP_METRIC_CWND; +	if (dst_metric_locked(dst, RTAX_REORDERING)) +		val |= 1 << TCP_METRIC_REORDERING; +	tm->tcpm_lock = val; + +	msval = dst_metric_raw(dst, RTAX_RTT); +	tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC; + +	msval = dst_metric_raw(dst, RTAX_RTTVAR); +	tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC; +	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); +	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); +	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); +	tm->tcpm_ts = 0; +	tm->tcpm_ts_stamp = 0; +	if (fastopen_clear) { +		tm->tcpm_fastopen.mss = 0; +		
tm->tcpm_fastopen.syn_loss = 0; +		tm->tcpm_fastopen.cookie.len = 0; +	} +} + +#define TCP_METRICS_TIMEOUT		(60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ +	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) +		tcpm_suck_dst(tm, dst, false); +} + +#define TCP_METRICS_RECLAIM_DEPTH	5 +#define TCP_METRICS_RECLAIM_PTR		(struct tcp_metrics_block *) 0x1UL + +static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, +					  struct inetpeer_addr *saddr, +					  struct inetpeer_addr *daddr, +					  unsigned int hash) +{ +	struct tcp_metrics_block *tm; +	struct net *net; +	bool reclaim = false; + +	spin_lock_bh(&tcp_metrics_lock); +	net = dev_net(dst->dev); + +	/* While waiting for the spin-lock the cache might have been populated +	 * with this entry and so we have to check again. +	 */ +	tm = __tcp_get_metrics(saddr, daddr, net, hash); +	if (tm == TCP_METRICS_RECLAIM_PTR) { +		reclaim = true; +		tm = NULL; +	} +	if (tm) { +		tcpm_check_stamp(tm, dst); +		goto out_unlock; +	} + +	if (unlikely(reclaim)) { +		struct tcp_metrics_block *oldest; + +		oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); +		for (tm = rcu_dereference(oldest->tcpm_next); tm; +		     tm = rcu_dereference(tm->tcpm_next)) { +			if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) +				oldest = tm; +		} +		tm = oldest; +	} else { +		tm = kmalloc(sizeof(*tm), GFP_ATOMIC); +		if (!tm) +			goto out_unlock; +	} +	tm->tcpm_saddr = *saddr; +	tm->tcpm_daddr = *daddr; + +	tcpm_suck_dst(tm, dst, true); + +	if (likely(!reclaim)) { +		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; +		rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); +	} + +out_unlock: +	spin_unlock_bh(&tcp_metrics_lock); +	return tm; +} + +static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) +{ +	if (tm) +		return tm; +	if (depth > TCP_METRICS_RECLAIM_DEPTH) +		return TCP_METRICS_RECLAIM_PTR; +	return NULL; +} + +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, +						   const struct inetpeer_addr *daddr, +						   struct net *net, unsigned int hash) +{ +	struct tcp_metrics_block *tm; +	int depth = 0; + +	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; +	     tm = rcu_dereference(tm->tcpm_next)) { +		if (addr_same(&tm->tcpm_saddr, saddr) && +		    addr_same(&tm->tcpm_daddr, daddr)) +			break; +		depth++; +	} +	return tcp_get_encode(tm, depth); +} + +static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, +						       struct dst_entry *dst) +{ +	struct tcp_metrics_block *tm; +	struct inetpeer_addr saddr, daddr; +	unsigned int hash; +	struct net *net; + +	saddr.family = req->rsk_ops->family; +	daddr.family = req->rsk_ops->family; +	switch (daddr.family) { +	case AF_INET: +		saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; +		daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; +		hash = (__force unsigned int) daddr.addr.a4; +		break; +#if IS_ENABLED(CONFIG_IPV6) +	case AF_INET6: +		*(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; +		*(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; +		hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); +		break; +#endif +	default: +		return NULL; +	} + +	net = dev_net(dst->dev); +	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + +	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; +	     tm = rcu_dereference(tm->tcpm_next)) { +		if 
(addr_same(&tm->tcpm_saddr, &saddr) && +		    addr_same(&tm->tcpm_daddr, &daddr)) +			break; +	} +	tcpm_check_stamp(tm, dst); +	return tm; +} + +static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) +{ +	struct tcp_metrics_block *tm; +	struct inetpeer_addr saddr, daddr; +	unsigned int hash; +	struct net *net; + +	if (tw->tw_family == AF_INET) { +		saddr.family = AF_INET; +		saddr.addr.a4 = tw->tw_rcv_saddr; +		daddr.family = AF_INET; +		daddr.addr.a4 = tw->tw_daddr; +		hash = (__force unsigned int) daddr.addr.a4; +	} +#if IS_ENABLED(CONFIG_IPV6) +	else if (tw->tw_family == AF_INET6) { +		if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { +			saddr.family = AF_INET; +			saddr.addr.a4 = tw->tw_rcv_saddr; +			daddr.family = AF_INET; +			daddr.addr.a4 = tw->tw_daddr; +			hash = (__force unsigned int) daddr.addr.a4; +		} else { +			saddr.family = AF_INET6; +			*(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; +			daddr.family = AF_INET6; +			*(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; +			hash = ipv6_addr_hash(&tw->tw_v6_daddr); +		} +	} +#endif +	else +		return NULL; + +	net = twsk_net(tw); +	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + +	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; +	     tm = rcu_dereference(tm->tcpm_next)) { +		if (addr_same(&tm->tcpm_saddr, &saddr) && +		    addr_same(&tm->tcpm_daddr, &daddr)) +			break; +	} +	return tm; +} + +static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, +						 struct dst_entry *dst, +						 bool create) +{ +	struct tcp_metrics_block *tm; +	struct inetpeer_addr saddr, daddr; +	unsigned int hash; +	struct net *net; + +	if (sk->sk_family == AF_INET) { +		saddr.family = AF_INET; +		saddr.addr.a4 = inet_sk(sk)->inet_saddr; +		daddr.family = AF_INET; +		daddr.addr.a4 = inet_sk(sk)->inet_daddr; +		hash = (__force unsigned int) daddr.addr.a4; +	} +#if IS_ENABLED(CONFIG_IPV6) +	else if (sk->sk_family == AF_INET6) { +		if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { +			saddr.family = AF_INET; +			saddr.addr.a4 = inet_sk(sk)->inet_saddr; +			daddr.family = AF_INET; +			daddr.addr.a4 = inet_sk(sk)->inet_daddr; +			hash = (__force unsigned int) daddr.addr.a4; +		} else { +			saddr.family = AF_INET6; +			*(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; +			daddr.family = AF_INET6; +			*(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; +			hash = ipv6_addr_hash(&sk->sk_v6_daddr); +		} +	} +#endif +	else +		return NULL; + +	net = dev_net(dst->dev); +	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + +	tm = __tcp_get_metrics(&saddr, &daddr, net, hash); +	if (tm == TCP_METRICS_RECLAIM_PTR) +		tm = NULL; +	if (!tm && create) +		tm = tcpm_new(dst, &saddr, &daddr, hash); +	else +		tcpm_check_stamp(tm, dst); + +	return tm; +} + +/* Save metrics learned by this TCP session.  This function is called + * only, when TCP finishes successfully i.e. when it enters TIME-WAIT + * or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ +	const struct inet_connection_sock *icsk = inet_csk(sk); +	struct dst_entry *dst = __sk_dst_get(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_metrics_block *tm; +	unsigned long rtt; +	u32 val; +	int m; + +	if (sysctl_tcp_nometrics_save || !dst) +		return; + +	if (dst->flags & DST_HOST) +		dst_confirm(dst); + +	rcu_read_lock(); +	if (icsk->icsk_backoff || !tp->srtt_us) { +		/* This session failed to estimate rtt. Why? +		 * Probably, no packets returned in time.  Reset our +		 * results. 
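For reference, tcp_get_metrics() above keys the cache on the (source, destination) pair but chooses the bucket from the destination address alone, folded into tcp_metrics_hash_log bits via hash_32(). A minimal userspace sketch of that bucketing follows; the multiplier is an arbitrary stand-in, not the kernel's hash constant.

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit destination address into 'bits' bits with a
 * multiplicative hash, mirroring the hash_32(hash, tcp_metrics_hash_log)
 * bucket selection above. The multiplier is illustrative only.
 */
static unsigned int bucket_of(uint32_t daddr, unsigned int bits)
{
	return (unsigned int)((daddr * 0x9e3779b1u) >> (32 - bits));
}

int main(void)
{
	unsigned int log = 10;			/* 1024 buckets */
	uint32_t peer = 0xc0a80001u;		/* 192.168.0.1 */

	printf("peer 0x%x -> bucket %u of %u\n",
	       (unsigned int)peer, bucket_of(peer, log), 1u << log);
	return 0;
}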
+		 */ +		tm = tcp_get_metrics(sk, dst, false); +		if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) +			tcp_metric_set(tm, TCP_METRIC_RTT, 0); +		goto out_unlock; +	} else +		tm = tcp_get_metrics(sk, dst, true); + +	if (!tm) +		goto out_unlock; + +	rtt = tcp_metric_get(tm, TCP_METRIC_RTT); +	m = rtt - tp->srtt_us; + +	/* If newly calculated rtt larger than stored one, store new +	 * one. Otherwise, use EWMA. Remember, rtt overestimation is +	 * always better than underestimation. +	 */ +	if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { +		if (m <= 0) +			rtt = tp->srtt_us; +		else +			rtt -= (m >> 3); +		tcp_metric_set(tm, TCP_METRIC_RTT, rtt); +	} + +	if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { +		unsigned long var; + +		if (m < 0) +			m = -m; + +		/* Scale deviation to rttvar fixed point */ +		m >>= 1; +		if (m < tp->mdev_us) +			m = tp->mdev_us; + +		var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); +		if (m >= var) +			var = m; +		else +			var -= (var - m) >> 2; + +		tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); +	} + +	if (tcp_in_initial_slowstart(tp)) { +		/* Slow start still did not finish. */ +		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { +			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); +			if (val && (tp->snd_cwnd >> 1) > val) +				tcp_metric_set(tm, TCP_METRIC_SSTHRESH, +					       tp->snd_cwnd >> 1); +		} +		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { +			val = tcp_metric_get(tm, TCP_METRIC_CWND); +			if (tp->snd_cwnd > val) +				tcp_metric_set(tm, TCP_METRIC_CWND, +					       tp->snd_cwnd); +		} +	} else if (tp->snd_cwnd > tp->snd_ssthresh && +		   icsk->icsk_ca_state == TCP_CA_Open) { +		/* Cong. avoidance phase, cwnd is reliable. */ +		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) +			tcp_metric_set(tm, TCP_METRIC_SSTHRESH, +				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); +		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { +			val = tcp_metric_get(tm, TCP_METRIC_CWND); +			tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); +		} +	} else { +		/* Else slow start did not finish, cwnd is non-sense, +		 * ssthresh may be also invalid. +		 */ +		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { +			val = tcp_metric_get(tm, TCP_METRIC_CWND); +			tcp_metric_set(tm, TCP_METRIC_CWND, +				       (val + tp->snd_ssthresh) >> 1); +		} +		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { +			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); +			if (val && tp->snd_ssthresh > val) +				tcp_metric_set(tm, TCP_METRIC_SSTHRESH, +					       tp->snd_ssthresh); +		} +		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { +			val = tcp_metric_get(tm, TCP_METRIC_REORDERING); +			if (val < tp->reordering && +			    tp->reordering != sysctl_tcp_reordering) +				tcp_metric_set(tm, TCP_METRIC_REORDERING, +					       tp->reordering); +		} +	} +	tm->tcpm_stamp = jiffies; +out_unlock: +	rcu_read_unlock(); +} + +/* Initialize metrics on socket. 
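tcp_update_metrics() above refreshes the cached RTT asymmetrically: a larger fresh estimate replaces the cached value outright, while a smaller one only drags it down by 1/8 per session. A self-contained sketch of that rule, using plain microseconds for readability (the kernel operates on srtt scaled by 8):

#include <stdint.h>
#include <stdio.h>

/* Cached-RTT update rule from tcp_update_metrics(): overestimates are
 * stored as-is, underestimates pull the cache down by only 1/8 per
 * update ("rtt overestimation is always better than underestimation").
 * Inputs below are invented for illustration.
 */
static uint32_t update_cached_rtt(uint32_t cached, uint32_t sample)
{
	int32_t m = (int32_t)(cached - sample);

	if (m <= 0)
		return sample;			/* new sample larger: take it */
	return cached - (uint32_t)(m >> 3);	/* smaller: EWMA with gain 1/8 */
}

int main(void)
{
	uint32_t cached = 200000;	/* 200 ms in us */

	cached = update_cached_rtt(cached, 120000);	/* -> 190000 */
	cached = update_cached_rtt(cached, 250000);	/* -> 250000 */
	printf("cached rtt = %u us\n", (unsigned int)cached);
	return 0;
}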
*/ + +void tcp_init_metrics(struct sock *sk) +{ +	struct dst_entry *dst = __sk_dst_get(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_metrics_block *tm; +	u32 val, crtt = 0; /* cached RTT scaled by 8 */ + +	if (dst == NULL) +		goto reset; + +	dst_confirm(dst); + +	rcu_read_lock(); +	tm = tcp_get_metrics(sk, dst, true); +	if (!tm) { +		rcu_read_unlock(); +		goto reset; +	} + +	if (tcp_metric_locked(tm, TCP_METRIC_CWND)) +		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); + +	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); +	if (val) { +		tp->snd_ssthresh = val; +		if (tp->snd_ssthresh > tp->snd_cwnd_clamp) +			tp->snd_ssthresh = tp->snd_cwnd_clamp; +	} else { +		/* ssthresh may have been reduced unnecessarily during. +		 * 3WHS. Restore it back to its initial default. +		 */ +		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +	} +	val = tcp_metric_get(tm, TCP_METRIC_REORDERING); +	if (val && tp->reordering != val) { +		tcp_disable_fack(tp); +		tcp_disable_early_retrans(tp); +		tp->reordering = val; +	} + +	crtt = tcp_metric_get(tm, TCP_METRIC_RTT); +	rcu_read_unlock(); +reset: +	/* The initial RTT measurement from the SYN/SYN-ACK is not ideal +	 * to seed the RTO for later data packets because SYN packets are +	 * small. Use the per-dst cached values to seed the RTO but keep +	 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). +	 * Later the RTO will be updated immediately upon obtaining the first +	 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only +	 * influences the first RTO but not later RTT estimation. +	 * +	 * But if RTT is not available from the SYN (due to retransmits or +	 * syn cookies) or the cache, force a conservative 3secs timeout. +	 * +	 * A bit of theory. RTT is time passed after "normal" sized packet +	 * is sent until it is ACKed. In normal circumstances sending small +	 * packets force peer to delay ACKs and calculation is correct too. +	 * The algorithm is adaptive and, provided we follow specs, it +	 * NEVER underestimate RTT. BUT! If peer tries to make some clever +	 * tricks sort of "quick acks" for time long enough to decrease RTT +	 * to low value, and then abruptly stops to do it and starts to delay +	 * ACKs, wait for troubles. +	 */ +	if (crtt > tp->srtt_us) { +		/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ +		crtt /= 8 * USEC_PER_MSEC; +		inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); +	} else if (tp->srtt_us == 0) { +		/* RFC6298: 5.7 We've failed to get a valid RTT sample from +		 * 3WHS. This is most likely due to retransmission, +		 * including spurious one. Reset the RTO back to 3secs +		 * from the more aggressive 1sec to avoid more spurious +		 * retransmission. +		 */ +		tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); +		tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; + +		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; +	} +	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been +	 * retransmitted. In light of RFC6298 more aggressive 1sec +	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK +	 * retransmission has occurred. 
+	 */ +	if (tp->total_retrans > 1) +		tp->snd_cwnd = 1; +	else +		tp->snd_cwnd = tcp_init_cwnd(tp, dst); +	tp->snd_cwnd_stamp = tcp_time_stamp; +} + +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) +{ +	struct tcp_metrics_block *tm; +	bool ret; + +	if (!dst) +		return false; + +	rcu_read_lock(); +	tm = __tcp_get_metrics_req(req, dst); +	if (paws_check) { +		if (tm && +		    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && +		    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) +			ret = false; +		else +			ret = true; +	} else { +		if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) +			ret = true; +		else +			ret = false; +	} +	rcu_read_unlock(); + +	return ret; +} +EXPORT_SYMBOL_GPL(tcp_peer_is_proven); + +void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) +{ +	struct tcp_metrics_block *tm; + +	rcu_read_lock(); +	tm = tcp_get_metrics(sk, dst, true); +	if (tm) { +		struct tcp_sock *tp = tcp_sk(sk); + +		if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { +			tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; +			tp->rx_opt.ts_recent = tm->tcpm_ts; +		} +	} +	rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); + +/* VJ's idea. Save last timestamp seen from this destination and hold + * it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter + * synchronized state. + */ +bool tcp_remember_stamp(struct sock *sk) +{ +	struct dst_entry *dst = __sk_dst_get(sk); +	bool ret = false; + +	if (dst) { +		struct tcp_metrics_block *tm; + +		rcu_read_lock(); +		tm = tcp_get_metrics(sk, dst, true); +		if (tm) { +			struct tcp_sock *tp = tcp_sk(sk); + +			if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || +			    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && +			     tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { +				tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; +				tm->tcpm_ts = tp->rx_opt.ts_recent; +			} +			ret = true; +		} +		rcu_read_unlock(); +	} +	return ret; +} + +bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) +{ +	struct tcp_metrics_block *tm; +	bool ret = false; + +	rcu_read_lock(); +	tm = __tcp_get_metrics_tw(tw); +	if (tm) { +		const struct tcp_timewait_sock *tcptw; +		struct sock *sk = (struct sock *) tw; + +		tcptw = tcp_twsk(sk); +		if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || +		    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && +		     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { +			tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; +			tm->tcpm_ts	   = tcptw->tw_ts_recent; +		} +		ret = true; +	} +	rcu_read_unlock(); + +	return ret; +} + +static DEFINE_SEQLOCK(fastopen_seqlock); + +void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, +			    struct tcp_fastopen_cookie *cookie, +			    int *syn_loss, unsigned long *last_syn_loss) +{ +	struct tcp_metrics_block *tm; + +	rcu_read_lock(); +	tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); +	if (tm) { +		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; +		unsigned int seq; + +		do { +			seq = read_seqbegin(&fastopen_seqlock); +			if (tfom->mss) +				*mss = tfom->mss; +			*cookie = tfom->cookie; +			*syn_loss = tfom->syn_loss; +			*last_syn_loss = *syn_loss ? 
tfom->last_syn_loss : 0; +		} while (read_seqretry(&fastopen_seqlock, seq)); +	} +	rcu_read_unlock(); +} + +void tcp_fastopen_cache_set(struct sock *sk, u16 mss, +			    struct tcp_fastopen_cookie *cookie, bool syn_lost) +{ +	struct dst_entry *dst = __sk_dst_get(sk); +	struct tcp_metrics_block *tm; + +	if (!dst) +		return; +	rcu_read_lock(); +	tm = tcp_get_metrics(sk, dst, true); +	if (tm) { +		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; + +		write_seqlock_bh(&fastopen_seqlock); +		if (mss) +			tfom->mss = mss; +		if (cookie && cookie->len > 0) +			tfom->cookie = *cookie; +		if (syn_lost) { +			++tfom->syn_loss; +			tfom->last_syn_loss = jiffies; +		} else +			tfom->syn_loss = 0; +		write_sequnlock_bh(&fastopen_seqlock); +	} +	rcu_read_unlock(); +} + +static struct genl_family tcp_metrics_nl_family = { +	.id		= GENL_ID_GENERATE, +	.hdrsize	= 0, +	.name		= TCP_METRICS_GENL_NAME, +	.version	= TCP_METRICS_GENL_VERSION, +	.maxattr	= TCP_METRICS_ATTR_MAX, +	.netnsok	= true, +}; + +static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { +	[TCP_METRICS_ATTR_ADDR_IPV4]	= { .type = NLA_U32, }, +	[TCP_METRICS_ATTR_ADDR_IPV6]	= { .type = NLA_BINARY, +					    .len = sizeof(struct in6_addr), }, +	/* Following attributes are not received for GET/DEL, +	 * we keep them for reference +	 */ +#if 0 +	[TCP_METRICS_ATTR_AGE]		= { .type = NLA_MSECS, }, +	[TCP_METRICS_ATTR_TW_TSVAL]	= { .type = NLA_U32, }, +	[TCP_METRICS_ATTR_TW_TS_STAMP]	= { .type = NLA_S32, }, +	[TCP_METRICS_ATTR_VALS]		= { .type = NLA_NESTED, }, +	[TCP_METRICS_ATTR_FOPEN_MSS]	= { .type = NLA_U16, }, +	[TCP_METRICS_ATTR_FOPEN_SYN_DROPS]	= { .type = NLA_U16, }, +	[TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]	= { .type = NLA_MSECS, }, +	[TCP_METRICS_ATTR_FOPEN_COOKIE]	= { .type = NLA_BINARY, +					    .len = TCP_FASTOPEN_COOKIE_MAX, }, +#endif +}; + +/* Add attributes, caller cancels its header on failure */ +static int tcp_metrics_fill_info(struct sk_buff *msg, +				 struct tcp_metrics_block *tm) +{ +	struct nlattr *nest; +	int i; + +	switch (tm->tcpm_daddr.family) { +	case AF_INET: +		if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, +				tm->tcpm_daddr.addr.a4) < 0) +			goto nla_put_failure; +		if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, +				tm->tcpm_saddr.addr.a4) < 0) +			goto nla_put_failure; +		break; +	case AF_INET6: +		if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, +			    tm->tcpm_daddr.addr.a6) < 0) +			goto nla_put_failure; +		if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, +			    tm->tcpm_saddr.addr.a6) < 0) +			goto nla_put_failure; +		break; +	default: +		return -EAFNOSUPPORT; +	} + +	if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, +			  jiffies - tm->tcpm_stamp) < 0) +		goto nla_put_failure; +	if (tm->tcpm_ts_stamp) { +		if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP, +				(s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0) +			goto nla_put_failure; +		if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL, +				tm->tcpm_ts) < 0) +			goto nla_put_failure; +	} + +	{ +		int n = 0; + +		nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); +		if (!nest) +			goto nla_put_failure; +		for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { +			u32 val = tm->tcpm_vals[i]; + +			if (!val) +				continue; +			if (i == TCP_METRIC_RTT) { +				if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, +						val) < 0) +					goto nla_put_failure; +				n++; +				val = max(val / 1000, 1U); +			} +			if (i == TCP_METRIC_RTTVAR) { +				if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, +						val) < 0) +					goto nla_put_failure; +				n++; +				val 
= max(val / 1000, 1U); +			} +			if (nla_put_u32(msg, i + 1, val) < 0) +				goto nla_put_failure; +			n++; +		} +		if (n) +			nla_nest_end(msg, nest); +		else +			nla_nest_cancel(msg, nest); +	} + +	{ +		struct tcp_fastopen_metrics tfom_copy[1], *tfom; +		unsigned int seq; + +		do { +			seq = read_seqbegin(&fastopen_seqlock); +			tfom_copy[0] = tm->tcpm_fastopen; +		} while (read_seqretry(&fastopen_seqlock, seq)); + +		tfom = tfom_copy; +		if (tfom->mss && +		    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS, +				tfom->mss) < 0) +			goto nla_put_failure; +		if (tfom->syn_loss && +		    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, +				tfom->syn_loss) < 0 || +		     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, +				jiffies - tfom->last_syn_loss) < 0)) +			goto nla_put_failure; +		if (tfom->cookie.len > 0 && +		    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, +			    tfom->cookie.len, tfom->cookie.val) < 0) +			goto nla_put_failure; +	} + +	return 0; + +nla_put_failure: +	return -EMSGSIZE; +} + +static int tcp_metrics_dump_info(struct sk_buff *skb, +				 struct netlink_callback *cb, +				 struct tcp_metrics_block *tm) +{ +	void *hdr; + +	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, +			  &tcp_metrics_nl_family, NLM_F_MULTI, +			  TCP_METRICS_CMD_GET); +	if (!hdr) +		return -EMSGSIZE; + +	if (tcp_metrics_fill_info(skb, tm) < 0) +		goto nla_put_failure; + +	return genlmsg_end(skb, hdr); + +nla_put_failure: +	genlmsg_cancel(skb, hdr); +	return -EMSGSIZE; +} + +static int tcp_metrics_nl_dump(struct sk_buff *skb, +			       struct netlink_callback *cb) +{ +	struct net *net = sock_net(skb->sk); +	unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; +	unsigned int row, s_row = cb->args[0]; +	int s_col = cb->args[1], col = s_col; + +	for (row = s_row; row < max_rows; row++, s_col = 0) { +		struct tcp_metrics_block *tm; +		struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; + +		rcu_read_lock(); +		for (col = 0, tm = rcu_dereference(hb->chain); tm; +		     tm = rcu_dereference(tm->tcpm_next), col++) { +			if (col < s_col) +				continue; +			if (tcp_metrics_dump_info(skb, cb, tm) < 0) { +				rcu_read_unlock(); +				goto done; +			} +		} +		rcu_read_unlock(); +	} + +done: +	cb->args[0] = row; +	cb->args[1] = col; +	return skb->len; +} + +static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, +			   unsigned int *hash, int optional, int v4, int v6) +{ +	struct nlattr *a; + +	a = info->attrs[v4]; +	if (a) { +		addr->family = AF_INET; +		addr->addr.a4 = nla_get_be32(a); +		if (hash) +			*hash = (__force unsigned int) addr->addr.a4; +		return 0; +	} +	a = info->attrs[v6]; +	if (a) { +		if (nla_len(a) != sizeof(struct in6_addr)) +			return -EINVAL; +		addr->family = AF_INET6; +		memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); +		if (hash) +			*hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); +		return 0; +	} +	return optional ? 
1 : -EAFNOSUPPORT; +} + +static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, +			 unsigned int *hash, int optional) +{ +	return __parse_nl_addr(info, addr, hash, optional, +			       TCP_METRICS_ATTR_ADDR_IPV4, +			       TCP_METRICS_ATTR_ADDR_IPV6); +} + +static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) +{ +	return __parse_nl_addr(info, addr, NULL, 0, +			       TCP_METRICS_ATTR_SADDR_IPV4, +			       TCP_METRICS_ATTR_SADDR_IPV6); +} + +static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ +	struct tcp_metrics_block *tm; +	struct inetpeer_addr saddr, daddr; +	unsigned int hash; +	struct sk_buff *msg; +	struct net *net = genl_info_net(info); +	void *reply; +	int ret; +	bool src = true; + +	ret = parse_nl_addr(info, &daddr, &hash, 0); +	if (ret < 0) +		return ret; + +	ret = parse_nl_saddr(info, &saddr); +	if (ret < 0) +		src = false; + +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!msg) +		return -ENOMEM; + +	reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0, +				  info->genlhdr->cmd); +	if (!reply) +		goto nla_put_failure; + +	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); +	ret = -ESRCH; +	rcu_read_lock(); +	for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; +	     tm = rcu_dereference(tm->tcpm_next)) { +		if (addr_same(&tm->tcpm_daddr, &daddr) && +		    (!src || addr_same(&tm->tcpm_saddr, &saddr))) { +			ret = tcp_metrics_fill_info(msg, tm); +			break; +		} +	} +	rcu_read_unlock(); +	if (ret < 0) +		goto out_free; + +	genlmsg_end(msg, reply); +	return genlmsg_reply(msg, info); + +nla_put_failure: +	ret = -EMSGSIZE; + +out_free: +	nlmsg_free(msg); +	return ret; +} + +#define deref_locked_genl(p)	\ +	rcu_dereference_protected(p, lockdep_genl_is_held() && \ +				     lockdep_is_held(&tcp_metrics_lock)) + +#define deref_genl(p)	rcu_dereference_protected(p, lockdep_genl_is_held()) + +static int tcp_metrics_flush_all(struct net *net) +{ +	unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; +	struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; +	struct tcp_metrics_block *tm; +	unsigned int row; + +	for (row = 0; row < max_rows; row++, hb++) { +		spin_lock_bh(&tcp_metrics_lock); +		tm = deref_locked_genl(hb->chain); +		if (tm) +			hb->chain = NULL; +		spin_unlock_bh(&tcp_metrics_lock); +		while (tm) { +			struct tcp_metrics_block *next; + +			next = deref_genl(tm->tcpm_next); +			kfree_rcu(tm, rcu_head); +			tm = next; +		} +	} +	return 0; +} + +static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ +	struct tcpm_hash_bucket *hb; +	struct tcp_metrics_block *tm; +	struct tcp_metrics_block __rcu **pp; +	struct inetpeer_addr saddr, daddr; +	unsigned int hash; +	struct net *net = genl_info_net(info); +	int ret; +	bool src = true, found = false; + +	ret = parse_nl_addr(info, &daddr, &hash, 1); +	if (ret < 0) +		return ret; +	if (ret > 0) +		return tcp_metrics_flush_all(net); +	ret = parse_nl_saddr(info, &saddr); +	if (ret < 0) +		src = false; + +	hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); +	hb = net->ipv4.tcp_metrics_hash + hash; +	pp = &hb->chain; +	spin_lock_bh(&tcp_metrics_lock); +	for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { +		if (addr_same(&tm->tcpm_daddr, &daddr) && +		    (!src || addr_same(&tm->tcpm_saddr, &saddr))) { +			*pp = tm->tcpm_next; +			kfree_rcu(tm, rcu_head); +			found = true; +		} else { +			pp = &tm->tcpm_next; +		} +	} +	spin_unlock_bh(&tcp_metrics_lock); +	if (!found) 
+		return -ESRCH; +	return 0; +} + +static const struct genl_ops tcp_metrics_nl_ops[] = { +	{ +		.cmd = TCP_METRICS_CMD_GET, +		.doit = tcp_metrics_nl_cmd_get, +		.dumpit = tcp_metrics_nl_dump, +		.policy = tcp_metrics_nl_policy, +		.flags = GENL_ADMIN_PERM, +	}, +	{ +		.cmd = TCP_METRICS_CMD_DEL, +		.doit = tcp_metrics_nl_cmd_del, +		.policy = tcp_metrics_nl_policy, +		.flags = GENL_ADMIN_PERM, +	}, +}; + +static unsigned int tcpmhash_entries; +static int __init set_tcpmhash_entries(char *str) +{ +	ssize_t ret; + +	if (!str) +		return 0; + +	ret = kstrtouint(str, 0, &tcpmhash_entries); +	if (ret) +		return 0; + +	return 1; +} +__setup("tcpmhash_entries=", set_tcpmhash_entries); + +static int __net_init tcp_net_metrics_init(struct net *net) +{ +	size_t size; +	unsigned int slots; + +	slots = tcpmhash_entries; +	if (!slots) { +		if (totalram_pages >= 128 * 1024) +			slots = 16 * 1024; +		else +			slots = 8 * 1024; +	} + +	net->ipv4.tcp_metrics_hash_log = order_base_2(slots); +	size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; + +	net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); +	if (!net->ipv4.tcp_metrics_hash) +		net->ipv4.tcp_metrics_hash = vzalloc(size); + +	if (!net->ipv4.tcp_metrics_hash) +		return -ENOMEM; + +	return 0; +} + +static void __net_exit tcp_net_metrics_exit(struct net *net) +{ +	unsigned int i; + +	for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { +		struct tcp_metrics_block *tm, *next; + +		tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); +		while (tm) { +			next = rcu_dereference_protected(tm->tcpm_next, 1); +			kfree(tm); +			tm = next; +		} +	} +	kvfree(net->ipv4.tcp_metrics_hash); +} + +static __net_initdata struct pernet_operations tcp_net_metrics_ops = { +	.init	=	tcp_net_metrics_init, +	.exit	=	tcp_net_metrics_exit, +}; + +void __init tcp_metrics_init(void) +{ +	int ret; + +	ret = register_pernet_subsys(&tcp_net_metrics_ops); +	if (ret < 0) +		goto cleanup; +	ret = genl_register_family_with_ops(&tcp_metrics_nl_family, +					    tcp_metrics_nl_ops); +	if (ret < 0) +		goto cleanup_subsys; +	return; + +cleanup_subsys: +	unregister_pernet_subsys(&tcp_net_metrics_ops); + +cleanup: +	return; +} diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3527b51d615..e68e0d4af6c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -49,62 +49,12 @@ struct inet_timewait_death_row tcp_death_row = {  };  EXPORT_SYMBOL_GPL(tcp_death_row); -/* VJ's idea. Save last timestamp seen from this destination - * and hold it at least for normal timewait interval to use for duplicate - * segment detection in subsequent connections, before they enter synchronized - * state. 
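tcp_net_metrics_init() above defaults to 16K or 8K slots depending on available memory, rounds the count up to a power of two, and stores only the log2 in tcp_metrics_hash_log. A rough userspace sketch of that sizing arithmetic, with order_base_2() reimplemented as a stand-in:

#include <stdio.h>

/* Hash sizing as in tcp_net_metrics_init(): round the slot count up to
 * a power of two and keep only the exponent. order_base_2() here is a
 * simple stand-in for the kernel helper of the same name.
 */
static unsigned int order_base_2(unsigned long n)
{
	unsigned int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned long slots = 16 * 1024;		/* default on larger machines */
	unsigned int log = order_base_2(slots);		/* 14 */
	unsigned long size = sizeof(void *) << log;	/* one pointer-sized bucket per slot */

	printf("hash_log=%u, table=%lu bytes\n", log, size);
	return 0;
}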
- */ - -static int tcp_remember_stamp(struct sock *sk) -{ -	const struct inet_connection_sock *icsk = inet_csk(sk); -	struct tcp_sock *tp = tcp_sk(sk); -	struct inet_peer *peer; -	bool release_it; - -	peer = icsk->icsk_af_ops->get_peer(sk, &release_it); -	if (peer) { -		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || -		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && -		     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { -			peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; -			peer->tcp_ts = tp->rx_opt.ts_recent; -		} -		if (release_it) -			inet_putpeer(peer); -		return 1; -	} - -	return 0; -} - -static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw) -{ -	struct sock *sk = (struct sock *) tw; -	struct inet_peer *peer; - -	peer = twsk_getpeer(sk); -	if (peer) { -		const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - -		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || -		    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && -		     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { -			peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; -			peer->tcp_ts	   = tcptw->tw_ts_recent; -		} -		inet_putpeer(peer); -		return 1; -	} -	return 0; -} - -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)  {  	if (seq == s_win) -		return 1; +		return true;  	if (after(end_seq, s_win) && before(seq, e_win)) -		return 1; +		return true;  	return seq == e_win && seq == end_seq;  } @@ -135,21 +85,23 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)   * spinlock it. I do not want! Well, probability of misbehaviour   * is ridiculously low and, seems, we could use some mb() tricks   * to avoid misread sequence numbers, states etc.  
--ANK + * + * We don't need to initialize tmp_out.sack_ok as we don't use the results   */  enum tcp_tw_status  tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,  			   const struct tcphdr *th)  {  	struct tcp_options_received tmp_opt; -	u8 *hash_location;  	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); -	int paws_reject = 0; +	bool paws_reject = false;  	tmp_opt.saw_tstamp = 0;  	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { -		tcp_parse_options(skb, &tmp_opt, &hash_location, 0); +		tcp_parse_options(skb, &tmp_opt, 0, NULL);  		if (tmp_opt.saw_tstamp) { +			tmp_opt.rcv_tsecr	-= tcptw->tw_ts_offset;  			tmp_opt.ts_recent	= tcptw->tw_ts_recent;  			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;  			paws_reject = tcp_paws_reject(&tmp_opt, th->rst); @@ -316,7 +268,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)  	struct inet_timewait_sock *tw = NULL;  	const struct inet_connection_sock *icsk = inet_csk(sk);  	const struct tcp_sock *tp = tcp_sk(sk); -	int recycle_ok = 0; +	bool recycle_ok = false;  	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)  		recycle_ok = tcp_remember_stamp(sk); @@ -327,23 +279,25 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)  	if (tw != NULL) {  		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);  		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); +		struct inet_sock *inet = inet_sk(sk); +		tw->tw_transparent	= inet->transparent;  		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;  		tcptw->tw_rcv_nxt	= tp->rcv_nxt;  		tcptw->tw_snd_nxt	= tp->snd_nxt;  		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);  		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;  		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; +		tcptw->tw_ts_offset	= tp->tsoffset; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  		if (tw->tw_family == PF_INET6) {  			struct ipv6_pinfo *np = inet6_sk(sk); -			struct inet6_timewait_sock *tw6; -			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); -			tw6 = inet6_twsk((struct sock *)tw); -			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); -			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); +			tw->tw_v6_daddr = sk->sk_v6_daddr; +			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +			tw->tw_tclass = np->tclass; +			tw->tw_flowlabel = np->flow_label >> 12;  			tw->tw_ipv6only = np->ipv6only;  		}  #endif @@ -357,13 +311,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)  		 */  		do {  			struct tcp_md5sig_key *key; -			memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key)); -			tcptw->tw_md5_keylen = 0; +			tcptw->tw_md5_key = NULL;  			key = tp->af_specific->md5_lookup(sk, sk);  			if (key != NULL) { -				memcpy(&tcptw->tw_md5_key, key->key, key->keylen); -				tcptw->tw_md5_keylen = key->keylen; -				if (tcp_alloc_md5sig_pool(sk) == NULL) +				tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); +				if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())  					BUG();  			}  		} while (0); @@ -392,7 +344,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)  		 * socket up.  We've got bigger problems than  		 * non-graceful socket closings.  		 
*/ -		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);  	}  	tcp_update_metrics(sk); @@ -403,12 +355,44 @@ void tcp_twsk_destructor(struct sock *sk)  {  #ifdef CONFIG_TCP_MD5SIG  	struct tcp_timewait_sock *twsk = tcp_twsk(sk); -	if (twsk->tw_md5_keylen) -		tcp_free_md5sig_pool(); + +	if (twsk->tw_md5_key) +		kfree_rcu(twsk->tw_md5_key, rcu);  #endif  }  EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +void tcp_openreq_init_rwin(struct request_sock *req, +			   struct sock *sk, struct dst_entry *dst) +{ +	struct inet_request_sock *ireq = inet_rsk(req); +	struct tcp_sock *tp = tcp_sk(sk); +	__u8 rcv_wscale; +	int mss = dst_metric_advmss(dst); + +	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) +		mss = tp->rx_opt.user_mss; + +	/* Set this up on the first call only */ +	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + +	/* limit the window selection if the user enforce a smaller rx buffer */ +	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && +	    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) +		req->window_clamp = tcp_full_space(sk); + +	/* tcp_full_space because it is guaranteed to be the first packet */ +	tcp_select_initial_window(tcp_full_space(sk), +		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), +		&req->rcv_wnd, +		&req->window_clamp, +		ireq->wscale_ok, +		&rcv_wscale, +		dst_metric(dst, RTAX_INITRWND)); +	ireq->rcv_wscale = rcv_wscale; +} +EXPORT_SYMBOL(tcp_openreq_init_rwin); +  static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,  					 struct request_sock *req)  { @@ -423,39 +407,13 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,   */  struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)  { -	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); +	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);  	if (newsk != NULL) {  		const struct inet_request_sock *ireq = inet_rsk(req);  		struct tcp_request_sock *treq = tcp_rsk(req);  		struct inet_connection_sock *newicsk = inet_csk(newsk);  		struct tcp_sock *newtp = tcp_sk(newsk); -		struct tcp_sock *oldtp = tcp_sk(sk); -		struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - -		/* TCP Cookie Transactions require space for the cookie pair, -		 * as it differs for each connection.  There is no need to -		 * copy any s_data_payload stored at the original socket. -		 * Failure will prevent resuming the connection. 
-		 * -		 * Presumed copied, in order of appearance: -		 *	cookie_in_always, cookie_out_never -		 */ -		if (oldcvp != NULL) { -			struct tcp_cookie_values *newcvp = -				kzalloc(sizeof(*newtp->cookie_values), -					GFP_ATOMIC); - -			if (newcvp != NULL) { -				kref_init(&newcvp->kref); -				newcvp->cookie_desired = -						oldcvp->cookie_desired; -				newtp->cookie_values = newcvp; -			} else { -				/* Not Yet Implemented */ -				newtp->cookie_values = NULL; -			} -		}  		/* Now setup tcp_sock */  		newtp->pred_flags = 0; @@ -464,15 +422,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,  		newtp->rcv_nxt = treq->rcv_isn + 1;  		newtp->snd_sml = newtp->snd_una = -		newtp->snd_nxt = newtp->snd_up = -			treq->snt_isn + 1 + tcp_s_data_size(oldtp); +		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;  		tcp_prequeue_init(newtp); +		INIT_LIST_HEAD(&newtp->tsq_node);  		tcp_init_wl(newtp, treq->rcv_isn); -		newtp->srtt = 0; -		newtp->mdev = TCP_TIMEOUT_INIT; +		newtp->srtt_us = 0; +		newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);  		newicsk->icsk_rto = TCP_TIMEOUT_INIT;  		newtp->packets_out = 0; @@ -480,26 +438,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,  		newtp->sacked_out = 0;  		newtp->fackets_out = 0;  		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +		tcp_enable_early_retrans(newtp); +		newtp->tlp_high_seq = 0; +		newtp->lsndtime = treq->snt_synack; +		newtp->total_retrans = req->num_retrans;  		/* So many TCP implementations out there (incorrectly) count the  		 * initial SYN frame in their delayed-ACK and congestion control  		 * algorithms that we must have the following bandaid to talk  		 * efficiently to them.  -DaveM  		 */ -		newtp->snd_cwnd = 2; +		newtp->snd_cwnd = TCP_INIT_CWND;  		newtp->snd_cwnd_cnt = 0; -		newtp->bytes_acked = 0; -		newtp->frto_counter = 0; -		newtp->frto_highmark = 0; - -		newicsk->icsk_ca_ops = &tcp_init_congestion_ops; +		if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && +		    !try_module_get(newicsk->icsk_ca_ops->owner)) +			newicsk->icsk_ca_ops = &tcp_init_congestion_ops;  		tcp_set_ca_state(newsk, TCP_CA_Open);  		tcp_init_xmit_timers(newsk); -		skb_queue_head_init(&newtp->out_of_order_queue); -		newtp->write_seq = newtp->pushed_seq = -			treq->snt_isn + 1 + tcp_s_data_size(oldtp); +		__skb_queue_head_init(&newtp->out_of_order_queue); +		newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;  		newtp->rx_opt.saw_tstamp = 0; @@ -540,6 +499,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,  			newtp->rx_opt.ts_recent_stamp = 0;  			newtp->tcp_header_len = sizeof(struct tcphdr);  		} +		newtp->tsoffset = 0;  #ifdef CONFIG_TCP_MD5SIG  		newtp->md5sig_info = NULL;	/*XXX*/  		if (newtp->af_specific->md5_lookup(sk, newsk)) @@ -549,6 +509,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,  			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;  		newtp->rx_opt.mss_clamp = req->mss;  		TCP_ECN_openreq_child(newtp, req); +		newtp->fastopen_rsk = NULL; +		newtp->syn_data_acked = 0;  		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);  	} @@ -557,24 +519,32 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,  EXPORT_SYMBOL(tcp_create_openreq_child);  /* - *	Process an incoming packet for SYN_RECV sockets represented - *	as a request_sock. + * Process an incoming packet for SYN_RECV sockets represented as a + * request_sock. 
Normally sk is the listener socket but for TFO it + * points to the child socket. + * + * XXX (TFO) - The current impl contains a special check for ack + * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? + * + * We don't need to initialize tmp_opt.sack_ok as we don't use the results   */  struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,  			   struct request_sock *req, -			   struct request_sock **prev) +			   struct request_sock **prev, +			   bool fastopen)  {  	struct tcp_options_received tmp_opt; -	u8 *hash_location;  	struct sock *child;  	const struct tcphdr *th = tcp_hdr(skb);  	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); -	int paws_reject = 0; +	bool paws_reject = false; + +	BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));  	tmp_opt.saw_tstamp = 0;  	if (th->doff > (sizeof(struct tcphdr)>>2)) { -		tcp_parse_options(skb, &tmp_opt, &hash_location, 0); +		tcp_parse_options(skb, &tmp_opt, 0, NULL);  		if (tmp_opt.saw_tstamp) {  			tmp_opt.ts_recent = req->ts_recent; @@ -582,7 +552,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,  			 * it can be estimated (approximately)  			 * from another data.  			 */ -			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); +			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);  			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);  		}  	} @@ -607,8 +577,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,  		 *  		 * Enforce "SYN-ACK" according to figure 8, figure 6  		 * of RFC793, fixed by RFC1122. +		 * +		 * Note that even if there is new data in the SYN packet +		 * they will be thrown away too. +		 * +		 * Reset timer after retransmitting SYNACK, similar to +		 * the idea of fast retransmit in recovery.  		 */ -		req->rsk_ops->rtx_syn_ack(sk, req, NULL); +		if (!inet_rtx_syn_ack(sk, req)) +			req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, +					   TCP_RTO_MAX) + jiffies;  		return NULL;  	} @@ -664,11 +642,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,  	 *                  sent (the segment carries an unacceptable ACK) ...  	 *                  a reset is sent."  	 * -	 * Invalid ACK: reset will be sent by listening socket +	 * Invalid ACK: reset will be sent by listening socket. +	 * Note that the ACK validity check for a Fast Open socket is done +	 * elsewhere and is checked directly against the child socket rather +	 * than req because user data may have been sent out.  	 */ -	if ((flg & TCP_FLAG_ACK) && +	if ((flg & TCP_FLAG_ACK) && !fastopen &&  	    (TCP_SKB_CB(skb)->ack_seq != -	     tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) +	     tcp_rsk(req)->snt_isn + 1))  		return sk;  	/* Also, it would be not so bad idea to check rcv_tsecr, which @@ -679,7 +660,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,  	/* RFC793: "first check sequence number". */  	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, -					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { +					  tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {  		/* Out of window: send ACK and drop. */  		if (!(flg & TCP_FLAG_RST))  			req->rsk_ops->send_ack(sk, skb, req); @@ -690,7 +671,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,  	/* In sequence, PAWS is OK. 
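For the pure-SYN-retransmit path in tcp_check_req() above, the SYN-ACK is resent and the request's expiry is pushed out by min(TCP_TIMEOUT_INIT << num_timeout, TCP_RTO_MAX). A small sketch of that backoff in seconds, assuming the usual 1 s / 120 s values of those constants:

#include <stdio.h>

/* SYN-ACK rearm backoff from tcp_check_req(): the timeout doubles with
 * each expiry and is clamped at the maximum RTO. Expressed in seconds,
 * assuming TCP_TIMEOUT_INIT = 1s and TCP_RTO_MAX = 120s.
 */
static unsigned int synack_rearm(unsigned int num_timeout)
{
	unsigned int t = 1u << num_timeout;	/* TCP_TIMEOUT_INIT << num_timeout */

	return t < 120 ? t : 120;		/* min(..., TCP_RTO_MAX) */
}

int main(void)
{
	unsigned int n;

	for (n = 0; n <= 8; n++)
		printf("after %u timeouts: rearm in %us\n", n, synack_rearm(n));
	return 0;
}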
*/ -	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) +	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))  		req->ts_recent = tmp_opt.rcv_tsval;  	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { @@ -709,12 +690,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,  	/* ACK sequence verified above, just make sure ACK is  	 * set.  If ACK not set, just silently drop the packet. +	 * +	 * XXX (TFO) - if we ever allow "data after SYN", the +	 * following check needs to be removed.  	 */  	if (!(flg & TCP_FLAG_ACK))  		return NULL; +	/* For Fast Open no more processing is needed (sk is the +	 * child socket). +	 */ +	if (fastopen) +		return sk; +  	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ -	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && +	if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&  	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {  		inet_rsk(req)->acked = 1;  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); @@ -744,11 +734,21 @@ listen_overflow:  	}  embryonic_reset: -	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); -	if (!(flg & TCP_FLAG_RST)) +	if (!(flg & TCP_FLAG_RST)) { +		/* Received a bad SYN pkt - for TFO We try not to reset +		 * the local connection unless it's really necessary to +		 * avoid becoming vulnerable to outside attack aiming at +		 * resetting legit local connections. +		 */  		req->rsk_ops->send_reset(sk, skb); - -	inet_csk_reqsk_queue_drop(sk, req, prev); +	} else if (fastopen) { /* received a valid RST pkt */ +		reqsk_fastopen_remove(sk, req, true); +		tcp_reset(sk); +	} +	if (!fastopen) { +		inet_csk_reqsk_queue_drop(sk, req, prev); +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); +	}  	return NULL;  }  EXPORT_SYMBOL(tcp_check_req); @@ -757,6 +757,12 @@ EXPORT_SYMBOL(tcp_check_req);   * Queue segment on the new socket if the new socket is active,   * otherwise we just shortcircuit this and continue with   * the new socket. + * + * For the vast majority of cases child->sk_state will be TCP_SYN_RECV + * when entering. But other states are possible due to a race condition + * where after __inet_lookup_established() fails but before the listener + * locked is obtained, other packets cause the same connection to + * be created.   */  int tcp_child_process(struct sock *parent, struct sock *child, @@ -770,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,  					    skb->len);  		/* Wakeup parent, send SIGIO */  		if (state == TCP_SYN_RECV && child->sk_state != state) -			parent->sk_data_ready(parent, 0); +			parent->sk_data_ready(parent);  	} else {  		/* Alas, it is possible again, because we do lookup  		 * in main socket hash table and lock on listening diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c new file mode 100644 index 00000000000..55046ecd083 --- /dev/null +++ b/net/ipv4/tcp_offload.c @@ -0,0 +1,329 @@ +/* + *	IPV4 GSO/GRO offload support + *	Linux INET implementation + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
+ * + *	TCPv4 GSO/GRO support + */ + +#include <linux/skbuff.h> +#include <net/tcp.h> +#include <net/protocol.h> + +struct sk_buff *tcp_gso_segment(struct sk_buff *skb, +				netdev_features_t features) +{ +	struct sk_buff *segs = ERR_PTR(-EINVAL); +	unsigned int sum_truesize = 0; +	struct tcphdr *th; +	unsigned int thlen; +	unsigned int seq; +	__be32 delta; +	unsigned int oldlen; +	unsigned int mss; +	struct sk_buff *gso_skb = skb; +	__sum16 newcheck; +	bool ooo_okay, copy_destructor; + +	if (!pskb_may_pull(skb, sizeof(*th))) +		goto out; + +	th = tcp_hdr(skb); +	thlen = th->doff * 4; +	if (thlen < sizeof(*th)) +		goto out; + +	if (!pskb_may_pull(skb, thlen)) +		goto out; + +	oldlen = (u16)~skb->len; +	__skb_pull(skb, thlen); + +	mss = tcp_skb_mss(skb); +	if (unlikely(skb->len <= mss)) +		goto out; + +	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { +		/* Packet is from an untrusted source, reset gso_segs. */ +		int type = skb_shinfo(skb)->gso_type; + +		if (unlikely(type & +			     ~(SKB_GSO_TCPV4 | +			       SKB_GSO_DODGY | +			       SKB_GSO_TCP_ECN | +			       SKB_GSO_TCPV6 | +			       SKB_GSO_GRE | +			       SKB_GSO_GRE_CSUM | +			       SKB_GSO_IPIP | +			       SKB_GSO_SIT | +			       SKB_GSO_MPLS | +			       SKB_GSO_UDP_TUNNEL | +			       SKB_GSO_UDP_TUNNEL_CSUM | +			       0) || +			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) +			goto out; + +		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + +		segs = NULL; +		goto out; +	} + +	copy_destructor = gso_skb->destructor == tcp_wfree; +	ooo_okay = gso_skb->ooo_okay; +	/* All segments but the first should have ooo_okay cleared */ +	skb->ooo_okay = 0; + +	segs = skb_segment(skb, features); +	if (IS_ERR(segs)) +		goto out; + +	/* Only first segment might have ooo_okay set */ +	segs->ooo_okay = ooo_okay; + +	delta = htonl(oldlen + (thlen + mss)); + +	skb = segs; +	th = tcp_hdr(skb); +	seq = ntohl(th->seq); + +	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + +					       (__force u32)delta)); + +	do { +		th->fin = th->psh = 0; +		th->check = newcheck; + +		if (skb->ip_summed != CHECKSUM_PARTIAL) +			th->check = gso_make_checksum(skb, ~th->check); + +		seq += mss; +		if (copy_destructor) { +			skb->destructor = gso_skb->destructor; +			skb->sk = gso_skb->sk; +			sum_truesize += skb->truesize; +		} +		skb = skb->next; +		th = tcp_hdr(skb); + +		th->seq = htonl(seq); +		th->cwr = 0; +	} while (skb->next); + +	/* Following permits TCP Small Queues to work well with GSO : +	 * The callback to TCP stack will be called at the time last frag +	 * is freed at TX completion, and not right now when gso_skb +	 * is freed by GSO engine +	 */ +	if (copy_destructor) { +		swap(gso_skb->sk, skb->sk); +		swap(gso_skb->destructor, skb->destructor); +		sum_truesize += skb->truesize; +		atomic_add(sum_truesize - gso_skb->truesize, +			   &skb->sk->sk_wmem_alloc); +	} + +	delta = htonl(oldlen + (skb_tail_pointer(skb) - +				skb_transport_header(skb)) + +		      skb->data_len); +	th->check = ~csum_fold((__force __wsum)((__force u32)th->check + +				(__force u32)delta)); +	if (skb->ip_summed != CHECKSUM_PARTIAL) +		th->check = gso_make_checksum(skb, ~th->check); +out: +	return segs; +} + +struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ +	struct sk_buff **pp = NULL; +	struct sk_buff *p; +	struct tcphdr *th; +	struct tcphdr *th2; +	unsigned int len; +	unsigned int thlen; +	__be32 flags; +	unsigned int mss = 1; +	unsigned int hlen; +	unsigned int off; +	int flush = 1; +	int i; + +	off = 
skb_gro_offset(skb); +	hlen = off + sizeof(*th); +	th = skb_gro_header_fast(skb, off); +	if (skb_gro_header_hard(skb, hlen)) { +		th = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!th)) +			goto out; +	} + +	thlen = th->doff * 4; +	if (thlen < sizeof(*th)) +		goto out; + +	hlen = off + thlen; +	if (skb_gro_header_hard(skb, hlen)) { +		th = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!th)) +			goto out; +	} + +	skb_gro_pull(skb, thlen); + +	len = skb_gro_len(skb); +	flags = tcp_flag_word(th); + +	for (; (p = *head); head = &p->next) { +		if (!NAPI_GRO_CB(p)->same_flow) +			continue; + +		th2 = tcp_hdr(p); + +		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} + +		goto found; +	} + +	goto out_check_final; + +found: +	/* Include the IP ID check below from the inner most IP hdr */ +	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; +	flush |= (__force int)(flags & TCP_FLAG_CWR); +	flush |= (__force int)((flags ^ tcp_flag_word(th2)) & +		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); +	flush |= (__force int)(th->ack_seq ^ th2->ack_seq); +	for (i = sizeof(*th); i < thlen; i += 4) +		flush |= *(u32 *)((u8 *)th + i) ^ +			 *(u32 *)((u8 *)th2 + i); + +	mss = tcp_skb_mss(p); + +	flush |= (len - 1) >= mss; +	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); + +	if (flush || skb_gro_receive(head, skb)) { +		mss = 1; +		goto out_check_final; +	} + +	p = *head; +	th2 = tcp_hdr(p); +	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); + +out_check_final: +	flush = len < mss; +	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | +					TCP_FLAG_RST | TCP_FLAG_SYN | +					TCP_FLAG_FIN)); + +	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) +		pp = head; + +out: +	NAPI_GRO_CB(skb)->flush |= (flush != 0); + +	return pp; +} + +int tcp_gro_complete(struct sk_buff *skb) +{ +	struct tcphdr *th = tcp_hdr(skb); + +	skb->csum_start = (unsigned char *)th - skb->head; +	skb->csum_offset = offsetof(struct tcphdr, check); +	skb->ip_summed = CHECKSUM_PARTIAL; + +	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + +	if (th->cwr) +		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + +	return 0; +} +EXPORT_SYMBOL(tcp_gro_complete); + +static int tcp_v4_gso_send_check(struct sk_buff *skb) +{ +	const struct iphdr *iph; +	struct tcphdr *th; + +	if (!pskb_may_pull(skb, sizeof(*th))) +		return -EINVAL; + +	iph = ip_hdr(skb); +	th = tcp_hdr(skb); + +	th->check = 0; +	skb->ip_summed = CHECKSUM_PARTIAL; +	__tcp_v4_send_check(skb, iph->saddr, iph->daddr); +	return 0; +} + +static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ +	/* Use the IP hdr immediately proceeding for this transport */ +	const struct iphdr *iph = skb_gro_network_header(skb); +	__wsum wsum; + +	/* Don't bother verifying checksum if we're going to flush anyway. 
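tcp_gro_receive() above matches flows by comparing the adjacent 16-bit source and destination ports as a single 32-bit word. A standalone sketch of that comparison; the struct is a trimmed stand-in for struct tcphdr, and memcpy() stands in for the kernel's direct cast to stay within strict-aliasing rules:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Port matching as in tcp_gro_receive(): source and destination ports
 * are adjacent 16-bit fields, so both compare in one 32-bit operation.
 * 'struct hdr' is a trimmed stand-in for struct tcphdr; memcpy() stands
 * in for the kernel's *(u32 *)&th->source access.
 */
struct hdr {
	uint16_t source;
	uint16_t dest;
};

static int same_ports(const struct hdr *a, const struct hdr *b)
{
	uint32_t wa, wb;

	memcpy(&wa, &a->source, sizeof(wa));	/* covers source + dest */
	memcpy(&wb, &b->source, sizeof(wb));
	return (wa ^ wb) == 0;
}

int main(void)
{
	struct hdr a = { .source = 443, .dest = 51000 };
	struct hdr b = { .source = 443, .dest = 51001 };

	printf("same flow: %d\n", same_ports(&a, &b));	/* 0: dest port differs */
	return 0;
}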
*/ +	if (NAPI_GRO_CB(skb)->flush) +		goto skip_csum; + +	wsum = NAPI_GRO_CB(skb)->csum; + +	switch (skb->ip_summed) { +	case CHECKSUM_NONE: +		wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), +				    0); + +		/* fall through */ + +	case CHECKSUM_COMPLETE: +		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, +				  wsum)) { +			skb->ip_summed = CHECKSUM_UNNECESSARY; +			break; +		} + +		NAPI_GRO_CB(skb)->flush = 1; +		return NULL; +	} + +skip_csum: +	return tcp_gro_receive(head, skb); +} + +static int tcp4_gro_complete(struct sk_buff *skb, int thoff) +{ +	const struct iphdr *iph = ip_hdr(skb); +	struct tcphdr *th = tcp_hdr(skb); + +	th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, +				  iph->daddr, 0); +	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + +	return tcp_gro_complete(skb); +} + +static const struct net_offload tcpv4_offload = { +	.callbacks = { +		.gso_send_check	=	tcp_v4_gso_send_check, +		.gso_segment	=	tcp_gso_segment, +		.gro_receive	=	tcp4_gro_receive, +		.gro_complete	=	tcp4_gro_complete, +	}, +}; + +int __init tcpv4_offload_init(void) +{ +	return inet_add_offload(&tcpv4_offload, IPPROTO_TCP); +} diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 749b6498588..179b51e6bda 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -34,6 +34,8 @@   *   */ +#define pr_fmt(fmt) "TCP: " fmt +  #include <net/tcp.h>  #include <linux/compiler.h> @@ -48,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;   */  int sysctl_tcp_workaround_signed_windows __read_mostly = 0; +/* Default TSQ limit of two TSO segments */ +int sysctl_tcp_limit_output_bytes __read_mostly = 131072; +  /* This limits the percentage of the congestion window which we   * will allow a single TSO frame to consume.  Building TSO frames   * which are too large can cause TCP streams to be bursty. @@ -60,27 +65,30 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;  /* By default, RFC2861 behavior.  */  int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ -EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +			   int push_one, gfp_t gfp);  /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)  { +	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	unsigned int prior_packets = tp->packets_out;  	tcp_advance_send_head(sk, skb);  	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; -	/* Don't override Nagle indefinately with F-RTO */ -	if (tp->frto_counter == 2) -		tp->frto_counter = 3; -  	tp->packets_out += tcp_skb_pcount(skb); -	if (!prior_packets) -		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, -					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || +	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { +		tcp_rearm_rto(sk); +	} + +	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, +		      tcp_skb_pcount(skb));  }  /* SND.NXT, if window was not shrunk. @@ -89,9 +97,9 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)   * Anything in between SND.UNA...SND.UNA+SND.WND also can be already   * invalid. 
OK, let's make this for now:   */ -static inline __u32 tcp_acceptable_seq(struct sock *sk) +static inline __u32 tcp_acceptable_seq(const struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	if (!before(tcp_wnd_end(tp), tp->snd_nxt))  		return tp->snd_nxt; @@ -116,12 +124,16 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk)  static __u16 tcp_advertise_mss(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); -	struct dst_entry *dst = __sk_dst_get(sk); +	const struct dst_entry *dst = __sk_dst_get(sk);  	int mss = tp->advmss; -	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { -		mss = dst_metric(dst, RTAX_ADVMSS); -		tp->advmss = mss; +	if (dst) { +		unsigned int metric = dst_metric_advmss(dst); + +		if (metric < mss) { +			mss = metric; +			tp->advmss = mss; +		}  	}  	return (__u16)mss; @@ -129,7 +141,7 @@ static __u16 tcp_advertise_mss(struct sock *sk)  /* RFC2861. Reset CWND after idle period longer RTO to "restart window".   * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)  {  	struct tcp_sock *tp = tcp_sk(sk);  	s32 delta = tcp_time_stamp - tp->lsndtime; @@ -150,10 +162,11 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)  /* Congestion state accounting after a packet has been sent. */  static void tcp_event_data_sent(struct tcp_sock *tp, -				struct sk_buff *skb, struct sock *sk) +				struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	const u32 now = tcp_time_stamp; +	const struct dst_entry *dst = __sk_dst_get(sk);  	if (sysctl_tcp_slow_start_after_idle &&  	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -164,8 +177,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,  	/* If it is a reply for ato after last received  	 * packet, enter pingpong mode.  	 */ -	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) -		icsk->icsk_ack.pingpong = 1; +	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && +	    (!dst || !dst_metric(dst, RTAX_QUICKACK))) +			icsk->icsk_ack.pingpong = 1;  }  /* Account for an ACK we sent. */ @@ -175,6 +189,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)  	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);  } + +u32 tcp_default_init_rwnd(u32 mss) +{ +	/* Initial receive window should be twice of TCP_INIT_CWND to +	 * enable proper sending of new unsent data during fast recovery +	 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a +	 * limit when mss is larger than 1460. +	 */ +	u32 init_rwnd = TCP_INIT_CWND * 2; + +	if (mss > 1460) +		init_rwnd = max((1460 * init_rwnd) / mss, 2U); +	return init_rwnd; +} +  /* Determine a window scaling and initial window to offer.   * Based on the assumption that the given amount of space   * will be offered. Store the results in the tp structure. @@ -224,18 +253,10 @@ void tcp_select_initial_window(int __space, __u32 mss,  		}  	} -	/* Set initial window to value enough for senders, following RFC5681. 
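
tcp_default_init_rwnd() above sizes the initial receive window at twice TCP_INIT_CWND segments, and shrinks it proportionally once the MSS exceeds 1460 bytes so the byte count stays roughly constant. A quick arithmetic sketch, assuming TCP_INIT_CWND is 10 segments:

	#include <stdio.h>

	#define TCP_INIT_CWND 10	/* assumed initial congestion window, in segments */

	static unsigned int default_init_rwnd(unsigned int mss)
	{
		unsigned int init_rwnd = TCP_INIT_CWND * 2;	/* 20 segments */

		if (mss > 1460)
			init_rwnd = (1460 * init_rwnd) / mss;	/* hold ~29200 bytes */
		return init_rwnd < 2 ? 2 : init_rwnd;
	}

	int main(void)
	{
		unsigned int mss[] = { 536, 1460, 9000 };

		for (int i = 0; i < 3; i++)
			printf("mss %4u -> %2u segments (%u bytes)\n", mss[i],
			       default_init_rwnd(mss[i]),
			       mss[i] * default_init_rwnd(mss[i]));
		return 0;
	}

With a 9000-byte MSS the window drops to 3 segments, keeping the advertised byte count close to the 1460-byte case rather than tripling it.
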
*/  	if (mss > (1 << *rcv_wscale)) { -		int init_cwnd = rfc3390_bytes_to_packets(mss); - -		/* when initializing use the value from init_rcv_wnd -		 * rather than the default from above -		 */ -		if (init_rcv_wnd && -		    (*rcv_wnd > init_rcv_wnd * mss)) -			*rcv_wnd = init_rcv_wnd * mss; -		else if (*rcv_wnd > init_cwnd * mss) -			*rcv_wnd = init_cwnd * mss; +		if (!init_rcv_wnd) /* Use default unless specified otherwise */ +			init_rcv_wnd = tcp_default_init_rwnd(mss); +		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);  	}  	/* Set the clamp no higher than max representable value */ @@ -251,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);  static u16 tcp_select_window(struct sock *sk)  {  	struct tcp_sock *tp = tcp_sk(sk); +	u32 old_win = tp->rcv_wnd;  	u32 cur_win = tcp_receive_window(tp);  	u32 new_win = __tcp_select_window(sk); @@ -263,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)  		 *  		 * Relax Will Robinson.  		 */ +		if (new_win == 0) +			NET_INC_STATS(sock_net(sk), +				      LINUX_MIB_TCPWANTZEROWINDOWADV);  		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);  	}  	tp->rcv_wnd = new_win; @@ -280,18 +305,24 @@ static u16 tcp_select_window(struct sock *sk)  	new_win >>= tp->rx_opt.rcv_wscale;  	/* If we advertise zero window, disable fast path. */ -	if (new_win == 0) +	if (new_win == 0) {  		tp->pred_flags = 0; +		if (old_win) +			NET_INC_STATS(sock_net(sk), +				      LINUX_MIB_TCPTOZEROWINDOWADV); +	} else if (old_win == 0) { +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); +	}  	return new_win;  }  /* Packet ECN state for a SYN-ACK */ -static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) +static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)  { -	TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR; +	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;  	if (!(tp->ecn_flags & TCP_ECN_OK)) -		TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE; +		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;  }  /* Packet ECN state for a SYN.  
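
tcp_select_window() above has to squeeze the receive window into the 16-bit header field, so the value actually sent is the window right-shifted by rcv_wscale, and a window that would otherwise shrink is rounded up to the scale granularity (the ALIGN() call). A small sketch of that quantisation with hypothetical numbers:

	#include <stdio.h>

	/* Round up to a multiple of 'a' (a power of two), like the kernel's ALIGN(). */
	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned int rcv_wscale = 7;	/* granularity: 128 bytes */
		unsigned int cur_win = 65000;	/* window currently advertised */
		unsigned int new_win = 64900;	/* shrank a little: not allowed */

		if (new_win < cur_win)		/* never shrink the offered window */
			new_win = ALIGN_UP(cur_win, 1u << rcv_wscale);

		printf("wire field: %u (receiver recovers %u bytes)\n",
		       new_win >> rcv_wscale, (new_win >> rcv_wscale) << rcv_wscale);
		return 0;
	}
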
*/ @@ -300,14 +331,14 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)  	struct tcp_sock *tp = tcp_sk(sk);  	tp->ecn_flags = 0; -	if (sysctl_tcp_ecn == 1) { -		TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR; +	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;  		tp->ecn_flags = TCP_ECN_OK;  	}  }  static __inline__ void -TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)  {  	if (inet_rsk(req)->ecn_ok)  		th->ece = 1; @@ -345,15 +376,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,   */  static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)  { +	struct skb_shared_info *shinfo = skb_shinfo(skb); +  	skb->ip_summed = CHECKSUM_PARTIAL;  	skb->csum = 0; -	TCP_SKB_CB(skb)->flags = flags; +	TCP_SKB_CB(skb)->tcp_flags = flags;  	TCP_SKB_CB(skb)->sacked = 0; -	skb_shinfo(skb)->gso_segs = 1; -	skb_shinfo(skb)->gso_size = 0; -	skb_shinfo(skb)->gso_type = 0; +	shinfo->gso_segs = 1; +	shinfo->gso_size = 0; +	shinfo->gso_type = 0;  	TCP_SKB_CB(skb)->seq = seq;  	if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -361,7 +394,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)  	TCP_SKB_CB(skb)->end_seq = seq;  } -static inline int tcp_urg_mode(const struct tcp_sock *tp) +static inline bool tcp_urg_mode(const struct tcp_sock *tp)  {  	return tp->snd_una != tp->snd_up;  } @@ -370,51 +403,25 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)  #define OPTION_TS		(1 << 1)  #define OPTION_MD5		(1 << 2)  #define OPTION_WSCALE		(1 << 3) -#define OPTION_COOKIE_EXTENSION	(1 << 4) +#define OPTION_FAST_OPEN_COOKIE	(1 << 8)  struct tcp_out_options { -	u8 options;		/* bit field of OPTION_* */ +	u16 options;		/* bit field of OPTION_* */ +	u16 mss;		/* 0 to disable */  	u8 ws;			/* window scale, 0 to disable */  	u8 num_sack_blocks;	/* number of SACK blocks to include */  	u8 hash_size;		/* bytes in hash_location */ -	u16 mss;		/* 0 to disable */ -	__u32 tsval, tsecr;	/* need to include OPTION_TS */  	__u8 *hash_location;	/* temporary pointer, overloaded */ +	__u32 tsval, tsecr;	/* need to include OPTION_TS */ +	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */  }; -/* The sysctl int routines are generic, so check consistency here. - */ -static u8 tcp_cookie_size_check(u8 desired) -{ -	if (desired > 0) { -		/* previously specified */ -		return desired; -	} -	if (sysctl_tcp_cookie_size <= 0) { -		/* no default specified */ -		return 0; -	} -	if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { -		/* value too small, specify minimum */ -		return TCP_COOKIE_MIN; -	} -	if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { -		/* value too large, specify maximum */ -		return TCP_COOKIE_MAX; -	} -	if (0x1 & sysctl_tcp_cookie_size) { -		/* 8-bit multiple, illegal, fix it */ -		return (u8)(sysctl_tcp_cookie_size + 0x1); -	} -	return (u8)sysctl_tcp_cookie_size; -} -  /* Write previously computed TCP options to the packet.   *   * Beware: Something in the Internet is very sensitive to the ordering of   * TCP options, we learned this through the hard way, so be careful here.   
* Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with   * the ordering which we have been using if we want to keep working with   * those broken things (not that it currently hurts anybody as there isn't   * particular reason why the ordering would need to be changed). @@ -425,29 +432,11 @@ static u8 tcp_cookie_size_check(u8 desired)  static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,  			      struct tcp_out_options *opts)  { -	u8 options = opts->options;	/* mungable copy */ +	u16 options = opts->options;	/* mungable copy */ -	/* Having both authentication and cookies for security is redundant, -	 * and there's certainly not enough room.  Instead, the cookie-less -	 * extension variant is proposed. -	 * -	 * Consider the pessimal case with authentication.  The options -	 * could look like: -	 *   COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 -	 */  	if (unlikely(OPTION_MD5 & options)) { -		if (unlikely(OPTION_COOKIE_EXTENSION & options)) { -			*ptr++ = htonl((TCPOPT_COOKIE << 24) | -				       (TCPOLEN_COOKIE_BASE << 16) | -				       (TCPOPT_MD5SIG << 8) | -				       TCPOLEN_MD5SIG); -		} else { -			*ptr++ = htonl((TCPOPT_NOP << 24) | -				       (TCPOPT_NOP << 16) | -				       (TCPOPT_MD5SIG << 8) | -				       TCPOLEN_MD5SIG); -		} -		options &= ~OPTION_COOKIE_EXTENSION; +		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | +			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);  		/* overload cookie hash location */  		opts->hash_location = (__u8 *)ptr;  		ptr += 4; @@ -476,44 +465,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,  		*ptr++ = htonl(opts->tsecr);  	} -	/* Specification requires after timestamp, so do it now. -	 * -	 * Consider the pessimal case without authentication.  The options -	 * could look like: -	 *   MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 -	 */ -	if (unlikely(OPTION_COOKIE_EXTENSION & options)) { -		__u8 *cookie_copy = opts->hash_location; -		u8 cookie_size = opts->hash_size; - -		/* 8-bit multiple handled in tcp_cookie_size_check() above, -		 * and elsewhere. -		 */ -		if (0x2 & cookie_size) { -			__u8 *p = (__u8 *)ptr; - -			/* 16-bit multiple */ -			*p++ = TCPOPT_COOKIE; -			*p++ = TCPOLEN_COOKIE_BASE + cookie_size; -			*p++ = *cookie_copy++; -			*p++ = *cookie_copy++; -			ptr++; -			cookie_size -= 2; -		} else { -			/* 32-bit multiple */ -			*ptr++ = htonl(((TCPOPT_NOP << 24) | -					(TCPOPT_NOP << 16) | -					(TCPOPT_COOKIE << 8) | -					TCPOLEN_COOKIE_BASE) + -				       cookie_size); -		} - -		if (cookie_size > 0) { -			memcpy(ptr, cookie_copy, cookie_size); -			ptr += (cookie_size / 4); -		} -	} -  	if (unlikely(OPTION_SACK_ADVERTISE & options)) {  		*ptr++ = htonl((TCPOPT_NOP << 24) |  			       (TCPOPT_NOP << 16) | @@ -547,20 +498,33 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,  		tp->rx_opt.dsack = 0;  	} + +	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { +		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; + +		*ptr++ = htonl((TCPOPT_EXP << 24) | +			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) | +			       TCPOPT_FASTOPEN_MAGIC); + +		memcpy(ptr, foc->val, foc->len); +		if ((foc->len & 3) == 2) { +			u8 *align = ((u8 *)ptr) + foc->len; +			align[0] = align[1] = TCPOPT_NOP; +		} +		ptr += (foc->len + 3) >> 2; +	}  }  /* Compute TCP options for SYN packets. 
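
The new OPTION_FAST_OPEN_COOKIE branch in tcp_options_write() emits the cookie as an experimental TCP option: the kind byte, a length covering the 4-byte base plus the cookie, the Fast Open magic, the cookie itself, and two NOPs when the cookie length leaves a 2-byte hole in the last word. A user-space sketch of the same layout (the kind and magic constants are assumptions matching the experimental Fast Open draft):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define TCPOPT_EXP		254	/* experimental option kind */
	#define TCPOPT_NOP		1
	#define TCPOLEN_EXP_FASTOPEN_BASE 4	/* kind + len + 16-bit magic */
	#define TCPOPT_FASTOPEN_MAGIC	0xF989

	/* Lay out the option; return the option space consumed (padded to 4). */
	static int write_fastopen_opt(uint8_t *p, const uint8_t *cookie, int len)
	{
		int optlen = TCPOLEN_EXP_FASTOPEN_BASE + len;

		p[0] = TCPOPT_EXP;
		p[1] = optlen;
		p[2] = TCPOPT_FASTOPEN_MAGIC >> 8;
		p[3] = TCPOPT_FASTOPEN_MAGIC & 0xff;
		memcpy(p + 4, cookie, len);
		if ((len & 3) == 2) {		/* two bytes short of a word */
			p[4 + len] = TCPOPT_NOP;
			p[5 + len] = TCPOPT_NOP;
		}
		return (optlen + 3) & ~3;
	}

	int main(void)
	{
		uint8_t buf[20] = { 0 };
		uint8_t cookie[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

		printf("option consumes %d of the 40 option bytes\n",
		       write_fastopen_opt(buf, cookie, sizeof(cookie)));
		return 0;
	}
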
This is not the final   * network wire format yet.   */ -static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,  				struct tcp_out_options *opts, -				struct tcp_md5sig_key **md5) { +				struct tcp_md5sig_key **md5) +{  	struct tcp_sock *tp = tcp_sk(sk); -	struct tcp_cookie_values *cvp = tp->cookie_values; -	unsigned remaining = MAX_TCP_OPTION_SPACE; -	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? -			 tcp_cookie_size_check(cvp->cookie_desired) : -			 0; +	unsigned int remaining = MAX_TCP_OPTION_SPACE; +	struct tcp_fastopen_request *fastopen = tp->fastopen_req;  #ifdef CONFIG_TCP_MD5SIG  	*md5 = tp->af_specific->md5_lookup(sk, sk); @@ -586,7 +550,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,  	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {  		opts->options |= OPTION_TS; -		opts->tsval = TCP_SKB_CB(skb)->when; +		opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;  		opts->tsecr = tp->rx_opt.ts_recent;  		remaining -= TCPOLEN_TSTAMP_ALIGNED;  	} @@ -601,68 +565,30 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,  			remaining -= TCPOLEN_SACKPERM_ALIGNED;  	} -	/* Note that timestamps are required by the specification. -	 * -	 * Odd numbers of bytes are prohibited by the specification, ensuring -	 * that the cookie is 16-bit aligned, and the resulting cookie pair is -	 * 32-bit aligned. -	 */ -	if (*md5 == NULL && -	    (OPTION_TS & opts->options) && -	    cookie_size > 0) { -		int need = TCPOLEN_COOKIE_BASE + cookie_size; - -		if (0x2 & need) { -			/* 32-bit multiple */ -			need += 2; /* NOPs */ - -			if (need > remaining) { -				/* try shrinking cookie to fit */ -				cookie_size -= 2; -				need -= 4; -			} -		} -		while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { -			cookie_size -= 4; -			need -= 4; -		} -		if (TCP_COOKIE_MIN <= cookie_size) { -			opts->options |= OPTION_COOKIE_EXTENSION; -			opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; -			opts->hash_size = cookie_size; - -			/* Remember for future incarnations. */ -			cvp->cookie_desired = cookie_size; - -			if (cvp->cookie_desired != cvp->cookie_pair_size) { -				/* Currently use random bytes as a nonce, -				 * assuming these are completely unpredictable -				 * by hostile users of the same system. -				 */ -				get_random_bytes(&cvp->cookie_pair[0], -						 cookie_size); -				cvp->cookie_pair_size = cookie_size; -			} - +	if (fastopen && fastopen->cookie.len >= 0) { +		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len; +		need = (need + 3) & ~3U;  /* Align to 32 bits */ +		if (remaining >= need) { +			opts->options |= OPTION_FAST_OPEN_COOKIE; +			opts->fastopen_cookie = &fastopen->cookie;  			remaining -= need; +			tp->syn_fastopen = 1;  		}  	} +  	return MAX_TCP_OPTION_SPACE - remaining;  }  /* Set up TCP options for SYN-ACKs. */ -static unsigned tcp_synack_options(struct sock *sk, +static unsigned int tcp_synack_options(struct sock *sk,  				   struct request_sock *req, -				   unsigned mss, struct sk_buff *skb, +				   unsigned int mss, struct sk_buff *skb,  				   struct tcp_out_options *opts,  				   struct tcp_md5sig_key **md5, -				   struct tcp_extend_values *xvp) +				   struct tcp_fastopen_cookie *foc)  {  	struct inet_request_sock *ireq = inet_rsk(req); -	unsigned remaining = MAX_TCP_OPTION_SPACE; -	u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? 
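
tcp_syn_options() above works against a fixed budget, MAX_TCP_OPTION_SPACE (40 bytes): each option's aligned size is subtracted, and the Fast Open cookie is only added if its rounded-up need still fits. A back-of-the-envelope sketch of that accounting (the aligned sizes are assumptions matching the usual values; SACK-permitted rides in the same word as timestamps, so it costs nothing extra here):

	#include <stdio.h>

	#define MAX_TCP_OPTION_SPACE	40	/* TCP header may grow from 20 to 60 bytes */

	int main(void)
	{
		int remaining = MAX_TCP_OPTION_SPACE;
		int cookie_len = 8;				/* Fast Open cookie bytes */
		int need = (4 /* kind, len, magic */ + cookie_len + 3) & ~3;

		remaining -= 4;		/* MSS */
		remaining -= 12;	/* timestamps (SACK-permitted shares this word) */
		remaining -= 4;		/* window scale */

		printf("%d bytes left, cookie needs %d -> %s\n", remaining, need,
		       remaining >= need ? "cookie sent on the SYN" : "cookie dropped");
		return 0;
	}
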
-			 xvp->cookie_plus : -			 0; +	unsigned int remaining = MAX_TCP_OPTION_SPACE;  #ifdef CONFIG_TCP_MD5SIG  	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); @@ -701,43 +627,33 @@ static unsigned tcp_synack_options(struct sock *sk,  		if (unlikely(!ireq->tstamp_ok))  			remaining -= TCPOLEN_SACKPERM_ALIGNED;  	} - -	/* Similar rationale to tcp_syn_options() applies here, too. -	 * If the <SYN> options fit, the same options should fit now! -	 */ -	if (*md5 == NULL && -	    ireq->tstamp_ok && -	    cookie_plus > TCPOLEN_COOKIE_BASE) { -		int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ - -		if (0x2 & need) { -			/* 32-bit multiple */ -			need += 2; /* NOPs */ -		} -		if (need <= remaining) { -			opts->options |= OPTION_COOKIE_EXTENSION; -			opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; +	if (foc != NULL && foc->len >= 0) { +		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; +		need = (need + 3) & ~3U;  /* Align to 32 bits */ +		if (remaining >= need) { +			opts->options |= OPTION_FAST_OPEN_COOKIE; +			opts->fastopen_cookie = foc;  			remaining -= need; -		} else { -			/* There's no error return, so flag it. */ -			xvp->cookie_out_never = 1; /* true */ -			opts->hash_size = 0;  		}  	} +  	return MAX_TCP_OPTION_SPACE - remaining;  }  /* Compute TCP options for ESTABLISHED sockets. This is not the   * final wire format yet.   */ -static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,  					struct tcp_out_options *opts, -					struct tcp_md5sig_key **md5) { +					struct tcp_md5sig_key **md5) +{  	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;  	struct tcp_sock *tp = tcp_sk(sk); -	unsigned size = 0; +	unsigned int size = 0;  	unsigned int eff_sacks; +	opts->options = 0; +  #ifdef CONFIG_TCP_MD5SIG  	*md5 = tp->af_specific->md5_lookup(sk, sk);  	if (unlikely(*md5)) { @@ -750,16 +666,16 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,  	if (likely(tp->rx_opt.tstamp_ok)) {  		opts->options |= OPTION_TS; -		opts->tsval = tcb ? tcb->when : 0; +		opts->tsval = tcb ? tcb->when + tp->tsoffset : 0;  		opts->tsecr = tp->rx_opt.ts_recent;  		size += TCPOLEN_TSTAMP_ALIGNED;  	}  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;  	if (unlikely(eff_sacks)) { -		const unsigned remaining = MAX_TCP_OPTION_SPACE - size; +		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;  		opts->num_sack_blocks = -			min_t(unsigned, eff_sacks, +			min_t(unsigned int, eff_sacks,  			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /  			      TCPOLEN_SACK_PERBLOCK);  		size += TCPOLEN_SACK_BASE_ALIGNED + @@ -769,6 +685,172 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,  	return size;  } + +/* TCP SMALL QUEUES (TSQ) + * + * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) + * to reduce RTT and bufferbloat. + * We do this using a special skb destructor (tcp_wfree). + * + * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb + * needs to be reallocated in a driver. + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc + * + * Since transmit from skb destructor is forbidden, we use a tasklet + * to process all sockets that eventually need to send more skbs. + * We use one tasklet per cpu, with its own queue of sockets. 
+ */ +struct tsq_tasklet { +	struct tasklet_struct	tasklet; +	struct list_head	head; /* queue of tcp sockets */ +}; +static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); + +static void tcp_tsq_handler(struct sock *sk) +{ +	if ((1 << sk->sk_state) & +	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | +	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) +		tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, +			       0, GFP_ATOMIC); +} +/* + * One tasklet per cpu tries to send more skbs. + * We run in tasklet context but need to disable irqs when + * transferring tsq->head because tcp_wfree() might + * interrupt us (non NAPI drivers) + */ +static void tcp_tasklet_func(unsigned long data) +{ +	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; +	LIST_HEAD(list); +	unsigned long flags; +	struct list_head *q, *n; +	struct tcp_sock *tp; +	struct sock *sk; + +	local_irq_save(flags); +	list_splice_init(&tsq->head, &list); +	local_irq_restore(flags); + +	list_for_each_safe(q, n, &list) { +		tp = list_entry(q, struct tcp_sock, tsq_node); +		list_del(&tp->tsq_node); + +		sk = (struct sock *)tp; +		bh_lock_sock(sk); + +		if (!sock_owned_by_user(sk)) { +			tcp_tsq_handler(sk); +		} else { +			/* defer the work to tcp_release_cb() */ +			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags); +		} +		bh_unlock_sock(sk); + +		clear_bit(TSQ_QUEUED, &tp->tsq_flags); +		sk_free(sk); +	} +} + +#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\ +			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\ +			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\ +			  (1UL << TCP_MTU_REDUCED_DEFERRED)) +/** + * tcp_release_cb - tcp release_sock() callback + * @sk: socket + * + * called from release_sock() to perform protocol dependent + * actions before socket release. + */ +void tcp_release_cb(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	unsigned long flags, nflags; + +	/* perform an atomic operation only if at least one flag is set */ +	do { +		flags = tp->tsq_flags; +		if (!(flags & TCP_DEFERRED_ALL)) +			return; +		nflags = flags & ~TCP_DEFERRED_ALL; +	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags); + +	if (flags & (1UL << TCP_TSQ_DEFERRED)) +		tcp_tsq_handler(sk); + +	/* Here begins the tricky part : +	 * We are called from release_sock() with : +	 * 1) BH disabled +	 * 2) sk_lock.slock spinlock held +	 * 3) socket owned by us (sk->sk_lock.owned == 1) +	 * +	 * But following code is meant to be called from BH handlers, +	 * so we should keep BH disabled, but early release socket ownership +	 */ +	sock_release_ownership(sk); + +	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { +		tcp_write_timer_handler(sk); +		__sock_put(sk); +	} +	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { +		tcp_delack_timer_handler(sk); +		__sock_put(sk); +	} +	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { +		sk->sk_prot->mtu_reduced(sk); +		__sock_put(sk); +	} +} +EXPORT_SYMBOL(tcp_release_cb); + +void __init tcp_tasklet_init(void) +{ +	int i; + +	for_each_possible_cpu(i) { +		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + +		INIT_LIST_HEAD(&tsq->head); +		tasklet_init(&tsq->tasklet, +			     tcp_tasklet_func, +			     (unsigned long)tsq); +	} +} + +/* + * Write buffer destructor automatically called from kfree_skb. + * We can't xmit new skbs from this context, as we might already + * hold qdisc lock. 
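
tcp_release_cb() above claims all deferred-work bits in one shot with a cmpxchg loop, so a timer handler deferring work concurrently can neither be lost nor run twice. The same idiom expressed with portable C11 atomics (a sketch of the pattern, not the kernel API):

	#include <stdatomic.h>
	#include <stdio.h>

	#define TSQ_DEFERRED		(1UL << 0)
	#define WRITE_TIMER_DEFERRED	(1UL << 1)
	#define DELACK_TIMER_DEFERRED	(1UL << 2)
	#define DEFERRED_ALL (TSQ_DEFERRED | WRITE_TIMER_DEFERRED | DELACK_TIMER_DEFERRED)

	static _Atomic unsigned long tsq_flags;

	/* Atomically take ownership of every deferred bit currently set. */
	static unsigned long claim_deferred(void)
	{
		unsigned long old = atomic_load(&tsq_flags), new;

		do {
			if (!(old & DEFERRED_ALL))
				return 0;		/* nothing pending */
			new = old & ~DEFERRED_ALL;
		} while (!atomic_compare_exchange_weak(&tsq_flags, &old, new));

		return old & DEFERRED_ALL;		/* bits we now own */
	}

	int main(void)
	{
		atomic_fetch_or(&tsq_flags, TSQ_DEFERRED | DELACK_TIMER_DEFERRED);
		printf("claimed 0x%lx, then 0x%lx\n", claim_deferred(), claim_deferred());
		return 0;
	}

The second call returns 0: once claimed, the deferred work cannot be picked up again until someone sets a flag anew.
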
+ */ +void tcp_wfree(struct sk_buff *skb) +{ +	struct sock *sk = skb->sk; +	struct tcp_sock *tp = tcp_sk(sk); + +	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && +	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { +		unsigned long flags; +		struct tsq_tasklet *tsq; + +		/* Keep a ref on socket. +		 * This last ref will be released in tcp_tasklet_func() +		 */ +		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); + +		/* queue this socket to tasklet queue */ +		local_irq_save(flags); +		tsq = &__get_cpu_var(tsq_tasklet); +		list_add(&tp->tsq_node, &tsq->head); +		tasklet_schedule(&tsq->tasklet); +		local_irq_restore(flags); +	} else { +		sock_wfree(skb); +	} +} +  /* This routine actually transmits TCP packets queued in by   * tcp_do_sendmsg().  This is used by both the initial   * transmission and possible later retransmissions. @@ -788,26 +870,24 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	struct tcp_sock *tp;  	struct tcp_skb_cb *tcb;  	struct tcp_out_options opts; -	unsigned tcp_options_size, tcp_header_size; +	unsigned int tcp_options_size, tcp_header_size;  	struct tcp_md5sig_key *md5;  	struct tcphdr *th;  	int err;  	BUG_ON(!skb || !tcp_skb_pcount(skb)); -	/* If congestion control is doing timestamping, we must -	 * take such a timestamp before we potentially clone/copy. -	 */ -	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) -		__net_timestamp(skb); +	if (clone_it) { +		skb_mstamp_get(&skb->skb_mstamp); -	if (likely(clone_it)) {  		if (unlikely(skb_cloned(skb)))  			skb = pskb_copy(skb, gfp_mask);  		else  			skb = skb_clone(skb, gfp_mask);  		if (unlikely(!skb))  			return -ENOBUFS; +		/* Our usage of tstamp should remain private */ +		skb->tstamp.tv64 = 0;  	}  	inet = inet_sk(sk); @@ -815,22 +895,28 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	tcb = TCP_SKB_CB(skb);  	memset(&opts, 0, sizeof(opts)); -	if (unlikely(tcb->flags & TCPHDR_SYN)) +	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);  	else  		tcp_options_size = tcp_established_options(sk, skb, &opts,  							   &md5);  	tcp_header_size = tcp_options_size + sizeof(struct tcphdr); -	if (tcp_packets_in_flight(tp) == 0) { +	if (tcp_packets_in_flight(tp) == 0)  		tcp_ca_event(sk, CA_EVENT_TX_START); -		skb->ooo_okay = 1; -	} else -		skb->ooo_okay = 0; + +	/* if no packet is in qdisc/device queue, then allow XPS to select +	 * another queue. +	 */ +	skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;  	skb_push(skb, tcp_header_size);  	skb_reset_transport_header(skb); -	skb_set_owner_w(skb, sk); + +	skb_orphan(skb); +	skb->sk = sk; +	skb->destructor = tcp_wfree; +	atomic_add(skb->truesize, &sk->sk_wmem_alloc);  	/* Build TCP header and checksum it. */  	th = tcp_hdr(skb); @@ -839,9 +925,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	th->seq			= htonl(tcb->seq);  	th->ack_seq		= htonl(tp->rcv_nxt);  	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | -					tcb->flags); +					tcb->tcp_flags); -	if (unlikely(tcb->flags & TCPHDR_SYN)) { +	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {  		/* RFC1323: The window in SYN & SYN/ACK segments  		 * is never scaled.  		 
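
In tcp_transmit_skb() above, the data offset and flag bits are written as one 16-bit word: ((tcp_header_size >> 2) << 12) | tcb->tcp_flags covers bytes 12-13 of the TCP header. A tiny sketch of that packing, with the common flag values assumed:

	#include <stdio.h>

	#define TCPHDR_SYN 0x02
	#define TCPHDR_ACK 0x10

	int main(void)
	{
		unsigned int tcp_header_size = 40;	/* 20-byte header + 20 option bytes */
		unsigned int word = ((tcp_header_size >> 2) << 12) |
				    (TCPHDR_SYN | TCPHDR_ACK);

		printf("doff=%u words, flags=0x%02x, wire word=0x%04x\n",
		       word >> 12, word & 0x1ff, word);
		return 0;
	}
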
*/ @@ -864,7 +950,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	}  	tcp_options_write((__be32 *)(th + 1), tp, &opts); -	if (likely((tcb->flags & TCPHDR_SYN) == 0)) +	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))  		TCP_ECN_send(sk, skb, tcp_header_size);  #ifdef CONFIG_TCP_MD5SIG @@ -878,17 +964,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,  	icsk->icsk_af_ops->send_check(sk, skb); -	if (likely(tcb->flags & TCPHDR_ACK)) +	if (likely(tcb->tcp_flags & TCPHDR_ACK))  		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));  	if (skb->len != tcp_header_size) -		tcp_event_data_sent(tp, skb, sk); +		tcp_event_data_sent(tp, sk);  	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)  		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,  			      tcp_skb_pcount(skb)); -	err = icsk->icsk_af_ops->queue_xmit(skb); +	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);  	if (likely(err <= 0))  		return err; @@ -915,28 +1001,32 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)  }  /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, +static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,  				 unsigned int mss_now)  { -	if (skb->len <= mss_now || !sk_can_gso(sk) || -	    skb->ip_summed == CHECKSUM_NONE) { +	struct skb_shared_info *shinfo = skb_shinfo(skb); + +	/* Make sure we own this skb before messing gso_size/gso_segs */ +	WARN_ON_ONCE(skb_cloned(skb)); + +	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {  		/* Avoid the costly divide in the normal  		 * non-TSO case.  		 */ -		skb_shinfo(skb)->gso_segs = 1; -		skb_shinfo(skb)->gso_size = 0; -		skb_shinfo(skb)->gso_type = 0; +		shinfo->gso_segs = 1; +		shinfo->gso_size = 0; +		shinfo->gso_type = 0;  	} else { -		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); -		skb_shinfo(skb)->gso_size = mss_now; -		skb_shinfo(skb)->gso_type = sk->sk_gso_type; +		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); +		shinfo->gso_size = mss_now; +		shinfo->gso_type = sk->sk_gso_type;  	}  }  /* When a modification to fackets out becomes necessary, we need to check   * skb is counted to fackets_out or not.   */ -static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, +static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,  				   int decr)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -951,7 +1041,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,  /* Pcount in the middle of the write queue got changed, we need to do various   * tweaks to fix counters   */ -static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr) +static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -984,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)   * Remember, these are still headerless SKBs at this point.   
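
tcp_fragment(), which follows, splits one queued skb into two at a byte offset; the interesting part is the sequence-number bookkeeping and the rule that PSH/FIN stay only on the second half. A sketch of that arithmetic on plain integers (the struct is a stand-in, not the kernel's):

	#include <stdint.h>
	#include <stdio.h>

	#define FIN 0x01
	#define PSH 0x08

	struct seg { uint32_t seq, end_seq; unsigned int flags; };

	/* Split 'skb' so the first part keeps 'len' bytes; 'buff' gets the rest. */
	static void split(struct seg *skb, struct seg *buff, uint32_t len)
	{
		buff->seq = skb->seq + len;
		buff->end_seq = skb->end_seq;
		skb->end_seq = buff->seq;

		buff->flags = skb->flags;		/* PSH/FIN only on 2nd part */
		skb->flags &= ~(FIN | PSH);
	}

	int main(void)
	{
		struct seg skb = { 1000, 4000, PSH | FIN }, buff;

		split(&skb, &buff, 1448);
		printf("first : %u..%u flags=0x%x\n", skb.seq, skb.end_seq, skb.flags);
		printf("second: %u..%u flags=0x%x\n", buff.seq, buff.end_seq, buff.flags);
		return 0;
	}
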
*/  int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, -		 unsigned int mss_now) +		 unsigned int mss_now, gfp_t gfp)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *buff; @@ -992,19 +1082,18 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,  	int nlen;  	u8 flags; -	BUG_ON(len > skb->len); +	if (WARN_ON(len > skb->len)) +		return -EINVAL;  	nsize = skb_headlen(skb) - len;  	if (nsize < 0)  		nsize = 0; -	if (skb_cloned(skb) && -	    skb_is_nonlinear(skb) && -	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(skb, gfp))  		return -ENOMEM;  	/* Get a new skb... force flag on. */ -	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); +	buff = sk_stream_alloc_skb(sk, nsize, gfp);  	if (buff == NULL)  		return -ENOMEM; /* We'll just try again later. */ @@ -1020,9 +1109,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;  	/* PSH and FIN should only be set in the second packet. */ -	flags = TCP_SKB_CB(skb)->flags; -	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); -	TCP_SKB_CB(buff)->flags = flags; +	flags = TCP_SKB_CB(skb)->tcp_flags; +	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); +	TCP_SKB_CB(buff)->tcp_flags = flags;  	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;  	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { @@ -1077,25 +1166,36 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,   */  static void __pskb_trim_head(struct sk_buff *skb, int len)  { +	struct skb_shared_info *shinfo;  	int i, k, eat; +	eat = min_t(int, len, skb_headlen(skb)); +	if (eat) { +		__skb_pull(skb, eat); +		len -= eat; +		if (!len) +			return; +	}  	eat = len;  	k = 0; -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { -		if (skb_shinfo(skb)->frags[i].size <= eat) { -			put_page(skb_shinfo(skb)->frags[i].page); -			eat -= skb_shinfo(skb)->frags[i].size; +	shinfo = skb_shinfo(skb); +	for (i = 0; i < shinfo->nr_frags; i++) { +		int size = skb_frag_size(&shinfo->frags[i]); + +		if (size <= eat) { +			skb_frag_unref(skb, i); +			eat -= size;  		} else { -			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; +			shinfo->frags[k] = shinfo->frags[i];  			if (eat) { -				skb_shinfo(skb)->frags[k].page_offset += eat; -				skb_shinfo(skb)->frags[k].size -= eat; +				shinfo->frags[k].page_offset += eat; +				skb_frag_size_sub(&shinfo->frags[k], eat);  				eat = 0;  			}  			k++;  		}  	} -	skb_shinfo(skb)->nr_frags = k; +	shinfo->nr_frags = k;  	skb_reset_tail_pointer(skb);  	skb->data_len -= len; @@ -1105,14 +1205,10 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)  /* Remove acked data from a packet in the transmit queue. */  int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)  { -	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(skb, GFP_ATOMIC))  		return -ENOMEM; -	/* If len == headlen, we avoid __skb_pull to preserve alignment. */ -	if (unlikely(len < skb_headlen(skb))) -		__skb_pull(skb, len); -	else -		__pskb_trim_head(skb, len - skb_headlen(skb)); +	__pskb_trim_head(skb, len);  	TCP_SKB_CB(skb)->seq += len;  	skb->ip_summed = CHECKSUM_PARTIAL; @@ -1122,20 +1218,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)  	sk_mem_uncharge(sk, len);  	sock_set_flag(sk, SOCK_QUEUE_SHRUNK); -	/* Any change of skb->len requires recalculation of tso -	 * factor and mss. -	 */ +	/* Any change of skb->len requires recalculation of tso factor. 
*/  	if (tcp_skb_pcount(skb) > 1) -		tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); +		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));  	return 0;  } -/* Calculate MSS. Not accounting for SACKs here.  */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options.  */ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)  { -	struct tcp_sock *tp = tcp_sk(sk); -	struct inet_connection_sock *icsk = inet_csk(sk); +	const struct tcp_sock *tp = tcp_sk(sk); +	const struct inet_connection_sock *icsk = inet_csk(sk);  	int mss_now;  	/* Calculate base mss without TCP options: @@ -1143,6 +1237,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)  	 */  	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); +	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ +	if (icsk->icsk_af_ops->net_frag_header_len) { +		const struct dst_entry *dst = __sk_dst_get(sk); + +		if (dst && dst_allfrag(dst)) +			mss_now -= icsk->icsk_af_ops->net_frag_header_len; +	} +  	/* Clamp it (mss_clamp does not include tcp options) */  	if (mss_now > tp->rx_opt.mss_clamp)  		mss_now = tp->rx_opt.mss_clamp; @@ -1153,18 +1255,22 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)  	/* Then reserve room for full set of TCP options and 8 bytes of data */  	if (mss_now < 48)  		mss_now = 48; - -	/* Now subtract TCP options size, not including SACKs */ -	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); -  	return mss_now;  } +/* Calculate MSS. Not accounting for SACKs here.  */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ +	/* Subtract TCP options size, not including SACKs */ +	return __tcp_mtu_to_mss(sk, pmtu) - +	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} +  /* Inverse of above */  int tcp_mss_to_mtu(struct sock *sk, int mss)  { -	struct tcp_sock *tp = tcp_sk(sk); -	struct inet_connection_sock *icsk = inet_csk(sk); +	const struct tcp_sock *tp = tcp_sk(sk); +	const struct inet_connection_sock *icsk = inet_csk(sk);  	int mtu;  	mtu = mss + @@ -1172,6 +1278,13 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)  	      icsk->icsk_ext_hdr_len +  	      icsk->icsk_af_ops->net_header_len; +	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ +	if (icsk->icsk_af_ops->net_frag_header_len) { +		const struct dst_entry *dst = __sk_dst_get(sk); + +		if (dst && dst_allfrag(dst)) +			mtu += icsk->icsk_af_ops->net_frag_header_len; +	}  	return mtu;  } @@ -1238,10 +1351,10 @@ EXPORT_SYMBOL(tcp_sync_mss);   */  unsigned int tcp_current_mss(struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); -	struct dst_entry *dst = __sk_dst_get(sk); +	const struct tcp_sock *tp = tcp_sk(sk); +	const struct dst_entry *dst = __sk_dst_get(sk);  	u32 mss_now; -	unsigned header_len; +	unsigned int header_len;  	struct tcp_out_options opts;  	struct tcp_md5sig_key *md5; @@ -1267,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)  	return mss_now;  } -/* Congestion window validation. (RFC2861) */ -static void tcp_cwnd_validate(struct sock *sk) +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +static void tcp_cwnd_application_limited(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); + +	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && +	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { +		/* Limited by application or receiver window. 
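
__tcp_mtu_to_mss() and tcp_mss_to_mtu() above are straightforward header arithmetic: strip the network and TCP headers (plus any extension headers) from the path MTU, clamp, and never go below 48 bytes so a full option set plus 8 data bytes still fits. Worked numbers for plain IPv4 with a 20-byte network header assumed:

	#include <stdio.h>

	static int mtu_to_mss(int pmtu, int net_hdr, int tcp_opts)
	{
		int mss = pmtu - net_hdr - 20;	/* 20 = sizeof(struct tcphdr) */

		if (mss < 48)			/* keep room for options + 8 data bytes */
			mss = 48;
		return mss - tcp_opts;		/* minus the non-SACK TCP options */
	}

	int main(void)
	{
		/* 1500-byte Ethernet MTU, timestamps enabled (12 option bytes). */
		printf("mss = %d\n", mtu_to_mss(1500, 20, 12));	/* 1448 */
		/* Inverse direction: mss + options + headers gives the MTU back. */
		printf("mtu = %d\n", 1448 + 12 + 20 + 20);	/* 1500 */
		return 0;
	}
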
*/ +		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); +		u32 win_used = max(tp->snd_cwnd_used, init_win); +		if (win_used < tp->snd_cwnd) { +			tp->snd_ssthresh = tcp_current_ssthresh(sk); +			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; +		} +		tp->snd_cwnd_used = 0; +	} +	tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)  {  	struct tcp_sock *tp = tcp_sk(sk); -	if (tp->packets_out >= tp->snd_cwnd) { +	/* Track the maximum number of outstanding packets in each +	 * window, and remember whether we were cwnd-limited then. +	 */ +	if (!before(tp->snd_una, tp->max_packets_seq) || +	    tp->packets_out > tp->max_packets_out) { +		tp->max_packets_out = tp->packets_out; +		tp->max_packets_seq = tp->snd_nxt; +		tp->is_cwnd_limited = is_cwnd_limited; +	} + +	if (tcp_is_cwnd_limited(sk)) {  		/* Network is feed fully. */  		tp->snd_cwnd_used = 0;  		tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1287,48 +1431,85 @@ static void tcp_cwnd_validate(struct sock *sk)  	}  } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ +	return after(tp->snd_sml, tp->snd_una) && +		!after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + *  skb_pcount = skb->len / mss_now   */ -static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, -					unsigned int mss_now, unsigned int cwnd) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, +				const struct sk_buff *skb)  { -	struct tcp_sock *tp = tcp_sk(sk); -	u32 needed, window, cwnd_len; +	if (skb->len < tcp_skb_pcount(skb) * mss_now) +		tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + *    With Minshall's modification: all sent small packets are ACKed. 
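
The reworked tcp_nagle_check()/tcp_minshall_check() above reduce to: a sub-MSS tail may go out immediately unless TCP_CORK is set, or Nagle is enabled and a previously sent small segment is still unacknowledged. A compact decision-table sketch (the last argument stands in for Minshall's snd_sml/snd_una test):

	#include <stdbool.h>
	#include <stdio.h>

	#define NAGLE_CORK 2

	static bool nagle_blocks(bool partial, int nonagle, bool packets_out,
				 bool small_unacked)
	{
		return partial &&
		       ((nonagle & NAGLE_CORK) ||
			(!nonagle && packets_out && small_unacked));
	}

	int main(void)
	{
		printf("full-sized segment           -> %s\n",
		       nagle_blocks(false, 0, true, true) ? "hold" : "send");
		printf("partial, small seg in flight -> %s\n",
		       nagle_blocks(true, 0, true, true) ? "hold" : "send");
		printf("partial, Nagle disabled      -> %s\n",
		       nagle_blocks(true, 1, true, true) ? "hold" : "send");
		return 0;
	}
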
+ */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, +			    int nonagle) +{ +	return partial && +		((nonagle & TCP_NAGLE_CORK) || +		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, +					const struct sk_buff *skb, +					unsigned int mss_now, +					unsigned int max_segs, +					int nonagle) +{ +	const struct tcp_sock *tp = tcp_sk(sk); +	u32 partial, needed, window, max_len;  	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; -	cwnd_len = mss_now * cwnd; +	max_len = mss_now * max_segs; -	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) -		return cwnd_len; +	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) +		return max_len;  	needed = min(skb->len, window); -	if (cwnd_len <= needed) -		return cwnd_len; +	if (max_len <= needed) +		return max_len; + +	partial = needed % mss_now; +	/* If last segment is not a full MSS, check if Nagle rules allow us +	 * to include this last segment in this skb. +	 * Otherwise, we'll split the skb at last MSS boundary +	 */ +	if (tcp_nagle_check(partial != 0, tp, nonagle)) +		return needed - partial; -	return needed - needed % mss_now; +	return needed;  }  /* Can at least one segment of SKB be sent right now, according to the   * congestion window rules?  If so, return how many segments are allowed.   */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, -					 struct sk_buff *skb) +static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, +					 const struct sk_buff *skb)  {  	u32 in_flight, cwnd;  	/* Don't be strict about the congestion window for the final FIN.  */ -	if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) +	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && +	    tcp_skb_pcount(skb) == 1)  		return 1;  	in_flight = tcp_packets_in_flight(tp); @@ -1339,11 +1520,11 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,  	return 0;  } -/* Intialize TSO state of a skb. +/* Initialize TSO state of a skb.   * This must be invoked the first time we consider transmitting   * SKB onto the wire.   */ -static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, +static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,  			     unsigned int mss_now)  {  	int tso_segs = tcp_skb_pcount(skb); @@ -1355,34 +1536,12 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,  	return tso_segs;  } -/* Minshall's variant of the Nagle send check. */ -static inline int tcp_minshall_check(const struct tcp_sock *tp) -{ -	return after(tp->snd_sml, tp->snd_una) && -		!after(tp->snd_sml, tp->snd_nxt); -} -/* Return 0, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - *    With Minshall's modification: all sent small packets are ACKed. - */ -static inline int tcp_nagle_check(const struct tcp_sock *tp, -				  const struct sk_buff *skb, -				  unsigned mss_now, int nonagle) -{ -	return skb->len < mss_now && -		((nonagle & TCP_NAGLE_CORK) || -		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} - -/* Return non-zero if the Nagle test allows this packet to be +/* Return true if the Nagle test allows this packet to be   * sent now.   
*/ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, -				 unsigned int cur_mss, int nonagle) +static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, +				  unsigned int cur_mss, int nonagle)  {  	/* Nagle rule does not apply to frames, which sit in the middle of the  	 * write_queue (they have no chances to get new data). @@ -1391,24 +1550,22 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,  	 * argument based upon the location of SKB in the send queue.  	 */  	if (nonagle & TCP_NAGLE_PUSH) -		return 1; +		return true; -	/* Don't use the nagle rule for urgent data (or for the final FIN). -	 * Nagle can be ignored during F-RTO too (see RFC4138). -	 */ -	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || -	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)) -		return 1; +	/* Don't use the nagle rule for urgent data (or for the final FIN). */ +	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) +		return true; -	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) -		return 1; +	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) +		return true; -	return 0; +	return false;  }  /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, -				   unsigned int cur_mss) +static bool tcp_snd_wnd_test(const struct tcp_sock *tp, +			     const struct sk_buff *skb, +			     unsigned int cur_mss)  {  	u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1422,10 +1579,10 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,   * should be put on the wire right now.  If so, it returns the number of   * packets allowed by the congestion window.   */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, +static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,  				 unsigned int cur_mss, int nonagle)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	unsigned int cwnd_quota;  	tcp_init_tso_segs(sk, skb, cur_mss); @@ -1441,9 +1598,9 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,  }  /* Test if sending is allowed right now. */ -int tcp_may_send_now(struct sock *sk) +bool tcp_may_send_now(struct sock *sk)  { -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb = tcp_send_head(sk);  	return skb && @@ -1468,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,  	/* All of a TSO frame must be composed of paged data.  */  	if (skb->len != skb->data_len) -		return tcp_fragment(sk, skb, len, mss_now); +		return tcp_fragment(sk, skb, len, mss_now, gfp);  	buff = sk_stream_alloc_skb(sk, 0, gfp);  	if (unlikely(buff == NULL)) @@ -1485,9 +1642,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;  	/* PSH and FIN should only be set in the second packet. */ -	flags = TCP_SKB_CB(skb)->flags; -	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); -	TCP_SKB_CB(buff)->flags = flags; +	flags = TCP_SKB_CB(skb)->tcp_flags; +	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); +	TCP_SKB_CB(buff)->tcp_flags = flags;  	/* This packet was never sent out yet, so no SACK bits. 
*/  	TCP_SKB_CB(buff)->sacked = 0; @@ -1511,13 +1668,15 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,   *   * This algorithm is from John Heffner.   */ -static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) +static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, +				 bool *is_cwnd_limited)  {  	struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_connection_sock *icsk = inet_csk(sk);  	u32 send_win, cong_win, limit, in_flight; +	int win_divisor; -	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) +	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)  		goto send_now;  	if (icsk->icsk_ca_state != TCP_CA_Open) @@ -1540,20 +1699,22 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)  	limit = min(send_win, cong_win);  	/* If a full-sized TSO skb can be sent, do it. */ -	if (limit >= sk->sk_gso_max_size) +	if (limit >= min_t(unsigned int, sk->sk_gso_max_size, +			   tp->xmit_size_goal_segs * tp->mss_cache))  		goto send_now;  	/* Middle in queue won't get any more data, full sendable already? */  	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))  		goto send_now; -	if (sysctl_tcp_tso_win_divisor) { +	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); +	if (win_divisor) {  		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);  		/* If at least some fraction of a window is available,  		 * just use it.  		 */ -		chunk /= sysctl_tcp_tso_win_divisor; +		chunk /= win_divisor;  		if (limit >= chunk)  			goto send_now;  	} else { @@ -1562,18 +1723,24 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)  		 * frame, so if we have space for more than 3 frames  		 * then send now.  		 */ -		if (limit > tcp_max_burst(tp) * tp->mss_cache) +		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)  			goto send_now;  	} -	/* Ok, it looks like it is advisable to defer.  */ -	tp->tso_deferred = 1 | (jiffies << 1); +	/* Ok, it looks like it is advisable to defer. +	 * Do not rearm the timer if already set to not break TCP ACK clocking. +	 */ +	if (!tp->tso_deferred) +		tp->tso_deferred = 1 | (jiffies << 1); + +	if (cong_win < send_win && cong_win < skb->len) +		*is_cwnd_limited = true; -	return 1; +	return true;  send_now:  	tp->tso_deferred = 0; -	return 0; +	return false;  }  /* Create a new MTU probe if we are ready. @@ -1643,7 +1810,7 @@ static int tcp_mtu_probe(struct sock *sk)  	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;  	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; -	TCP_SKB_CB(nskb)->flags = TCPHDR_ACK; +	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;  	TCP_SKB_CB(nskb)->sacked = 0;  	nskb->csum = 0;  	nskb->ip_summed = skb->ip_summed; @@ -1663,11 +1830,11 @@ static int tcp_mtu_probe(struct sock *sk)  		if (skb->len <= copy) {  			/* We've eaten all the data from this skb.  			 * Throw it away. */ -			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; +			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;  			tcp_unlink_write_queue(skb, sk);  			sk_wmem_free_skb(sk, skb);  		} else { -			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & +			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &  						   ~(TCPHDR_FIN|TCPHDR_PSH);  			if (!skb_shinfo(skb)->nr_frags) {  				skb_pull(skb, copy); @@ -1715,17 +1882,21 @@ static int tcp_mtu_probe(struct sock *sk)   * snd_up-64k-mss .. snd_up cannot be large. However, taking into   * account rare use of URG, this is not a big flaw.   
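
tcp_tso_should_defer() above holds back a frame when only a small fraction of the window could be sent; with the tso_win_divisor sysctl set, the threshold is one divisor'th of the smaller of the send window and the congestion window. Example numbers, assuming the usual divisor of 3:

	#include <stdio.h>

	int main(void)
	{
		unsigned int mss = 1448, snd_cwnd = 40, snd_wnd = 262144;
		unsigned int send_win = 8000;		/* receiver window headroom */
		unsigned int cong_win = snd_cwnd * mss;	/* 57920 */
		unsigned int limit = send_win < cong_win ? send_win : cong_win;
		unsigned int win_divisor = 3;		/* assumed sysctl default */
		unsigned int chunk = (snd_wnd < cong_win ? snd_wnd : cong_win) / win_divisor;

		printf("limit=%u chunk=%u -> %s\n", limit, chunk,
		       limit >= chunk ? "send now" : "defer, more data may arrive");
		return 0;
	}
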
* - * Returns 1, if no segments are in flight and we have queued segments, but - * cannot send anything now because of SWS or another problem. + * Send at most one packet when push_one > 0. Temporarily ignore + * cwnd limit to force at most one packet out when push_one == 2. + + * Returns true, if no segments are in flight and we have queued segments, + * but cannot send anything now because of SWS or another problem.   */ -static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, -			  int push_one, gfp_t gfp) +static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +			   int push_one, gfp_t gfp)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb;  	unsigned int tso_segs, sent_pkts;  	int cwnd_quota;  	int result; +	bool is_cwnd_limited = false;  	sent_pkts = 0; @@ -1733,7 +1904,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		/* Do MTU probing. */  		result = tcp_mtu_probe(sk);  		if (!result) { -			return 0; +			return false;  		} else if (result > 0) {  			sent_pkts = 1;  		} @@ -1745,9 +1916,18 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);  		BUG_ON(!tso_segs); +		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) +			goto repair; /* Skip network transmission */ +  		cwnd_quota = tcp_cwnd_test(tp, skb); -		if (!cwnd_quota) -			break; +		if (!cwnd_quota) { +			is_cwnd_limited = true; +			if (push_one == 2) +				/* Force out a loss probe pkt. */ +				cwnd_quota = 1; +			else +				break; +		}  		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))  			break; @@ -1758,14 +1938,42 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  						      nonagle : TCP_NAGLE_PUSH))))  				break;  		} else { -			if (!push_one && tcp_tso_should_defer(sk, skb)) +			if (!push_one && +			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) +				break; +		} + +		/* TCP Small Queues : +		 * Control number of packets in qdisc/devices to two packets / or ~1 ms. +		 * This allows for : +		 *  - better RTT estimation and ACK scheduling +		 *  - faster recovery +		 *  - high rates +		 * Alas, some drivers / subsystems require a fair amount +		 * of queued bytes to ensure line rate. +		 * One example is wifi aggregation (802.11 AMPDU) +		 */ +		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, +			      sk->sk_pacing_rate >> 10); + +		if (atomic_read(&sk->sk_wmem_alloc) > limit) { +			set_bit(TSQ_THROTTLED, &tp->tsq_flags); +			/* It is possible TX completion already happened +			 * before we set TSQ_THROTTLED, so we must +			 * test again the condition. +			 */ +			smp_mb__after_atomic(); +			if (atomic_read(&sk->sk_wmem_alloc) > limit)  				break;  		}  		limit = mss_now;  		if (tso_segs > 1 && !tcp_urg_mode(tp))  			limit = tcp_mss_split_point(sk, skb, mss_now, -						    cwnd_quota); +						    min_t(unsigned int, +							  cwnd_quota, +							  sk->sk_gso_max_segs), +						    nonagle);  		if (skb->len > limit &&  		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) @@ -1776,23 +1984,165 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,  		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))  			break; +repair:  		/* Advance the send_head.  This one is sent out.  		 * This call will increment packets_out.  		 
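
The TCP Small Queues check added to tcp_write_xmit() caps what one flow may leave sitting in qdisc and driver queues: the larger of tcp_limit_output_bytes and roughly one millisecond of data at the current pacing rate (pacing_rate >> 10 is about rate/1024, i.e. about 1 ms worth). Worked numbers:

	#include <stdio.h>

	static unsigned long tsq_limit(unsigned long pacing_rate_Bps)
	{
		unsigned long sysctl_limit = 131072;		/* default: two 64KB TSO frames */
		unsigned long one_ms = pacing_rate_Bps >> 10;	/* ~rate * 1 ms */

		return one_ms > sysctl_limit ? one_ms : sysctl_limit;
	}

	int main(void)
	{
		printf("100 Mbit/s flow: at most %lu bytes queued\n", tsq_limit(12500000UL));
		printf(" 10 Gbit/s flow: at most %lu bytes queued\n", tsq_limit(1250000000UL));
		return 0;
	}

Slow flows sit at the 131072-byte floor; fast flows scale up so the limit never starves a 10 Gbit/s sender.
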
*/  		tcp_event_new_data_sent(sk, skb);  		tcp_minshall_update(tp, mss_now, skb); -		sent_pkts++; +		sent_pkts += tcp_skb_pcount(skb);  		if (push_one)  			break;  	}  	if (likely(sent_pkts)) { -		tcp_cwnd_validate(sk); -		return 0; +		if (tcp_in_cwnd_reduction(sk)) +			tp->prr_out += sent_pkts; + +		/* Send one loss probe per tail loss episode. */ +		if (push_one != 2) +			tcp_schedule_loss_probe(sk); +		tcp_cwnd_validate(sk, is_cwnd_limited); +		return false;  	} -	return !tp->packets_out && tcp_send_head(sk); +	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); +} + +bool tcp_schedule_loss_probe(struct sock *sk) +{ +	struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk); +	u32 timeout, tlp_time_stamp, rto_time_stamp; +	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + +	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) +		return false; +	/* No consecutive loss probes. */ +	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { +		tcp_rearm_rto(sk); +		return false; +	} +	/* Don't do any loss probe on a Fast Open connection before 3WHS +	 * finishes. +	 */ +	if (sk->sk_state == TCP_SYN_RECV) +		return false; + +	/* TLP is only scheduled when next timer event is RTO. */ +	if (icsk->icsk_pending != ICSK_TIME_RETRANS) +		return false; + +	/* Schedule a loss probe in 2*RTT for SACK capable connections +	 * in Open state, that are either limited by cwnd or application. +	 */ +	if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out || +	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) +		return false; + +	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && +	     tcp_send_head(sk)) +		return false; + +	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account +	 * for delayed ack when there's one outstanding packet. +	 */ +	timeout = rtt << 1; +	if (tp->packets_out == 1) +		timeout = max_t(u32, timeout, +				(rtt + (rtt >> 1) + TCP_DELACK_MAX)); +	timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + +	/* If RTO is shorter, just schedule TLP in its place. */ +	tlp_time_stamp = tcp_time_stamp + timeout; +	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; +	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { +		s32 delta = rto_time_stamp - tcp_time_stamp; +		if (delta > 0) +			timeout = delta; +	} + +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, +				  TCP_RTO_MAX); +	return true; +} + +/* Thanks to skb fast clones, we can detect if a prior transmit of + * a packet is still in a qdisc or driver queue. + * In this case, there is very little point doing a retransmit ! + * Note: This is called from BH context only. + */ +static bool skb_still_in_host_queue(const struct sock *sk, +				    const struct sk_buff *skb) +{ +	const struct sk_buff *fclone = skb + 1; + +	if (unlikely(skb->fclone == SKB_FCLONE_ORIG && +		     fclone->fclone == SKB_FCLONE_CLONE)) { +		NET_INC_STATS_BH(sock_net(sk), +				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); +		return true; +	} +	return false; +} + +/* When probe timeout (PTO) fires, send a new segment if one exists, else + * retransmit the last segment. + */ +void tcp_send_loss_probe(struct sock *sk) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct sk_buff *skb; +	int pcount; +	int mss = tcp_current_mss(sk); +	int err = -1; + +	if (tcp_send_head(sk) != NULL) { +		err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); +		goto rearm_timer; +	} + +	/* At most one outstanding TLP retransmission. 
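
tcp_schedule_loss_probe() above arms the probe timer at roughly 2×RTT, stretched to 1.5×RTT plus the delayed-ACK maximum when only one packet is in flight, floored at 10 ms, and never allowed to fire after the pending RTO. Worked numbers, with TCP_DELACK_MAX assumed to be 200 ms:

	#include <stdio.h>

	static unsigned int tlp_timeout_ms(unsigned int rtt_ms, unsigned int packets_out,
					   unsigned int rto_left_ms)
	{
		unsigned int timeout = rtt_ms * 2;
		unsigned int delack_max = 200;		/* TCP_DELACK_MAX, assumed */

		if (packets_out == 1 && timeout < rtt_ms + rtt_ms / 2 + delack_max)
			timeout = rtt_ms + rtt_ms / 2 + delack_max;
		if (timeout < 10)
			timeout = 10;
		if (timeout > rto_left_ms)		/* never outlive the RTO */
			timeout = rto_left_ms;
		return timeout;
	}

	int main(void)
	{
		printf("rtt=40ms, several packets out: %u ms\n", tlp_timeout_ms(40, 4, 300));
		printf("rtt=40ms, single packet out  : %u ms\n", tlp_timeout_ms(40, 1, 300));
		printf("rtt=1ms,  several packets out: %u ms\n", tlp_timeout_ms(1, 4, 300));
		return 0;
	}
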
*/ +	if (tp->tlp_high_seq) +		goto rearm_timer; + +	/* Retransmit last segment. */ +	skb = tcp_write_queue_tail(sk); +	if (WARN_ON(!skb)) +		goto rearm_timer; + +	if (skb_still_in_host_queue(sk, skb)) +		goto rearm_timer; + +	pcount = tcp_skb_pcount(skb); +	if (WARN_ON(!pcount)) +		goto rearm_timer; + +	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { +		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, +					  GFP_ATOMIC))) +			goto rearm_timer; +		skb = tcp_write_queue_tail(sk); +	} + +	if (WARN_ON(!skb || !tcp_skb_pcount(skb))) +		goto rearm_timer; + +	err = __tcp_retransmit_skb(sk, skb); + +	/* Record snd_nxt for loss detection. */ +	if (likely(!err)) +		tp->tlp_high_seq = tp->snd_nxt; + +rearm_timer: +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, +				  inet_csk(sk)->icsk_rto, +				  TCP_RTO_MAX); + +	if (likely(!err)) +		NET_INC_STATS_BH(sock_net(sk), +				 LINUX_MIB_TCPLOSSPROBES);  }  /* Push out any pending frames which were held back due to @@ -1809,7 +2159,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,  	if (unlikely(sk->sk_state == TCP_CLOSE))  		return; -	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) +	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, +			   sk_gfp_atomic(sk, GFP_ATOMIC)))  		tcp_check_probe_timer(sk);  } @@ -1889,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk)  	 */  	int mss = icsk->icsk_ack.rcv_mss;  	int free_space = tcp_space(sk); -	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); +	int allowed_space = tcp_full_space(sk); +	int full_space = min_t(int, tp->window_clamp, allowed_space);  	int window;  	if (mss > full_space) @@ -1898,11 +2250,23 @@ u32 __tcp_select_window(struct sock *sk)  	if (free_space < (full_space >> 1)) {  		icsk->icsk_ack.quick = 0; -		if (tcp_memory_pressure) +		if (sk_under_memory_pressure(sk))  			tp->rcv_ssthresh = min(tp->rcv_ssthresh,  					       4U * tp->advmss); -		if (free_space < mss) +		/* free_space might become our new window, make sure we don't +		 * increase it due to wscale. +		 */ +		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + +		/* if free space is less than mss estimate, or is below 1/16th +		 * of the maximum allowed, try to move to zero-window, else +		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and +		 * new incoming data is dropped due to memory limits. +		 * With large window, mss test triggers way too late in order +		 * to announce zero window in time before rmem limit kicks in. +		 */ +		if (free_space < (allowed_space >> 4) || free_space < mss)  			return 0;  	} @@ -1971,7 +2335,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;  	/* Merge over control information. This moves PSH/FIN etc. over */ -	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags; +	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;  	/* All done, get rid of second SKB and account for it so  	 * packet counting does not break. @@ -1989,22 +2353,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)  }  /* Check if coalescing SKBs is legal. 
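
__tcp_select_window() now rounds the free receive space down to the window-scale granularity and switches to a zero window once that space drops below the MSS estimate or below 1/16th of the maximum buffer, so the zero-window advertisement goes out before memory limits would start dropping data. A sketch of the threshold math, with the buffer size picked only for illustration:

	#include <stdio.h>

	int main(void)
	{
		int mss = 1448, rcv_wscale = 7;
		int allowed_space = 4 * 1024 * 1024;	/* tcp_full_space(), assumed 4 MB */
		int free_space = 200 * 1024;		/* what is actually left right now */

		/* Never advertise finer than the scale granularity. */
		free_space &= ~((1 << rcv_wscale) - 1);

		if (free_space < (allowed_space >> 4) || free_space < mss)
			printf("advertise a zero window\n");
		else
			printf("advertise %d bytes\n", free_space);
		return 0;
	}

Here 200 KB of free space is below the 256 KB (1/16th) threshold, so the sender is told to stop well before the receive buffer is actually exhausted.
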
*/ -static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) +static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)  {  	if (tcp_skb_pcount(skb) > 1) -		return 0; +		return false;  	/* TODO: SACK collapsing could be used to remove this condition */  	if (skb_shinfo(skb)->nr_frags != 0) -		return 0; +		return false;  	if (skb_cloned(skb)) -		return 0; +		return false;  	if (skb == tcp_send_head(sk)) -		return 0; +		return false;  	/* Some heurestics for collapsing over SACK'd could be invented */  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) -		return 0; +		return false; -	return 1; +	return true;  }  /* Collapse packets in the retransmit queue to make to create @@ -2015,11 +2379,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *skb = to, *tmp; -	int first = 1; +	bool first = true;  	if (!sysctl_tcp_retrans_collapse)  		return; -	if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN) +	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)  		return;  	tcp_for_write_queue_from_safe(skb, tmp, sk) { @@ -2029,7 +2393,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,  		space -= skb->len;  		if (first) { -			first = 0; +			first = false;  			continue;  		} @@ -2038,7 +2402,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,  		/* Punt if not enough space exists in the first SKB for  		 * the data in the second  		 */ -		if (skb->len > skb_tailroom(to)) +		if (skb->len > skb_availroom(to))  			break;  		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) @@ -2052,7 +2416,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,   * state updates are done by the caller.  Returns non-zero if an   * error occurred which prevented the send.   */ -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct inet_connection_sock *icsk = inet_csk(sk); @@ -2071,6 +2435,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))  		return -EAGAIN; +	if (skb_still_in_host_queue(sk, skb)) +		return -EBUSY; +  	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {  		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))  			BUG(); @@ -2093,12 +2460,14 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  		return -EAGAIN;  	if (skb->len > cur_mss) { -		if (tcp_fragment(sk, skb, cur_mss, cur_mss)) +		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))  			return -ENOMEM; /* We'll try again later. */  	} else {  		int oldpcount = tcp_skb_pcount(skb);  		if (unlikely(oldpcount > 1)) { +			if (skb_unclone(skb, GFP_ATOMIC)) +				return -ENOMEM;  			tcp_init_tso_segs(sk, skb, cur_mss);  			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));  		} @@ -2106,38 +2475,45 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  	tcp_retrans_try_collapse(sk, skb, cur_mss); -	/* Some Solaris stacks overoptimize and ignore the FIN on a -	 * retransmit when old data is attached.  So strip it off -	 * since it is cheap to do so and saves bytes on the network. 
-	 */ -	if (skb->len > 0 && -	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && -	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { -		if (!pskb_trim(skb, 0)) { -			/* Reuse, even though it does some unnecessary work */ -			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, -					     TCP_SKB_CB(skb)->flags); -			skb->ip_summed = CHECKSUM_NONE; -		} -	} -  	/* Make a copy, if the first transmission SKB clone we made  	 * is still in somebody's hands, else make a clone.  	 */  	TCP_SKB_CB(skb)->when = tcp_time_stamp; -	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); +	/* make sure skb->data is aligned on arches that require it +	 * and check if ack-trimming & collapsing extended the headroom +	 * beyond what csum_start can cover. +	 */ +	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || +		     skb_headroom(skb) >= 0xFFFF)) { +		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, +						   GFP_ATOMIC); +		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : +			     -ENOBUFS; +	} else { +		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); +	} -	if (err == 0) { +	if (likely(!err)) { +		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;  		/* Update global TCP statistics. */  		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); - +		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) +			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);  		tp->total_retrans++; +	} +	return err; +} + +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	int err = __tcp_retransmit_skb(sk, skb); +	if (err == 0) {  #if FASTRETRANS_DEBUG > 0  		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { -			if (net_ratelimit()) -				printk(KERN_DEBUG "retrans_out leaked.\n"); +			net_dbg_ratelimited("retrans_out leaked\n");  		}  #endif  		if (!tp->retrans_out) @@ -2149,31 +2525,35 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)  		if (!tp->retrans_stamp)  			tp->retrans_stamp = TCP_SKB_CB(skb)->when; -		tp->undo_retrans++; -  		/* snd_nxt is stored to detect loss of retransmitted segment,  		 * see tcp_input.c tcp_sacktag_write_queue().  		 */  		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; +	} else if (err != -EBUSY) { +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);  	} + +	if (tp->undo_retrans < 0) +		tp->undo_retrans = 0; +	tp->undo_retrans += tcp_skb_pcount(skb);  	return err;  }  /* Check if we forward retransmits are possible in the current   * window/congestion state.   */ -static int tcp_can_forward_retransmit(struct sock *sk) +static bool tcp_can_forward_retransmit(struct sock *sk)  {  	const struct inet_connection_sock *icsk = inet_csk(sk); -	struct tcp_sock *tp = tcp_sk(sk); +	const struct tcp_sock *tp = tcp_sk(sk);  	/* Forward retransmissions are possible only during Recovery. */  	if (icsk->icsk_ca_state != TCP_CA_Recovery) -		return 0; +		return false;  	/* No forward retransmissions in Reno are possible. */  	if (tcp_is_reno(tp)) -		return 0; +		return false;  	/* Yeah, we have to make difficult choice between forward transmission  	 * and retransmission... Both ways have their merits... 
@@ -2184,9 +2564,9 @@ static int tcp_can_forward_retransmit(struct sock *sk)  	 */  	if (tcp_may_send_now(sk)) -		return 0; +		return false; -	return 1; +	return true;  }  /* This gets called after a retransmit timeout, and the initially @@ -2278,8 +2658,12 @@ begin_fwd:  		if (tcp_retransmit_skb(sk, skb))  			return; +  		NET_INC_STATS_BH(sock_net(sk), mib_idx); +		if (tcp_in_cwnd_reduction(sk)) +			tp->prr_out += tcp_skb_pcount(skb); +  		if (skb == tcp_write_queue_head(sk))  			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,  						  inet_csk(sk)->icsk_rto, @@ -2303,7 +2687,7 @@ void tcp_send_fin(struct sock *sk)  	mss_now = tcp_current_mss(sk);  	if (tcp_send_head(sk) != NULL) { -		TCP_SKB_CB(skb)->flags |= TCPHDR_FIN; +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;  		TCP_SKB_CB(skb)->end_seq++;  		tp->write_seq++;  	} else { @@ -2365,11 +2749,11 @@ int tcp_send_synack(struct sock *sk)  	struct sk_buff *skb;  	skb = tcp_write_queue_head(sk); -	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) { -		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); +	if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { +		pr_debug("%s: wrong queue state\n", __func__);  		return -EFAULT;  	} -	if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) { +	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {  		if (skb_cloned(skb)) {  			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);  			if (nskb == NULL) @@ -2383,66 +2767,50 @@ int tcp_send_synack(struct sock *sk)  			skb = nskb;  		} -		TCP_SKB_CB(skb)->flags |= TCPHDR_ACK; +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;  		TCP_ECN_send_synack(tcp_sk(sk), skb);  	}  	TCP_SKB_CB(skb)->when = tcp_time_stamp;  	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);  } -/* Prepare a SYN-ACK. */ +/** + * tcp_make_synack - Prepare a SYN-ACK. + * sk: listener socket + * dst: dst entry attached to the SYNACK + * req: request_sock pointer + * + * Allocate one skb and build a SYNACK packet. + * @dst is consumed : Caller should not use it again. + */  struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  				struct request_sock *req, -				struct request_values *rvp) +				struct tcp_fastopen_cookie *foc)  {  	struct tcp_out_options opts; -	struct tcp_extend_values *xvp = tcp_xv(rvp);  	struct inet_request_sock *ireq = inet_rsk(req);  	struct tcp_sock *tp = tcp_sk(sk); -	const struct tcp_cookie_values *cvp = tp->cookie_values;  	struct tcphdr *th;  	struct sk_buff *skb;  	struct tcp_md5sig_key *md5;  	int tcp_header_size;  	int mss; -	int s_data_desired = 0; -	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) -		s_data_desired = cvp->s_data_desired; -	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); -	if (skb == NULL) +	skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC); +	if (unlikely(!skb)) { +		dst_release(dst);  		return NULL; - +	}  	/* Reserve space for headers. */  	skb_reserve(skb, MAX_TCP_HEADER); -	skb_dst_set(skb, dst_clone(dst)); +	skb_dst_set(skb, dst); +	security_skb_owned_by(skb, sk); -	mss = dst_metric(dst, RTAX_ADVMSS); +	mss = dst_metric_advmss(dst);  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)  		mss = tp->rx_opt.user_mss; -	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ -		__u8 rcv_wscale; -		/* Set this up on the first call only */ -		req->window_clamp = tp->window_clamp ? 
: dst_metric(dst, RTAX_WINDOW); - -		/* limit the window selection if the user enforce a smaller rx buffer */ -		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && -		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) -			req->window_clamp = tcp_full_space(sk); - -		/* tcp_full_space because it is guaranteed to be the first packet */ -		tcp_select_initial_window(tcp_full_space(sk), -			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), -			&req->rcv_wnd, -			&req->window_clamp, -			ireq->wscale_ok, -			&rcv_wscale, -			dst_metric(dst, RTAX_INITRWND)); -		ireq->rcv_wscale = rcv_wscale; -	} -  	memset(&opts, 0, sizeof(opts));  #ifdef CONFIG_SYN_COOKIES  	if (unlikely(req->cookie_ts)) @@ -2450,9 +2818,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	else  #endif  	TCP_SKB_CB(skb)->when = tcp_time_stamp; -	tcp_header_size = tcp_synack_options(sk, req, mss, -					     skb, &opts, &md5, xvp) -			+ sizeof(*th); +	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, +					     foc) + sizeof(*th);  	skb_push(skb, tcp_header_size);  	skb_reset_transport_header(skb); @@ -2462,56 +2829,23 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,  	th->syn = 1;  	th->ack = 1;  	TCP_ECN_make_synack(req, th); -	th->source = ireq->loc_port; -	th->dest = ireq->rmt_port; +	th->source = htons(ireq->ir_num); +	th->dest = ireq->ir_rmt_port;  	/* Setting of flags are superfluous here for callers (and ECE is  	 * not even correctly set)  	 */  	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,  			     TCPHDR_SYN | TCPHDR_ACK); -	if (OPTION_COOKIE_EXTENSION & opts.options) { -		if (s_data_desired) { -			u8 *buf = skb_put(skb, s_data_desired); - -			/* copy data directly from the listening socket. */ -			memcpy(buf, cvp->s_data_payload, s_data_desired); -			TCP_SKB_CB(skb)->end_seq += s_data_desired; -		} - -		if (opts.hash_size > 0) { -			__u32 workspace[SHA_WORKSPACE_WORDS]; -			u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; -			u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; - -			/* Secret recipe depends on the Timestamp, (future) -			 * Sequence and Acknowledgment Numbers, Initiator -			 * Cookie, and others handled by IP variant caller. -			 */ -			*tail-- ^= opts.tsval; -			*tail-- ^= tcp_rsk(req)->rcv_isn + 1; -			*tail-- ^= TCP_SKB_CB(skb)->seq + 1; - -			/* recommended */ -			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); -			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ - -			sha_transform((__u32 *)&xvp->cookie_bakery[0], -				      (char *)mess, -				      &workspace[0]); -			opts.hash_location = -				(__u8 *)&xvp->cookie_bakery[0]; -		} -	} -  	th->seq = htonl(TCP_SKB_CB(skb)->seq); -	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); +	/* XXX data is queued and acked as is. No buffer/window check */ +	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);  	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */  	th->window = htons(min(req->rcv_wnd, 65535U));  	tcp_options_write((__be32 *)(th + 1), tp, &opts);  	th->doff = (tcp_header_size >> 2); -	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); +	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);  #ifdef CONFIG_TCP_MD5SIG  	/* Okay, we have all we need - do the md5 hash if needed */ @@ -2528,7 +2862,7 @@ EXPORT_SYMBOL(tcp_make_synack);  /* Do all connect socket setups that can be done AF independent. 
*/  static void tcp_connect_init(struct sock *sk)  { -	struct dst_entry *dst = __sk_dst_get(sk); +	const struct dst_entry *dst = __sk_dst_get(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	__u8 rcv_wscale; @@ -2552,7 +2886,7 @@ static void tcp_connect_init(struct sock *sk)  	if (!tp->window_clamp)  		tp->window_clamp = dst_metric(dst, RTAX_WINDOW); -	tp->advmss = dst_metric(dst, RTAX_ADVMSS); +	tp->advmss = dst_metric_advmss(dst);  	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)  		tp->advmss = tp->rx_opt.user_mss; @@ -2581,15 +2915,134 @@ static void tcp_connect_init(struct sock *sk)  	tp->snd_una = tp->write_seq;  	tp->snd_sml = tp->write_seq;  	tp->snd_up = tp->write_seq; -	tp->rcv_nxt = 0; -	tp->rcv_wup = 0; -	tp->copied_seq = 0; +	tp->snd_nxt = tp->write_seq; + +	if (likely(!tp->repair)) +		tp->rcv_nxt = 0; +	else +		tp->rcv_tstamp = tcp_time_stamp; +	tp->rcv_wup = tp->rcv_nxt; +	tp->copied_seq = tp->rcv_nxt;  	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;  	inet_csk(sk)->icsk_retransmits = 0;  	tcp_clear_retrans(tp);  } +static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + +	tcb->end_seq += skb->len; +	skb_header_release(skb); +	__tcp_add_write_queue_tail(sk, skb); +	sk->sk_wmem_queued += skb->truesize; +	sk_mem_charge(sk, skb->truesize); +	tp->write_seq = tcb->end_seq; +	tp->packets_out += tcp_skb_pcount(skb); +} + +/* Build and send a SYN with data and (cached) Fast Open cookie. However, + * queue a data-only packet after the regular SYN, such that regular SYNs + * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges + * only the SYN sequence, the data are retransmitted in the first ACK. + * If cookie is not cached or other error occurs, falls back to send a + * regular SYN with Fast Open cookie request option. + */ +static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) +{ +	struct tcp_sock *tp = tcp_sk(sk); +	struct tcp_fastopen_request *fo = tp->fastopen_req; +	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; +	struct sk_buff *syn_data = NULL, *data; +	unsigned long last_syn_loss = 0; + +	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */ +	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, +			       &syn_loss, &last_syn_loss); +	/* Recurring FO SYN losses: revert to regular handshake temporarily */ +	if (syn_loss > 1 && +	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { +		fo->cookie.len = -1; +		goto fallback; +	} + +	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) +		fo->cookie.len = -1; +	else if (fo->cookie.len <= 0) +		goto fallback; + +	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and +	 * user-MSS. Reserve maximum option space for middleboxes that add +	 * private TCP options. 
The cost is reduced data space in SYN :( +	 */ +	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) +		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; +	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - +		MAX_TCP_OPTION_SPACE; + +	space = min_t(size_t, space, fo->size); + +	/* limit to order-0 allocations */ +	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); + +	syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, +				   sk->sk_allocation); +	if (syn_data == NULL) +		goto fallback; + +	for (i = 0; i < iovlen && syn_data->len < space; ++i) { +		struct iovec *iov = &fo->data->msg_iov[i]; +		unsigned char __user *from = iov->iov_base; +		int len = iov->iov_len; + +		if (syn_data->len + len > space) +			len = space - syn_data->len; +		else if (i + 1 == iovlen) +			/* No more data pending in inet_wait_for_connect() */ +			fo->data = NULL; + +		if (skb_add_data(syn_data, from, len)) +			goto fallback; +	} + +	/* Queue a data-only packet after the regular SYN for retransmission */ +	data = pskb_copy(syn_data, sk->sk_allocation); +	if (data == NULL) +		goto fallback; +	TCP_SKB_CB(data)->seq++; +	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; +	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); +	tcp_connect_queue_skb(sk, data); +	fo->copied = data->len; + +	/* syn_data is about to be sent, we need to take current time stamps +	 * for the packets that are in write queue : SYN packet and DATA +	 */ +	skb_mstamp_get(&syn->skb_mstamp); +	data->skb_mstamp = syn->skb_mstamp; + +	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { +		tp->syn_data = (fo->copied > 0); +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); +		goto done; +	} +	syn_data = NULL; + +fallback: +	/* Send a regular SYN with Fast Open cookie request option */ +	if (fo->cookie.len > 0) +		fo->cookie.len = 0; +	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); +	if (err) +		tp->syn_fastopen = 0; +	kfree_skb(syn_data); +done: +	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */ +	return err; +} +  /* Build a SYN and send it off. */  int tcp_connect(struct sock *sk)  { @@ -2599,6 +3052,11 @@ int tcp_connect(struct sock *sk)  	tcp_connect_init(sk); +	if (unlikely(tp->repair)) { +		tcp_finish_connect(sk, NULL); +		return 0; +	} +  	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);  	if (unlikely(buff == NULL))  		return -ENOBUFS; @@ -2606,19 +3064,14 @@ int tcp_connect(struct sock *sk)  	/* Reserve space for headers. */  	skb_reserve(buff, MAX_TCP_HEADER); -	tp->snd_nxt = tp->write_seq;  	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); +	tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; +	tcp_connect_queue_skb(sk, buff);  	TCP_ECN_send_syn(sk, buff); -	/* Send it off. */ -	TCP_SKB_CB(buff)->when = tcp_time_stamp; -	tp->retrans_stamp = TCP_SKB_CB(buff)->when; -	skb_header_release(buff); -	__tcp_add_write_queue_tail(sk, buff); -	sk->sk_wmem_queued += buff->truesize; -	sk_mem_charge(sk, buff->truesize); -	tp->packets_out += tcp_skb_pcount(buff); -	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); +	/* Send off SYN; include data in Fast Open. */ +	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : +	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);  	if (err == -ECONNREFUSED)  		return err; @@ -2660,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk)  		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements  		 * directly.  		 
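A minimal sketch (not kernel code) of how tcp_send_syn_data() above bounds the payload carried in a Fast Open SYN: the MSS derived from the cached path MTU is reduced by the maximum TCP option space, then capped by the amount of queued data and by what fits in an order-0 linear allocation. The constants below (40 bytes of option space, a 1664-byte head) are assumptions for illustration only.

#include <stdio.h>

static int syn_data_space(int mss_from_pmtu, int pending, int max_head)
{
	int space = mss_from_pmtu - 40;	/* reserve maximum option space */

	if (space > pending)		/* no more than what the caller queued */
		space = pending;
	if (space > max_head)		/* keep the skb head an order-0 allocation */
		space = max_head;
	return space > 0 ? space : 0;
}

int main(void)
{
	/* MSS 1460, 4 KB pending, 1664-byte head -> 1420 bytes of SYN data */
	printf("%d bytes\n", syn_data_space(1460, 4096, 1664));
	return 0;
}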
*/ -		if (tp->srtt) { -			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); +		if (tp->srtt_us) { +			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), +					TCP_DELACK_MIN);  			if (rtt < max_ato)  				max_ato = rtt; @@ -2705,7 +3159,7 @@ void tcp_send_ack(struct sock *sk)  	 * tcp_transmit_skb() will set the ownership to this  	 * sock.  	 */ -	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); +	buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));  	if (buff == NULL) {  		inet_csk_schedule_ack(sk);  		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; @@ -2720,7 +3174,7 @@ void tcp_send_ack(struct sock *sk)  	/* Send it off, this clears delayed acks for us. */  	TCP_SKB_CB(buff)->when = tcp_time_stamp; -	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); +	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));  }  /* This routine sends a packet with an out of date sequence @@ -2740,7 +3194,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)  	struct sk_buff *skb;  	/* We don't queue it, tcp_transmit_skb() sets ownership. */ -	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); +	skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));  	if (skb == NULL)  		return -1; @@ -2755,6 +3209,14 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)  	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);  } +void tcp_send_window_probe(struct sock *sk) +{ +	if (sk->sk_state == TCP_ESTABLISHED) { +		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; +		tcp_xmit_probe_skb(sk, 0); +	} +} +  /* Initiate keepalive or window probe from timer. */  int tcp_write_wakeup(struct sock *sk)  { @@ -2780,13 +3242,13 @@ int tcp_write_wakeup(struct sock *sk)  		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||  		    skb->len > mss) {  			seg_size = min(seg_size, mss); -			TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; -			if (tcp_fragment(sk, skb, seg_size, mss)) +			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; +			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))  				return -1;  		} else if (!tcp_skb_pcount(skb))  			tcp_set_skb_tso_segs(sk, skb, mss); -		TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; +		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;  		TCP_SKB_CB(skb)->when = tcp_time_stamp;  		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);  		if (!err) diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 85ee7eb7e38..3b66610d415 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -18,6 +18,8 @@   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.   
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/kernel.h>  #include <linux/kprobes.h>  #include <linux/socket.h> @@ -36,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper");  MODULE_LICENSE("GPL");  MODULE_VERSION("1.1"); -static int port __read_mostly = 0; +static int port __read_mostly;  MODULE_PARM_DESC(port, "Port to match (0=all)");  module_param(port, int, 0); @@ -44,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096;  MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");  module_param(bufsize, uint, 0); +static unsigned int fwmark __read_mostly; +MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); +module_param(fwmark, uint, 0); +  static int full __read_mostly;  MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");  module_param(full, int, 0); @@ -52,12 +58,16 @@ static const char procname[] = "tcpprobe";  struct tcp_log {  	ktime_t tstamp; -	__be32	saddr, daddr; -	__be16	sport, dport; +	union { +		struct sockaddr		raw; +		struct sockaddr_in	v4; +		struct sockaddr_in6	v6; +	}	src, dst;  	u16	length;  	u32	snd_nxt;  	u32	snd_una;  	u32	snd_wnd; +	u32	rcv_wnd;  	u32	snd_cwnd;  	u32	ssthresh;  	u32	srtt; @@ -84,19 +94,29 @@ static inline int tcp_probe_avail(void)  	return bufsize - tcp_probe_used() - 1;  } +#define tcp_probe_copy_fl_to_si4(inet, si4, mem)		\ +	do {							\ +		si4.sin_family = AF_INET;			\ +		si4.sin_port = inet->inet_##mem##port;		\ +		si4.sin_addr.s_addr = inet->inet_##mem##addr;	\ +	} while (0)						\ + +  /*   * Hook inserted to be called before each receive packet.   * Note: arguments must match tcp_rcv_established()!   */ -static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, -			       struct tcphdr *th, unsigned len) +static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, +				 const struct tcphdr *th, unsigned int len)  {  	const struct tcp_sock *tp = tcp_sk(sk);  	const struct inet_sock *inet = inet_sk(sk); -	/* Only update if port matches */ -	if ((port == 0 || ntohs(inet->inet_dport) == port || -	     ntohs(inet->inet_sport) == port) && +	/* Only update if port or skb mark matches */ +	if (((port == 0 && fwmark == 0) || +	     ntohs(inet->inet_dport) == port || +	     ntohs(inet->inet_sport) == port || +	     (fwmark > 0 && skb->mark == fwmark)) &&  	    (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {  		spin_lock(&tcp_probe.lock); @@ -105,17 +125,36 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,  			struct tcp_log *p = tcp_probe.log + tcp_probe.head;  			p->tstamp = ktime_get(); -			p->saddr = inet->inet_saddr; -			p->sport = inet->inet_sport; -			p->daddr = inet->inet_daddr; -			p->dport = inet->inet_dport; +			switch (sk->sk_family) { +			case AF_INET: +				tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); +				tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); +				break; +			case AF_INET6: +				memset(&p->src.v6, 0, sizeof(p->src.v6)); +				memset(&p->dst.v6, 0, sizeof(p->dst.v6)); +#if IS_ENABLED(CONFIG_IPV6) +				p->src.v6.sin6_family = AF_INET6; +				p->src.v6.sin6_port = inet->inet_sport; +				p->src.v6.sin6_addr = inet6_sk(sk)->saddr; + +				p->dst.v6.sin6_family = AF_INET6; +				p->dst.v6.sin6_port = inet->inet_dport; +				p->dst.v6.sin6_addr = sk->sk_v6_daddr; +#endif +				break; +			default: +				BUG(); +			} +  			p->length = skb->len;  			p->snd_nxt = tp->snd_nxt;  			p->snd_una = tp->snd_una;  			p->snd_cwnd = tp->snd_cwnd;  			p->snd_wnd = tp->snd_wnd; +			p->rcv_wnd = tp->rcv_wnd;  			p->ssthresh = 
tcp_current_ssthresh(sk); -			p->srtt = tp->srtt >> 3; +			p->srtt = tp->srtt_us >> 3;  			tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);  		} @@ -126,7 +165,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,  	}  	jprobe_return(); -	return 0;  }  static struct jprobe tcp_jprobe = { @@ -136,7 +174,7 @@ static struct jprobe tcp_jprobe = {  	.entry	= jtcp_rcv_established,  }; -static int tcpprobe_open(struct inode * inode, struct file * file) +static int tcpprobe_open(struct inode *inode, struct file *file)  {  	/* Reset (empty) log */  	spin_lock_bh(&tcp_probe.lock); @@ -155,13 +193,11 @@ static int tcpprobe_sprint(char *tbuf, int n)  		= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));  	return scnprintf(tbuf, n, -			"%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", +			"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",  			(unsigned long) tv.tv_sec,  			(unsigned long) tv.tv_nsec, -			&p->saddr, ntohs(p->sport), -			&p->daddr, ntohs(p->dport), -			p->length, p->snd_nxt, p->snd_una, -			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt); +			&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, +			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);  }  static ssize_t tcpprobe_read(struct file *file, char __user *buf, @@ -174,7 +210,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,  		return -EINVAL;  	while (cnt < len) { -		char tbuf[164]; +		char tbuf[256];  		int width;  		/* Wait for data in buffer */ @@ -221,6 +257,13 @@ static __init int tcpprobe_init(void)  {  	int ret = -ENOMEM; +	/* Warning: if the function signature of tcp_rcv_established, +	 * has been changed, you also have to change the signature of +	 * jtcp_rcv_established, otherwise you end up right here! +	 */ +	BUILD_BUG_ON(__same_type(tcp_rcv_established, +				 jtcp_rcv_established) == 0); +  	init_waitqueue_head(&tcp_probe.wait);  	spin_lock_init(&tcp_probe.lock); @@ -232,17 +275,18 @@ static __init int tcpprobe_init(void)  	if (!tcp_probe.log)  		goto err0; -	if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops)) +	if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))  		goto err0;  	ret = register_jprobe(&tcp_jprobe);  	if (ret)  		goto err1; -	pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize); +	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", +		port, fwmark, bufsize);  	return 0;   err1: -	proc_net_remove(&init_net, procname); +	remove_proc_entry(procname, init_net.proc_net);   err0:  	kfree(tcp_probe.log);  	return ret; @@ -251,7 +295,7 @@ module_init(tcpprobe_init);  static __exit void tcpprobe_exit(void)  { -	proc_net_remove(&init_net, procname); +	remove_proc_entry(procname, init_net.proc_net);  	unregister_jprobe(&tcp_jprobe);  	kfree(tcp_probe.log);  } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a76513779e2..8250949b885 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,15 +15,15 @@  #define TCP_SCALABLE_AI_CNT	50U  #define TCP_SCALABLE_MD_SCALE	3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else  		tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));  } @@ -35,10 +35,9 @@ static u32 
tcp_scalable_ssthresh(struct sock *sk)  } -static struct tcp_congestion_ops tcp_scalable = { +static struct tcp_congestion_ops tcp_scalable __read_mostly = {  	.ssthresh	= tcp_scalable_ssthresh,  	.cong_avoid	= tcp_scalable_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.owner		= THIS_MODULE,  	.name		= "scalable", diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 74a6aa00365..286227abed1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;  int sysctl_tcp_orphan_retries __read_mostly;  int sysctl_tcp_thin_linear_timeouts __read_mostly; -static void tcp_write_timer(unsigned long); -static void tcp_delack_timer(unsigned long); -static void tcp_keepalive_timer (unsigned long data); - -void tcp_init_xmit_timers(struct sock *sk) -{ -	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, -				  &tcp_keepalive_timer); -} -EXPORT_SYMBOL(tcp_init_xmit_timers); -  static void tcp_write_err(struct sock *sk)  {  	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -77,10 +66,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)  	if (sk->sk_err_soft)  		shift++; -	if (tcp_too_many_orphans(sk, shift)) { -		if (net_ratelimit()) -			printk(KERN_INFO "Out of socket memory\n"); - +	if (tcp_check_oom(sk, shift)) {  		/* Catch exceptional cases, when connection requires reset.  		 *      1. Last segment was sent recently. */  		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || @@ -170,14 +156,21 @@ static bool retransmits_timed_out(struct sock *sk,  static int tcp_write_timeout(struct sock *sk)  {  	struct inet_connection_sock *icsk = inet_csk(sk); +	struct tcp_sock *tp = tcp_sk(sk);  	int retry_until; -	bool do_reset, syn_set = 0; +	bool do_reset, syn_set = false;  	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { -		if (icsk->icsk_retransmits) +		if (icsk->icsk_retransmits) {  			dst_negative_advice(sk); +			if (tp->syn_fastopen || tp->syn_data) +				tcp_fastopen_cache_set(sk, 0, NULL, true); +			if (tp->syn_data) +				NET_INC_STATS_BH(sock_net(sk), +						 LINUX_MIB_TCPFASTOPENACTIVEFAIL); +		}  		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; -		syn_set = 1; +		syn_set = true;  	} else {  		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {  			/* Black hole detection */ @@ -208,21 +201,11 @@ static int tcp_write_timeout(struct sock *sk)  	return 0;  } -static void tcp_delack_timer(unsigned long data) +void tcp_delack_timer_handler(struct sock *sk)  { -	struct sock *sk = (struct sock *)data;  	struct tcp_sock *tp = tcp_sk(sk);  	struct inet_connection_sock *icsk = inet_csk(sk); -	bh_lock_sock(sk); -	if (sock_owned_by_user(sk)) { -		/* Try again later. 
*/ -		icsk->icsk_ack.blocked = 1; -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); -		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); -		goto out_unlock; -	} -  	sk_mem_reclaim_partial(sk);  	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) @@ -259,12 +242,26 @@ static void tcp_delack_timer(unsigned long data)  		tcp_send_ack(sk);  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);  	} -	TCP_CHECK_TIMER(sk);  out: -	if (tcp_memory_pressure) +	if (sk_under_memory_pressure(sk))  		sk_mem_reclaim(sk); -out_unlock: +} + +static void tcp_delack_timer(unsigned long data) +{ +	struct sock *sk = (struct sock *)data; + +	bh_lock_sock(sk); +	if (!sock_owned_by_user(sk)) { +		tcp_delack_timer_handler(sk); +	} else { +		inet_csk(sk)->icsk_ack.blocked = 1; +		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); +		/* deleguate our work to tcp_release_cb() */ +		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) +			sock_hold(sk); +	}  	bh_unlock_sock(sk);  	sock_put(sk);  } @@ -315,6 +312,35 @@ static void tcp_probe_timer(struct sock *sk)  }  /* + *	Timer for Fast Open socket to retransmit SYNACK. Note that the + *	sk here is the child socket, not the parent (listener) socket. + */ +static void tcp_fastopen_synack_timer(struct sock *sk) +{ +	struct inet_connection_sock *icsk = inet_csk(sk); +	int max_retries = icsk->icsk_syn_retries ? : +	    sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ +	struct request_sock *req; + +	req = tcp_sk(sk)->fastopen_rsk; +	req->rsk_ops->syn_ack_timeout(sk, req); + +	if (req->num_timeout >= max_retries) { +		tcp_write_err(sk); +		return; +	} +	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error +	 * returned from rtx_syn_ack() to make it more persistent like +	 * regular retransmit because if the child socket has been accepted +	 * it's not good to give up too easily. +	 */ +	inet_rtx_syn_ack(sk, req); +	req->num_timeout++; +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, +			  TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); +} + +/*   *	The TCP retransmit timer.   */ @@ -323,11 +349,22 @@ void tcp_retransmit_timer(struct sock *sk)  	struct tcp_sock *tp = tcp_sk(sk);  	struct inet_connection_sock *icsk = inet_csk(sk); +	if (tp->fastopen_rsk) { +		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && +			     sk->sk_state != TCP_FIN_WAIT1); +		tcp_fastopen_synack_timer(sk); +		/* Before we receive ACK to our SYN-ACK don't retransmit +		 * anything else (e.g., data or FIN segments). +		 */ +		return; +	}  	if (!tp->packets_out)  		goto out;  	WARN_ON(tcp_write_queue_empty(sk)); +	tp->tlp_high_seq = 0; +  	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&  	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {  		/* Receiver dastardly shrinks window. Our retransmits @@ -335,22 +372,21 @@ void tcp_retransmit_timer(struct sock *sk)  		 * connection. If the socket is an orphan, time it out,  		 * we cannot allow such beasts to hang infinitely.  		 
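The delayed-ACK timer rework above splits the timer into a handler plus a thin wrapper: when the socket is owned by user context the wrapper only sets a bit in tsq_flags and takes a reference, and tcp_release_cb() runs the handler once the lock owner releases the socket. A userspace sketch of that defer-to-release pattern, with C11 atomics standing in for the kernel primitives (all names and types here are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DELACK_TIMER_DEFERRED	(1u << 0)

struct fake_sock {
	atomic_uint tsq_flags;
	atomic_int  refcnt;
	bool        owned_by_user;
};

static void delack_timer_handler(struct fake_sock *sk)
{
	printf("delayed ACK timer handled\n");
}

static void delack_timer(struct fake_sock *sk)
{
	if (!sk->owned_by_user) {
		delack_timer_handler(sk);
	} else {
		/* delegate to the lock owner; hold a reference like sock_hold() */
		unsigned int old = atomic_fetch_or(&sk->tsq_flags,
						   DELACK_TIMER_DEFERRED);
		if (!(old & DELACK_TIMER_DEFERRED))
			atomic_fetch_add(&sk->refcnt, 1);
	}
}

/* runs when the lock owner finally releases the socket */
static void release_cb(struct fake_sock *sk)
{
	unsigned int flags = atomic_exchange(&sk->tsq_flags, 0);

	if (flags & DELACK_TIMER_DEFERRED) {
		delack_timer_handler(sk);
		atomic_fetch_sub(&sk->refcnt, 1);	/* paired sock_put() */
	}
}

int main(void)
{
	struct fake_sock sk = { .refcnt = 1, .owned_by_user = true };

	delack_timer(&sk);	/* defers: socket is locked by user context */
	release_cb(&sk);	/* the handler finally runs here */
	return 0;
}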
*/ -#ifdef TCP_DEBUG  		struct inet_sock *inet = inet_sk(sk);  		if (sk->sk_family == AF_INET) { -			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", -			       &inet->inet_daddr, ntohs(inet->inet_dport), -			       inet->inet_num, tp->snd_una, tp->snd_nxt); +			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), +				       &inet->inet_daddr, +				       ntohs(inet->inet_dport), inet->inet_num, +				       tp->snd_una, tp->snd_nxt);  		} -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  		else if (sk->sk_family == AF_INET6) { -			struct ipv6_pinfo *np = inet6_sk(sk); -			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", -			       &np->daddr, ntohs(inet->inet_dport), -			       inet->inet_num, tp->snd_una, tp->snd_nxt); +			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), +				       &sk->sk_v6_daddr, +				       ntohs(inet->inet_dport), inet->inet_num, +				       tp->snd_una, tp->snd_nxt);  		}  #endif -#endif  		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {  			tcp_write_err(sk);  			goto out; @@ -386,11 +422,7 @@ void tcp_retransmit_timer(struct sock *sk)  		NET_INC_STATS_BH(sock_net(sk), mib_idx);  	} -	if (tcp_use_frto(sk)) { -		tcp_enter_frto(sk); -	} else { -		tcp_enter_loss(sk, 0); -	} +	tcp_enter_loss(sk, 0);  	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {  		/* Retransmission failed because of local congestion, @@ -449,19 +481,11 @@ out_reset_timer:  out:;  } -static void tcp_write_timer(unsigned long data) +void tcp_write_timer_handler(struct sock *sk)  { -	struct sock *sk = (struct sock *)data;  	struct inet_connection_sock *icsk = inet_csk(sk);  	int event; -	bh_lock_sock(sk); -	if (sock_owned_by_user(sk)) { -		/* Try again later */ -		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); -		goto out_unlock; -	} -  	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)  		goto out; @@ -471,21 +495,40 @@ static void tcp_write_timer(unsigned long data)  	}  	event = icsk->icsk_pending; -	icsk->icsk_pending = 0;  	switch (event) { +	case ICSK_TIME_EARLY_RETRANS: +		tcp_resume_early_retransmit(sk); +		break; +	case ICSK_TIME_LOSS_PROBE: +		tcp_send_loss_probe(sk); +		break;  	case ICSK_TIME_RETRANS: +		icsk->icsk_pending = 0;  		tcp_retransmit_timer(sk);  		break;  	case ICSK_TIME_PROBE0: +		icsk->icsk_pending = 0;  		tcp_probe_timer(sk);  		break;  	} -	TCP_CHECK_TIMER(sk);  out:  	sk_mem_reclaim(sk); -out_unlock: +} + +static void tcp_write_timer(unsigned long data) +{ +	struct sock *sk = (struct sock *)data; + +	bh_lock_sock(sk); +	if (!sock_owned_by_user(sk)) { +		tcp_write_timer_handler(sk); +	} else { +		/* deleguate our work to tcp_release_cb() */ +		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) +			sock_hold(sk); +	}  	bh_unlock_sock(sk);  	sock_put(sk);  } @@ -589,7 +632,6 @@ static void tcp_keepalive_timer (unsigned long data)  		elapsed = keepalive_time_when(tp) - elapsed;  	} -	TCP_CHECK_TIMER(sk);  	sk_mem_reclaim(sk);  resched: @@ -603,3 +645,10 @@ out:  	bh_unlock_sock(sk);  	sock_put(sk);  } + +void tcp_init_xmit_timers(struct sock *sk) +{ +	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, +				  &tcp_keepalive_timer); +} +EXPORT_SYMBOL(tcp_init_xmit_timers); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c6743eec9b7..9a5e05f27f4 100644 --- 
a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -163,13 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)  	return  min(tp->snd_ssthresh, tp->snd_cwnd-1);  } -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct vegas *vegas = inet_csk_ca(sk);  	if (!vegas->doing_vegas_now) { -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  		return;  	} @@ -194,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  			/* We don't have enough RTT samples to do the Vegas  			 * calculation, so we'll behave like Reno.  			 */ -			tcp_reno_cong_avoid(sk, ack, in_flight); +			tcp_reno_cong_avoid(sk, ack, acked);  		} else {  			u32 rtt, diff;  			u64 target_cwnd; @@ -243,7 +243,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {  				/* Slow start.  */ -				tcp_slow_start(tp); +				tcp_slow_start(tp, acked);  			} else {  				/* Congestion avoidance. */ @@ -283,7 +283,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  	}  	/* Use normal slow start */  	else if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  } @@ -304,12 +304,10 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)  }  EXPORT_SYMBOL_GPL(tcp_vegas_get_info); -static struct tcp_congestion_ops tcp_vegas = { -	.flags		= TCP_CONG_RTT_STAMP, +static struct tcp_congestion_ops tcp_vegas __read_mostly = {  	.init		= tcp_vegas_init,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_vegas_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.pkts_acked	= tcp_vegas_pkts_acked,  	.set_state	= tcp_vegas_state,  	.cwnd_event	= tcp_vegas_cwnd_event, diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index 6c0eea2f824..0531b99d863 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -15,10 +15,10 @@ struct vegas {  	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */  }; -extern void tcp_vegas_init(struct sock *sk); -extern void tcp_vegas_state(struct sock *sk, u8 ca_state); -extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); -extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); -extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb); +void tcp_vegas_init(struct sock *sk); +void tcp_vegas_state(struct sock *sk, u8 ca_state); +void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); +void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);  #endif	/* __TCP_VEGAS_H */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 38bc0b52d74..27b9825753d 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,18 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)  		tcp_veno_init(sk);  } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct veno *veno = inet_csk_ca(sk);  	if (!veno->doing_veno_now) { -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  		return;  	}  	/* limited by applications */ -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  	
	return;  	/* We do the Veno calculations only if we got enough rtt samples */ @@ -133,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  		/* We don't have enough rtt samples to do the Veno  		 * calculation, so we'll behave like Reno.  		 */ -		tcp_reno_cong_avoid(sk, ack, in_flight); +		tcp_reno_cong_avoid(sk, ack, acked);  	} else {  		u64 target_cwnd;  		u32 rtt; @@ -152,7 +152,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)  		if (tp->snd_cwnd <= tp->snd_ssthresh) {  			/* Slow start.  */ -			tcp_slow_start(tp); +			tcp_slow_start(tp, acked);  		} else {  			/* Congestion avoidance. */  			if (veno->diff < beta) { @@ -201,8 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)  		return max(tp->snd_cwnd >> 1U, 2U);  } -static struct tcp_congestion_ops tcp_veno = { -	.flags		= TCP_CONG_RTT_STAMP, +static struct tcp_congestion_ops tcp_veno __read_mostly = {  	.init		= tcp_veno_init,  	.ssthresh	= tcp_veno_ssthresh,  	.cong_avoid	= tcp_veno_cong_avoid, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index a534dda5456..b94a04ae2ed 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)  		tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);  		break; -	case CA_EVENT_FRTO: +	case CA_EVENT_LOSS:  		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);  		/* Update RTT_min when next ack arrives */  		w->reset_rtt_min = 1; @@ -272,11 +272,10 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,  } -static struct tcp_congestion_ops tcp_westwood = { +static struct tcp_congestion_ops tcp_westwood __read_mostly = {  	.init		= tcp_westwood_init,  	.ssthresh	= tcp_reno_ssthresh,  	.cong_avoid	= tcp_reno_cong_avoid, -	.min_cwnd	= tcp_westwood_bw_rttmin,  	.cwnd_event	= tcp_westwood_event,  	.get_info	= tcp_westwood_info,  	.pkts_acked	= tcp_westwood_pkts_acked, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a0f24035889..599b79b8eac 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -3,7 +3,7 @@   *   YeAH TCP   *   * For further details look at: - *    http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + *   https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf   *   */  #include <linux/mm.h> @@ -15,13 +15,13 @@  #include "tcp_vegas.h" -#define TCP_YEAH_ALPHA       80 //lin number of packets queued at the bottleneck -#define TCP_YEAH_GAMMA        1 //lin fraction of queue to be removed per rtt -#define TCP_YEAH_DELTA        3 //log minimum fraction of cwnd to be removed on loss -#define TCP_YEAH_EPSILON      1 //log maximum fraction to be removed on early decongestion -#define TCP_YEAH_PHY          8 //lin maximum delta from base -#define TCP_YEAH_RHO         16 //lin minumum number of consecutive rtt to consider competition on loss -#define TCP_YEAH_ZETA        50 //lin minimum number of state switchs to reset reno_count +#define TCP_YEAH_ALPHA       80 /* number of packets queued at the bottleneck */ +#define TCP_YEAH_GAMMA        1 /* fraction of queue to be removed per rtt */ +#define TCP_YEAH_DELTA        3 /* log minimum fraction of cwnd to be removed on loss */ +#define TCP_YEAH_EPSILON      1 /* log maximum fraction to be removed on early decongestion */ +#define TCP_YEAH_PHY          8 /* maximum delta from base */ +#define TCP_YEAH_RHO         16 /* minimum number of consecutive rtt to consider competition on loss */ 
+#define TCP_YEAH_ZETA        50 /* minimum number of state switches to reset reno_count */  #define TCP_SCALABLE_AI_CNT	 100U @@ -69,16 +69,16 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)  	tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);  } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)  {  	struct tcp_sock *tp = tcp_sk(sk);  	struct yeah *yeah = inet_csk_ca(sk); -	if (!tcp_is_cwnd_limited(sk, in_flight)) +	if (!tcp_is_cwnd_limited(sk))  		return;  	if (tp->snd_cwnd <= tp->snd_ssthresh) -		tcp_slow_start(tp); +		tcp_slow_start(tp, acked);  	else if (!yeah->doing_reno_now) {  		/* Scalable */ @@ -213,9 +213,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {  	if (yeah->doing_reno_now < TCP_YEAH_RHO) {  		reduction = yeah->lastQ; -		reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); +		reduction = min(reduction, max(tp->snd_cwnd>>1, 2U)); -		reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); +		reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);  	} else  		reduction = max(tp->snd_cwnd>>1, 2U); @@ -225,12 +225,10 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {  	return tp->snd_cwnd - reduction;  } -static struct tcp_congestion_ops tcp_yeah = { -	.flags		= TCP_CONG_RTT_STAMP, +static struct tcp_congestion_ops tcp_yeah __read_mostly = {  	.init		= tcp_yeah_init,  	.ssthresh	= tcp_yeah_ssthresh,  	.cong_avoid	= tcp_yeah_cong_avoid, -	.min_cwnd	= tcp_reno_min_cwnd,  	.set_state	= tcp_vegas_state,  	.cwnd_event	= tcp_vegas_cwnd_event,  	.get_info	= tcp_vegas_get_info, diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index ac3b3ee4b07..0d017183062 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -105,7 +105,7 @@ drop:  	return 0;  } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  static int tunnel64_rcv(struct sk_buff *skb)  {  	struct xfrm_tunnel *handler; @@ -134,7 +134,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)  			break;  } -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  static void tunnel64_err(struct sk_buff *skb, u32 info)  {  	struct xfrm_tunnel *handler; @@ -152,7 +152,7 @@ static const struct net_protocol tunnel4_protocol = {  	.netns_ok	=	1,  }; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  static const struct net_protocol tunnel64_protocol = {  	.handler	=	tunnel64_rcv,  	.err_handler	=	tunnel64_err, @@ -164,12 +164,12 @@ static const struct net_protocol tunnel64_protocol = {  static int __init tunnel4_init(void)  {  	if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { -		printk(KERN_ERR "tunnel4 init: can't add protocol\n"); +		pr_err("%s: can't add protocol\n", __func__);  		return -EAGAIN;  	} -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  	if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { -		printk(KERN_ERR "tunnel64 init: can't add protocol\n"); +		pr_err("tunnel64 init: can't add protocol\n");  		inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);  		return -EAGAIN;  	} @@ -179,12 +179,12 @@ static int __init tunnel4_init(void)  static void __exit tunnel4_fini(void)  { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  	if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) -		printk(KERN_ERR "tunnel64 close: can't remove protocol\n"); +		pr_err("tunnel64 close: can't remove 
protocol\n");  #endif  	if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) -		printk(KERN_ERR "tunnel4 close: can't remove protocol\n"); +		pr_err("tunnel4 close: can't remove protocol\n");  }  module_init(tunnel4_init); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b37181da487..7d5a8661df7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -77,7 +77,8 @@   *		2 of the License, or (at your option) any later version.   */ -#include <asm/system.h> +#define pr_fmt(fmt) "UDP: " fmt +  #include <asm/uaccess.h>  #include <asm/ioctls.h>  #include <linux/bootmem.h> @@ -102,9 +103,14 @@  #include <linux/seq_file.h>  #include <net/net_namespace.h>  #include <net/icmp.h> +#include <net/inet_hashtables.h>  #include <net/route.h>  #include <net/checksum.h>  #include <net/xfrm.h> +#include <trace/events/udp.h> +#include <linux/static_key.h> +#include <trace/events/skb.h> +#include <net/busy_poll.h>  #include "udp_impl.h"  struct udp_table udp_table __read_mostly; @@ -135,6 +141,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,  {  	struct sock *sk2;  	struct hlist_nulls_node *node; +	kuid_t uid = sock_i_uid(sk);  	sk_nulls_for_each(sk2, node, &hslot->head)  		if (net_eq(sock_net(sk2), net) && @@ -143,6 +150,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,  		    (!sk2->sk_reuse || !sk->sk_reuse) &&  		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||  		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && +		    (!sk2->sk_reuseport || !sk->sk_reuseport || +		      !uid_eq(uid, sock_i_uid(sk2))) &&  		    (*saddr_comp)(sk, sk2)) {  			if (bitmap)  				__set_bit(udp_sk(sk2)->udp_port_hash >> log, @@ -165,6 +174,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,  {  	struct sock *sk2;  	struct hlist_nulls_node *node; +	kuid_t uid = sock_i_uid(sk);  	int res = 0;  	spin_lock(&hslot2->lock); @@ -175,6 +185,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,  		    (!sk2->sk_reuse || !sk->sk_reuse) &&  		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||  		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && +		    (!sk2->sk_reuseport || !sk->sk_reuseport || +		      !uid_eq(uid, sock_i_uid(sk2))) &&  		    (*saddr_comp)(sk, sk2)) {  			res = 1;  			break; @@ -189,7 +201,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,   *  @sk:          socket struct in question   *  @snum:        port number to look up   *  @saddr_comp:  AF-dependent comparison of bound local IP addresses - *  @hash2_nulladdr: AF-dependant hash value in secondary hash chains, + *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,   *                   with NULL address   */  int udp_lib_get_port(struct sock *sk, unsigned short snum, @@ -204,14 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,  	if (!snum) {  		int low, high, remaining; -		unsigned rand; +		unsigned int rand;  		unsigned short first, last;  		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); -		inet_get_local_port_range(&low, &high); +		inet_get_local_port_range(net, &low, &high);  		remaining = (high - low) + 1; -		rand = net_random(); +		rand = prandom_u32();  		first = (((u64)rand * remaining) >> 32) + low;  		/*  		 * force rand to be an odd multiple of UDP_HTABLE_SIZE @@ -234,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,  			do {  				if (low <= snum && snum <= high &&  				    !test_bit(snum >> udptable->log, bitmap) && -				    !inet_is_reserved_local_port(snum)) +				    !inet_is_local_reserved_port(net, snum))  					goto 
found;  				snum += rand;  			} while (snum != first); @@ -333,26 +345,26 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,  			!ipv6_only_sock(sk)) {  		struct inet_sock *inet = inet_sk(sk); -		score = (sk->sk_family == PF_INET ? 1 : 0); +		score = (sk->sk_family == PF_INET ? 2 : 1);  		if (inet->inet_rcv_saddr) {  			if (inet->inet_rcv_saddr != daddr)  				return -1; -			score += 2; +			score += 4;  		}  		if (inet->inet_daddr) {  			if (inet->inet_daddr != saddr)  				return -1; -			score += 2; +			score += 4;  		}  		if (inet->inet_dport) {  			if (inet->inet_dport != sport)  				return -1; -			score += 2; +			score += 4;  		}  		if (sk->sk_bound_dev_if) {  			if (sk->sk_bound_dev_if != dif)  				return -1; -			score += 2; +			score += 4;  		}  	}  	return score; @@ -361,7 +373,6 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,  /*   * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)   */ -#define SCORE2_MAX (1 + 2 + 2 + 2)  static inline int compute_score2(struct sock *sk, struct net *net,  				 __be32 saddr, __be16 sport,  				 __be32 daddr, unsigned int hnum, int dif) @@ -376,26 +387,38 @@ static inline int compute_score2(struct sock *sk, struct net *net,  		if (inet->inet_num != hnum)  			return -1; -		score = (sk->sk_family == PF_INET ? 1 : 0); +		score = (sk->sk_family == PF_INET ? 2 : 1);  		if (inet->inet_daddr) {  			if (inet->inet_daddr != saddr)  				return -1; -			score += 2; +			score += 4;  		}  		if (inet->inet_dport) {  			if (inet->inet_dport != sport)  				return -1; -			score += 2; +			score += 4;  		}  		if (sk->sk_bound_dev_if) {  			if (sk->sk_bound_dev_if != dif)  				return -1; -			score += 2; +			score += 4;  		}  	}  	return score;  } +static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, +				 const __u16 lport, const __be32 faddr, +				 const __be16 fport) +{ +	static u32 udp_ehash_secret __read_mostly; + +	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); + +	return __inet_ehashfn(laddr, lport, faddr, fport, +			      udp_ehash_secret + net_hash_mix(net)); +} +  /* called with read_rcu_lock() */  static struct sock *udp4_lib_lookup2(struct net *net, @@ -405,19 +428,29 @@ static struct sock *udp4_lib_lookup2(struct net *net,  {  	struct sock *sk, *result;  	struct hlist_nulls_node *node; -	int score, badness; +	int score, badness, matches = 0, reuseport = 0; +	u32 hash = 0;  begin:  	result = NULL; -	badness = -1; +	badness = 0;  	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {  		score = compute_score2(sk, net, saddr, sport,  				      daddr, hnum, dif);  		if (score > badness) {  			result = sk;  			badness = score; -			if (score == SCORE2_MAX) -				goto exact_match; +			reuseport = sk->sk_reuseport; +			if (reuseport) { +				hash = udp_ehashfn(net, daddr, hnum, +						   saddr, sport); +				matches = 1; +			} +		} else if (score == badness && reuseport) { +			matches++; +			if (((u64)hash * matches) >> 32 == 0) +				result = sk; +			hash = next_pseudo_random32(hash);  		}  	}  	/* @@ -427,9 +460,7 @@ begin:  	 */  	if (get_nulls_value(node) != slot2)  		goto begin; -  	if (result) { -exact_match:  		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))  			result = NULL;  		else if (unlikely(compute_score2(result, net, saddr, sport, @@ -444,7 +475,7 @@ exact_match:  /* UDP is nearly always wildcards out the wazoo, it makes no sense to try   * harder than this. 
-DaveM   */ -static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, +struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,  		__be16 sport, __be32 daddr, __be16 dport,  		int dif, struct udp_table *udptable)  { @@ -453,7 +484,8 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,  	unsigned short hnum = ntohs(dport);  	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);  	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; -	int score, badness; +	int score, badness, matches = 0, reuseport = 0; +	u32 hash = 0;  	rcu_read_lock();  	if (hslot->count > 10) { @@ -482,13 +514,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,  	}  begin:  	result = NULL; -	badness = -1; +	badness = 0;  	sk_nulls_for_each_rcu(sk, node, &hslot->head) {  		score = compute_score(sk, net, saddr, hnum, sport,  				      daddr, dport, dif);  		if (score > badness) {  			result = sk;  			badness = score; +			reuseport = sk->sk_reuseport; +			if (reuseport) { +				hash = udp_ehashfn(net, daddr, hnum, +						   saddr, sport); +				matches = 1; +			} +		} else if (score == badness && reuseport) { +			matches++; +			if (((u64)hash * matches) >> 32 == 0) +				result = sk; +			hash = next_pseudo_random32(hash);  		}  	}  	/* @@ -511,20 +554,17 @@ begin:  	rcu_read_unlock();  	return result;  } +EXPORT_SYMBOL_GPL(__udp4_lib_lookup);  static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,  						 __be16 sport, __be16 dport,  						 struct udp_table *udptable)  { -	struct sock *sk;  	const struct iphdr *iph = ip_hdr(skb); -	if (unlikely(sk = skb_steal_sock(skb))) -		return sk; -	else -		return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, -					 iph->daddr, dport, inet_iif(skb), -					 udptable); +	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, +				 iph->daddr, dport, inet_iif(skb), +				 udptable);  }  struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, @@ -534,6 +574,26 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,  }  EXPORT_SYMBOL_GPL(udp4_lib_lookup); +static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, +				       __be16 loc_port, __be32 loc_addr, +				       __be16 rmt_port, __be32 rmt_addr, +				       int dif, unsigned short hnum) +{ +	struct inet_sock *inet = inet_sk(sk); + +	if (!net_eq(sock_net(sk), net) || +	    udp_sk(sk)->udp_port_hash != hnum || +	    (inet->inet_daddr && inet->inet_daddr != rmt_addr) || +	    (inet->inet_dport != rmt_port && inet->inet_dport) || +	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || +	    ipv6_only_sock(sk) || +	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) +		return false; +	if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) +		return false; +	return true; +} +  static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,  					     __be16 loc_port, __be32 loc_addr,  					     __be16 rmt_port, __be32 rmt_addr, @@ -544,20 +604,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,  	unsigned short hnum = ntohs(loc_port);  	sk_nulls_for_each_from(s, node) { -		struct inet_sock *inet = inet_sk(s); - -		if (!net_eq(sock_net(s), net) || -		    udp_sk(s)->udp_port_hash != hnum || -		    (inet->inet_daddr && inet->inet_daddr != rmt_addr) || -		    (inet->inet_dport != rmt_port && inet->inet_dport) || -		    (inet->inet_rcv_saddr && -		     inet->inet_rcv_saddr != loc_addr) || -		    
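The SO_REUSEPORT selection added to both UDP lookup loops above keeps the k-th socket that ties for the best score with probability 1/k, which picks one of the N tied sockets uniformly at random (reservoir sampling). A standalone sketch (not kernel code), with a plain xorshift standing in for the udp_ehashfn()/next_pseudo_random32() chain:

#include <stdint.h>
#include <stdio.h>

static uint32_t xorshift32(uint32_t x)
{
	x ^= x << 13;
	x ^= x >> 17;
	x ^= x << 5;
	return x;
}

static int pick_among_ties(const int *sock_ids, int n, uint32_t hash)
{
	int result = -1, matches = 0;

	for (int i = 0; i < n; i++) {
		matches++;
		/* keep this candidate with probability 1/matches */
		if (((uint64_t)hash * matches) >> 32 == 0)
			result = sock_ids[i];
		hash = xorshift32(hash);
	}
	return result;
}

int main(void)
{
	int socks[] = { 10, 11, 12, 13 };

	printf("chosen socket %d\n", pick_among_ties(socks, 4, 0x12345678u));
	return 0;
}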
ipv6_only_sock(s) || -		    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) -			continue; -		if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) -			continue; -		goto found; +		if (__udp_is_mcast_sock(net, s, +					loc_port, loc_addr, +					rmt_port, rmt_addr, +					dif, hnum)) +			goto found;  	}  	s = NULL;  found: @@ -578,7 +629,7 @@ found:  void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)  {  	struct inet_sock *inet; -	struct iphdr *iph = (struct iphdr *)skb->data; +	const struct iphdr *iph = (const struct iphdr *)skb->data;  	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));  	const int type = icmp_hdr(skb)->type;  	const int code = icmp_hdr(skb)->code; @@ -611,6 +662,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)  		break;  	case ICMP_DEST_UNREACH:  		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ +			ipv4_sk_update_pmtu(skb, sk, info);  			if (inet->pmtudisc != IP_PMTUDISC_DONT) {  				err = EMSGSIZE;  				harderr = 1; @@ -624,6 +676,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)  			err = icmp_err_convert[code].errno;  		}  		break; +	case ICMP_REDIRECT: +		ipv4_sk_redirect(skb, sk); +		goto out;  	}  	/* @@ -663,97 +718,132 @@ void udp_flush_pending_frames(struct sock *sk)  EXPORT_SYMBOL(udp_flush_pending_frames);  /** - * 	udp4_hwcsum_outgoing  -  handle outgoing HW checksumming - * 	@sk: 	socket we are sending on + * 	udp4_hwcsum  -  handle outgoing HW checksumming   * 	@skb: 	sk_buff containing the filled-in UDP header   * 	        (checksum field must be zeroed out) + *	@src:	source IP address + *	@dst:	destination IP address   */ -static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, -				 __be32 src, __be32 dst, int len) +void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)  { -	unsigned int offset;  	struct udphdr *uh = udp_hdr(skb); +	int offset = skb_transport_offset(skb); +	int len = skb->len - offset; +	int hlen = len;  	__wsum csum = 0; -	if (skb_queue_len(&sk->sk_write_queue) == 1) { +	if (!skb_has_frag_list(skb)) {  		/*  		 * Only one fragment on the socket.  		 */  		skb->csum_start = skb_transport_header(skb) - skb->head;  		skb->csum_offset = offsetof(struct udphdr, check); -		uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); +		uh->check = ~csum_tcpudp_magic(src, dst, len, +					       IPPROTO_UDP, 0);  	} else { +		struct sk_buff *frags; +  		/*  		 * HW-checksum won't work as there are two or more  		 * fragments on the socket so that all csums of sk_buffs  		 * should be together  		 */ -		offset = skb_transport_offset(skb); -		skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); +		skb_walk_frags(skb, frags) { +			csum = csum_add(csum, frags->csum); +			hlen -= frags->len; +		} +		csum = skb_checksum(skb, offset, hlen, csum);  		skb->ip_summed = CHECKSUM_NONE; -		skb_queue_walk(&sk->sk_write_queue, skb) { -			csum = csum_add(csum, skb->csum); -		} -  		uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);  		if (uh->check == 0)  			uh->check = CSUM_MANGLED_0;  	}  } +EXPORT_SYMBOL_GPL(udp4_hwcsum); -/* - * Push out all pending data as one UDP datagram. Socket is locked. +/* Function to set UDP checksum for an IPv4 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel.   
*/ -static int udp_push_pending_frames(struct sock *sk) +void udp_set_csum(bool nocheck, struct sk_buff *skb, +		  __be32 saddr, __be32 daddr, int len)  { -	struct udp_sock  *up = udp_sk(sk); +	struct udphdr *uh = udp_hdr(skb); + +	if (nocheck) +		uh->check = 0; +	else if (skb_is_gso(skb)) +		uh->check = ~udp_v4_check(len, saddr, daddr, 0); +	else if (skb_dst(skb) && skb_dst(skb)->dev && +		 (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + +		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + +		skb->ip_summed = CHECKSUM_PARTIAL; +		skb->csum_start = skb_transport_header(skb) - skb->head; +		skb->csum_offset = offsetof(struct udphdr, check); +		uh->check = ~udp_v4_check(len, saddr, daddr, 0); +	} else { +		__wsum csum; + +		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + +		uh->check = 0; +		csum = skb_checksum(skb, 0, len, 0); +		uh->check = udp_v4_check(len, saddr, daddr, csum); +		if (uh->check == 0) +			uh->check = CSUM_MANGLED_0; + +		skb->ip_summed = CHECKSUM_UNNECESSARY; +	} +} +EXPORT_SYMBOL(udp_set_csum); + +static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) +{ +	struct sock *sk = skb->sk;  	struct inet_sock *inet = inet_sk(sk); -	struct flowi *fl = &inet->cork.fl; -	struct sk_buff *skb;  	struct udphdr *uh;  	int err = 0;  	int is_udplite = IS_UDPLITE(sk); +	int offset = skb_transport_offset(skb); +	int len = skb->len - offset;  	__wsum csum = 0; -	/* Grab the skbuff where UDP header space exists. */ -	if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) -		goto out; -  	/*  	 * Create a UDP header  	 */  	uh = udp_hdr(skb); -	uh->source = fl->fl_ip_sport; -	uh->dest = fl->fl_ip_dport; -	uh->len = htons(up->len); +	uh->source = inet->inet_sport; +	uh->dest = fl4->fl4_dport; +	uh->len = htons(len);  	uh->check = 0;  	if (is_udplite)  				 /*     UDP-Lite      */ -		csum  = udplite_csum_outgoing(sk, skb); +		csum = udplite_csum(skb); -	else if (sk->sk_no_check == UDP_CSUM_NOXMIT) {   /* UDP csum disabled */ +	else if (sk->sk_no_check_tx) {   /* UDP csum disabled */  		skb->ip_summed = CHECKSUM_NONE;  		goto send;  	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ -		udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); +		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);  		goto send; -	} else						 /*   `normal' UDP    */ -		csum = udp_csum_outgoing(sk, skb); +	} else +		csum = udp_csum(skb);  	/* add protocol-dependent pseudo-header */ -	uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, +	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,  				      sk->sk_protocol, csum);  	if (uh->check == 0)  		uh->check = CSUM_MANGLED_0;  send: -	err = ip_push_pending_frames(sk); +	err = ip_send_skb(sock_net(sk), skb);  	if (err) {  		if (err == -ENOBUFS && !inet->recverr) {  			UDP_INC_STATS_USER(sock_net(sk), @@ -763,17 +853,40 @@ send:  	} else  		UDP_INC_STATS_USER(sock_net(sk),  				   UDP_MIB_OUTDATAGRAMS, is_udplite); +	return err; +} + +/* + * Push out all pending data as one UDP datagram. Socket is locked. 
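
Both udp4_hwcsum() and udp_send_skb() above finish the same way: the UDP checksum is the ones'-complement sum of a pseudo-header (source address, destination address, protocol, UDP length) plus the UDP header and payload, and a computed value of zero goes on the wire as 0xFFFF (CSUM_MANGLED_0), since zero means "no checksum" for UDP over IPv4. A minimal userspace sketch of that calculation in plain C rather than the kernel's csum_* helpers (addresses and length are passed in host byte order, the checksum field inside the datagram must already be zero, and the caller stores the result with htons()):

#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>

/* Ones'-complement accumulation over a byte buffer. */
static uint32_t csum_add_buf(uint32_t sum, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte, padded with zero */
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* UDP/IPv4 checksum: pseudo-header + UDP header + payload (RFC 768). */
static uint16_t udp4_checksum(uint32_t saddr, uint32_t daddr,
			      const void *udp, uint16_t udp_len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += IPPROTO_UDP;
	sum += udp_len;
	sum = csum_add_buf(sum, udp, udp_len);

	while (sum >> 16)		/* fold carries back into 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);

	sum = ~sum & 0xffff;
	return sum ? (uint16_t)sum : 0xffff;	/* 0 is mangled to 0xFFFF */
}
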
+ */ +int udp_push_pending_frames(struct sock *sk) +{ +	struct udp_sock  *up = udp_sk(sk); +	struct inet_sock *inet = inet_sk(sk); +	struct flowi4 *fl4 = &inet->cork.fl.u.ip4; +	struct sk_buff *skb; +	int err = 0; + +	skb = ip_finish_skb(sk, fl4); +	if (!skb) +		goto out; + +	err = udp_send_skb(skb, fl4); +  out:  	up->len = 0;  	up->pending = 0;  	return err;  } +EXPORT_SYMBOL(udp_push_pending_frames);  int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		size_t len)  {  	struct inet_sock *inet = inet_sk(sk);  	struct udp_sock *up = udp_sk(sk); +	struct flowi4 fl4_stack; +	struct flowi4 *fl4;  	int ulen = len;  	struct ipcm_cookie ipc;  	struct rtable *rt = NULL; @@ -785,6 +898,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	int err, is_udplite = IS_UDPLITE(sk);  	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;  	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); +	struct sk_buff *skb; +	struct ip_options_data opt_copy;  	if (len > 0xFFFF)  		return -EMSGSIZE; @@ -798,7 +913,12 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	ipc.opt = NULL;  	ipc.tx_flags = 0; +	ipc.ttl = 0; +	ipc.tos = -1; +	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; + +	fl4 = &inet->cork.fl.u.ip4;  	if (up->pending) {  		/*  		 * There are pending frames. @@ -820,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	 *	Get and verify the address.  	 */  	if (msg->msg_name) { -		struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; +		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);  		if (msg->msg_namelen < sizeof(*usin))  			return -EINVAL;  		if (usin->sin_family != AF_INET) { @@ -845,33 +965,44 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  	ipc.addr = inet->inet_saddr;  	ipc.oif = sk->sk_bound_dev_if; -	err = sock_tx_timestamp(sk, &ipc.tx_flags); -	if (err) -		return err; + +	sock_tx_timestamp(sk, &ipc.tx_flags); +  	if (msg->msg_controllen) { -		err = ip_cmsg_send(sock_net(sk), msg, &ipc); +		err = ip_cmsg_send(sock_net(sk), msg, &ipc, +				   sk->sk_family == AF_INET6);  		if (err)  			return err;  		if (ipc.opt)  			free = 1;  		connected = 0;  	} -	if (!ipc.opt) -		ipc.opt = inet->opt; +	if (!ipc.opt) { +		struct ip_options_rcu *inet_opt; + +		rcu_read_lock(); +		inet_opt = rcu_dereference(inet->inet_opt); +		if (inet_opt) { +			memcpy(&opt_copy, inet_opt, +			       sizeof(*inet_opt) + inet_opt->opt.optlen); +			ipc.opt = &opt_copy.opt; +		} +		rcu_read_unlock(); +	}  	saddr = ipc.addr;  	ipc.addr = faddr = daddr; -	if (ipc.opt && ipc.opt->srr) { +	if (ipc.opt && ipc.opt->opt.srr) {  		if (!daddr)  			return -EINVAL; -		faddr = ipc.opt->faddr; +		faddr = ipc.opt->opt.faddr;  		connected = 0;  	} -	tos = RT_TOS(inet->tos); +	tos = get_rttos(&ipc, inet);  	if (sock_flag(sk, SOCK_LOCALROUTE) ||  	    (msg->msg_flags & MSG_DONTROUTE) || -	    (ipc.opt && ipc.opt->is_strictroute)) { +	    (ipc.opt && ipc.opt->opt.is_strictroute)) {  		tos |= RTO_ONLINK;  		connected = 0;  	} @@ -882,28 +1013,28 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		if (!saddr)  			saddr = inet->mc_addr;  		connected = 0; -	} +	} else if (!ipc.oif) +		ipc.oif = inet->uc_index;  	if (connected)  		rt = (struct rtable *)sk_dst_check(sk, 0);  	if (rt == NULL) { -		struct flowi fl = { .oif = ipc.oif, -				    .mark = sk->sk_mark, -				    .fl4_dst = faddr, -				    .fl4_src = saddr, -				    .fl4_tos = tos, 
-				    .proto = sk->sk_protocol, -				    .flags = inet_sk_flowi_flags(sk), -				    .fl_ip_sport = inet->inet_sport, -				    .fl_ip_dport = dport };  		struct net *net = sock_net(sk); -		security_sk_classify_flow(sk, &fl); -		err = ip_route_output_flow(net, &rt, &fl, sk, 1); -		if (err) { +		fl4 = &fl4_stack; +		flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, +				   RT_SCOPE_UNIVERSE, sk->sk_protocol, +				   inet_sk_flowi_flags(sk), +				   faddr, saddr, dport, inet->inet_sport); + +		security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); +		rt = ip_route_output_flow(net, fl4, sk); +		if (IS_ERR(rt)) { +			err = PTR_ERR(rt); +			rt = NULL;  			if (err == -ENETUNREACH) -				IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); +				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);  			goto out;  		} @@ -919,9 +1050,20 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		goto do_confirm;  back_from_confirm: -	saddr = rt->rt_src; +	saddr = fl4->saddr;  	if (!ipc.addr) -		daddr = ipc.addr = rt->rt_dst; +		daddr = ipc.addr = fl4->daddr; + +	/* Lockless fast path for the non-corking case. */ +	if (!corkreq) { +		skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, +				  sizeof(struct udphdr), &ipc, &rt, +				  msg->msg_flags); +		err = PTR_ERR(skb); +		if (!IS_ERR_OR_NULL(skb)) +			err = udp_send_skb(skb, fl4); +		goto out; +	}  	lock_sock(sk);  	if (unlikely(up->pending)) { @@ -929,25 +1071,25 @@ back_from_confirm:  		/* ... which is an evident application bug. --ANK */  		release_sock(sk); -		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); +		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n"));  		err = -EINVAL;  		goto out;  	}  	/*  	 *	Now cork the socket to pend data.  	 */ -	inet->cork.fl.fl4_dst = daddr; -	inet->cork.fl.fl_ip_dport = dport; -	inet->cork.fl.fl4_src = saddr; -	inet->cork.fl.fl_ip_sport = inet->inet_sport; +	fl4 = &inet->cork.fl.u.ip4; +	fl4->daddr = daddr; +	fl4->saddr = saddr; +	fl4->fl4_dport = dport; +	fl4->fl4_sport = inet->inet_sport;  	up->pending = AF_INET;  do_append_data:  	up->len += ulen; -	getfrag  =  is_udplite ?  udplite_getfrag : ip_generic_getfrag; -	err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, -			sizeof(struct udphdr), &ipc, &rt, -			corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); +	err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, +			     sizeof(struct udphdr), &ipc, &rt, +			     corkreq ? 
msg->msg_flags|MSG_MORE : msg->msg_flags);  	if (err)  		udp_flush_pending_frames(sk);  	else if (!corkreq) @@ -987,9 +1129,13 @@ EXPORT_SYMBOL(udp_sendmsg);  int udp_sendpage(struct sock *sk, struct page *page, int offset,  		 size_t size, int flags)  { +	struct inet_sock *inet = inet_sk(sk);  	struct udp_sock *up = udp_sk(sk);  	int ret; +	if (flags & MSG_SENDPAGE_NOTLAST) +		flags |= MSG_MORE; +  	if (!up->pending) {  		struct msghdr msg = {	.msg_flags = flags|MSG_MORE }; @@ -1007,11 +1153,12 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,  	if (unlikely(!up->pending)) {  		release_sock(sk); -		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); +		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n"));  		return -EINVAL;  	} -	ret = ip_append_page(sk, page, offset, size, flags); +	ret = ip_append_page(sk, &inet->cork.fl.u.ip4, +			     page, offset, size, flags);  	if (ret == -EOPNOTSUPP) {  		release_sock(sk);  		return sock_no_sendpage(sk->sk_socket, page, offset, @@ -1051,6 +1198,8 @@ static unsigned int first_packet_length(struct sock *sk)  	spin_lock_bh(&rcvq->lock);  	while ((skb = skb_peek(rcvq)) != NULL &&  		udp_lib_checksum_complete(skb)) { +		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, +				 IS_UDPLITE(sk));  		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,  				 IS_UDPLITE(sk));  		atomic_inc(&sk->sk_drops); @@ -1116,33 +1265,28 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,  		size_t len, int noblock, int flags, int *addr_len)  {  	struct inet_sock *inet = inet_sk(sk); -	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; +	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);  	struct sk_buff *skb; -	unsigned int ulen; -	int peeked; +	unsigned int ulen, copied; +	int peeked, off = 0;  	int err;  	int is_udplite = IS_UDPLITE(sk);  	bool slow; -	/* -	 *	Check any passed addresses -	 */ -	if (addr_len) -		*addr_len = sizeof(*sin); -  	if (flags & MSG_ERRQUEUE) -		return ip_recv_error(sk, msg, len); +		return ip_recv_error(sk, msg, len, addr_len);  try_again:  	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), -				  &peeked, &err); +				  &peeked, &off, &err);  	if (!skb)  		goto out;  	ulen = skb->len - sizeof(struct udphdr); -	if (len > ulen) -		len = ulen; -	else if (len < ulen) +	copied = len; +	if (copied > ulen) +		copied = ulen; +	else if (copied < ulen)  		msg->msg_flags |= MSG_TRUNC;  	/* @@ -1151,14 +1295,14 @@ try_again:  	 * coverage checksum (UDP-Lite), do it before the copy.  	 
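
The copied/ulen split above is what a receiver sees as MSG_TRUNC: when the buffer is smaller than the datagram only `copied` bytes are delivered and MSG_TRUNC is set in msg_flags, and passing MSG_TRUNC in the recvmsg() flags makes the call return the real datagram length rather than the copied length. A short userspace illustration (fd is assumed to be an already-bound AF_INET/SOCK_DGRAM socket):

#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void read_one(int fd)
{
	char buf[32];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	/* MSG_TRUNC in flags: return the full datagram size, not the copy size */
	n = recvmsg(fd, &msg, MSG_TRUNC);
	if (n < 0) {
		perror("recvmsg");
		return;
	}
	if (msg.msg_flags & MSG_TRUNC)
		printf("datagram was %zd bytes, only %zu copied\n",
		       n, sizeof(buf));
	else
		printf("got %zd bytes\n", n);
}
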
*/ -	if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { +	if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {  		if (udp_lib_checksum_complete(skb))  			goto csum_copy_err;  	}  	if (skb_csum_unnecessary(skb))  		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), -					      msg->msg_iov, len); +					      msg->msg_iov, copied);  	else {  		err = skb_copy_and_csum_datagram_iovec(skb,  						       sizeof(struct udphdr), @@ -1168,8 +1312,15 @@ try_again:  			goto csum_copy_err;  	} -	if (err) +	if (unlikely(err)) { +		trace_kfree_skb(skb, udp_recvmsg); +		if (!peeked) { +			atomic_inc(&sk->sk_drops); +			UDP_INC_STATS_USER(sock_net(sk), +					   UDP_MIB_INERRORS, is_udplite); +		}  		goto out_free; +	}  	if (!peeked)  		UDP_INC_STATS_USER(sock_net(sk), @@ -1183,11 +1334,12 @@ try_again:  		sin->sin_port = udp_hdr(skb)->source;  		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;  		memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +		*addr_len = sizeof(*sin);  	}  	if (inet->cmsg_flags)  		ip_cmsg_recv(msg, skb); -	err = len; +	err = copied;  	if (flags & MSG_TRUNC)  		err = ulen; @@ -1198,12 +1350,17 @@ out:  csum_copy_err:  	slow = lock_sock_fast(sk); -	if (!skb_kill_datagram(sk, skb, flags)) +	if (!skb_kill_datagram(sk, skb, flags)) { +		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);  		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); +	}  	unlock_sock_fast(sk, slow);  	if (noblock)  		return -EAGAIN; + +	/* starting over for a new packet */ +	msg->msg_flags &= ~MSG_TRUNC;  	goto try_again;  } @@ -1218,7 +1375,7 @@ int udp_disconnect(struct sock *sk, int flags)  	sk->sk_state = TCP_CLOSE;  	inet->inet_daddr = 0;  	inet->inet_dport = 0; -	sock_rps_save_rxhash(sk, 0); +	sock_rps_reset_rxhash(sk);  	sk->sk_bound_dev_if = 0;  	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))  		inet_reset_saddr(sk); @@ -1305,10 +1462,12 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  {  	int rc; -	if (inet_sk(sk)->inet_daddr) -		sock_rps_save_rxhash(sk, skb->rxhash); +	if (inet_sk(sk)->inet_daddr) { +		sock_rps_save_rxhash(sk, skb); +		sk_mark_napi_id(sk, skb); +	} -	rc = ip_queue_rcv_skb(sk, skb); +	rc = sock_queue_rcv_skb(sk, skb);  	if (rc < 0) {  		int is_udplite = IS_UDPLITE(sk); @@ -1318,6 +1477,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  					 is_udplite);  		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);  		kfree_skb(skb); +		trace_udp_fail_queue_rcv_skb(rc, sk);  		return -1;  	} @@ -1325,6 +1485,14 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  } +static struct static_key udp_encap_needed __read_mostly; +void udp_encap_enable(void) +{ +	if (!static_key_enabled(&udp_encap_needed)) +		static_key_slow_inc(&udp_encap_needed); +} +EXPORT_SYMBOL(udp_encap_enable); +  /* returns:   *  -1: error   *   0: success @@ -1346,7 +1514,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  		goto drop;  	nf_reset(skb); -	if (up->encap_type) { +	if (static_key_false(&udp_encap_needed) && up->encap_type) { +		int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); +  		/*  		 * This is an encapsulation socket so pass the skb to  		 * the socket's udp_encap_rcv() hook. 
Otherwise, just @@ -1359,11 +1529,15 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  		 */  		/* if we're overly short, let UDP handle it */ -		if (skb->len > sizeof(struct udphdr) && -		    up->encap_rcv != NULL) { +		encap_rcv = ACCESS_ONCE(up->encap_rcv); +		if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {  			int ret; -			ret = (*up->encap_rcv)(sk, skb); +			/* Verify checksum before giving to encap */ +			if (udp_lib_checksum_complete(skb)) +				goto csum_error; + +			ret = encap_rcv(sk, skb);  			if (ret <= 0) {  				UDP_INC_STATS_BH(sock_net(sk),  						 UDP_MIB_INDATAGRAMS, @@ -1392,9 +1566,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  		 * provided by the application."  		 */  		if (up->pcrlen == 0) {          /* full coverage was set  */ -			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage " -				"%d while full coverage %d requested\n", -				UDP_SKB_CB(skb)->cscov, skb->len); +			LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", +				       UDP_SKB_CB(skb)->cscov, skb->len);  			goto drop;  		}  		/* The next case involves violating the min. coverage requested @@ -1404,28 +1577,30 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  		 * Therefore the above ...()->partial_cov statement is essential.  		 */  		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) { -			LIMIT_NETDEBUG(KERN_WARNING -				"UDPLITE: coverage %d too small, need min %d\n", -				UDP_SKB_CB(skb)->cscov, up->pcrlen); +			LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", +				       UDP_SKB_CB(skb)->cscov, up->pcrlen);  			goto drop;  		}  	} -	if (rcu_dereference_raw(sk->sk_filter)) { -		if (udp_lib_checksum_complete(skb)) -			goto drop; -	} +	if (rcu_access_pointer(sk->sk_filter) && +	    udp_lib_checksum_complete(skb)) +		goto csum_error; -	if (sk_rcvqueues_full(sk, skb)) +	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { +		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, +				 is_udplite);  		goto drop; +	}  	rc = 0; +	ipv4_pktinfo_prepare(sk, skb);  	bh_lock_sock(sk);  	if (!sock_owned_by_user(sk))  		rc = __udp_queue_rcv_skb(sk, skb); -	else if (sk_add_backlog(sk, skb)) { +	else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {  		bh_unlock_sock(sk);  		goto drop;  	} @@ -1433,6 +1608,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  	return rc; +csum_error: +	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);  drop:  	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);  	atomic_inc(&sk->sk_drops); @@ -1468,6 +1645,18 @@ static void flush_stack(struct sock **stack, unsigned int count,  		kfree_skb(skb1);  } +/* For TCP sockets, sk_rx_dst is protected by socket lock + * For UDP, we use xchg() to guard against concurrent changes. + */ +static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ +	struct dst_entry *old; + +	dst_hold(dst); +	old = xchg(&sk->sk_rx_dst, dst); +	dst_release(old); +} +  /*   *	Multicasts and broadcasts go to each listener.   
* @@ -1528,7 +1717,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,  static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,  				 int proto)  { -	const struct iphdr *iph;  	int err;  	UDP_SKB_CB(skb)->partial_cov = 0; @@ -1540,22 +1728,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,  			return err;  	} -	iph = ip_hdr(skb); -	if (uh->check == 0) { -		skb->ip_summed = CHECKSUM_UNNECESSARY; -	} else if (skb->ip_summed == CHECKSUM_COMPLETE) { -		if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, -				      proto, skb->csum)) -			skb->ip_summed = CHECKSUM_UNNECESSARY; -	} -	if (!skb_csum_unnecessary(skb)) -		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, -					       skb->len, proto, 0); -	/* Probably, we should checksum udp header (it should be in cache -	 * in any case) and data in tiny packets (< rx copybreak). -	 */ - -	return 0; +	return skb_checksum_init_zero_check(skb, proto, uh->check, +					    inet_compute_pseudo);  }  /* @@ -1596,14 +1770,34 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,  	if (udp4_csum_init(skb, uh, proto))  		goto csum_error; -	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) -		return __udp4_lib_mcast_deliver(net, skb, uh, -				saddr, daddr, udptable); +	sk = skb_steal_sock(skb); +	if (sk) { +		struct dst_entry *dst = skb_dst(skb); +		int ret; -	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); +		if (unlikely(sk->sk_rx_dst != dst)) +			udp_sk_rx_dst_set(sk, dst); + +		ret = udp_queue_rcv_skb(sk, skb); +		sock_put(sk); +		/* a return value > 0 means to resubmit the input, but +		 * it wants the return to be -protocol, or 0 +		 */ +		if (ret > 0) +			return -ret; +		return 0; +	} else { +		if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) +			return __udp4_lib_mcast_deliver(net, skb, uh, +					saddr, daddr, udptable); + +		sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); +	}  	if (sk != NULL) { -		int ret = udp_queue_rcv_skb(sk, skb); +		int ret; + +		ret = udp_queue_rcv_skb(sk, skb);  		sock_put(sk);  		/* a return value > 0 means to resubmit the input, but @@ -1634,13 +1828,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,  short_packet:  	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", -		       proto == IPPROTO_UDPLITE ? "-Lite" : "", -		       &saddr, -		       ntohs(uh->source), -		       ulen, -		       skb->len, -		       &daddr, -		       ntohs(uh->dest)); +		       proto == IPPROTO_UDPLITE ? "Lite" : "", +		       &saddr, ntohs(uh->source), +		       ulen, skb->len, +		       &daddr, ntohs(uh->dest));  	goto drop;  csum_error: @@ -1649,18 +1840,152 @@ csum_error:  	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).  	 */  	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", -		       proto == IPPROTO_UDPLITE ? "-Lite" : "", -		       &saddr, -		       ntohs(uh->source), -		       &daddr, -		       ntohs(uh->dest), +		       proto == IPPROTO_UDPLITE ? "Lite" : "", +		       &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),  		       ulen); +	UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);  drop:  	UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);  	kfree_skb(skb);  	return 0;  } +/* We can only early demux multicast if there is a single matching socket. 
+ * If more than one socket found returns NULL + */ +static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, +						  __be16 loc_port, __be32 loc_addr, +						  __be16 rmt_port, __be32 rmt_addr, +						  int dif) +{ +	struct sock *sk, *result; +	struct hlist_nulls_node *node; +	unsigned short hnum = ntohs(loc_port); +	unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); +	struct udp_hslot *hslot = &udp_table.hash[slot]; + +	/* Do not bother scanning a too big list */ +	if (hslot->count > 10) +		return NULL; + +	rcu_read_lock(); +begin: +	count = 0; +	result = NULL; +	sk_nulls_for_each_rcu(sk, node, &hslot->head) { +		if (__udp_is_mcast_sock(net, sk, +					loc_port, loc_addr, +					rmt_port, rmt_addr, +					dif, hnum)) { +			result = sk; +			++count; +		} +	} +	/* +	 * if the nulls value we got at the end of this lookup is +	 * not the expected one, we must restart lookup. +	 * We probably met an item that was moved to another chain. +	 */ +	if (get_nulls_value(node) != slot) +		goto begin; + +	if (result) { +		if (count != 1 || +		    unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) +			result = NULL; +		else if (unlikely(!__udp_is_mcast_sock(net, result, +						       loc_port, loc_addr, +						       rmt_port, rmt_addr, +						       dif, hnum))) { +			sock_put(result); +			result = NULL; +		} +	} +	rcu_read_unlock(); +	return result; +} + +/* For unicast we should only early demux connected sockets or we can + * break forwarding setups.  The chains here can be long so only check + * if the first socket is an exact match and if not move on. + */ +static struct sock *__udp4_lib_demux_lookup(struct net *net, +					    __be16 loc_port, __be32 loc_addr, +					    __be16 rmt_port, __be32 rmt_addr, +					    int dif) +{ +	struct sock *sk, *result; +	struct hlist_nulls_node *node; +	unsigned short hnum = ntohs(loc_port); +	unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); +	unsigned int slot2 = hash2 & udp_table.mask; +	struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; +	INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); +	const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + +	rcu_read_lock(); +	result = NULL; +	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { +		if (INET_MATCH(sk, net, acookie, +			       rmt_addr, loc_addr, ports, dif)) +			result = sk; +		/* Only check first socket in chain */ +		break; +	} + +	if (result) { +		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) +			result = NULL; +		else if (unlikely(!INET_MATCH(sk, net, acookie, +					      rmt_addr, loc_addr, +					      ports, dif))) { +			sock_put(result); +			result = NULL; +		} +	} +	rcu_read_unlock(); +	return result; +} + +void udp_v4_early_demux(struct sk_buff *skb) +{ +	struct net *net = dev_net(skb->dev); +	const struct iphdr *iph; +	const struct udphdr *uh; +	struct sock *sk; +	struct dst_entry *dst; +	int dif = skb->dev->ifindex; + +	/* validate the packet */ +	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) +		return; + +	iph = ip_hdr(skb); +	uh = udp_hdr(skb); + +	if (skb->pkt_type == PACKET_BROADCAST || +	    skb->pkt_type == PACKET_MULTICAST) +		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, +						   uh->source, iph->saddr, dif); +	else if (skb->pkt_type == PACKET_HOST) +		sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, +					     uh->source, iph->saddr, dif); +	else +		return; + +	if (!sk) +		return; + +	skb->sk = sk; +	skb->destructor = sock_edemux; +	dst = sk->sk_rx_dst; + +	
if (dst) +		dst = dst_check(dst, 0); +	if (dst) +		skb_dst_set_noref(skb, dst); +} +  int udp_rcv(struct sk_buff *skb)  {  	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); @@ -1668,9 +1993,16 @@ int udp_rcv(struct sk_buff *skb)  void udp_destroy_sock(struct sock *sk)  { +	struct udp_sock *up = udp_sk(sk);  	bool slow = lock_sock_fast(sk);  	udp_flush_pending_frames(sk);  	unlock_sock_fast(sk, slow); +	if (static_key_false(&udp_encap_needed) && up->encap_type) { +		void (*encap_destroy)(struct sock *sk); +		encap_destroy = ACCESS_ONCE(up->encap_destroy); +		if (encap_destroy) +			encap_destroy(sk); +	}  }  /* @@ -1681,7 +2013,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  		       int (*push_pending_frames)(struct sock *))  {  	struct udp_sock *up = udp_sk(sk); -	int val; +	int val, valbool;  	int err = 0;  	int is_udplite = IS_UDPLITE(sk); @@ -1691,6 +2023,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  	if (get_user(val, (int __user *)optval))  		return -EFAULT; +	valbool = val ? 1 : 0; +  	switch (optname) {  	case UDP_CORK:  		if (val != 0) { @@ -1712,6 +2046,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  			/* FALLTHROUGH */  		case UDP_ENCAP_L2TPINUDP:  			up->encap_type = val; +			udp_encap_enable();  			break;  		default:  			err = -ENOPROTOOPT; @@ -1719,6 +2054,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  		}  		break; +	case UDP_NO_CHECK6_TX: +		up->no_check6_tx = valbool; +		break; + +	case UDP_NO_CHECK6_RX: +		up->no_check6_rx = valbool; +		break; +  	/*  	 * 	UDP-Lite's partial checksum coverage (RFC 3828).  	 */ @@ -1801,6 +2144,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,  		val = up->encap_type;  		break; +	case UDP_NO_CHECK6_TX: +		val = up->no_check6_tx; +		break; + +	case UDP_NO_CHECK6_RX: +		val = up->no_check6_rx; +		break; +  	/* The following two cannot be changed on UDP sockets, the return is  	 * always 0 (which corresponds to the full checksum coverage of UDP). 
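
All of the options handled in udp_lib_setsockopt()/udp_lib_getsockopt() above are ordinary setsockopt() knobs: UDP_CORK and UDP_ENCAP at the UDP level, the new UDP_NO_CHECK6_TX/UDP_NO_CHECK6_RX likewise, and the coverage options at the UDP-Lite level. A hedged userspace sketch, assuming the constants from <linux/udp.h> and a kernel new enough to accept the NO_CHECK6 pair (which governs zero UDP checksums for IPv6 traffic only):

#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/udp.h>	/* UDP_CORK, UDP_NO_CHECK6_TX, UDPLITE_SEND_CSCOV */

static int tune_udp_sockets(int udp_fd, int udplite_fd)
{
	int on = 1, cscov = 20;

	/* Cork: queue successive sends into one datagram until uncorked. */
	if (setsockopt(udp_fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on)) < 0)
		return -1;

	/* Permit zero UDP checksums on IPv6 transmit (tunnel use); older
	 * kernels reject this option, so the error is not treated as fatal. */
	setsockopt(udp_fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &on, sizeof(on));

	/* UDP-Lite: checksum only the first 20 bytes of each datagram. */
	if (setsockopt(udplite_fd, IPPROTO_UDPLITE, UDPLITE_SEND_CSCOV,
		       &cscov, sizeof(cscov)) < 0)
		return -1;

	return 0;
}
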
*/  	case UDPLITE_SEND_CSCOV: @@ -1858,6 +2209,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)  	unsigned int mask = datagram_poll(file, sock, wait);  	struct sock *sk = sock->sk; +	sock_rps_record_flow(sk); +  	/* Check for false positives due to checksum errors */  	if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&  	    !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk)) @@ -1882,6 +2235,7 @@ struct proto udp_prot = {  	.recvmsg	   = udp_recvmsg,  	.sendpage	   = udp_sendpage,  	.backlog_rcv	   = __udp_queue_rcv_skb, +	.release_cb	   = ip4_datagram_release_cb,  	.hash		   = udp_lib_hash,  	.unhash		   = udp_lib_unhash,  	.rehash		   = udp_v4_rehash, @@ -1897,6 +2251,7 @@ struct proto udp_prot = {  	.compat_setsockopt = compat_udp_setsockopt,  	.compat_getsockopt = compat_udp_getsockopt,  #endif +	.clear_sk	   = sk_prot_clear_portaddr_nulls,  };  EXPORT_SYMBOL(udp_prot); @@ -1987,9 +2342,9 @@ static void udp_seq_stop(struct seq_file *seq, void *v)  		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);  } -static int udp_seq_open(struct inode *inode, struct file *file) +int udp_seq_open(struct inode *inode, struct file *file)  { -	struct udp_seq_afinfo *afinfo = PDE(inode)->data; +	struct udp_seq_afinfo *afinfo = PDE_DATA(inode);  	struct udp_iter_state *s;  	int err; @@ -2003,6 +2358,7 @@ static int udp_seq_open(struct inode *inode, struct file *file)  	s->udp_table		= afinfo->udp_table;  	return err;  } +EXPORT_SYMBOL(udp_seq_open);  /* ------------------------------------------------------------------------ */  int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) @@ -2010,17 +2366,12 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)  	struct proc_dir_entry *p;  	int rc = 0; -	afinfo->seq_fops.open		= udp_seq_open; -	afinfo->seq_fops.read		= seq_read; -	afinfo->seq_fops.llseek		= seq_lseek; -	afinfo->seq_fops.release	= seq_release_net; -  	afinfo->seq_ops.start		= udp_seq_start;  	afinfo->seq_ops.next		= udp_seq_next;  	afinfo->seq_ops.stop		= udp_seq_stop;  	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, -			     &afinfo->seq_fops, afinfo); +			     afinfo->seq_fops, afinfo);  	if (!p)  		rc = -ENOMEM;  	return rc; @@ -2029,13 +2380,13 @@ EXPORT_SYMBOL(udp_proc_register);  void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)  { -	proc_net_remove(net, afinfo->name); +	remove_proc_entry(afinfo->name, net->proc_net);  }  EXPORT_SYMBOL(udp_proc_unregister);  /* ------------------------------------------------------------------------ */  static void udp4_format_sock(struct sock *sp, struct seq_file *f, -		int bucket, int *len) +		int bucket)  {  	struct inet_sock *inet = inet_sk(sp);  	__be32 dest = inet->inet_daddr; @@ -2044,40 +2395,47 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,  	__u16 srcp	  = ntohs(inet->inet_sport);  	seq_printf(f, "%5d: %08X:%04X %08X:%04X" -		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", +		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",  		bucket, src, srcp, dest, destp, sp->sk_state,  		sk_wmem_alloc_get(sp),  		sk_rmem_alloc_get(sp), -		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), +		0, 0L, 0, +		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), +		0, sock_i_ino(sp),  		atomic_read(&sp->sk_refcnt), sp, -		atomic_read(&sp->sk_drops), len); +		atomic_read(&sp->sk_drops));  }  int udp4_seq_show(struct seq_file *seq, void *v)  { +	seq_setwidth(seq, 127);  	if (v == SEQ_START_TOKEN) -		
seq_printf(seq, "%-127s\n", -			   "  sl  local_address rem_address   st tx_queue " +		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "  			   "rx_queue tr tm->when retrnsmt   uid  timeout "  			   "inode ref pointer drops");  	else {  		struct udp_iter_state *state = seq->private; -		int len; -		udp4_format_sock(v, seq, state->bucket, &len); -		seq_printf(seq, "%*s\n", 127 - len, ""); +		udp4_format_sock(v, seq, state->bucket);  	} +	seq_pad(seq, '\n');  	return 0;  } +static const struct file_operations udp_afinfo_seq_fops = { +	.owner    = THIS_MODULE, +	.open     = udp_seq_open, +	.read     = seq_read, +	.llseek   = seq_lseek, +	.release  = seq_release_net +}; +  /* ------------------------------------------------------------------------ */  static struct udp_seq_afinfo udp4_seq_afinfo = {  	.name		= "udp",  	.family		= AF_INET,  	.udp_table	= &udp_table, -	.seq_fops	= { -		.owner	=	THIS_MODULE, -	}, +	.seq_fops	= &udp_afinfo_seq_fops,  	.seq_ops	= {  		.show		= udp4_seq_show,  	}, @@ -2112,9 +2470,15 @@ void udp4_proc_exit(void)  static __initdata unsigned long uhash_entries;  static int __init set_uhash_entries(char *str)  { +	ssize_t ret; +  	if (!str)  		return 0; -	uhash_entries = simple_strtoul(str, &str, 0); + +	ret = kstrtoul(str, 0, &uhash_entries); +	if (ret) +		return 0; +  	if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)  		uhash_entries = UDP_HTABLE_SIZE_MIN;  	return 1; @@ -2125,26 +2489,16 @@ void __init udp_table_init(struct udp_table *table, const char *name)  {  	unsigned int i; -	if (!CONFIG_BASE_SMALL) -		table->hash = alloc_large_system_hash(name, -			2 * sizeof(struct udp_hslot), -			uhash_entries, -			21, /* one slot per 2 MB */ -			0, -			&table->log, -			&table->mask, -			64 * 1024); -	/* -	 * Make sure hash table has the minimum size -	 */ -	if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { -		table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * -				      2 * sizeof(struct udp_hslot), GFP_KERNEL); -		if (!table->hash) -			panic(name); -		table->log = ilog2(UDP_HTABLE_SIZE_MIN); -		table->mask = UDP_HTABLE_SIZE_MIN - 1; -	} +	table->hash = alloc_large_system_hash(name, +					      2 * sizeof(struct udp_hslot), +					      uhash_entries, +					      21, /* one slot per 2 MB */ +					      0, +					      &table->log, +					      &table->mask, +					      UDP_HTABLE_SIZE_MIN, +					      64 * 1024); +  	table->hash2 = table->hash + (table->mask + 1);  	for (i = 0; i <= table->mask; i++) {  		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); @@ -2160,16 +2514,10 @@ void __init udp_table_init(struct udp_table *table, const char *name)  void __init udp_init(void)  { -	unsigned long nr_pages, limit; +	unsigned long limit;  	udp_table_init(&udp_table, "UDP"); -	/* Set the pressure threshold up by the same strategy of TCP. It is a -	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing -	 * toward zero with the amount of memory, with a floor of 128 pages. 
-	 */ -	nr_pages = totalram_pages - totalhigh_pages; -	limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); -	limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); +	limit = nr_free_buffer_pages() / 8;  	limit = max(limit, 128UL);  	sysctl_udp_mem[0] = limit / 4 * 3;  	sysctl_udp_mem[1] = limit; @@ -2179,64 +2527,78 @@ void __init udp_init(void)  	sysctl_udp_wmem_min = SK_MEM_QUANTUM;  } -int udp4_ufo_send_check(struct sk_buff *skb) +struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, +				       netdev_features_t features)  { -	const struct iphdr *iph; -	struct udphdr *uh; - -	if (!pskb_may_pull(skb, sizeof(*uh))) -		return -EINVAL; +	struct sk_buff *segs = ERR_PTR(-EINVAL); +	u16 mac_offset = skb->mac_header; +	int mac_len = skb->mac_len; +	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); +	__be16 protocol = skb->protocol; +	netdev_features_t enc_features; +	int udp_offset, outer_hlen; +	unsigned int oldlen; +	bool need_csum; + +	oldlen = (u16)~skb->len; + +	if (unlikely(!pskb_may_pull(skb, tnl_hlen))) +		goto out; -	iph = ip_hdr(skb); -	uh = udp_hdr(skb); +	skb->encapsulation = 0; +	__skb_pull(skb, tnl_hlen); +	skb_reset_mac_header(skb); +	skb_set_network_header(skb, skb_inner_network_offset(skb)); +	skb->mac_len = skb_inner_network_offset(skb); +	skb->protocol = htons(ETH_P_TEB); + +	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); +	if (need_csum) +		skb->encap_hdr_csum = 1; + +	/* segment inner packet. */ +	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); +	segs = skb_mac_gso_segment(skb, enc_features); +	if (!segs || IS_ERR(segs)) { +		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, +				     mac_len); +		goto out; +	} -	uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, -				       IPPROTO_UDP, 0); -	skb->csum_start = skb_transport_header(skb) - skb->head; -	skb->csum_offset = offsetof(struct udphdr, check); -	skb->ip_summed = CHECKSUM_PARTIAL; -	return 0; -} +	outer_hlen = skb_tnl_header_len(skb); +	udp_offset = outer_hlen - tnl_hlen; +	skb = segs; +	do { +		struct udphdr *uh; +		int len; -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) -{ -	struct sk_buff *segs = ERR_PTR(-EINVAL); -	unsigned int mss; -	int offset; -	__wsum csum; +		skb_reset_inner_headers(skb); +		skb->encapsulation = 1; -	mss = skb_shinfo(skb)->gso_size; -	if (unlikely(skb->len <= mss)) -		goto out; +		skb->mac_len = mac_len; -	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { -		/* Packet is from an untrusted source, reset gso_segs. */ -		int type = skb_shinfo(skb)->gso_type; +		skb_push(skb, outer_hlen); +		skb_reset_mac_header(skb); +		skb_set_network_header(skb, mac_len); +		skb_set_transport_header(skb, udp_offset); +		len = skb->len - udp_offset; +		uh = udp_hdr(skb); +		uh->len = htons(len); -		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || -			     !(type & (SKB_GSO_UDP)))) -			goto out; +		if (need_csum) { +			__be32 delta = htonl(oldlen + len); -		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); +			uh->check = ~csum_fold((__force __wsum) +					       ((__force u32)uh->check + +						(__force u32)delta)); +			uh->check = gso_make_checksum(skb, ~uh->check); -		segs = NULL; -		goto out; -	} +			if (uh->check == 0) +				uh->check = CSUM_MANGLED_0; +		} -	/* Do software UFO. Complete and fill in the UDP checksum as HW cannot -	 * do checksum of UDP packets sent as multiple IP fragments. 
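
Note how the need_csum branch of skb_udp_tunnel_segment() above never re-sums the payload: only the length has changed, so the existing checksum is adjusted with ones'-complement arithmetic (add the complement of the old length and the new length), which is the incremental-update rule of RFC 1624, HC' = ~(~HC + ~m + m'). A standalone sketch of that rule for one 16-bit field; the helper names are illustrative, and old/new values must be taken consistently as they appear on the wire (e.g. both via ntohs()):

#include <stdint.h>

/* Fold a 32-bit ones'-complement accumulator down to 16 bits. */
static uint16_t csum16_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* RFC 1624: new checksum after one 16-bit field changes from old_val
 * to new_val, without touching the rest of the data. */
static uint16_t csum16_update(uint16_t check, uint16_t old_val, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_val;
	sum += new_val;
	return (uint16_t)~csum16_fold(sum);
}

For example, patching a length word from 1480 to 532 in an already-checksummed header is check = csum16_update(check, 1480, 532), provided both values are expressed the same way the checksummed words were.
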
-	 */ -	offset = skb->csum_start - skb_headroom(skb); -	csum = skb_checksum(skb, offset, skb->len - offset, 0); -	offset += skb->csum_offset; -	*(__sum16 *)(skb->data + offset) = csum_fold(csum); -	skb->ip_summed = CHECKSUM_NONE; - -	/* Fragment the skb. IP headers of the fragments are updated in -	 * inet_gso_segment() -	 */ -	segs = skb_segment(skb, features); +		skb->protocol = protocol; +	} while ((skb = skb->next));  out:  	return segs;  } - diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c new file mode 100644 index 00000000000..7927db0a927 --- /dev/null +++ b/net/ipv4/udp_diag.c @@ -0,0 +1,216 @@ +/* + * udp_diag.c	Module for monitoring UDP transport protocols sockets. + * + * Authors:	Pavel Emelyanov, <xemul@parallels.com> + * + *	This program is free software; you can redistribute it and/or + *      modify it under the terms of the GNU General Public License + *      as published by the Free Software Foundation; either version + *      2 of the License, or (at your option) any later version. + */ + + +#include <linux/module.h> +#include <linux/inet_diag.h> +#include <linux/udp.h> +#include <net/udp.h> +#include <net/udplite.h> +#include <linux/sock_diag.h> + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, +		struct netlink_callback *cb, struct inet_diag_req_v2 *req, +		struct nlattr *bc) +{ +	if (!inet_diag_bc_sk(bc, sk)) +		return 0; + +	return inet_sk_diag_fill(sk, NULL, skb, req, +			sk_user_ns(NETLINK_CB(cb->skb).sk), +			NETLINK_CB(cb->skb).portid, +			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); +} + +static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, +		const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) +{ +	int err = -EINVAL; +	struct sock *sk; +	struct sk_buff *rep; +	struct net *net = sock_net(in_skb->sk); + +	if (req->sdiag_family == AF_INET) +		sk = __udp4_lib_lookup(net, +				req->id.idiag_src[0], req->id.idiag_sport, +				req->id.idiag_dst[0], req->id.idiag_dport, +				req->id.idiag_if, tbl); +#if IS_ENABLED(CONFIG_IPV6) +	else if (req->sdiag_family == AF_INET6) +		sk = __udp6_lib_lookup(net, +				(struct in6_addr *)req->id.idiag_src, +				req->id.idiag_sport, +				(struct in6_addr *)req->id.idiag_dst, +				req->id.idiag_dport, +				req->id.idiag_if, tbl); +#endif +	else +		goto out_nosk; + +	err = -ENOENT; +	if (sk == NULL) +		goto out_nosk; + +	err = sock_diag_check_cookie(sk, req->id.idiag_cookie); +	if (err) +		goto out; + +	err = -ENOMEM; +	rep = nlmsg_new(sizeof(struct inet_diag_msg) + +			sizeof(struct inet_diag_meminfo) + 64, +			GFP_KERNEL); +	if (!rep) +		goto out; + +	err = inet_sk_diag_fill(sk, NULL, rep, req, +			   sk_user_ns(NETLINK_CB(in_skb).sk), +			   NETLINK_CB(in_skb).portid, +			   nlh->nlmsg_seq, 0, nlh); +	if (err < 0) { +		WARN_ON(err == -EMSGSIZE); +		kfree_skb(rep); +		goto out; +	} +	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, +			      MSG_DONTWAIT); +	if (err > 0) +		err = 0; +out: +	if (sk) +		sock_put(sk); +out_nosk: +	return err; +} + +static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, +		struct inet_diag_req_v2 *r, struct nlattr *bc) +{ +	int num, s_num, slot, s_slot; +	struct net *net = sock_net(skb->sk); + +	s_slot = cb->args[0]; +	num = s_num = cb->args[1]; + +	for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) { +		struct sock *sk; +		struct hlist_nulls_node *node; +		struct udp_hslot *hslot = &table->hash[slot]; + +		if (hlist_nulls_empty(&hslot->head)) +			continue; + +		spin_lock_bh(&hslot->lock); 
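
The udp_dump() loop being added here walks every hash slot under its lock and emits one inet_diag record per socket that passes the filter; the consumer side of that interface is a NETLINK_SOCK_DIAG socket. A rough userspace sketch that asks this new udp_diag module for all IPv4 UDP sockets (error handling trimmed, and it assumes a kernel with the module available):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int main(void)
{
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg = {
		.nlh = {
			.nlmsg_len   = sizeof(msg),
			.nlmsg_type  = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family   = AF_INET,
			.sdiag_protocol = IPPROTO_UDP,
			.idiag_states   = 0xffffffff,	/* all states */
		},
	};
	char buf[8192];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	ssize_t len;

	if (fd < 0 || sendto(fd, &msg, sizeof(msg), 0,
			     (struct sockaddr *)&nladdr, sizeof(nladdr)) < 0)
		return 1;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			struct inet_diag_msg *r = NLMSG_DATA(h);
			char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];

			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR) {
				close(fd);
				return 0;
			}
			inet_ntop(AF_INET, r->id.idiag_src, src, sizeof(src));
			inet_ntop(AF_INET, r->id.idiag_dst, dst, sizeof(dst));
			printf("%s:%u -> %s:%u rqueue %u\n",
			       src, ntohs(r->id.idiag_sport),
			       dst, ntohs(r->id.idiag_dport),
			       r->idiag_rqueue);
		}
	}
	close(fd);
	return 0;
}
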
+		sk_nulls_for_each(sk, node, &hslot->head) { +			struct inet_sock *inet = inet_sk(sk); + +			if (!net_eq(sock_net(sk), net)) +				continue; +			if (num < s_num) +				goto next; +			if (!(r->idiag_states & (1 << sk->sk_state))) +				goto next; +			if (r->sdiag_family != AF_UNSPEC && +					sk->sk_family != r->sdiag_family) +				goto next; +			if (r->id.idiag_sport != inet->inet_sport && +			    r->id.idiag_sport) +				goto next; +			if (r->id.idiag_dport != inet->inet_dport && +			    r->id.idiag_dport) +				goto next; + +			if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { +				spin_unlock_bh(&hslot->lock); +				goto done; +			} +next: +			num++; +		} +		spin_unlock_bh(&hslot->lock); +	} +done: +	cb->args[0] = slot; +	cb->args[1] = num; +} + +static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, +		struct inet_diag_req_v2 *r, struct nlattr *bc) +{ +	udp_dump(&udp_table, skb, cb, r, bc); +} + +static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +		struct inet_diag_req_v2 *req) +{ +	return udp_dump_one(&udp_table, in_skb, nlh, req); +} + +static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, +		void *info) +{ +	r->idiag_rqueue = sk_rmem_alloc_get(sk); +	r->idiag_wqueue = sk_wmem_alloc_get(sk); +} + +static const struct inet_diag_handler udp_diag_handler = { +	.dump		 = udp_diag_dump, +	.dump_one	 = udp_diag_dump_one, +	.idiag_get_info  = udp_diag_get_info, +	.idiag_type	 = IPPROTO_UDP, +}; + +static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, +		struct inet_diag_req_v2 *r, struct nlattr *bc) +{ +	udp_dump(&udplite_table, skb, cb, r, bc); +} + +static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +		struct inet_diag_req_v2 *req) +{ +	return udp_dump_one(&udplite_table, in_skb, nlh, req); +} + +static const struct inet_diag_handler udplite_diag_handler = { +	.dump		 = udplite_diag_dump, +	.dump_one	 = udplite_diag_dump_one, +	.idiag_get_info  = udp_diag_get_info, +	.idiag_type	 = IPPROTO_UDPLITE, +}; + +static int __init udp_diag_init(void) +{ +	int err; + +	err = inet_diag_register(&udp_diag_handler); +	if (err) +		goto out; +	err = inet_diag_register(&udplite_diag_handler); +	if (err) +		goto out_lite; +out: +	return err; +out_lite: +	inet_diag_unregister(&udp_diag_handler); +	goto out; +} + +static void __exit udp_diag_exit(void) +{ +	inet_diag_unregister(&udplite_diag_handler); +	inet_diag_unregister(&udp_diag_handler); +} + +module_init(udp_diag_init); +module_exit(udp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index aaad650d47d..f3c27899f62 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -5,30 +5,30 @@  #include <net/protocol.h>  #include <net/inet_common.h> -extern int  	__udp4_lib_rcv(struct sk_buff *, struct udp_table *, int ); -extern void 	__udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); +void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); -extern int	udp_v4_get_port(struct sock *sk, unsigned short snum); +int udp_v4_get_port(struct sock *sk, unsigned short snum); -extern int	udp_setsockopt(struct sock *sk, int level, int optname, -			       char __user *optval, unsigned int optlen); -extern int	udp_getsockopt(struct 
sock *sk, int level, int optname, -			       char __user *optval, int __user *optlen); +int udp_setsockopt(struct sock *sk, int level, int optname, +		   char __user *optval, unsigned int optlen); +int udp_getsockopt(struct sock *sk, int level, int optname, +		   char __user *optval, int __user *optlen);  #ifdef CONFIG_COMPAT -extern int	compat_udp_setsockopt(struct sock *sk, int level, int optname, -				      char __user *optval, unsigned int optlen); -extern int	compat_udp_getsockopt(struct sock *sk, int level, int optname, -				      char __user *optval, int __user *optlen); +int compat_udp_setsockopt(struct sock *sk, int level, int optname, +			  char __user *optval, unsigned int optlen); +int compat_udp_getsockopt(struct sock *sk, int level, int optname, +			  char __user *optval, int __user *optlen);  #endif -extern int	udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, -			    size_t len, int noblock, int flags, int *addr_len); -extern int	udp_sendpage(struct sock *sk, struct page *page, int offset, -			     size_t size, int flags); -extern int	udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); -extern void	udp_destroy_sock(struct sock *sk); +int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +		size_t len, int noblock, int flags, int *addr_len); +int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, +		 int flags); +int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +void udp_destroy_sock(struct sock *sk);  #ifdef CONFIG_PROC_FS -extern int	udp4_seq_show(struct seq_file *seq, void *v); +int udp4_seq_show(struct seq_file *seq, void *v);  #endif  #endif	/* _UDP4_IMPL_H */ diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c new file mode 100644 index 00000000000..546d2d439dd --- /dev/null +++ b/net/ipv4/udp_offload.c @@ -0,0 +1,250 @@ +/* + *	IPV4 GSO/GRO offload support + *	Linux INET implementation + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
+ * + *	UDPv4 GSO support + */ + +#include <linux/skbuff.h> +#include <net/udp.h> +#include <net/protocol.h> + +static DEFINE_SPINLOCK(udp_offload_lock); +static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; + +#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) + +struct udp_offload_priv { +	struct udp_offload	*offload; +	struct rcu_head		rcu; +	struct udp_offload_priv __rcu *next; +}; + +static int udp4_ufo_send_check(struct sk_buff *skb) +{ +	if (!pskb_may_pull(skb, sizeof(struct udphdr))) +		return -EINVAL; + +	if (likely(!skb->encapsulation)) { +		const struct iphdr *iph; +		struct udphdr *uh; + +		iph = ip_hdr(skb); +		uh = udp_hdr(skb); + +		uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, +				IPPROTO_UDP, 0); +		skb->csum_start = skb_transport_header(skb) - skb->head; +		skb->csum_offset = offsetof(struct udphdr, check); +		skb->ip_summed = CHECKSUM_PARTIAL; +	} + +	return 0; +} + +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, +					 netdev_features_t features) +{ +	struct sk_buff *segs = ERR_PTR(-EINVAL); +	unsigned int mss; +	int offset; +	__wsum csum; + +	if (skb->encapsulation && +	    (skb_shinfo(skb)->gso_type & +	     (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { +		segs = skb_udp_tunnel_segment(skb, features); +		goto out; +	} + +	mss = skb_shinfo(skb)->gso_size; +	if (unlikely(skb->len <= mss)) +		goto out; + +	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { +		/* Packet is from an untrusted source, reset gso_segs. */ +		int type = skb_shinfo(skb)->gso_type; + +		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | +				      SKB_GSO_UDP_TUNNEL | +				      SKB_GSO_UDP_TUNNEL_CSUM | +				      SKB_GSO_IPIP | +				      SKB_GSO_GRE | SKB_GSO_GRE_CSUM | +				      SKB_GSO_MPLS) || +			     !(type & (SKB_GSO_UDP)))) +			goto out; + +		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + +		segs = NULL; +		goto out; +	} + +	/* Do software UFO. Complete and fill in the UDP checksum as +	 * HW cannot do checksum of UDP packets sent as multiple +	 * IP fragments. +	 */ +	offset = skb_checksum_start_offset(skb); +	csum = skb_checksum(skb, offset, skb->len - offset, 0); +	offset += skb->csum_offset; +	*(__sum16 *)(skb->data + offset) = csum_fold(csum); +	skb->ip_summed = CHECKSUM_NONE; + +	/* Fragment the skb. 
IP headers of the fragments are updated in +	 * inet_gso_segment() +	 */ +	segs = skb_segment(skb, features); +out: +	return segs; +} + +int udp_add_offload(struct udp_offload *uo) +{ +	struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); + +	if (!new_offload) +		return -ENOMEM; + +	new_offload->offload = uo; + +	spin_lock(&udp_offload_lock); +	new_offload->next = udp_offload_base; +	rcu_assign_pointer(udp_offload_base, new_offload); +	spin_unlock(&udp_offload_lock); + +	return 0; +} +EXPORT_SYMBOL(udp_add_offload); + +static void udp_offload_free_routine(struct rcu_head *head) +{ +	struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); +	kfree(ou_priv); +} + +void udp_del_offload(struct udp_offload *uo) +{ +	struct udp_offload_priv __rcu **head = &udp_offload_base; +	struct udp_offload_priv *uo_priv; + +	spin_lock(&udp_offload_lock); + +	uo_priv = udp_deref_protected(*head); +	for (; uo_priv != NULL; +	     uo_priv = udp_deref_protected(*head)) { +		if (uo_priv->offload == uo) { +			rcu_assign_pointer(*head, +					   udp_deref_protected(uo_priv->next)); +			goto unlock; +		} +		head = &uo_priv->next; +	} +	pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); +unlock: +	spin_unlock(&udp_offload_lock); +	if (uo_priv != NULL) +		call_rcu(&uo_priv->rcu, udp_offload_free_routine); +} +EXPORT_SYMBOL(udp_del_offload); + +static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ +	struct udp_offload_priv *uo_priv; +	struct sk_buff *p, **pp = NULL; +	struct udphdr *uh, *uh2; +	unsigned int hlen, off; +	int flush = 1; + +	if (NAPI_GRO_CB(skb)->udp_mark || +	    (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) +		goto out; + +	/* mark that this skb passed once through the udp gro layer */ +	NAPI_GRO_CB(skb)->udp_mark = 1; + +	off  = skb_gro_offset(skb); +	hlen = off + sizeof(*uh); +	uh   = skb_gro_header_fast(skb, off); +	if (skb_gro_header_hard(skb, hlen)) { +		uh = skb_gro_header_slow(skb, hlen, off); +		if (unlikely(!uh)) +			goto out; +	} + +	rcu_read_lock(); +	uo_priv = rcu_dereference(udp_offload_base); +	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { +		if (uo_priv->offload->port == uh->dest && +		    uo_priv->offload->callbacks.gro_receive) +			goto unflush; +	} +	goto out_unlock; + +unflush: +	flush = 0; + +	for (p = *head; p; p = p->next) { +		if (!NAPI_GRO_CB(p)->same_flow) +			continue; + +		uh2 = (struct udphdr   *)(p->data + off); +		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} +	} + +	skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ +	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); +	pp = uo_priv->offload->callbacks.gro_receive(head, skb); + +out_unlock: +	rcu_read_unlock(); +out: +	NAPI_GRO_CB(skb)->flush |= flush; +	return pp; +} + +static int udp_gro_complete(struct sk_buff *skb, int nhoff) +{ +	struct udp_offload_priv *uo_priv; +	__be16 newlen = htons(skb->len - nhoff); +	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); +	int err = -ENOSYS; + +	uh->len = newlen; + +	rcu_read_lock(); + +	uo_priv = rcu_dereference(udp_offload_base); +	for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { +		if (uo_priv->offload->port == uh->dest && +		    uo_priv->offload->callbacks.gro_complete) +			break; +	} + +	if (uo_priv != NULL) +		err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + +	
rcu_read_unlock(); +	return err; +} + +static const struct net_offload udpv4_offload = { +	.callbacks = { +		.gso_send_check = udp4_ufo_send_check, +		.gso_segment = udp4_ufo_fragment, +		.gro_receive  =	udp_gro_receive, +		.gro_complete =	udp_gro_complete, +	}, +}; + +int __init udpv4_offload_init(void) +{ +	return inet_add_offload(&udpv4_offload, IPPROTO_UDP); +} diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index ab76aa928fa..3b3efbda48e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -10,6 +10,10 @@   *		as published by the Free Software Foundation; either version   *		2 of the License, or (at your option) any later version.   */ + +#define pr_fmt(fmt) "UDPLite: " fmt + +#include <linux/export.h>  #include "udp_impl.h"  struct udp_table 	udplite_table __read_mostly; @@ -57,6 +61,7 @@ struct proto 	udplite_prot = {  	.compat_setsockopt = compat_udp_setsockopt,  	.compat_getsockopt = compat_udp_getsockopt,  #endif +	.clear_sk	   = sk_prot_clear_portaddr_nulls,  };  EXPORT_SYMBOL(udplite_prot); @@ -65,18 +70,24 @@ static struct inet_protosw udplite4_protosw = {  	.protocol	=  IPPROTO_UDPLITE,  	.prot		=  &udplite_prot,  	.ops		=  &inet_dgram_ops, -	.no_check	=  0,		/* must checksum (RFC 3828) */  	.flags		=  INET_PROTOSW_PERMANENT,  };  #ifdef CONFIG_PROC_FS + +static const struct file_operations udplite_afinfo_seq_fops = { +	.owner    = THIS_MODULE, +	.open     = udp_seq_open, +	.read     = seq_read, +	.llseek   = seq_lseek, +	.release  = seq_release_net +}; +  static struct udp_seq_afinfo udplite4_seq_afinfo = {  	.name		= "udplite",  	.family		= AF_INET,  	.udp_table 	= &udplite_table, -	.seq_fops	= { -		.owner	=	THIS_MODULE, -	}, +	.seq_fops	= &udplite_afinfo_seq_fops,  	.seq_ops	= {  		.show		= udp4_seq_show,  	}, @@ -120,11 +131,11 @@ void __init udplite4_register(void)  	inet_register_protosw(&udplite4_protosw);  	if (udplite4_proc_init()) -		printk(KERN_ERR "%s: Cannot register /proc!\n", __func__); +		pr_err("%s: Cannot register /proc!\n", __func__);  	return;  out_unregister_proto:  	proto_unregister(&udplite_prot);  out_register_err: -	printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); +	pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);  } diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 06814b6216d..aac6197b7a7 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -37,15 +37,6 @@ drop:  	return NET_RX_DROP;  } -int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, -		    int encap_type) -{ -	XFRM_SPI_SKB_CB(skb)->family = AF_INET; -	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); -	return xfrm_input(skb, nexthdr, spi, encap_type); -} -EXPORT_SYMBOL(xfrm4_rcv_encap); -  int xfrm4_transport_finish(struct sk_buff *skb, int async)  {  	struct iphdr *iph = ip_hdr(skb); @@ -132,7 +123,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)  	 * header and optional ESP marker bytes) and then modify the  	 * protocol to ESP, and then call into the transform receiver.  	 */ -	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +	if (skb_unclone(skb, GFP_ATOMIC))  		goto drop;  	/* Now we can update and verify the packet length... 
*/ diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 63418185f52..71acd0014f2 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)  		hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);  	skb_set_network_header(skb, -x->props.header_len - -			            hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); +				    hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));  	if (x->sel.family != AF_INET6)  		skb->network_header += IPV4_BEET_PHMAXLEN;  	skb->mac_header = skb->network_header + @@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)  	skb_push(skb, sizeof(*iph));  	skb_reset_network_header(skb); - -	memmove(skb->data - skb->mac_len, skb_mac_header(skb), -		skb->mac_len); -	skb_set_mac_header(skb, -skb->mac_len); +	skb_mac_header_rebuild(skb);  	xfrm4_beet_make_header(skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 6f368413eb0..91771a7c802 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -44,8 +44,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)  	top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); -	/* DS disclosed */ -	top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, +	/* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */ +	if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) +		top_iph->tos = 0; +	else +		top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos; +	top_iph->tos = INET_ECN_encapsulate(top_iph->tos,  					    XFRM_MODE_SKB_CB(skb)->tos);  	flags = x->props.flags; @@ -54,19 +58,18 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)  	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?  		
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); -	ip_select_ident(top_iph, dst->child, NULL); -	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); +	top_iph->ttl = ip4_dst_hoplimit(dst->child);  	top_iph->saddr = x->props.saddr.a4;  	top_iph->daddr = x->id.daddr.a4; +	ip_select_ident(skb, NULL);  	return 0;  }  static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)  { -	const unsigned char *old_mac;  	int err = -EINVAL;  	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -75,8 +78,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)  	if (!pskb_may_pull(skb, sizeof(struct iphdr)))  		goto out; -	if (skb_cloned(skb) && -	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) +	err = skb_unclone(skb, GFP_ATOMIC); +	if (err)  		goto out;  	if (x->props.flags & XFRM_STATE_DECAP_DSCP) @@ -84,10 +87,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)  	if (!(x->props.flags & XFRM_STATE_NOECN))  		ipip_ecn_decapsulate(skb); -	old_mac = skb_mac_header(skb); -	skb_set_mac_header(skb, -skb->mac_len); -	memmove(skb_mac_header(skb), old_mac, skb->mac_len);  	skb_reset_network_header(skb); +	skb_mac_header_rebuild(skb); +  	err = 0;  out: diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 571aa96a175..d5f6bd9a210 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -21,18 +21,20 @@  static int xfrm4_tunnel_check_size(struct sk_buff *skb)  {  	int mtu, ret = 0; -	struct dst_entry *dst;  	if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)  		goto out; -	if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) +	if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)  		goto out; -	dst = skb_dst(skb); -	mtu = dst_mtu(dst); +	mtu = dst_mtu(skb_dst(skb));  	if (skb->len > mtu) { -		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +		if (skb->sk) +			xfrm_local_error(skb, mtu); +		else +			icmp_send(skb, ICMP_DEST_UNREACH, +				  ICMP_FRAG_NEEDED, htonl(mtu));  		ret = -EMSGSIZE;  	}  out: @@ -60,33 +62,50 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)  	if (err)  		return err; -	memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - -	skb->protocol = htons(ETH_P_IP); +	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;  	return x->outer_mode->output2(x, skb);  }  EXPORT_SYMBOL(xfrm4_prepare_output); -static int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sk_buff *skb)  { +	memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +	skb->protocol = htons(ETH_P_IP); +  #ifdef CONFIG_NETFILTER -	if (!skb_dst(skb)->xfrm) { +	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; +#endif + +	return xfrm_output(skb); +} + +static int __xfrm4_output(struct sk_buff *skb) +{ +	struct xfrm_state *x = skb_dst(skb)->xfrm; + +#ifdef CONFIG_NETFILTER +	if (!x) {  		IPCB(skb)->flags |= IPSKB_REROUTED;  		return dst_output(skb);  	} - -	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;  #endif -	skb->protocol = htons(ETH_P_IP); -	return xfrm_output(skb); +	return x->outer_mode->afinfo->output_finish(skb);  } -int xfrm4_output(struct sk_buff *skb) +int xfrm4_output(struct sock *sk, struct sk_buff *skb)  {  	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, -			    NULL, skb_dst(skb)->dev, xfrm4_output_finish, +			    NULL, skb_dst(skb)->dev, __xfrm4_output,  			    !(IPCB(skb)->flags & IPSKB_REROUTED));  } + +void xfrm4_local_error(struct sk_buff *skb, u32 mtu) +{ +	struct iphdr *hdr; + +	
hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); +	ip_local_error(skb->sk, EMSGSIZE, hdr->daddr, +		       inet_sk(skb->sk)->inet_dport, mtu); +} diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b057d40adde..6156f68a1e9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -18,47 +18,53 @@  static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, -					  xfrm_address_t *saddr, -					  xfrm_address_t *daddr) +static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, +					    int tos, +					    const xfrm_address_t *saddr, +					    const xfrm_address_t *daddr)  { -	struct flowi fl = { -		.fl4_dst = daddr->a4, -		.fl4_tos = tos, -	}; -	struct dst_entry *dst;  	struct rtable *rt; -	int err; +	memset(fl4, 0, sizeof(*fl4)); +	fl4->daddr = daddr->a4; +	fl4->flowi4_tos = tos;  	if (saddr) -		fl.fl4_src = saddr->a4; +		fl4->saddr = saddr->a4; + +	rt = __ip_route_output_key(net, fl4); +	if (!IS_ERR(rt)) +		return &rt->dst; + +	return ERR_CAST(rt); +} -	err = __ip_route_output_key(net, &rt, &fl); -	dst = &rt->dst; -	if (err) -		dst = ERR_PTR(err); -	return dst; +static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, +					  const xfrm_address_t *saddr, +					  const xfrm_address_t *daddr) +{ +	struct flowi4 fl4; + +	return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);  }  static int xfrm4_get_saddr(struct net *net,  			   xfrm_address_t *saddr, xfrm_address_t *daddr)  {  	struct dst_entry *dst; -	struct rtable *rt; +	struct flowi4 fl4; -	dst = xfrm4_dst_lookup(net, 0, NULL, daddr); +	dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);  	if (IS_ERR(dst))  		return -EHOSTUNREACH; -	rt = (struct rtable *)dst; -	saddr->a4 = rt->rt_src; +	saddr->a4 = fl4.saddr;  	dst_release(dst);  	return 0;  } -static int xfrm4_get_tos(struct flowi *fl) +static int xfrm4_get_tos(const struct flowi *fl)  { -	return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ +	return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */  }  static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, @@ -68,28 +74,26 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,  }  static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, -			  struct flowi *fl) +			  const struct flowi *fl)  {  	struct rtable *rt = (struct rtable *)xdst->route; +	const struct flowi4 *fl4 = &fl->u.ip4; -	xdst->u.rt.fl = *fl; +	xdst->u.rt.rt_iif = fl4->flowi4_iif;  	xdst->u.dst.dev = dev;  	dev_hold(dev); -	xdst->u.rt.peer = rt->peer; -	if (rt->peer) -		atomic_inc(&rt->peer->refcnt); -  	/* Sheit... I remember I did this right. 
Apparently,  	 * it was magically lost, so this code needs audit */ +	xdst->u.rt.rt_is_input = rt->rt_is_input;  	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |  					      RTCF_LOCAL);  	xdst->u.rt.rt_type = rt->rt_type; -	xdst->u.rt.rt_src = rt->rt_src; -	xdst->u.rt.rt_dst = rt->rt_dst;  	xdst->u.rt.rt_gateway = rt->rt_gateway; -	xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; +	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; +	xdst->u.rt.rt_pmtu = rt->rt_pmtu; +	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);  	return 0;  } @@ -97,13 +101,19 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,  static void  _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)  { -	struct iphdr *iph = ip_hdr(skb); +	const struct iphdr *iph = ip_hdr(skb);  	u8 *xprth = skb_network_header(skb) + iph->ihl * 4; +	struct flowi4 *fl4 = &fl->u.ip4; +	int oif = 0; -	memset(fl, 0, sizeof(struct flowi)); -	fl->mark = skb->mark; +	if (skb_dst(skb)) +		oif = skb_dst(skb)->dev->ifindex; -	if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { +	memset(fl4, 0, sizeof(struct flowi4)); +	fl4->flowi4_mark = skb->mark; +	fl4->flowi4_oif = reverse ? skb->skb_iif : oif; + +	if (!ip_is_fragment(iph)) {  		switch (iph->protocol) {  		case IPPROTO_UDP:  		case IPPROTO_UDPLITE: @@ -114,8 +124,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)  			    pskb_may_pull(skb, xprth + 4 - skb->data)) {  				__be16 *ports = (__be16 *)xprth; -				fl->fl_ip_sport = ports[!!reverse]; -				fl->fl_ip_dport = ports[!reverse]; +				fl4->fl4_sport = ports[!!reverse]; +				fl4->fl4_dport = ports[!reverse];  			}  			break; @@ -123,8 +133,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)  			if (pskb_may_pull(skb, xprth + 2 - skb->data)) {  				u8 *icmp = xprth; -				fl->fl_icmp_type = icmp[0]; -				fl->fl_icmp_code = icmp[1]; +				fl4->fl4_icmp_type = icmp[0]; +				fl4->fl4_icmp_code = icmp[1];  			}  			break; @@ -132,15 +142,15 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)  			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {  				__be32 *ehdr = (__be32 *)xprth; -				fl->fl_ipsec_spi = ehdr[0]; +				fl4->fl4_ipsec_spi = ehdr[0];  			}  			break;  		case IPPROTO_AH:  			if (pskb_may_pull(skb, xprth + 8 - skb->data)) { -				__be32 *ah_hdr = (__be32*)xprth; +				__be32 *ah_hdr = (__be32 *)xprth; -				fl->fl_ipsec_spi = ah_hdr[1]; +				fl4->fl4_ipsec_spi = ah_hdr[1];  			}  			break; @@ -148,7 +158,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)  			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {  				__be16 *ipcomp_hdr = (__be16 *)xprth; -				fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); +				fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));  			}  			break; @@ -160,20 +170,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)  				if (greflags[0] & GRE_KEY) {  					if (greflags[0] & GRE_CSUM)  						gre_hdr++; -					fl->fl_gre_key = gre_hdr[1]; +					fl4->fl4_gre_key = gre_hdr[1];  				}  			}  			break;  		default: -			fl->fl_ipsec_spi = 0; +			fl4->fl4_ipsec_spi = 0;  			break;  		}  	} -	fl->proto = iph->protocol; -	fl->fl4_dst = reverse ? iph->saddr : iph->daddr; -	fl->fl4_src = reverse ? iph->daddr : iph->saddr; -	fl->fl4_tos = iph->tos; +	fl4->flowi4_proto = iph->protocol; +	fl4->daddr = reverse ? iph->saddr : iph->daddr; +	fl4->saddr = reverse ? 
iph->daddr : iph->saddr; +	fl4->flowi4_tos = iph->tos;  }  static inline int xfrm4_garbage_collect(struct dst_ops *ops) @@ -184,20 +194,30 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)  	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);  } -static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, +			      struct sk_buff *skb, u32 mtu) +{ +	struct xfrm_dst *xdst = (struct xfrm_dst *)dst; +	struct dst_entry *path = xdst->route; + +	path->ops->update_pmtu(path, sk, skb, mtu); +} + +static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk, +			   struct sk_buff *skb)  {  	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;  	struct dst_entry *path = xdst->route; -	path->ops->update_pmtu(path, mtu); +	path->ops->redirect(path, sk, skb);  }  static void xfrm4_dst_destroy(struct dst_entry *dst)  {  	struct xfrm_dst *xdst = (struct xfrm_dst *)dst; -	if (likely(xdst->u.rt.peer)) -		inet_putpeer(xdst->u.rt.peer); +	dst_destroy_metrics_generic(dst); +  	xfrm_dst_destroy(xdst);  } @@ -215,10 +235,12 @@ static struct dst_ops xfrm4_dst_ops = {  	.protocol =		cpu_to_be16(ETH_P_IP),  	.gc =			xfrm4_garbage_collect,  	.update_pmtu =		xfrm4_update_pmtu, +	.redirect =		xfrm4_redirect, +	.cow_metrics =		dst_cow_metrics_generic,  	.destroy =		xfrm4_dst_destroy,  	.ifdown =		xfrm4_dst_ifdown,  	.local_out =		__ip_local_out, -	.gc_thresh =		1024, +	.gc_thresh =		32768,  };  static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { @@ -230,6 +252,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {  	.get_tos =		xfrm4_get_tos,  	.init_path =		xfrm4_init_path,  	.fill_dst =		xfrm4_fill_dst, +	.blackhole_route =	ipv4_blackhole_route,  };  #ifdef CONFIG_SYSCTL @@ -244,43 +267,67 @@ static struct ctl_table xfrm4_policy_table[] = {  	{ }  }; -static struct ctl_table_header *sysctl_hdr; -#endif - -static void __init xfrm4_policy_init(void) +static int __net_init xfrm4_net_init(struct net *net)  { -	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); +	struct ctl_table *table; +	struct ctl_table_header *hdr; + +	table = xfrm4_policy_table; +	if (!net_eq(net, &init_net)) { +		table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL); +		if (!table) +			goto err_alloc; + +		table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh; +	} + +	hdr = register_net_sysctl(net, "net/ipv4", table); +	if (!hdr) +		goto err_reg; + +	net->ipv4.xfrm4_hdr = hdr; +	return 0; + +err_reg: +	if (!net_eq(net, &init_net)) +		kfree(table); +err_alloc: +	return -ENOMEM;  } -static void __exit xfrm4_policy_fini(void) +static void __net_exit xfrm4_net_exit(struct net *net)  { -#ifdef CONFIG_SYSCTL -	if (sysctl_hdr) -		unregister_net_sysctl_table(sysctl_hdr); +	struct ctl_table *table; + +	if (net->ipv4.xfrm4_hdr == NULL) +		return; + +	table = net->ipv4.xfrm4_hdr->ctl_table_arg; +	unregister_net_sysctl_table(net->ipv4.xfrm4_hdr); +	if (!net_eq(net, &init_net)) +		kfree(table); +} + +static struct pernet_operations __net_initdata xfrm4_net_ops = { +	.init	= xfrm4_net_init, +	.exit	= xfrm4_net_exit, +};  #endif -	xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); + +static void __init xfrm4_policy_init(void) +{ +	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);  } -void __init xfrm4_init(int rt_max_size) +void __init xfrm4_init(void)  { -	/* -	 * Select a default value for the gc_thresh based on the main route -	 * table hash size.  
It seems to me the worst case scenario is when -	 * we have ipsec operating in transport mode, in which we create a -	 * dst_entry per socket.  The xfrm gc algorithm starts trying to remove -	 * entries at gc_thresh, and prevents new allocations as 2*gc_thresh -	 * so lets set an initial xfrm gc_thresh value at the rt_max_size/2. -	 * That will let us store an ipsec connection per route table entry, -	 * and start cleaning when were 1/2 full -	 */ -	xfrm4_dst_ops.gc_thresh = rt_max_size/2;  	dst_entries_init(&xfrm4_dst_ops);  	xfrm4_state_init();  	xfrm4_policy_init(); +	xfrm4_protocol_init();  #ifdef CONFIG_SYSCTL -	sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, -						xfrm4_policy_table); +	register_pernet_subsys(&xfrm4_net_ops);  #endif  } diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c new file mode 100644 index 00000000000..a2ce0101eaa --- /dev/null +++ b/net/ipv4/xfrm4_protocol.c @@ -0,0 +1,301 @@ +/* xfrm4_protocol.c - Generic xfrm protocol multiplexer. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/tunnel4.c + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly; +static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly; +static DEFINE_MUTEX(xfrm4_protocol_mutex); + +static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) +{ +	switch (protocol) { +	case IPPROTO_ESP: +		return &esp4_handlers; +	case IPPROTO_AH: +		return &ah4_handlers; +	case IPPROTO_COMP: +		return &ipcomp4_handlers; +	} + +	return NULL; +} + +#define for_each_protocol_rcu(head, handler)		\ +	for (handler = rcu_dereference(head);		\ +	     handler != NULL;				\ +	     handler = rcu_dereference(handler->next))	\ + +int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ +	int ret; +	struct xfrm4_protocol *handler; +	struct xfrm4_protocol __rcu **head = proto_handlers(protocol); + +	if (!head) +		return 0; + +	for_each_protocol_rcu(*head, handler) +		if ((ret = handler->cb_handler(skb, err)) <= 0) +			return ret; + +	return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_cb); + +int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, +		    int encap_type) +{ +	int ret; +	struct xfrm4_protocol *handler; +	struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr); + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; +	XFRM_SPI_SKB_CB(skb)->family = AF_INET; +	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + +	if (!head) +		goto out; + +	for_each_protocol_rcu(*head, handler) +		if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) +			return ret; + +out: +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} +EXPORT_SYMBOL(xfrm4_rcv_encap); + +static int xfrm4_esp_rcv(struct sk_buff *skb) +{ +	int ret; +	struct xfrm4_protocol *handler; + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + +	for_each_protocol_rcu(esp4_handlers, handler) +		if ((ret = 
handler->handler(skb)) != -EINVAL) +			return ret; + +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} + +static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +{ +	struct xfrm4_protocol *handler; + +	for_each_protocol_rcu(esp4_handlers, handler) +		if (!handler->err_handler(skb, info)) +			break; +} + +static int xfrm4_ah_rcv(struct sk_buff *skb) +{ +	int ret; +	struct xfrm4_protocol *handler; + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + +	for_each_protocol_rcu(ah4_handlers, handler) +		if ((ret = handler->handler(skb)) != -EINVAL) +			return ret;; + +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} + +static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +{ +	struct xfrm4_protocol *handler; + +	for_each_protocol_rcu(ah4_handlers, handler) +		if (!handler->err_handler(skb, info)) +			break; +} + +static int xfrm4_ipcomp_rcv(struct sk_buff *skb) +{ +	int ret; +	struct xfrm4_protocol *handler; + +	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + +	for_each_protocol_rcu(ipcomp4_handlers, handler) +		if ((ret = handler->handler(skb)) != -EINVAL) +			return ret; + +	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + +	kfree_skb(skb); +	return 0; +} + +static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +{ +	struct xfrm4_protocol *handler; + +	for_each_protocol_rcu(ipcomp4_handlers, handler) +		if (!handler->err_handler(skb, info)) +			break; +} + +static const struct net_protocol esp4_protocol = { +	.handler	=	xfrm4_esp_rcv, +	.err_handler	=	xfrm4_esp_err, +	.no_policy	=	1, +	.netns_ok	=	1, +}; + +static const struct net_protocol ah4_protocol = { +	.handler	=	xfrm4_ah_rcv, +	.err_handler	=	xfrm4_ah_err, +	.no_policy	=	1, +	.netns_ok	=	1, +}; + +static const struct net_protocol ipcomp4_protocol = { +	.handler	=	xfrm4_ipcomp_rcv, +	.err_handler	=	xfrm4_ipcomp_err, +	.no_policy	=	1, +	.netns_ok	=	1, +}; + +static struct xfrm_input_afinfo xfrm4_input_afinfo = { +	.family		=	AF_INET, +	.owner		=	THIS_MODULE, +	.callback	=	xfrm4_rcv_cb, +}; + +static inline const struct net_protocol *netproto(unsigned char protocol) +{ +	switch (protocol) { +	case IPPROTO_ESP: +		return &esp4_protocol; +	case IPPROTO_AH: +		return &ah4_protocol; +	case IPPROTO_COMP: +		return &ipcomp4_protocol; +	} + +	return NULL; +} + +int xfrm4_protocol_register(struct xfrm4_protocol *handler, +			    unsigned char protocol) +{ +	struct xfrm4_protocol __rcu **pprev; +	struct xfrm4_protocol *t; +	bool add_netproto = false; +	int ret = -EEXIST; +	int priority = handler->priority; + +	if (!proto_handlers(protocol) || !netproto(protocol)) +		return -EINVAL; + +	mutex_lock(&xfrm4_protocol_mutex); + +	if (!rcu_dereference_protected(*proto_handlers(protocol), +				       lockdep_is_held(&xfrm4_protocol_mutex))) +		add_netproto = true; + +	for (pprev = proto_handlers(protocol); +	     (t = rcu_dereference_protected(*pprev, +			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; +	     pprev = &t->next) { +		if (t->priority < priority) +			break; +		if (t->priority == priority) +			goto err; +	} + +	handler->next = *pprev; +	rcu_assign_pointer(*pprev, handler); + +	ret = 0; + +err: +	mutex_unlock(&xfrm4_protocol_mutex); + +	if (add_netproto) { +		if (inet_add_protocol(netproto(protocol), protocol)) { +			pr_err("%s: can't add protocol\n", __func__); +			ret = -EAGAIN; +		} +	} + +	return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_register); + +int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, +			      unsigned char 
protocol) +{ +	struct xfrm4_protocol __rcu **pprev; +	struct xfrm4_protocol *t; +	int ret = -ENOENT; + +	if (!proto_handlers(protocol) || !netproto(protocol)) +		return -EINVAL; + +	mutex_lock(&xfrm4_protocol_mutex); + +	for (pprev = proto_handlers(protocol); +	     (t = rcu_dereference_protected(*pprev, +			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; +	     pprev = &t->next) { +		if (t == handler) { +			*pprev = handler->next; +			ret = 0; +			break; +		} +	} + +	if (!rcu_dereference_protected(*proto_handlers(protocol), +				       lockdep_is_held(&xfrm4_protocol_mutex))) { +		if (inet_del_protocol(netproto(protocol), protocol) < 0) { +			pr_err("%s: can't remove protocol\n", __func__); +			ret = -EAGAIN; +		} +	} + +	mutex_unlock(&xfrm4_protocol_mutex); + +	synchronize_net(); + +	return ret; +} +EXPORT_SYMBOL(xfrm4_protocol_deregister); + +void __init xfrm4_protocol_init(void) +{ +	xfrm_input_register_afinfo(&xfrm4_input_afinfo); +} +EXPORT_SYMBOL(xfrm4_protocol_init); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 47947624ecc..542074c00c7 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -12,33 +12,36 @@  #include <linux/pfkeyv2.h>  #include <linux/ipsec.h>  #include <linux/netfilter_ipv4.h> +#include <linux/export.h>  static int xfrm4_init_flags(struct xfrm_state *x)  { -	if (ipv4_config.no_pmtu_disc) +	if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)  		x->props.flags |= XFRM_STATE_NOPMTUDISC;  	return 0;  }  static void -__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) +__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)  { -	sel->daddr.a4 = fl->fl4_dst; -	sel->saddr.a4 = fl->fl4_src; -	sel->dport = xfrm_flowi_dport(fl); +	const struct flowi4 *fl4 = &fl->u.ip4; + +	sel->daddr.a4 = fl4->daddr; +	sel->saddr.a4 = fl4->saddr; +	sel->dport = xfrm_flowi_dport(fl, &fl4->uli);  	sel->dport_mask = htons(0xffff); -	sel->sport = xfrm_flowi_sport(fl); +	sel->sport = xfrm_flowi_sport(fl, &fl4->uli);  	sel->sport_mask = htons(0xffff);  	sel->family = AF_INET;  	sel->prefixlen_d = 32;  	sel->prefixlen_s = 32; -	sel->proto = fl->proto; -	sel->ifindex = fl->oif; +	sel->proto = fl4->flowi4_proto; +	sel->ifindex = fl4->flowi4_oif;  }  static void -xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, -		   xfrm_address_t *daddr, xfrm_address_t *saddr) +xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, +		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)  {  	x->id = tmpl->id;  	if (x->id.daddr.a4 == 0) @@ -53,7 +56,7 @@ xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,  int xfrm4_extract_header(struct sk_buff *skb)  { -	struct iphdr *iph = ip_hdr(skb); +	const struct iphdr *iph = ip_hdr(skb);  	XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);  	XFRM_MODE_SKB_CB(skb)->id = iph->id; @@ -76,9 +79,11 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {  	.init_tempsel		= __xfrm4_init_tempsel,  	.init_temprop		= xfrm4_init_temprop,  	.output			= xfrm4_output, +	.output_finish		= xfrm4_output_finish,  	.extract_input		= xfrm4_extract_input,  	.extract_output		= xfrm4_extract_output,  	.transport_finish	= xfrm4_transport_finish, +	.local_error		= xfrm4_local_error,  };  void __init xfrm4_state_init(void) diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 82806455e85..06347dbd32c 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -3,6 +3,8 @@   * Copyright (C) 2003 David S. 
Miller (davem@redhat.com)   */ +#define pr_fmt(fmt) "IPsec: " fmt +  #include <linux/skbuff.h>  #include <linux/module.h>  #include <linux/mutex.h> @@ -61,10 +63,10 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)  static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {  	.handler	=	xfrm_tunnel_rcv,  	.err_handler	=	xfrm_tunnel_err, -	.priority	=	2, +	.priority	=	3,  }; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {  	.handler	=	xfrm_tunnel_rcv,  	.err_handler	=	xfrm_tunnel_err, @@ -75,18 +77,18 @@ static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {  static int __init ipip_init(void)  {  	if (xfrm_register_type(&ipip_type, AF_INET) < 0) { -		printk(KERN_INFO "ipip init: can't add xfrm type\n"); +		pr_info("%s: can't add xfrm type\n", __func__);  		return -EAGAIN;  	}  	if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { -		printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n"); +		pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);  		xfrm_unregister_type(&ipip_type, AF_INET);  		return -EAGAIN;  	} -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  	if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { -		printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n"); +		pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);  		xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);  		xfrm_unregister_type(&ipip_type, AF_INET);  		return -EAGAIN; @@ -97,14 +99,16 @@ static int __init ipip_init(void)  static void __exit ipip_fini(void)  { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6)  	if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) -		printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n"); +		pr_info("%s: can't remove xfrm handler for AF_INET6\n", +			__func__);  #endif  	if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) -		printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n"); +		pr_info("%s: can't remove xfrm handler for AF_INET\n", +			__func__);  	if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) -		printk(KERN_INFO "ipip close: can't remove xfrm type\n"); +		pr_info("%s: can't remove xfrm type\n", __func__);  }  module_init(ipip_init);  | 
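The hunks above introduce two new registration interfaces: udp_add_offload()/udp_del_offload() in net/ipv4/udp_offload.c and xfrm4_protocol_register()/xfrm4_protocol_deregister() in the new net/ipv4/xfrm4_protocol.c. The two sketches below are illustrative only and are not part of the patch series; the structure names, fields and signatures are taken from the hunks above, while the port number, priority value and handler bodies are assumptions made up for the example.

/* Sketch: a hypothetical encapsulation driver registering UDP GRO
 * callbacks via udp_add_offload().  Port 4789 and the no-op handler
 * bodies are assumptions for illustration only.
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/protocol.h>

static struct sk_buff **example_gro_receive(struct sk_buff **head,
					    struct sk_buff *skb)
{
	/* A real handler would try to aggregate the inner frames;
	 * this sketch just refuses to merge anything.
	 */
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}

static int example_gro_complete(struct sk_buff *skb, int nhoff)
{
	/* A real handler would fix up the inner headers at offset nhoff. */
	return 0;
}

static struct udp_offload example_udp_offload = {
	.port      = __constant_htons(4789),		/* assumed port */
	.callbacks = {
		.gro_receive  = example_gro_receive,
		.gro_complete = example_gro_complete,
	},
};

/* Called from the driver's module init/exit paths. */
static int example_offload_add(void)
{
	return udp_add_offload(&example_udp_offload);
}

static void example_offload_del(void)
{
	udp_del_offload(&example_udp_offload);
}

Similarly, a protocol implementation can hook into the new xfrm4 protocol multiplexer. The sketch below uses ESP as the example; the callback bodies and the priority value are assumptions, while xfrm4_rcv() and xfrm_input() are the existing generic input entry points.

/* Sketch: plugging an IPsec protocol into xfrm4_protocol_register(). */
#include <linux/skbuff.h>
#include <net/xfrm.h>

static int example_esp_rcv_cb(struct sk_buff *skb, int err)
{
	/* Per-protocol post-input work would go here. */
	return 0;
}

static int example_esp_err(struct sk_buff *skb, u32 info)
{
	/* Returning 0 tells xfrm4_esp_err() to stop walking the chain. */
	return 0;
}

static struct xfrm4_protocol example_esp4_protocol = {
	.handler	= xfrm4_rcv,		/* plain IPsec input */
	.input_handler	= xfrm_input,		/* encapsulated (NAT-T) input */
	.cb_handler	= example_esp_rcv_cb,
	.err_handler	= example_esp_err,
	.priority	= 0,			/* assumed value */
};

static int example_esp_register(void)
{
	/* -EEXIST is returned when the priority slot is already taken. */
	return xfrm4_protocol_register(&example_esp4_protocol, IPPROTO_ESP);
}

static void example_esp_unregister(void)
{
	xfrm4_protocol_deregister(&example_esp4_protocol, IPPROTO_ESP);
}

In both cases the registration call also hooks the generic dispatcher into the lower layer when the first handler appears: udp_add_offload() links the new entry at the head of the RCU-protected offload list, while xfrm4_protocol_register() additionally calls inet_add_protocol() for the first handler registered for a given protocol and keeps later handlers in a priority-ordered list.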
