aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig116
-rw-r--r--net/ipv4/Makefile19
-rw-r--r--net/ipv4/af_inet.c653
-rw-r--r--net/ipv4/ah4.c173
-rw-r--r--net/ipv4/arp.c306
-rw-r--r--net/ipv4/cipso_ipv4.c138
-rw-r--r--net/ipv4/datagram.c68
-rw-r--r--net/ipv4/devinet.c863
-rw-r--r--net/ipv4/esp4.c274
-rw-r--r--net/ipv4/fib_frontend.c418
-rw-r--r--net/ipv4/fib_hash.c1133
-rw-r--r--net/ipv4/fib_lookup.h35
-rw-r--r--net/ipv4/fib_rules.c106
-rw-r--r--net/ipv4/fib_semantics.c444
-rw-r--r--net/ipv4/fib_trie.c561
-rw-r--r--net/ipv4/gre.c152
-rw-r--r--net/ipv4/gre_demux.c364
-rw-r--r--net/ipv4/gre_offload.c298
-rw-r--r--net/ipv4/icmp.c600
-rw-r--r--net/ipv4/igmp.c466
-rw-r--r--net/ipv4/inet_connection_sock.c370
-rw-r--r--net/ipv4/inet_diag.c867
-rw-r--r--net/ipv4/inet_fragment.c165
-rw-r--r--net/ipv4/inet_hashtables.c168
-rw-r--r--net/ipv4/inet_lro.c250
-rw-r--r--net/ipv4/inet_timewait_sock.c72
-rw-r--r--net/ipv4/inetpeer.c534
-rw-r--r--net/ipv4/ip_forward.c40
-rw-r--r--net/ipv4/ip_fragment.c240
-rw-r--r--net/ipv4/ip_gre.c1483
-rw-r--r--net/ipv4/ip_input.c98
-rw-r--r--net/ipv4/ip_options.c208
-rw-r--r--net/ipv4/ip_output.c771
-rw-r--r--net/ipv4/ip_sockglue.c229
-rw-r--r--net/ipv4/ip_tunnel.c1062
-rw-r--r--net/ipv4/ip_tunnel_core.c204
-rw-r--r--net/ipv4/ip_vti.c603
-rw-r--r--net/ipv4/ipcomp.c53
-rw-r--r--net/ipv4/ipconfig.c327
-rw-r--r--net/ipv4/ipip.c864
-rw-r--r--net/ipv4/ipmr.c584
-rw-r--r--net/ipv4/netfilter.c141
-rw-r--r--net/ipv4/netfilter/Kconfig183
-rw-r--r--net/ipv4/netfilter/Makefile32
-rw-r--r--net/ipv4/netfilter/arp_tables.c103
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c6
-rw-r--r--net/ipv4/netfilter/arptable_filter.c9
-rw-r--r--net/ipv4/netfilter/ip_queue.c637
-rw-r--r--net/ipv4/netfilter/ip_tables.c122
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c173
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c517
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c42
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c98
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c110
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c126
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c482
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c208
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c134
-rw-r--r--net/ipv4/netfilter/ipt_ecn.c128
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c144
-rw-r--r--net/ipv4/netfilter/iptable_filter.c26
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c31
-rw-r--r--net/ipv4/netfilter/iptable_nat.c328
-rw-r--r--net/ipv4/netfilter/iptable_raw.c16
-rw-r--r--net/ipv4/netfilter/iptable_security.c12
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c247
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c35
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c154
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c17
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c85
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c774
-rw-r--r--net/ipv4/netfilter/nf_nat_ftp.c137
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c173
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c451
-rw-r--r--net/ipv4/netfilter/nf_nat_irc.c99
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c281
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c55
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_common.c124
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_dccp.c108
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c38
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c27
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_sctp.c97
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_tcp.c92
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udp.c83
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udplite.c99
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_unknown.c53
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c214
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c561
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c248
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c325
-rw-r--r--net/ipv4/netfilter/nf_nat_tftp.c51
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c104
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c129
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c199
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c90
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c75
-rw-r--r--net/ipv4/ping.c1218
-rw-r--r--net/ipv4/proc.c106
-rw-r--r--net/ipv4/protocol.c39
-rw-r--r--net/ipv4/raw.c238
-rw-r--r--net/ipv4/route.c3055
-rw-r--r--net/ipv4/syncookies.c162
-rw-r--r--net/ipv4/sysctl_net_ipv4.c433
-rw-r--r--net/ipv4/tcp.c1502
-rw-r--r--net/ipv4/tcp_bic.c19
-rw-r--r--net/ipv4/tcp_cong.c116
-rw-r--r--net/ipv4/tcp_cubic.c74
-rw-r--r--net/ipv4/tcp_diag.c20
-rw-r--r--net/ipv4/tcp_fastopen.c295
-rw-r--r--net/ipv4/tcp_highspeed.c9
-rw-r--r--net/ipv4/tcp_htcp.c8
-rw-r--r--net/ipv4/tcp_hybla.c31
-rw-r--r--net/ipv4/tcp_illinois.c19
-rw-r--r--net/ipv4/tcp_input.c3120
-rw-r--r--net/ipv4/tcp_ipv4.c1313
-rw-r--r--net/ipv4/tcp_lp.c10
-rw-r--r--net/ipv4/tcp_memcontrol.c228
-rw-r--r--net/ipv4/tcp_metrics.c1188
-rw-r--r--net/ipv4/tcp_minisocks.c266
-rw-r--r--net/ipv4/tcp_offload.c329
-rw-r--r--net/ipv4/tcp_output.c1534
-rw-r--r--net/ipv4/tcp_probe.c94
-rw-r--r--net/ipv4/tcp_scalable.c9
-rw-r--r--net/ipv4/tcp_timer.c169
-rw-r--r--net/ipv4/tcp_vegas.c14
-rw-r--r--net/ipv4/tcp_vegas.h10
-rw-r--r--net/ipv4/tcp_veno.c13
-rw-r--r--net/ipv4/tcp_westwood.c5
-rw-r--r--net/ipv4/tcp_yeah.c30
-rw-r--r--net/ipv4/tunnel4.c18
-rw-r--r--net/ipv4/udp.c942
-rw-r--r--net/ipv4/udp_diag.c216
-rw-r--r--net/ipv4/udp_impl.h36
-rw-r--r--net/ipv4/udp_offload.c250
-rw-r--r--net/ipv4/udplite.c23
-rw-r--r--net/ipv4/xfrm4_input.c11
-rw-r--r--net/ipv4/xfrm4_mode_beet.c7
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c22
-rw-r--r--net/ipv4/xfrm4_output.c53
-rw-r--r--net/ipv4/xfrm4_policy.c201
-rw-r--r--net/ipv4/xfrm4_protocol.c301
-rw-r--r--net/ipv4/xfrm4_state.c27
-rw-r--r--net/ipv4/xfrm4_tunnel.c24
143 files changed, 23359 insertions, 19860 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9e95d7fb6d5..05c57f0fcab 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -9,10 +9,7 @@ config IP_MULTICAST
intend to participate in the MBONE, a high bandwidth network on top
of the Internet which carries audio and video broadcasts. More
information about the MBONE is on the WWW at
- <http://www.savetz.com/mbone/>. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. For most people, it's
- safe to say N.
+ <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
config IP_ADVANCED_ROUTER
bool "IP: advanced router"
@@ -55,45 +52,9 @@ config IP_ADVANCED_ROUTER
If unsure, say N here.
-choice
- prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
- depends on IP_ADVANCED_ROUTER
- default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
- bool "FIB_HASH"
- ---help---
- Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
- bool "FIB_TRIE"
- ---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
- June 1999
-
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
- def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
config IP_FIB_TRIE_STATS
bool "FIB TRIE statistics"
- depends on IP_FIB_TRIE
+ depends on IP_ADVANCED_ROUTER
---help---
Keep track of statistics on structure of FIB TRIE table.
Useful for testing and measuring TRIE performance.
@@ -140,6 +101,9 @@ config IP_ROUTE_VERBOSE
handled by the klogd daemon which is responsible for kernel messages
("man klogd").
+config IP_ROUTE_CLASSID
+ bool
+
config IP_PNP
bool "IP: kernel level autoconfiguration"
help
@@ -196,11 +160,10 @@ config IP_PNP_RARP
operating on your network. Read
<file:Documentation/filesystems/nfs/nfsroot.txt> for details.
-# not yet ready..
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
+ select NET_IP_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -221,9 +184,14 @@ config NET_IPGRE_DEMUX
This is helper module to demultiplex GRE packets on GRE version field criteria.
Required by ip_gre and pptp modules.
+config NET_IP_TUNNEL
+ tristate
+ default n
+
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ select NET_IP_TUNNEL
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -252,10 +220,8 @@ config IP_MROUTE
packets that have several destination addresses. It is needed on the
MBONE, a high bandwidth network on top of the Internet which carries
audio and video broadcasts. In order to do that, you would most
- likely run the program mrouted. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. If you haven't heard
- about it, you don't need it.
+ likely run the program mrouted. If you haven't heard about it, you
+ don't need it.
config IP_MROUTE_MULTIPLE_TABLES
bool "IP: multicast policy routing"
@@ -293,22 +259,6 @@ config IP_PIMSM_V2
gated-5). This routing protocol is not used widely, so say N unless
you want to play with it.
-config ARPD
- bool "IP: ARP daemon support"
- ---help---
- The kernel maintains an internal cache which maps IP addresses to
- hardware addresses on the local network, so that Ethernet/Token Ring/
- etc. frames are sent to the proper address on the physical networking
- layer. Normally, kernel uses the ARP protocol to resolve these
- mappings.
-
- Saying Y here adds support to have an user space daemon to do this
- resolution instead. This is useful for implementing an alternate
- address resolution protocol (e.g. NHRP on mGRE tunnels) and also for
- testing purposes.
-
- If unsure, say N.
-
config SYN_COOKIES
bool "IP: TCP syncookie support"
---help---
@@ -345,9 +295,21 @@ config SYN_COOKIES
If unsure, say N.
+config NET_IPVTI
+ tristate "Virtual (secure) IP: tunneling"
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ depends on INET_XFRM_MODE_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use a routing protocol
+ on top.
+
config INET_AH
tristate "IP: AH transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_MD5
@@ -359,7 +321,7 @@ config INET_AH
config INET_ESP
tristate "IP: ESP transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
select CRYPTO_AUTHENC
select CRYPTO_HMAC
@@ -432,7 +394,9 @@ config INET_DIAG
---help---
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
- downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
+ downloadable at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
If unsure, say Y.
@@ -440,6 +404,14 @@ config INET_TCP_DIAG
depends on INET_DIAG
def_tristate INET_DIAG
+config INET_UDP_DIAG
+ tristate "UDP: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for UDP socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
menuconfig TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
---help---
@@ -502,7 +474,6 @@ config TCP_CONG_HTCP
config TCP_CONG_HSTCP
tristate "High Speed TCP"
- depends on EXPERIMENTAL
default n
---help---
Sally Floyd's High Speed TCP (RFC 3649) congestion control.
@@ -513,7 +484,6 @@ config TCP_CONG_HSTCP
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
- depends on EXPERIMENTAL
default n
---help---
TCP-Hybla is a sender-side only change that eliminates penalization of
@@ -523,7 +493,6 @@ config TCP_CONG_HYBLA
config TCP_CONG_VEGAS
tristate "TCP Vegas"
- depends on EXPERIMENTAL
default n
---help---
TCP Vegas is a sender-side only change to TCP that anticipates
@@ -534,7 +503,6 @@ config TCP_CONG_VEGAS
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
- depends on EXPERIMENTAL
default n
---help---
Scalable TCP is a sender-side only change to TCP which uses a
@@ -544,7 +512,6 @@ config TCP_CONG_SCALABLE
config TCP_CONG_LP
tristate "TCP Low Priority"
- depends on EXPERIMENTAL
default n
---help---
TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
@@ -554,7 +521,6 @@ config TCP_CONG_LP
config TCP_CONG_VENO
tristate "TCP Veno"
- depends on EXPERIMENTAL
default n
---help---
TCP Veno is a sender-side only enhancement of TCP to obtain better
@@ -566,7 +532,6 @@ config TCP_CONG_VENO
config TCP_CONG_YEAH
tristate "YeAH TCP"
- depends on EXPERIMENTAL
select TCP_CONG_VEGAS
default n
---help---
@@ -581,7 +546,6 @@ config TCP_CONG_YEAH
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
- depends on EXPERIMENTAL
default n
---help---
TCP-Illinois is a sender-side modification of TCP Reno for
@@ -645,8 +609,7 @@ config DEFAULT_TCP_CONG
default "cubic"
config TCP_MD5SIG
- bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "TCP: MD5 Signature Option support (RFC2385)"
select CRYPTO
select CRYPTO_MD5
---help---
@@ -655,4 +618,3 @@ config TCP_MD5SIG
on the Internet.
If unsure, say N.
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a7..f032688d20d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,21 +7,22 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o \
- datagram.o raw.o udp.o udplite.o \
- arp.o icmp.o devinet.o af_inet.o igmp.o \
- fib_frontend.o fib_semantics.o \
- inet_fragment.o
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_offload.o datagram.o raw.o udp.o udplite.o \
+ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
+gre-y := gre_demux.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
obj-$(CONFIG_INET_ESP) += esp4.o
@@ -36,6 +37,7 @@ obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
@@ -49,7 +51,8 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
- xfrm4_output.o
+ xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2b61107df6..d156b3c5f36 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -65,6 +65,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -89,7 +91,6 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/inet.h>
#include <linux/igmp.h>
@@ -105,14 +106,15 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
+#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
-#include <net/ipip.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
+#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
@@ -124,9 +126,6 @@
static struct list_head inetsw[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw_lock);
-struct ipv4_config ipv4_config;
-EXPORT_SYMBOL(ipv4_config);
-
/* New destruction routine */
void inet_sock_destruct(struct sock *sk)
@@ -153,8 +152,9 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON(sk->sk_wmem_queued);
WARN_ON(sk->sk_forward_alloc);
- kfree(inet->opt);
+ kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ dst_release(sk->sk_rx_dst);
sk_refcnt_debug_dec(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -209,6 +209,26 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
+ /* Check special setups for testing purpose to enable TFO w/o
+ * requiring TCP_FASTOPEN sockopt.
+ * Note that only TCP sockets (SOCK_STREAM) will reach here.
+ * Also fastopenq may already have been allocated because this
+ * socket was in TCP_LISTEN state previously but was
+ * shutdown() (rather than close()).
+ */
+ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+ inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+ if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+ err = fastopen_init_queue(sk, backlog);
+ else if ((sysctl_tcp_fastopen &
+ TFO_SERVER_WO_SOCKOPT2) != 0)
+ err = fastopen_init_queue(sk,
+ ((uint)sysctl_tcp_fastopen) >> 16);
+ else
+ err = 0;
+ if (err)
+ goto out;
+ }
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
@@ -222,41 +242,6 @@ out:
}
EXPORT_SYMBOL(inet_listen);
-u32 inet_ehash_secret __read_mostly;
-EXPORT_SYMBOL(inet_ehash_secret);
-
-/*
- * inet_ehash_secret must be set exactly once
- */
-void build_ehash_secret(void)
-{
- u32 rnd;
-
- do {
- get_random_bytes(&rnd, sizeof(rnd));
- } while (rnd == 0);
-
- cmpxchg(&inet_ehash_secret, 0, rnd);
-}
-EXPORT_SYMBOL(build_ehash_secret);
-
-static inline int inet_netns_ok(struct net *net, int protocol)
-{
- int hash;
- const struct net_protocol *ipprot;
-
- if (net_eq(net, &init_net))
- return 1;
-
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]);
-
- if (ipprot == NULL)
- /* raw IP is OK */
- return 1;
- return ipprot->netns_ok;
-}
-
/*
* Create an inet socket.
*/
@@ -269,14 +254,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
- char answer_no_check;
int try_loading_module = 0;
int err;
- if (unlikely(!inet_ehash_secret))
- if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
- build_ehash_secret();
-
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -325,16 +305,12 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
- goto out_rcu_unlock;
-
- err = -EAFNOSUPPORT;
- if (!inet_netns_ok(net, protocol))
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
answer_prot = answer->prot;
- answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
@@ -346,9 +322,8 @@ lookup_protocol:
goto out;
err = 0;
- sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
+ sk->sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
@@ -361,7 +336,7 @@ lookup_protocol:
inet->hdrincl = 1;
}
- if (ipv4_config.no_pmtu_disc)
+ if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
@@ -380,6 +355,7 @@ lookup_protocol:
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
+ inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
@@ -451,6 +427,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
int err;
@@ -464,7 +441,17 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+ if (addr->sin_family != AF_INET) {
+ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+ * only if s_addr is INADDR_ANY.
+ */
+ err = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_UNSPEC ||
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
+ }
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -484,7 +471,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ if (snum && snum < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
@@ -528,7 +516,7 @@ out:
}
EXPORT_SYMBOL(inet_bind);
-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -540,15 +528,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);
-static long inet_wait_for_connect(struct sock *sk, long timeo)
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending += writebias;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -564,6 +553,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
finish_wait(sk_sleep(sk), &wait);
+ sk->sk_write_pending -= writebias;
return timeo;
}
@@ -571,8 +561,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
@@ -581,8 +571,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;
- lock_sock(sk);
-
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -622,8 +610,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
/* Error code is set above */
- if (!timeo || !inet_wait_for_connect(sk, timeo))
+ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
goto out;
err = sock_intr_errno(timeo);
@@ -645,7 +637,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
err = 0;
out:
- release_sock(sk);
return err;
sock_error:
@@ -655,6 +646,18 @@ sock_error:
sock->state = SS_DISCONNECTING;
goto out;
}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ release_sock(sock->sk);
+ return err;
+}
EXPORT_SYMBOL(inet_stream_connect);
/*
@@ -672,8 +675,10 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
lock_sock(sk2);
+ sock_rps_record_flow(sk2);
WARN_ON(!((1 << sk2->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_CLOSE_WAIT | TCPF_CLOSE)));
sock_graft(sk2, newsock);
@@ -880,6 +885,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
EXPORT_SYMBOL(inet_ioctl);
+#ifdef CONFIG_COMPAT
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = -ENOIOCTLCMD;
+
+ if (sk->sk_prot->compat_ioctl)
+ err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+ return err;
+}
+#endif
+
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -903,6 +921,7 @@ const struct proto_ops inet_stream_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
@@ -929,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);
@@ -959,6 +979,7 @@ static const struct proto_ops inet_sockraw_ops = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
@@ -978,7 +999,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
@@ -988,17 +1008,22 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMP,
+ .prot = &ping_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_REUSE,
+ },
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
@@ -1048,13 +1073,11 @@ out:
return;
out_permanent:
- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
- protocol);
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
- printk(KERN_ERR
- "Ignoring attempt to register invalid socket type %d.\n",
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
@@ -1063,8 +1086,7 @@ EXPORT_SYMBOL(inet_register_protosw);
void inet_unregister_protosw(struct inet_protosw *p)
{
if (INET_PROTOSW_PERMANENT & p->flags) {
- printk(KERN_ERR
- "Attempt to unregister permanent protocol %d.\n",
+ pr_err("Attempt to unregister permanent protocol %d\n",
p->protocol);
} else {
spin_lock_bh(&inetsw_lock);
@@ -1085,34 +1107,36 @@ int sysctl_ip_dynaddr __read_mostly;
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
- int err;
- struct rtable *rt;
__be32 old_saddr = inet->inet_saddr;
- __be32 new_saddr;
__be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ __be32 new_saddr;
+ struct ip_options_rcu *inet_opt;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
/* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- sk->sk_protocol,
- inet->inet_sport, inet->inet_dport, sk, 0);
- if (err)
- return err;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if, sk->sk_protocol,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
sk_setup_caps(sk, &rt->dst);
- new_saddr = rt->rt_src;
+ new_saddr = fl4->saddr;
if (new_saddr == old_saddr)
return 0;
if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
- __func__, &old_saddr, &new_saddr);
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
}
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
@@ -1134,6 +1158,8 @@ int inet_sk_rebuild_header(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
__be32 daddr;
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
int err;
/* Route is OK, nothing to do. */
@@ -1141,28 +1167,23 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
daddr = inet->inet_daddr;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-{
- struct flowi fl = {
- .oif = sk->sk_bound_dev_if,
- .mark = sk->sk_mark,
- .fl4_dst = daddr,
- .fl4_src = inet->inet_saddr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = inet->inet_dport,
- };
-
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
- if (!err)
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rcu_read_unlock();
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (!IS_ERR(rt)) {
+ err = 0;
sk_setup_caps(sk, &rt->dst);
- else {
+ } else {
+ err = PTR_ERR(rt);
+
/* Routing failed... */
sk->sk_route_caps = 0;
/*
@@ -1182,8 +1203,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
static int inet_gso_send_check(struct sk_buff *skb)
{
- struct iphdr *iph;
- const struct net_protocol *ops;
+ const struct net_offload *ops;
+ const struct iphdr *iph;
int proto;
int ihl;
int err = -EINVAL;
@@ -1196,46 +1217,55 @@ static int inet_gso_send_check(struct sk_buff *skb)
if (ihl < sizeof(*iph))
goto out;
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might no longer be valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
-
__skb_pull(skb, ihl);
+
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
err = -EPROTONOSUPPORT;
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_send_check))
- err = ops->gso_send_check(skb);
- rcu_read_unlock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_send_check))
+ err = ops->callbacks.gso_send_check(skb);
out:
return err;
}
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ unsigned int offset = 0;
+ bool udpfrag, encap;
struct iphdr *iph;
- const struct net_protocol *ops;
int proto;
+ int nhoff;
int ihl;
int id;
- unsigned int offset = 0;
-
- if (!(features & NETIF_F_V4_CSUM))
- features &= ~NETIF_F_SG;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_MPLS |
0)))
goto out;
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
@@ -1244,39 +1274,53 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
if (ihl < sizeof(*iph))
goto out;
+ id = ntohs(iph->id);
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
-
__skb_pull(skb, ihl);
+
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ SKB_GSO_CB(skb)->encap_level += ihl;
+
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- id = ntohs(iph->id);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
segs = ERR_PTR(-EPROTONOSUPPORT);
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_segment))
- segs = ops->gso_segment(skb, features);
- rcu_read_unlock();
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ udpfrag = proto == IPPROTO_UDP && encap;
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
- if (!segs || IS_ERR(segs))
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
do {
- iph = ip_hdr(skb);
- if (proto == IPPROTO_UDP) {
+ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+ if (udpfrag) {
iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next != NULL)
iph->frag_off |= htons(IP_MF);
- offset += (skb->len - skb->mac_len - iph->ihl * 4);
- } else
+ offset += skb->len - nhoff - ihl;
+ } else {
iph->id = htons(id++);
- iph->tot_len = htons(skb->len - skb->mac_len);
- iph->check = 0;
- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+ }
+ iph->tot_len = htons(skb->len - nhoff);
+ ip_send_check(iph);
+ if (encap)
+ skb_reset_inner_headers(skb);
+ skb->network_header = (u8 *)iph - skb->head;
} while ((skb = skb->next));
out:
@@ -1286,10 +1330,10 @@ out:
static struct sk_buff **inet_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
- const struct net_protocol *ops;
+ const struct net_offload *ops;
struct sk_buff **pp = NULL;
struct sk_buff *p;
- struct iphdr *iph;
+ const struct iphdr *iph;
unsigned int hlen;
unsigned int off;
unsigned int id;
@@ -1305,21 +1349,21 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
goto out;
}
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ proto = iph->protocol;
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (!ops || !ops->gro_receive)
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
goto out_unlock;
if (*(u8 *)iph != 0x45)
goto out_unlock;
- if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ if (unlikely(ip_fast_csum((u8 *)iph, 5)))
goto out_unlock;
id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
for (p = *head; p; p = p->next) {
@@ -1328,10 +1372,13 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (!NAPI_GRO_CB(p)->same_flow)
continue;
- iph2 = ip_hdr(p);
-
+ iph2 = (struct iphdr *)(p->data + off);
+ /* The above works because, with the exception of the top
+ * (inner most) layer, we only aggregate pkts with the same
+ * hdr length so all the hdrs we'll need to verify will start
+ * at the same offset.
+ */
if ((iph->protocol ^ iph2->protocol) |
- (iph->tos ^ iph2->tos) |
((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
NAPI_GRO_CB(p)->same_flow = 0;
@@ -1341,16 +1388,29 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
/* All fields must match except length and checksum. */
NAPI_GRO_CB(p)->flush |=
(iph->ttl ^ iph2->ttl) |
- ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+ (iph->tos ^ iph2->tos) |
+ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+ /* Save the IP ID check to be included later when we get to
+ * the transport layer so only the inner most IP ID is checked.
+ * This is because some GSO/TSO implementations do not
+ * correctly increment the IP ID for the outer hdrs.
+ */
+ NAPI_GRO_CB(p)->flush_id =
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
}
NAPI_GRO_CB(skb)->flush |= flush;
+ skb_set_network_header(skb, off);
+ /* The above will be needed by the transport layer if there is one
+ * immediately following this IP hdr.
+ */
+
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
- pp = ops->gro_receive(head, skb);
+ pp = ops->callbacks.gro_receive(head, skb);
out_unlock:
rcu_read_unlock();
@@ -1361,23 +1421,30 @@ out:
return pp;
}
-static int inet_gro_complete(struct sk_buff *skb)
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
- const struct net_protocol *ops;
- struct iphdr *iph = ip_hdr(skb);
- int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ __be16 newlen = htons(skb->len - nhoff);
+ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ int proto = iph->protocol;
int err = -ENOSYS;
- __be16 newlen = htons(skb->len - skb_network_offset(skb));
+
+ if (skb->encapsulation)
+ skb_set_inner_network_header(skb, nhoff);
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (WARN_ON(!ops || !ops->gro_complete))
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock;
- err = ops->gro_complete(skb);
+ /* Only need to add sizeof(*iph) to get to the next hdr below
+ * because any hdr with option will have been flushed in
+ * inet_gro_receive().
+ */
+ err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
out_unlock:
rcu_read_unlock();
@@ -1407,82 +1474,44 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
int i;
- for_each_possible_cpu(i) {
- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
- }
+ for_each_possible_cpu(i)
+ res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
#if BITS_PER_LONG==32
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
u64 res = 0;
int cpu;
for_each_possible_cpu(cpu) {
- void *bhptr, *userptr;
+ void *bhptr;
struct u64_stats_sync *syncp;
- u64 v_bh, v_user;
+ u64 v;
unsigned int start;
- /* first mib used by softirq context, we must use _bh() accessors */
- bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+ bhptr = per_cpu_ptr(mib, cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
- start = u64_stats_fetch_begin_bh(syncp);
- v_bh = *(((u64 *) bhptr) + offt);
- } while (u64_stats_fetch_retry_bh(syncp, start));
+ start = u64_stats_fetch_begin_irq(syncp);
+ v = *(((u64 *) bhptr) + offt);
+ } while (u64_stats_fetch_retry_irq(syncp, start));
- /* second mib used in USER context */
- userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
- syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
- do {
- start = u64_stats_fetch_begin(syncp);
- v_user = *(((u64 *) userptr) + offt);
- } while (u64_stats_fetch_retry(syncp, start));
-
- res += v_bh + v_user;
+ res += v;
}
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field64);
#endif
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
-{
- BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize, align);
- if (!ptr[0])
- goto err0;
- ptr[1] = __alloc_percpu(mibsize, align);
- if (!ptr[1])
- goto err1;
- return 0;
-err1:
- free_percpu(ptr[0]);
- ptr[0] = NULL;
-err0:
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_init);
-
-void snmp_mib_free(void __percpu *ptr[2])
-{
- BUG_ON(ptr == NULL);
- free_percpu(ptr[0]);
- free_percpu(ptr[1]);
- ptr[0] = ptr[1] = NULL;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_free);
-
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1491,90 +1520,91 @@ static const struct net_protocol igmp_protocol = {
#endif
static const struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
- .no_policy = 1,
- .netns_ok = 1,
+ .early_demux = tcp_v4_early_demux,
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
};
static const struct net_protocol udp_protocol = {
+ .early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
- .gso_send_check = udp4_ufo_send_check,
- .gso_segment = udp4_ufo_fragment,
.no_policy = 1,
.netns_ok = 1,
};
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
+ .err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};
static __net_init int ipv4_mib_init_net(struct net *net)
{
- if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
- sizeof(struct tcp_mib),
- __alignof__(struct tcp_mib)) < 0)
+ int i;
+
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
goto err_tcp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
- sizeof(struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
goto err_ip_mib;
- if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
- sizeof(struct linux_mib),
- __alignof__(struct linux_mib)) < 0)
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet_stats;
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+ u64_stats_init(&af_inet_stats->syncp);
+ }
+
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
goto err_net_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
goto err_udp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
goto err_udplite_mib;
- if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
- sizeof(struct icmp_mib),
- __alignof__(struct icmp_mib)) < 0)
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
goto err_icmp_mib;
- if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
- sizeof(struct icmpmsg_mib),
- __alignof__(struct icmpmsg_mib)) < 0)
+ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpmsg_statistics)
goto err_icmpmsg_mib;
tcp_mib_init(net);
return 0;
err_icmpmsg_mib:
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+ free_percpu(net->mib.icmp_statistics);
err_icmp_mib:
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+ free_percpu(net->mib.udplite_statistics);
err_udplite_mib:
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+ free_percpu(net->mib.udp_statistics);
err_udp_mib:
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
+ free_percpu(net->mib.net_statistics);
err_net_mib:
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+ free_percpu(net->mib.ip_statistics);
err_ip_mib:
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ free_percpu(net->mib.tcp_statistics);
err_tcp_mib:
return -ENOMEM;
}
static __net_exit void ipv4_mib_exit_net(struct net *net)
{
- snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
- snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
- snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
- snmp_mib_free((void __percpu **)net->mib.udp_statistics);
- snmp_mib_free((void __percpu **)net->mib.net_statistics);
- snmp_mib_free((void __percpu **)net->mib.ip_statistics);
- snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+ kfree(net->mib.icmpmsg_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1587,37 +1617,95 @@ static int __init init_ipv4_mibs(void)
return register_pernet_subsys(&ipv4_mib_ops);
}
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ seqlock_init(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = 32768;
+ net->ipv4.ip_local_ports.range[1] = 61000;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+ return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+ .exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
static int ipv4_proc_init(void);
/*
* IP protocol layer initialiser
*/
+static struct packet_offload ip_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static const struct net_offload ipip_offload = {
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ },
+};
+
+static int __init ipv4_offload_init(void)
+{
+ /*
+ * Add offloads
+ */
+ if (udpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (tcpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+
+ dev_add_offload(&ip_packet_offload);
+ inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+ return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
- .gso_send_check = inet_gso_send_check,
- .gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
};
static int __init inet_init(void)
{
- struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
-
- sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
- if (!sysctl_local_reserved_ports)
- goto out;
+ BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
rc = proto_register(&tcp_prot, 1);
if (rc)
- goto out_free_reserved_ports;
+ goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
@@ -1627,6 +1715,10 @@ static int __init inet_init(void)
if (rc)
goto out_unregister_udp_proto;
+ rc = proto_register(&ping_prot, 1);
+ if (rc)
+ goto out_unregister_raw_proto;
+
/*
* Tell SOCKET that we are alive...
*/
@@ -1642,14 +1734,14 @@ static int __init inet_init(void)
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+ pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+ pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+ pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+ pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
@@ -1682,6 +1774,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ ping_init();
+
/*
* Set the ICMP layer up
*/
@@ -1694,14 +1788,17 @@ static int __init inet_init(void)
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
+ pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
+
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
+ pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
@@ -1712,12 +1809,12 @@ static int __init inet_init(void)
rc = 0;
out:
return rc;
+out_unregister_raw_proto:
+ proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
-out_free_reserved_ports:
- kfree(sysctl_local_reserved_ports);
goto out;
}
@@ -1736,11 +1833,15 @@ static int __init ipv4_proc_init(void)
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
+ if (ping_proc_init())
+ goto out_ping;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
+ ping_proc_exit();
+out_ping:
udp4_proc_exit();
out_udp:
tcp4_proc_exit();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 880a5ec6dce..a2afa89513a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -73,9 +75,9 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
* into IP header for icv calculation. Options are already checked
* for validity, so paranoia is not required. */
-static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
{
- unsigned char * optptr = (unsigned char*)(iph+1);
+ unsigned char *optptr = (unsigned char *)(iph+1);
int l = iph->ihl*4 - sizeof(struct iphdr);
int optlen;
@@ -136,8 +138,6 @@ static void ah_output_done(struct crypto_async_request *base, int err)
memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
}
- err = ah->nexthdr;
-
kfree(AH_SKB_CB(skb)->tmp);
xfrm_output_resume(skb, err);
}
@@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *iph, *top_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
ahp = x->data;
ahash = ahp->ahash;
@@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
ah = ip_auth_hdr(skb);
ihl = ip_hdrlen(skb);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
err = -ENOMEM;
- iph = ah_alloc_tmp(ahash, nfrags, ihl);
+ iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
if (!iph)
goto out;
-
- icv = ah_tmp_icv(ahash, iph, ihl);
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
memset(ah->auth_data, 0, ahp->icv_trunc_len);
@@ -201,16 +210,24 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->ttl = 0;
top_iph->check = 0;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
- ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
- ahash_request_set_crypt(req, sg, icv, skb->len);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
ahash_request_set_callback(req, 0, ah_output_done, skb);
AH_SKB_CB(skb)->tmp = iph;
@@ -261,12 +278,16 @@ static void ah_input_done(struct crypto_async_request *base, int err)
if (err)
goto out;
+ err = ah->nexthdr;
+
skb->network_header += ah_hlen;
memcpy(skb_network_header(skb), work_iph, ihl);
__skb_pull(skb, ah_hlen + ihl);
- skb_set_transport_header(skb, -ihl);
- err = ah->nexthdr;
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
out:
kfree(AH_SKB_CB(skb)->tmp);
xfrm_input_resume(skb, err);
@@ -288,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
struct ip_auth_hdr *ah;
struct ah_data *ahp;
int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
if (!pskb_may_pull(skb, sizeof(*ah)))
goto out;
@@ -299,37 +324,51 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
- if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
- goto out;
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
if (!pskb_may_pull(skb, ah_hlen))
goto out;
/* We are going to _remove_ AH header to keep sockets happy,
* so... Later this can change. */
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto out;
skb->ip_summed = CHECKSUM_NONE;
- ah = (struct ip_auth_hdr *)skb->data;
- iph = ip_hdr(skb);
- ihl = ip_hdrlen(skb);
if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
goto out;
nfrags = err;
- work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+ ah = (struct ip_auth_hdr *)skb->data;
+ iph = ip_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+ ahp->icv_trunc_len + seqhi_len);
if (!work_iph)
goto out;
- auth_data = ah_tmp_auth(work_iph, ihl);
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
memcpy(work_iph, iph, ihl);
memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
@@ -348,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
skb_push(skb, ihl);
- sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
- ahash_request_set_crypt(req, sg, icv, skb->len);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
ahash_request_set_callback(req, 0, ah_input_done, skb);
AH_SKB_CB(skb)->tmp = work_iph;
@@ -361,8 +405,6 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
if (err == -EINPROGRESS)
goto out;
- if (err == -EBUSY)
- err = NET_XMIT_DROP;
goto out_free;
}
@@ -373,7 +415,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
skb->network_header += ah_hlen;
memcpy(skb_network_header(skb), work_iph, ihl);
__skb_pull(skb, ah_hlen + ihl);
- skb_set_transport_header(skb, -ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
err = nexthdr;
@@ -383,23 +428,35 @@ out:
return err;
}
-static void ah4_err(struct sk_buff *skb, u32 info)
+static int ah4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
- struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
- x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ ah->spi, IPPROTO_AH, AF_INET);
if (!x)
- return;
- printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
- ntohl(ah->spi), ntohl(iph->daddr));
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
xfrm_state_put(x);
+
+ return 0;
}
static int ah_init_state(struct xfrm_state *x)
@@ -438,9 +495,10 @@ static int ah_init_state(struct xfrm_state *x)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_ahash_digestsize(ahash)) {
- printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_ahash_digestsize(ahash),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ pr_info("%s: %s digestsize %u != %hu\n",
+ __func__, x->aalg->alg_name,
+ crypto_ahash_digestsize(ahash),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
goto error;
}
@@ -449,8 +507,12 @@ static int ah_init_state(struct xfrm_state *x)
BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
- ahp->icv_trunc_len);
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
x->data = ahp;
@@ -476,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x)
kfree(ahp);
}
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
static const struct xfrm_type ah_type =
{
@@ -489,21 +555,22 @@ static const struct xfrm_type ah_type =
.output = ah_output
};
-static const struct net_protocol ah4_protocol = {
+static struct xfrm4_protocol ah4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ah4_rcv_cb,
.err_handler = ah4_err,
- .no_policy = 1,
- .netns_ok = 1,
+ .priority = 0,
};
static int __init ah4_init(void)
{
if (xfrm_register_type(&ah_type, AF_INET) < 0) {
- printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
- printk(KERN_INFO "ip ah init: can't add protocol\n");
+ if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ah_type, AF_INET);
return -EAGAIN;
}
@@ -512,10 +579,10 @@ static int __init ah4_init(void)
static void __exit ah4_fini(void)
{
- if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
- printk(KERN_INFO "ip ah close: can't remove protocol\n");
+ if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
- printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(ah4_init);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a2fc7b961db..1a9b99e0446 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -73,6 +73,8 @@
* Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -89,7 +91,6 @@
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
#include <linux/if_arp.h>
-#include <linux/trdevice.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -97,7 +98,6 @@
#include <linux/init.h>
#include <linux/net.h>
#include <linux/rcupdate.h>
-#include <linux/jhash.h>
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -113,13 +113,7 @@
#include <net/arp.h>
#include <net/ax25.h>
#include <net/netrom.h>
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-#include <net/atmclip.h>
-struct neigh_table *clip_tbl_hook;
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
-#include <asm/system.h>
#include <linux/uaccess.h>
#include <linux/netfilter_arp.h>
@@ -127,7 +121,7 @@ EXPORT_SYMBOL(clip_tbl_hook);
/*
* Interface to generic neighbour cache.
*/
-static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -139,8 +133,6 @@ static const struct neigh_ops arp_generic_ops = {
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_connected_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
static const struct neigh_ops arp_hh_ops = {
@@ -149,16 +141,12 @@ static const struct neigh_ops arp_hh_ops = {
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_resolve_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
static const struct neigh_ops arp_direct_ops = {
.family = AF_INET,
- .output = dev_queue_xmit,
- .connected_output = dev_queue_xmit,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
};
static const struct neigh_ops arp_broken_ops = {
@@ -167,13 +155,10 @@ static const struct neigh_ops arp_broken_ops = {
.error_report = arp_error_report,
.output = neigh_compat_output,
.connected_output = neigh_compat_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
struct neigh_table arp_tbl = {
.family = AF_INET,
- .entry_size = sizeof(struct neighbour) + 4,
.key_len = 4,
.hash = arp_hash,
.constructor = arp_constructor,
@@ -181,18 +166,20 @@ struct neigh_table arp_tbl = {
.id = "arp_cache",
.parms = {
.tbl = &arp_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
.reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ [NEIGH_VAR_LOCKTIME] = 1 * HZ,
+ },
},
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
@@ -209,12 +196,12 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
case ARPHRD_IEEE802:
ip_eth_mc_map(addr, haddr);
return 0;
- case ARPHRD_IEEE802_TR:
- ip_tr_mc_map(addr, haddr);
- return 0;
case ARPHRD_INFINIBAND:
ip_ib_mc_map(addr, dev->broadcast, haddr);
return 0;
+ case ARPHRD_IPGRE:
+ ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+ return 0;
default:
if (dir) {
memcpy(haddr, dev->broadcast, dev->addr_len);
@@ -227,9 +214,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
static u32 arp_hash(const void *pkey,
const struct net_device *dev,
- __u32 hash_rnd)
+ __u32 *hash_rnd)
{
- return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
+ return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);
}
static int arp_constructor(struct neighbour *neigh)
@@ -256,7 +243,7 @@ static int arp_constructor(struct neighbour *neigh)
if (!dev->header_ops) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
- neigh->output = neigh->ops->queue_xmit;
+ neigh->output = neigh_direct_output;
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)
@@ -289,9 +276,9 @@ static int arp_constructor(struct neighbour *neigh)
default:
break;
case ARPHRD_ROSE:
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
case ARPHRD_AX25:
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#if IS_ENABLED(CONFIG_NETROM)
case ARPHRD_NETROM:
#endif
neigh->ops = &arp_broken_ops;
@@ -336,7 +323,7 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
- u8 *dst_ha = NULL;
+ u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
struct net_device *dev = neigh->dev;
__be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
@@ -374,31 +361,27 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
- probes -= neigh->parms->ucast_probes;
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
if (probes < 0) {
if (!(neigh->nud_state & NUD_VALID))
- printk(KERN_DEBUG
- "trying to ucast probe in NUD_INVALID\n");
- dst_ha = neigh->ha;
- read_lock_bh(&neigh->lock);
+ pr_debug("trying to ucast probe in NUD_INVALID\n");
+ neigh_ha_snapshot(dst_ha, neigh, dev);
+ dst_hw = dst_ha;
} else {
- probes -= neigh->parms->app_probes;
+ probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
if (probes < 0) {
-#ifdef CONFIG_ARPD
neigh_app_ns(neigh);
-#endif
return;
}
}
arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_ha, dev->dev_addr, NULL);
- if (dst_ha)
- read_unlock_bh(&neigh->lock);
+ dst_hw, dev->dev_addr, NULL);
}
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
{
+ struct net *net = dev_net(in_dev->dev);
int scope;
switch (IN_DEV_ARP_IGNORE(in_dev)) {
@@ -417,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
case 3: /* Do not reply for scope host addresses */
sip = 0;
scope = RT_SCOPE_LINK;
+ in_dev = NULL;
break;
case 4: /* Reserved */
case 5:
@@ -428,19 +412,18 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
default:
return 0;
}
- return !inet_confirm_addr(in_dev, sip, tip, scope);
+ return !inet_confirm_addr(net, in_dev, sip, tip, scope);
}
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
- struct flowi fl = { .fl4_dst = sip,
- .fl4_src = tip };
struct rtable *rt;
int flag = 0;
/*unsigned long now; */
struct net *net = dev_net(dev);
- if (ip_route_output_key(net, &rt, &fl) < 0)
+ rt = ip_route_output(net, sip, tip, 0, 0);
+ if (IS_ERR(rt))
return 1;
if (rt->dst.dev != dev) {
NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
@@ -466,7 +449,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr,
{
switch (addr_hint) {
case RTN_LOCAL:
- printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+ pr_debug("arp called for own IP address\n");
memcpy(haddr, dev->dev_addr, dev->addr_len);
return 1;
case RTN_MULTICAST:
@@ -487,13 +470,12 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
struct neighbour *n;
if (!skb_dst(skb)) {
- printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+ pr_debug("arp_find is called with dst==NULL\n");
kfree_skb(skb);
return 1;
}
- paddr = skb_rtable(skb)->rt_gateway;
-
+ paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
paddr, dev))
return 0;
@@ -516,30 +498,6 @@ EXPORT_SYMBOL(arp_find);
/* END OF OBSOLETE FUNCTIONS */
-int arp_bind_neighbour(struct dst_entry *dst)
-{
- struct net_device *dev = dst->dev;
- struct neighbour *n = dst->neighbour;
-
- if (dev == NULL)
- return -EINVAL;
- if (n == NULL) {
- __be32 nexthop = ((struct rtable *)dst)->rt_gateway;
- if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
- nexthop = 0;
- n = __neigh_lookup_errno(
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
- dev->type == ARPHRD_ATM ?
- clip_tbl_hook :
-#endif
- &arp_tbl, &nexthop, dev);
- if (IS_ERR(n))
- return PTR_ERR(n);
- dst->neighbour = n;
- }
- return 0;
-}
-
/*
* Check if we can use proxy ARP for this path
*/
@@ -623,16 +581,18 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
struct sk_buff *skb;
struct arphdr *arp;
unsigned char *arp_ptr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
/*
* Allocate a buffer
*/
- skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
+ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
if (skb == NULL)
return NULL;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
skb->dev = dev;
@@ -664,13 +624,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp->ar_pro = htons(ETH_P_IP);
break;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
case ARPHRD_AX25:
arp->ar_hrd = htons(ARPHRD_AX25);
arp->ar_pro = htons(AX25_P_IP);
break;
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#if IS_ENABLED(CONFIG_NETROM)
case ARPHRD_NETROM:
arp->ar_hrd = htons(ARPHRD_NETROM);
arp->ar_pro = htons(AX25_P_IP);
@@ -678,18 +638,12 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
#endif
#endif
-#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
+#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
arp->ar_hrd = htons(ARPHRD_ETHER);
arp->ar_pro = htons(ETH_P_IP);
break;
#endif
-#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
- case ARPHRD_IEEE802_TR:
- arp->ar_hrd = htons(ARPHRD_IEEE802);
- arp->ar_pro = htons(ETH_P_IP);
- break;
-#endif
}
arp->ar_hln = dev->addr_len;
@@ -702,11 +656,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp_ptr += dev->addr_len;
memcpy(arp_ptr, &src_ip, 4);
arp_ptr += 4;
- if (target_hw != NULL)
- memcpy(arp_ptr, target_hw, dev->addr_len);
- else
- memset(arp_ptr, 0, dev->addr_len);
- arp_ptr += dev->addr_len;
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ }
memcpy(arp_ptr, &dest_ip, 4);
return skb;
@@ -770,6 +732,7 @@ static int arp_process(struct sk_buff *skb)
int addr_type;
struct neighbour *n;
struct net *net = dev_net(dev);
+ bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
* is ARP'able.
@@ -787,11 +750,10 @@ static int arp_process(struct sk_buff *skb)
goto out;
break;
case ARPHRD_ETHER:
- case ARPHRD_IEEE802_TR:
case ARPHRD_FDDI:
case ARPHRD_IEEE802:
/*
- * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
@@ -830,13 +792,21 @@ static int arp_process(struct sk_buff *skb)
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
- arp_ptr += dev->addr_len;
+ switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ arp_ptr += dev->addr_len;
+ }
memcpy(&tip, arp_ptr, 4);
/*
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
- if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+ if (ipv4_is_multicast(tip) ||
+ (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
goto out;
/*
@@ -898,14 +868,15 @@ static int arp_process(struct sk_buff *skb)
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
- pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
+ (rt->dst.dev != dev &&
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
- in_dev->arp_parms->proxy_delay == 0) {
+ NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
dev, tip, sha, dev->dev_addr,
sha);
@@ -923,15 +894,17 @@ static int arp_process(struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
- if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) {
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
+ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
+ inet_addr_type(net, sip) == RTN_UNICAST;
+
if (n == NULL &&
- (arp->ar_op == htons(ARPOP_REPLY) ||
- (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
- inet_addr_type(net, sip) == RTN_UNICAST)
+ ((arp->ar_op == htons(ARPOP_REPLY) &&
+ inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
@@ -944,7 +917,10 @@ static int arp_process(struct sk_buff *skb)
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
- override = time_after(jiffies, n->updated + n->parms->locktime);
+ override = time_after(jiffies,
+ n->updated +
+ NEIGH_VAR(n->parms, LOCKTIME)) ||
+ is_garp;
/* Broadcast replies and request packets
do not assert neighbour reachability.
@@ -975,24 +951,25 @@ static void parp_redo(struct sk_buff *skb)
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct arphdr *arp;
+ const struct arphdr *arp;
+
+ if (dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK)
+ goto freeskb;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
if (!pskb_may_pull(skb, arp_hdr_len(dev)))
goto freeskb;
arp = arp_hdr(skb);
- if (arp->ar_hln != dev->addr_len ||
- dev->flags & IFF_NOARP ||
- skb->pkt_type == PACKET_OTHERHOST ||
- skb->pkt_type == PACKET_LOOPBACK ||
- arp->ar_pln != 4)
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
goto freeskb;
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (skb == NULL)
- goto out_of_mem;
-
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
@@ -1017,14 +994,13 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
return 0;
}
- if (__in_dev_get_rcu(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on);
+ if (__in_dev_get_rtnl(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
return 0;
}
return -ENXIO;
}
-/* must be called with rcu_read_lock() */
static int arp_req_set_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
@@ -1062,19 +1038,17 @@ static int arp_req_set(struct net *net, struct arpreq *r,
if (r->arp_flags & ATF_PERM)
r->arp_flags |= ATF_COM;
if (dev == NULL) {
- struct flowi fl = { .fl4_dst = ip,
- .fl4_tos = RTO_ONLINK };
- struct rtable *rt;
- err = ip_route_output_key(net, &rt, &fl);
- if (err != 0)
- return err;
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
return -EINVAL;
}
switch (dev->type) {
-#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
+#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
/*
* According to RFC 1390, FDDI devices should accept ARP
@@ -1097,7 +1071,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
err = PTR_ERR(neigh);
if (!IS_ERR(neigh)) {
- unsigned state = NUD_STALE;
+ unsigned int state = NUD_STALE;
if (r->arp_flags & ATF_PERM)
state = NUD_PERMANENT;
err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
@@ -1109,7 +1083,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
return err;
}
-static unsigned arp_state_to_flags(struct neighbour *neigh)
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
{
if (neigh->nud_state&NUD_PERMANENT)
return ATF_PERM | ATF_COM;
@@ -1143,6 +1117,22 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
return err;
}
+static int arp_invalidate(struct net_device *dev, __be32 ip)
+{
+ struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ int err = -ENXIO;
+
+ if (neigh) {
+ if (neigh->nud_state & ~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+
+ return err;
+}
+
static int arp_req_delete_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
@@ -1161,36 +1151,22 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
static int arp_req_delete(struct net *net, struct arpreq *r,
struct net_device *dev)
{
- int err;
__be32 ip;
- struct neighbour *neigh;
if (r->arp_flags & ATF_PUBL)
return arp_req_delete_public(net, r, dev);
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (dev == NULL) {
- struct flowi fl = { .fl4_dst = ip,
- .fl4_tos = RTO_ONLINK };
- struct rtable *rt;
- err = ip_route_output_key(net, &rt, &fl);
- if (err != 0)
- return err;
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
return -EINVAL;
}
- err = -ENXIO;
- neigh = neigh_lookup(&arp_tbl, &ip, dev);
- if (neigh) {
- if (neigh->nud_state & ~NUD_NOARP)
- err = neigh_update(neigh, NULL, NUD_FAILED,
- NEIGH_UPDATE_F_OVERRIDE|
- NEIGH_UPDATE_F_ADMIN);
- neigh_release(neigh);
- }
- return err;
+ return arp_invalidate(dev, ip);
}
/*
@@ -1206,7 +1182,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCDARP:
case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
@@ -1226,10 +1202,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!(r.arp_flags & ATF_NETMASK))
((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
htonl(0xFFFFFFFFUL);
- rcu_read_lock();
+ rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- dev = dev_get_by_name_rcu(net, r.arp_dev);
+ dev = __dev_get_by_name(net, r.arp_dev);
if (dev == NULL)
goto out;
@@ -1256,7 +1232,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
break;
}
out:
- rcu_read_unlock();
+ rtnl_unlock();
if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
err = -EFAULT;
return err;
@@ -1265,12 +1241,18 @@ out:
static int arp_netdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&arp_tbl, dev);
- rt_cache_flush(dev_net(dev), 0);
+ rt_cache_flush(dev_net(dev));
+ break;
+ case NETDEV_CHANGE:
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&arp_tbl, dev);
break;
default:
break;
@@ -1311,13 +1293,13 @@ void __init arp_init(void)
dev_add_pack(&arp_packet_type);
arp_proc_init();
#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
#endif
register_netdevice_notifier(&arp_netdev_notifier);
}
#ifdef CONFIG_PROC_FS
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
/* ------------------------------------------------------------------------ */
/*
@@ -1365,7 +1347,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
read_lock(&n->lock);
/* Convert hardware address to XX:XX:XX:XX ... form. */
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
ax2asc2((ax25_address *)n->ha, hbuffer);
else {
@@ -1378,7 +1360,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
if (k != 0)
--k;
hbuffer[k] = 0;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
}
#endif
sprintf(tbuf, "%pI4", n->primary_key);
@@ -1451,14 +1433,14 @@ static const struct file_operations arp_seq_fops = {
static int __net_init arp_net_init(struct net *net)
{
- if (!proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops))
+ if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
return -ENOMEM;
return 0;
}
static void __net_exit arp_net_exit(struct net *net)
{
- proc_net_remove(net, "arp");
+ remove_proc_entry("arp", net->proc_net);
}
static struct pernet_operations arp_net_ops = {
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 094e150c626..69e77c8ff28 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -31,8 +31,7 @@
* the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -50,7 +49,7 @@
#include <net/tcp.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/bug.h>
#include <asm/unaligned.h>
@@ -112,7 +111,7 @@ int cipso_v4_rbm_strictvalid = 1;
/* The maximum number of category ranges permitted in the ranged category tag
* (tag #5). You may note that the IETF draft states that the maximum number
* of category ranges is 7, but if the low end of the last category range is
- * zero then it is possibile to fit 8 category ranges because the zero should
+ * zero then it is possible to fit 8 category ranges because the zero should
* be omitted. */
#define CIPSO_V4_TAG_RNG_CAT_MAX 8
@@ -438,7 +437,7 @@ cache_add_failure:
*
* Description:
* Search the DOI definition list for a DOI definition with a DOI value that
- * matches @doi. The caller is responsibile for calling rcu_read_[un]lock().
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
* Returns a pointer to the DOI definition on success and NULL on failure.
*/
static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
@@ -476,7 +475,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
doi = doi_def->doi;
doi_type = doi_def->type;
- if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+ if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
goto doi_add_return;
for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
switch (doi_def->tags[iter]) {
@@ -1293,7 +1292,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
return ret_val;
/* This will send packets using the "optimized" format when
- * possibile as specified in section 3.4.2.6 of the
+ * possible as specified in section 3.4.2.6 of the
* CIPSO draft. */
if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
tag_len = 14;
@@ -1336,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->attr.mls.cat =
- netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
@@ -1432,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->attr.mls.cat =
- netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
@@ -1527,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->attr.mls.cat =
- netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
@@ -1725,8 +1721,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
case CIPSO_V4_TAG_LOCAL:
/* This is a non-standard tag that we only allow for
* local connections, so if the incoming interface is
- * not the loopback device drop the packet. */
- if (!(skb->dev->flags & IFF_LOOPBACK)) {
+ * not the loopback device drop the packet. Further,
+ * there is no legitimate reason for setting this from
+ * userspace so reject it if skb is NULL. */
+ if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
err_offset = opt_iter;
goto validate_return_locked;
}
@@ -1752,7 +1750,7 @@ validate_return:
}
/**
- * cipso_v4_error - Send the correct reponse for a bad packet
+ * cipso_v4_error - Send the correct response for a bad packet
* @skb: the packet
* @error: the error code
* @gateway: CIPSO gateway flag
@@ -1879,7 +1877,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
unsigned char *buf = NULL;
u32 buf_len;
u32 opt_len;
- struct ip_options *opt = NULL;
+ struct ip_options_rcu *old, *opt = NULL;
struct inet_sock *sk_inet;
struct inet_connection_sock *sk_conn;
@@ -1915,22 +1913,25 @@ int cipso_v4_sock_setattr(struct sock *sk,
ret_val = -ENOMEM;
goto socket_setattr_failure;
}
- memcpy(opt->__data, buf, buf_len);
- opt->optlen = opt_len;
- opt->cipso = sizeof(struct iphdr);
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
kfree(buf);
buf = NULL;
sk_inet = inet_sk(sk);
+
+ old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
if (sk_inet->is_icsk) {
sk_conn = inet_csk(sk);
- if (sk_inet->opt)
- sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen;
- sk_conn->icsk_ext_hdr_len += opt->optlen;
+ if (old)
+ sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+ sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
}
- opt = xchg(&sk_inet->opt, opt);
- kfree(opt);
+ rcu_assign_pointer(sk_inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
@@ -1960,7 +1961,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
unsigned char *buf = NULL;
u32 buf_len;
u32 opt_len;
- struct ip_options *opt = NULL;
+ struct ip_options_rcu *opt = NULL;
struct inet_request_sock *req_inet;
/* We allocate the maximum CIPSO option size here so we are probably
@@ -1988,15 +1989,16 @@ int cipso_v4_req_setattr(struct request_sock *req,
ret_val = -ENOMEM;
goto req_setattr_failure;
}
- memcpy(opt->__data, buf, buf_len);
- opt->optlen = opt_len;
- opt->cipso = sizeof(struct iphdr);
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
kfree(buf);
buf = NULL;
req_inet = inet_rsk(req);
opt = xchg(&req_inet->opt, opt);
- kfree(opt);
+ if (opt)
+ kfree_rcu(opt, rcu);
return 0;
@@ -2016,34 +2018,34 @@ req_setattr_failure:
* values on failure.
*
*/
-static int cipso_v4_delopt(struct ip_options **opt_ptr)
+static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
{
int hdr_delta = 0;
- struct ip_options *opt = *opt_ptr;
+ struct ip_options_rcu *opt = *opt_ptr;
- if (opt->srr || opt->rr || opt->ts || opt->router_alert) {
+ if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
u8 cipso_len;
u8 cipso_off;
unsigned char *cipso_ptr;
int iter;
int optlen_new;
- cipso_off = opt->cipso - sizeof(struct iphdr);
- cipso_ptr = &opt->__data[cipso_off];
+ cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+ cipso_ptr = &opt->opt.__data[cipso_off];
cipso_len = cipso_ptr[1];
- if (opt->srr > opt->cipso)
- opt->srr -= cipso_len;
- if (opt->rr > opt->cipso)
- opt->rr -= cipso_len;
- if (opt->ts > opt->cipso)
- opt->ts -= cipso_len;
- if (opt->router_alert > opt->cipso)
- opt->router_alert -= cipso_len;
- opt->cipso = 0;
+ if (opt->opt.srr > opt->opt.cipso)
+ opt->opt.srr -= cipso_len;
+ if (opt->opt.rr > opt->opt.cipso)
+ opt->opt.rr -= cipso_len;
+ if (opt->opt.ts > opt->opt.cipso)
+ opt->opt.ts -= cipso_len;
+ if (opt->opt.router_alert > opt->opt.cipso)
+ opt->opt.router_alert -= cipso_len;
+ opt->opt.cipso = 0;
memmove(cipso_ptr, cipso_ptr + cipso_len,
- opt->optlen - cipso_off - cipso_len);
+ opt->opt.optlen - cipso_off - cipso_len);
/* determining the new total option length is tricky because of
* the padding necessary, the only thing i can think to do at
@@ -2052,21 +2054,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
* from there we can determine the new total option length */
iter = 0;
optlen_new = 0;
- while (iter < opt->optlen)
- if (opt->__data[iter] != IPOPT_NOP) {
- iter += opt->__data[iter + 1];
+ while (iter < opt->opt.optlen)
+ if (opt->opt.__data[iter] != IPOPT_NOP) {
+ iter += opt->opt.__data[iter + 1];
optlen_new = iter;
} else
iter++;
- hdr_delta = opt->optlen;
- opt->optlen = (optlen_new + 3) & ~3;
- hdr_delta -= opt->optlen;
+ hdr_delta = opt->opt.optlen;
+ opt->opt.optlen = (optlen_new + 3) & ~3;
+ hdr_delta -= opt->opt.optlen;
} else {
/* only the cipso option was present on the socket so we can
* remove the entire option struct */
*opt_ptr = NULL;
- hdr_delta = opt->optlen;
- kfree(opt);
+ hdr_delta = opt->opt.optlen;
+ kfree_rcu(opt, rcu);
}
return hdr_delta;
@@ -2083,15 +2085,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
void cipso_v4_sock_delattr(struct sock *sk)
{
int hdr_delta;
- struct ip_options *opt;
+ struct ip_options_rcu *opt;
struct inet_sock *sk_inet;
sk_inet = inet_sk(sk);
- opt = sk_inet->opt;
- if (opt == NULL || opt->cipso == 0)
+ opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+ if (opt == NULL || opt->opt.cipso == 0)
return;
- hdr_delta = cipso_v4_delopt(&sk_inet->opt);
+ hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
if (sk_inet->is_icsk && hdr_delta > 0) {
struct inet_connection_sock *sk_conn = inet_csk(sk);
sk_conn->icsk_ext_hdr_len -= hdr_delta;
@@ -2109,12 +2111,12 @@ void cipso_v4_sock_delattr(struct sock *sk)
*/
void cipso_v4_req_delattr(struct request_sock *req)
{
- struct ip_options *opt;
+ struct ip_options_rcu *opt;
struct inet_request_sock *req_inet;
req_inet = inet_rsk(req);
opt = req_inet->opt;
- if (opt == NULL || opt->cipso == 0)
+ if (opt == NULL || opt->opt.cipso == 0)
return;
cipso_v4_delopt(&req_inet->opt);
@@ -2184,14 +2186,18 @@ getattr_return:
*/
int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
{
- struct ip_options *opt;
-
- opt = inet_sk(sk)->opt;
- if (opt == NULL || opt->cipso == 0)
- return -ENOMSG;
+ struct ip_options_rcu *opt;
+ int res = -ENOMSG;
- return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr),
- secattr);
+ rcu_read_lock();
+ opt = rcu_dereference(inet_sk(sk)->inet_opt);
+ if (opt && opt->opt.cipso)
+ res = cipso_v4_getattr(opt->opt.__data +
+ opt->opt.cipso -
+ sizeof(struct iphdr),
+ secattr);
+ rcu_read_unlock();
+ return res;
}
/**
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 174be6caa5c..a3095fdefbe 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct flowi4 *fl4;
struct rtable *rt;
__be32 saddr;
int oif;
@@ -38,6 +39,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk_dst_reset(sk);
+ lock_sock(sk);
+
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
@@ -46,33 +49,74 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!saddr)
saddr = inet->mc_addr;
}
- err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
- RT_CONN_FLAGS(sk), oif,
- sk->sk_protocol,
- inet->inet_sport, usin->sin_port, sk, 1);
- if (err) {
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
+ RT_CONN_FLAGS(sk), oif,
+ sk->sk_protocol,
+ inet->inet_sport, usin->sin_port, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
- return err;
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ goto out;
}
if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
ip_rt_put(rt);
- return -EACCES;
+ err = -EACCES;
+ goto out;
}
if (!inet->inet_saddr)
- inet->inet_saddr = rt->rt_src; /* Update source address */
+ inet->inet_saddr = fl4->saddr; /* Update source address */
if (!inet->inet_rcv_saddr) {
- inet->inet_rcv_saddr = rt->rt_src;
+ inet->inet_rcv_saddr = fl4->saddr;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
- inet->inet_daddr = rt->rt_dst;
+ inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
- return 0;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
}
EXPORT_SYMBOL(ip4_datagram_connect);
+
+/* Because UDP xmit path can manipulate sk_dst_cache without holding
+ * socket lock, we need to use sk_dst_set() here,
+ * even if we own the socket lock.
+ */
+void ip4_datagram_release_cb(struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+ rcu_read_unlock();
+ return;
+ }
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+
+ dst = !IS_ERR(rt) ? &rt->dst : NULL;
+ sk_dst_set(sk, dst);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3b067704ab3..e9449376b58 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -27,7 +27,6 @@
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
@@ -51,10 +50,12 @@
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/slab.h>
+#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/kmod.h>
+#include <linux/netconf.h>
#include <net/arp.h>
#include <net/ip.h>
@@ -62,6 +63,9 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/addrconf.h>
+
+#include "fib_lookup.h"
static struct ipv4_devconf ipv4_devconf = {
.data = {
@@ -69,6 +73,8 @@ static struct ipv4_devconf ipv4_devconf = {
[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
},
};
@@ -79,6 +85,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
},
};
@@ -90,8 +98,82 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_ADDRESS] = { .type = NLA_U32 },
[IFA_BROADCAST] = { .type = NLA_U32 },
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .type = NLA_U32 },
};
+#define IN4_ADDR_HSIZE_SHIFT 8
+#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
+
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+
+/* Map an IPv4 address, mixed with a per-namespace value, to an index into
+ * inet_addr_lst[].
+ */
+static u32 inet_addr_hash(struct net *net, __be32 addr)
+{
+	return hash_32((__force u32) addr ^ net_hash_mix(net),
+		       IN4_ADDR_HSIZE_SHIFT);
+}
+
+/* Link @ifa into the local address hash, keyed by its ifa_local address.
+ * The caller must hold RTNL.
+ */
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	struct hlist_head *bucket;
+
+	bucket = &inet_addr_lst[inet_addr_hash(net, ifa->ifa_local)];
+
+	ASSERT_RTNL();
+	hlist_add_head_rcu(&ifa->hash, bucket);
+}
+
+/* Unlink @ifa from the local address hash. The caller must hold RTNL. */
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ ASSERT_RTNL();
+ hlist_del_init_rcu(&ifa->hash);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	struct hlist_head *bucket = &inet_addr_lst[inet_addr_hash(net, addr)];
+	struct net_device *found = NULL;
+	struct in_ifaddr *ifa;
+
+	rcu_read_lock();
+
+	/* First try the hash of configured local addresses. */
+	hlist_for_each_entry_rcu(ifa, bucket, hash) {
+		struct net_device *dev;
+
+		if (ifa->ifa_local != addr)
+			continue;
+
+		/* The table is global, so entries of other namespaces
+		 * can share a bucket; only accept a device from @net.
+		 */
+		dev = ifa->ifa_dev->dev;
+		if (net_eq(dev_net(dev), net)) {
+			found = dev;
+			break;
+		}
+	}
+
+	if (!found) {
+		struct flowi4 fl4 = { .daddr = addr };
+		struct fib_result res = { 0 };
+		struct fib_table *local = fib_get_table(net, RT_TABLE_LOCAL);
+
+		/* Fallback to FIB local table so that communication
+		 * over loopback subnets work.
+		 */
+		if (local &&
+		    fib_table_lookup(local, &fl4, &res,
+				     FIB_LOOKUP_NOREF) == 0 &&
+		    res.type == RTN_LOCAL)
+			found = FIB_RES_DEV(res);
+	}
+
+	/* The reference is taken before leaving the RCU section. */
+	if (devref && found)
+		dev_hold(found);
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -101,10 +183,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
static void devinet_sysctl_register(struct in_device *idev);
static void devinet_sysctl_unregister(struct in_device *idev);
#else
-static inline void devinet_sysctl_register(struct in_device *idev)
+static void devinet_sysctl_register(struct in_device *idev)
{
}
-static inline void devinet_sysctl_unregister(struct in_device *idev)
+static void devinet_sysctl_unregister(struct in_device *idev)
{
}
#endif
@@ -124,7 +206,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
kfree(ifa);
}
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct in_ifaddr *ifa)
{
call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
}
@@ -135,9 +217,9 @@ void in_dev_finish_destroy(struct in_device *idev)
WARN_ON(idev->ifa_list);
WARN_ON(idev->mc_list);
+ kfree(rcu_dereference_protected(idev->mc_hash, 1));
#ifdef NET_REFCNT_DEBUG
- printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
- idev, dev ? dev->name : "NIL");
+ pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
dev_put(dev);
if (!idev->dead)
@@ -209,7 +291,7 @@ static void inetdev_destroy(struct in_device *in_dev)
inet_free_ifa(ifa);
}
- rcu_assign_pointer(dev->ip_ptr, NULL);
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
devinet_sysctl_unregister(in_dev);
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -234,7 +316,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
}
static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
- int destroy, struct nlmsghdr *nlh, u32 pid)
+ int destroy, struct nlmsghdr *nlh, u32 portid)
{
struct in_ifaddr *promote = NULL;
struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -265,9 +347,10 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
if (!do_promote) {
+ inet_hash_remove(ifa);
*ifap1 = ifa->ifa_next;
- rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+ rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_DOWN, ifa);
inet_free_ifa(ifa);
@@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
}
+ /* On promotion all secondaries from subnet are changing
+ * the primary IP, we must remove all their routes silently
+ * and later to add them back with new prefsrc. Do this
+ * while all addresses are on the device list.
+ */
+ for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa))
+ fib_del_ifaddr(ifa, ifa1);
+ }
+
/* 2. Unlink it */
*ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
/* 3. Announce address deletion */
@@ -292,10 +387,11 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
is valid, it will try to restore deleted routes... Grr.
So that, this order is correct.
*/
- rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
if (promote) {
+ struct in_ifaddr *next_sec = promote->ifa_next;
if (prev_prom) {
prev_prom->ifa_next = promote->ifa_next;
@@ -304,10 +400,10 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
promote->ifa_flags &= ~IFA_F_SECONDARY;
- rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+ rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_UP, promote);
- for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
+ for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
if (ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa))
continue;
@@ -325,8 +421,12 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
}
+static void check_lifetime(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
+
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
- u32 pid)
+ u32 portid)
{
struct in_device *in_dev = ifa->ifa_dev;
struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -361,17 +461,22 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
}
if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
- net_srandom(ifa->ifa_local);
+ prandom_seed((__force u32) ifa->ifa_local);
ifap = last_primary;
}
ifa->ifa_next = *ifap;
*ifap = ifa;
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
- rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
return 0;
@@ -393,6 +498,7 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
return -ENOBUFS;
}
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
if (ifa->ifa_dev != in_dev) {
WARN_ON(ifa->ifa_dev);
in_dev_hold(in_dev);
@@ -434,7 +540,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
return NULL;
}
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
@@ -470,7 +576,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
!inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
continue;
- __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
+ __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
return 0;
}
@@ -479,7 +585,132 @@ errout:
return err;
}
-static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
+#define INFINITY_LIFE_TIME 0xFFFFFFFF
+
+/* Delayed-work handler that walks every bucket of inet_addr_lst[] and
+ * applies address lifetimes: addresses whose valid lifetime has elapsed
+ * are deleted, addresses whose preferred lifetime has elapsed are flagged
+ * IFA_F_DEPRECATED. It re-queues itself before returning.
+ */
+static void check_lifetime(struct work_struct *work)
+{
+ unsigned long now, next, next_sec, next_sched;
+ struct in_ifaddr *ifa;
+ struct hlist_node *n;
+ int i;
+
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ bool change_needed = false;
+
+ /* Pass 1, under RCU only: decide whether this bucket needs any
+ * modification and pull 'next' forward to the earliest upcoming
+ * expiry seen.
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ change_needed = true;
+ } else if (ifa->ifa_preferred_lft ==
+ INFINITY_LIFE_TIME) {
+ continue;
+ } else if (age >= ifa->ifa_preferred_lft) {
+ if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ, next))
+ next = ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ;
+
+ if (!(ifa->ifa_flags & IFA_F_DEPRECATED))
+ change_needed = true;
+ } else if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ,
+ next)) {
+ next = ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ;
+ }
+ }
+ rcu_read_unlock();
+ if (!change_needed)
+ continue;
+ /* Pass 2, under RTNL: re-evaluate each entry and apply the
+ * change. The _safe iterator is used because entries may be
+ * deleted while walking.
+ */
+ rtnl_lock();
+ hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ struct in_ifaddr **ifap;
+
+ /* inet_del_ifa() wants the link pointing at the
+ * entry, so locate it in the device's list.
+ */
+ for (ifap = &ifa->ifa_dev->ifa_list;
+ *ifap != NULL; ifap = &(*ifap)->ifa_next) {
+ if (*ifap == ifa) {
+ inet_del_ifa(ifa->ifa_dev,
+ ifap, 1);
+ break;
+ }
+ }
+ } else if (ifa->ifa_preferred_lft !=
+ INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_preferred_lft &&
+ !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+ }
+ rtnl_unlock();
+ }
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ now = jiffies;
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+ /* Re-arm: the delay is relative, hence the subtraction of 'now'. */
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
+ next_sched - now);
+}
+
+/* Apply a valid/preferred lifetime pair to @ifa and refresh its timestamps.
+ *
+ * IFA_F_PERMANENT and IFA_F_DEPRECATED are recomputed from scratch: a
+ * non-finite valid lifetime marks the address permanent, a finite preferred
+ * lifetime of zero marks it deprecated. ifa_cstamp is only set when it is
+ * still zero.
+ */
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+			     __u32 prefered_lft)
+{
+	unsigned long valid_to = addrconf_timeout_fixup(valid_lft, HZ);
+	unsigned long pref_to = addrconf_timeout_fixup(prefered_lft, HZ);
+
+	ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+
+	if (!addrconf_finite_timeout(valid_to))
+		ifa->ifa_flags |= IFA_F_PERMANENT;
+	else
+		ifa->ifa_valid_lft = valid_to;
+
+	if (addrconf_finite_timeout(pref_to)) {
+		ifa->ifa_preferred_lft = pref_to;
+		if (!pref_to)
+			ifa->ifa_flags |= IFA_F_DEPRECATED;
+	}
+
+	ifa->ifa_tstamp = jiffies;
+	if (ifa->ifa_cstamp == 0)
+		ifa->ifa_cstamp = ifa->ifa_tstamp;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
+ __u32 *pvalid_lft, __u32 *pprefered_lft)
{
struct nlattr *tb[IFA_MAX+1];
struct in_ifaddr *ifa;
@@ -516,14 +747,17 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
goto errout;
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
in_dev_hold(in_dev);
if (tb[IFA_ADDRESS] == NULL)
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
- ifa->ifa_flags = ifm->ifa_flags;
+ ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
+ ifm->ifa_flags;
ifa->ifa_scope = ifm->ifa_scope;
ifa->ifa_dev = in_dev;
@@ -538,31 +772,87 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ err = -EINVAL;
+ goto errout_free;
+ }
+ *pvalid_lft = ci->ifa_valid;
+ *pprefered_lft = ci->ifa_prefered;
+ }
+
return ifa;
+errout_free:
+ inet_free_ifa(ifa);
errout:
return ERR_PTR(err);
}
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+/* Search the device of @ifa for an already configured address with the same
+ * mask, the same subnet and the same local address. Returns that entry, or
+ * NULL when there is none or when @ifa has no local address.
+ */
+static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+{
+	struct in_ifaddr *cur;
+
+	if (!ifa->ifa_local)
+		return NULL;
+
+	for (cur = ifa->ifa_dev->ifa_list; cur; cur = cur->ifa_next) {
+		if (cur->ifa_mask != ifa->ifa_mask)
+			continue;
+		if (!inet_ifa_match(cur->ifa_address, ifa))
+			continue;
+		if (cur->ifa_local == ifa->ifa_local)
+			return cur;
+	}
+
+	return NULL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct in_ifaddr *ifa;
+ struct in_ifaddr *ifa_existing;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
ASSERT_RTNL();
- ifa = rtm_to_ifaddr(net, nlh);
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
if (IS_ERR(ifa))
return PTR_ERR(ifa);
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
+ ifa_existing = find_matching_ifa(ifa);
+ if (!ifa_existing) {
+ /* It would be best to check for !NLM_F_CREATE here but
+ * userspace already relies on not having to provide this.
+ */
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ } else {
+ inet_free_ifa(ifa);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ return -EEXIST;
+ ifa = ifa_existing;
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq,
+ &check_lifetime_work, 0);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+ }
+ return 0;
}
/*
* Determine a default network mask, based on the IP address.
*/
-static inline int inet_abc_len(__be32 addr)
+static int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
@@ -628,16 +918,16 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
break;
case SIOCSIFFLAGS:
- ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
break;
case SIOCSIFADDR: /* Set interface address (and family) */
case SIOCSIFBRDADDR: /* Set the broadcast address */
case SIOCSIFDSTADDR: /* Set the destination address */
case SIOCSIFNETMASK: /* Set the netmask for the interface */
- ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
ret = -EINVAL;
if (sin->sin_family != AF_INET)
@@ -670,7 +960,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ifap = &ifa->ifa_next) {
if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
sin_orig.sin_addr.s_addr ==
- ifa->ifa_address) {
+ ifa->ifa_local) {
break; /* found */
}
}
@@ -730,6 +1020,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ifa = inet_alloc_ifa();
if (!ifa)
break;
+ INIT_HLIST_NODE(&ifa->hash);
if (colon)
memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
else
@@ -756,6 +1047,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
ifa->ifa_prefixlen = 32;
ifa->ifa_mask = inet_make_mask(32);
}
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
ret = inet_set_ifa(dev, ifa);
break;
@@ -841,10 +1133,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
if (len < (int) sizeof(ifr))
break;
memset(&ifr, 0, sizeof(struct ifreq));
- if (ifa->ifa_label)
- strcpy(ifr.ifr_name, ifa->ifa_label);
- else
- strcpy(ifr.ifr_name, dev->name);
+ strcpy(ifr.ifr_name, ifa->ifa_label);
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
@@ -950,22 +1239,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
/*
* Confirm that local IP address exists using wildcards:
- * - in_dev: only on this interface, 0=any interface
+ * - net: netns to check, cannot be NULL
+ * - in_dev: only on this interface, NULL=any interface
* - dst: only in the same subnet as dst, 0=any dst
* - local: address, 0=autoselect the local address
* - scope: maximum allowed scope value for the local address
*/
-__be32 inet_confirm_addr(struct in_device *in_dev,
+__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
__be32 dst, __be32 local, int scope)
{
__be32 addr = 0;
struct net_device *dev;
- struct net *net;
- if (scope != RT_SCOPE_LINK)
+ if (in_dev != NULL)
return confirm_addr_indev(in_dev, dst, local, scope);
- net = dev_net(in_dev->dev);
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
in_dev = __in_dev_get_rcu(dev);
@@ -979,6 +1267,7 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
return addr;
}
+EXPORT_SYMBOL(inet_confirm_addr);
/*
* Device notifier
@@ -1025,17 +1314,32 @@ skip:
}
}
-static inline bool inetdev_valid_mtu(unsigned mtu)
+static bool inetdev_valid_mtu(unsigned int mtu)
{
return mtu >= 68;
}
+/* Send one ARP request per address configured on @in_dev, with the
+ * address's ifa_local used both as the target and as the source IP.
+ */
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+					struct in_device *in_dev)
+{
+	struct in_ifaddr *ifa = in_dev->ifa_list;
+
+	while (ifa) {
+		arp_send(ARPOP_REQUEST, ETH_P_ARP,
+			 ifa->ifa_local, dev,
+			 ifa->ifa_local, NULL,
+			 dev->dev_addr, NULL);
+		ifa = ifa->ifa_next;
+	}
+}
+
/* Called only under RTNL semaphore */
static int inetdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev = __in_dev_get_rtnl(dev);
ASSERT_RTNL();
@@ -1059,8 +1363,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
switch (event) {
case NETDEV_REGISTER:
- printk(KERN_DEBUG "inetdev_event: bug\n");
- rcu_assign_pointer(dev->ip_ptr, NULL);
+ pr_debug("%s: bug\n", __func__);
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
if (!inetdev_valid_mtu(dev->mtu))
@@ -1069,6 +1373,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
struct in_ifaddr *ifa = inet_alloc_ifa();
if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
@@ -1077,23 +1382,22 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+ INFINITY_LIFE_TIME);
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
inet_insert_ifa(ifa);
}
}
ip_mc_up(in_dev);
/* fall through */
- case NETDEV_NOTIFY_PEERS:
case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ /* fall through */
+ case NETDEV_NOTIFY_PEERS:
/* Send gratuitous ARP to notify of link change */
- if (IN_DEV_ARP_NOTIFY(in_dev)) {
- struct in_ifaddr *ifa = in_dev->ifa_list;
-
- if (ifa)
- arp_send(ARPOP_REQUEST, ETH_P_ARP,
- ifa->ifa_address, dev,
- ifa->ifa_address, NULL,
- dev->dev_addr, NULL);
- }
+ inetdev_send_gratuitous_arp(dev, in_dev);
break;
case NETDEV_DOWN:
ip_mc_down(in_dev);
@@ -1129,43 +1433,86 @@ static struct notifier_block ip_netdev_notifier = {
.notifier_call = inetdev_event,
};
-static inline size_t inet_nlmsg_size(void)
+static size_t inet_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ nla_total_size(4) /* IFA_ADDRESS */
+ nla_total_size(4) /* IFA_LOCAL */
+ nla_total_size(4) /* IFA_BROADCAST */
- + nla_total_size(IFNAMSIZ); /* IFA_LABEL */
+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
+}
+
+/* Convert a jiffies timestamp into hundredths of a second elapsed since
+ * INITIAL_JIFFIES.
+ */
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+	unsigned long elapsed = cstamp - INITIAL_JIFFIES;
+
+	return elapsed * 100UL / HZ;
+}
+
+/* Append an IFA_CACHEINFO attribute to @skb. The two timestamps are given
+ * in jiffies and converted with cstamp_delta(); the lifetimes are stored
+ * as passed. Returns the result of nla_put().
+ */
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+			 unsigned long tstamp, u32 preferred, u32 valid)
+{
+	struct ifa_cacheinfo ci = {
+		.cstamp		= cstamp_delta(cstamp),
+		.tstamp		= cstamp_delta(tstamp),
+		.ifa_prefered	= preferred,
+		.ifa_valid	= valid,
+	};
+
+	return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
}
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 pid, u32 seq, int event, unsigned int flags)
+ u32 portid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
+ u32 preferred, valid;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ifm = nlmsg_data(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
- ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+ ifm->ifa_flags = ifa->ifa_flags;
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
- if (ifa->ifa_address)
- NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
+ if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
+ preferred = ifa->ifa_preferred_lft;
+ valid = ifa->ifa_valid_lft;
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - ifa->ifa_tstamp) / HZ;
- if (ifa->ifa_local)
- NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
-
- if (ifa->ifa_broadcast)
- NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
-
- if (ifa->ifa_label[0])
- NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
+ if ((ifa->ifa_address &&
+ nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_broadcast &&
+ nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+ preferred, valid))
+ goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -1184,7 +1531,6 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
struct in_device *in_dev;
struct in_ifaddr *ifa;
struct hlist_head *head;
- struct hlist_node *node;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -1194,7 +1540,9 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
idx = 0;
head = &net->dev_index_head[h];
rcu_read_lock();
- hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
if (h > s_h || idx > s_idx)
@@ -1208,12 +1556,13 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (ip_idx < s_ip_idx)
continue;
if (inet_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).pid,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWADDR, NLM_F_MULTI) <= 0) {
rcu_read_unlock();
goto done;
}
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
}
cont:
idx++;
@@ -1230,7 +1579,7 @@ done:
}
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
- u32 pid)
+ u32 portid)
{
struct sk_buff *skb;
u32 seq = nlh ? nlh->nlmsg_seq : 0;
@@ -1242,14 +1591,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
if (skb == NULL)
goto errout;
- err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
+ err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
- rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
return;
errout:
if (err < 0)
@@ -1258,7 +1607,7 @@ errout:
static size_t inet_get_link_af_size(const struct net_device *dev)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
if (!in_dev)
return 0;
@@ -1268,7 +1617,7 @@ static size_t inet_get_link_af_size(const struct net_device *dev)
static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
struct nlattr *nla;
int i;
@@ -1337,6 +1686,232 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
return 0;
}
+/* Number of bytes needed for a netconf message carrying the attribute
+ * selected by @type, or every attribute when @type is -1.
+ */
+static int inet_netconf_msgsize_devconf(int type)
+{
+	const bool all = (type == -1);	/* type -1 is used for ALL */
+	int nattrs = 1;			/* NETCONFA_IFINDEX */
+
+	if (all || type == NETCONFA_FORWARDING)
+		nattrs++;
+	if (all || type == NETCONFA_RP_FILTER)
+		nattrs++;
+	if (all || type == NETCONFA_MC_FORWARDING)
+		nattrs++;
+	if (all || type == NETCONFA_PROXY_NEIGH)
+		nattrs++;
+
+	/* Every attribute carries a 4-byte payload. */
+	return NLMSG_ALIGN(sizeof(struct netconfmsg)) +
+	       nattrs * nla_total_size(4);
+}
+
+/* Build one netconf message in @skb describing @devconf.
+ *
+ * NETCONFA_IFINDEX is always emitted; of the remaining attributes only the
+ * one named by @type is added, or all of them when @type is -1. Returns the
+ * result of nlmsg_end() on success and -EMSGSIZE when @skb is too small, in
+ * which case the partially built message is cancelled.
+ */
+static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+				     struct ipv4_devconf *devconf, u32 portid,
+				     u32 seq, int event, unsigned int flags,
+				     int type)
+{
+	const bool all = (type == -1);	/* type -1 is used for ALL */
+	struct netconfmsg *ncm;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ncm), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ncm = nlmsg_data(nlh);
+	ncm->ncm_family = AF_INET;
+
+	/* Attributes are appended in this fixed order; the first failing
+	 * put aborts the whole message.
+	 */
+	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0 ||
+	    ((all || type == NETCONFA_FORWARDING) &&
+	     nla_put_s32(skb, NETCONFA_FORWARDING,
+			 IPV4_DEVCONF(*devconf, FORWARDING)) < 0) ||
+	    ((all || type == NETCONFA_RP_FILTER) &&
+	     nla_put_s32(skb, NETCONFA_RP_FILTER,
+			 IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) ||
+	    ((all || type == NETCONFA_MC_FORWARDING) &&
+	     nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+			 IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) ||
+	    ((all || type == NETCONFA_PROXY_NEIGH) &&
+	     nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+			 IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* Send an RTM_NEWNETCONF notification for @devconf to the
+ * RTNLGRP_IPV4_NETCONF group. @type selects the attribute to report and
+ * @ifindex identifies the configuration. On allocation or fill failure the
+ * error is recorded on the group with rtnl_set_sk_err() instead.
+ */
+void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
+				 struct ipv4_devconf *devconf)
+{
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
+	if (!skb) {
+		rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, -ENOBUFS);
+		return;
+	}
+
+	err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+					RTM_NEWNETCONF, 0, type);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+		return;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
+}
+
+/* Attribute policy passed to nlmsg_parse() by inet_netconf_get_devconf().
+ * Each listed attribute is declared with an int-sized length.
+ */
+static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+};
+
+/* Answer a netconf "get" request.
+ *
+ * The request must carry NETCONFA_IFINDEX, which selects the "all"
+ * configuration, the "default" configuration, or the configuration of one
+ * device. The full set of attributes is unicast back to the requester as
+ * an RTM_NEWNETCONF message.
+ *
+ * Returns the result of rtnl_unicast() on success, or a negative errno:
+ * the nlmsg_parse() error, -EINVAL for a missing attribute or an ifindex
+ * with no device or no in_device, -ENOBUFS on allocation failure, or the
+ * fill error.
+ *
+ * NOTE(review): __dev_get_by_index() and __in_dev_get_rtnl() suggest the
+ * caller holds RTNL - confirm against the handler registration.
+ */
+static int inet_netconf_get_devconf(struct sk_buff *in_skb,
+				    struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nlattr *tb[NETCONFA_MAX+1];
+	struct netconfmsg *ncm;
+	struct sk_buff *skb;
+	struct ipv4_devconf *devconf;
+	struct in_device *in_dev;
+	struct net_device *dev;
+	int ifindex;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+			  devconf_ipv4_policy);
+	if (err < 0)
+		goto errout;
+
+	/* Error codes are returned negated; a positive value here would be
+	 * reported to the caller as something other than an errno.
+	 */
+	err = -EINVAL;
+	if (!tb[NETCONFA_IFINDEX])
+		goto errout;
+
+	ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+	switch (ifindex) {
+	case NETCONFA_IFINDEX_ALL:
+		devconf = net->ipv4.devconf_all;
+		break;
+	case NETCONFA_IFINDEX_DEFAULT:
+		devconf = net->ipv4.devconf_dflt;
+		break;
+	default:
+		/* Any other value names a device; both lookups fail with
+		 * the -EINVAL set above.
+		 */
+		dev = __dev_get_by_index(net, ifindex);
+		if (dev == NULL)
+			goto errout;
+		in_dev = __in_dev_get_rtnl(dev);
+		if (in_dev == NULL)
+			goto errout;
+		devconf = &in_dev->cnf;
+		break;
+	}
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = inet_netconf_fill_devconf(skb, ifindex, devconf,
+					NETLINK_CB(in_skb).portid,
+					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+					-1);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+	return err;
+}
+
+/* Dump callback producing one RTM_NEWNETCONF message per device that has
+ * an in_device, followed by one for the "all" and one for the "default"
+ * configuration.
+ *
+ * Progress is kept in cb->args[0] (hash bucket, with the two values past
+ * NETDEV_HASHENTRIES standing for the "all" and "default" entries) and
+ * cb->args[1] (index within the bucket), so the walk can resume where a
+ * previous call stopped. Returns skb->len.
+ */
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ /* s_idx only applies to the first bucket visited; it is reset to 0
+ * for every following one.
+ */
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ /* Stop here when the message does not fit; h and idx are
+ * saved at 'done' so this entry is retried next time.
+ */
+ if (inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ /* h == NETDEV_HASHENTRIES: the "all" entry is still to be sent. */
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+ /* h == NETDEV_HASHENTRIES + 1: the "default" entry is still to be sent. */
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
#ifdef CONFIG_SYSCTL
static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1362,6 +1937,12 @@ static void inet_forward_change(struct net *net)
IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
for_each_netdev(net, dev) {
struct in_device *in_dev;
@@ -1369,33 +1950,69 @@ static void inet_forward_change(struct net *net)
dev_disable_lro(dev);
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
- if (in_dev)
+ if (in_dev) {
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ }
rcu_read_unlock();
}
}
-static int devinet_conf_proc(ctl_table *ctl, int write,
+/* Translate a devconf pointer into the ifindex used in netconf messages:
+ * the DEFAULT or ALL pseudo-index for the namespace-wide tables, otherwise
+ * the ifindex of the device whose in_device embeds @cnf.
+ */
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
+{
+	struct in_device *idev;
+
+	if (cnf == net->ipv4.devconf_dflt)
+		return NETCONFA_IFINDEX_DEFAULT;
+	if (cnf == net->ipv4.devconf_all)
+		return NETCONFA_IFINDEX_ALL;
+
+	idev = container_of(cnf, struct in_device, cnf);
+	return idev->dev->ifindex;
+}
+
+static int devinet_conf_proc(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
+ int old_value = *(int *)ctl->data;
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ int new_value = *(int *)ctl->data;
if (write) {
struct ipv4_devconf *cnf = ctl->extra1;
struct net *net = ctl->extra2;
int i = (int *)ctl->data - cnf->data;
+ int ifindex;
set_bit(i, cnf->state);
if (cnf == net->ipv4.devconf_dflt)
devinet_copy_dflt_conf(net, i);
+ if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+ i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
+ if ((new_value == 0) && (old_value != 0))
+ rt_cache_flush(net);
+
+ if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
+ ifindex, cnf);
+ }
+ if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ ifindex, cnf);
+ }
}
return ret;
}
-static int devinet_sysctl_forward(ctl_table *ctl, int write,
+static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -1416,23 +2033,31 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
}
if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
inet_forward_change(net);
- } else if (*valp) {
+ } else {
struct ipv4_devconf *cnf = ctl->extra1;
struct in_device *idev =
container_of(cnf, struct in_device, cnf);
- dev_disable_lro(idev->dev);
+ if (*valp)
+ dev_disable_lro(idev->dev);
+ inet_netconf_notify_devconf(net,
+ NETCONFA_FORWARDING,
+ idev->dev->ifindex,
+ cnf);
}
rtnl_unlock();
- rt_cache_flush(net, 0);
- }
+ rt_cache_flush(net);
+ } else
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
}
return ret;
}
-int ipv4_doint_and_flush(ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -1440,7 +2065,7 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
struct net *net = ctl->extra2;
if (write && *valp != val)
- rt_cache_flush(net, 0);
+ rt_cache_flush(net);
return ret;
}
@@ -1471,7 +2096,6 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
- char *dev_name;
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -1498,13 +2122,19 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+ DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv2_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv3_unsolicited_report_interval"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
- DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
- "force_igmp_version"),
DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
"promote_secondaries"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+ "route_localnet"),
},
};
@@ -1513,16 +2143,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
{
int i;
struct devinet_sysctl_table *t;
-
-#define DEVINET_CTL_PATH_DEV 3
-
- struct ctl_path devinet_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "conf", },
- { /* to be set */ },
- { },
- };
+ char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
if (!t)
@@ -1534,27 +2155,15 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
t->devinet_vars[i].extra2 = net;
}
- /*
- * Make a copy of dev_name, because '.procname' is regarded as const
- * by sysctl and we wouldn't want anyone to change it under our feet
- * (see SIOCSIFNAME).
- */
- t->dev_name = kstrdup(dev_name, GFP_KERNEL);
- if (!t->dev_name)
- goto free;
-
- devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
+ snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
- t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
- t->devinet_vars);
+ t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
if (!t->sysctl_header)
- goto free_procname;
+ goto free;
p->sysctl = t;
return 0;
-free_procname:
- kfree(t->dev_name);
free:
kfree(t);
out:
@@ -1569,14 +2178,13 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
return;
cnf->sysctl = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->dev_name);
+ unregister_net_sysctl_table(t->sysctl_header);
kfree(t);
}
static void devinet_sysctl_register(struct in_device *idev)
{
- neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
+ neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
&idev->cnf);
}
@@ -1600,12 +2208,6 @@ static struct ctl_table ctl_forward_entry[] = {
},
{ },
};
-
-static __net_initdata struct ctl_path net_ipv4_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
#endif
static __net_init int devinet_init_net(struct net *net)
@@ -1651,7 +2253,7 @@ static __net_init int devinet_init_net(struct net *net)
goto err_reg_dflt;
err = -ENOMEM;
- forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
+ forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
if (forw_hdr == NULL)
goto err_reg_ctl;
net->ipv4.forw_hdr = forw_hdr;
@@ -1710,15 +2312,24 @@ static struct rtnl_af_ops inet_af_ops = {
void __init devinet_init(void)
{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
register_pernet_subsys(&devinet_ops);
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
rtnl_af_register(&inet_af_ops);
- rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
- rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
- rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
+ rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
+ inet_netconf_dump_devconf, NULL);
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1c3fb..360b565918c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
#include <crypto/aead.h>
#include <crypto/authenc.h>
#include <linux/err.h>
@@ -23,6 +25,8 @@ struct esp_skb_cb {
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
/*
* Allocate an AEAD request structure with extra space for SG and IV.
*
@@ -31,11 +35,14 @@ struct esp_skb_cb {
*
* TODO: Use spare space in skb for this where possible.
*/
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
{
unsigned int len;
- len = crypto_aead_ivsize(aead);
+ len = seqhilen;
+
+ len += crypto_aead_ivsize(aead);
+
if (len) {
len += crypto_aead_alignmask(aead) &
~(crypto_tfm_ctx_alignment() - 1);
@@ -50,10 +57,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
return kmalloc(len, GFP_ATOMIC);
}
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp)
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
{
return crypto_aead_ivsize(aead) ?
- PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp;
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
}
static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -109,7 +121,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct aead_givcrypt_request *req;
struct scatterlist *sg;
struct scatterlist *asg;
- struct esp_data *esp;
struct sk_buff *trailer;
void *tmp;
u8 *iv;
@@ -117,46 +128,72 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int blksize;
int clen;
int alen;
+ int plen;
+ int tfclen;
int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
/* skb is pure payload to encrypt */
- err = -ENOMEM;
-
- /* Round to block size */
- clen = skb->len;
-
- esp = x->data;
- aead = esp->aead;
+ aead = x->data;
alen = crypto_aead_authsize(aead);
+ tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
+
+ padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ tfclen = padto - skb->len;
+ }
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- clen = ALIGN(clen + 2, blksize);
- if (esp->padlen)
- clen = ALIGN(clen, esp->padlen);
+ clen = ALIGN(skb->len + 2 + tfclen, blksize);
+ plen = clen - skb->len - tfclen;
- if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0)
+ err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+ if (err < 0)
goto error;
nfrags = err;
- tmp = esp_alloc_tmp(aead, nfrags + 1);
- if (!tmp)
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp) {
+ err = -ENOMEM;
goto error;
+ }
- iv = esp_tmp_iv(aead, tmp);
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_givreq(aead, iv);
asg = esp_givreq_sg(aead, req);
- sg = asg + 1;
+ sg = asg + sglists;
/* Fill padding... */
tail = skb_tail_pointer(trailer);
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
do {
int i;
- for (i=0; i<clen-skb->len - 2; i++)
+ for (i = 0; i < plen - 2; i++)
tail[i] = i + 1;
} while (0);
- tail[clen - skb->len - 2] = (clen - skb->len) - 2;
- tail[clen - skb->len - 1] = *skb_mac_header(skb);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = *skb_mac_header(skb);
pskb_put(skb, trailer, clen - skb->len + alen);
skb_push(skb, -skb_network_offset(skb));
@@ -199,19 +236,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
}
esph->spi = x->id.spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg,
esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
clen + alen);
- sg_init_one(asg, esph, sizeof(*esph));
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
- aead_givcrypt_set_assoc(req, asg, sizeof(*esph));
+ aead_givcrypt_set_assoc(req, asg, assoclen);
aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output);
+ XFRM_SKB_CB(skb)->seq.output.low);
ESP_SKB_CB(skb)->tmp = tmp;
err = crypto_aead_givencrypt(req);
@@ -229,10 +274,9 @@ error:
static int esp_input_done2(struct sk_buff *skb, int err)
{
- struct iphdr *iph;
+ const struct iphdr *iph;
struct xfrm_state *x = xfrm_input_state(skb);
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
int alen = crypto_aead_authsize(aead);
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int elen = skb->len - hlen;
@@ -297,7 +341,10 @@ static int esp_input_done2(struct sk_buff *skb, int err)
pskb_trim(skb, skb->len - alen - padlen - 2);
__skb_pull(skb, hlen);
- skb_set_transport_header(skb, -ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
err = nexthdr[1];
@@ -324,12 +371,15 @@ static void esp_input_done(struct crypto_async_request *base, int err)
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
struct ip_esp_hdr *esph;
- struct esp_data *esp = x->data;
- struct crypto_aead *aead = esp->aead;
+ struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
void *tmp;
u8 *iv;
struct scatterlist *sg;
@@ -346,16 +396,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
nfrags = err;
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
err = -ENOMEM;
- tmp = esp_alloc_tmp(aead, nfrags + 1);
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
if (!tmp)
goto out;
ESP_SKB_CB(skb)->tmp = tmp;
- iv = esp_tmp_iv(aead, tmp);
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
asg = esp_req_sg(aead, req);
- sg = asg + 1;
+ sg = asg + sglists;
skb->ip_summed = CHECKSUM_NONE;
@@ -366,11 +427,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
sg_init_table(sg, nfrags);
skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
- sg_init_one(asg, esph, sizeof(*esph));
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
aead_request_set_callback(req, 0, esp_input_done, skb);
aead_request_set_crypt(req, sg, sg, elen, iv);
- aead_request_set_assoc(req, asg, sizeof(*esph));
+ aead_request_set_assoc(req, asg, assoclen);
err = crypto_aead_decrypt(req);
if (err == -EINPROGRESS)
@@ -384,66 +453,69 @@ out:
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
{
- struct esp_data *esp = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
- u32 align = max_t(u32, blksize, esp->padlen);
- u32 rem;
-
- mtu -= x->props.header_len + crypto_aead_authsize(esp->aead);
- rem = mtu & (align - 1);
- mtu &= ~(align - 1);
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ unsigned int net_adj;
switch (x->props.mode) {
- case XFRM_MODE_TUNNEL:
- break;
- default:
case XFRM_MODE_TRANSPORT:
- /* The worst case */
- mtu -= blksize - 4;
- mtu += min_t(u32, blksize - 4, rem);
- break;
case XFRM_MODE_BEET:
- /* The worst case. */
- mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem);
+ net_adj = sizeof(struct iphdr);
+ break;
+ case XFRM_MODE_TUNNEL:
+ net_adj = 0;
break;
+ default:
+ BUG();
}
- return mtu - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
}
-static void esp4_err(struct sk_buff *skb, u32 info)
+static int esp4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
- struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
- x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
- return;
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
- ntohl(esph->spi), ntohl(iph->daddr));
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
xfrm_state_put(x);
+
+ return 0;
}
static void esp_destroy(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
+ struct crypto_aead *aead = x->data;
- if (!esp)
+ if (!aead)
return;
- crypto_free_aead(esp->aead);
- kfree(esp);
+ crypto_free_aead(aead);
}
static int esp_init_aead(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
int err;
@@ -452,7 +524,7 @@ static int esp_init_aead(struct xfrm_state *x)
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
err = crypto_aead_setkey(aead, x->aead->alg_key,
(x->aead->alg_key_len + 7) / 8);
@@ -469,7 +541,6 @@ error:
static int esp_init_authenc(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
struct rtattr *rta;
@@ -484,17 +555,27 @@ static int esp_init_authenc(struct xfrm_state *x)
goto error;
err = -ENAMETOOLONG;
- if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)",
- x->aalg ? x->aalg->alg_name : "digest_null",
- x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
- goto error;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
if (IS_ERR(aead))
goto error;
- esp->aead = aead;
+ x->data = aead;
keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
(x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
@@ -549,16 +630,11 @@ error:
static int esp_init_state(struct xfrm_state *x)
{
- struct esp_data *esp;
struct crypto_aead *aead;
u32 align;
int err;
- esp = kzalloc(sizeof(*esp), GFP_KERNEL);
- if (esp == NULL)
- return -ENOMEM;
-
- x->data = esp;
+ x->data = NULL;
if (x->aead)
err = esp_init_aead(x);
@@ -568,9 +644,7 @@ static int esp_init_state(struct xfrm_state *x)
if (err)
goto error;
- aead = esp->aead;
-
- esp->padlen = 0;
+ aead = x->data;
x->props.header_len = sizeof(struct ip_esp_hdr) +
crypto_aead_ivsize(aead);
@@ -594,14 +668,17 @@ static int esp_init_state(struct xfrm_state *x)
}
align = ALIGN(crypto_aead_blocksize(aead), 4);
- if (esp->padlen)
- align = max_t(u32, align, esp->padlen);
- x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
error:
return err;
}
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
static const struct xfrm_type esp_type =
{
.description = "ESP4",
@@ -615,21 +692,22 @@ static const struct xfrm_type esp_type =
.output = esp_output
};
-static const struct net_protocol esp4_protocol = {
+static struct xfrm4_protocol esp4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = esp4_rcv_cb,
.err_handler = esp4_err,
- .no_policy = 1,
- .netns_ok = 1,
+ .priority = 0,
};
static int __init esp4_init(void)
{
if (xfrm_register_type(&esp_type, AF_INET) < 0) {
- printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
- printk(KERN_INFO "ip esp init: can't add protocol\n");
+ if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&esp_type, AF_INET);
return -EAGAIN;
}
@@ -638,10 +716,10 @@ static int __init esp4_init(void)
static void __exit esp4_fini(void)
{
- if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
- printk(KERN_INFO "ip esp close: can't remove protocol\n");
+ if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
- printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(esp4_init);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d3a1112b9d9..255aa9946fe 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -15,7 +15,6 @@
#include <linux/module.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
@@ -32,6 +31,7 @@
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
+#include <linux/cache.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
@@ -44,6 +44,7 @@
#include <net/arp.h>
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
+#include <net/xfrm.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -51,11 +52,11 @@ static int __net_init fib4_rules_init(struct net *net)
{
struct fib_table *local_table, *main_table;
- local_table = fib_hash_table(RT_TABLE_LOCAL);
+ local_table = fib_trie_table(RT_TABLE_LOCAL);
if (local_table == NULL)
return -ENOMEM;
- main_table = fib_hash_table(RT_TABLE_MAIN);
+ main_table = fib_trie_table(RT_TABLE_MAIN);
if (main_table == NULL)
goto fail;
@@ -82,9 +83,27 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- tb = fib_hash_table(id);
+ tb = fib_trie_table(id);
if (!tb)
return NULL;
+
+ switch (id) {
+ case RT_TABLE_LOCAL:
+ net->ipv4.fib_local = tb;
+ break;
+
+ case RT_TABLE_MAIN:
+ net->ipv4.fib_main = tb;
+ break;
+
+ case RT_TABLE_DEFAULT:
+ net->ipv4.fib_default = tb;
+ break;
+
+ default:
+ break;
+ }
+
h = id & (FIB_TABLE_HASHSZ - 1);
hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
@@ -93,7 +112,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
@@ -103,7 +121,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
rcu_read_lock();
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (tb->tb_id == id) {
rcu_read_unlock();
return tb;
@@ -114,84 +132,34 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
-void fib_select_default(struct net *net,
- const struct flowi *flp, struct fib_result *res)
-{
- struct fib_table *tb;
- int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
- return;
- table = res->r->table;
-#endif
- tb = fib_get_table(net, table);
- if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- fib_table_select_default(tb, flp, res);
-}
-
static void fib_flush(struct net *net)
{
int flushed = 0;
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, node, head, tb_hlist)
+ hlist_for_each_entry(tb, head, tb_hlist)
flushed += fib_table_flush(tb);
}
if (flushed)
- rt_cache_flush(net, -1);
-}
-
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
- struct flowi fl = {
- .fl4_dst = addr,
- .flags = FLOWI_FLAG_MATCH_ANY_IIF
- };
- struct fib_result res = { 0 };
- struct net_device *dev = NULL;
-
- rcu_read_lock();
- if (fib_lookup(net, &fl, &res)) {
- rcu_read_unlock();
- return NULL;
- }
- if (res.type != RTN_LOCAL)
- goto out;
- dev = FIB_RES_DEV(res);
-
- if (dev && devref)
- dev_hold(dev);
-out:
- rcu_read_unlock();
- return dev;
+ rt_cache_flush(net);
}
-EXPORT_SYMBOL(__ip_dev_find);
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
*/
-static inline unsigned __inet_dev_addr_type(struct net *net,
- const struct net_device *dev,
- __be32 addr)
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
{
- struct flowi fl = { .fl4_dst = addr };
+ struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
- unsigned ret = RTN_BROADCAST;
+ unsigned int ret = RTN_BROADCAST;
struct fib_table *local_table;
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
@@ -199,15 +167,11 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
if (ipv4_is_multicast(addr))
return RTN_MULTICAST;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
rcu_read_lock();
- if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
+ if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
}
@@ -229,6 +193,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
}
EXPORT_SYMBOL(inet_dev_addr_type);
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ struct net *net;
+ int scope;
+
+ rt = skb_rtable(skb);
+ if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+ RTCF_LOCAL)
+ return ip_hdr(skb)->daddr;
+
+ in_dev = __in_dev_get_rcu(dev);
+ BUG_ON(!in_dev);
+
+ net = dev_net(dev);
+
+ scope = RT_SCOPE_UNIVERSE;
+ if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ fl4.daddr = ip_hdr(skb)->saddr;
+ fl4.saddr = 0;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_scope = scope;
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ if (!fib_lookup(net, &fl4, &res))
+ return FIB_RES_PREFSRC(net, res);
+ } else {
+ scope = RT_SCOPE_LINK;
+ }
+
+ return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
/* Given (packet source, input interface) and optional (dst, oif, tos):
* - (main) check, that source is valid i.e. not broadcast or our local
* address.
@@ -237,45 +239,35 @@ EXPORT_SYMBOL(inet_dev_addr_type);
* - check, that packet arrived from expected physical interface.
* called with rcu_read_lock()
*/
-int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
- struct net_device *dev, __be32 *spec_dst,
- u32 *itag, u32 mark)
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ int rpf, struct in_device *idev, u32 *itag)
{
- struct in_device *in_dev;
- struct flowi fl = {
- .fl4_dst = src,
- .fl4_src = dst,
- .fl4_tos = tos,
- .mark = mark,
- .iif = oif
- };
+ int ret, no_addr, accept_local;
struct fib_result res;
- int no_addr, rpf, accept_local;
- bool dev_match;
- int ret;
+ struct flowi4 fl4;
struct net *net;
+ bool dev_match;
- no_addr = rpf = accept_local = 0;
- in_dev = __in_dev_get_rcu(dev);
- if (in_dev) {
- no_addr = in_dev->ifa_list == NULL;
- rpf = IN_DEV_RPFILTER(in_dev);
- accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
- if (mark && !IN_DEV_SRC_VMARK(in_dev))
- fl.mark = 0;
- }
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.daddr = src;
+ fl4.saddr = dst;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
- if (in_dev == NULL)
- goto e_inval;
+ no_addr = idev->ifa_list == NULL;
+
+ accept_local = IN_DEV_ACCEPT_LOCAL(idev);
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
net = dev_net(dev);
- if (fib_lookup(net, &fl, &res))
+ if (fib_lookup(net, &fl4, &res))
goto last_resort;
if (res.type != RTN_UNICAST) {
if (res.type != RTN_LOCAL || !accept_local)
goto e_inval;
}
- *spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
dev_match = false;
@@ -300,21 +292,18 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
goto last_resort;
if (rpf == 1)
goto e_rpf;
- fl.oif = dev->ifindex;
+ fl4.flowi4_oif = dev->ifindex;
ret = 0;
- if (fib_lookup(net, &fl, &res) == 0) {
- if (res.type == RTN_UNICAST) {
- *spec_dst = FIB_RES_PREFSRC(res);
+ if (fib_lookup(net, &fl4, &res) == 0) {
+ if (res.type == RTN_UNICAST)
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- }
}
return ret;
last_resort:
if (rpf)
goto e_rpf;
- *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
*itag = 0;
return 0;
@@ -324,6 +313,21 @@ e_rpf:
return -EXDEV;
}
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ struct in_device *idev, u32 *itag)
+{
+ int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+ if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
+ *itag = 0;
+ return 0;
+ }
+ return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+}
+
static inline __be32 sk_extract_addr(struct sockaddr *addr)
{
return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -482,7 +486,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(&rt, arg, sizeof(rt)))
@@ -552,7 +556,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_flags = rtm->rtm_flags;
cfg->fc_nlflags = nlh->nlmsg_flags;
- cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+ cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->fc_nlinfo.nlh = nlh;
cfg->fc_nlinfo.nl_net = net;
@@ -600,7 +604,7 @@ errout:
return err;
}
-static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -622,7 +626,7 @@ errout:
return err;
}
-static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_config cfg;
@@ -650,13 +654,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
- struct hlist_node *node;
struct hlist_head *head;
int dumped = 0;
if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
- return ip_rt_dump(skb, cb);
+ return skb->len;
s_h = cb->args[0];
s_e = cb->args[1];
@@ -664,7 +667,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, node, head, tb_hlist) {
+ hlist_for_each_entry(tb, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
@@ -740,7 +743,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
if (prim == NULL) {
- printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
}
@@ -769,30 +772,44 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
}
}
-static void fib_del_ifaddr(struct in_ifaddr *ifa)
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
{
struct in_device *in_dev = ifa->ifa_dev;
struct net_device *dev = in_dev->dev;
struct in_ifaddr *ifa1;
- struct in_ifaddr *prim = ifa;
+ struct in_ifaddr *prim = ifa, *prim1 = NULL;
__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
__be32 any = ifa->ifa_address & ifa->ifa_mask;
#define LOCAL_OK 1
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
- unsigned ok = 0;
+ unsigned int ok = 0;
+ int subnet = 0; /* Primary network */
+ int gone = 1; /* Address is missing */
+ int same_prefsrc = 0; /* Another primary with same IP */
- if (!(ifa->ifa_flags & IFA_F_SECONDARY))
- fib_magic(RTM_DELROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
- else {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (prim == NULL) {
- printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
+ if (iprim && iprim != prim) {
+ pr_warn("%s: bug: iprim != prim\n", __func__);
+ return;
+ }
+ } else if (!ipv4_is_zeronet(any) &&
+ (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
+ subnet = 1;
}
/* Deletion is more complicated than add.
@@ -802,6 +819,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
*/
for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa1 == ifa) {
+ /* promotion, keep the IP */
+ gone = 0;
+ continue;
+ }
+ /* Ignore IFAs from our subnet */
+ if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, iprim))
+ continue;
+
+ /* Ignore ifa1 if it uses different primary IP (prefsrc) */
+ if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+ /* Another address from our subnet? */
+ if (ifa1->ifa_mask == prim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, prim))
+ prim1 = prim;
+ else {
+ /* We reached the secondaries, so
+ * same_prefsrc should be determined.
+ */
+ if (!same_prefsrc)
+ continue;
+ /* Search new prim1 if ifa1 is not
+ * using the current prim1
+ */
+ if (!prim1 ||
+ ifa1->ifa_mask != prim1->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, prim1))
+ prim1 = inet_ifa_byprefix(in_dev,
+ ifa1->ifa_address,
+ ifa1->ifa_mask);
+ if (!prim1)
+ continue;
+ if (prim1->ifa_local != prim->ifa_local)
+ continue;
+ }
+ } else {
+ if (prim->ifa_local != ifa1->ifa_local)
+ continue;
+ prim1 = ifa1;
+ if (prim != prim1)
+ same_prefsrc = 1;
+ }
if (ifa->ifa_local == ifa1->ifa_local)
ok |= LOCAL_OK;
if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
@@ -810,19 +870,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
ok |= BRD1_OK;
if (any == ifa1->ifa_broadcast)
ok |= BRD0_OK;
+ /* primary has network specific broadcasts */
+ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+ __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+ __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+ if (!ipv4_is_zeronet(any1)) {
+ if (ifa->ifa_broadcast == brd1 ||
+ ifa->ifa_broadcast == any1)
+ ok |= BRD_OK;
+ if (brd == brd1 || brd == any1)
+ ok |= BRD1_OK;
+ if (any == brd1 || any == any1)
+ ok |= BRD0_OK;
+ }
+ }
}
if (!(ok & BRD_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!(ok & BRD1_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
- if (!(ok & BRD0_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ if (subnet && ifa->ifa_prefixlen < 31) {
+ if (!(ok & BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok & BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ }
if (!(ok & LOCAL_OK)) {
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
- if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+ if (gone &&
+ inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
/* And the last, but not the least thing.
* We must flush stray FIB entries.
*
@@ -843,24 +921,19 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
{
struct fib_result res;
- struct flowi fl = {
- .mark = frn->fl_mark,
- .fl4_dst = frn->fl_addr,
- .fl4_tos = frn->fl_tos,
- .fl4_scope = frn->fl_scope,
+ struct flowi4 fl4 = {
+ .flowi4_mark = frn->fl_mark,
+ .daddr = frn->fl_addr,
+ .flowi4_tos = frn->fl_tos,
+ .flowi4_scope = frn->fl_scope,
};
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
frn->err = -ENOENT;
if (tb) {
local_bh_disable();
frn->tb_id = tb->tb_id;
- rcu_read_lock();
- frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
+ frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
@@ -868,7 +941,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
frn->type = res.type;
frn->scope = res.scope;
}
- rcu_read_unlock();
local_bh_enable();
}
}
@@ -879,35 +951,38 @@ static void nl_fib_input(struct sk_buff *skb)
struct fib_result_nl *frn;
struct nlmsghdr *nlh;
struct fib_table *tb;
- u32 pid;
+ u32 portid;
net = sock_net(skb->sk);
nlh = nlmsg_hdr(skb);
- if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
- nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
+ if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(*frn))
return;
- skb = skb_clone(skb, GFP_KERNEL);
+ skb = netlink_skb_clone(skb, GFP_KERNEL);
if (skb == NULL)
return;
nlh = nlmsg_hdr(skb);
- frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
+ frn = (struct fib_result_nl *) nlmsg_data(nlh);
tb = fib_get_table(net, frn->tb_id_in);
nl_fib_lookup(frn, tb);
- pid = NETLINK_CB(skb).pid; /* pid of sending process */
- NETLINK_CB(skb).pid = 0; /* from kernel */
+ portid = NETLINK_CB(skb).portid; /* netlink portid */
+ NETLINK_CB(skb).portid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
- netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
+ netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
}
static int __net_init nl_fib_lookup_init(struct net *net)
{
struct sock *sk;
- sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
- nl_fib_input, NULL, THIS_MODULE);
+ struct netlink_kernel_cfg cfg = {
+ .input = nl_fib_input,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
if (sk == NULL)
return -EAFNOSUPPORT;
net->ipv4.fibnl = sk;
@@ -920,11 +995,11 @@ static void nl_fib_lookup_exit(struct net *net)
net->ipv4.fibnl = NULL;
}
-static void fib_disable_ip(struct net_device *dev, int force, int delay)
+static void fib_disable_ip(struct net_device *dev, int force)
{
if (fib_sync_down_dev(dev, force))
fib_flush(dev_net(dev));
- rt_cache_flush(dev_net(dev), delay);
+ rt_cache_flush(dev_net(dev));
arp_ifdown(dev);
}
@@ -932,6 +1007,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
{
struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
switch (event) {
case NETDEV_UP:
@@ -939,17 +1015,19 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
- rt_cache_flush(dev_net(dev), -1);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(dev_net(dev));
break;
case NETDEV_DOWN:
- fib_del_ifaddr(ifa);
+ fib_del_ifaddr(ifa, NULL);
+ atomic_inc(&net->ipv4.dev_addr_genid);
if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
* Disable IP.
*/
- fib_disable_ip(dev, 1, 0);
+ fib_disable_ip(dev, 1);
} else {
- rt_cache_flush(dev_net(dev), -1);
+ rt_cache_flush(dev_net(dev));
}
break;
}
@@ -958,14 +1036,17 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
if (event == NETDEV_UNREGISTER) {
- fib_disable_ip(dev, 2, -1);
+ fib_disable_ip(dev, 2);
+ rt_flush_dev(dev);
return NOTIFY_DONE;
}
+ in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return NOTIFY_DONE;
@@ -977,17 +1058,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
- rt_cache_flush(dev_net(dev), -1);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(net);
break;
case NETDEV_DOWN:
- fib_disable_ip(dev, 0, 0);
+ fib_disable_ip(dev, 0);
break;
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
- rt_cache_flush(dev_net(dev), 0);
- break;
- case NETDEV_UNREGISTER_BATCH:
- rt_cache_flush_batch();
+ rt_cache_flush(net);
break;
}
return NOTIFY_DONE;
@@ -1031,18 +1110,20 @@ static void ip_fib_net_exit(struct net *net)
fib4_rules_exit(net);
#endif
+ rtnl_lock();
for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
struct fib_table *tb;
struct hlist_head *head;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
head = &net->ipv4.fib_table_hash[i];
- hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
- hlist_del(node);
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+ hlist_del(&tb->tb_hlist);
fib_table_flush(tb);
fib_free_table(tb);
}
}
+ rtnl_unlock();
kfree(net->ipv4.fib_table_hash);
}
@@ -1050,6 +1131,9 @@ static int __net_init fib_net_init(struct net *net)
{
int error;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ net->ipv4.fib_num_tclassid_users = 0;
+#endif
error = ip_fib_net_init(net);
if (error < 0)
goto out;
@@ -1083,13 +1167,13 @@ static struct pernet_operations fib_net_ops = {
void __init ip_fib_init(void)
{
- rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
- rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
- rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
register_pernet_subsys(&fib_net_ops);
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- fib_hash_init();
+ fib_trie_init();
}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b2..00000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IPv4 FIB: lookup engine and maintenance routines.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static struct kmem_cache *fn_hash_kmem __read_mostly;
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-
-struct fib_node {
- struct hlist_node fn_hash;
- struct list_head fn_alias;
- __be32 fn_key;
- struct fib_alias fn_embedded_alias;
-};
-
-#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
-
-struct fn_zone {
- struct fn_zone __rcu *fz_next; /* Next not empty zone */
- struct hlist_head __rcu *fz_hash; /* Hash table pointer */
- seqlock_t fz_lock;
- u32 fz_hashmask; /* (fz_divisor - 1) */
-
- u8 fz_order; /* Zone order (0..32) */
- u8 fz_revorder; /* 32 - fz_order */
- __be32 fz_mask; /* inet_make_mask(order) */
-#define FZ_MASK(fz) ((fz)->fz_mask)
-
- struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
-
- int fz_nent; /* Number of entries */
- int fz_divisor; /* Hash size (mask+1) */
-};
-
-struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone __rcu *fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
- u32 h = ntohl(key) >> fz->fz_revorder;
- h ^= (h>>20);
- h ^= (h>>10);
- h ^= (h>>5);
- h &= fz->fz_hashmask;
- return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
- return dst & FZ_MASK(fz);
-}
-
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
-
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
- struct hlist_head *old_ht,
- int old_divisor)
-{
- int i;
-
- for (i = 0; i < old_divisor; i++) {
- struct hlist_node *node, *n;
- struct fib_node *f;
-
- hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
-
- hlist_del_rcu(&f->fn_hash);
-
- new_head = rcu_dereference_protected(fz->fz_hash, 1) +
- fn_hash(f->fn_key, fz);
- hlist_add_head_rcu(&f->fn_hash, new_head);
- }
- }
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
- struct hlist_head *ht, *old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- new_divisor = old_divisor = fz->fz_divisor;
-
- switch (old_divisor) {
- case EMBEDDED_HASH_SIZE:
- new_divisor *= EMBEDDED_HASH_SIZE;
- break;
- case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
- new_divisor *= (EMBEDDED_HASH_SIZE/2);
- break;
- default:
- if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
- printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
- return;
- }
- new_divisor = (old_divisor << 1);
- break;
- }
-
- new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
- fz->fz_order, old_divisor);
-#endif
-
- ht = fz_hash_alloc(new_divisor);
-
- if (ht) {
- struct fn_zone nfz;
-
- memcpy(&nfz, fz, sizeof(nfz));
-
- write_seqlock_bh(&fz->fz_lock);
- old_ht = rcu_dereference_protected(fz->fz_hash, 1);
- RCU_INIT_POINTER(nfz.fz_hash, ht);
- nfz.fz_hashmask = new_hashmask;
- nfz.fz_divisor = new_divisor;
- fn_rebuild_zone(&nfz, old_ht, old_divisor);
- fib_hash_genid++;
- rcu_assign_pointer(fz->fz_hash, ht);
- fz->fz_hashmask = new_hashmask;
- fz->fz_divisor = new_divisor;
- write_sequnlock_bh(&fz->fz_lock);
-
- if (old_ht != fz->fz_embedded_hash) {
- synchronize_rcu();
- fz_hash_free(old_ht, old_divisor);
- }
- }
-}
-
-static void fn_free_node_rcu(struct rcu_head *head)
-{
- struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
-
- kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_node(struct fib_node *f)
-{
- call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
-}
-
-static void fn_free_alias_rcu(struct rcu_head *head)
-{
- struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
-
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
-{
- fib_release_info(fa->fa_info);
- if (fa == &f->fn_embedded_alias)
- fa->fa_info = NULL;
- else
- call_rcu(&fa->rcu, fn_free_alias_rcu);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
- int i;
- struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
- if (!fz)
- return NULL;
-
- seqlock_init(&fz->fz_lock);
- fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
- fz->fz_hashmask = fz->fz_divisor - 1;
- RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
- fz->fz_order = z;
- fz->fz_revorder = 32 - z;
- fz->fz_mask = inet_make_mask(z);
-
- /* Find the first not empty zone with more specific mask */
- for (i = z + 1; i <= 32; i++)
- if (table->fn_zones[i])
- break;
- if (i > 32) {
- /* No more specific masks, we are the first. */
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zone_list));
- rcu_assign_pointer(table->fn_zone_list, fz);
- } else {
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zones[i]->fz_next));
- rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
- }
- table->fn_zones[z] = fz;
- fib_hash_genid++;
- return fz;
-}
-
-int fib_table_lookup(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res,
- int fib_flags)
-{
- int err;
- struct fn_zone *fz;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-
- rcu_read_lock();
- for (fz = rcu_dereference(t->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next)) {
- struct hlist_head *head;
- struct hlist_node *node;
- struct fib_node *f;
- __be32 k;
- unsigned int seq;
-
- do {
- seq = read_seqbegin(&fz->fz_lock);
- k = fz_key(flp->fl4_dst, fz);
-
- head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
-
- err = fib_semantic_match(&f->fn_alias,
- flp, res,
- fz->fz_order, fib_flags);
- if (err <= 0)
- goto out;
- }
- } while (read_seqretry(&fz->fz_lock, seq));
- }
- err = 1;
-out:
- rcu_read_unlock();
- return err;
-}
-
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res)
-{
- int order, last_idx;
- struct hlist_node *node;
- struct fib_node *f;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
- struct fn_zone *fz = t->fn_zones[0];
- struct hlist_head *head;
-
- if (fz == NULL)
- return;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
- head = rcu_dereference(fz->fz_hash);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- }
-
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
-
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
-
- hlist_add_head_rcu(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
- struct hlist_node *node;
- struct fib_node *f;
-
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key == key)
- return f;
- }
-
- return NULL;
-}
-
-
-static struct fib_alias *fib_fast_alloc(struct fib_node *f)
-{
- struct fib_alias *fa = &f->fn_embedded_alias;
-
- if (fa->fa_info != NULL)
- fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- return fa;
-}
-
-/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fib_node *new_f = NULL;
- struct fib_node *f;
- struct fib_alias *fa, *new_fa;
- struct fn_zone *fz;
- struct fib_info *fi;
- u8 tos = cfg->fc_tos;
- __be32 key;
- int err;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- fz = table->fn_zones[cfg->fc_dst_len];
- if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
- return -ENOBUFS;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- fi = fib_create_info(cfg);
- if (IS_ERR(fi))
- return PTR_ERR(fi);
-
- if (fz->fz_nent > (fz->fz_divisor<<1) &&
- fz->fz_divisor < FZ_MAX_DIVISOR &&
- (cfg->fc_dst_len == 32 ||
- (1 << cfg->fc_dst_len) > fz->fz_divisor))
- fn_rehash_zone(fz);
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
- /* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
- * exists or to the node before which we will insert new one.
- *
- * If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
- */
-
- if (fa && fa->fa_tos == tos &&
- fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_first, *fa_match;
-
- err = -EEXIST;
- if (cfg->fc_nlflags & NLM_F_EXCL)
- goto out;
-
- /* We have 2 goals:
- * 1. Find exact match for type, scope, fib_info to avoid
- * duplicate routes
- * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
- */
- fa_match = NULL;
- fa_first = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
- fa_match = fa;
- break;
- }
- }
-
- if (cfg->fc_nlflags & NLM_F_REPLACE) {
- u8 state;
-
- fa = fa_first;
- if (fa_match) {
- if (fa == fa_match)
- err = 0;
- goto out;
- }
- err = -ENOBUFS;
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_tos = fa->fa_tos;
- new_fa->fa_info = fi;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- state = fa->fa_state;
- new_fa->fa_state = state & ~FA_S_ACCESSED;
- fib_hash_genid++;
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
- fn_free_alias(fa, f);
- if (state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
- return 0;
- }
-
- /* Error if we find a perfect match which
- * uses the same scope, type, and nexthop
- * information.
- */
- if (fa_match)
- goto out;
-
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_first;
- }
-
- err = -ENOENT;
- if (!(cfg->fc_nlflags & NLM_F_CREATE))
- goto out;
-
- err = -ENOBUFS;
-
- if (!f) {
- new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
- if (new_f == NULL)
- goto out;
-
- INIT_HLIST_NODE(&new_f->fn_hash);
- INIT_LIST_HEAD(&new_f->fn_alias);
- new_f->fn_key = key;
- f = new_f;
- }
-
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- new_fa->fa_state = 0;
-
- /*
- * Insert new entry to the list.
- */
-
- if (new_f)
- fib_insert_node(fz, new_f);
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : &f->fn_alias));
- fib_hash_genid++;
-
- if (new_f)
- fz->fz_nent++;
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, 0);
- return 0;
-
-out:
- if (new_f)
- kmem_cache_free(fn_hash_kmem, new_f);
- fib_release_info(fi);
- return err;
-}
-
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
- struct fib_node *f;
- struct fib_alias *fa, *fa_to_delete;
- struct fn_zone *fz;
- __be32 key;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
- return -ESRCH;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
- if (!fa)
- return -ESRCH;
-
- fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fa->fa_tos != cfg->fc_tos)
- break;
-
- if ((!cfg->fc_type ||
- fa->fa_type == cfg->fc_type) &&
- (cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
- (!cfg->fc_protocol ||
- fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
- fa_to_delete = fa;
- break;
- }
- }
-
- if (fa_to_delete) {
- int kill_fn;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, 0);
-
- kill_fn = 0;
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_fn = 1;
- }
- fib_hash_genid++;
-
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- fn_free_alias(fa, f);
- if (kill_fn) {
- fn_free_node(f);
- fz->fz_nent--;
- }
-
- return 0;
- }
- return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
- struct hlist_node *node, *n;
- struct fib_node *f;
- int found = 0;
-
- hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
- struct fib_alias *fa, *fa_node;
- int kill_f;
-
- kill_f = 0;
- list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_f = 1;
- }
- fib_hash_genid++;
-
- fn_free_alias(fa, f);
- found++;
- }
- }
- if (kill_f) {
- fn_free_node(f);
- fz->fz_nent--;
- }
- }
- return found;
-}
-
-/* caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz;
- int found = 0;
-
- for (fz = rtnl_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rtnl_dereference(fz->fz_next)) {
- int i;
-
- for (i = fz->fz_divisor - 1; i >= 0; i--)
- found += fn_flush_list(fz, i);
- }
- return found;
-}
-
-void fib_free_table(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz, *next;
-
- next = table->fn_zone_list;
- while (next != NULL) {
- fz = next;
- next = fz->fz_next;
-
- if (fz->fz_hash != fz->fz_embedded_hash)
- fz_hash_free(fz->fz_hash, fz->fz_divisor);
-
- kfree(fz);
- }
-
- kfree(tb);
-}
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz,
- struct hlist_head *head)
-{
- struct hlist_node *node;
- struct fib_node *f;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- if (i < s_i)
- goto next;
-
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- fa->fa_scope,
- f->fn_key,
- fz->fz_order,
- fa->fa_tos,
- fa->fa_info,
- NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
-next:
- i++;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz)
-{
- int h, s_h;
- struct hlist_head *head = rcu_dereference(fz->fz_hash);
-
- if (head == NULL)
- return skb->len;
- s_h = cb->args[3];
- for (h = s_h; h < fz->fz_divisor; h++) {
- if (hlist_empty(head + h))
- continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
- cb->args[3] = h;
- return -1;
- }
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
-{
- int m = 0, s_m;
- struct fn_zone *fz;
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-
- s_m = cb->args[2];
- rcu_read_lock();
- for (fz = rcu_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next), m++) {
- if (m < s_m)
- continue;
- if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
- cb->args[2] = m;
- rcu_read_unlock();
- return -1;
- }
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
- }
- rcu_read_unlock();
- cb->args[2] = m;
- return skb->len;
-}
-
-void __init fib_hash_init(void)
-{
- fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
- 0, SLAB_PANIC, NULL);
-
- fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
- 0, SLAB_PANIC, NULL);
-
-}
-
-struct fib_table *fib_hash_table(u32 id)
-{
- struct fib_table *tb;
-
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
- GFP_KERNEL);
- if (tb == NULL)
- return NULL;
-
- tb->tb_id = id;
- tb->tb_default = -1;
-
- memset(tb->tb_data, 0, sizeof(struct fn_hash));
- return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
- struct seq_net_private p;
- struct fn_zone *zone;
- int bucket;
- struct hlist_head *hash_head;
- struct fib_node *fn;
- struct fib_alias *fa;
- loff_t pos;
- unsigned int genid;
- int valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_table *main_table;
- struct fn_hash *table;
-
- main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
- table = (struct fn_hash *)main_table->tb_data;
-
- iter->bucket = 0;
- iter->hash_head = NULL;
- iter->fn = NULL;
- iter->fa = NULL;
- iter->pos = 0;
- iter->genid = fib_hash_genid;
- iter->valid = 1;
-
- for (iter->zone = rcu_dereference(table->fn_zone_list);
- iter->zone != NULL;
- iter->zone = rcu_dereference(iter->zone->fz_next)) {
- int maxslot;
-
- if (!iter->zone->fz_nent)
- continue;
-
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
- maxslot = iter->zone->fz_divisor;
-
- for (iter->bucket = 0; iter->bucket < maxslot;
- ++iter->bucket, ++iter->hash_head) {
- struct hlist_node *node;
- struct fib_node *fn;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
- }
-out:
- return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_node *fn;
- struct fib_alias *fa;
-
- /* Advance FA, if any. */
- fn = iter->fn;
- fa = iter->fa;
- if (fa) {
- BUG_ON(!fn);
- list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
-
- fa = iter->fa = NULL;
-
- /* Advance FN. */
- if (fn) {
- struct hlist_node *node = &fn->fn_hash;
- hlist_for_each_entry_continue(fn, node, fn_hash) {
- iter->fn = fn;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- fn = iter->fn = NULL;
-
- /* Advance hash chain. */
- if (!iter->zone)
- goto out;
-
- for (;;) {
- struct hlist_node *node;
- int maxslot;
-
- maxslot = iter->zone->fz_divisor;
-
- while (++iter->bucket < maxslot) {
- iter->hash_head++;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- iter->zone = rcu_dereference(iter->zone->fz_next);
-
- if (!iter->zone)
- goto out;
-
- iter->bucket = 0;
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-out:
- iter->pos++;
- return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_alias *fa;
-
- if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
- fa = iter->fa;
- pos -= iter->pos;
- } else
- fa = fib_get_first(seq);
-
- if (fa)
- while (pos && (fa = fib_get_next(seq)))
- --pos;
- return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
-{
- void *v = NULL;
-
- rcu_read_lock();
- if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
- v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
- return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
- static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT,
- [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
-
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
- if (mask == htonl(0xFFFFFFFF))
- flags |= RTF_HOST;
- flags |= RTF_UP;
- return flags;
-}
-
-/*
- * This outputs /proc/net/route.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
- struct fib_iter_state *iter;
- int len;
- __be32 prefix, mask;
- unsigned flags;
- struct fib_node *f;
- struct fib_alias *fa;
- struct fib_info *fi;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
- "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
- "\tWindow\tIRTT");
- goto out;
- }
-
- iter = seq->private;
- f = iter->fn;
- fa = iter->fa;
- fi = fa->fa_info;
- prefix = f->fn_key;
- mask = FZ_MASK(iter->zone);
- flags = fib_flag_trans(fa->fa_type, mask, fi);
- if (fi)
- seq_printf(seq,
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- fi->fib_dev ? fi->fib_dev->name : "*", prefix,
- fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
- mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3, &len);
- else
- seq_printf(seq,
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
-
- seq_printf(seq, "%*s\n", 127 - len, "");
-out:
- return 0;
-}
-
-static const struct seq_operations fib_seq_ops = {
- .start = fib_seq_start,
- .next = fib_seq_next,
- .stop = fib_seq_stop,
- .show = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &fib_seq_ops,
- sizeof(struct fib_iter_state));
-}
-
-static const struct file_operations fib_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-int __net_init fib_proc_init(struct net *net)
-{
- if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
- return -ENOMEM;
- return 0;
-}
-
-void __net_exit fib_proc_exit(struct net *net)
-{
- proc_net_remove(net, "route");
-}
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec65..1e4f6600b31 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -10,7 +10,6 @@ struct fib_alias {
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
- u8 fa_scope;
u8 fa_state;
struct rcu_head rcu;
};
@@ -25,24 +24,15 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
}
/* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
- const struct flowi *flp,
- struct fib_result *res, int prefixlen, int fib_flags);
-extern void fib_release_info(struct fib_info *);
-extern struct fib_info *fib_create_info(struct fib_config *cfg);
-extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
-extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, __be32 dst,
- int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
-extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info,
- unsigned int nlm_flags);
-extern struct fib_alias *fib_find_alias(struct list_head *fah,
- u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort,
- int *last_idx, int dflt);
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg);
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
+ u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+ u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
static inline void fib_result_assign(struct fib_result *res,
struct fib_info *fi)
@@ -51,4 +41,11 @@ static inline void fib_result_assign(struct fib_result *res,
res->fi = fi;
}
+struct fib_prop {
+ int error;
+ u8 scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
+
#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7..f2e15738534 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
+#include <linux/export.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
@@ -41,19 +42,12 @@ struct fib4_rule {
__be32 srcmask;
__be32 dst;
__be32 dstmask;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
u32 tclassid;
#endif
};
-#ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
-{
- return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
-}
-#endif
-
-int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
@@ -61,11 +55,16 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
};
int err;
- err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg);
- res->r = arg.rule;
-
+ err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (arg.rule)
+ res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
+ else
+ res->tclassid = 0;
+#endif
return err;
}
+EXPORT_SYMBOL_GPL(__fib_lookup);
static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
@@ -95,25 +94,53 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
if (!tbl)
goto errout;
- err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
+ err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
if (err > 0)
err = -EAGAIN;
errout:
return err;
}
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+ struct fib_result *result = (struct fib_result *) arg->result;
+ struct net_device *dev = NULL;
+
+ if (result->fi)
+ dev = result->fi->fib_dev;
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (result->prefixlen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ if (!(arg->flags & FIB_LOOKUP_NOREF))
+ fib_info_put(result->fi);
+ return true;
+}
static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
struct fib4_rule *r = (struct fib4_rule *) rule;
- __be32 daddr = fl->fl4_dst;
- __be32 saddr = fl->fl4_src;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ __be32 daddr = fl4->daddr;
+ __be32 saddr = fl4->saddr;
if (((saddr ^ r->src) & r->srcmask) ||
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl->fl4_tos))
+ if (r->tos && (r->tos != fl4->flowi4_tos))
return 0;
return 1;
@@ -165,9 +192,12 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (frh->dst_len)
rule4->dst = nla_get_be32(tb[FRA_DST]);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (tb[FRA_FLOW])
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (tb[FRA_FLOW]) {
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users++;
+ }
#endif
rule4->src_len = frh->src_len;
@@ -176,11 +206,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->dstmask = inet_make_mask(rule4->dst_len);
rule4->tos = frh->tos;
+ net->ipv4.fib_has_custom_rules = true;
err = 0;
errout:
return err;
}
+static void fib4_rule_delete(struct fib_rule *rule)
+{
+ struct net *net = rule->fr_net;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users--;
+#endif
+ net->ipv4.fib_has_custom_rules = true;
+}
+
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
struct nlattr **tb)
{
@@ -195,7 +238,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->tos && (rule4->tos != frh->tos))
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
#endif
@@ -218,15 +261,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->src_len = rule4->src_len;
frh->tos = rule4->tos;
- if (rule4->dst_len)
- NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
-
- if (rule4->src_len)
- NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
-
-#ifdef CONFIG_NET_CLS_ROUTE
- if (rule4->tclassid)
- NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
+ if ((rule4->dst_len &&
+ nla_put_be32(skb, FRA_DST, rule4->dst)) ||
+ (rule4->src_len &&
+ nla_put_be32(skb, FRA_SRC, rule4->src)))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rule4->tclassid &&
+ nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
+ goto nla_put_failure;
#endif
return 0;
@@ -243,16 +286,18 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
{
- rt_cache_flush(ops->fro_net, -1);
+ rt_cache_flush(ops->fro_net);
}
-static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.family = AF_INET,
.rule_size = sizeof(struct fib4_rule),
.addr_size = sizeof(u32),
.action = fib4_rule_action,
+ .suppress = fib4_rule_suppress,
.match = fib4_rule_match,
.configure = fib4_rule_configure,
+ .delete = fib4_rule_delete,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
.default_pref = fib_default_rule_pref,
@@ -292,6 +337,7 @@ int __net_init fib4_rules_init(struct net *net)
if (err < 0)
goto fail;
net->ipv4.rules_ops = ops;
+ net->ipv4.fib_has_custom_rules = false;
return 0;
fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b..b10cd43a472 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -14,7 +14,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -49,7 +48,7 @@
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
+static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;
#define DEVINDEX_HASHBITS 8
@@ -90,11 +89,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
#define endfor_nexthops(fi) }
-static const struct
-{
- int error;
- u8 scope;
-} fib_props[RTN_MAX + 1] = {
+const struct fib_prop fib_props[RTN_MAX + 1] = {
[RTN_UNSPEC] = {
.error = 0,
.scope = RT_SCOPE_NOWHERE,
@@ -145,29 +140,96 @@ static const struct
},
};
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+ struct rtable *rt = rcu_dereference_protected(*rtp, 1);
-/* Release a nexthop info record */
+ if (!rt)
+ return;
+
+ /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+ * because we waited an RCU grace period before calling
+ * free_fib_info_rcu()
+ */
+
+ dst_free(&rt->dst);
+}
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+ struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = rcu_dereference_protected(hash[i].chain, 1);
+ while (fnhe) {
+ struct fib_nh_exception *next;
+
+ next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+ rt_fibinfo_free(&fnhe->fnhe_rth_input);
+ rt_fibinfo_free(&fnhe->fnhe_rth_output);
+
+ kfree(fnhe);
+
+ fnhe = next;
+ }
+ }
+ kfree(hash);
+}
+
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+ int cpu;
+
+ if (!rtp)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rtable *rt;
+
+ rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+ if (rt)
+ dst_free(&rt->dst);
+ }
+ free_percpu(rtp);
+}
+
+/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_dev)
+ dev_put(nexthop_nh->nh_dev);
+ if (nexthop_nh->nh_exceptions)
+ free_nh_exceptions(nexthop_nh);
+ rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
+ rt_fibinfo_free(&nexthop_nh->nh_rth_input);
+ } endfor_nexthops(fi);
+
+ release_net(fi->fib_net);
+ if (fi->fib_metrics != (u32 *) dst_default_metrics)
+ kfree(fi->fib_metrics);
kfree(fi);
}
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- pr_warning("Freeing alive fib_info %p\n", fi);
+ pr_warn("Freeing alive fib_info %p\n", fi);
return;
}
+ fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
change_nexthops(fi) {
- if (nexthop_nh->nh_dev)
- dev_put(nexthop_nh->nh_dev);
- nexthop_nh->nh_dev = NULL;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users--;
} endfor_nexthops(fi);
- fib_info_cnt--;
- release_net(fi->fib_net);
+#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
@@ -200,7 +262,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight != onh->nh_weight ||
#endif
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,10 +283,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
unsigned int val = fi->fib_nhs;
- val ^= fi->fib_protocol;
+ val ^= (fi->fib_protocol << 8) | fi->fib_scope;
val ^= (__force u32)fi->fib_prefsrc;
val ^= fi->fib_priority;
for_nexthops(fi) {
@@ -237,23 +299,24 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_info *fi;
unsigned int hash;
hash = fib_info_hashfn(nfi);
head = &fib_info_hash[hash];
- hlist_for_each_entry(fi, node, head, fib_hash) {
+ hlist_for_each_entry(fi, head, fib_hash) {
if (!net_eq(fi->fib_net, nfi->fib_net))
continue;
if (fi->fib_nhs != nfi->fib_nhs)
continue;
if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_scope == fi->fib_scope &&
nfi->fib_prefsrc == fi->fib_prefsrc &&
nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_type == fi->fib_type &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
- sizeof(fi->fib_metrics)) == 0 &&
+ sizeof(u32) * RTAX_MAX) == 0 &&
((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
@@ -268,7 +331,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
unsigned int hash;
@@ -276,7 +338,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
hash = fib_devindex_hashfn(dev->ifindex);
head = &fib_info_devhash[hash];
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
if (nh->nh_dev == dev &&
nh->nh_gw == gw &&
!(nh->nh_flags & RTNH_F_DEAD)) {
@@ -318,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
}
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info,
+ int dst_len, u32 tb_id, const struct nl_info *info,
unsigned int nlm_flags)
{
struct sk_buff *skb;
@@ -329,8 +391,8 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
if (skb == NULL)
goto errout;
- err = fib_dump_info(skb, info->pid, seq, event, tb_id,
- fa->fa_type, fa->fa_scope, key, dst_len,
+ err = fib_dump_info(skb, info->portid, seq, event, tb_id,
+ fa->fa_type, key, dst_len,
fa->fa_tos, fa->fa_info, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -338,7 +400,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
kfree_skb(skb);
goto errout;
}
- rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
info->nlh, GFP_KERNEL);
return;
errout:
@@ -364,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
return NULL;
}
-int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int dflt)
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx,
+ int dflt)
{
struct neighbour *n;
int state = NUD_NONE;
@@ -422,9 +485,11 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
}
@@ -476,7 +541,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla && nla_get_be32(nla) != nh->nh_gw)
return 1;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
if (nla && nla_get_u32(nla) != nh->nh_tclassid)
return 1;
@@ -562,16 +627,17 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
}
rcu_read_lock();
{
- struct flowi fl = {
- .fl4_dst = nh->nh_gw,
- .fl4_scope = cfg->fc_scope + 1,
- .oif = nh->nh_oif,
+ struct flowi4 fl4 = {
+ .daddr = nh->nh_gw,
+ .flowi4_scope = cfg->fc_scope + 1,
+ .flowi4_oif = nh->nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
};
/* It is not necessary, but requires a bit of thinking */
- if (fl.fl4_scope < RT_SCOPE_LINK)
- fl.fl4_scope = RT_SCOPE_LINK;
- err = fib_lookup(net, &fl, &res);
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+ err = fib_lookup(net, &fl4, &res);
if (err) {
rcu_read_unlock();
return err;
@@ -613,14 +679,14 @@ out:
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
return ((__force u32)val ^
((__force u32)val >> 7) ^
((__force u32)val >> 14)) & mask;
}
-static struct hlist_head *fib_hash_alloc(int bytes)
+static struct hlist_head *fib_info_hash_alloc(int bytes)
{
if (bytes <= PAGE_SIZE)
return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +696,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
get_order(bytes));
}
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
if (!hash)
return;
@@ -641,25 +707,25 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
free_pages((unsigned long) hash, get_order(bytes));
}
-static void fib_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_hash_size;
+ unsigned int old_size = fib_info_hash_size;
unsigned int i, bytes;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
old_laddrhash = fib_info_laddrhash;
- fib_hash_size = new_size;
+ fib_info_hash_size = new_size;
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+ hlist_for_each_entry_safe(fi, n, head, fib_hash) {
struct hlist_head *dest;
unsigned int new_hash;
@@ -674,10 +740,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
for (i = 0; i < old_size; i++) {
struct hlist_head *lhead = &fib_info_laddrhash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
struct hlist_head *ldest;
unsigned int new_hash;
@@ -693,8 +759,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
spin_unlock_bh(&fib_info_lock);
bytes = old_size * sizeof(struct hlist_head *);
- fib_hash_free(old_info_hash, bytes);
- fib_hash_free(old_laddrhash, bytes);
+ fib_info_hash_free(old_info_hash, bytes);
+ fib_info_hash_free(old_laddrhash, bytes);
+}
+
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+{
+ nh->nh_saddr = inet_select_addr(nh->nh_dev,
+ nh->nh_gw,
+ nh->nh_parent->fib_scope);
+ nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
+
+ return nh->nh_saddr;
}
struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -705,6 +781,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
int nhs = 1;
struct net *net = cfg->fc_nlinfo.nl_net;
+ if (cfg->fc_type > RTN_MAX)
+ goto err_inval;
+
/* Fast check to catch the most weird cases */
if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
goto err_inval;
@@ -718,24 +797,24 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
#endif
err = -ENOBUFS;
- if (fib_info_cnt >= fib_hash_size) {
- unsigned int new_size = fib_hash_size << 1;
+ if (fib_info_cnt >= fib_info_hash_size) {
+ unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
unsigned int bytes;
if (!new_size)
- new_size = 1;
+ new_size = 16;
bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_hash_alloc(bytes);
- new_laddrhash = fib_hash_alloc(bytes);
+ new_info_hash = fib_info_hash_alloc(bytes);
+ new_laddrhash = fib_info_hash_alloc(bytes);
if (!new_info_hash || !new_laddrhash) {
- fib_hash_free(new_info_hash, bytes);
- fib_hash_free(new_laddrhash, bytes);
+ fib_info_hash_free(new_info_hash, bytes);
+ fib_info_hash_free(new_laddrhash, bytes);
} else
- fib_hash_move(new_info_hash, new_laddrhash, new_size);
+ fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
- if (!fib_hash_size)
+ if (!fib_info_hash_size)
goto failure;
}
@@ -743,16 +822,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (fi == NULL)
goto failure;
fib_info_cnt++;
+ if (cfg->fc_mx) {
+ fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (!fi->fib_metrics)
+ goto failure;
+ } else
+ fi->fib_metrics = (u32 *) dst_default_metrics;
fi->fib_net = hold_net(net);
fi->fib_protocol = cfg->fc_protocol;
+ fi->fib_scope = cfg->fc_scope;
fi->fib_flags = cfg->fc_flags;
fi->fib_priority = cfg->fc_priority;
fi->fib_prefsrc = cfg->fc_prefsrc;
+ fi->fib_type = cfg->fc_type;
fi->fib_nhs = nhs;
change_nexthops(fi) {
nexthop_nh->nh_parent = fi;
+ nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
+ if (!nexthop_nh->nh_pcpu_rth_output)
+ goto failure;
} endfor_nexthops(fi)
if (cfg->fc_mx) {
@@ -763,9 +853,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
int type = nla_type(nla);
if (type) {
+ u32 val;
+
if (type > RTAX_MAX)
goto err_inval;
- fi->fib_metrics[type - 1] = nla_get_u32(nla);
+ val = nla_get_u32(nla);
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ fi->fib_metrics[type - 1] = val;
}
}
}
@@ -779,7 +876,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
goto err_inval;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
goto err_inval;
#endif
@@ -792,8 +889,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid = cfg->fc_flow;
+ if (nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight = 1;
@@ -804,6 +903,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
goto err_inval;
goto link_it;
+ } else {
+ switch (cfg->fc_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ break;
+ default:
+ goto err_inval;
+ }
}
if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -835,6 +945,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
}
+ change_nexthops(fi) {
+ fib_info_update_nh_saddr(net, nexthop_nh);
+ } endfor_nexthops(fi)
+
link_it:
ofi = fib_find_info(fi);
if (ofi) {
@@ -880,92 +994,14 @@ failure:
return ERR_PTR(err);
}
-/* Note! fib_semantic_match intentionally uses RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, int prefixlen, int fib_flags)
-{
- struct fib_alias *fa;
- int nh_sel = 0;
-
- list_for_each_entry_rcu(fa, head, fa_list) {
- int err;
-
- if (fa->fa_tos &&
- fa->fa_tos != flp->fl4_tos)
- continue;
-
- if (fa->fa_scope < flp->fl4_scope)
- continue;
-
- fib_alias_accessed(fa);
-
- err = fib_props[fa->fa_type].error;
- if (err == 0) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi->fib_flags & RTNH_F_DEAD)
- continue;
-
- switch (fa->fa_type) {
- case RTN_UNICAST:
- case RTN_LOCAL:
- case RTN_BROADCAST:
- case RTN_ANYCAST:
- case RTN_MULTICAST:
- for_nexthops(fi) {
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (!flp->oif || flp->oif == nh->nh_oif)
- break;
- }
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (nhsel < fi->fib_nhs) {
- nh_sel = nhsel;
- goto out_fill_res;
- }
-#else
- if (nhsel < 1)
- goto out_fill_res;
-#endif
- endfor_nexthops(fi);
- continue;
-
- default:
- pr_warning("fib_semantic_match bad type %#x\n",
- fa->fa_type);
- return -EINVAL;
- }
- }
- return err;
- }
- return 1;
-
-out_fill_res:
- res->prefixlen = prefixlen;
- res->nh_sel = nh_sel;
- res->type = fa->fa_type;
- res->scope = fa->fa_scope;
- res->fi = fa->fa_info;
- if (!(fib_flags & FIB_LOOKUP_NOREF))
- atomic_inc(&res->fi->fib_clntref);
- return 0;
-}
-
-/* Find appropriate source address to this destination */
-
-__be32 __fib_res_prefsrc(struct fib_result *res)
-{
- return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
-}
-
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
+ u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
struct fib_info *fi, unsigned int flags)
{
struct nlmsghdr *nlh;
struct rtmsg *rtm;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -978,33 +1014,36 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtm->rtm_table = tb_id;
else
rtm->rtm_table = RT_TABLE_COMPAT;
- NLA_PUT_U32(skb, RTA_TABLE, tb_id);
+ if (nla_put_u32(skb, RTA_TABLE, tb_id))
+ goto nla_put_failure;
rtm->rtm_type = type;
rtm->rtm_flags = fi->fib_flags;
- rtm->rtm_scope = scope;
+ rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
- if (rtm->rtm_dst_len)
- NLA_PUT_BE32(skb, RTA_DST, dst);
-
- if (fi->fib_priority)
- NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
-
+ if (rtm->rtm_dst_len &&
+ nla_put_be32(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+ goto nla_put_failure;
if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
goto nla_put_failure;
- if (fi->fib_prefsrc)
- NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
-
+ if (fi->fib_prefsrc &&
+ nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ goto nla_put_failure;
if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
-
- if (fi->fib_nh->nh_oif)
- NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (fi->fib_nh[0].nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
+ if (fi->fib_nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+ goto nla_put_failure;
+ if (fi->fib_nh->nh_oif &&
+ nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (fi->fib_nh[0].nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
+ goto nla_put_failure;
#endif
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1025,11 +1064,13 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
- if (nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (nh->nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
+ if (nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
#endif
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
@@ -1056,13 +1097,12 @@ int fib_sync_down_addr(struct net *net, __be32 local)
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
- struct hlist_node *node;
struct fib_info *fi;
if (fib_info_laddrhash == NULL || local == 0)
return 0;
- hlist_for_each_entry(fi, node, head, fib_lhash) {
+ hlist_for_each_entry(fi, head, fib_lhash) {
if (!net_eq(fi->fib_net, net))
continue;
if (fi->fib_prefsrc == local) {
@@ -1080,13 +1120,12 @@ int fib_sync_down_dev(struct net_device *dev, int force)
struct fib_info *prev_fi = NULL;
unsigned int hash = fib_devindex_hashfn(dev->ifindex);
struct hlist_head *head = &fib_info_devhash[hash];
- struct hlist_node *node;
struct fib_nh *nh;
if (force)
scope = -1;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int dead;
@@ -1125,6 +1164,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)
return ret;
}
+/* Choose among the aliases on res->fa_head which fib_info @res should use, recording the choice in res->table->tb_default. Must be invoked inside of an RCU protected region. */
+void fib_select_default(struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL; /* fi: previously accepted candidate; it is tested one iteration after being accepted */
+ struct list_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ int order = -1, last_idx = -1; /* order: zero-based index of fi among the candidates accepted so far */
+ struct fib_alias *fa;
+
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (next_fi->fib_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue; /* only RTN_UNICAST aliases whose fib_scope equals the scope of the current result are candidates */
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break; /* stop at the first alias whose fib_priority value exceeds that of the current result */
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue; /* first nexthop must have a gateway and RT_SCOPE_LINK scope */
+
+ fib_alias_accessed(fa);
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break; /* the first candidate has to be the current result; otherwise give up (order stays -1) */
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, tb->tb_default)) { /* NOTE(review): a zero return is treated here as "fi is usable"; helper not visible in this hunk - confirm */
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order <= 0 || fi == NULL) { /* zero or one candidate accepted: nothing to choose between, reset tb_default and leave res as it is */
+ tb->tb_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) { /* the last accepted candidate was not yet tested inside the loop */
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0) /* presumably last_resort/last_idx were filled in by fib_detect_death() through the pointers passed above - confirm */
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx; /* stored even when it is still -1 */
+out:
+ return;
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
@@ -1136,7 +1231,6 @@ int fib_sync_up(struct net_device *dev)
struct fib_info *prev_fi;
unsigned int hash;
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
int ret;
@@ -1148,7 +1242,7 @@ int fib_sync_up(struct net_device *dev)
head = &fib_info_devhash[hash];
ret = 0;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int alive;
@@ -1189,7 +1283,7 @@ int fib_sync_up(struct net_device *dev)
* The algorithm is suboptimal, but it provides really
* fair weighted route distribution.
*/
-void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+void fib_select_multipath(struct fib_result *res)
{
struct fib_info *fi = res->fi;
int w;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 200eb538fbb..5afeb5aa4c7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -12,7 +12,7 @@
*
* Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
*
- * This work is based on the LPC-trie which is originally descibed in:
+ * This work is based on the LPC-trie which is originally described in:
*
* An experimental study of compression methods for dynamic tries
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
@@ -51,7 +51,6 @@
#define VERSION "0.409"
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -72,6 +71,7 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
-struct node {
+struct rt_trie_node {
unsigned long parent;
t_key key;
};
@@ -109,9 +109,10 @@ struct leaf {
struct leaf_info {
struct hlist_node hlist;
- struct rcu_head rcu;
int plen;
+ u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
struct list_head falh;
+ struct rcu_head rcu;
};
struct tnode {
@@ -123,10 +124,9 @@ struct tnode {
unsigned int empty_children; /* KEYLENGTH bits needed */
union {
struct rcu_head rcu;
- struct work_struct work;
struct tnode *tnode_free;
};
- struct node *child[0];
+ struct rt_trie_node __rcu *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,15 @@ struct trie_stat {
};
struct trie {
- struct node *trie;
+ struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
};
-static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull);
-static struct node *resize(struct trie *t, struct tnode *tn);
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
@@ -177,39 +176,58 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
-static inline struct tnode *node_parent(struct node *node)
+/*
+ * caller must hold RTNL
+ */
+static inline struct tnode *node_parent(const struct rt_trie_node *node)
{
- return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
+ unsigned long parent;
+
+ parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
+
+ return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}
-static inline struct tnode *node_parent_rcu(struct node *node)
+/*
+ * caller must hold RCU read lock or RTNL
+ */
+static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
{
- struct tnode *ret = node_parent(node);
+ unsigned long parent;
- return rcu_dereference_rtnl(ret);
+ parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
+ lockdep_rtnl_is_held());
+
+ return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}
/* Same as rcu_assign_pointer
* but that macro() assumes that value is a pointer.
*/
-static inline void node_set_parent(struct node *node, struct tnode *ptr)
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
smp_wmb();
node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
-static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
+/*
+ * caller must hold RTNL
+ */
+static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
{
BUG_ON(i >= 1U << tn->bits);
- return tn->child[i];
+ return rtnl_dereference(tn->child[i]);
}
-static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
+/*
+ * caller must hold RCU read lock or RTNL
+ */
+static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
- struct node *ret = tnode_get_child(tn, i);
+ BUG_ON(i >= 1U << tn->bits);
- return rcu_dereference_rtnl(ret);
+ return rcu_dereference_rtnl(tn->child[i]);
}
static inline int tnode_child_length(const struct tnode *tn)
@@ -217,12 +235,12 @@ static inline int tnode_child_length(const struct tnode *tn)
return 1 << tn->bits;
}
-static inline t_key mask_pfx(t_key k, unsigned short l)
+static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}
-static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
+static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -347,17 +365,12 @@ static void __leaf_free_rcu(struct rcu_head *head)
static inline void free_leaf(struct leaf *l)
{
- call_rcu_bh(&l->rcu, __leaf_free_rcu);
-}
-
-static void __leaf_info_free_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct leaf_info, rcu));
+ call_rcu(&l->rcu, __leaf_free_rcu);
}
static inline void free_leaf_info(struct leaf_info *leaf)
{
- call_rcu(&leaf->rcu, __leaf_info_free_rcu);
+ kfree_rcu(leaf, rcu);
}
static struct tnode *tnode_alloc(size_t size)
@@ -365,27 +378,19 @@ static struct tnode *tnode_alloc(size_t size)
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else
- return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
-}
-
-static void __tnode_vfree(struct work_struct *arg)
-{
- struct tnode *tn = container_of(arg, struct tnode, work);
- vfree(tn);
+ return vzalloc(size);
}
static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
size_t size = sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
if (size <= PAGE_SIZE)
kfree(tn);
- else {
- INIT_WORK(&tn->work, __tnode_vfree);
- schedule_work(&tn->work);
- }
+ else
+ vfree(tn);
}
static inline void tnode_free(struct tnode *tn)
@@ -402,7 +407,7 @@ static void tnode_free_safe(struct tnode *tn)
tn->tnode_free = tnode_free_head;
tnode_free_head = tn;
tnode_free_size += sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
}
static void tnode_free_flush(void)
@@ -436,6 +441,7 @@ static struct leaf_info *leaf_info_new(int plen)
struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
if (li) {
li->plen = plen;
+ li->mask_plen = ntohl(inet_make_mask(plen));
INIT_LIST_HEAD(&li->falh);
}
return li;
@@ -443,7 +449,7 @@ static struct leaf_info *leaf_info_new(int plen)
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
+ size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
if (tn) {
@@ -456,7 +462,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct node) << bits);
+ sizeof(struct rt_trie_node *) << bits);
return tn;
}
@@ -465,7 +471,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
@@ -473,8 +479,8 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}
-static inline void put_child(struct trie *t, struct tnode *tn, int i,
- struct node *n)
+static inline void put_child(struct tnode *tn, int i,
+ struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -484,10 +490,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull)
{
- struct node *chi = tn->child[i];
+ struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
int isfull;
BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +521,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
}
#define MAX_WORK 10
-static struct node *resize(struct trie *t, struct tnode *tn)
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
int i;
struct tnode *old_tn;
@@ -605,7 +611,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Keep root node larger */
- if (!node_parent((struct node *)tn)) {
+ if (!node_parent((struct rt_trie_node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
} else {
@@ -635,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
/*
* Halve as long as the number of empty children in this
@@ -663,9 +669,9 @@ static struct node *resize(struct trie *t, struct tnode *tn)
if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
+ struct rt_trie_node *n;
- n = tn->child[i];
+ n = rtnl_dereference(tn->child[i]);
if (!n)
continue;
@@ -676,7 +682,21 @@ one_child:
return n;
}
}
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
+}
+
+/* Release every non-NULL child of @tn with tnode_free(), then @tn itself; shared by the nomem paths of inflate() and halve(). */
+static void tnode_clean_free(struct tnode *tn)
+{
+ int i;
+ struct tnode *tofree;
+
+ for (i = 0; i < tnode_child_length(tn); i++) {
+ tofree = (struct tnode *)rtnl_dereference(tn->child[i]); /* rtnl_dereference(): caller is expected to hold RTNL; NOTE(review): child is cast to a tnode without an IS_LEAF check - confirm only tnodes can be present on these paths */
+ if (tofree)
+ tnode_free(tofree);
+ }
+ tnode_free(tn);
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +743,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
- put_child(t, tn, 2*i, (struct node *) left);
- put_child(t, tn, 2*i+1, (struct node *) right);
+ put_child(tn, 2*i, (struct rt_trie_node *) left);
+ put_child(tn, 2*i+1, (struct rt_trie_node *) right);
}
}
for (i = 0; i < olen; i++) {
struct tnode *inode;
- struct node *node = tnode_get_child(oldtnode, i);
+ struct rt_trie_node *node = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
@@ -742,12 +762,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
- if (tkey_extract_bits(node->key,
- oldtnode->pos + oldtnode->bits,
- 1) == 0)
- put_child(t, tn, 2*i, node);
- else
- put_child(t, tn, 2*i+1, node);
+ put_child(tn,
+ tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
+ node);
continue;
}
@@ -755,8 +772,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
inode = (struct tnode *) node;
if (inode->bits == 1) {
- put_child(t, tn, 2*i, inode->child[0]);
- put_child(t, tn, 2*i+1, inode->child[1]);
+ put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
+ put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
tnode_free_safe(inode);
continue;
@@ -786,46 +803,36 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
*/
left = (struct tnode *) tnode_get_child(tn, 2*i);
- put_child(t, tn, 2*i, NULL);
+ put_child(tn, 2*i, NULL);
BUG_ON(!left);
right = (struct tnode *) tnode_get_child(tn, 2*i+1);
- put_child(t, tn, 2*i+1, NULL);
+ put_child(tn, 2*i+1, NULL);
BUG_ON(!right);
size = tnode_child_length(left);
for (j = 0; j < size; j++) {
- put_child(t, left, j, inode->child[j]);
- put_child(t, right, j, inode->child[j + size]);
+ put_child(left, j, rtnl_dereference(inode->child[j]));
+ put_child(right, j, rtnl_dereference(inode->child[j + size]));
}
- put_child(t, tn, 2*i, resize(t, left));
- put_child(t, tn, 2*i+1, resize(t, right));
+ put_child(tn, 2*i, resize(t, left));
+ put_child(tn, 2*i+1, resize(t, right));
tnode_free_safe(inode);
}
tnode_free_safe(oldtnode);
return tn;
nomem:
- {
- int size = tnode_child_length(tn);
- int j;
-
- for (j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
-
- tnode_free(tn);
-
- return ERR_PTR(-ENOMEM);
- }
+ tnode_clean_free(tn);
+ return ERR_PTR(-ENOMEM);
}
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
- struct node *left, *right;
+ struct rt_trie_node *left, *right;
int i;
int olen = tnode_child_length(tn);
@@ -856,7 +863,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
- put_child(t, tn, i/2, (struct node *)newn);
+ put_child(tn, i/2, (struct rt_trie_node *)newn);
}
}
@@ -871,37 +878,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (left == NULL) {
if (right == NULL) /* Both are empty */
continue;
- put_child(t, tn, i/2, right);
+ put_child(tn, i/2, right);
continue;
}
if (right == NULL) {
- put_child(t, tn, i/2, left);
+ put_child(tn, i/2, left);
continue;
}
/* Two nonempty children */
newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
- put_child(t, tn, i/2, NULL);
- put_child(t, newBinNode, 0, left);
- put_child(t, newBinNode, 1, right);
- put_child(t, tn, i/2, resize(t, newBinNode));
+ put_child(tn, i/2, NULL);
+ put_child(newBinNode, 0, left);
+ put_child(newBinNode, 1, right);
+ put_child(tn, i/2, resize(t, newBinNode));
}
tnode_free_safe(oldtnode);
return tn;
nomem:
- {
- int size = tnode_child_length(tn);
- int j;
-
- for (j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
-
- tnode_free(tn);
-
- return ERR_PTR(-ENOMEM);
- }
+ tnode_clean_free(tn);
+ return ERR_PTR(-ENOMEM);
}
/* readside must use rcu_read_lock currently dump routines
@@ -910,10 +907,9 @@ nomem:
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
struct hlist_head *head = &l->list;
- struct hlist_node *node;
struct leaf_info *li;
- hlist_for_each_entry_rcu(li, node, head, hlist)
+ hlist_for_each_entry_rcu(li, head, hlist)
if (li->plen == plen)
return li;
@@ -933,12 +929,11 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen)
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
struct leaf_info *li = NULL, *last = NULL;
- struct hlist_node *node;
if (hlist_empty(head)) {
hlist_add_head_rcu(&new->hlist, head);
} else {
- hlist_for_each_entry(li, node, head, hlist) {
+ hlist_for_each_entry(li, head, hlist) {
if (new->plen > li->plen)
break;
@@ -958,7 +953,7 @@ fib_find_node(struct trie *t, u32 key)
{
int pos;
struct tnode *tn;
- struct node *n;
+ struct rt_trie_node *n;
pos = 0;
n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +988,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
key = tn->key;
- while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
+ while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *) resize(t, (struct tnode *)tn);
+ tn = (struct tnode *)resize(t, tn);
- tnode_put_child_reorg((struct tnode *)tp, cindex,
- (struct node *)tn, wasfull);
+ tnode_put_child_reorg(tp, cindex,
+ (struct rt_trie_node *)tn, wasfull);
- tp = node_parent((struct node *) tn);
+ tp = node_parent((struct rt_trie_node *) tn);
if (!tp)
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
if (!tp)
@@ -1013,9 +1008,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = (struct tnode *)resize(t, (struct tnode *)tn);
+ tn = (struct tnode *)resize(t, tn);
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
}
@@ -1025,7 +1020,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
- struct node *n;
+ struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
@@ -1033,7 +1028,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
t_key cindex;
pos = 0;
- n = t->trie;
+ n = rtnl_dereference(t->trie);
/* If we point to NULL, stop. Either the tree is empty and we should
* just put a new leaf in if, or we have reached an empty child slot,
@@ -1111,10 +1106,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- node_set_parent((struct node *)l, tp);
+ node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ put_child(tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1122,12 +1117,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
* first tnode need some special handling
*/
- if (tp)
- pos = tp->pos+tp->bits;
- else
- pos = 0;
-
if (n) {
+ pos = tp ? tp->pos+tp->bits : 0;
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
} else {
@@ -1141,26 +1132,24 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
return NULL;
}
- node_set_parent((struct node *)tn, tp);
+ node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(t, tn, missbit, (struct node *)l);
- put_child(t, tn, 1-missbit, n);
+ put_child(tn, missbit, (struct rt_trie_node *)l);
+ put_child(tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex,
- (struct node *)tn);
+ put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
}
}
if (tp && tp->pos + tp->bits > 32)
- pr_warning("fib_trie"
- " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
+ pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+ tp, tp->pos, tp->bits, key, plen);
/* Rebalance the trie */
@@ -1245,7 +1234,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
fa->fa_info == fi) {
fa_match = fa;
break;
@@ -1271,7 +1259,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_tos = fa->fa_tos;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;
@@ -1280,7 +1267,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
@@ -1308,7 +1295,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
new_fa->fa_state = 0;
/*
* Insert new entry to the list.
@@ -1322,10 +1308,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
}
}
+ if (!plen)
+ tb->tb_num_default++;
+
list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : fa_head));
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
succeeded:
@@ -1340,49 +1329,83 @@ err:
}
/* should be called with rcu_read_lock */
-static int check_leaf(struct trie *t, struct leaf *l,
- t_key key, const struct flowi *flp,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
+ t_key key, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
- struct hlist_node *node;
- hlist_for_each_entry_rcu(li, node, hhead, hlist) {
- int err;
- int plen = li->plen;
- __be32 mask = inet_make_mask(plen);
+ hlist_for_each_entry_rcu(li, hhead, hlist) {
+ struct fib_alias *fa;
- if (l->key != (key & ntohl(mask)))
+ if (l->key != (key & li->mask_plen))
continue;
- err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- if (err <= 0)
- t->stats.semantic_match_passed++;
- else
- t->stats.semantic_match_miss++;
+ t->stats.semantic_match_passed++;
+#endif
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.semantic_match_passed++;
+#endif
+ res->prefixlen = li->plen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fa->fa_info->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &li->falh;
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+ return 0;
+ }
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.semantic_match_miss++;
#endif
- if (err <= 0)
- return err;
}
return 1;
}
-int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
- struct node *n;
+ struct rt_trie_node *n;
struct tnode *pn;
- int pos, bits;
- t_key key = ntohl(flp->fl4_dst);
- int chopped_off;
+ unsigned int pos, bits;
+ t_key key = ntohl(flp->daddr);
+ unsigned int chopped_off;
t_key cindex = 0;
- int current_prefix_length = KEYLENGTH;
+ unsigned int current_prefix_length = KEYLENGTH;
struct tnode *cn;
t_key pref_mismatch;
@@ -1398,7 +1421,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
goto found;
}
@@ -1423,7 +1446,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
@@ -1507,7 +1530,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
* state.directly.
*/
if (pref_mismatch) {
- int mp = KEYLENGTH - fls(pref_mismatch);
+ /* fls(x) = __fls(x) + 1 */
+ int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
goto backtrace;
@@ -1541,7 +1565,7 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- struct tnode *parent = node_parent_rcu((struct node *) pn);
+ struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
if (!parent)
goto failed;
@@ -1562,22 +1586,23 @@ found:
rcu_read_unlock();
return ret;
}
+EXPORT_SYMBOL_GPL(fib_table_lookup);
/*
* Remove the leaf and return parent.
*/
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- struct tnode *tp = node_parent((struct node *) l);
+ struct tnode *tp = node_parent((struct rt_trie_node *) l);
pr_debug("entering trie_leaf_remove(%p)\n", l);
if (tp) {
t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, NULL);
+ put_child(tp, cindex, NULL);
trie_rebalance(t, tp);
} else
- rcu_assign_pointer(t->trie, NULL);
+ RCU_INIT_POINTER(t->trie, NULL);
free_leaf(l);
}
@@ -1611,7 +1636,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!l)
return -ESRCH;
- fa_head = get_fa_head(l, plen);
+ li = find_leaf_info(l, plen);
+
+ if (!li)
+ return -ESRCH;
+
+ fa_head = &li->falh;
fa = fib_find_alias(fa_head, tos, 0);
if (!fa)
@@ -1629,7 +1659,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
(cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
+ fa->fa_info->fib_scope == cfg->fc_scope) &&
+ (!cfg->fc_prefsrc ||
+ fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
fib_nh_match(cfg, fi) == 0) {
@@ -1645,11 +1677,11 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
- l = fib_find_node(t, key);
- li = find_leaf_info(l, plen);
-
list_del_rcu(&fa->fa_list);
+ if (!plen)
+ tb->tb_num_default--;
+
if (list_empty(fa_head)) {
hlist_del_rcu(&li->hlist);
free_leaf_info(li);
@@ -1659,7 +1691,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
trie_leaf_remove(t, l);
if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1688,10 +1720,10 @@ static int trie_flush_leaf(struct leaf *l)
{
int found = 0;
struct hlist_head *lih = &l->list;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
struct leaf_info *li = NULL;
- hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
+ hlist_for_each_entry_safe(li, tmp, lih, hlist) {
found += trie_flush_list(&li->falh);
if (list_empty(&li->falh)) {
@@ -1706,7 +1738,7 @@ static int trie_flush_leaf(struct leaf *l)
* Scan for the next right leaf starting at node p->child[idx]
* Since we have back pointer, no recursion necessary.
*/
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
do {
t_key idx;
@@ -1721,10 +1753,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
if (!c)
continue;
- if (IS_LEAF(c)) {
- prefetch(p->child[idx]);
+ if (IS_LEAF(c))
return (struct leaf *) c;
- }
/* Rescan start scanning in new node */
p = (struct tnode *) c;
@@ -1732,7 +1762,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
}
/* Node empty, walk back up to parent */
- c = (struct node *) p;
+ c = (struct rt_trie_node *) p;
} while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
@@ -1753,7 +1783,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
static struct leaf *trie_nextleaf(struct leaf *l)
{
- struct node *c = (struct node *) l;
+ struct rt_trie_node *c = (struct rt_trie_node *) l;
struct tnode *p = node_parent_rcu(c);
if (!p)
@@ -1802,80 +1832,6 @@ void fib_free_table(struct fib_table *tb)
kfree(tb);
}
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp,
- struct fib_result *res)
-{
- struct trie *t = (struct trie *) tb->tb_data;
- int order, last_idx;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fib_alias *fa = NULL;
- struct list_head *fa_head;
- struct leaf *l;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
-
- l = fib_find_node(t, 0);
- if (!l)
- goto out;
-
- fa_head = get_fa_head(l, 0);
- if (!fa_head)
- goto out;
-
- if (list_empty(fa_head))
- goto out;
-
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
@@ -1895,12 +1851,11 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
continue;
}
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWROUTE,
tb->tb_id,
fa->fa_type,
- fa->fa_scope,
xkey,
plen,
fa->fa_tos,
@@ -1918,14 +1873,13 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
struct leaf_info *li;
- struct hlist_node *node;
int i, s_i;
s_i = cb->args[4];
i = 0;
/* rcu_read_lock is hold by caller */
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
if (i < s_i) {
i++;
continue;
@@ -1990,7 +1944,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
return skb->len;
}
-void __init fib_hash_init(void)
+void __init fib_trie_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias",
sizeof(struct fib_alias),
@@ -2003,8 +1957,7 @@ void __init fib_hash_init(void)
}
-/* Fix more generic FIB names for init later */
-struct fib_table *fib_hash_table(u32 id)
+struct fib_table *fib_trie_table(u32 id)
{
struct fib_table *tb;
struct trie *t;
@@ -2016,13 +1969,11 @@ struct fib_table *fib_hash_table(u32 id)
tb->tb_id = id;
tb->tb_default = -1;
+ tb->tb_num_default = 0;
t = (struct trie *) tb->tb_data;
memset(t, 0, sizeof(*t));
- if (id == RT_TABLE_LOCAL)
- pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
-
return tb;
}
@@ -2036,7 +1987,7 @@ struct fib_trie_iter {
unsigned int depth;
};
-static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
unsigned int cindex = iter->index;
@@ -2050,7 +2001,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct node *n = tnode_get_child_rcu(tn, cindex);
+ struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -2069,7 +2020,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent_rcu((struct node *)tn);
+ p = node_parent_rcu((struct rt_trie_node *)tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2081,10 +2032,10 @@ rescan:
return NULL;
}
-static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct node *n;
+ struct rt_trie_node *n;
if (!t)
return NULL;
@@ -2108,7 +2059,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct node *n;
+ struct rt_trie_node *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -2118,14 +2069,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
if (IS_LEAF(n)) {
struct leaf *l = (struct leaf *)n;
struct leaf_info *li;
- struct hlist_node *tmp;
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
+ hlist_for_each_entry_rcu(li, &l->list, hlist)
++s->prefixes;
} else {
const struct tnode *tn = (const struct tnode *) n;
@@ -2173,7 +2123,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
max--;
pointers = 0;
- for (i = 1; i <= max; i++)
+ for (i = 1; i < max; i++)
if (stat->nodesizes[i] != 0) {
seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
pointers += (1<<i) * stat->nodesizes[i];
@@ -2181,7 +2131,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct node *) * pointers;
+ bytes += sizeof(struct rt_trie_node *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2226,10 +2176,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- struct hlist_node *node;
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
struct trie *t = (struct trie *) tb->tb_data;
struct trie_stat stat;
@@ -2262,7 +2211,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2271,11 +2220,10 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- struct hlist_node *node;
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
- struct node *n;
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct rt_trie_node *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2304,7 +2252,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct node *n;
+ struct rt_trie_node *n;
++*pos;
/* next node in same table */
@@ -2314,7 +2262,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* walk rest of this hash chain */
h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
- while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
+ while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
if (n)
@@ -2324,7 +2272,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* new hash chain */
while (++h < FIB_TABLE_HASHSZ) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
if (n)
goto found;
@@ -2390,7 +2338,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct node *n = v;
+ struct rt_trie_node *n = v;
if (!node_parent_rcu(n))
fib_table_print(seq, iter->tb);
@@ -2407,13 +2355,12 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
} else {
struct leaf *l = (struct leaf *) n;
struct leaf_info *li;
- struct hlist_node *node;
__be32 val = htonl(l->key);
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
struct fib_alias *fa;
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2422,7 +2369,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
seq_indent(seq, iter->depth+1);
seq_printf(seq, " /%d %s %s", li->plen,
rtn_scope(buf1, sizeof(buf1),
- fa->fa_scope),
+ fa->fa_info->fib_scope),
rtn_type(buf2, sizeof(buf2),
fa->fa_type));
if (fa->fa_tos)
@@ -2558,7 +2505,6 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
{
struct leaf *l = v;
struct leaf_info *li;
- struct hlist_node *node;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2567,7 +2513,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
struct fib_alias *fa;
__be32 mask, prefix;
@@ -2577,16 +2523,17 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
- int len;
if (fa->fa_type == RTN_BROADCAST
|| fa->fa_type == RTN_MULTICAST)
continue;
+ seq_setwidth(seq, 127);
+
if (fi)
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u%n",
+ "%d\t%08X\t%d\t%u\t%u",
fi->fib_dev ? fi->fib_dev->name : "*",
prefix,
fi->fib_nh->nh_gw, flags, 0, 0,
@@ -2595,15 +2542,15 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
(fi->fib_advmss ?
fi->fib_advmss + 40 : 0),
fi->fib_window,
- fi->fib_rtt >> 3, &len);
+ fi->fib_rtt >> 3);
else
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u%n",
+ "%d\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
- mask, 0, 0, 0, &len);
+ mask, 0, 0, 0);
- seq_printf(seq, "%*s\n", 127 - len, "");
+ seq_pad(seq, '\n');
}
}
@@ -2633,31 +2580,31 @@ static const struct file_operations fib_route_fops = {
int __net_init fib_proc_init(struct net *net)
{
- if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
+ if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
goto out1;
- if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
- &fib_triestat_fops))
+ if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+ &fib_triestat_fops))
goto out2;
- if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
+ if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
goto out3;
return 0;
out3:
- proc_net_remove(net, "fib_triestat");
+ remove_proc_entry("fib_triestat", net->proc_net);
out2:
- proc_net_remove(net, "fib_trie");
+ remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
void __net_exit fib_proc_exit(struct net *net)
{
- proc_net_remove(net, "fib_trie");
- proc_net_remove(net, "fib_triestat");
- proc_net_remove(net, "route");
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ remove_proc_entry("route", net->proc_net);
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
deleted file mode 100644
index c6933f2ea31..00000000000
--- a/net/ipv4/gre.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * GRE over IPv4 demultiplexer driver
- *
- * Authors: Dmitry Kozlov (xeb@mail.ru)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/netdevice.h>
-#include <linux/version.h>
-#include <linux/spinlock.h>
-#include <net/protocol.h>
-#include <net/gre.h>
-
-
-static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
-static DEFINE_SPINLOCK(gre_proto_lock);
-
-int gre_add_protocol(const struct gre_protocol *proto, u8 version)
-{
- if (version >= GREPROTO_MAX)
- goto err_out;
-
- spin_lock(&gre_proto_lock);
- if (gre_proto[version])
- goto err_out_unlock;
-
- rcu_assign_pointer(gre_proto[version], proto);
- spin_unlock(&gre_proto_lock);
- return 0;
-
-err_out_unlock:
- spin_unlock(&gre_proto_lock);
-err_out:
- return -1;
-}
-EXPORT_SYMBOL_GPL(gre_add_protocol);
-
-int gre_del_protocol(const struct gre_protocol *proto, u8 version)
-{
- if (version >= GREPROTO_MAX)
- goto err_out;
-
- spin_lock(&gre_proto_lock);
- if (rcu_dereference_protected(gre_proto[version],
- lockdep_is_held(&gre_proto_lock)) != proto)
- goto err_out_unlock;
- rcu_assign_pointer(gre_proto[version], NULL);
- spin_unlock(&gre_proto_lock);
- synchronize_rcu();
- return 0;
-
-err_out_unlock:
- spin_unlock(&gre_proto_lock);
-err_out:
- return -1;
-}
-EXPORT_SYMBOL_GPL(gre_del_protocol);
-
-static int gre_rcv(struct sk_buff *skb)
-{
- const struct gre_protocol *proto;
- u8 ver;
- int ret;
-
- if (!pskb_may_pull(skb, 12))
- goto drop;
-
- ver = skb->data[1]&0x7f;
- if (ver >= GREPROTO_MAX)
- goto drop;
-
- rcu_read_lock();
- proto = rcu_dereference(gre_proto[ver]);
- if (!proto || !proto->handler)
- goto drop_unlock;
- ret = proto->handler(skb);
- rcu_read_unlock();
- return ret;
-
-drop_unlock:
- rcu_read_unlock();
-drop:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-static void gre_err(struct sk_buff *skb, u32 info)
-{
- const struct gre_protocol *proto;
- u8 ver;
-
- if (!pskb_may_pull(skb, 12))
- goto drop;
-
- ver = skb->data[1]&0x7f;
- if (ver >= GREPROTO_MAX)
- goto drop;
-
- rcu_read_lock();
- proto = rcu_dereference(gre_proto[ver]);
- if (!proto || !proto->err_handler)
- goto drop_unlock;
- proto->err_handler(skb, info);
- rcu_read_unlock();
- return;
-
-drop_unlock:
- rcu_read_unlock();
-drop:
- kfree_skb(skb);
-}
-
-static const struct net_protocol net_gre_protocol = {
- .handler = gre_rcv,
- .err_handler = gre_err,
- .netns_ok = 1,
-};
-
-static int __init gre_init(void)
-{
- pr_info("GRE over IPv4 demultiplexor driver");
-
- if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
- pr_err("gre: can't add protocol\n");
- return -EAGAIN;
- }
-
- return 0;
-}
-
-static void __exit gre_exit(void)
-{
- inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
-}
-
-module_init(gre_init);
-module_exit(gre_exit);
-
-MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
-MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
-MODULE_LICENSE("GPL");
-
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
new file mode 100644
index 00000000000..0485bf7f8f0
--- /dev/null
+++ b/net/ipv4/gre_demux.c
@@ -0,0 +1,364 @@
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ int ret;
+
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
+ 0 : -EBUSY;
+
+ if (ret)
+ return ret;
+
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+ greh->protocol = tpi->proto;
+
+ if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (tpi->flags&TUNNEL_SEQ) {
+ *ptr = tpi->seq;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_KEY) {
+ *ptr = tpi->key;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(gre_build_header);
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ unsigned int ip_hlen = ip_hdrlen(skb);
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else
+ tpi->key = 0;
+
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else
+ tpi->seq = 0;
+
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static int gre_cisco_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ int i;
+ bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+ int ret;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+ ret = proto->handler(skb, &tpi);
+ if (ret == PACKET_RCVD) {
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void gre_cisco_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int i;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+
+ if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
+ goto out;
+
+ }
+out:
+ rcu_read_unlock();
+}
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+
+ if (ver >= GREPROTO_MAX)
+ return;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (proto && proto->err_handler)
+ proto->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .netns_ok = 1,
+};
+
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_cisco_rcv,
+ .err_handler = gre_cisco_err,
+};
+
+int gre_cisco_register(struct gre_cisco_protocol *newp)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[newp->priority];
+
+ return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_register);
+
+int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[del_proto->priority];
+ int ret;
+
+ ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
+
+ if (ret)
+ return ret;
+
+ synchronize_net();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_unregister);
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexor driver\n");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("can't add protocol\n");
+ goto err;
+ }
+
+ if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
+ pr_info("%s: can't add ipgre handler\n", __func__);
+ goto err_gre;
+ }
+
+ return 0;
+err_gre:
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+err:
+ return -EAGAIN;
+}
+
+static void __exit gre_exit(void)
+{
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
new file mode 100644
index 00000000000..f0bdd47bbbc
--- /dev/null
+++ b/net/ipv4/gre_offload.c
@@ -0,0 +1,298 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * GRE GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+static int gre_gso_send_check(struct sk_buff *skb)
+{
+ if (!skb->encapsulation)
+ return -EINVAL;
+ return 0;
+}
+
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t enc_features;
+ int ghl;
+ struct gre_base_hdr *greh;
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ __be16 protocol = skb->protocol;
+ int tnl_hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ ghl = skb_inner_network_header(skb) - skb_transport_header(skb);
+ if (unlikely(ghl < sizeof(*greh)))
+ goto out;
+
+ csum = !!(greh->flags & GRE_CSUM);
+ if (csum)
+ skb->encap_hdr_csum = 1;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+
+ /* setup inner skb. */
+ skb->protocol = greh->protocol;
+ skb->encapsulation = 0;
+
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs)) {
+ skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
+ goto out;
+ }
+
+ skb = segs;
+ tnl_hlen = skb_tnl_header_len(skb);
+ do {
+ __skb_push(skb, ghl);
+ if (csum) {
+ __be32 *pcsum;
+
+ if (skb_has_shared_frag(skb)) {
+ int err;
+
+ err = __skb_linearize(skb);
+ if (err) {
+ kfree_skb_list(segs);
+ segs = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ skb_reset_transport_header(skb);
+
+ greh = (struct gre_base_hdr *)
+ skb_transport_header(skb);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ }
+ __skb_push(skb, tnl_hlen - ghl);
+
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+/* Compute the whole skb csum in s/w and store it, then verify GRO csum
+ * starting from gro_offset.
+ */
+static __sum16 gro_skb_checksum(struct sk_buff *skb)
+{
+ __sum16 sum;
+
+ skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
+ csum_partial(skb->data, skb_gro_offset(skb), 0));
+ sum = csum_fold(NAPI_GRO_CB(skb)->csum);
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
+ if (unlikely(!sum) && !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ } else {
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ }
+
+ return sum;
+}
+
+static struct sk_buff **gre_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct gre_base_hdr *greh;
+ unsigned int hlen, grehlen;
+ unsigned int off;
+ int flush = 1;
+ struct packet_offload *ptype;
+ __be16 type;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*greh);
+ greh = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
+ }
+
+ /* Only support version 0 and K (key), C (csum) flags. Note that
+ * although the support for the S (seq#) flag can be added easily
+ * for GRO, this is problematic for GSO hence can not be enabled
+ * here because a GRO pkt may end up in the forwarding path, thus
+ * requiring GSO support to break it up correctly.
+ */
+ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+ goto out;
+
+ type = greh->protocol;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (ptype == NULL)
+ goto out_unlock;
+
+ grehlen = GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ hlen = off + grehlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out_unlock;
+ }
+ if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
+ __sum16 csum = 0;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ csum = csum_fold(NAPI_GRO_CB(skb)->csum);
+ /* Don't trust csum error calculated/reported by h/w */
+ if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
+ csum = gro_skb_checksum(skb);
+
+ /* GRE CSUM is the 1's complement of the 1's complement sum
+ * of the GRE hdr plus payload so it should add up to 0xffff
+ * (and 0 after csum_fold()) just like the IPv4 hdr csum.
+ */
+ if (csum)
+ goto out_unlock;
+ }
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct gre_base_hdr *greh2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ /* The following checks are needed to ensure only pkts
+ * from the same tunnel are considered for aggregation.
+ * The criteria for "the same tunnel" includes:
+ * 1) same version (we only support version 0 here)
+ * 2) same protocol (we only support ETH_P_IP for now)
+ * 3) same set of flags
+ * 4) same key if the key field is present.
+ */
+ greh2 = (struct gre_base_hdr *)(p->data + off);
+
+ if (greh2->flags != greh->flags ||
+ greh2->protocol != greh->protocol) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ if (greh->flags & GRE_KEY) {
+ /* compare keys */
+ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+ }
+
+ skb_gro_pull(skb, grehlen);
+
+ /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+ skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+ struct packet_offload *ptype;
+ unsigned int grehlen = sizeof(*greh);
+ int err = -ENOENT;
+ __be16 type;
+
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
+
+ type = greh->protocol;
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype != NULL)
+ err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
+
+ rcu_read_unlock();
+ return err;
+}
+
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_send_check = gre_gso_send_check,
+ .gso_segment = gre_gso_segment,
+ .gro_receive = gre_gro_receive,
+ .gro_complete = gre_gro_complete,
+ },
+};
+
+static int __init gre_offload_init(void)
+{
+ return inet_add_offload(&gre_offload, IPPROTO_GRE);
+}
+device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea..42b7bcf8045 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -62,6 +62,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/jiffies.h>
@@ -83,16 +85,17 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/raw.h>
+#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
+#include <net/ip_fib.h>
/*
* Build xmit assembly blocks
@@ -108,8 +111,7 @@ struct icmp_bxm {
__be32 times[3];
} data;
int head_len;
- struct ip_options replyopts;
- unsigned char optbuf[40];
+ struct ip_options_data replyopts;
};
/* An array of errno for error messages from dest unreach. */
@@ -233,48 +235,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
* Send an ICMP frame.
*/
-/*
- * Check transmit rate limitation for given message.
- * The rate information is held in the destination cache now.
- * This function is generic and could be used for other purposes
- * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
- *
- * Note that the same dst_entry fields are modified by functions in
- * route.c too, but these work for packet destinations while xrlim_allow
- * works for icmp destinations. This means the rate limiting information
- * for one "ip object" is shared - and these ICMPs are twice limited:
- * by source and by destination.
- *
- * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
- * SHOULD allow setting of rate limits
- *
- * Shared between ICMPv4 and ICMPv6.
- */
-#define XRLIM_BURST_FACTOR 6
-int xrlim_allow(struct dst_entry *dst, int timeout)
-{
- unsigned long now, token = dst->rate_tokens;
- int rc = 0;
-
- now = jiffies;
- token += now - dst->rate_last;
- dst->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
- if (token >= timeout) {
- token -= timeout;
- rc = 1;
- }
- dst->rate_tokens = token;
- return rc;
-}
-EXPORT_SYMBOL(xrlim_allow);
-
-static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
- int type, int code)
+static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+ struct flowi4 *fl4, int type, int code)
{
struct dst_entry *dst = &rt->dst;
- int rc = 1;
+ bool rc = true;
if (type > NR_ICMP_TYPES)
goto out;
@@ -288,8 +253,13 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
goto out;
/* Limit if icmp type is enabled in ratemask. */
- if ((1 << type) & net->ipv4.sysctl_icmp_ratemask)
- rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit);
+ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
+ struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ rc = inet_peer_xrlim_allow(peer,
+ net->ipv4.sysctl_icmp_ratelimit);
+ if (peer)
+ inet_putpeer(peer);
+ }
out:
return rc;
}
@@ -324,13 +294,14 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
}
static void icmp_push_reply(struct icmp_bxm *icmp_param,
+ struct flowi4 *fl4,
struct ipcm_cookie *ipc, struct rtable **rt)
{
struct sock *sk;
struct sk_buff *skb;
sk = icmp_sk(dev_net((*rt)->dst.dev));
- if (ip_append_data(sk, icmp_glue_bits, icmp_param,
+ if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
ipc, rt, MSG_DONTWAIT) < 0) {
@@ -349,7 +320,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
icmp_param->head_len, csum);
icmph->checksum = csum_fold(csum);
skb->ip_summed = CHECKSUM_NONE;
- ip_push_pending_frames(sk);
+ ip_push_pending_frames(sk, fl4);
}
}
@@ -362,11 +333,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
struct net *net = dev_net(rt->dst.dev);
+ struct flowi4 fl4;
struct sock *sk;
struct inet_sock *inet;
- __be32 daddr;
+ __be32 daddr, saddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
- if (ip_options_echo(&icmp_param->replyopts, skb))
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
sk = icmp_xmit_lock(net);
@@ -377,31 +350,129 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
icmp_param->data.icmph.checksum = 0;
inet->tos = ip_hdr(skb)->tos;
- daddr = ipc.addr = rt->rt_src;
+ sk->sk_mark = mark;
+ daddr = ipc.addr = ip_hdr(skb)->saddr;
+ saddr = fib_compute_spec_dst(skb);
ipc.opt = NULL;
ipc.tx_flags = 0;
- if (icmp_param->replyopts.optlen) {
- ipc.opt = &icmp_param->replyopts;
- if (ipc.opt->srr)
- daddr = icmp_param->replyopts.faddr;
- }
- {
- struct flowi fl = { .fl4_dst= daddr,
- .fl4_src = rt->rt_spec_dst,
- .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
- .proto = IPPROTO_ICMP };
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(net, &rt, &fl))
- goto out_unlock;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ if (icmp_param->replyopts.opt.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts.opt;
+ if (ipc.opt->opt.srr)
+ daddr = icmp_param->replyopts.opt.opt.faddr;
}
- if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_mark = mark;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_proto = IPPROTO_ICMP;
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ goto out_unlock;
+ if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
icmp_param->data.icmph.code))
- icmp_push_reply(icmp_param, &ipc, &rt);
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
}
+static struct rtable *icmp_route_lookup(struct net *net,
+ struct flowi4 *fl4,
+ struct sk_buff *skb_in,
+ const struct iphdr *iph,
+ __be32 saddr, u8 tos, u32 mark,
+ int type, int code,
+ struct icmp_bxm *param)
+{
+ struct rtable *rt, *rt2;
+ struct flowi4 fl4_dec; /* filled in by xfrm_decode_session_reverse() below */
+ int err;
+ /*
+ * Pick the route for an outgoing ICMP message about skb_in.
+ *
+ * *fl4 is zeroed and built here from the arguments; if the reverse
+ * relookup further down succeeds it is overwritten with the decoded
+ * flow, so on return it describes the flow of the returned route.
+ *
+ * Returns a route or an ERR_PTR() value; test the result with IS_ERR().
+ */
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = (param->replyopts.opt.opt.srr ? /* source-routed: use faddr from the reply options */
+ param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
+ fl4->flowi4_tos = RT_TOS(tos);
+ fl4->flowi4_proto = IPPROTO_ICMP;
+ fl4->fl4_icmp_type = type;
+ fl4->fl4_icmp_code = code;
+ security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+ rt = __ip_route_output_key(net, fl4);
+ if (IS_ERR(rt))
+ return rt;
+
+ /* No need to clone since we're just using its address. */
+ rt2 = rt;
+ /* First pass: hand the plain route to xfrm_lookup(). */
+ rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(fl4), NULL, 0);
+ if (!IS_ERR(rt)) {
+ if (rt != rt2) /* xfrm_lookup() returned a different route: use it */
+ return rt;
+ } else if (PTR_ERR(rt) == -EPERM) {
+ rt = NULL; /* no usable route yet; the relookup below may still find one */
+ } else
+ return rt; /* any other error goes straight back to the caller */
+ /*
+ * Reached when the first pass returned the plain route unchanged or
+ * failed with -EPERM: decode the reverse flow of skb_in into fl4_dec
+ * and try again with that.
+ */
+ err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+ if (err)
+ goto relookup_failed;
+ /* Find a route (rt2) for the decoded flow. */
+ if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+ rt2 = __ip_route_output_key(net, &fl4_dec);
+ if (IS_ERR(rt2))
+ err = PTR_ERR(rt2); /* picked up by the "if (err)" check after this if/else */
+ } else {
+ struct flowi4 fl4_2 = {};
+ unsigned long orefdst;
+
+ fl4_2.daddr = fl4_dec.saddr;
+ rt2 = ip_route_output_key(net, &fl4_2); /* only needed for rt2->dst.dev below */
+ if (IS_ERR(rt2)) {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ /* Ugh! Run an input route lookup on skb_in for the decoded
+ * addresses, using the device of the route just found. The
+ * result is read back from skb_in with skb_rtable(), so the
+ * packet's own refdst is saved first and put back afterwards.
+ */
+ orefdst = skb_in->_skb_refdst; /* save old refdst */
+ err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
+ RT_TOS(tos), rt2->dst.dev);
+
+ dst_release(&rt2->dst); /* drop the helper output route */
+ rt2 = skb_rtable(skb_in);
+ skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ }
+
+ if (err)
+ goto relookup_failed;
+ /* Second pass: xfrm_lookup() on the relookup route, with XFRM_LOOKUP_ICMP. */
+ rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
+ flowi4_to_flowi(&fl4_dec), NULL,
+ XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(rt2)) {
+ dst_release(&rt->dst); /* NOTE(review): rt is NULL here after a first-pass -EPERM; relies on dst_release() coping with that - confirm */
+ memcpy(fl4, &fl4_dec, sizeof(*fl4)); /* the caller's flow now matches the route we return */
+ rt = rt2;
+ } else if (PTR_ERR(rt2) == -EPERM) {
+ if (rt)
+ dst_release(&rt->dst);
+ return rt2; /* -EPERM on the second pass is returned as an error even if a first-pass route existed */
+ } else {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ return rt;
+ /* Relookup did not work out: fall back to the first-pass route if there is one. */
+relookup_failed:
+ if (rt)
+ return rt;
+ return ERR_PTR(err);
+}
/*
* Send an ICMP message in response to a situation
@@ -418,11 +489,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct iphdr *iph;
int room;
- struct icmp_bxm icmp_param;
+ struct icmp_bxm *icmp_param;
struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc;
+ struct flowi4 fl4;
__be32 saddr;
u8 tos;
+ u32 mark;
struct net *net;
struct sock *sk;
@@ -438,7 +511,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph = ip_hdr(skb_in);
if ((u8 *)iph < skb_in->head ||
- (skb_in->network_header + sizeof(*iph)) > skb_in->tail)
+ (skb_network_header(skb_in) + sizeof(*iph)) >
+ skb_tail_pointer(skb_in))
goto out;
/*
@@ -492,9 +566,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
+ icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
+ if (!icmp_param)
+ return;
+
sk = icmp_xmit_lock(net);
if (sk == NULL)
- return;
+ goto out_free;
/*
* Construct source address and options.
@@ -507,7 +585,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
rcu_read_lock();
if (rt_is_input_route(rt) &&
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index_rcu(net, rt->fl.iif);
+ dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
if (dev)
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -519,8 +597,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
IPTOS_PREC_INTERNETCONTROL) :
iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param.replyopts, skb_in))
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -528,98 +607,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Prepare data for ICMP header.
*/
- icmp_param.data.icmph.type = type;
- icmp_param.data.icmph.code = code;
- icmp_param.data.icmph.un.gateway = info;
- icmp_param.data.icmph.checksum = 0;
- icmp_param.skb = skb_in;
- icmp_param.offset = skb_network_offset(skb_in);
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
+ sk->sk_mark = mark;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param.replyopts;
+ ipc.opt = &icmp_param->replyopts.opt;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
- {
- struct flowi fl = {
- .fl4_dst = icmp_param.replyopts.srr ?
- icmp_param.replyopts.faddr : iph->saddr,
- .fl4_src = saddr,
- .fl4_tos = RT_TOS(tos),
- .proto = IPPROTO_ICMP,
- .fl_icmp_type = type,
- .fl_icmp_code = code,
- };
- int err;
- struct rtable *rt2;
-
- security_skb_classify_flow(skb_in, &fl);
- if (__ip_route_output_key(net, &rt, &fl))
- goto out_unlock;
-
- /* No need to clone since we're just using its address. */
- rt2 = rt;
-
- if (!fl.nl_u.ip4_u.saddr)
- fl.nl_u.ip4_u.saddr = rt->rt_src;
-
- err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
- switch (err) {
- case 0:
- if (rt != rt2)
- goto route_done;
- break;
- case -EPERM:
- rt = NULL;
- break;
- default:
- goto out_unlock;
- }
-
- if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
- goto relookup_failed;
-
- if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
- err = __ip_route_output_key(net, &rt2, &fl);
- else {
- struct flowi fl2 = {};
- unsigned long orefdst;
-
- fl2.fl4_dst = fl.fl4_src;
- if (ip_route_output_key(net, &rt2, &fl2))
- goto relookup_failed;
-
- /* Ugh! */
- orefdst = skb_in->_skb_refdst; /* save old refdst */
- err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
- RT_TOS(tos), rt2->dst.dev);
-
- dst_release(&rt2->dst);
- rt2 = skb_rtable(skb_in);
- skb_in->_skb_refdst = orefdst; /* restore old refdst */
- }
-
- if (err)
- goto relookup_failed;
-
- err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
- XFRM_LOOKUP_ICMP);
- switch (err) {
- case 0:
- dst_release(&rt->dst);
- rt = rt2;
- break;
- case -EPERM:
- goto ende;
- default:
-relookup_failed:
- if (!rt)
- goto out_unlock;
- break;
- }
- }
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
+ type, code, icmp_param);
+ if (IS_ERR(rt))
+ goto out_unlock;
-route_done:
- if (!icmpv4_xrlim_allow(net, rt, type, code))
+ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
goto ende;
/* RFC says return as much as we can without exceeding 576 bytes. */
@@ -627,36 +634,68 @@ route_done:
room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
- icmp_param.data_len = skb_in->len - icmp_param.offset;
- if (icmp_param.data_len > room)
- icmp_param.data_len = room;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data_len = skb_in->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
- icmp_push_reply(&icmp_param, &ipc, &rt);
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
+out_free:
+ kfree(icmp_param);
out:;
}
EXPORT_SYMBOL(icmp_send);
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol; /* taken from the IP header at skb->data */
+ /*
+ * Pass an ICMP message on to raw_icmp_error() and to the err_handler
+ * of the protocol named in the IP header found at skb->data. @info is
+ * handed to both unchanged.
+ */
+ /* Check for the full IP header plus 8 bytes of protocol data to
+ * avoid additional coding at protocol handlers. If that much cannot
+ * be pulled, return without delivering anything.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ return;
+
+ raw_icmp_error(skb, protocol, info);
+
+ rcu_read_lock(); /* covers the rcu_dereference() of inet_protos[] and the handler call */
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->err_handler) /* skip protocols with no entry or no err_handler */
+ ipprot->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
+/*
+ * Report whether the protocol @proto has icmp_strict_tag_validation set.
+ *
+ * @proto is taken from the IP header quoted inside a received ICMP error,
+ * so it can name a protocol with no entry in inet_protos[]. That entry is
+ * then NULL and must not be dereferenced; such protocols are reported as
+ * not doing tag validation (false).
+ */
+static bool icmp_tag_validation(int proto)
+{
+ const struct net_protocol *ipprot;
+ bool ok;
+
+ rcu_read_lock();
+ ipprot = rcu_dereference(inet_protos[proto]);
+ /* Same NULL guard as the err_handler lookup in icmp_socket_deliver(). */
+ ok = ipprot ? ipprot->icmp_strict_tag_validation : false;
+ rcu_read_unlock();
+ return ok;
+}
+
/*
- * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
+ * ICMP_PARAMETERPROB.
*/
static void icmp_unreach(struct sk_buff *skb)
{
- struct iphdr *iph;
+ const struct iphdr *iph;
struct icmphdr *icmph;
- int hash, protocol;
- const struct net_protocol *ipprot;
- u32 info = 0;
struct net *net;
+ u32 info = 0;
net = dev_net(skb_dst(skb)->dev);
@@ -670,7 +709,7 @@ static void icmp_unreach(struct sk_buff *skb)
goto out_err;
icmph = icmp_hdr(skb);
- iph = (struct iphdr *)skb->data;
+ iph = (const struct iphdr *)skb->data;
if (iph->ihl < 5) /* Mangled header, drop. */
goto out_err;
@@ -683,19 +722,27 @@ static void icmp_unreach(struct sk_buff *skb)
case ICMP_PORT_UNREACH:
break;
case ICMP_FRAG_NEEDED:
- if (ipv4_config.no_pmtu_disc) {
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n",
+ /* for documentation of the ip_no_pmtu_disc
+ * values please see
+ * Documentation/networking/ip-sysctl.txt
+ */
+ switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
+ default:
+ LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
&iph->daddr);
- } else {
- info = ip_rt_frag_needed(net, iph,
- ntohs(icmph->un.frag.mtu),
- skb->dev);
- if (!info)
+ break;
+ case 2:
+ goto out;
+ case 3:
+ if (!icmp_tag_validation(iph->protocol))
goto out;
+ /* fall through */
+ case 0:
+ info = ntohs(icmph->un.frag.mtu);
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n",
+ LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"),
&iph->daddr);
break;
default:
@@ -718,7 +765,7 @@ static void icmp_unreach(struct sk_buff *skb)
*/
/*
- * Check the other end isnt violating RFC 1122. Some routers send
+ * Check the other end isn't violating RFC 1122. Some routers send
* bogus responses to broadcast frames. If you see this message
* first check your netmask matches at both ends, if it does then
* get the other vendor to fix their kit.
@@ -726,37 +773,14 @@ static void icmp_unreach(struct sk_buff *skb)
if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
- if (net_ratelimit())
- printk(KERN_WARNING "%pI4 sent an invalid ICMP "
- "type %u, code %u "
- "error to a broadcast: %pI4 on %s\n",
- &ip_hdr(skb)->saddr,
- icmph->type, icmph->code,
- &iph->daddr,
- skb->dev->name);
+ net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+ &ip_hdr(skb)->saddr,
+ icmph->type, icmph->code,
+ &iph->daddr, skb->dev->name);
goto out;
}
- /* Checkin full IP header plus 8 bytes of protocol to
- * avoid additional coding at protocol handlers.
- */
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
- goto out;
-
- iph = (struct iphdr *)skb->data;
- protocol = iph->protocol;
-
- /*
- * Deliver ICMP message to raw sockets. Pretty useless feature?
- */
- raw_icmp_error(skb, protocol, info);
-
- hash = protocol & (MAX_INET_PROTOS - 1);
- rcu_read_lock();
- ipprot = rcu_dereference(inet_protos[hash]);
- if (ipprot && ipprot->err_handler)
- ipprot->err_handler(skb, info);
- rcu_read_unlock();
+ icmp_socket_deliver(skb, info);
out:
return;
@@ -772,37 +796,15 @@ out_err:
static void icmp_redirect(struct sk_buff *skb)
{
- struct iphdr *iph;
-
- if (skb->len < sizeof(struct iphdr))
- goto out_err;
+ if (skb->len < sizeof(struct iphdr)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ return;
+ }
- /*
- * Get the copied header of the packet that caused the redirect
- */
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
-
- iph = (struct iphdr *)skb->data;
+ return;
- switch (icmp_hdr(skb)->code & 7) {
- case ICMP_REDIR_NET:
- case ICMP_REDIR_NETTOS:
- /*
- * As per RFC recommendations now handle it as a host redirect.
- */
- case ICMP_REDIR_HOST:
- case ICMP_REDIR_HOSTTOS:
- ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
- icmp_hdr(skb)->un.gateway,
- iph->saddr, skb->dev);
- break;
- }
-out:
- return;
-out_err:
- ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
- goto out;
+ icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
}
/*
@@ -876,87 +878,6 @@ out_err:
goto out;
}
-
-/*
- * Handle ICMP_ADDRESS_MASK requests. (RFC950)
- *
- * RFC1122 (3.2.2.9). A host MUST only send replies to
- * ADDRESS_MASK requests if it's been configured as an address mask
- * agent. Receiving a request doesn't constitute implicit permission to
- * act as one. Of course, implementing this correctly requires (SHOULD)
- * a way to turn the functionality on and off. Another one for sysctl(),
- * I guess. -- MS
- *
- * RFC1812 (4.3.3.9). A router MUST implement it.
- * A router SHOULD have switch turning it on/off.
- * This switch MUST be ON by default.
- *
- * Gratuitous replies, zero-source replies are not implemented,
- * that complies with RFC. DO NOT implement them!!! All the idea
- * of broadcast addrmask replies as specified in RFC950 is broken.
- * The problem is that it is not uncommon to have several prefixes
- * on one physical interface. Moreover, addrmask agent can even be
- * not aware of existing another prefixes.
- * If source is zero, addrmask agent cannot choose correct prefix.
- * Gratuitous mask announcements suffer from the same problem.
- * RFC1812 explains it, but still allows to use ADDRMASK,
- * that is pretty silly. --ANK
- *
- * All these rules are so bizarre, that I removed kernel addrmask
- * support at all. It is wrong, it is obsolete, nobody uses it in
- * any case. --ANK
- *
- * Furthermore you can do it with a usermode address agent program
- * anyway...
- */
-
-static void icmp_address(struct sk_buff *skb)
-{
-#if 0
- if (net_ratelimit())
- printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
-#endif
-}
-
-/*
- * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
- * loudly if an inconsistency is found.
- * called with rcu_read_lock()
- */
-
-static void icmp_address_reply(struct sk_buff *skb)
-{
- struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = skb->dev;
- struct in_device *in_dev;
- struct in_ifaddr *ifa;
-
- if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
- return;
-
- in_dev = __in_dev_get_rcu(dev);
- if (!in_dev)
- return;
-
- if (in_dev->ifa_list &&
- IN_DEV_LOG_MARTIANS(in_dev) &&
- IN_DEV_FORWARD(in_dev)) {
- __be32 _mask, *mp;
-
- mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
- BUG_ON(mp == NULL);
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
- if (*mp == ifa->ifa_mask &&
- inet_ifa_match(rt->rt_src, ifa))
- break;
- }
- if (!ifa && net_ratelimit()) {
- printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
- mp, dev->name, &rt->rt_src);
- }
- }
-}
-
static void icmp_discard(struct sk_buff *skb)
{
}
@@ -992,16 +913,8 @@ int icmp_rcv(struct sk_buff *skb)
ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto error;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto csum_error;
if (!pskb_pull(skb, sizeof(*icmph)))
goto error;
@@ -1048,17 +961,43 @@ int icmp_rcv(struct sk_buff *skb)
drop:
kfree_skb(skb);
return 0;
+csum_error:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);
error:
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
goto drop;
}
+void icmp_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ int offset = iph->ihl<<2; /* length in bytes of the IP header at skb->data */
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); /* ICMP header right after that IP header */
+ int type = icmp_hdr(skb)->type; /* NOTE(review): presumably the header of the received ICMP error, not icmph - confirm */
+ int code = icmp_hdr(skb)->code;
+ struct net *net = dev_net(skb->dev);
+ /*
+ * Handle an ICMP error that concerns an ICMP packet.
+ * NOTE(review): presumably registered as the err_handler for
+ * IPPROTO_ICMP - the registration is not visible here, confirm.
+ */
+ /*
+ * Use ping_err to handle all icmp errors except those
+ * triggered by an ICMP_ECHOREPLY, which is sent by the kernel.
+ */
+ if (icmph->type != ICMP_ECHOREPLY) {
+ ping_err(skb, offset, info);
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) /* @info is passed on as the new path MTU */
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ else if (type == ICMP_REDIRECT)
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+}
+
/*
* This table is the definition of how we handle ICMP.
*/
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
[ICMP_ECHOREPLY] = {
- .handler = icmp_discard,
+ .handler = ping_rcv,
},
[1] = {
.handler = icmp_discard,
@@ -1120,10 +1059,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
.handler = icmp_discard,
},
[ICMP_ADDRESS] = {
- .handler = icmp_address,
+ .handler = icmp_discard,
},
[ICMP_ADDRESSREPLY] = {
- .handler = icmp_address_reply,
+ .handler = icmp_discard,
},
};
@@ -1157,10 +1096,9 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.icmp_sk[i] = sk;
/* Enough space for 2 64K ICMP packets, including
- * sk_buff struct overhead.
+ * sk_buff/skb_shared_info struct overhead.
*/
- sk->sk_sndbuf =
- (2 * ((64 * 1024) + sizeof(struct sk_buff)));
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
/*
* Speedup sock_wfree()
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e0e77e297de..db710b059ba 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -73,7 +73,6 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -89,6 +88,7 @@
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
+#include <linux/pkt_sched.h>
#include <net/net_namespace.h>
#include <net/arp.h>
@@ -114,7 +114,8 @@
#define IGMP_V1_Router_Present_Timeout (400*HZ)
#define IGMP_V2_Router_Present_Timeout (400*HZ)
-#define IGMP_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_V2_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_V3_Unsolicited_Report_Interval (1*HZ)
#define IGMP_Query_Response_Interval (10*HZ)
#define IGMP_Unsolicited_Report_Count 2
@@ -139,6 +140,29 @@
((in_dev)->mr_v2_seen && \
time_before(jiffies, (in_dev)->mr_v2_seen)))
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+ int interval_ms, interval_jiffies;
+ /*
+ * Return the unsolicited report interval for in_dev, in jiffies.
+ * The device's IGMPv2 setting is used while IGMP_V1_SEEN() or
+ * IGMP_V2_SEEN() holds for it, the IGMPv3 setting otherwise. The
+ * setting is converted from milliseconds; the result is at least 1.
+ */
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+ else /* v3 */
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+ interval_jiffies = msecs_to_jiffies(interval_ms);
+
+ /* _timer functions can't handle a delay of 0 jiffies so ensure
+ * we always return a positive value. The timer helpers in this
+ * file also use the value as a modulus (prandom_u32() % delay),
+ * which must not be zero either.
+ */
+ if (interval_jiffies <= 0)
+ interval_jiffies = 1;
+ return interval_jiffies;
+}
+
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
static void igmpv3_clear_delrec(struct in_device *in_dev);
@@ -149,17 +173,11 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc);
static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
int sfcount, __be32 *psfsrc, int delta);
-
-static void ip_mc_list_reclaim(struct rcu_head *head)
-{
- kfree(container_of(head, struct ip_mc_list, rcu));
-}
-
static void ip_ma_put(struct ip_mc_list *im)
{
if (atomic_dec_and_test(&im->refcnt)) {
in_dev_put(im->interface);
- call_rcu(&im->rcu, ip_mc_list_reclaim);
+ kfree_rcu(im, rcu);
}
}
@@ -193,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im)
/* It must be called with locked im->lock */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
- int tv = net_random() % max_delay;
+ int tv = prandom_u32() % max_delay;
im->tm_running = 1;
if (!mod_timer(&im->timer, jiffies+tv+2))
@@ -202,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = net_random() % in_dev->mr_maxdelay;
+ int tv = prandom_u32() % in_dev->mr_maxdelay;
in_dev->mr_gq_running = 1;
if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
@@ -211,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev)
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
- int tv = net_random() % delay;
+ int tv = prandom_u32() % delay;
if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
in_dev_hold(in_dev);
@@ -292,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
struct ip_sf_list *psf;
int scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
@@ -309,9 +327,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
struct iphdr *pip;
struct igmpv3_report *pig;
struct net *net = dev_net(dev);
+ struct flowi4 fl4;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
while (1) {
- skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
+ skb = alloc_skb(size + hlen + tlen,
GFP_ATOMIC | __GFP_NOWARN);
if (skb)
break;
@@ -319,27 +340,21 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
if (size < 256)
return NULL;
}
+ skb->priority = TC_PRIO_CONTROL;
igmp_skb_size(skb) = size;
- {
- struct flowi fl = { .oif = dev->ifindex,
- .fl4_dst = IGMPV3_ALL_MCR,
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(net, &rt, &fl)) {
- kfree_skb(skb);
- return NULL;
- }
- }
- if (rt->rt_src == 0) {
+ rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt)) {
kfree_skb(skb);
- ip_rt_put(rt);
return NULL;
}
skb_dst_set(skb, &rt->dst);
skb->dev = dev;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
pip = ip_hdr(skb);
@@ -350,15 +365,15 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->tos = 0xc0;
pip->frag_off = htons(IP_DF);
pip->ttl = 1;
- pip->daddr = rt->rt_dst;
- pip->saddr = rt->rt_src;
+ pip->daddr = fl4.daddr;
+ pip->saddr = fl4.saddr;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(pip, &rt->dst, NULL);
- ((u8*)&pip[1])[0] = IPOPT_RA;
- ((u8*)&pip[1])[1] = 4;
- ((u8*)&pip[1])[2] = 0;
- ((u8*)&pip[1])[3] = 0;
+ ip_select_ident(skb, NULL);
+ ((u8 *)&pip[1])[0] = IPOPT_RA;
+ ((u8 *)&pip[1])[1] = 4;
+ ((u8 *)&pip[1])[2] = 0;
+ ((u8 *)&pip[1])[3] = 0;
skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
skb_put(skb, sizeof(*pig));
@@ -374,7 +389,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
static int igmpv3_sendpack(struct sk_buff *skb)
{
struct igmphdr *pig = igmp_hdr(skb);
- const int igmplen = skb->tail - skb->transport_header;
+ const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
@@ -448,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
}
first = 1;
psf_prev = NULL;
- for (psf=*psf_list; psf; psf=psf_next) {
+ for (psf = *psf_list; psf; psf = psf_next) {
__be32 *psrc;
psf_next = psf->sf_next;
@@ -505,7 +520,7 @@ empty_source:
return skb;
if (pmc->crcount || isquery) {
/* make sure we have room for group header */
- if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) {
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
@@ -561,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
struct ip_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
- for (psf=*ppsf; psf; psf = psf_next) {
+ for (psf = *ppsf; psf; psf = psf_next) {
psf_next = psf->sf_next;
if (psf->sf_crcount == 0) {
if (psf_prev)
@@ -585,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
/* deleted MCA's */
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
pmc_next = pmc->next;
if (pmc->sfmode == MCAST_INCLUDE) {
type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -657,7 +672,9 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
struct net_device *dev = in_dev->dev;
struct net *net = dev_net(dev);
__be32 group = pmc ? pmc->multiaddr : 0;
+ struct flowi4 fl4;
__be32 dst;
+ int hlen, tlen;
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
@@ -666,27 +683,24 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
else
dst = group;
- {
- struct flowi fl = { .oif = dev->ifindex,
- .fl4_dst = dst,
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(net, &rt, &fl))
- return -1;
- }
- if (rt->rt_src == 0) {
- ip_rt_put(rt);
+ rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt))
return -1;
- }
- skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
if (skb == NULL) {
ip_rt_put(rt);
return -1;
}
+ skb->priority = TC_PRIO_CONTROL;
skb_dst_set(skb, &rt->dst);
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
iph = ip_hdr(skb);
@@ -698,13 +712,13 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->daddr = dst;
- iph->saddr = rt->rt_src;
+ iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
- ip_select_ident(iph, &rt->dst, NULL);
- ((u8*)&iph[1])[0] = IPOPT_RA;
- ((u8*)&iph[1])[1] = 4;
- ((u8*)&iph[1])[2] = 0;
- ((u8*)&iph[1])[3] = 0;
+ ip_select_ident(skb, NULL);
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
ih->type = type;
@@ -722,7 +736,7 @@ static void igmp_gq_timer_expire(unsigned long data)
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_timer_expire(unsigned long data)
@@ -732,9 +746,10 @@ static void igmp_ifc_timer_expire(unsigned long data)
igmpv3_send_cr(in_dev);
if (in_dev->mr_ifc_count) {
in_dev->mr_ifc_count--;
- igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+ igmp_ifc_start_timer(in_dev,
+ unsolicited_report_interval(in_dev));
}
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_event(struct in_device *in_dev)
@@ -749,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev)
static void igmp_timer_expire(unsigned long data)
{
- struct ip_mc_list *im=(struct ip_mc_list *)data;
+ struct ip_mc_list *im = (struct ip_mc_list *)data;
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
@@ -757,7 +772,7 @@ static void igmp_timer_expire(unsigned long data)
if (im->unsolicit_count) {
im->unsolicit_count--;
- igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ igmp_start_timer(im, unsolicited_report_interval(in_dev));
}
im->reporter = 1;
spin_unlock(&im->lock);
@@ -779,15 +794,15 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
int i, scount;
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++) {
+ for (i = 0; i < nsrcs; i++) {
/* skip inactive filters */
- if (pmc->sfcount[MCAST_INCLUDE] ||
+ if (psf->sf_count[MCAST_INCLUDE] ||
pmc->sfcount[MCAST_EXCLUDE] !=
psf->sf_count[MCAST_EXCLUDE])
- continue;
+ break;
if (srcs[i] == psf->sf_inaddr) {
scount++;
break;
@@ -810,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
/* mark INCLUDE-mode sources */
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++)
+ for (i = 0; i < nsrcs; i++)
if (srcs[i] == psf->sf_inaddr) {
psf->sf_gsresp = 1;
scount++;
@@ -828,14 +843,15 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
return 1;
}
-static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
struct ip_mc_list *im;
/* Timers are only set for non-local groups */
if (group == IGMP_ALL_HOSTS)
- return;
+ return false;
rcu_read_lock();
for_each_pmc_rcu(in_dev, im) {
@@ -845,9 +861,11 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
}
}
rcu_read_unlock();
+ return false;
}
-static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
int len)
{
struct igmphdr *ih = igmp_hdr(skb);
@@ -879,7 +897,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
/* clear deleted report items */
igmpv3_clear_delrec(in_dev);
} else if (len < 12) {
- return; /* ignore bogus packet; freed by caller */
+ return true; /* ignore bogus packet; freed by caller */
} else if (IGMP_V1_SEEN(in_dev)) {
/* This is a v3 query with v1 queriers present */
max_delay = IGMP_Query_Response_Interval;
@@ -892,15 +910,17 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
* to be intended in a v3 query.
*/
max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
} else { /* v3 */
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
- return;
+ return true;
ih3 = igmpv3_query_hdr(skb);
if (ih3->nsrcs) {
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ ntohs(ih3->nsrcs)*sizeof(__be32)))
- return;
+ return true;
ih3 = igmpv3_query_hdr(skb);
}
@@ -912,9 +932,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
in_dev->mr_qrv = ih3->qrv;
if (!group) { /* general query */
if (ih3->nsrcs)
- return; /* no sources allowed */
+ return false; /* no sources allowed */
igmp_gq_start_timer(in_dev);
- return;
+ return false;
}
/* mark sources to include, if group & source-specific */
mark = ih3->nsrcs != 0;
@@ -950,6 +970,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
igmp_mod_timer(im, max_delay);
}
rcu_read_unlock();
+ return false;
}
/* called in rcu_read_lock() section */
@@ -959,6 +980,7 @@ int igmp_rcv(struct sk_buff *skb)
struct igmphdr *ih;
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
int len = skb->len;
+ bool dropped = true;
if (in_dev == NULL)
goto drop;
@@ -966,21 +988,13 @@ int igmp_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
goto drop;
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto drop;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto drop;
ih = igmp_hdr(skb);
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_QUERY:
- igmp_heard_query(in_dev, skb, len);
+ dropped = igmp_heard_query(in_dev, skb, len);
break;
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
@@ -990,7 +1004,7 @@ int igmp_rcv(struct sk_buff *skb)
/* don't rely on MC router hearing unicast reports */
if (skb->pkt_type == PACKET_MULTICAST ||
skb->pkt_type == PACKET_BROADCAST)
- igmp_heard_report(in_dev, ih->group);
+ dropped = igmp_heard_report(in_dev, ih->group);
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
@@ -1008,7 +1022,10 @@ int igmp_rcv(struct sk_buff *skb)
}
drop:
- kfree_skb(skb);
+ if (dropped)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
return 0;
}
@@ -1026,7 +1043,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
We will get multicast token leakage, when IFF_MULTICAST
- is changed. This check should be done in dev->set_multicast_list
+ is changed. This check should be done in ndo_set_rx_mode
routine. Something sort of:
if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
--ANK
@@ -1078,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->tomb = im->tomb;
pmc->sources = im->sources;
im->tomb = im->sources = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = pmc->crcount;
}
spin_unlock_bh(&im->lock);
@@ -1096,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
if (pmc->multiaddr == multiaddr)
break;
pmc_prev = pmc;
@@ -1109,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
if (pmc) {
- for (psf=pmc->tomb; psf; psf=psf_next) {
+ for (psf = pmc->tomb; psf; psf = psf_next) {
psf_next = psf->sf_next;
kfree(psf);
}
@@ -1142,7 +1159,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
psf = pmc->tomb;
pmc->tomb = NULL;
spin_unlock_bh(&pmc->lock);
- for (; psf; psf=psf_next) {
+ for (; psf; psf = psf_next) {
psf_next = psf->sf_next;
kfree(psf);
}
@@ -1172,20 +1189,18 @@ static void igmp_group_dropped(struct ip_mc_list *im)
if (!in_dev->dead) {
if (IGMP_V1_SEEN(in_dev))
- goto done;
+ return;
if (IGMP_V2_SEEN(in_dev)) {
if (reporter)
igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
- goto done;
+ return;
}
/* IGMPv3 */
igmpv3_add_delrec(in_dev, im);
igmp_ifc_event(in_dev);
}
-done:
#endif
- ip_mc_clear_src(im);
}
static void igmp_group_added(struct ip_mc_list *im)
@@ -1222,6 +1237,57 @@ static void igmp_group_added(struct ip_mc_list *im)
* Multicast list managers
*/
+static u32 ip_mc_hash(const struct ip_mc_list *im)
+{
+ return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
+}
+
+static void ip_mc_hash_add(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash;
+ u32 hash;
+
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ rcu_assign_pointer(mc_hash[hash], im);
+ return;
+ }
+
+ /* do not use a hash table for small number of items */
+ if (in_dev->mc_count < 4)
+ return;
+
+ mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
+ GFP_KERNEL);
+ if (!mc_hash)
+ return;
+
+ for_each_pmc_rtnl(in_dev, im) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ RCU_INIT_POINTER(mc_hash[hash], im);
+ }
+
+ rcu_assign_pointer(in_dev->mc_hash, mc_hash);
+}
+
+static void ip_mc_hash_remove(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
+ struct ip_mc_list *aux;
+
+ if (!mc_hash)
+ return;
+ mc_hash += ip_mc_hash(im);
+ while ((aux = rtnl_dereference(*mc_hash)) != im)
+ mc_hash = &aux->next_hash;
+ *mc_hash = im->next_hash;
+}
+
/*
* A socket has joined a multicast group on device dev.
@@ -1263,6 +1329,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
in_dev->mc_count++;
rcu_assign_pointer(in_dev->mc_list, im);
+ ip_mc_hash_add(in_dev, im);
+
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, im->multiaddr);
#endif
@@ -1275,16 +1343,17 @@ out:
EXPORT_SYMBOL(ip_mc_inc_group);
/*
- * Resend IGMP JOIN report; used for bonding.
- * Called with rcu_read_lock()
+ * Resend IGMP JOIN report; used by netdev notifier.
*/
-void ip_mc_rejoin_groups(struct in_device *in_dev)
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
{
#ifdef CONFIG_IP_MULTICAST
struct ip_mc_list *im;
int type;
- for_each_pmc_rcu(in_dev, im) {
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
@@ -1301,7 +1370,6 @@ void ip_mc_rejoin_groups(struct in_device *in_dev)
}
#endif
}
-EXPORT_SYMBOL(ip_mc_rejoin_groups);
/*
* A socket has left a multicast group on device dev
@@ -1319,9 +1387,11 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
ip = &i->next_rcu) {
if (i->multiaddr == addr) {
if (--i->users == 0) {
+ ip_mc_hash_remove(in_dev, i);
*ip = i->next_rcu;
in_dev->mc_count--;
igmp_group_dropped(i);
+ ip_mc_clear_src(i);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
@@ -1385,13 +1455,9 @@ void ip_mc_init_dev(struct in_device *in_dev)
{
ASSERT_RTNL();
- in_dev->mc_tomb = NULL;
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_gq_running = 0;
setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
(unsigned long)in_dev);
- in_dev->mr_ifc_count = 0;
- in_dev->mc_count = 0;
setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
(unsigned long)in_dev);
in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
@@ -1431,7 +1497,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
in_dev->mc_list = i->next_rcu;
in_dev->mc_count--;
- igmp_group_dropped(i);
+ /* We've dropped the groups in ip_mc_down already */
+ ip_mc_clear_src(i);
ip_ma_put(i);
}
}
@@ -1439,8 +1506,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
/* RTNL is locked */
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
- struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
- struct rtable *rt;
struct net_device *dev = NULL;
struct in_device *idev = NULL;
@@ -1454,9 +1519,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
return NULL;
}
- if (!dev && !ip_route_output_key(net, &rt, &fl)) {
- dev = rt->dst.dev;
- ip_rt_put(rt);
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net,
+ imr->imr_multiaddr.s_addr,
+ 0, 0, 0);
+ if (!IS_ERR(rt)) {
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
@@ -1479,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
int rv = 0;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1552,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->sfcount[sfmode]--;
}
err = 0;
- for (i=0; i<sfcount; i++) {
+ for (i = 0; i < sfcount; i++) {
int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
changerec |= rv > 0;
@@ -1572,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
IGMP_Unsolicited_Report_Count;
in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(pmc->interface);
} else if (sf_setstate(pmc) || changerec) {
@@ -1588,12 +1658,12 @@ out_unlock:
* Add multicast single-source filter to the interface list
*/
static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
- __be32 *psfsrc, int delta)
+ __be32 *psfsrc)
{
struct ip_sf_list *psf, *psf_prev;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1621,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc)
struct ip_sf_list *psf;
int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
if (pmc->sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
@@ -1638,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
int new_in, rv;
rv = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (pmc->sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
@@ -1648,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
if (!psf->sf_oldin) {
struct ip_sf_list *prev = NULL;
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) {
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
prev = dpsf;
@@ -1670,7 +1740,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
* add or update "delete" records if an active filter
* is now inactive
*/
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next)
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
if (!dpsf) {
@@ -1722,17 +1792,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!delta)
pmc->sfcount[sfmode]++;
err = 0;
- for (i=0; i<sfcount; i++) {
- err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+ for (i = 0; i < sfcount; i++) {
+ err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
if (err)
break;
}
if (err) {
int j;
- pmc->sfcount[sfmode]--;
- for (j=0; j<i; j++)
- (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+ if (!delta)
+ pmc->sfcount[sfmode]--;
+ for (j = 0; j < i; j++)
+ (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
@@ -1750,7 +1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
IGMP_Unsolicited_Report_Count;
in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(in_dev);
} else if (sf_setstate(pmc)) {
@@ -1765,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
struct ip_sf_list *psf, *nextpsf;
- for (psf=pmc->tomb; psf; psf=nextpsf) {
+ for (psf = pmc->tomb; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
pmc->tomb = NULL;
- for (psf=pmc->sources; psf; psf=nextpsf) {
+ for (psf = pmc->sources; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
@@ -1836,12 +1907,6 @@ done:
}
EXPORT_SYMBOL(ip_mc_join_group);
-static void ip_sf_socklist_reclaim(struct rcu_head *rp)
-{
- kfree(container_of(rp, struct ip_sf_socklist, rcu));
- /* sk_omem_alloc should have been decreased by the caller*/
-}
-
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
@@ -1855,21 +1920,13 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
}
err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, psf->sl_count, psf->sl_addr, 0);
- rcu_assign_pointer(iml->sflist, NULL);
+ RCU_INIT_POINTER(iml->sflist, NULL);
/* decrease mem now to avoid the memleak warning */
atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
- call_rcu(&psf->rcu, ip_sf_socklist_reclaim);
+ kfree_rcu(psf, rcu);
return err;
}
-
-static void ip_mc_socklist_reclaim(struct rcu_head *rp)
-{
- kfree(container_of(rp, struct ip_mc_socklist, rcu));
- /* sk_omem_alloc should have been decreased by the caller*/
-}
-
-
/*
* Ask a socket to leave a group.
*/
@@ -1887,6 +1944,10 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
rtnl_lock();
in_dev = ip_mc_find_dev(net, imr);
+ if (!in_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
ifindex = imr->imr_ifindex;
for (imlp = &inet->mc_list;
(iml = rtnl_dereference(*imlp)) != NULL;
@@ -1904,19 +1965,18 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
- if (in_dev)
- ip_mc_dec_group(in_dev, group);
+ ip_mc_dec_group(in_dev, group);
rtnl_unlock();
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
- call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
+ kfree_rcu(iml, rcu);
return 0;
}
- if (!in_dev)
- ret = -ENODEV;
+out:
rtnl_unlock();
return ret;
}
+EXPORT_SYMBOL(ip_mc_leave_group);
int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mreq_source *mreqs, int ifindex)
@@ -1977,7 +2037,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -1996,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
- for (j=i+1; j<psl->sl_count; j++)
+ for (j = i+1; j < psl->sl_count; j++)
psl->sl_addr[j-1] = psl->sl_addr[j];
psl->sl_count--;
err = 0;
@@ -2022,17 +2082,17 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
newpsl->sl_max = count;
newpsl->sl_count = count - IP_SFBLOCK;
if (psl) {
- for (i=0; i<psl->sl_count; i++)
+ for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
/* decrease mem now to avoid the memleak warning */
atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
- call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
+ kfree_rcu(psl, rcu);
}
rcu_assign_pointer(pmc->sflist, newpsl);
psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -2040,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
if (rv == 0) /* address already there is an error */
goto done;
- for (j=psl->sl_count-1; j>=i; j--)
+ for (j = psl->sl_count-1; j >= i; j--)
psl->sl_addr[j+1] = psl->sl_addr[j];
psl->sl_addr[i] = mreqs->imr_sourceaddr;
psl->sl_count++;
@@ -2127,7 +2187,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
psl->sl_count, psl->sl_addr, 0);
/* decrease mem now to avoid the memleak warning */
atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
- call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
+ kfree_rcu(psl, rcu);
} else
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
@@ -2239,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
return -EFAULT;
}
- for (i=0; i<copycount; i++) {
+ for (i = 0; i < copycount; i++) {
struct sockaddr_storage ss;
psin = (struct sockaddr_in *)&ss;
@@ -2284,7 +2344,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
if (!psl)
goto unlock;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
if (psl->sl_addr[i] == rmt_addr)
break;
}
@@ -2324,27 +2384,40 @@ void ip_mc_drop_socket(struct sock *sk)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
- call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
+ kfree_rcu(iml, rcu);
}
rtnl_unlock();
}
-int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+/* called with rcu_read_lock() */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
{
struct ip_mc_list *im;
+ struct ip_mc_list __rcu **mc_hash;
struct ip_sf_list *psf;
int rv = 0;
- rcu_read_lock();
- for_each_pmc_rcu(in_dev, im) {
- if (im->multiaddr == mc_addr)
- break;
+ mc_hash = rcu_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
+
+ for (im = rcu_dereference(mc_hash[hash]);
+ im != NULL;
+ im = rcu_dereference(im->next_hash)) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ } else {
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
}
if (im && proto == IPPROTO_IGMP) {
rv = 1;
} else if (im) {
if (src_addr) {
- for (psf=im->sources; psf; psf=psf->sf_next) {
+ for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
}
@@ -2357,7 +2430,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
} else
rv = 1; /* unspecified source; tentatively allow */
}
- rcu_read_unlock();
return rv;
}
@@ -2457,6 +2529,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
struct ip_mc_list *im = (struct ip_mc_list *)v;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
char *querier;
+ long delta;
+
#ifdef CONFIG_IP_MULTICAST
querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2470,11 +2544,12 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
}
+ delta = im->timer.expires - jiffies;
seq_printf(seq,
"\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
im->multiaddr, im->users,
- im->tm_running, im->tm_running ?
- jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+ im->tm_running,
+ im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
im->reporter);
}
return 0;
@@ -2656,33 +2731,72 @@ static int __net_init igmp_net_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops);
+ pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
if (!pde)
goto out_igmp;
- pde = proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+ &igmp_mcf_seq_fops);
if (!pde)
goto out_mcfilter;
return 0;
out_mcfilter:
- proc_net_remove(net, "igmp");
+ remove_proc_entry("igmp", net->proc_net);
out_igmp:
return -ENOMEM;
}
static void __net_exit igmp_net_exit(struct net *net)
{
- proc_net_remove(net, "mcfilter");
- proc_net_remove(net, "igmp");
+ remove_proc_entry("mcfilter", net->proc_net);
+ remove_proc_entry("igmp", net->proc_net);
}
static struct pernet_operations igmp_net_ops = {
.init = igmp_net_init,
.exit = igmp_net_exit,
};
+#endif
-int __init igmp_mc_proc_init(void)
+static int igmp_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
{
- return register_pernet_subsys(&igmp_net_ops);
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+
+ switch (event) {
+ case NETDEV_RESEND_IGMP:
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev)
+ ip_mc_rejoin_groups(in_dev);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
}
+
+static struct notifier_block igmp_notifier = {
+ .notifier_call = igmp_netdev_event,
+};
+
+int __init igmp_mc_init(void)
+{
+#if defined(CONFIG_PROC_FS)
+ int err;
+
+ err = register_pernet_subsys(&igmp_net_ops);
+ if (err)
+ return err;
+ err = register_netdevice_notifier(&igmp_notifier);
+ if (err)
+ goto reg_notif_fail;
+ return 0;
+
+reg_notif_fail:
+ unregister_pernet_subsys(&igmp_net_ops);
+ return err;
+#else
+ return register_netdevice_notifier(&igmp_notifier);
#endif
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 06f5f8f482f..14d02ea905b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,36 +29,26 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
-/*
- * This struct holds the first and last local port number.
- */
-struct local_ports sysctl_local_ports __read_mostly = {
- .lock = SEQLOCK_UNLOCKED,
- .range = { 32768, 61000 },
-};
-
-unsigned long *sysctl_local_reserved_ports;
-EXPORT_SYMBOL(sysctl_local_reserved_ports);
-
-void inet_get_local_port_range(int *low, int *high)
+void inet_get_local_port_range(struct net *net, int *low, int *high)
{
- unsigned seq;
+ unsigned int seq;
+
do {
- seq = read_seqbegin(&sysctl_local_ports.lock);
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
- *low = sysctl_local_ports.range[0];
- *high = sysctl_local_ports.range[1];
- } while (read_seqretry(&sysctl_local_ports.lock, seq));
+ *low = net->ipv4.ip_local_ports.range[0];
+ *high = net->ipv4.ip_local_ports.range[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);
int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
+ const struct inet_bind_bucket *tb, bool relax)
{
- const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
struct sock *sk2;
- struct hlist_node *node;
int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
/*
* Unlike other sk lookup places we do not check
@@ -67,22 +57,32 @@ int inet_csk_bind_conflict(const struct sock *sk,
* one this bucket belongs to.
*/
- sk_for_each_bound(sk2, node, &tb->owners) {
+ sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
- const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr ||
- sk2_rcv_saddr == sk_rcv_saddr)
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2))))) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ break;
+ }
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
}
}
- return node != NULL;
+ return sk2 != NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
@@ -93,42 +93,49 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
- struct hlist_node *node;
struct inet_bind_bucket *tb;
int ret, attempts = 5;
struct net *net = sock_net(sk);
int smallest_size = -1, smallest_rover;
+ kuid_t uid = sock_i_uid(sk);
local_bh_disable();
if (!snum) {
int remaining, rover, low, high;
again:
- inet_get_local_port_range(&low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- smallest_rover = rover = net_random() % remaining + low;
+ smallest_rover = rover = prandom_u32() % remaining + low;
smallest_size = -1;
do {
- if (inet_is_reserved_local_port(rover))
+ if (inet_is_local_reserved_port(net, rover))
goto next_nolock;
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN &&
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || smallest_size == -1)) {
smallest_size = tb->num_owners;
smallest_rover = rover;
- if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
- spin_unlock(&head->lock);
+ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+ !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
snum = smallest_rover;
- goto have_snum;
+ goto tb_found;
}
}
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+ snum = rover;
+ goto tb_found;
+ }
goto next;
}
break;
@@ -162,7 +169,7 @@ have_snum:
head = &hashinfo->bhash[inet_bhashfn(net, snum,
hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
}
@@ -170,18 +177,26 @@ have_snum:
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) {
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (sk->sk_reuse == SK_FORCE_REUSE)
+ goto success;
+
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size == -1) {
goto success;
} else {
ret = 1;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
}
+
goto fail_unlock;
}
}
@@ -196,9 +211,19 @@ tb_not_found:
tb->fastreuse = 1;
else
tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else
+ tb->fastreuseport = 0;
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ tb->fastreuseport = 0;
+ }
success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
@@ -267,7 +292,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct sock *newsk;
+ struct request_sock *req;
int error;
lock_sock(sk);
@@ -280,7 +307,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
goto out_err;
/* Find already established connection */
- if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+ if (reqsk_queue_empty(queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
@@ -292,14 +319,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
}
-
- newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
- WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+ req = reqsk_queue_remove(queue);
+ newsk = req->sk;
+
+ sk_acceptq_removed(sk);
+ if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+ spin_lock_bh(&queue->fastopenq->lock);
+ if (tcp_rsk(req)->listener) {
+ /* We are still waiting for the final ACK from 3WHS
+ * so can't free req now. Instead, we set req->sk to
+ * NULL to signify that the child socket is taken
+ * so reqsk_fastopen_remove() will free the req
+ * when 3WHS finishes (or is aborted).
+ */
+ req->sk = NULL;
+ req = NULL;
+ }
+ spin_unlock_bh(&queue->fastopenq->lock);
+ }
out:
release_sock(sk);
+ if (req)
+ __reqsk_free(req);
return newsk;
out_err:
newsk = NULL;
+ req = NULL;
*err = error;
goto out;
}
@@ -351,27 +396,26 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(struct sock *sk,
+ struct flowi4 *fl4,
const struct request_sock *req)
{
struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
- struct ip_options *opt = inet_rsk(req)->opt;
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .mark = sk->sk_mark,
- .fl4_dst = ((opt && opt->srr) ?
- opt->faddr : ireq->rmt_addr),
- .fl4_src = ireq->loc_addr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet_sk(sk)->inet_sport,
- .fl_ip_dport = ireq->rmt_port };
+ struct ip_options_rcu *opt = inet_rsk(req)->opt;
struct net *net = sock_net(sk);
-
- security_req_classify_flow(req, &fl);
- if (ip_route_output_flow(net, &rt, &fl, sk, 0))
+ int flags = inet_sk_flowi_flags(sk);
+
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol,
+ flags,
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
goto no_route;
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
return &rt->dst;
@@ -383,13 +427,51 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
+struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+ struct sock *newsk,
+ const struct request_sock *req)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_sock *newinet = inet_sk(newsk);
+ struct ip_options_rcu *opt;
+ struct net *net = sock_net(sk);
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ fl4 = &newinet->cork.fl.u.ip4;
+
+ rcu_read_lock();
+ opt = rcu_dereference(newinet->inet_opt);
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ rcu_read_unlock();
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ rcu_read_unlock();
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
const u32 rnd, const u32 synq_hsize)
{
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) 1
@@ -410,9 +492,9 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
prev = &req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
- if (ireq->rmt_port == rport &&
- ireq->rmt_addr == raddr &&
- ireq->loc_addr == laddr &&
+ if (ireq->ir_rmt_port == rport &&
+ ireq->ir_rmt_addr == raddr &&
+ ireq->ir_loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
WARN_ON(req->sk);
*prevp = prev;
@@ -429,7 +511,8 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+ const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
+ inet_rsk(req)->ir_rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
@@ -448,21 +531,31 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
int *expire, int *resend)
{
if (!rskq_defer_accept) {
- *expire = req->retrans >= thresh;
+ *expire = req->num_timeout >= thresh;
*resend = 1;
return;
}
- *expire = req->retrans >= thresh &&
- (!inet_rsk(req)->acked || req->retrans >= max_retries);
+ *expire = req->num_timeout >= thresh &&
+ (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
/*
* Do not resend while waiting for data after ACK,
* start to resend on end of deferring period to give
* last chance for data or ACK to create established socket.
*/
*resend = !inet_rsk(req)->acked ||
- req->retrans >= rskq_defer_accept - 1;
+ req->num_timeout >= rskq_defer_accept - 1;
}
+/*
+ * Retransmit a SYN-ACK for @req through the request's rtx_syn_ack
+ * operation.  The request's num_retrans counter is only incremented when
+ * the operation reports success (0).  Returns the operation's result.
+ */
+int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+{
+	int rc;
+
+	rc = req->rsk_ops->rtx_syn_ack(parent, req);
+	if (rc == 0)
+		req->num_retrans++;
+
+	return rc;
+}
+EXPORT_SYMBOL(inet_rtx_syn_ack);
+
void inet_csk_reqsk_queue_prune(struct sock *parent,
const unsigned long interval,
const unsigned long timeout,
@@ -482,7 +575,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
+ * If synack was not acknowledged for 1 second, it means
* one of the following things: synack was lost, ack was lost,
* rtt is high or nobody planned to ack (i.e. synflood).
* When server is a bit loaded, queue is populated with old
@@ -523,17 +616,17 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
syn_ack_recalc(req, thresh, max_retries,
queue->rskq_defer_accept,
&expire, &resend);
- if (req->rsk_ops->syn_ack_timeout)
- req->rsk_ops->syn_ack_timeout(parent, req);
+ req->rsk_ops->syn_ack_timeout(parent, req);
if (!expire &&
(!resend ||
- !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
+ !inet_rtx_syn_ack(parent, req) ||
inet_rsk(req)->acked)) {
unsigned long timeo;
- if (req->retrans++ == 0)
+ if (req->num_timeout++ == 0)
lopt->qlen_young--;
- timeo = min((timeout << req->retrans), max_rto);
+ timeo = min(timeout << req->num_timeout,
+ max_rto);
req->expires = now + timeo;
reqp = &req->dl_next;
continue;
@@ -559,10 +652,19 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
-struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
- const gfp_t priority)
+/**
+ * inet_csk_clone_lock - clone an inet socket, and lock its clone
+ * @sk: the socket to clone
+ * @req: request_sock
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+ const struct request_sock *req,
+ const gfp_t priority)
{
- struct sock *newsk = sk_clone(sk, priority);
+ struct sock *newsk = sk_clone_lock(sk, priority);
if (newsk != NULL) {
struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -570,11 +672,13 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
newsk->sk_state = TCP_SYN_RECV;
newicsk->icsk_bind_hash = NULL;
- inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
- inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
- inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
+ inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
+ inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
+ inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0;
@@ -586,7 +690,7 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
}
return newsk;
}
-EXPORT_SYMBOL_GPL(inet_csk_clone);
+EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
/*
* At this point, there should be no process reference to this
@@ -618,6 +722,23 @@ void inet_csk_destroy_sock(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
+/* This function allows forcing the closure of a socket after the call to
+ * tcp/dccp_create_openreq_child().
+ *
+ * It releases the bottom-half lock on @sk, drops one reference, marks the
+ * socket SOCK_DEAD, accounts it in the protocol's orphan counter and
+ * clears inet_num, so that the caller can then hand @sk to
+ * inet_csk_destroy_sock().
+ */
+void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ /* sk_clone_lock locked the socket and set refcnt to 2: undo the lock
+ * and drop one of the two references here.
+ */
+ bh_unlock_sock(sk);
+ sock_put(sk);
+
+ /* The below has to be done to allow calling inet_csk_destroy_sock:
+ * flag the socket dead, count it as an orphan and clear inet_num.
+ */
+ sock_set_flag(sk, SOCK_DEAD);
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ inet_sk(sk)->inet_num = 0;
+}
+EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
struct inet_sock *inet = inet_sk(sk);
@@ -659,13 +780,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct request_sock *acc_req;
struct request_sock *req;
inet_csk_delete_keepalive_timer(sk);
/* make all the listen_opt local to us */
- acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+ acc_req = reqsk_queue_yank_acceptq(queue);
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
@@ -675,7 +797,7 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
- reqsk_queue_destroy(&icsk->icsk_accept_queue);
+ reqsk_queue_destroy(queue);
while ((req = acc_req) != NULL) {
struct sock *child = req->sk;
@@ -693,6 +815,19 @@ void inet_csk_listen_stop(struct sock *sk)
percpu_counter_inc(sk->sk_prot->orphan_count);
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != tcp_rsk(req)->listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ sock_put(sk);
+ }
inet_csk_destroy_sock(child);
bh_unlock_sock(child);
@@ -702,6 +837,17 @@ void inet_csk_listen_stop(struct sock *sk)
sk_acceptq_removed(sk);
__reqsk_free(req);
}
+ if (queue->fastopenq != NULL) {
+ /* Free all the reqs queued in rskq_rst_head. */
+ spin_lock_bh(&queue->fastopenq->lock);
+ acc_req = queue->fastopenq->rskq_rst_head;
+ queue->fastopenq->rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq->lock);
+ while ((req = acc_req) != NULL) {
+ acc_req = req->dl_next;
+ __reqsk_free(req);
+ }
+ }
WARN_ON(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
@@ -744,3 +890,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
}
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif
+
+/*
+ * Recompute the IPv4 output route of a connected socket.
+ *
+ * The flow key is written into @fl (its ip4 member).  The destination is
+ * the socket's inet_daddr unless the socket's IP options, read under
+ * rcu_read_lock(), carry a source route, in which case the option's faddr
+ * is used.  On success the route is installed on the socket with
+ * sk_setup_caps().
+ *
+ * Returns the new route's dst_entry, or NULL when the lookup failed.
+ */
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	fl4 = &fl->u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+				   inet->inet_saddr, inet->inet_dport,
+				   inet->inet_sport, sk->sk_protocol,
+				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+	if (IS_ERR(rt))
+		rt = NULL;
+	if (rt)
+		sk_setup_caps(sk, &rt->dst);
+	rcu_read_unlock();
+
+	/*
+	 * Callers test the result against NULL.  Return NULL explicitly
+	 * instead of taking the address of a member through a NULL pointer,
+	 * which only yields NULL when dst sits at offset 0 of struct rtable.
+	 */
+	return rt ? &rt->dst : NULL;
+}
+
+/*
+ * Apply a new path MTU to the route used by socket @sk.
+ *
+ * The socket's cached dst is used when it is still valid; otherwise the
+ * route is rebuilt from the socket's cork flow.  After the dst's
+ * update_pmtu operation has run, the cached dst is checked again and the
+ * route is rebuilt once more if it is no longer valid.
+ *
+ * Returns the dst in use afterwards, or NULL when no route is available.
+ */
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct dst_entry *dst;
+
+	dst = __sk_dst_check(sk, 0);
+	if (!dst)
+		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+	if (!dst)
+		return NULL;
+
+	dst->ops->update_pmtu(dst, sk, NULL, mtu);
+
+	/* The update may have invalidated the cached route. */
+	dst = __sk_dst_check(sk, 0);
+	if (dst)
+		return dst;
+
+	return inet_csk_rebuild_route(sk, &inet->cork.fl);
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2ada17129fc..e34dccbc4d7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -33,6 +33,7 @@
#include <linux/stddef.h>
#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
static const struct inet_diag_handler **inet_diag_table;
@@ -43,26 +44,25 @@ struct inet_diag_entry {
u16 dport;
u16 family;
u16 userlocks;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
+ struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
+#endif
};
-static struct sock *idiagnl;
-
-#define INET_DIAG_PUT(skb, attrtype, attrlen) \
- RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
-
static DEFINE_MUTEX(inet_diag_table_mutex);
-static const struct inet_diag_handler *inet_diag_lock_handler(int type)
+static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
{
- if (!inet_diag_table[type])
- request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
- NETLINK_INET_DIAG, type);
+ if (!inet_diag_table[proto])
+ request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET, proto);
mutex_lock(&inet_diag_table_mutex);
- if (!inet_diag_table[type])
+ if (!inet_diag_table[proto])
return ERR_PTR(-ENOENT);
- return inet_diag_table[type];
+ return inet_diag_table[proto];
}
static inline void inet_diag_unlock_handler(
@@ -71,71 +71,100 @@ static inline void inet_diag_unlock_handler(
mutex_unlock(&inet_diag_table_mutex);
}
-static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
+ struct sk_buff *skb, struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
+ struct nlattr *attr;
void *info = NULL;
- struct inet_diag_meminfo *minfo = NULL;
- unsigned char *b = skb_tail_pointer(skb);
const struct inet_diag_handler *handler;
+ int ext = req->idiag_ext;
- handler = inet_diag_table[unlh->nlmsg_type];
+ handler = inet_diag_table[req->sdiag_protocol];
BUG_ON(handler == NULL);
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = nlmsg_flags;
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
- r = NLMSG_DATA(nlh);
+ r = nlmsg_data(nlh);
BUG_ON(sk->sk_state == TCP_TIME_WAIT);
- if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
- minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
-
- if (ext & (1 << (INET_DIAG_INFO - 1)))
- info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
- handler->idiag_info_size);
-
- if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
- const size_t len = strlen(icsk->icsk_ca_ops->name);
-
- strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
- icsk->icsk_ca_ops->name);
- }
-
r->idiag_family = sk->sk_family;
r->idiag_state = sk->sk_state;
r->idiag_timer = 0;
r->idiag_retrans = 0;
r->id.idiag_if = sk->sk_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
r->id.idiag_sport = inet->inet_sport;
r->id.idiag_dport = inet->inet_dport;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
r->id.idiag_src[0] = inet->inet_rcv_saddr;
r->id.idiag_dst[0] = inet->inet_daddr;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
+
+ /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+ * hence this needs to be included regardless of socket family.
+ */
+ if (ext & (1 << (INET_DIAG_TOS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &np->rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &np->daddr);
+ *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+
+ if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
+ goto errout;
}
#endif
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->idiag_inode = sock_i_ino(sk);
+
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+ struct inet_diag_meminfo minfo = {
+ .idiag_rmem = sk_rmem_alloc_get(sk),
+ .idiag_wmem = sk->sk_wmem_queued,
+ .idiag_fmem = sk->sk_forward_alloc,
+ .idiag_tmem = sk_wmem_alloc_get(sk),
+ };
+
+ if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+ goto errout;
+ }
+
+ if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+ if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+ goto errout;
+
+ if (icsk == NULL) {
+ handler->idiag_get_info(sk, r, NULL);
+ goto out;
+ }
+
#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
@@ -153,123 +182,129 @@ static int inet_csk_diag_fill(struct sock *sk,
}
#undef EXPIRES_IN_MS
- r->idiag_uid = sock_i_uid(sk);
- r->idiag_inode = sock_i_ino(sk);
+ if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ attr = nla_reserve(skb, INET_DIAG_INFO,
+ sizeof(struct tcp_info));
+ if (!attr)
+ goto errout;
- if (minfo) {
- minfo->idiag_rmem = sk_rmem_alloc_get(sk);
- minfo->idiag_wmem = sk->sk_wmem_queued;
- minfo->idiag_fmem = sk->sk_forward_alloc;
- minfo->idiag_tmem = sk_wmem_alloc_get(sk);
+ info = nla_data(attr);
}
+ if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
+ if (nla_put_string(skb, INET_DIAG_CONG,
+ icsk->icsk_ca_ops->name) < 0)
+ goto errout;
+
handler->idiag_get_info(sk, r, info);
if (sk->sk_state < TCP_TIME_WAIT &&
icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
icsk->icsk_ca_ops->get_info(sk, ext, skb);
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
+out:
+ return nlmsg_end(skb, nlh);
-rtattr_failure:
-nlmsg_failure:
- nlmsg_trim(skb, b);
+errout:
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
+EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
+
+/*
+ * Fill a diag message for a connection socket: forwards every argument to
+ * inet_sk_diag_fill(), passing inet_csk(sk) as the connection-socket view.
+ * Returns whatever inet_sk_diag_fill() returns.
+ */
+static int inet_csk_diag_fill(struct sock *sk,
+			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+			      struct user_namespace *user_ns,
+			      u32 portid, u32 seq, u16 nlmsg_flags,
+			      const struct nlmsghdr *unlh)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return inet_sk_diag_fill(sk, icsk, skb, req, user_ns,
+				 portid, seq, nlmsg_flags, unlh);
+}
static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
- struct sk_buff *skb, int ext, u32 pid,
- u32 seq, u16 nlmsg_flags,
+ struct sk_buff *skb, struct inet_diag_req_v2 *req,
+ u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- long tmo;
+ s32 tmo;
struct inet_diag_msg *r;
- const unsigned char *previous_tail = skb_tail_pointer(skb);
- struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
- unlh->nlmsg_type, sizeof(*r));
+ struct nlmsghdr *nlh;
- r = NLMSG_DATA(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
- nlh->nlmsg_flags = nlmsg_flags;
+ r = nlmsg_data(nlh);
+ BUG_ON(tw->tw_state != TCP_TIME_WAIT);
- tmo = tw->tw_ttd - jiffies;
+ tmo = tw->tw_ttd - inet_tw_time_stamp();
if (tmo < 0)
tmo = 0;
r->idiag_family = tw->tw_family;
r->idiag_retrans = 0;
+
r->id.idiag_if = tw->tw_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)tw;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1);
+ sock_diag_save_cookie(tw, r->id.idiag_cookie);
+
r->id.idiag_sport = tw->tw_sport;
r->id.idiag_dport = tw->tw_dport;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
r->id.idiag_src[0] = tw->tw_rcv_saddr;
r->id.idiag_dst[0] = tw->tw_daddr;
+
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ);
+ r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
r->idiag_inode = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- const struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
-
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &tw6->tw_v6_rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &tw6->tw_v6_daddr);
+ *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;
}
#endif
- nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
- return skb->len;
-nlmsg_failure:
- nlmsg_trim(skb, previous_tail);
- return -EMSGSIZE;
+
+ return nlmsg_end(skb, nlh);
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+ struct inet_diag_req_v2 *r,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
- skb, ext, pid, seq, nlmsg_flags,
- unlh);
- return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh);
+ return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+ nlmsg_flags, unlh);
+
+ return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
+ nlmsg_flags, unlh);
}
-static int inet_diag_get_exact(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh)
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
{
int err;
struct sock *sk;
- struct inet_diag_req *req = NLMSG_DATA(nlh);
struct sk_buff *rep;
- struct inet_hashinfo *hashinfo;
- const struct inet_diag_handler *handler;
-
- handler = inet_diag_lock_handler(nlh->nlmsg_type);
- if (IS_ERR(handler)) {
- err = PTR_ERR(handler);
- goto unlock;
- }
+ struct net *net = sock_net(in_skb->sk);
- hashinfo = handler->idiag_hashinfo;
err = -EINVAL;
-
- if (req->idiag_family == AF_INET) {
- sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
+ if (req->sdiag_family == AF_INET) {
+ sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
}
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- else if (req->idiag_family == AF_INET6) {
- sk = inet6_lookup(&init_net, hashinfo,
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6) {
+ sk = inet6_lookup(net, hashinfo,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
@@ -278,50 +313,62 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,
}
#endif
else {
- goto unlock;
+ goto out_nosk;
}
err = -ENOENT;
if (sk == NULL)
- goto unlock;
+ goto out_nosk;
- err = -ESTALE;
- if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
- req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
- ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
- (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
goto out;
- err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) +
- handler->idiag_info_size + 64)),
- GFP_KERNEL);
- if (!rep)
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) +
+ sizeof(struct tcp_info) + 64, GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
goto out;
+ }
- err = sk_diag_fill(sk, rep, req->idiag_ext,
- NETLINK_CB(in_skb).pid,
+ err = sk_diag_fill(sk, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
- kfree_skb(rep);
+ nlmsg_free(rep);
goto out;
}
- err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
MSG_DONTWAIT);
if (err > 0)
err = 0;
out:
- if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_put((struct inet_timewait_sock *)sk);
- else
- sock_put(sk);
- }
-unlock:
+ if (sk)
+ sock_gen_put(sk);
+
+out_nosk:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
+
+static int inet_diag_get_exact(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ const struct inet_diag_handler *handler;
+ int err;
+
+ handler = inet_diag_lock_handler(req->sdiag_protocol);
+ if (IS_ERR(handler))
+ err = PTR_ERR(handler);
+ else
+ err = handler->dump_one(in_skb, nlh, req);
inet_diag_unlock_handler(handler);
+
return err;
}
@@ -352,9 +399,12 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
}
-static int inet_diag_bc_run(const void *bc, int len,
- const struct inet_diag_entry *entry)
+static int inet_diag_bc_run(const struct nlattr *_bc,
+ const struct inet_diag_entry *entry)
{
+ const void *bc = nla_data(_bc);
+ int len = nla_len(_bc);
+
while (len > 0) {
int yes = 1;
const struct inet_diag_bc_op *op = bc;
@@ -393,25 +443,31 @@ static int inet_diag_bc_run(const void *bc, int len,
break;
}
- if (cond->prefix_len == 0)
- break;
-
if (op->code == INET_DIAG_BC_S_COND)
addr = entry->saddr;
else
addr = entry->daddr;
+ if (cond->family != AF_UNSPEC &&
+ cond->family != entry->family) {
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3,
+ cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
if (bitstring_match(addr, cond->addr,
cond->prefix_len))
break;
- if (entry->family == AF_INET6 &&
- cond->family == AF_INET) {
- if (addr[0] == 0 && addr[1] == 0 &&
- addr[2] == htonl(0xffff) &&
- bitstring_match(addr + 3, cond->addr,
- cond->prefix_len))
- break;
- }
yes = 0;
break;
}
@@ -428,6 +484,34 @@ static int inet_diag_bc_run(const void *bc, int len,
return len == 0;
}
+/*
+ * Run the filter bytecode attribute @bc against socket @sk.
+ *
+ * Builds an inet_diag_entry from the socket's family, addresses, ports and
+ * userlocks and hands it to inet_diag_bc_run().  Returns 1 when no
+ * bytecode was supplied, otherwise the result of inet_diag_bc_run().
+ */
+int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_diag_entry entry;
+
+	if (!bc)
+		return 1;
+
+	entry.family = sk->sk_family;
+	entry.userlocks = sk->sk_userlocks;
+	entry.sport = inet->inet_num;
+	entry.dport = ntohs(inet->inet_dport);
+
+	/* Start from the IPv4 addresses; IPv6 sockets override them below. */
+	entry.saddr = &inet->inet_rcv_saddr;
+	entry.daddr = &inet->inet_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (entry.family == AF_INET6) {
+		entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+		entry.daddr = sk->sk_v6_daddr.s6_addr32;
+	}
+#endif
+
+	return inet_diag_bc_run(bc, &entry);
+}
+EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
+
static int valid_cc(const void *bc, int len, int cc)
{
while (len >= 0) {
@@ -437,7 +521,7 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
if (cc == len)
return 1;
- if (op->yes < 4)
+ if (op->yes < 4 || op->yes & 3)
return 0;
len -= op->yes;
bc += op->yes;
@@ -445,39 +529,96 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
}
+/*
+ * Validate the inet_diag_hostcond that follows bytecode op @op.
+ *
+ * @len is the number of bytecode bytes available starting at @op.
+ * *@min_len is increased by the size of the hostcond header and of the
+ * address implied by its family (0 bytes for AF_UNSPEC).
+ *
+ * Returns false when the condition does not fit in @len, names an
+ * unsupported address family, or has a prefix length (in bits) larger
+ * than the address; true otherwise.
+ */
+static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
+			   int *min_len)
+{
+	struct inet_diag_hostcond *cond;
+	int addr_len;
+
+	/* The fixed hostcond header must fit before it can be read. */
+	*min_len += sizeof(struct inet_diag_hostcond);
+	if (len < *min_len)
+		return false;
+	cond = (struct inet_diag_hostcond *)(op + 1);
+
+	/* The address family determines how many address bytes follow. */
+	if (cond->family == AF_UNSPEC)
+		addr_len = 0;
+	else if (cond->family == AF_INET)
+		addr_len = sizeof(struct in_addr);
+	else if (cond->family == AF_INET6)
+		addr_len = sizeof(struct in6_addr);
+	else
+		return false;
+
+	*min_len += addr_len;
+	if (len < *min_len)
+		return false;
+
+	/* Prefix length is in bits, address length in bytes. */
+	return cond->prefix_len <= 8 * addr_len;
+}
+
+/*
+ * Validate a port comparison operator.
+ *
+ * The port to compare against lives in a follow-on inet_diag_bc_op, so
+ * *@min_len grows by one more op.  Returns true when @len bytes are
+ * enough to hold it.
+ */
+static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
+					 int len, int *min_len)
+{
+	*min_len += sizeof(struct inet_diag_bc_op);
+
+	return len >= *min_len;
+}
+
static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
{
- const unsigned char *bc = bytecode;
+ const void *bc = bytecode;
int len = bytecode_len;
while (len > 0) {
- struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+ const struct inet_diag_bc_op *op = bc;
+ int min_len = sizeof(struct inet_diag_bc_op);
//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
switch (op->code) {
- case INET_DIAG_BC_AUTO:
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND:
+ if (!valid_hostcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
case INET_DIAG_BC_D_LE:
- if (op->yes < 4 || op->yes > len + 4)
- return -EINVAL;
- case INET_DIAG_BC_JMP:
- if (op->no < 4 || op->no > len + 4)
- return -EINVAL;
- if (op->no < len &&
- !valid_cc(bytecode, bytecode_len, len - op->no))
+ if (!valid_port_comparison(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
- if (op->yes < 4 || op->yes > len + 4)
- return -EINVAL;
break;
default:
return -EINVAL;
}
+
+ if (op->code != INET_DIAG_BC_NOP) {
+ if (op->no < min_len || op->no > len + 4 || op->no & 3)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ }
+
+ if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
+ return -EINVAL;
bc += op->yes;
len -= op->yes;
}
@@ -486,62 +627,35 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
{
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
-
- if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
- struct inet_diag_entry entry;
- const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
- sizeof(*r),
- INET_DIAG_REQ_BYTECODE);
- struct inet_sock *inet = inet_sk(sk);
-
- entry.family = sk->sk_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- if (entry.family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- entry.saddr = np->rcv_saddr.s6_addr32;
- entry.daddr = np->daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->inet_rcv_saddr;
- entry.daddr = &inet->inet_daddr;
- }
- entry.sport = inet->inet_num;
- entry.dport = ntohs(inet->inet_dport);
- entry.userlocks = sk->sk_userlocks;
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
- if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
- return 0;
- }
-
- return inet_csk_diag_fill(sk, skb, r->idiag_ext,
- NETLINK_CB(cb->skb).pid,
+ return inet_csk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
-static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
+static int inet_twsk_diag_dump(struct sock *sk,
struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
{
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ struct inet_timewait_sock *tw = inet_twsk(sk);
- if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
+ if (bc != NULL) {
struct inet_diag_entry entry;
- const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
- sizeof(*r),
- INET_DIAG_REQ_BYTECODE);
entry.family = tw->tw_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
- entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32;
- entry.daddr = tw6->tw_v6_daddr.s6_addr32;
+ entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
+ entry.daddr = tw->tw_v6_daddr.s6_addr32;
} else
#endif
{
@@ -552,77 +666,109 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
entry.dport = ntohs(tw->tw_dport);
entry.userlocks = 0;
- if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
+ if (!inet_diag_bc_run(bc, &entry))
return 0;
}
- return inet_twsk_diag_fill(tw, skb, r->idiag_ext,
- NETLINK_CB(cb->skb).pid,
+ return inet_twsk_diag_fill(tw, skb, r,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
+/*
+ * Point @entry's saddr/daddr at the local and remote addresses of request
+ * sock @req, expressed in the address family of listener @sk.
+ *
+ * With IPv6 enabled and an AF_INET6 listener, an IPv6 request's addresses
+ * are used directly, while an IPv4 request's addresses are first converted
+ * to IPv4-mapped IPv6 form in @entry's saddr_storage/daddr_storage.  For a
+ * request of any other family @entry is left untouched.  In every other
+ * case @entry points at the request's IPv4 addresses.
+ */
+static inline void inet_diag_req_addrs(const struct sock *sk,
+				       const struct request_sock *req,
+				       struct inet_diag_entry *entry)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6) {
+		switch (req->rsk_ops->family) {
+		case AF_INET6:
+			entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
+			entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
+			break;
+		case AF_INET:
+			/* Map the IPv4 pair into the entry's own storage. */
+			ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
+					       &entry->saddr_storage);
+			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
+					       &entry->daddr_storage);
+			entry->saddr = entry->saddr_storage.s6_addr32;
+			entry->daddr = entry->daddr_storage.s6_addr32;
+			break;
+		}
+		return;
+	}
+#endif
+	entry->saddr = &ireq->ir_loc_addr;
+	entry->daddr = &ireq->ir_rmt_addr;
+}
+
static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct request_sock *req, u32 pid, u32 seq,
+ struct request_sock *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq,
const struct nlmsghdr *unlh)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct inet_sock *inet = inet_sk(sk);
- unsigned char *b = skb_tail_pointer(skb);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+ r = nlmsg_data(nlh);
r->idiag_family = sk->sk_family;
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
- r->idiag_retrans = req->retrans;
+ r->idiag_retrans = req->num_retrans;
r->id.idiag_if = sk->sk_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)req;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+ sock_diag_save_cookie(req, r->id.idiag_cookie);
tmo = req->expires - jiffies;
if (tmo < 0)
tmo = 0;
r->id.idiag_sport = inet->inet_sport;
- r->id.idiag_dport = ireq->rmt_port;
- r->id.idiag_src[0] = ireq->loc_addr;
- r->id.idiag_dst[0] = ireq->rmt_addr;
+ r->id.idiag_dport = ireq->ir_rmt_port;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = ireq->ir_loc_addr;
+ r->id.idiag_dst[0] = ireq->ir_rmt_addr;
+
r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
- r->idiag_uid = sock_i_uid(sk);
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &inet6_rsk(req)->loc_addr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &inet6_rsk(req)->rmt_addr);
+ struct inet_diag_entry entry;
+ inet_diag_req_addrs(sk, req, &entry);
+ memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr));
+ memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));
}
#endif
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
-
-nlmsg_failure:
- nlmsg_trim(skb, b);
- return -1;
+ return nlmsg_end(skb, nlh);
}
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
{
struct inet_diag_entry entry;
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt;
- const struct nlattr *bc = NULL;
struct inet_sock *inet = inet_sk(sk);
int j, s_j;
int reqnum, s_reqnum;
@@ -642,9 +788,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (!lopt || !lopt->qlen)
goto out;
- if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
- bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
- INET_DIAG_REQ_BYTECODE);
+ if (bc != NULL) {
entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
}
@@ -658,32 +802,21 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (reqnum < s_reqnum)
continue;
- if (r->id.idiag_dport != ireq->rmt_port &&
+ if (r->id.idiag_dport != ireq->ir_rmt_port &&
r->id.idiag_dport)
continue;
if (bc) {
- entry.saddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->loc_addr.s6_addr32 :
-#endif
- &ireq->loc_addr;
- entry.daddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
- &ireq->rmt_addr;
- entry.dport = ntohs(ireq->rmt_port);
+ inet_diag_req_addrs(sk, req, &entry);
+ entry.dport = ntohs(ireq->ir_rmt_port);
- if (!inet_diag_bc_run(nla_data(bc),
- nla_len(bc), &entry))
+ if (!inet_diag_bc_run(bc, &entry))
continue;
}
err = inet_diag_fill_req(skb, sk, req,
- NETLINK_CB(cb->skb).pid,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, cb->nlh);
if (err < 0) {
cb->args[3] = j + 1;
@@ -701,19 +834,12 @@ out:
return err;
}
-static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
+ struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
{
int i, num;
int s_i, s_num;
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
- const struct inet_diag_handler *handler;
- struct inet_hashinfo *hashinfo;
-
- handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
- if (IS_ERR(handler))
- goto unlock;
-
- hashinfo = handler->idiag_hashinfo;
+ struct net *net = sock_net(skb->sk);
s_i = cb->args[1];
s_num = num = cb->args[2];
@@ -733,11 +859,18 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
sk_nulls_for_each(sk, node, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
if (num < s_num) {
num++;
continue;
}
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
if (r->id.idiag_sport != inet->inet_sport &&
r->id.idiag_sport)
goto next_listen;
@@ -747,7 +880,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[3] > 0)
goto syn_recv;
- if (inet_csk_diag_dump(sk, skb, cb) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -756,7 +889,7 @@ syn_recv:
if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
- if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+ if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -778,7 +911,7 @@ skip_listen_ht:
}
if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
- goto unlock;
+ goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
@@ -788,8 +921,7 @@ skip_listen_ht:
num = 0;
- if (hlist_nulls_empty(&head->chain) &&
- hlist_nulls_empty(&head->twchain))
+ if (hlist_nulls_empty(&head->chain))
continue;
if (i > s_i)
@@ -797,19 +929,31 @@ skip_listen_ht:
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- struct inet_sock *inet = inet_sk(sk);
+ int res;
+ int state;
+ if (!net_eq(sock_net(sk), net))
+ continue;
if (num < s_num)
goto next_normal;
- if (!(r->idiag_states & (1 << sk->sk_state)))
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_substate : sk->sk_state;
+ if (!(r->idiag_states & (1 << state)))
goto next_normal;
- if (r->id.idiag_sport != inet->inet_sport &&
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
r->id.idiag_sport)
goto next_normal;
- if (r->id.idiag_dport != inet->inet_dport &&
+ if (r->id.idiag_dport != sk->sk_dport &&
r->id.idiag_dport)
goto next_normal;
- if (inet_csk_diag_dump(sk, skb, cb) < 0) {
+ if (sk->sk_state == TCP_TIME_WAIT)
+ res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
+ else
+ res = inet_csk_diag_dump(sk, skb, cb, r, bc);
+ if (res < 0) {
spin_unlock_bh(lock);
goto done;
}
@@ -817,43 +961,95 @@ next_normal:
++num;
}
- if (r->idiag_states & TCPF_TIME_WAIT) {
- struct inet_timewait_sock *tw;
-
- inet_twsk_for_each(tw, node,
- &head->twchain) {
-
- if (num < s_num)
- goto next_dying;
- if (r->id.idiag_sport != tw->tw_sport &&
- r->id.idiag_sport)
- goto next_dying;
- if (r->id.idiag_dport != tw->tw_dport &&
- r->id.idiag_dport)
- goto next_dying;
- if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
- spin_unlock_bh(lock);
- goto done;
- }
-next_dying:
- ++num;
- }
- }
spin_unlock_bh(lock);
}
done:
cb->args[1] = i;
cb->args[2] = num;
-unlock:
+out:
+ ;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
+
+static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ const struct inet_diag_handler *handler;
+ int err = 0;
+
+ handler = inet_diag_lock_handler(r->sdiag_protocol);
+ if (!IS_ERR(handler))
+ handler->dump(skb, cb, r, bc);
+ else
+ err = PTR_ERR(handler);
inet_diag_unlock_handler(handler);
- return skb->len;
+
+ return err ? : skb->len;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *bc = NULL;
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
+}
+
+static inline int inet_diag_type2proto(int type)
+{
+ switch (type) {
+ case TCPDIAG_GETSOCK:
+ return IPPROTO_TCP;
+ case DCCPDIAG_GETSOCK:
+ return IPPROTO_DCCP;
+ default:
+ return 0;
+ }
}
-static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+ struct inet_diag_req_v2 req;
+ struct nlattr *bc = NULL;
int hdrlen = sizeof(struct inet_diag_req);
+ req.sdiag_family = AF_UNSPEC; /* compatibility */
+ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, &req, bc);
+}
+
+static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh)
+{
+ struct inet_diag_req *rc = nlmsg_data(nlh);
+ struct inet_diag_req_v2 req;
+
+ req.sdiag_family = rc->idiag_family;
+ req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ return inet_diag_get_exact(in_skb, nlh, &req);
+}
+
+static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int hdrlen = sizeof(struct inet_diag_req);
+ struct net *net = sock_net(skb->sk);
+
if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
nlmsg_len(nlh) < hdrlen)
return -EINVAL;
@@ -869,29 +1065,62 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
}
-
- return netlink_dump_start(idiagnl, skb, nlh,
- inet_diag_dump, NULL);
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
+ }
}
- return inet_diag_get_exact(skb, nlh);
+ return inet_diag_get_exact_compat(skb, nlh);
}
-static DEFINE_MUTEX(inet_diag_mutex);
-
-static void inet_diag_rcv(struct sk_buff *skb)
+static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
- mutex_lock(&inet_diag_mutex);
- netlink_rcv_skb(skb, &inet_diag_rcv_msg);
- mutex_unlock(&inet_diag_mutex);
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ if (nlmsg_attrlen(h, hdrlen)) {
+ struct nlattr *attr;
+ attr = nlmsg_find_attr(h, hdrlen,
+ INET_DIAG_REQ_BYTECODE);
+ if (attr == NULL ||
+ nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
+ inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
+ return -EINVAL;
+ }
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ }
+ }
+
+ return inet_diag_get_exact(skb, h, nlmsg_data(h));
}
+static const struct sock_diag_handler inet_diag_handler = {
+ .family = AF_INET,
+ .dump = inet_diag_handler_dump,
+};
+
+static const struct sock_diag_handler inet6_diag_handler = {
+ .family = AF_INET6,
+ .dump = inet_diag_handler_dump,
+};
+
int inet_diag_register(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
int err = -EINVAL;
- if (type >= INET_DIAG_GETSOCK_MAX)
+ if (type >= IPPROTO_MAX)
goto out;
mutex_lock(&inet_diag_table_mutex);
@@ -910,7 +1139,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
- if (type >= INET_DIAG_GETSOCK_MAX)
+ if (type >= IPPROTO_MAX)
return;
mutex_lock(&inet_diag_table_mutex);
@@ -921,7 +1150,7 @@ EXPORT_SYMBOL_GPL(inet_diag_unregister);
static int __init inet_diag_init(void)
{
- const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
+ const int inet_diag_table_size = (IPPROTO_MAX *
sizeof(struct inet_diag_handler *));
int err = -ENOMEM;
@@ -929,25 +1158,35 @@ static int __init inet_diag_init(void)
if (!inet_diag_table)
goto out;
- idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0,
- inet_diag_rcv, NULL, THIS_MODULE);
- if (idiagnl == NULL)
- goto out_free_table;
- err = 0;
+ err = sock_diag_register(&inet_diag_handler);
+ if (err)
+ goto out_free_nl;
+
+ err = sock_diag_register(&inet6_diag_handler);
+ if (err)
+ goto out_free_inet;
+
+ sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
out:
return err;
-out_free_table:
+
+out_free_inet:
+ sock_diag_unregister(&inet_diag_handler);
+out_free_nl:
kfree(inet_diag_table);
goto out;
}
static void __exit inet_diag_exit(void)
{
- netlink_kernel_release(idiagnl);
+ sock_diag_unregister(&inet6_diag_handler);
+ sock_diag_unregister(&inet_diag_handler);
+ sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
kfree(inet_diag_table);
}
module_init(inet_diag_init);
module_exit(inet_diag_exit);
MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51b6d0..3b01959bf4b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -21,7 +21,30 @@
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <net/sock.h>
#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+ /* at least one fragment had CE, and others ECT_0 or ECT_1 */
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+ /* invalid combinations : drop frame */
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
static void inet_frag_secret_rebuild(unsigned long dummy)
{
@@ -29,20 +52,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
unsigned long now = jiffies;
int i;
+ /* Per bucket lock NOT needed here, due to write lock protection */
write_lock(&f->lock);
+
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
- struct hlist_node *p, *n;
+ struct hlist_node *n;
- hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) {
+ hb = &f->hash[i];
+ hlist_for_each_entry_safe(q, n, &hb->chain, list) {
unsigned int hval = f->hashfn(q);
if (hval != i) {
+ struct inet_frag_bucket *hb_dest;
+
hlist_del(&q->list);
/* Relink to new hash chain. */
- hlist_add_head(&q->list, &f->hash[hval]);
+ hb_dest = &f->hash[hval];
+ hlist_add_head(&q->list, &hb_dest->chain);
}
}
}
@@ -55,14 +85,14 @@ void inet_frags_init(struct inet_frags *f)
{
int i;
- for (i = 0; i < INETFRAGS_HASHSZ; i++)
- INIT_HLIST_HEAD(&f->hash[i]);
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb = &f->hash[i];
+ spin_lock_init(&hb->chain_lock);
+ INIT_HLIST_HEAD(&hb->chain);
+ }
rwlock_init(&f->lock);
- f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
- (jiffies ^ (jiffies >> 6)));
-
setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
(unsigned long)f);
f->secret_timer.expires = jiffies + f->secret_interval;
@@ -73,8 +103,9 @@ EXPORT_SYMBOL(inet_frags_init);
void inet_frags_init_net(struct netns_frags *nf)
{
nf->nqueues = 0;
- atomic_set(&nf->mem, 0);
+ init_frag_mem_limit(nf);
INIT_LIST_HEAD(&nf->lru_list);
+ spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);
@@ -89,18 +120,28 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
nf->low_thresh = 0;
local_bh_disable();
- inet_frag_evictor(nf, f);
+ inet_frag_evictor(nf, f, true);
local_bh_enable();
+
+ percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
- write_lock(&f->lock);
+ struct inet_frag_bucket *hb;
+ unsigned int hash;
+
+ read_lock(&f->lock);
+ hash = f->hashfn(fq);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
hlist_del(&fq->list);
- list_del(&fq->lru_list);
- fq->net->nqueues--;
- write_unlock(&f->lock);
+ spin_unlock(&hb->chain_lock);
+
+ read_unlock(&f->lock);
+ inet_frag_lru_del(fq);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -117,12 +158,8 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
- struct sk_buff *skb, int *work)
+ struct sk_buff *skb)
{
- if (work)
- *work -= skb->truesize;
-
- atomic_sub(skb->truesize, &nf->mem);
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
@@ -133,6 +170,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
{
struct sk_buff *fp;
struct netns_frags *nf;
+ unsigned int sum, sum_truesize = 0;
WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0);
@@ -143,13 +181,14 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
while (fp) {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(nf, f, fp, work);
+ sum_truesize += fp->truesize;
+ frag_kfree_skb(nf, f, fp);
fp = xp;
}
-
+ sum = sum_truesize + f->qsize;
if (work)
- *work -= f->qsize;
- atomic_sub(f->qsize, &nf->mem);
+ *work -= sum;
+ sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
@@ -158,23 +197,32 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
}
EXPORT_SYMBOL(inet_frag_destroy);
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
{
struct inet_frag_queue *q;
int work, evicted = 0;
- work = atomic_read(&nf->mem) - nf->low_thresh;
- while (work > 0) {
- read_lock(&f->lock);
+ if (!force) {
+ if (frag_mem_limit(nf) <= nf->high_thresh)
+ return 0;
+ }
+
+ work = frag_mem_limit(nf) - nf->low_thresh;
+ while (work > 0 || force) {
+ spin_lock(&nf->lru_lock);
+
if (list_empty(&nf->lru_list)) {
- read_unlock(&f->lock);
+ spin_unlock(&nf->lru_lock);
break;
}
q = list_first_entry(&nf->lru_list,
struct inet_frag_queue, lru_list);
atomic_inc(&q->refcnt);
- read_unlock(&f->lock);
+ /* Remove q from list to avoid several CPUs grabbing it */
+ list_del_init(&q->lru_list);
+
+ spin_unlock(&nf->lru_lock);
spin_lock(&q->lock);
if (!(q->last_in & INET_FRAG_COMPLETE))
@@ -194,28 +242,30 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in, struct inet_frags *f,
void *arg)
{
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *qp;
-#ifdef CONFIG_SMP
- struct hlist_node *n;
-#endif
unsigned int hash;
- write_lock(&f->lock);
+ read_lock(&f->lock); /* Protects against hash rebuild */
/*
* While we stayed w/o the lock other CPU could update
* the rnd seed, so we need to re-calculate the hash
* chain. Fortunatelly the qp_in can be used to get one.
*/
hash = f->hashfn(qp_in);
+ hb = &f->hash[hash];
+ spin_lock(&hb->chain_lock);
+
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
- * promoted read lock to write lock.
+ * released the hash bucket lock.
*/
- hlist_for_each_entry(qp, n, &f->hash[hash], list) {
+ hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
- write_unlock(&f->lock);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
qp_in->last_in |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
@@ -227,10 +277,11 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
- hlist_add_head(&qp->list, &f->hash[hash]);
- list_add_tail(&qp->lru_list, &nf->lru_list);
- nf->nqueues++;
- write_unlock(&f->lock);
+ hlist_add_head(&qp->list, &hb->chain);
+ inet_frag_lru_add(nf, qp);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
+
return qp;
}
@@ -243,12 +294,14 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
if (q == NULL)
return NULL;
+ q->net = nf;
f->constructor(q, arg);
- atomic_add(f->qsize, &nf->mem);
+ add_frag_mem_limit(q, f->qsize);
+
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
- q->net = nf;
+ INIT_LIST_HEAD(&q->lru_list);
return q;
}
@@ -269,18 +322,40 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
__releases(&f->lock)
{
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
- struct hlist_node *n;
+ int depth = 0;
+
+ hb = &f->hash[hash];
- hlist_for_each_entry(q, n, &f->hash[hash], list) {
+ spin_lock(&hb->chain_lock);
+ hlist_for_each_entry(q, &hb->chain, list) {
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
+ spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
return q;
}
+ depth++;
}
+ spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
- return inet_frag_create(nf, f, key);
+ if (depth <= INETFRAGS_MAXDEPTH)
+ return inet_frag_create(nf, f, key);
+ else
+ return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
+
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+ const char *prefix)
+{
+ static const char msg[] = "inet_frag_find: Fragment hash bucket"
+ " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+ ". Dropping fragment.\n";
+
+ if (PTR_ERR(q) == -ENOBUFS)
+ LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
+}
+EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 1b344f30b46..43116e8c8e1 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -21,8 +21,34 @@
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
+#include <net/secure_seq.h>
#include <net/ip.h>
+static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 inet_ehash_secret __read_mostly;
+
+ net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
+}
+
+
+static unsigned int inet_sk_ehashfn(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const __be32 laddr = inet->inet_rcv_saddr;
+ const __u16 lport = inet->inet_num;
+ const __be32 faddr = inet->inet_daddr;
+ const __be16 fport = inet->inet_dport;
+ struct net *net = sock_net(sk);
+
+ return inet_ehashfn(net, laddr, lport, faddr, fport);
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -38,6 +64,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum;
tb->fastreuse = 0;
+ tb->fastreuseport = 0;
tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
@@ -118,13 +145,12 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
* that the listener socket's icsk_bind_hash is the same
* as that of the child socket. We have to look up or
* create a new bind bucket for the child here. */
- struct hlist_node *node;
- inet_bind_bucket_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), sock_net(sk)) &&
tb->port == port)
break;
}
- if (!node) {
+ if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
sock_net(sk), head, port);
if (!tb) {
@@ -133,8 +159,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
}
}
}
- sk_add_bind_node(child, &tb->owners);
- inet_csk(child)->icsk_bind_hash = tb;
+ inet_bind_hash(child, tb, port);
spin_unlock(&head->lock);
return 0;
@@ -151,16 +176,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
__be32 rcv_saddr = inet->inet_rcv_saddr;
- score = sk->sk_family == PF_INET ? 1 : 0;
+ score = sk->sk_family == PF_INET ? 2 : 1;
if (rcv_saddr) {
if (rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -176,6 +201,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
@@ -183,17 +209,29 @@ struct sock *__inet_lookup_listener(struct net *net,
struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore;
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
rcu_read_lock();
begin:
result = NULL;
- hiscore = -1;
+ hiscore = 0;
sk_nulls_for_each_rcu(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
result = sk;
hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (((u64)phash * matches) >> 32 == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
}
}
/*
@@ -217,13 +255,26 @@ begin:
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
-struct sock * __inet_lookup_established(struct net *net,
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+ if (!atomic_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_free(inet_twsk(sk));
+ else
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
+struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
const int dif)
{
- INET_ADDR_COOKIE(acookie, saddr, daddr)
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
struct sock *sk;
const struct hlist_nulls_node *node;
@@ -237,16 +288,18 @@ struct sock * __inet_lookup_established(struct net *net,
rcu_read_lock();
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
- if (INET_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif)) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
- goto begintw;
- if (unlikely(!INET_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif))) {
- sock_put(sk);
+ goto out;
+ if (unlikely(!INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
+ sock_gen_put(sk);
goto begin;
}
- goto out;
+ goto found;
}
}
/*
@@ -256,33 +309,9 @@ begin:
*/
if (get_nulls_value(node) != slot)
goto begin;
-
-begintw:
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_nulls_for_each_rcu(sk, node, &head->twchain) {
- if (INET_TW_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif)) {
- if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
- sk = NULL;
- goto out;
- }
- if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
- saddr, daddr, ports, dif))) {
- sock_put(sk);
- goto begintw;
- }
- goto out;
- }
- }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begintw;
- sk = NULL;
out:
+ sk = NULL;
+found:
rcu_read_unlock();
return sk;
}
@@ -298,7 +327,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
__be32 daddr = inet->inet_rcv_saddr;
__be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
struct net *net = sock_net(sk);
unsigned int hash = inet_ehashfn(net, daddr, lport,
@@ -307,35 +336,29 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_nulls_node *node;
- struct inet_timewait_sock *tw;
+ struct inet_timewait_sock *tw = NULL;
int twrefcnt = 0;
spin_lock(lock);
- /* Check TIME-WAIT sockets first. */
- sk_nulls_for_each(sk2, node, &head->twchain) {
- tw = inet_twsk(sk2);
-
- if (INET_TW_MATCH(sk2, net, hash, acookie,
- saddr, daddr, ports, dif)) {
- if (twsk_unique(sk, sk2, twp))
- goto unique;
- else
- goto not_unique;
- }
- }
- tw = NULL;
-
- /* And established part... */
sk_nulls_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, net, hash, acookie,
- saddr, daddr, ports, dif))
+ if (sk2->sk_hash != hash)
+ continue;
+
+ if (likely(INET_MATCH(sk2, net, acookie,
+ saddr, daddr, ports, dif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (twsk_unique(sk, sk2, twp))
+ break;
+ }
goto not_unique;
+ }
}
-unique:
/* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
+ * in hash table socket with a funny identity.
+ */
inet->inet_num = lport;
inet->inet_sport = htons(lport);
sk->sk_hash = hash;
@@ -444,7 +467,7 @@ void inet_unhash(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
- done =__sk_nulls_del_node_init_rcu(sk);
+ done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
@@ -469,16 +492,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
int i, remaining, low, high, port;
static u32 hint;
u32 offset = hint + port_offset;
- struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;
- inet_get_local_port_range(&low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
- if (inet_is_reserved_local_port(port))
+ if (inet_is_local_reserved_port(net, port))
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
@@ -488,10 +510,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* because the established check is already
* unique enough.
*/
- inet_bind_bucket_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, &head->chain) {
if (net_eq(ib_net(tb), net) &&
tb->port == port) {
- if (tb->fastreuse >= 0)
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
@@ -508,6 +531,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
break;
}
tb->fastreuse = -1;
+ tb->fastreuseport = -1;
goto ok;
next_port:
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 47038cb6c13..f17ea49b28f 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/if_vlan.h>
#include <linux/inet_lro.h>
+#include <net/checksum.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
@@ -51,8 +52,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
* Basic tcp checks whether packet is suitable for LRO
*/
-static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
- int len, struct net_lro_desc *lro_desc)
+static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
+ int len, const struct net_lro_desc *lro_desc)
{
/* check ip header: don't aggregate padded frames */
if (ntohs(iph->tot_len) != len)
@@ -114,11 +115,9 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
*(p+2) = lro_desc->tcp_rcv_tsecr;
}
+ csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
iph->tot_len = htons(lro_desc->ip_tot_len);
- iph->check = 0;
- iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
-
tcph->check = 0;
tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
@@ -146,8 +145,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
}
static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
- struct iphdr *iph, struct tcphdr *tcph,
- u16 vlan_tag, struct vlan_group *vgrp)
+ struct iphdr *iph, struct tcphdr *tcph)
{
int nr_frags;
__be32 *ptr;
@@ -173,8 +171,6 @@ static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
}
lro_desc->mss = tcp_data_len;
- lro_desc->vgrp = vgrp;
- lro_desc->vlan_tag = vlan_tag;
lro_desc->active = 1;
lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
@@ -234,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
lro_desc->last_skb = skb;
}
-static void lro_add_frags(struct net_lro_desc *lro_desc,
- int len, int hlen, int truesize,
- struct skb_frag_struct *skb_frags,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct sk_buff *skb = lro_desc->parent;
- int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
- lro_add_common(lro_desc, iph, tcph, tcp_data_len);
-
- skb->truesize += truesize;
-
- skb_frags[0].page_offset += hlen;
- skb_frags[0].size -= hlen;
-
- while (tcp_data_len > 0) {
- *(lro_desc->next_frag) = *skb_frags;
- tcp_data_len -= skb_frags->size;
- lro_desc->next_frag++;
- skb_frags++;
- skb_shinfo(skb)->nr_frags++;
- }
-}
static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
struct iphdr *iph,
@@ -309,29 +282,17 @@ static void lro_flush(struct net_lro_mgr *lro_mgr,
skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
- if (lro_desc->vgrp) {
- if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
- else
- vlan_hwaccel_rx(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
-
- } else {
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(lro_desc->parent);
- else
- netif_rx(lro_desc->parent);
- }
+ if (lro_mgr->features & LRO_F_NAPI)
+ netif_receive_skb(lro_desc->parent);
+ else
+ netif_rx(lro_desc->parent);
LRO_INC_STATS(lro_mgr, flushed);
lro_clear_desc(lro_desc);
}
static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
- struct vlan_group *vgrp, u16 vlan_tag, void *priv)
+ void *priv)
{
struct net_lro_desc *lro_desc;
struct iphdr *iph;
@@ -360,7 +321,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
goto out;
skb->ip_summed = lro_mgr->ip_summed_aggr;
- lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
+ lro_init_desc(lro_desc, skb, iph, tcph);
LRO_INC_STATS(lro_mgr, aggregated);
return 0;
}
@@ -387,134 +348,11 @@ out:
return 1;
}
-
-static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- void *mac_hdr,
- int hlen, __wsum sum,
- u32 ip_summed)
-{
- struct sk_buff *skb;
- struct skb_frag_struct *skb_frags;
- int data_len = len;
- int hdr_len = min(len, hlen);
-
- skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad);
- if (!skb)
- return NULL;
-
- skb_reserve(skb, lro_mgr->frag_align_pad);
- skb->len = len;
- skb->data_len = len - hdr_len;
- skb->truesize += true_size;
- skb->tail += hdr_len;
-
- memcpy(skb->data, mac_hdr, hdr_len);
-
- skb_frags = skb_shinfo(skb)->frags;
- while (data_len > 0) {
- *skb_frags = *frags;
- data_len -= frags->size;
- skb_frags++;
- frags++;
- skb_shinfo(skb)->nr_frags++;
- }
-
- skb_shinfo(skb)->frags[0].page_offset += hdr_len;
- skb_shinfo(skb)->frags[0].size -= hdr_len;
-
- skb->ip_summed = ip_summed;
- skb->csum = sum;
- skb->protocol = eth_type_trans(skb, lro_mgr->dev);
- return skb;
-}
-
-static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- struct vlan_group *vgrp,
- u16 vlan_tag, void *priv, __wsum sum)
-{
- struct net_lro_desc *lro_desc;
- struct iphdr *iph;
- struct tcphdr *tcph;
- struct sk_buff *skb;
- u64 flags;
- void *mac_hdr;
- int mac_hdr_len;
- int hdr_len = LRO_MAX_PG_HLEN;
- int vlan_hdr_len = 0;
-
- if (!lro_mgr->get_frag_header ||
- lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
- (void *)&tcph, &flags, priv)) {
- mac_hdr = page_address(frags->page) + frags->page_offset;
- goto out1;
- }
-
- if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
- goto out1;
-
- hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
- mac_hdr_len = (int)((void *)(iph) - mac_hdr);
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (!lro_desc)
- goto out1;
-
- if (!lro_desc->active) { /* start new lro session */
- if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
- goto out1;
-
- skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
- hdr_len, 0, lro_mgr->ip_summed_aggr);
- if (!skb)
- goto out;
-
- if ((skb->protocol == htons(ETH_P_8021Q)) &&
- !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
- vlan_hdr_len = VLAN_HLEN;
-
- iph = (void *)(skb->data + vlan_hdr_len);
- tcph = (void *)((u8 *)skb->data + vlan_hdr_len
- + IP_HDR_LEN(iph));
-
- lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
- LRO_INC_STATS(lro_mgr, aggregated);
- return NULL;
- }
-
- if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
- goto out2;
-
- if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
- goto out2;
-
- lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
-
- if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
- lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
- lro_flush(lro_mgr, lro_desc);
-
- return NULL;
-
-out2: /* send aggregated packets to the stack */
- lro_flush(lro_mgr, lro_desc);
-
-out1: /* Original packet has to be posted to the stack */
- skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
- hdr_len, sum, lro_mgr->ip_summed);
-out:
- return skb;
-}
-
void lro_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
void *priv)
{
- if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
+ if (__lro_proc_skb(lro_mgr, skb, priv)) {
if (lro_mgr->features & LRO_F_NAPI)
netif_receive_skb(skb);
else
@@ -523,59 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
}
EXPORT_SYMBOL(lro_receive_skb);
-void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
- struct sk_buff *skb,
- struct vlan_group *vgrp,
- u16 vlan_tag,
- void *priv)
-{
- if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
- if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
- else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
- }
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
-
-void lro_receive_frags(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size, void *priv, __wsum sum)
-{
- struct sk_buff *skb;
-
- skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
- priv, sum);
- if (!skb)
- return;
-
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(skb);
- else
- netif_rx(skb);
-}
-EXPORT_SYMBOL(lro_receive_frags);
-
-void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- struct vlan_group *vgrp,
- u16 vlan_tag, void *priv, __wsum sum)
-{
- struct sk_buff *skb;
-
- skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
- vlan_tag, priv, sum);
- if (!skb)
- return;
-
- if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
- else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
-
void lro_flush_all(struct net_lro_mgr *lro_mgr)
{
int i;
@@ -587,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr)
}
}
EXPORT_SYMBOL(lro_flush_all);
-
-void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct net_lro_desc *lro_desc;
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (lro_desc->active)
- lro_flush(lro_mgr, lro_desc);
-}
-EXPORT_SYMBOL(lro_flush_pkt);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c5af909cf70..6d592f8555f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
@@ -86,19 +87,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
refcnt += inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
-#ifdef SOCK_REFCNT_DEBUG
- if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
- tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
- }
-#endif
- while (refcnt) {
- inet_twsk_put(tw);
- refcnt--;
- }
+ BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
+ atomic_sub(refcnt, &tw->tw_refcnt);
}
-static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
+void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
twsk_destructor((struct sock *)tw);
@@ -117,6 +110,18 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+ struct hlist_nulls_head *list)
+{
+ hlist_nulls_add_head_rcu(&tw->tw_node, list);
+}
+
+static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+ struct hlist_head *list)
+{
+ hlist_add_head(&tw->tw_bind_node, list);
+}
+
/*
* Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it
@@ -145,26 +150,21 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
/*
- * Step 2: Hash TW into TIMEWAIT chain.
- * Should be done before removing sk from established chain
- * because readers are lockless and search established first.
+ * Step 2: Hash TW into tcp ehash chain.
+ * Notes :
+ * - tw_refcnt is set to 3 because :
+ * - We have one reference from bhash chain.
+ * - We have one reference from ehash chain.
+ * We can use atomic_set() because prior spin_lock()/spin_unlock()
+ * committed into memory all tw fields.
*/
- inet_twsk_add_node_rcu(tw, &ehead->twchain);
+ atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+ inet_twsk_add_node_rcu(tw, &ehead->chain);
- /* Step 3: Remove SK from established hash. */
+ /* Step 3: Remove SK from hash chain */
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- /*
- * Notes :
- * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
- * - We add one reference for the bhash link
- * - We add one reference for the ehash link
- * - We want this refcnt update done before allowing other
- * threads to find this tw in ehash chain.
- */
- atomic_add(1 + 1 + 1, &tw->tw_refcnt);
-
spin_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -183,6 +183,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
tw->tw_daddr = inet->inet_daddr;
tw->tw_rcv_saddr = inet->inet_rcv_saddr;
tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+ tw->tw_tos = inet->tos;
tw->tw_num = inet->inet_num;
tw->tw_state = TCP_TIME_WAIT;
tw->tw_substate = state;
@@ -214,7 +215,6 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
const int slot)
{
struct inet_timewait_sock *tw;
- struct hlist_node *node;
unsigned int killed;
int ret;
@@ -227,7 +227,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
killed = 0;
ret = 0;
rescan:
- inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
__inet_twsk_del_dead_node(tw);
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
@@ -261,7 +261,7 @@ rescan:
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
- int unsigned need_timer;
+ unsigned int need_timer;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
@@ -386,11 +386,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
if (slot >= INET_TWDR_TWKILL_SLOTS)
slot = INET_TWDR_TWKILL_SLOTS - 1;
}
- tw->tw_ttd = jiffies + timeo;
+ tw->tw_ttd = inet_tw_time_stamp() + timeo;
slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
list = &twdr->cells[slot];
} else {
- tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+ tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
if (twdr->twcal_hand < 0) {
twdr->twcal_hand = 0;
@@ -436,10 +436,10 @@ void inet_twdr_twcal_tick(unsigned long data)
for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
+ struct hlist_node *safe;
struct inet_timewait_sock *tw;
- inet_twsk_for_each_inmate_safe(tw, node, safe,
+ inet_twsk_for_each_inmate_safe(tw, safe,
&twdr->twcal_row[slot]) {
__inet_twsk_del_dead_node(tw);
__inet_twsk_kill(tw, twdr->hashinfo);
@@ -489,7 +489,9 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
restart_rcu:
rcu_read_lock();
restart:
- sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_state != TCP_TIME_WAIT)
+ continue;
tw = inet_twsk(sk);
if ((tw->tw_family != family) ||
atomic_read(&twsk_net(tw)->count))
@@ -505,7 +507,9 @@ restart:
}
rcu_read_unlock();
+ local_bh_disable();
inet_twsk_deschedule(tw, twdr);
+ local_bh_enable();
inet_twsk_put(tw);
goto restart_rcu;
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d9bc85751c7..bd5f5928167 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -17,27 +17,16 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
+#include <net/secure_seq.h>
/*
* Theory of operations.
* We keep one entry for each peer IP address. The nodes contains long-living
* information about the peer which doesn't depend on routes.
- * At this moment this information consists only of ID field for the next
- * outgoing IP packet. This field is incremented with each packet as encoded
- * in inet_getid() function (include/net/inetpeer.h).
- * At the moment of writing this notes identifier of IP packets is generated
- * to be unpredictable using this code only for packets subjected
- * (actually or potentially) to defragmentation. I.e. DF packets less than
- * PMTU in size uses a constant ID and do not use this code (see
- * ip_select_ident() in include/net/ip.h).
*
- * Route cache entries hold references to our nodes.
- * New cache entries get references via lookup by destination IP address in
- * the avl tree. The reference is grabbed only when it's needed i.e. only
- * when we try to output IP packet which needs an unpredictable ID (see
- * __ip_select_ident() in net/ipv4/route.c).
* Nodes are removed only when reference counter goes to 0.
* When it's happened the node may be removed when a sufficient amount of
* time has been passed since its last use. The less-recently-used entry can
@@ -54,21 +43,21 @@
* 1. Nodes may appear in the tree only with the pool lock held.
* 2. Nodes may disappear from the tree only with the pool lock held
* AND reference count being 0.
- * 3. Nodes appears and disappears from unused node list only under
- * "inet_peer_unused_lock".
- * 4. Global variable peer_total is modified under the pool lock.
- * 5. struct inet_peer fields modification:
+ * 3. Global variable peer_total is modified under the pool lock.
+ * 4. struct inet_peer fields modification:
* avl_left, avl_right, avl_parent, avl_height: pool lock
- * unused: unused node list lock
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
- * dtime: unused node list lock
* daddr: unchangeable
- * ip_id_count: atomic value (no lock needed)
*/
static struct kmem_cache *peer_cachep __read_mostly;
+static LIST_HEAD(gc_list);
+static const int gc_delay = 60 * HZ;
+static struct delayed_work gc_work;
+static DEFINE_SPINLOCK(gc_lock);
+
#define node_height(x) x->avl_height
#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
@@ -79,23 +68,32 @@ static const struct inet_peer peer_fake_node = {
.avl_height = 0
};
-struct inet_peer_base {
- struct inet_peer __rcu *root;
- spinlock_t lock;
- int total;
-};
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+ bp->root = peer_avl_empty_rcu;
+ seqlock_init(&bp->lock);
+ bp->flush_seq = ~0U;
+ bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
-static struct inet_peer_base v4_peers = {
- .root = peer_avl_empty_rcu,
- .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock),
- .total = 0,
-};
+static atomic_t v4_seq = ATOMIC_INIT(0);
+static atomic_t v6_seq = ATOMIC_INIT(0);
-static struct inet_peer_base v6_peers = {
- .root = peer_avl_empty_rcu,
- .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock),
- .total = 0,
-};
+static atomic_t *inetpeer_seq_ptr(int family)
+{
+ return (family == AF_INET ? &v4_seq : &v6_seq);
+}
+
+static inline void flush_check(struct inet_peer_base *base, int family)
+{
+ atomic_t *fp = inetpeer_seq_ptr(family);
+
+ if (unlikely(base->flush_seq != atomic_read(fp))) {
+ inetpeer_invalidate_tree(base);
+ base->flush_seq = atomic_read(fp);
+ }
+}
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
@@ -104,20 +102,53 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
* aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
-int inet_peer_gc_mintime __read_mostly = 10 * HZ;
-int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-
-static struct {
- struct list_head list;
- spinlock_t lock;
-} unused_peers = {
- .list = LIST_HEAD_INIT(unused_peers.list),
- .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock),
-};
-static void peer_check_expire(unsigned long dummy);
-static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
+static void inetpeer_gc_worker(struct work_struct *work)
+{
+ struct inet_peer *p, *n, *c;
+ struct list_head list;
+
+ spin_lock_bh(&gc_lock);
+ list_replace_init(&gc_list, &list);
+ spin_unlock_bh(&gc_lock);
+
+ if (list_empty(&list))
+ return;
+
+ list_for_each_entry_safe(p, n, &list, gc_list) {
+
+ if (need_resched())
+ cond_resched();
+ c = rcu_dereference_protected(p->avl_left, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_left = peer_avl_empty_rcu;
+ }
+
+ c = rcu_dereference_protected(p->avl_right, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_right = peer_avl_empty_rcu;
+ }
+
+ n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
+
+ if (!atomic_read(&p->refcnt)) {
+ list_del(&p->gc_list);
+ kmem_cache_free(peer_cachep, p);
+ }
+ }
+
+ if (list_empty(&list))
+ return;
+
+ spin_lock_bh(&gc_lock);
+ list_splice(&list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
@@ -142,23 +173,7 @@ void __init inet_initpeers(void)
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- peer_periodic_timer.expires = jiffies
- + net_random() % inet_peer_gc_maxtime
- + inet_peer_gc_maxtime;
- add_timer(&peer_periodic_timer);
-}
-
-/* Called with or without local BH being disabled. */
-static void unlink_from_unused(struct inet_peer *p)
-{
- if (!list_empty(&p->unused)) {
- spin_lock_bh(&unused_peers.lock);
- list_del_init(&p->unused);
- spin_unlock_bh(&unused_peers.lock);
- }
+ INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
static int addr_compare(const struct inetpeer_addr *a,
@@ -167,9 +182,9 @@ static int addr_compare(const struct inetpeer_addr *a,
int i, n = (a->family == AF_INET ? 1 : 4);
for (i = 0; i < n; i++) {
- if (a->a6[i] == b->a6[i])
+ if (a->addr.a6[i] == b->addr.a6[i])
continue;
- if (a->a6[i] < b->a6[i])
+ if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
return -1;
return 1;
}
@@ -177,6 +192,9 @@ static int addr_compare(const struct inetpeer_addr *a,
return 0;
}
+#define rcu_deref_locked(X, BASE) \
+ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
/*
* Called with local BH disabled and the pool lock held.
*/
@@ -187,9 +205,8 @@ static int addr_compare(const struct inetpeer_addr *a,
\
stackptr = _stack; \
*stackptr++ = &_base->root; \
- for (u = rcu_dereference_protected(_base->root, \
- lockdep_is_held(&_base->lock)); \
- u != peer_avl_empty; ) { \
+ for (u = rcu_deref_locked(_base->root, _base); \
+ u != peer_avl_empty;) { \
int cmp = addr_compare(_daddr, &u->daddr); \
if (cmp == 0) \
break; \
@@ -198,41 +215,38 @@ static int addr_compare(const struct inetpeer_addr *a,
else \
v = &u->avl_right; \
*stackptr++ = v; \
- u = rcu_dereference_protected(*v, \
- lockdep_is_held(&_base->lock)); \
+ u = rcu_deref_locked(*v, _base); \
} \
u; \
})
/*
- * Called with rcu_read_lock_bh()
+ * Called with rcu_read_lock()
* Because we hold no lock against a writer, its quite possible we fall
* in an endless loop.
* But every pointer we follow is guaranteed to be valid thanks to RCU.
* We exit from this function if number of links exceeds PEER_MAXDEPTH
*/
-static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
- struct inet_peer_base *base)
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base)
{
- struct inet_peer *u = rcu_dereference_bh(base->root);
+ struct inet_peer *u = rcu_dereference(base->root);
int count = 0;
while (u != peer_avl_empty) {
int cmp = addr_compare(daddr, &u->daddr);
if (cmp == 0) {
/* Before taking a reference, check if this entry was
- * deleted, unlink_from_pool() sets refcnt=-1 to make
- * distinction between an unused entry (refcnt=0) and
- * a freed one.
+ * deleted (refcnt=-1)
*/
- if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
+ if (!atomic_add_unless(&u->refcnt, 1, -1))
u = NULL;
return u;
}
if (cmp == -1)
- u = rcu_dereference_bh(u->avl_left);
+ u = rcu_dereference(u->avl_left);
else
- u = rcu_dereference_bh(u->avl_right);
+ u = rcu_dereference(u->avl_right);
if (unlikely(++count == PEER_MAXDEPTH))
break;
}
@@ -246,13 +260,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
struct inet_peer __rcu **v; \
*stackptr++ = &start->avl_left; \
v = &start->avl_left; \
- for (u = rcu_dereference_protected(*v, \
- lockdep_is_held(&base->lock)); \
- u->avl_right != peer_avl_empty_rcu; ) { \
+ for (u = rcu_deref_locked(*v, base); \
+ u->avl_right != peer_avl_empty_rcu;) { \
v = &u->avl_right; \
*stackptr++ = v; \
- u = rcu_dereference_protected(*v, \
- lockdep_is_held(&base->lock)); \
+ u = rcu_deref_locked(*v, base); \
} \
u; \
})
@@ -271,21 +283,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
while (stackend > stack) {
nodep = *--stackend;
- node = rcu_dereference_protected(*nodep,
- lockdep_is_held(&base->lock));
- l = rcu_dereference_protected(node->avl_left,
- lockdep_is_held(&base->lock));
- r = rcu_dereference_protected(node->avl_right,
- lockdep_is_held(&base->lock));
+ node = rcu_deref_locked(*nodep, base);
+ l = rcu_deref_locked(node->avl_left, base);
+ r = rcu_deref_locked(node->avl_right, base);
lh = node_height(l);
rh = node_height(r);
if (lh > rh + 1) { /* l: RH+2 */
struct inet_peer *ll, *lr, *lrl, *lrr;
int lrh;
- ll = rcu_dereference_protected(l->avl_left,
- lockdep_is_held(&base->lock));
- lr = rcu_dereference_protected(l->avl_right,
- lockdep_is_held(&base->lock));
+ ll = rcu_deref_locked(l->avl_left, base);
+ lr = rcu_deref_locked(l->avl_right, base);
lrh = node_height(lr);
if (lrh <= node_height(ll)) { /* ll: RH+1 */
RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -296,10 +303,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
l->avl_height = node->avl_height + 1;
RCU_INIT_POINTER(*nodep, l);
} else { /* ll: RH, lr: RH+1 */
- lrl = rcu_dereference_protected(lr->avl_left,
- lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */
- lrr = rcu_dereference_protected(lr->avl_right,
- lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
+ lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+ lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
node->avl_height = rh + 1; /* node: RH+1 */
@@ -314,10 +319,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
} else if (rh > lh + 1) { /* r: LH+2 */
struct inet_peer *rr, *rl, *rlr, *rll;
int rlh;
- rr = rcu_dereference_protected(r->avl_right,
- lockdep_is_held(&base->lock));
- rl = rcu_dereference_protected(r->avl_left,
- lockdep_is_held(&base->lock));
+ rr = rcu_deref_locked(r->avl_right, base);
+ rl = rcu_deref_locked(r->avl_left, base);
rlh = node_height(rl);
if (rlh <= node_height(rr)) { /* rr: LH+1 */
RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -328,10 +331,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
r->avl_height = node->avl_height + 1;
RCU_INIT_POINTER(*nodep, r);
} else { /* rr: RH, rl: RH+1 */
- rlr = rcu_dereference_protected(rl->avl_right,
- lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */
- rll = rcu_dereference_protected(rl->avl_left,
- lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
+ rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+ rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
node->avl_height = lh + 1; /* node: LH+1 */
@@ -365,217 +366,214 @@ static void inetpeer_free_rcu(struct rcu_head *head)
kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}
-/* May be called with local BH enabled. */
-static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
+static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH])
{
- int do_free;
-
- do_free = 0;
-
- spin_lock_bh(&base->lock);
- /* Check the reference counter. It was artificially incremented by 1
- * in cleanup() function to prevent sudden disappearing. If we can
- * atomically (because of lockless readers) take this last reference,
- * it's safe to remove the node and free it later.
- * We use refcnt=-1 to alert lockless readers this entry is deleted.
- */
- if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
- struct inet_peer __rcu **stack[PEER_MAXDEPTH];
- struct inet_peer __rcu ***stackptr, ***delp;
- if (lookup(&p->daddr, stack, base) != p)
- BUG();
- delp = stackptr - 1; /* *delp[0] == p */
- if (p->avl_left == peer_avl_empty_rcu) {
- *delp[0] = p->avl_right;
- --stackptr;
- } else {
- /* look for a node to insert instead of p */
- struct inet_peer *t;
- t = lookup_rightempty(p, base);
- BUG_ON(rcu_dereference_protected(*stackptr[-1],
- lockdep_is_held(&base->lock)) != t);
- **--stackptr = t->avl_left;
- /* t is removed, t->daddr > x->daddr for any
- * x in p->avl_left subtree.
- * Put t in the old place of p. */
- RCU_INIT_POINTER(*delp[0], t);
- t->avl_left = p->avl_left;
- t->avl_right = p->avl_right;
- t->avl_height = p->avl_height;
- BUG_ON(delp[1] != &p->avl_left);
- delp[1] = &t->avl_left; /* was &p->avl_left */
- }
- peer_avl_rebalance(stack, stackptr, base);
- base->total--;
- do_free = 1;
+ struct inet_peer __rcu ***stackptr, ***delp;
+
+ if (lookup(&p->daddr, stack, base) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty_rcu) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p, base);
+ BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+ **--stackptr = t->avl_left;
+ /* t is removed, t->daddr > x->daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ RCU_INIT_POINTER(*delp[0], t);
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ BUG_ON(delp[1] != &p->avl_left);
+ delp[1] = &t->avl_left; /* was &p->avl_left */
}
- spin_unlock_bh(&base->lock);
-
- if (do_free)
- call_rcu_bh(&p->rcu, inetpeer_free_rcu);
- else
- /* The node is used again. Decrease the reference counter
- * back. The loop "cleanup -> unlink_from_unused
- * -> unlink_from_pool -> putpeer -> link_to_unused
- * -> cleanup (for the same node)"
- * doesn't really exist because the entry will have a
- * recent deletion time and will not be cleaned again soon.
- */
- inet_putpeer(p);
-}
-
-static struct inet_peer_base *family_to_base(int family)
-{
- return (family == AF_INET ? &v4_peers : &v6_peers);
+ peer_avl_rebalance(stack, stackptr, base);
+ base->total--;
+ call_rcu(&p->rcu, inetpeer_free_rcu);
}
-static struct inet_peer_base *peer_to_base(struct inet_peer *p)
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+ struct inet_peer __rcu ***stackptr)
{
- return family_to_base(p->daddr.family);
-}
+ struct inet_peer *p, *gchead = NULL;
+ __u32 delta, ttl;
+ int cnt = 0;
-/* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl)
-{
- struct inet_peer *p = NULL;
-
- /* Remove the first entry from the list of unused nodes. */
- spin_lock_bh(&unused_peers.lock);
- if (!list_empty(&unused_peers.list)) {
- __u32 delta;
-
- p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
- delta = (__u32)jiffies - p->dtime;
-
- if (delta < ttl) {
- /* Do not prune fresh entries. */
- spin_unlock_bh(&unused_peers.lock);
- return -1;
+ if (base->total >= inet_peer_threshold)
+ ttl = 0; /* be aggressive */
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ base->total / inet_peer_threshold * HZ;
+ stackptr--; /* last stack slot is peer_avl_empty */
+ while (stackptr > stack) {
+ stackptr--;
+ p = rcu_deref_locked(**stackptr, base);
+ if (atomic_read(&p->refcnt) == 0) {
+ smp_rmb();
+ delta = (__u32)jiffies - p->dtime;
+ if (delta >= ttl &&
+ atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+ p->gc_next = gchead;
+ gchead = p;
+ }
}
-
- list_del_init(&p->unused);
-
- /* Grab an extra reference to prevent node disappearing
- * before unlink_from_pool() call. */
- atomic_inc(&p->refcnt);
}
- spin_unlock_bh(&unused_peers.lock);
-
- if (p == NULL)
- /* It means that the total number of USED entries has
- * grown over inet_peer_threshold. It shouldn't really
- * happen because of entry limits in route cache. */
- return -1;
-
- unlink_from_pool(p, peer_to_base(p));
- return 0;
+ while ((p = gchead) != NULL) {
+ gchead = p->gc_next;
+ cnt++;
+ unlink_from_pool(p, base, stack);
+ }
+ return cnt;
}
-/* Called with or without local BH being disabled. */
-struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
+/*
+ * Find the peer entry for @daddr in @base, optionally creating it.
+ *
+ * A lockless lookup is tried first inside a seqlock read section; a hit is
+ * returned straight away.  On a miss with @create == 0 the function returns
+ * NULL unless the sequence check reports that a writer ran during the
+ * lookup.  Otherwise the lookup is repeated under write_seqlock_bh(); a hit
+ * there has its refcnt incremented before it is returned.  On a locked miss
+ * inet_peer_gc() is run at most once per call and, when it collected at
+ * least one entry and @create is set, the locked lookup is redone.  Finally,
+ * if @create is set, a new entry is allocated with GFP_ATOMIC, initialised
+ * with refcnt 1 and linked into the pool.
+ *
+ * Returns the entry, or NULL when nothing was found and either @create is 0
+ * or the allocation failed.
+ */
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+ const struct inetpeer_addr *daddr,
+ int create)
{
struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- struct inet_peer_base *base = family_to_base(AF_INET);
struct inet_peer *p;
+ unsigned int sequence;
+ int invalidated, gccnt = 0;
+
+ flush_check(base, daddr->family);
- /* Look up for the address quickly, lockless.
+ /* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
- rcu_read_lock_bh();
- p = lookup_rcu_bh(daddr, base);
- rcu_read_unlock_bh();
+ rcu_read_lock();
+ sequence = read_seqbegin(&base->lock);
+ p = lookup_rcu(daddr, base);
+ invalidated = read_seqretry(&base->lock, sequence);
+ rcu_read_unlock();
- if (p) {
- /* The existing node has been found.
- * Remove the entry from unused list if it was there.
- */
- unlink_from_unused(p);
+ if (p)
return p;
- }
+
+ /* If no writer did a change during our lookup, we can return early. */
+ if (!create && !invalidated)
+ return NULL;
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
- spin_lock_bh(&base->lock);
+ write_seqlock_bh(&base->lock);
+relookup:
p = lookup(daddr, stack, base);
if (p != peer_avl_empty) {
atomic_inc(&p->refcnt);
- spin_unlock_bh(&base->lock);
- /* Remove the entry from unused list if it was there. */
- unlink_from_unused(p);
+ write_sequnlock_bh(&base->lock);
return p;
}
+ /* Garbage-collect at most once per call (gccnt stays non-zero after
+ * a pass that collected something).
+ * NOTE(review): stackptr is not assigned in the visible code;
+ * presumably lookup() is a macro that sets it - confirm.
+ */
+ if (!gccnt) {
+ gccnt = inet_peer_gc(base, stack, stackptr);
+ /* NOTE(review): presumably the GC pass altered the tree, so
+ * stack[] has to be rebuilt before linking - confirm.
+ */
+ if (gccnt && create)
+ goto relookup;
+ }
p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
if (p) {
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
- p->tcp_ts_stamp = 0;
- INIT_LIST_HEAD(&p->unused);
-
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ /* 60*HZ is arbitrary, but chosen high enough so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+ INIT_LIST_HEAD(&p->gc_list);
/* Link the node. */
link_to_pool(p, base);
base->total++;
}
- spin_unlock_bh(&base->lock);
-
- if (base->total >= inet_peer_threshold)
- /* Remove one less-recently-used entry. */
- cleanup_once(0);
+ write_sequnlock_bh(&base->lock);
return p;
}
+EXPORT_SYMBOL_GPL(inet_getpeer);
-static int compute_total(void)
+/*
+ * Drop one reference on @p.
+ *
+ * The release time is written to p->dtime before the refcnt is decremented,
+ * with smp_mb__before_atomic() between the two so the dtime store is ordered
+ * ahead of the decrement.  inet_peer_gc() reads them in the opposite order
+ * (refcnt, smp_rmb(), dtime).
+ */
+void inet_putpeer(struct inet_peer *p)
{
- return v4_peers.total + v6_peers.total;
+ p->dtime = (__u32)jiffies;
+ smp_mb__before_atomic();
+ atomic_dec(&p->refcnt);
}
-EXPORT_SYMBOL_GPL(inet_getpeer);
+EXPORT_SYMBOL_GPL(inet_putpeer);
-/* Called with local BH disabled. */
-static void peer_check_expire(unsigned long dummy)
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
- unsigned long now = jiffies;
- int ttl, total;
-
- total = compute_total();
- if (total >= inet_peer_threshold)
- ttl = inet_peer_minttl;
- else
- ttl = inet_peer_maxttl
- - (inet_peer_maxttl - inet_peer_minttl) / HZ *
- total / inet_peer_threshold * HZ;
- while (!cleanup_once(ttl)) {
- if (jiffies != now)
- break;
+ unsigned long now, token;
+ bool rc = false;
+
+ /* Without a peer entry there is no rate state to consult: allow. */
+ if (!peer)
+ return true;
+
+ /* Credit one token per jiffy elapsed since the previous call. */
+ token = peer->rate_tokens;
+ now = jiffies;
+ token += now - peer->rate_last;
+ peer->rate_last = now;
+ /* Cap the bucket at XRLIM_BURST_FACTOR times @timeout. */
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ /* A permitted message costs @timeout tokens; otherwise rc stays false. */
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
}
+ peer->rate_tokens = token;
+ return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
- /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
- * interval depending on the total number of entries (more entries,
- * less interval). */
- total = compute_total();
- if (total >= inet_peer_threshold)
- peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
- else
- peer_periodic_timer.expires = jiffies
- + inet_peer_gc_maxtime
- - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- total / inet_peer_threshold * HZ;
- add_timer(&peer_periodic_timer);
+/*
+ * RCU callback queued by inetpeer_invalidate_tree().
+ *
+ * Recovers the inet_peer that embeds @head (its gc_rcu member), appends it
+ * to the tail of gc_list under gc_lock, then schedules gc_work to run after
+ * gc_delay.
+ */
+static void inetpeer_inval_rcu(struct rcu_head *head)
+{
+ struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
+
+ spin_lock_bh(&gc_lock);
+ list_add_tail(&p->gc_list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
}
-void inet_putpeer(struct inet_peer *p)
+/*
+ * Detach the whole peer tree of @base.
+ *
+ * Under the write seqlock, a non-empty root is replaced by the empty
+ * sentinel, base->total is reset to 0, and the old root is handed to
+ * inetpeer_inval_rcu() through call_rcu().  An already empty tree is left
+ * untouched.
+ */
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
- local_bh_disable();
+ struct inet_peer *root;
+
+ write_seqlock_bh(&base->lock);
- if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
- list_add_tail(&p->unused, &unused_peers.list);
- p->dtime = (__u32)jiffies;
- spin_unlock(&unused_peers.lock);
+ root = rcu_deref_locked(base->root, base);
+ if (root != peer_avl_empty) {
+ base->root = peer_avl_empty_rcu;
+ base->total = 0;
+ call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
}
- local_bh_enable();
+ write_sequnlock_bh(&base->lock);
}
-EXPORT_SYMBOL_GPL(inet_putpeer);
+EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 99461f09320..3a83ce5efa8 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -39,11 +39,30 @@
#include <net/route.h>
#include <net/xfrm.h>
+/*
+ * Return true when @skb may be fragmented: either its IPv4 header does not
+ * carry IP_DF, or skb->ignore_df is set.
+ */
+static bool ip_may_fragment(const struct sk_buff *skb)
+{
+	/* DF clear is the uncommon case on the forwarding path. */
+	if (unlikely(!(ip_hdr(skb)->frag_off & htons(IP_DF))))
+		return true;
+
+	return skb->ignore_df;
+}
+
+/*
+ * Return true when @skb is larger than @mtu.  A GSO skb is not counted as
+ * too large when skb_gso_network_seglen() reports a value that fits in @mtu.
+ */
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	bool gso_fits;
+
+	if (skb->len <= mtu)
+		return false;
+
+	gso_fits = skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu;
+
+	return !gso_fits;
+}
+
+
static int ip_forward_finish(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -53,9 +72,14 @@ static int ip_forward_finish(struct sk_buff *skb)
int ip_forward(struct sk_buff *skb)
{
+ u32 mtu;
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ /* that should never happen */
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
if (skb_warn_if_lro(skb))
goto drop;
@@ -66,9 +90,6 @@ int ip_forward(struct sk_buff *skb)
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
- if (skb->pkt_type != PACKET_HOST)
- goto drop;
-
skb_forward_csum(skb);
/*
@@ -84,14 +105,15 @@ int ip_forward(struct sk_buff *skb)
rt = skb_rtable(skb);
- if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
- if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
- (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dst_mtu(&rt->dst)));
+ htonl(mtu));
goto drop;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e6215bdd96c..ed32313e307 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -20,6 +20,8 @@
* Patrick McHardy : LRU queue of frag heads for evictor.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -45,6 +47,7 @@
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -70,11 +73,17 @@ struct ipq {
__be32 daddr;
__be16 id;
u8 protocol;
+ u8 ecn; /* RFC3168 support */
int iif;
unsigned int rid;
struct inet_peer *peer;
};
+/*
+ * Encode the ECN bits of @tos (tos & INET_ECN_MASK) as a single set bit, so
+ * that the values seen across fragments can be OR-ed together in ipq->ecn.
+ */
+static inline u8 ip4_frag_ecn(u8 tos)
+{
+	const int shift = tos & INET_ECN_MASK;
+
+	return 1 << shift;
+}
+
static struct inet_frags ip4_frags;
int ip_frag_nqueues(struct net *net)
@@ -84,7 +93,7 @@ int ip_frag_nqueues(struct net *net)
+/* Return what sum_frag_mem_limit() reports for @net's IPv4 fragment state. */
int ip_frag_mem(struct net *net)
{
- return atomic_read(&net->ipv4.frags.mem);
+ return sum_frag_mem_limit(&net->ipv4.frags);
}
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
@@ -97,6 +106,7 @@ struct ip4_create_arg {
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
+ net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
@@ -110,38 +120,36 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
-static int ip4_frag_match(struct inet_frag_queue *q, void *a)
+/*
+ * Return true when queue @q belongs to the datagram described by @a (a
+ * struct ip4_create_arg): IP id, source address, destination address,
+ * protocol and defrag user must all be equal.
+ */
+static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
{
struct ipq *qp;
struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
- qp->saddr == arg->iph->saddr &&
- qp->daddr == arg->iph->daddr &&
- qp->protocol == arg->iph->protocol &&
- qp->user == arg->user;
-}
-
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
-{
- atomic_sub(skb->truesize, &nf->mem);
- kfree_skb(skb);
+ qp->saddr == arg->iph->saddr &&
+ qp->daddr == arg->iph->daddr &&
+ qp->protocol == arg->iph->protocol &&
+ qp->user == arg->user;
}
+/*
+ * Initialise the ipq that embeds @q from the header and user carried in @a
+ * (a struct ip4_create_arg).
+ *
+ * The owning struct net is recovered from q->net with two container_of()
+ * steps (netns_frags -> netns_ipv4 -> net).  An inet_peer is only looked up
+ * (with create = 1) when sysctl_ipfrag_max_dist is non-zero; otherwise
+ * qp->peer is NULL.
+ */
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
+ struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+ frags);
+ struct net *net = container_of(ipv4, struct net, ipv4);
+
struct ip4_create_arg *arg = a;
qp->protocol = arg->iph->protocol;
qp->id = arg->iph->id;
+ qp->ecn = ip4_frag_ecn(arg->iph->tos);
qp->saddr = arg->iph->saddr;
qp->daddr = arg->iph->daddr;
qp->user = arg->user;
qp->peer = sysctl_ipfrag_max_dist ?
- inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
+ inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
}
static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -176,7 +184,7 @@ static void ip_evictor(struct net *net)
{
int evicted;
- evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
+ evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
if (evicted)
IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
}
@@ -204,31 +212,31 @@ static void ip_expire(unsigned long arg)
if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
struct sk_buff *head = qp->q.fragments;
+ const struct iphdr *iph;
+ int err;
rcu_read_lock();
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out_rcu_unlock;
+ /* skb has no dst, perform route lookup again */
+ iph = ip_hdr(head);
+ err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+ iph->tos, head->dev);
+ if (err)
+ goto out_rcu_unlock;
+
/*
- * Only search router table for the head fragment,
- * when defraging timeout at PRE_ROUTING HOOK.
+ * Only an end host needs to send an ICMP
+ * "Fragment Reassembly Timeout" message, per RFC792.
*/
- if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
- const struct iphdr *iph = ip_hdr(head);
- int err = ip_route_input(head, iph->daddr, iph->saddr,
- iph->tos, head->dev);
- if (unlikely(err))
- goto out_rcu_unlock;
-
- /*
- * Only an end host needs to send an ICMP
- * "Fragment Reassembly Timeout" message, per RFC792.
- */
- if (skb_rtable(head)->rt_type != RTN_LOCAL)
- goto out_rcu_unlock;
+ if (qp->user == IP_DEFRAG_AF_PACKET ||
+ ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
+ (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL)))
+ goto out_rcu_unlock;
- }
/* Send an ICMP "Fragment Reassembly Timeout" message. */
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
@@ -256,14 +264,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
- if (q == NULL)
- goto out_nomem;
-
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
return container_of(q, struct ipq, q);
-
-out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
- return NULL;
}
/* Is the fragment too far ahead to be part of ipq? */
@@ -297,6 +302,7 @@ static inline int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
atomic_inc(&qp->q.refcnt);
@@ -306,9 +312,12 @@ static int ip_frag_reinit(struct ipq *qp)
fp = qp->q.fragments;
do {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(qp->q.net, fp);
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
fp = xp;
} while (fp);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
qp->q.last_in = 0;
qp->q.len = 0;
@@ -316,6 +325,7 @@ static int ip_frag_reinit(struct ipq *qp)
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
qp->iif = 0;
+ qp->ecn = 0;
return 0;
}
@@ -328,6 +338,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
int flags, offset;
int ihl, end;
int err = -ENOENT;
+ u8 ecn;
if (qp->q.last_in & INET_FRAG_COMPLETE)
goto err;
@@ -339,6 +350,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
goto err;
}
+ ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
offset = ntohs(ip_hdr(skb)->frag_off);
flags = offset & ~IP_OFFSET;
offset &= IP_OFFSET;
@@ -352,7 +364,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
/* Is this the final fragment? */
if ((flags & IP_MF) == 0) {
/* If we already have some bits beyond end
- * or have different end, the segment is corrrupted.
+ * or have different end, the segment is corrupted.
*/
if (end < qp->q.len ||
((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
@@ -450,7 +462,8 @@ found:
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- frag_kfree_skb(qp->q.net, free_it);
+ sub_frag_mem_limit(&qp->q, free_it->truesize);
+ kfree_skb(free_it);
}
}
@@ -472,17 +485,27 @@ found:
}
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
- atomic_add(skb->truesize, &qp->q.net->mem);
+ qp->ecn |= ecn;
+ add_frag_mem_limit(&qp->q, skb->truesize);
if (offset == 0)
qp->q.last_in |= INET_FRAG_FIRST_IN;
+ if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+ skb->len + ihl > qp->q.max_size)
+ qp->q.max_size = skb->len + ihl;
+
if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
- qp->q.meat == qp->q.len)
- return ip_frag_reasm(qp, prev, dev);
+ qp->q.meat == qp->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ err = ip_frag_reasm(qp, prev, dev);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
- write_lock(&ip4_frags.lock);
- list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
- write_unlock(&ip4_frags.lock);
+ skb_dst_drop(skb);
+ inet_frag_lru_move(&qp->q);
return -EINPROGRESS;
err:
@@ -502,9 +525,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
int len;
int ihlen;
int err;
+ int sum_truesize;
+ u8 ecn;
ipq_kill(qp);
+ ecn = ip_frag_ecn_table[qp->ecn];
+ if (unlikely(ecn == 0xff)) {
+ err = -EINVAL;
+ goto out_fail;
+ }
/* Make the one we just received the head. */
if (prev) {
head = prev->next;
@@ -520,7 +550,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
skb_morph(head, qp->q.fragments);
head->next = qp->q.fragments->next;
- kfree_skb(qp->q.fragments);
+ consume_skb(qp->q.fragments);
qp->q.fragments = head;
}
@@ -536,7 +566,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_oversize;
/* Head of list must not be cloned. */
- if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(head, GFP_ATOMIC))
goto out_nomem;
/* If the first fragment is fragmented itself, we split
@@ -552,51 +582,65 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
- for (i=0; i<skb_shinfo(head)->nr_frags; i++)
- plen += skb_shinfo(head)->frags[i].size;
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
head->data_len -= clone->len;
head->len -= clone->len;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- atomic_add(clone->truesize, &qp->q.net->mem);
+ add_frag_mem_limit(&qp->q, clone->truesize);
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
+
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
}
- atomic_sub(head->truesize, &qp->q.net->mem);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
+ IPCB(head)->frag_max_size = qp->q.max_size;
iph = ip_hdr(head);
- iph->frag_off = 0;
+ /* max_size != 0 implies at least one fragment had IP_DF set */
+ iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
+ iph->tos |= ecn;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
return 0;
out_nomem:
- LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
- "queue %p\n", qp);
+ LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
+ qp);
err = -ENOMEM;
goto out_fail;
out_oversize:
- if (net_ratelimit())
- printk(KERN_INFO "Oversized IP packet from %pI4.\n",
- &qp->saddr);
+ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
return err;
@@ -612,8 +656,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
/* Start by cleaning up the memory. */
- if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
- ip_evictor(net);
+ ip_evictor(net);
/* Lookup (or create) queue header */
if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
@@ -634,6 +677,41 @@ int ip_defrag(struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_defrag);
+/*
+ * ip_check_defrag - run @skb through IPv4 reassembly if it is a fragment
+ * @skb:  packet whose data starts at the IPv4 header
+ * @user: defrag user id handed on to ip_defrag()
+ *
+ * The header is copied out with skb_copy_bits() so the skb is not modified
+ * before it has been unshared.  Packets that are not ETH_P_IP, whose header
+ * cannot be read, that fail the ihl/version/length sanity checks, or that
+ * are not fragments are returned unchanged.
+ *
+ * Returns the skb to continue with, or NULL when ip_defrag() returned
+ * non-zero or skb_share_check() failed.
+ */
+struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+{
+	struct iphdr iph;
+	u32 len;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return skb;
+
+	/* skb_copy_bits() returns 0 on success and a negative value on
+	 * failure; only bail out when the header could not be copied.
+	 */
+	if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0)
+		return skb;
+
+	if (iph.ihl < 5 || iph.version != 4)
+		return skb;
+
+	len = ntohs(iph.tot_len);
+	if (skb->len < len || len < (iph.ihl * 4))
+		return skb;
+
+	if (ip_is_fragment(&iph)) {
+		skb = skb_share_check(skb, GFP_ATOMIC);
+		if (skb) {
+			if (!pskb_may_pull(skb, iph.ihl*4))
+				return skb;
+			if (pskb_trim_rcsum(skb, len))
+				return skb;
+			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+			if (ip_defrag(skb, user))
+				return NULL;
+			skb_clear_hash(skb);
+		}
+	}
+	return skb;
+}
+EXPORT_SYMBOL(ip_check_defrag);
+
#ifdef CONFIG_SYSCTL
static int zero;
@@ -695,9 +773,13 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[0].data = &net->ipv4.frags.high_thresh;
table[1].data = &net->ipv4.frags.low_thresh;
table[2].data = &net->ipv4.frags.timeout;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
}
- hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
+ hdr = register_net_sysctl(net, "net/ipv4", table);
if (hdr == NULL)
goto err_reg;
@@ -722,7 +804,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
static void ip4_frags_ctl_register(void)
{
- register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
+ register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
#else
static inline int ip4_frags_ns_ctl_register(struct net *net)
@@ -741,14 +823,22 @@ static inline void ip4_frags_ctl_register(void)
static int __net_init ipv4_frags_init_net(struct net *net)
{
- /*
- * Fragment cache limits. We will commit 256K at one time. Should we
- * cross that limit we will prune down to 192K. This should cope with
- * even the most extreme cases without allowing an attacker to
- * measurably harm machine performance.
+ /* Fragment cache limits.
+ *
+ * The fragment memory accounting code, (tries to) account for
+ * the real memory usage, by measuring both the size of frag
+ * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+ * and the SKB's truesize.
+ *
+ * A 64K fragment consumes 129736 bytes (44*2944)+200
+ * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+ *
+ * We will commit 4MB at one time. Should we cross that limit
+ * we will prune down to 3MB, making room for approx 8 big 64K
+ * fragments 8x128k.
*/
- net->ipv4.frags.high_thresh = 256 * 1024;
- net->ipv4.frags.low_thresh = 192 * 1024;
+ net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
+ net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 258c98d5fa7..9b842544aea 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -35,7 +37,7 @@
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
@@ -46,7 +48,7 @@
#include <net/rtnetlink.h>
#include <net/gre.h>
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -65,7 +67,7 @@
it is infeasible task. The most general solutions would be
to keep skb->encapsulation counter (sort of local ttl),
and silently drop packet when it expires. It is a good
- solution, but it supposes maintaing new variable in ALL
+ solution, but it supposes maintaining new variable in ALL
skb, even if no tunneling is used.
Current solution: xmit_recursion breaks dead loops. This is a percpu
@@ -91,14 +93,14 @@
One of them is to parse packet trying to detect inner encapsulation
made by our node. It is difficult or even impossible, especially,
- taking into account fragmentation. TO be short, tt is not solution at all.
+ taking into account fragmentation. TO be short, ttl is not solution at all.
Current solution: The solution was UNEXPECTEDLY SIMPLE.
We force DF flag on tunnels with preconfigured hop limit,
that is ALL. :-) Well, it does not remove the problem completely,
but exponential growth of network traffic is changed to linear
(branches, that exceed pmtu are pruned) and tunnel mtu
- fastly degrades to value <68, where looping stops.
+ rapidly degrades to value <68, where looping stops.
Yes, it is not good if there exists a router in the loop,
which does not force DF, even when encapsulating packets have DF set.
But it is not our problem! Nobody could accuse us, we made
@@ -106,399 +108,54 @@
fatal route to network, even if it were you who configured
fatal static route: you are innocent. :-)
-
-
- 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
- practically identical code. It would be good to glue them
- together, but it is not very evident, how to make them modular.
- sit is integral part of IPv6, ipip and gre are naturally modular.
- We could extract common parts (hash table, ioctl etc)
- to a separate module (ip_tunnel.c).
-
Alexey Kuznetsov.
*/
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
-static void ipgre_tunnel_setup(struct net_device *dev);
-static int ipgre_tunnel_bind_dev(struct net_device *dev);
-
-/* Fallback tunnel: no source, no destination, no key, no options */
-
-#define HASH_SIZE 16
static int ipgre_net_id __read_mostly;
-struct ipgre_net {
- struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
-
- struct net_device *fb_tunnel_dev;
-};
-
-/* Tunnel hash table */
-
-/*
- 4 hash tables:
-
- 3: (remote,local)
- 2: (remote,*)
- 1: (*,local)
- 0: (*,*)
-
- We require exact key match i.e. if a key is present in packet
- it will match only tunnel with the same key; if it is not present,
- it will match only keyless tunnel.
-
- All keysless packets, if not matched configured keyless tunnels
- will match fallback tunnel.
- */
-
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-
-#define tunnels_r_l tunnels[3]
-#define tunnels_r tunnels[2]
-#define tunnels_l tunnels[1]
-#define tunnels_wc tunnels[0]
-/*
- * Locking : hash tables are protected by RCU and RTNL
- */
-
-#define for_each_ip_tunnel_rcu(start) \
- for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
-
-/* often modified stats are per cpu, other are shared (netdev->stats) */
-struct pcpu_tstats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long tx_packets;
- unsigned long tx_bytes;
-};
-
-static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
-{
- struct pcpu_tstats sum = { 0 };
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
-
- sum.rx_packets += tstats->rx_packets;
- sum.rx_bytes += tstats->rx_bytes;
- sum.tx_packets += tstats->tx_packets;
- sum.tx_bytes += tstats->tx_bytes;
- }
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
-}
-
-/* Given src, dst and key, find appropriate for input tunnel. */
-
-static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
- __be32 remote, __be32 local,
- __be32 key, __be16 gre_proto)
-{
- struct net *net = dev_net(dev);
- int link = dev->ifindex;
- unsigned int h0 = HASH(remote);
- unsigned int h1 = HASH(key);
- struct ip_tunnel *t, *cand = NULL;
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
- ARPHRD_ETHER : ARPHRD_IPGRE;
- int score, cand_score = 4;
-
- for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
- if (local != t->parms.iph.saddr ||
- remote != t->parms.iph.daddr ||
- key != t->parms.i_key ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
-
- for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
- if (remote != t->parms.iph.daddr ||
- key != t->parms.i_key ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
-
- for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
- if ((local != t->parms.iph.saddr &&
- (local != t->parms.iph.daddr ||
- !ipv4_is_multicast(local))) ||
- key != t->parms.i_key ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
-
- for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
- if (t->parms.i_key != key ||
- !(t->dev->flags & IFF_UP))
- continue;
-
- if (t->dev->type != ARPHRD_IPGRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
- }
-
- if (cand != NULL)
- return cand;
-
- dev = ign->fb_tunnel_dev;
- if (dev->flags & IFF_UP)
- return netdev_priv(dev);
+static int gre_tap_net_id __read_mostly;
- return NULL;
-}
-
-static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
- struct ip_tunnel_parm *parms)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- __be32 key = parms->i_key;
- unsigned int h = HASH(key);
- int prio = 0;
-
- if (local)
- prio |= 1;
- if (remote && !ipv4_is_multicast(remote)) {
- prio |= 2;
- h ^= HASH(remote);
- }
-
- return &ign->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
- struct ip_tunnel *t)
-{
- return __ipgre_bucket(ign, &t->parms);
-}
-
-static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
- struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
-
- rcu_assign_pointer(t->next, rtnl_dereference(*tp));
- rcu_assign_pointer(*tp, t);
-}
-static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp;
- struct ip_tunnel *iter;
-
- for (tp = ipgre_bucket(ign, t);
- (iter = rtnl_dereference(*tp)) != NULL;
- tp = &iter->next) {
- if (t == iter) {
- rcu_assign_pointer(*tp, t->next);
- break;
- }
- }
-}
-
-static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
- struct ip_tunnel_parm *parms,
- int type)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- __be32 key = parms->i_key;
- int link = parms->link;
- struct ip_tunnel *t;
- struct ip_tunnel __rcu **tp;
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- for (tp = __ipgre_bucket(ign, parms);
- (t = rtnl_dereference(*tp)) != NULL;
- tp = &t->next)
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr &&
- key == t->parms.i_key &&
- link == t->parms.link &&
- type == t->dev->type)
- break;
-
- return t;
-}
-
-static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms, int create)
-{
- struct ip_tunnel *t, *nt;
- struct net_device *dev;
- char name[IFNAMSIZ];
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
- if (t || !create)
- return t;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else
- strcpy(name, "gre%d");
-
- dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
- if (!dev)
- return NULL;
-
- dev_net_set(dev, net);
-
- if (strchr(name, '%')) {
- if (dev_alloc_name(dev, name) < 0)
- goto failed_free;
- }
-
- nt = netdev_priv(dev);
- nt->parms = *parms;
- dev->rtnl_link_ops = &ipgre_link_ops;
-
- dev->mtu = ipgre_tunnel_bind_dev(dev);
-
- if (register_netdevice(dev) < 0)
- goto failed_free;
-
- dev_hold(dev);
- ipgre_tunnel_link(ign, nt);
- return nt;
-
-failed_free:
- free_netdev(dev);
- return NULL;
-}
-
-static void ipgre_tunnel_uninit(struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- ipgre_tunnel_unlink(ign, netdev_priv(dev));
- dev_put(dev);
-}
-
-
-static void ipgre_err(struct sk_buff *skb, u32 info)
-{
+ /* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
-/* All the routers (except for Linux) return only
- 8 bytes of packet payload. It means, that precise relaying of
- ICMP in the real Internet is absolutely infeasible.
+ Moreover, Cisco "wise men" put GRE key to the third word
+ in GRE header. It makes impossible maintaining even soft
+ state for keyed GRE tunnels with enabled checksum. Tell
+ them "thank you".
- Moreover, Cisco "wise men" put GRE key to the third word
- in GRE header. It makes impossible maintaining even soft state for keyed
- GRE tunnels with enabled checksum. Tell them "thank you".
-
- Well, I wonder, rfc1812 was written by Cisco employee,
- what the hell these idiots break standrads established
- by themself???
- */
-
- struct iphdr *iph = (struct iphdr *)skb->data;
- __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
- int grehlen = (iph->ihl<<2) + 4;
+ Well, I wonder, rfc1812 was written by Cisco employee,
+ what the hell these idiots break standards established
+ by themselves???
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
- __be16 flags;
-
- flags = p[0];
- if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
- if (flags&(GRE_VERSION|GRE_ROUTING))
- return;
- if (flags&GRE_KEY) {
- grehlen += 4;
- if (flags&GRE_CSUM)
- grehlen += 4;
- }
- }
-
- /* If only 8 bytes returned, keyed message will be dropped here */
- if (skb_headlen(skb) < grehlen)
- return;
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return PACKET_RCVD;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
- case ICMP_FRAG_NEEDED:
- /* Soft state for pmtu is maintained by IP core. */
- return;
+ return PACKET_RCVD;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -509,627 +166,173 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
break;
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
+ return PACKET_RCVD;
+ break;
+
+ case ICMP_REDIRECT:
break;
}
- rcu_read_lock();
- t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
- flags & GRE_KEY ?
- *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
- p[1]);
- if (t == NULL || t->parms.iph.daddr == 0 ||
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (t == NULL)
+ return PACKET_REJECT;
+
+ if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- goto out;
+ return PACKET_RCVD;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- goto out;
+ return PACKET_RCVD;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
-out:
- rcu_read_unlock();
+ return PACKET_RCVD;
}
-static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
- if (INET_ECN_is_ce(iph->tos)) {
- if (skb->protocol == htons(ETH_P_IP)) {
- IP_ECN_set_ce(ip_hdr(skb));
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
- IP6_ECN_set_ce(ipv6_hdr(skb));
- }
- }
-}
-
-static inline u8
-ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
-{
- u8 inner = 0;
- if (skb->protocol == htons(ETH_P_IP))
- inner = old_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
- inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
- return INET_ECN_encapsulate(tos, inner);
-}
-
-static int ipgre_rcv(struct sk_buff *skb)
-{
- struct iphdr *iph;
- u8 *h;
- __be16 flags;
- __sum16 csum = 0;
- __be32 key = 0;
- u32 seqno = 0;
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
struct ip_tunnel *tunnel;
- int offset = 4;
- __be16 gre_proto;
- if (!pskb_may_pull(skb, 16))
- goto drop_nolock;
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
iph = ip_hdr(skb);
- h = skb->data;
- flags = *(__be16*)h;
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->saddr, iph->daddr, tpi->key);
- if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
- /* - Version must be 0.
- - We do not support routing headers.
- */
- if (flags&(GRE_VERSION|GRE_ROUTING))
- goto drop_nolock;
-
- if (flags&GRE_CSUM) {
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- csum = csum_fold(skb->csum);
- if (!csum)
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- csum = __skb_checksum_complete(skb);
- skb->ip_summed = CHECKSUM_COMPLETE;
- }
- offset += 4;
- }
- if (flags&GRE_KEY) {
- key = *(__be32*)(h + offset);
- offset += 4;
- }
- if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32*)(h + offset));
- offset += 4;
- }
+ if (tunnel) {
+ skb_pop_mac_header(skb);
+ ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+ return PACKET_RCVD;
}
-
- gre_proto = *(__be16 *)(h + 2);
-
- rcu_read_lock();
- if ((tunnel = ipgre_tunnel_lookup(skb->dev,
- iph->saddr, iph->daddr, key,
- gre_proto))) {
- struct pcpu_tstats *tstats;
-
- secpath_reset(skb);
-
- skb->protocol = gre_proto;
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
- skb->protocol = htons(ETH_P_IP);
- if ((*(h + offset) & 0xF0) != 0x40)
- offset += 4;
- }
-
- skb->mac_header = skb->network_header;
- __pskb_pull(skb, offset);
- skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
- skb->pkt_type = PACKET_HOST;
-#ifdef CONFIG_NET_IPGRE_BROADCAST
- if (ipv4_is_multicast(iph->daddr)) {
- /* Looped back packet, drop it! */
- if (rt_is_output_route(skb_rtable(skb)))
- goto drop;
- tunnel->dev->stats.multicast++;
- skb->pkt_type = PACKET_BROADCAST;
- }
-#endif
-
- if (((flags&GRE_CSUM) && csum) ||
- (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- if (tunnel->parms.i_flags&GRE_SEQ) {
- if (!(flags&GRE_SEQ) ||
- (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
- tunnel->i_seqno = seqno + 1;
- }
-
- /* Warning: All skb pointers will be invalidated! */
- if (tunnel->dev->type == ARPHRD_ETHER) {
- if (!pskb_may_pull(skb, ETH_HLEN)) {
- tunnel->dev->stats.rx_length_errors++;
- tunnel->dev->stats.rx_errors++;
- goto drop;
- }
-
- iph = ip_hdr(skb);
- skb->protocol = eth_type_trans(skb, tunnel->dev);
- skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- }
-
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
-
- __skb_tunnel_rx(skb, tunnel->dev);
-
- skb_reset_network_header(skb);
- ipgre_ecn_decapsulate(iph, skb);
-
- netif_rx(skb);
-
- rcu_read_unlock();
- return 0;
- }
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
-
-drop:
- rcu_read_unlock();
-drop_nolock:
- kfree_skb(skb);
- return 0;
+ return PACKET_REJECT;
}
-static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params,
+ __be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct pcpu_tstats *tstats;
- struct iphdr *old_iph = ip_hdr(skb);
- struct iphdr *tiph;
- u8 tos;
- __be16 df;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *iph; /* Our new IP header */
- unsigned int max_headroom; /* The extra header space needed */
- int gre_hlen;
- __be32 dst;
- int mtu;
-
- if (dev->type == ARPHRD_ETHER)
- IPCB(skb)->flags = 0;
-
- if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
- gre_hlen = 0;
- tiph = (struct iphdr *)skb->data;
- } else {
- gre_hlen = tunnel->hlen;
- tiph = &tunnel->parms.iph;
- }
-
- if ((dst = tiph->daddr) == 0) {
- /* NBMA tunnel */
-
- if (skb_dst(skb) == NULL) {
- dev->stats.tx_fifo_errors++;
- goto tx_error;
- }
-
- if (skb->protocol == htons(ETH_P_IP)) {
- rt = skb_rtable(skb);
- if ((dst = rt->rt_gateway) == 0)
- goto tx_error_icmp;
- }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct in6_addr *addr6;
- int addr_type;
- struct neighbour *neigh = skb_dst(skb)->neighbour;
-
- if (neigh == NULL)
- goto tx_error;
+ struct tnl_ptk_info tpi;
- addr6 = (struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
+ tpi.flags = tunnel->parms.o_flags;
+ tpi.proto = proto;
+ tpi.key = tunnel->parms.o_key;
+ if (tunnel->parms.o_flags & TUNNEL_SEQ)
+ tunnel->o_seqno++;
+ tpi.seq = htonl(tunnel->o_seqno);
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &ipv6_hdr(skb)->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
+ /* Push GRE header. */
+ gre_build_header(skb, &tpi, tunnel->hlen);
- if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
- goto tx_error_icmp;
-
- dst = addr6->s6_addr32[3];
- }
-#endif
- else
- goto tx_error;
- }
-
- tos = tiph->tos;
- if (tos == 1) {
- tos = 0;
- if (skb->protocol == htons(ETH_P_IP))
- tos = old_iph->tos;
- else if (skb->protocol == htons(ETH_P_IPV6))
- tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
- }
-
- {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = dst,
- .fl4_src = tiph->saddr,
- .fl4_tos = RT_TOS(tos),
- .fl_gre_key = tunnel->parms.o_key
- };
- if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error;
- }
- }
- tdev = rt->dst.dev;
-
- if (tdev == dev) {
- ip_rt_put(rt);
- dev->stats.collisions++;
- goto tx_error;
- }
-
- df = tiph->frag_off;
- if (df)
- mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
- else
- mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
-
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
-
- if (skb->protocol == htons(ETH_P_IP)) {
- df |= (old_iph->frag_off&htons(IP_DF));
+ ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
+}
- if ((old_iph->frag_off&htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
-
- if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr &&
- !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
- rt6->rt6i_dst.plen == 128) {
- rt6->rt6i_flags |= RTF_MODIFIED;
- skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
- }
- }
+static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tnl_params;
- if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-#endif
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ goto out;
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
+ if (dev->header_ops) {
+ /* Need space for new headers */
+ if (skb_cow_head(skb, dev->needed_headroom -
+ (tunnel->hlen + sizeof(struct iphdr))))
+ goto free_skb;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
+ tnl_params = (const struct iphdr *)skb->data;
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
-
- if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
- (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (max_headroom > dev->needed_headroom)
- dev->needed_headroom = max_headroom;
- if (!new_skb) {
- ip_rt_put(rt);
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = ip_hdr(skb);
- }
+ /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
+ * to gre header.
+ */
+ skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ } else {
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- skb_reset_transport_header(skb);
- skb_push(skb, gre_hlen);
- skb_reset_network_header(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
- /*
- * Push down and install the IPIP header.
- */
-
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr) >> 2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_GRE;
- iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
-
- if ((iph->ttl = tiph->ttl) == 0) {
- if (skb->protocol == htons(ETH_P_IP))
- iph->ttl = old_iph->ttl;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- else if (skb->protocol == htons(ETH_P_IPV6))
- iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
-#endif
- else
- iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
+ tnl_params = &tunnel->parms.iph;
}
- ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
- ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
- htons(ETH_P_TEB) : skb->protocol;
-
- if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
- __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
+ __gre_xmit(skb, dev, tnl_params, skb->protocol);
- if (tunnel->parms.o_flags&GRE_SEQ) {
- ++tunnel->o_seqno;
- *ptr = htonl(tunnel->o_seqno);
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_KEY) {
- *ptr = tunnel->parms.o_key;
- ptr--;
- }
- if (tunnel->parms.o_flags&GRE_CSUM) {
- *ptr = 0;
- *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
- }
- }
-
- nf_reset(skb);
- tstats = this_cpu_ptr(dev->tstats);
- __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
-tx_error_icmp:
- dst_link_failure(skb);
-
-tx_error:
- dev->stats.tx_errors++;
- dev_kfree_skb(skb);
+free_skb:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
-static int ipgre_tunnel_bind_dev(struct net_device *dev)
+static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- struct iphdr *iph;
- int hlen = LL_MAX_HEADER;
- int mtu = ETH_DATA_LEN;
- int addend = sizeof(struct iphdr) + 4;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
-
- /* Guess output device to choose reasonable mtu and needed_headroom */
-
- if (iph->daddr) {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_GRE,
- .fl_gre_key = tunnel->parms.o_key
- };
- struct rtable *rt;
-
- if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
- tdev = rt->dst.dev;
- ip_rt_put(rt);
- }
-
- if (dev->type != ARPHRD_ETHER)
- dev->flags |= IFF_POINTOPOINT;
- }
+ struct ip_tunnel *tunnel = netdev_priv(dev);
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ goto out;
- if (tdev) {
- hlen = tdev->hard_header_len + tdev->needed_headroom;
- mtu = tdev->mtu;
- }
- dev->iflink = tunnel->parms.link;
-
- /* Precalculate GRE options length */
- if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
- if (tunnel->parms.o_flags&GRE_CSUM)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_KEY)
- addend += 4;
- if (tunnel->parms.o_flags&GRE_SEQ)
- addend += 4;
- }
- dev->needed_headroom = addend + hlen;
- mtu -= dev->hard_header_len + addend;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
- if (mtu < 68)
- mtu = 68;
+ __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
- tunnel->hlen = addend;
+ return NETDEV_TX_OK;
- return mtu;
+free_skb:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
}
-static int
-ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+static int ipgre_tunnel_ioctl(struct net_device *dev,
+ struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ign->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipgre_tunnel_locate(net, &p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- if (!(p.i_flags&GRE_KEY))
- p.i_key = 0;
- if (!(p.o_flags&GRE_KEY))
- p.o_key = 0;
-
- t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- unsigned int nflags = 0;
-
- t = netdev_priv(dev);
-
- if (ipv4_is_multicast(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
-
- if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
- err = -EINVAL;
- break;
- }
- ipgre_tunnel_unlink(ign, t);
- synchronize_net();
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- t->parms.o_key = p.o_key;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipgre_tunnel_link(ign, t);
- netdev_state_change(dev);
- }
- }
-
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- dev->mtu = ipgre_tunnel_bind_dev(dev);
- netdev_state_change(dev);
- }
- }
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
-
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- if (dev == ign->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t == netdev_priv(ign->fb_tunnel_dev))
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
-
- default:
- err = -EINVAL;
+ return -EINVAL;
}
+ p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
+ p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
-done:
- return err;
-}
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
-static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- if (new_mtu < 68 ||
- new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
- return -EINVAL;
- dev->mtu = new_mtu;
+ p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
+ p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
return 0;
}
@@ -1159,38 +362,36 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
...
ftp fec0:6666:6666::193.233.7.65
...
-
*/
-
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr, unsigned int len)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
- __be16 *p = (__be16*)(iph+1);
+ struct iphdr *iph;
+ struct gre_base_hdr *greh;
- memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
- p[0] = t->parms.o_flags;
- p[1] = htons(type);
+ iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
+ greh = (struct gre_base_hdr *)(iph+1);
+ greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->protocol = htons(type);
- /*
- * Set the source hardware address.
- */
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+ /* Set the source hardware address. */
if (saddr)
memcpy(&iph->saddr, saddr, 4);
if (daddr)
memcpy(&iph->daddr, daddr, 4);
if (iph->daddr)
- return t->hlen;
+ return t->hlen + sizeof(*iph);
- return -t->hlen;
+ return -(t->hlen + sizeof(*iph));
}
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
- struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
+ const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
memcpy(haddr, &iph->saddr, 4);
return 4;
}
@@ -1206,17 +407,16 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi fl = {
- .oif = t->parms.link,
- .fl4_dst = t->parms.iph.daddr,
- .fl4_src = t->parms.iph.saddr,
- .fl4_tos = RT_TOS(t->parms.iph.tos),
- .proto = IPPROTO_GRE,
- .fl_gre_key = t->parms.o_key
- };
+ struct flowi4 fl4;
struct rtable *rt;
- if (ip_route_output_key(dev_net(dev), &rt, &fl))
+ rt = ip_route_output_gre(t->net, &fl4,
+ t->parms.iph.daddr,
+ t->parms.iph.saddr,
+ t->parms.o_key,
+ RT_TOS(t->parms.iph.tos),
+ t->parms.link);
+ if (IS_ERR(rt))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
ip_rt_put(rt);
@@ -1234,62 +434,77 @@ static int ipgre_close(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
- in_dev = inetdev_by_index(dev_net(dev), t->mlink);
+ in_dev = inetdev_by_index(t->net, t->mlink);
if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
}
return 0;
}
-
#endif
static const struct net_device_ops ipgre_netdev_ops = {
.ndo_init = ipgre_tunnel_init,
- .ndo_uninit = ipgre_tunnel_uninit,
+ .ndo_uninit = ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
.ndo_open = ipgre_open,
.ndo_stop = ipgre_close,
#endif
- .ndo_start_xmit = ipgre_tunnel_xmit,
+ .ndo_start_xmit = ipgre_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
- .ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats = ipgre_get_stats,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
-static void ipgre_dev_free(struct net_device *dev)
-{
- free_percpu(dev->tstats);
- free_netdev(dev);
-}
+#define GRE_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
- dev->destructor = ipgre_dev_free;
-
dev->type = ARPHRD_IPGRE;
- dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
- dev->flags = IFF_NOARP;
- dev->iflink = 0;
- dev->addr_len = 4;
- dev->features |= NETIF_F_NETNS_LOCAL;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ ip_tunnel_setup(dev, ipgre_net_id);
}
-static int ipgre_tunnel_init(struct net_device *dev)
+static void __gre_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel;
- struct iphdr *iph;
tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
+ tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+
+ dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+ dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
+
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported. */
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
+ dev->features |= NETIF_F_LLTX;
+ }
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ __gre_tunnel_init(dev);
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
- memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
+
+ dev->flags = IFF_NOARP;
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ dev->addr_len = 4;
if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -1303,100 +518,31 @@ static int ipgre_tunnel_init(struct net_device *dev)
} else
dev->header_ops = &ipgre_header_ops;
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
+ return ip_tunnel_init(dev);
}
-static void ipgre_fb_tunnel_init(struct net_device *dev)
-{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
- iph->version = 4;
- iph->protocol = IPPROTO_GRE;
- iph->ihl = 5;
- tunnel->hlen = sizeof(struct iphdr) + 4;
-
- dev_hold(dev);
-}
-
-
-static const struct gre_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
+static struct gre_cisco_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
+ .priority = 0,
};
-static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
-{
- int prio;
-
- for (prio = 0; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
-
- t = rtnl_dereference(ign->tunnels[prio][h]);
-
- while (t != NULL) {
- unregister_netdevice_queue(t->dev, head);
- t = rtnl_dereference(t->next);
- }
- }
- }
-}
-
static int __net_init ipgre_init_net(struct net *net)
{
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int err;
-
- ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
- ipgre_tunnel_setup);
- if (!ign->fb_tunnel_dev) {
- err = -ENOMEM;
- goto err_alloc_dev;
- }
- dev_net_set(ign->fb_tunnel_dev, net);
-
- ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
- ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
-
- if ((err = register_netdev(ign->fb_tunnel_dev)))
- goto err_reg_dev;
-
- rcu_assign_pointer(ign->tunnels_wc[0],
- netdev_priv(ign->fb_tunnel_dev));
- return 0;
-
-err_reg_dev:
- ipgre_dev_free(ign->fb_tunnel_dev);
-err_alloc_dev:
- return err;
+ return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
static void __net_exit ipgre_exit_net(struct net *net)
{
- struct ipgre_net *ign;
- LIST_HEAD(list);
-
- ign = net_generic(net, ipgre_net_id);
- rtnl_lock();
- ipgre_destroy_tunnels(ign, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
+ ip_tunnel_delete_net(itn, &ipgre_link_ops);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
.exit = ipgre_exit_net,
.id = &ipgre_net_id,
- .size = sizeof(struct ipgre_net),
+ .size = sizeof(struct ip_tunnel_net),
};
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1441,8 +587,8 @@ out:
return ipgre_tunnel_validate(tb, data);
}
-static void ipgre_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
{
memset(parms, 0, sizeof(*parms));
@@ -1455,10 +601,10 @@ static void ipgre_netlink_parms(struct nlattr *data[],
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
if (data[IFLA_GRE_IFLAGS])
- parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
if (data[IFLA_GRE_OFLAGS])
- parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1482,145 +628,47 @@ static void ipgre_netlink_parms(struct nlattr *data[],
parms->iph.frag_off = htons(IP_DF);
}
-static int ipgre_tap_init(struct net_device *dev)
+static int gre_tap_init(struct net_device *dev)
{
- struct ip_tunnel *tunnel;
-
- tunnel = netdev_priv(dev);
-
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
- ipgre_tunnel_bind_dev(dev);
+ __gre_tunnel_init(dev);
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
+ return ip_tunnel_init(dev);
}
-static const struct net_device_ops ipgre_tap_netdev_ops = {
- .ndo_init = ipgre_tap_init,
- .ndo_uninit = ipgre_tunnel_uninit,
- .ndo_start_xmit = ipgre_tunnel_xmit,
+static const struct net_device_ops gre_tap_netdev_ops = {
+ .ndo_init = gre_tap_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = gre_tap_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
- .ndo_change_mtu = ipgre_tunnel_change_mtu,
- .ndo_get_stats = ipgre_get_stats,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
static void ipgre_tap_setup(struct net_device *dev)
{
-
ether_setup(dev);
-
- dev->netdev_ops = &ipgre_tap_netdev_ops;
- dev->destructor = ipgre_dev_free;
-
- dev->iflink = 0;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip_tunnel_setup(dev, gre_tap_net_id);
}
-static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
{
- struct ip_tunnel *nt;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- int mtu;
- int err;
-
- nt = netdev_priv(dev);
- ipgre_netlink_parms(data, &nt->parms);
-
- if (ipgre_tunnel_find(net, &nt->parms, dev->type))
- return -EEXIST;
-
- if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
- random_ether_addr(dev->dev_addr);
-
- mtu = ipgre_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
-
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & GRE_SEQ))
- dev->features |= NETIF_F_LLTX;
-
- err = register_netdevice(dev);
- if (err)
- goto out;
-
- dev_hold(dev);
- ipgre_tunnel_link(ign, nt);
+ struct ip_tunnel_parm p;
-out:
- return err;
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
}
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
- struct ip_tunnel *t, *nt;
- struct net *net = dev_net(dev);
- struct ipgre_net *ign = net_generic(net, ipgre_net_id);
struct ip_tunnel_parm p;
- int mtu;
-
- if (dev == ign->fb_tunnel_dev)
- return -EINVAL;
-
- nt = netdev_priv(dev);
- ipgre_netlink_parms(data, &p);
-
- t = ipgre_tunnel_locate(net, &p, 0);
-
- if (t) {
- if (t->dev != dev)
- return -EEXIST;
- } else {
- t = nt;
-
- if (dev->type != ARPHRD_ETHER) {
- unsigned int nflags = 0;
-
- if (ipv4_is_multicast(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
-
- if ((dev->flags ^ nflags) &
- (IFF_POINTOPOINT | IFF_BROADCAST))
- return -EINVAL;
- }
- ipgre_tunnel_unlink(ign, t);
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- t->parms.i_key = p.i_key;
- if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- }
- ipgre_tunnel_link(ign, t);
- netdev_state_change(dev);
- }
-
- t->parms.o_key = p.o_key;
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
-
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- mtu = ipgre_tunnel_bind_dev(dev);
- if (!tb[IFLA_MTU])
- dev->mtu = mtu;
- netdev_state_change(dev);
- }
-
- return 0;
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_changelink(dev, tb, &p);
}
static size_t ipgre_get_size(const struct net_device *dev)
@@ -1654,17 +702,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm *p = &t->parms;
- NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
- NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
- NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
- NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
- NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
- NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
- NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
- NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
- NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
- NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
-
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+ nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+ nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+ !!(p->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -1693,6 +742,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
.validate = ipgre_tunnel_validate,
.newlink = ipgre_newlink,
.changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
};
@@ -1706,27 +756,46 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
.validate = ipgre_tap_validate,
.newlink = ipgre_newlink,
.changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
};
-/*
- * And now the modules code and kernel interface.
- */
+static int __net_init ipgre_tap_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+}
+
+static void __net_exit ipgre_tap_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
+ ip_tunnel_delete_net(itn, &ipgre_tap_ops);
+}
+
+static struct pernet_operations ipgre_tap_net_ops = {
+ .init = ipgre_tap_init_net,
+ .exit = ipgre_tap_exit_net,
+ .id = &gre_tap_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
static int __init ipgre_init(void)
{
int err;
- printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+ pr_info("GRE over IPv4 tunneling driver\n");
err = register_pernet_device(&ipgre_net_ops);
if (err < 0)
return err;
- err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ err = register_pernet_device(&ipgre_tap_net_ops);
+ if (err < 0)
+ goto pnet_tap_faied;
+
+ err = gre_cisco_register(&ipgre_protocol);
if (err < 0) {
- printk(KERN_INFO "ipgre init: can't add protocol\n");
+ pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed;
}
@@ -1738,24 +807,25 @@ static int __init ipgre_init(void)
if (err < 0)
goto tap_ops_failed;
-out:
- return err;
+ return 0;
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ gre_cisco_unregister(&ipgre_protocol);
add_proto_failed:
+ unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_faied:
unregister_pernet_device(&ipgre_net_ops);
- goto out;
+ return err;
}
static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
- printk(KERN_INFO "ipgre close: can't remove protocol\n");
+ gre_cisco_unregister(&ipgre_protocol);
+ unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
@@ -1764,4 +834,5 @@ module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
-MODULE_ALIAS("gre0");
+MODULE_ALIAS_NETDEV("gre0");
+MODULE_ALIAS_NETDEV("gretap0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb..3d4da2c16b6 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -113,7 +113,8 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/system.h>
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -140,6 +141,7 @@
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
+#include <net/inet_ecn.h>
#include <linux/netfilter_ipv4.h>
#include <net/xfrm.h>
#include <linux/mroute.h>
@@ -148,7 +150,7 @@
/*
* Process Router Attention IP option (RFC 2113)
*/
-int ip_call_ra_chain(struct sk_buff *skb)
+bool ip_call_ra_chain(struct sk_buff *skb)
{
struct ip_ra_chain *ra;
u8 protocol = ip_hdr(skb)->protocol;
@@ -165,9 +167,9 @@ int ip_call_ra_chain(struct sk_buff *skb)
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
net_eq(sock_net(sk), dev_net(dev))) {
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
- return 1;
+ return true;
}
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -180,42 +182,30 @@ int ip_call_ra_chain(struct sk_buff *skb)
if (last) {
raw_rcv(last, skb);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
- __skb_pull(skb, ip_hdrlen(skb));
-
- /* Point into the IP datagram, just past the header. */
- skb_reset_transport_header(skb);
+ __skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
- int hash, raw;
const struct net_protocol *ipprot;
+ int raw;
resubmit:
raw = raw_local_deliver(skb, protocol);
- hash = protocol & (MAX_INET_PROTOS - 1);
- ipprot = rcu_dereference(inet_protos[hash]);
+ ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
int ret;
- if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
- if (net_ratelimit())
- printk("%s: proto %d isn't netns-ready\n",
- __func__, protocol);
- kfree_skb(skb);
- goto out;
- }
-
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
@@ -236,9 +226,11 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
- } else
+ kfree_skb(skb);
+ } else {
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
+ consume_skb(skb);
+ }
}
}
out:
@@ -256,7 +248,7 @@ int ip_local_deliver(struct sk_buff *skb)
* Reassemble IP fragments.
*/
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
@@ -265,10 +257,10 @@ int ip_local_deliver(struct sk_buff *skb)
ip_local_deliver_finish);
}
-static inline int ip_rcv_options(struct sk_buff *skb)
+static inline bool ip_rcv_options(struct sk_buff *skb)
{
struct ip_options *opt;
- struct iphdr *iph;
+ const struct iphdr *iph;
struct net_device *dev = skb->dev;
/* It looks as overkill, because not all
@@ -297,10 +289,10 @@ static inline int ip_rcv_options(struct sk_buff *skb)
if (in_dev) {
if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) &&
- net_ratelimit())
- printk(KERN_INFO "source route option %pI4 -> %pI4\n",
- &iph->saddr, &iph->daddr);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_info_ratelimited("source route option %pI4 -> %pI4\n",
+ &iph->saddr,
+ &iph->daddr);
goto drop;
}
}
@@ -309,38 +301,47 @@ static inline int ip_rcv_options(struct sk_buff *skb)
goto drop;
}
- return 0;
+ return false;
drop:
- return -1;
+ return true;
}
+int sysctl_ip_early_demux __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_ip_early_demux);
+
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->early_demux) {
+ ipprot->early_demux(skb);
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ }
+
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (skb_dst(skb) == NULL) {
+ if (!skb_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
- if (err == -EHOSTUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INADDRERRORS);
- else if (err == -ENETUNREACH)
- IP_INC_STATS_BH(dev_net(skb->dev),
- IPSTATS_MIB_INNOROUTES);
- else if (err == -EXDEV)
+ if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
@@ -374,7 +375,7 @@ drop:
*/
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
- struct iphdr *iph;
+ const struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
@@ -410,13 +411,20 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
+ BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+ BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+ BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+ IP_ADD_STATS_BH(dev_net(dev),
+ IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
- goto inhdr_error;
+ goto csum_error;
len = ntohs(iph->tot_len);
if (skb->len < len) {
@@ -434,6 +442,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
}
+ skb->transport_header = skb->network_header + iph->ihl*4;
+
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
@@ -443,6 +453,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
+csum_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
inhdr_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1906fa35860..ad382499bac 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -9,11 +9,14 @@
*
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <asm/uaccess.h>
+#include <asm/unaligned.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
@@ -24,6 +27,7 @@
#include <net/icmp.h>
#include <net/route.h>
#include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
/*
* Write options to IP header, record destination address to
@@ -36,8 +40,8 @@
* saddr is address of outgoing interface.
*/
-void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
- __be32 daddr, struct rtable *rt, int is_frag)
+void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
+ __be32 daddr, struct rtable *rt, int is_frag)
{
unsigned char *iph = skb_network_header(skb);
@@ -50,9 +54,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
if (!is_frag) {
if (opt->rr_needaddr)
- ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
+ ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
if (opt->ts_needaddr)
- ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
+ ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
if (opt->ts_needtime) {
struct timespec tv;
__be32 midtime;
@@ -83,28 +87,23 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
* NOTE: dopt cannot point to skb.
*/
-int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
+int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
{
- struct ip_options *sopt;
+ const struct ip_options *sopt;
unsigned char *sptr, *dptr;
int soffset, doffset;
int optlen;
- __be32 daddr;
memset(dopt, 0, sizeof(struct ip_options));
sopt = &(IPCB(skb)->opt);
- if (sopt->optlen == 0) {
- dopt->optlen = 0;
+ if (sopt->optlen == 0)
return 0;
- }
sptr = skb_network_header(skb);
dptr = dopt->__data;
- daddr = skb_rtable(skb)->rt_spec_dst;
-
if (sopt->rr) {
optlen = sptr[sopt->rr+1];
soffset = sptr[sopt->rr+2];
@@ -140,11 +139,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
} else {
dopt->ts_needtime = 0;
- if (soffset + 8 <= optlen) {
+ if (soffset + 7 <= optlen) {
__be32 addr;
- memcpy(&addr, sptr+soffset-1, 4);
- if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) {
+ memcpy(&addr, dptr+soffset-1, 4);
+ if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
dopt->ts_needtime = 1;
soffset += 8;
}
@@ -157,7 +156,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
dopt->optlen += optlen;
}
if (sopt->srr) {
- unsigned char * start = sptr+sopt->srr;
+ unsigned char *start = sptr+sopt->srr;
__be32 faddr;
optlen = start[1];
@@ -168,7 +167,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
soffset -= 4;
if (soffset > 3) {
memcpy(&faddr, &start[soffset-1], 4);
- for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+ for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
memcpy(&dptr[doffset-1], &start[soffset-1], 4);
/*
* RFC1812 requires to fix illegal source routes.
@@ -178,6 +177,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
doffset -= 4;
}
if (doffset > 3) {
+ __be32 daddr = fib_compute_spec_dst(skb);
+
memcpy(&start[doffset-1], &daddr, 4);
dopt->faddr = faddr;
dptr[0] = start[0];
@@ -209,10 +210,10 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
* Simple and stupid 8), but the most efficient way.
*/
-void ip_options_fragment(struct sk_buff * skb)
+void ip_options_fragment(struct sk_buff *skb)
{
unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
int l = opt->optlen;
int optlen;
@@ -226,7 +227,7 @@ void ip_options_fragment(struct sk_buff * skb)
continue;
}
optlen = optptr[1];
- if (optlen<2 || optlen>l)
+ if (optlen < 2 || optlen > l)
return;
if (!IPOPT_COPIED(*optptr))
memset(optptr, IPOPT_NOOP, optlen);
@@ -240,6 +241,15 @@ void ip_options_fragment(struct sk_buff * skb)
opt->ts_needtime = 0;
}
+/* helper used by ip_options_compile() to call fib_compute_spec_dst()
+ * at most one time.
+ */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+ if (*spec_dst == htonl(INADDR_ANY))
+ *spec_dst = fib_compute_spec_dst(skb);
+}
+
/*
* Verify options and fill pointers in struct options.
* Caller should clear *opt, and set opt->data.
@@ -247,14 +257,14 @@ void ip_options_fragment(struct sk_buff * skb)
*/
int ip_options_compile(struct net *net,
- struct ip_options * opt, struct sk_buff * skb)
+ struct ip_options *opt, struct sk_buff *skb)
{
- int l;
- unsigned char * iph;
- unsigned char * optptr;
- int optlen;
- unsigned char * pp_ptr = NULL;
+ __be32 spec_dst = htonl(INADDR_ANY);
+ unsigned char *pp_ptr = NULL;
struct rtable *rt = NULL;
+ unsigned char *optptr;
+ unsigned char *iph;
+ int optlen, l;
if (skb != NULL) {
rt = skb_rtable(skb);
@@ -265,27 +275,31 @@ int ip_options_compile(struct net *net,
for (l = opt->optlen; l > 0; ) {
switch (*optptr) {
- case IPOPT_END:
- for (optptr++, l--; l>0; optptr++, l--) {
+ case IPOPT_END:
+ for (optptr++, l--; l > 0; optptr++, l--) {
if (*optptr != IPOPT_END) {
*optptr = IPOPT_END;
opt->is_changed = 1;
}
}
goto eol;
- case IPOPT_NOOP:
+ case IPOPT_NOOP:
l--;
optptr++;
continue;
}
+ if (unlikely(l < 2)) {
+ pp_ptr = optptr;
+ goto error;
+ }
optlen = optptr[1];
- if (optlen<2 || optlen>l) {
+ if (optlen < 2 || optlen > l) {
pp_ptr = optptr;
goto error;
}
switch (*optptr) {
- case IPOPT_SSRR:
- case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
if (optlen < 3) {
pp_ptr = optptr + 1;
goto error;
@@ -311,7 +325,7 @@ int ip_options_compile(struct net *net,
opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
opt->srr = optptr - iph;
break;
- case IPOPT_RR:
+ case IPOPT_RR:
if (opt->rr) {
pp_ptr = optptr;
goto error;
@@ -329,8 +343,9 @@ int ip_options_compile(struct net *net,
pp_ptr = optptr + 2;
goto error;
}
- if (skb) {
- memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
opt->is_changed = 1;
}
optptr[2] += 4;
@@ -338,7 +353,7 @@ int ip_options_compile(struct net *net,
}
opt->rr = optptr - iph;
break;
- case IPOPT_TIMESTAMP:
+ case IPOPT_TIMESTAMP:
if (opt->ts) {
pp_ptr = optptr;
goto error;
@@ -352,52 +367,50 @@ int ip_options_compile(struct net *net,
goto error;
}
if (optptr[2] <= optlen) {
- __be32 *timeptr = NULL;
- if (optptr[2]+3 > optptr[1]) {
+ unsigned char *timeptr = NULL;
+ if (optptr[2]+3 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
switch (optptr[3]&0xF) {
- case IPOPT_TS_TSONLY:
- opt->ts = optptr - iph;
+ case IPOPT_TS_TSONLY:
if (skb)
- timeptr = (__be32*)&optptr[optptr[2]-1];
+ timeptr = &optptr[optptr[2]-1];
opt->ts_needtime = 1;
optptr[2] += 4;
break;
- case IPOPT_TS_TSANDADDR:
- if (optptr[2]+7 > optptr[1]) {
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
- if (skb) {
- memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
- timeptr = (__be32*)&optptr[optptr[2]+3];
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
+ timeptr = &optptr[optptr[2]+3];
}
opt->ts_needaddr = 1;
opt->ts_needtime = 1;
optptr[2] += 8;
break;
- case IPOPT_TS_PRESPEC:
- if (optptr[2]+7 > optptr[1]) {
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optlen) {
pp_ptr = optptr + 2;
goto error;
}
- opt->ts = optptr - iph;
{
__be32 addr;
memcpy(&addr, &optptr[optptr[2]-1], 4);
if (inet_addr_type(net, addr) == RTN_UNICAST)
break;
if (skb)
- timeptr = (__be32*)&optptr[optptr[2]+3];
+ timeptr = &optptr[optptr[2]+3];
}
opt->ts_needtime = 1;
optptr[2] += 8;
break;
- default:
- if (!skb && !capable(CAP_NET_RAW)) {
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr + 3;
goto error;
}
@@ -405,26 +418,26 @@ int ip_options_compile(struct net *net,
}
if (timeptr) {
struct timespec tv;
- __be32 midtime;
+ u32 midtime;
getnstimeofday(&tv);
- midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
- memcpy(timeptr, &midtime, sizeof(__be32));
+ midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
+ put_unaligned_be32(midtime, timeptr);
opt->is_changed = 1;
}
- } else {
- unsigned overflow = optptr[3]>>4;
+ } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ unsigned int overflow = optptr[3]>>4;
if (overflow == 15) {
pp_ptr = optptr + 3;
goto error;
}
- opt->ts = optptr - iph;
if (skb) {
optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
opt->is_changed = 1;
}
}
+ opt->ts = optptr - iph;
break;
- case IPOPT_RA:
+ case IPOPT_RA:
if (optlen < 4) {
pp_ptr = optptr + 1;
goto error;
@@ -432,8 +445,8 @@ int ip_options_compile(struct net *net,
if (optptr[2] == 0 && optptr[3] == 0)
opt->router_alert = optptr - iph;
break;
- case IPOPT_CIPSO:
- if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) {
+ case IPOPT_CIPSO:
+ if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
pp_ptr = optptr;
goto error;
}
@@ -443,10 +456,10 @@ int ip_options_compile(struct net *net,
goto error;
}
break;
- case IPOPT_SEC:
- case IPOPT_SID:
- default:
- if (!skb && !capable(CAP_NET_RAW)) {
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
pp_ptr = optptr;
goto error;
}
@@ -472,20 +485,20 @@ EXPORT_SYMBOL(ip_options_compile);
* Undo all the changes done by ip_options_compile().
*/
-void ip_options_undo(struct ip_options * opt)
+void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
- unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
memmove(optptr+7, optptr+3, optptr[1]-7);
memcpy(optptr+3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
- unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
}
if (opt->ts) {
- unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
if (opt->ts_needtime) {
optptr[2] -= 4;
memset(&optptr[optptr[2]-1], 0, 4);
@@ -499,19 +512,19 @@ void ip_options_undo(struct ip_options * opt)
}
}
-static struct ip_options *ip_options_get_alloc(const int optlen)
+static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
{
- return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3),
+ return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
GFP_KERNEL);
}
-static int ip_options_get_finish(struct net *net, struct ip_options **optp,
- struct ip_options *opt, int optlen)
+static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
+ struct ip_options_rcu *opt, int optlen)
{
while (optlen & 3)
- opt->__data[optlen++] = IPOPT_END;
- opt->optlen = optlen;
- if (optlen && ip_options_compile(net, opt, NULL)) {
+ opt->opt.__data[optlen++] = IPOPT_END;
+ opt->opt.optlen = optlen;
+ if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
kfree(opt);
return -EINVAL;
}
@@ -520,42 +533,42 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp,
return 0;
}
-int ip_options_get_from_user(struct net *net, struct ip_options **optp,
+int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
unsigned char __user *data, int optlen)
{
- struct ip_options *opt = ip_options_get_alloc(optlen);
+ struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
if (!opt)
return -ENOMEM;
- if (optlen && copy_from_user(opt->__data, data, optlen)) {
+ if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
kfree(opt);
return -EFAULT;
}
return ip_options_get_finish(net, optp, opt, optlen);
}
-int ip_options_get(struct net *net, struct ip_options **optp,
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
unsigned char *data, int optlen)
{
- struct ip_options *opt = ip_options_get_alloc(optlen);
+ struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
if (!opt)
return -ENOMEM;
if (optlen)
- memcpy(opt->__data, data, optlen);
+ memcpy(opt->opt.__data, data, optlen);
return ip_options_get_finish(net, optp, opt, optlen);
}
void ip_forward_options(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
- unsigned char * optptr;
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ unsigned char *optptr;
struct rtable *rt = skb_rtable(skb);
unsigned char *raw = skb_network_header(skb);
if (opt->rr_needaddr) {
optptr = (unsigned char *)raw + opt->rr;
- ip_rt_get_source(&optptr[optptr[2]-5], rt);
+ ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
opt->is_changed = 1;
}
if (opt->srr_is_hit) {
@@ -563,25 +576,27 @@ void ip_forward_options(struct sk_buff *skb)
optptr = raw + opt->srr;
- for ( srrptr=optptr[2], srrspace = optptr[1];
+ for ( srrptr = optptr[2], srrspace = optptr[1];
srrptr <= srrspace;
srrptr += 4
) {
if (srrptr + 3 > srrspace)
break;
- if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
+ if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)
break;
}
if (srrptr + 3 <= srrspace) {
opt->is_changed = 1;
- ip_rt_get_source(&optptr[srrptr-1], rt);
- ip_hdr(skb)->daddr = rt->rt_dst;
+ ip_hdr(skb)->daddr = opt->nexthop;
+ ip_rt_get_source(&optptr[srrptr-1], skb, rt);
optptr[2] = srrptr+4;
- } else if (net_ratelimit())
- printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+ } else {
+ net_crit_ratelimited("%s(): Argh! Destination lost!\n",
+ __func__);
+ }
if (opt->ts_needaddr) {
optptr = raw + opt->ts;
- ip_rt_get_source(&optptr[optptr[2]-9], rt);
+ ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
opt->is_changed = 1;
}
}
@@ -603,7 +618,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
unsigned long orefdst;
int err;
- if (!opt->srr)
+ if (!rt)
return 0;
if (skb->pkt_type != PACKET_HOST)
@@ -617,7 +632,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
if (rt->rt_type != RTN_LOCAL)
return -EINVAL;
- for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
if (srrptr + 3 > srrspace) {
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
return -EINVAL;
@@ -637,11 +652,12 @@ int ip_options_rcv_srr(struct sk_buff *skb)
if (rt2->rt_type != RTN_LOCAL)
break;
/* Superfast 8) loopback forward */
- memcpy(&iph->daddr, &optptr[srrptr-1], 4);
+ iph->daddr = nexthop;
opt->is_changed = 1;
}
if (srrptr <= srrspace) {
opt->srr_is_hit = 1;
+ opt->nexthop = nexthop;
opt->is_changed = 1;
}
return 0;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5090c7ff525..8d3b6b0e985 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -43,7 +43,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -82,9 +81,10 @@
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
-__inline__ void ip_send_check(struct iphdr *iph)
+void ip_send_check(struct iphdr *iph)
{
iph->check = 0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
@@ -101,36 +101,24 @@ int __ip_local_out(struct sk_buff *skb)
skb_dst(skb)->dev, dst_output);
}
-int ip_local_out(struct sk_buff *skb)
+int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
{
int err;
err = __ip_local_out(skb);
if (likely(err == 1))
- err = dst_output(skb);
+ err = dst_output_sk(sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out);
-
-/* dev_loopback_xmit for use with netfilter. */
-static int ip_dev_loopback_xmit(struct sk_buff *newskb)
-{
- skb_reset_mac_header(newskb);
- __skb_pull(newskb, skb_network_offset(newskb));
- newskb->pkt_type = PACKET_LOOPBACK;
- newskb->ip_summed = CHECKSUM_UNNECESSARY;
- WARN_ON(!skb_dst(newskb));
- netif_rx_ni(newskb);
- return 0;
-}
+EXPORT_SYMBOL_GPL(ip_local_out_sk);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
int ttl = inet->uc_ttl;
if (ttl < 0)
- ttl = dst_metric(dst, RTAX_HOPLIMIT);
+ ttl = ip4_dst_hoplimit(dst);
return ttl;
}
@@ -139,14 +127,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
*
*/
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
- __be32 saddr, __be32 daddr, struct ip_options *opt)
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
/* Build the IP header. */
- skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
iph->version = 4;
@@ -157,14 +145,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
+ iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
+ iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- ip_select_ident(iph, &rt->dst, sk);
+ ip_select_ident(skb, sk);
- if (opt && opt->optlen) {
- iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt, daddr, rt, 0);
+ if (opt && opt->opt.optlen) {
+ iph->ihl += opt->opt.optlen>>2;
+ ip_options_build(skb, &opt->opt, daddr, rt, 0);
}
skb->priority = sk->sk_priority;
@@ -181,6 +169,8 @@ static inline int ip_finish_output2(struct sk_buff *skb)
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -198,27 +188,69 @@ static inline int ip_finish_output2(struct sk_buff *skb)
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
- kfree_skb(skb);
+ consume_skb(skb);
skb = skb2;
}
- if (dst->hh)
- return neigh_hh_output(dst->hh, skb);
- else if (dst->neighbour)
- return dst->neighbour->output(skb);
+ rcu_read_lock_bh();
+ nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
+ neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+ if (!IS_ERR(neigh)) {
+ int res = dst_neigh_output(dst, neigh, skb);
- if (net_ratelimit())
- printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+ rcu_read_unlock_bh();
+ return res;
+ }
+ rcu_read_unlock_bh();
+
+ net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+ __func__);
kfree_skb(skb);
return -EINVAL;
}
-static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+static int ip_finish_output_gso(struct sk_buff *skb)
{
- struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+ netdev_features_t features;
+ struct sk_buff *segs;
+ int ret = 0;
+
+ /* common case: locally created skb or seglen is <= mtu */
+ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
+ skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+ return ip_finish_output2(skb);
- return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
- skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+ /* Slowpath - GSO segment length is exceeding the dst MTU.
+ *
+ * This can happen in two cases:
+ * 1) TCP GRO packet, DF bit not set
+ * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
+ * from host network stack.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = ip_fragment(segs, ip_finish_output2);
+
+ if (err && ret == 0)
+ ret = err;
+ segs = nskb;
+ } while (segs);
+
+ return ret;
}
static int ip_finish_output(struct sk_buff *skb)
@@ -230,15 +262,17 @@ static int ip_finish_output(struct sk_buff *skb)
return dst_output(skb);
}
#endif
- if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
+ if (skb_is_gso(skb))
+ return ip_finish_output_gso(skb);
+
+ if (skb->len > ip_skb_dst_mtu(skb))
return ip_fragment(skb, ip_finish_output2);
- else
- return ip_finish_output2(skb);
+
+ return ip_finish_output2(skb);
}
-int ip_mc_output(struct sk_buff *skb)
+int ip_mc_output(struct sock *sk, struct sk_buff *skb)
{
- struct sock *sk = skb->sk;
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
@@ -274,7 +308,7 @@ int ip_mc_output(struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
newskb, NULL, newskb->dev,
- ip_dev_loopback_xmit);
+ dev_loopback_xmit);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -289,7 +323,7 @@ int ip_mc_output(struct sk_buff *skb)
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
- NULL, newskb->dev, ip_dev_loopback_xmit);
+ NULL, newskb->dev, dev_loopback_xmit);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -297,7 +331,7 @@ int ip_mc_output(struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sk_buff *skb)
+int ip_output(struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
@@ -311,11 +345,26 @@ int ip_output(struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_queue_xmit(struct sk_buff *skb)
+/*
+ * Copy saddr and daddr from @fl4 into @iph with a single memcpy(),
+ * possibly using 64bit load/stores. Equivalent to :
+ * iph->saddr = fl4->saddr;
+ * iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+ BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+ offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); /* build fails unless daddr directly follows saddr in the flow key */
+ memcpy(&iph->saddr, &fl4->saddr,
+ sizeof(fl4->saddr) + sizeof(fl4->daddr)); /* NOTE(review): also relies on iph->daddr directly following iph->saddr - not asserted here */
+}
+
+/* Note: skb->sk can be different from sk, in case of tunnels */
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
- struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
- struct ip_options *opt = inet->opt;
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
@@ -324,6 +373,8 @@ int ip_queue_xmit(struct sk_buff *skb)
* f.e. by something like SCTP.
*/
rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ fl4 = &fl->u.ip4;
rt = skb_rtable(skb);
if (rt != NULL)
goto packet_routed;
@@ -335,59 +386,53 @@ int ip_queue_xmit(struct sk_buff *skb)
/* Use correct destination address if we have options. */
daddr = inet->inet_daddr;
- if(opt && opt->srr)
- daddr = opt->faddr;
-
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .mark = sk->sk_mark,
- .fl4_dst = daddr,
- .fl4_src = inet->inet_saddr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = inet->inet_dport };
-
- /* If this fails, retransmit mechanism of transport layer will
- * keep trying until route appears or the connection times
- * itself out.
- */
- security_sk_classify_flow(sk, &fl);
- if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
- goto no_route;
- }
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ daddr, inet->inet_saddr,
+ inet->inet_dport,
+ inet->inet_sport,
+ sk->sk_protocol,
+ RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ goto no_route;
sk_setup_caps(sk, &rt->dst);
}
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
- skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
- iph->saddr = rt->rt_src;
- iph->daddr = rt->rt_dst;
+ ip_copy_addrs(iph, fl4);
+
/* Transport layer set skb->h.foo itself. */
- if (opt && opt->optlen) {
- iph->ihl += opt->optlen >> 2;
- ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
+ if (inet_opt && inet_opt->opt.optlen) {
+ iph->ihl += inet_opt->opt.optlen >> 2;
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_more(iph, &rt->dst, sk,
- (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+ ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
+ /* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
@@ -421,10 +466,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
- to->nf_trace = from->nf_trace;
-#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
to->ipvs_property = from->ipvs_property;
#endif
@@ -458,10 +499,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
iph = ip_hdr(skb);
- if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+ mtu = ip_skb_dst_mtu(skb);
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(ip_skb_dst_mtu(skb)));
+ htonl(mtu));
kfree_skb(skb);
return -EMSGSIZE;
}
@@ -471,7 +515,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
*/
hlen = iph->ihl * 4;
- mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
+ mtu = mtu - hlen; /* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
if (skb->nf_bridge)
mtu -= nf_bridge_mtu_reduction(skb);
@@ -491,7 +535,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
- (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+ ip_is_fragment(iph) ||
skb_cloned(skb))
goto slow_path;
@@ -584,6 +628,11 @@ slow_path_clean:
}
slow_path:
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
+ goto fail;
+ iph = ip_hdr(skb);
+
left = skb->len - hlen; /* Space per frame */
ptr = hlen; /* Where to start from */
@@ -608,7 +657,7 @@ slow_path:
/* IF: it doesn't fit, use 'mtu' - the data space left */
if (len > mtu)
len = mtu;
- /* IF: we are not sending upto and including the packet end
+ /* IF: we are not sending up to and including the packet end
then align the next start on an eight byte boundary */
if (len < left) {
len &= ~7;
@@ -691,7 +740,7 @@ slow_path:
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
}
- kfree_skb(skb);
+ consume_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return err;
@@ -732,10 +781,11 @@ csum_page(struct page *page, int offset, int copy)
}
static inline int ip_ufo_append_data(struct sock *sk,
+ struct sk_buff_head *queue,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int mtu, unsigned int flags)
+ int transhdrlen, int maxfraglen, unsigned int flags)
{
struct sk_buff *skb;
int err;
@@ -744,7 +794,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
* device, so create one single skb packet containing complete
* udp datagram
*/
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
+ if ((skb = skb_peek_tail(queue)) == NULL) {
skb = sock_alloc_send_skb(sk,
hh_len + fragheaderlen + transhdrlen + 20,
(flags & MSG_DONTWAIT), &err);
@@ -764,104 +814,62 @@ static inline int ip_ufo_append_data(struct sock *sk,
/* initialize protocol header pointer */
skb->transport_header = skb->network_header + fragheaderlen;
- skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
- sk->sk_sndmsg_off = 0;
- /* specify the length of each IP datagram fragment */
- skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
- __skb_queue_tail(&sk->sk_write_queue, skb);
+
+ __skb_queue_tail(queue, skb);
+ } else if (skb_is_gso(skb)) {
+ goto append;
}
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ /* specify the length of each IP datagram fragment */
+ skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+append:
return skb_append_datato_frags(sk, skb, getfrag, from,
(length - transhdrlen));
}
-/*
- * ip_append_data() and ip_append_page() can make one large IP datagram
- * from many pieces of data. Each pieces will be holded on the socket
- * until ip_push_pending_frames() is called. Each piece can be a page
- * or non-page data.
- *
- * Not only UDP, other transport protocols - e.g. raw sockets - can use
- * this interface potentially.
- *
- * LATER: length must be adjusted by pad at tail, when it is required.
- */
-int ip_append_data(struct sock *sk,
- int getfrag(void *from, char *to, int offset, int len,
- int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- struct ipcm_cookie *ipc, struct rtable **rtp,
- unsigned int flags)
+static int __ip_append_data(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork,
+ struct page_frag *pfrag,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
- struct ip_options *opt = NULL;
+ struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
int mtu;
int copy;
int err;
int offset = 0;
- unsigned int maxfraglen, fragheaderlen;
+ unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
- struct rtable *rt;
+ struct rtable *rt = (struct rtable *)cork->dst;
- if (flags&MSG_PROBE)
- return 0;
+ skb = skb_peek_tail(queue);
- if (skb_queue_empty(&sk->sk_write_queue)) {
- /*
- * setup for corking.
- */
- opt = ipc->opt;
- if (opt) {
- if (inet->cork.opt == NULL) {
- inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
- if (unlikely(inet->cork.opt == NULL))
- return -ENOBUFS;
- }
- memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
- inet->cork.flags |= IPCORK_OPT;
- inet->cork.addr = ipc->addr;
- }
- rt = *rtp;
- if (unlikely(!rt))
- return -EFAULT;
- /*
- * We steal reference to this route, caller should not release it
- */
- *rtp = NULL;
- inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
- rt->dst.dev->mtu :
- dst_mtu(rt->dst.path);
- inet->cork.dst = &rt->dst;
- inet->cork.length = 0;
- sk->sk_sndmsg_page = NULL;
- sk->sk_sndmsg_off = 0;
- exthdrlen = rt->dst.header_len;
- length += exthdrlen;
- transhdrlen += exthdrlen;
- } else {
- rt = (struct rtable *)inet->cork.dst;
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ exthdrlen = !skb ? rt->dst.header_len : 0;
+ mtu = cork->fragsize;
- transhdrlen = 0;
- exthdrlen = 0;
- mtu = inet->cork.fragsize;
- }
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
- if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
- mtu-exthdrlen);
+ if (cork->length + length > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
return -EMSGSIZE;
}
@@ -875,15 +883,13 @@ int ip_append_data(struct sock *sk,
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
- skb = skb_peek_tail(&sk->sk_write_queue);
-
- inet->cork.length += length;
+ cork->length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO)) {
- err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
- fragheaderlen, transhdrlen, mtu,
- flags);
+ (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
+ err = ip_ufo_append_data(sk, queue, getfrag, from, length,
+ hh_len, fragheaderlen, transhdrlen,
+ maxfraglen, flags);
if (err)
goto error;
return 0;
@@ -933,17 +939,16 @@ alloc_new_skb:
else
alloclen = fraglen;
+ alloclen += exthdrlen;
+
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
- if (datalen == length + fraggap) {
+ if (datalen == length + fraggap)
alloclen += rt->dst.trailer_len;
- /* make sure mtu is not reached */
- if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
- datalen -= ALIGN(rt->dst.trailer_len, 8);
- }
+
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
@@ -960,7 +965,7 @@ alloc_new_skb:
else
/* only the initial fragment is
time stamped */
- ipc->tx_flags = 0;
+ cork->tx_flags = 0;
}
if (skb == NULL)
goto error;
@@ -971,16 +976,16 @@ alloc_new_skb:
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
- skb_shinfo(skb)->tx_flags = ipc->tx_flags;
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen);
+ data = skb_put(skb, fraglen + exthdrlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
- data += fragheaderlen;
+ data += fragheaderlen + exthdrlen;
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
@@ -1008,7 +1013,7 @@ alloc_new_skb:
/*
* Put the packet on the pending queue.
*/
- __skb_queue_tail(&sk->sk_write_queue, skb);
+ __skb_queue_tail(queue, skb);
continue;
}
@@ -1027,46 +1032,30 @@ alloc_new_skb:
}
} else {
int i = skb_shinfo(skb)->nr_frags;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
- struct page *page = sk->sk_sndmsg_page;
- int off = sk->sk_sndmsg_off;
- unsigned int left;
-
- if (page && (left = PAGE_SIZE - off) > 0) {
- if (copy >= left)
- copy = left;
- if (page != frag->page) {
- if (i == MAX_SKB_FRAGS) {
- err = -EMSGSIZE;
- goto error;
- }
- get_page(page);
- skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
- frag = &skb_shinfo(skb)->frags[i];
- }
- } else if (i < MAX_SKB_FRAGS) {
- if (copy > PAGE_SIZE)
- copy = PAGE_SIZE;
- page = alloc_pages(sk->sk_allocation, 0);
- if (page == NULL) {
- err = -ENOMEM;
- goto error;
- }
- sk->sk_sndmsg_page = page;
- sk->sk_sndmsg_off = 0;
- skb_fill_page_desc(skb, i, page, 0, 0);
- frag = &skb_shinfo(skb)->frags[i];
- } else {
- err = -EMSGSIZE;
- goto error;
- }
- if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
- err = -EFAULT;
+ err = -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag))
goto error;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ err = -EMSGSIZE;
+ if (i == MAX_SKB_FRAGS)
+ goto error;
+
+ __skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, 0);
+ skb_shinfo(skb)->nr_frags = ++i;
+ get_page(pfrag->page);
}
- sk->sk_sndmsg_off += copy;
- frag->size += copy;
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+ if (getfrag(from,
+ page_address(pfrag->page) + pfrag->offset,
+ offset, copy, skb->len, skb) < 0)
+ goto error_efault;
+
+ pfrag->offset += copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
@@ -1078,24 +1067,104 @@ alloc_new_skb:
return 0;
+error_efault:
+ err = -EFAULT;
error:
- inet->cork.length -= length;
+ cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err;
}
-ssize_t ip_append_page(struct sock *sk, struct page *page,
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+ struct ipcm_cookie *ipc, struct rtable **rtp)
+{
+ struct ip_options_rcu *opt;
+ struct rtable *rt;
+
+ rt = *rtp; /* check the route before allocating, so -EFAULT never leaves a fresh cork->opt behind */
+ if (unlikely(!rt))
+ return -EFAULT;
+ /*
+ * Setup for corking: copy options, route and per-message settings from @ipc into @cork. Returns 0, -EFAULT (no route) or -ENOBUFS.
+ */
+ opt = ipc->opt;
+ if (opt) {
+ if (cork->opt == NULL) {
+ cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+ sk->sk_allocation);
+ if (unlikely(cork->opt == NULL))
+ return -ENOBUFS; /* *rtp is left untouched: the caller still owns the route */
+ }
+ memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
+ cork->flags |= IPCORK_OPT;
+ cork->addr = ipc->addr;
+ }
+ /*
+ * We steal reference to this route, caller should not release it
+ */
+ *rtp = NULL;
+ cork->fragsize = ip_sk_use_pmtu(sk) ?
+ dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+ cork->dst = &rt->dst;
+ cork->length = 0;
+ cork->ttl = ipc->ttl;
+ cork->tos = ipc->tos;
+ cork->priority = ipc->priority;
+ cork->tx_flags = ipc->tx_flags;
+
+ return 0;
+}
+
+/*
+ * ip_append_data() and ip_append_page() can make one large IP datagram
+ * from many pieces of data. Each piece will be held on the socket
+ * until ip_push_pending_frames() is called. Each piece can be a page
+ * or non-page data.
+ *
+ * Not only UDP, other transport protocols - e.g. raw sockets - can
+ * potentially use this interface.
+ *
+ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk, struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+
+ if (flags&MSG_PROBE)
+ return 0; /* MSG_PROBE: report success without queueing anything */
+
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); /* first piece: initialise the socket's cork from @ipc and @rtp */
+ if (err)
+ return err;
+ } else {
+ transhdrlen = 0; /* data is already queued: pass no transport header length for this piece */
+ }
+
+ return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+ sk_page_frag(sk), getfrag,
+ from, length, transhdrlen, flags);
+}
+
+ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
int offset, size_t size, int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
struct rtable *rt;
struct ip_options *opt = NULL;
+ struct inet_cork *cork;
int hh_len;
int mtu;
int len;
int err;
- unsigned int maxfraglen, fragheaderlen, fraggap;
+ unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
if (inet->hdrincl)
return -EPERM;
@@ -1106,28 +1175,31 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
if (skb_queue_empty(&sk->sk_write_queue))
return -EINVAL;
- rt = (struct rtable *)inet->cork.dst;
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ cork = &inet->cork.base;
+ rt = (struct rtable *)cork->dst;
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
if (!(rt->dst.dev->features&NETIF_F_SG))
return -EOPNOTSUPP;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
- mtu = inet->cork.fragsize;
+ mtu = cork->fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
- if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
+ if (cork->length + size > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
return -EMSGSIZE;
}
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
return -EINVAL;
- inet->cork.length += size;
+ cork->length += size;
if ((size + skb->len > mtu) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO)) {
@@ -1197,7 +1269,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
if (len > size)
len = size;
if (skb_can_coalesce(skb, i, page, offset)) {
- skb_shinfo(skb)->frags[i-1].size += len;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
} else if (i < MAX_SKB_FRAGS) {
get_page(page);
skb_fill_page_desc(skb, i, page, offset, len);
@@ -1222,45 +1294,47 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
return 0;
error:
- inet->cork.length -= size;
+ cork->length -= size;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
return err;
}
-static void ip_cork_release(struct inet_sock *inet)
+static void ip_cork_release(struct inet_cork *cork)
{
- inet->cork.flags &= ~IPCORK_OPT;
- kfree(inet->cork.opt);
- inet->cork.opt = NULL;
- dst_release(inet->cork.dst);
- inet->cork.dst = NULL;
+ cork->flags &= ~IPCORK_OPT;
+ kfree(cork->opt);
+ cork->opt = NULL;
+ dst_release(cork->dst);
+ cork->dst = NULL;
}
/*
* Combined all pending IP fragments on the socket as one IP datagram
* and push them out.
*/
-int ip_push_pending_frames(struct sock *sk)
+struct sk_buff *__ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options *opt = NULL;
- struct rtable *rt = (struct rtable *)inet->cork.dst;
+ struct rtable *rt = (struct rtable *)cork->dst;
struct iphdr *iph;
__be16 df = 0;
__u8 ttl;
- int err = 0;
- if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+ if ((skb = __skb_dequeue(queue)) == NULL)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
/* move skb->data to ip header from ext header */
if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb));
- while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
__skb_pull(tmp_skb, skb_network_header_len(skb));
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
@@ -1275,84 +1349,141 @@ int ip_push_pending_frames(struct sock *sk)
* to fragment the frame generated here. No matter, what transforms
* how transforms change size of the packet, it will come out.
*/
- if (inet->pmtudisc < IP_PMTUDISC_DO)
- skb->local_df = 1;
+ skb->ignore_df = ip_sk_ignore_df(sk);
/* DF bit is set when we want to see DF on outgoing frames.
- * If local_df is set too, we still allow to fragment this frame
+ * If ignore_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+ if (inet->pmtudisc == IP_PMTUDISC_DO ||
+ inet->pmtudisc == IP_PMTUDISC_PROBE ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
- if (inet->cork.flags & IPCORK_OPT)
- opt = inet->cork.opt;
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
- if (rt->rt_type == RTN_MULTICAST)
+ if (cork->ttl != 0)
+ ttl = cork->ttl;
+ else if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
else
ttl = ip_select_ttl(inet, &rt->dst);
- iph = (struct iphdr *)skb->data;
+ iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- if (opt) {
- iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt, inet->cork.addr, rt, 0);
- }
- iph->tos = inet->tos;
+ iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
iph->frag_off = df;
- ip_select_ident(iph, &rt->dst, sk);
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
- iph->saddr = rt->rt_src;
- iph->daddr = rt->rt_dst;
+ ip_copy_addrs(iph, fl4);
+ ip_select_ident(skb, sk);
- skb->priority = sk->sk_priority;
+ if (opt) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, cork->addr, rt, 0);
+ }
+
+ skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
skb->mark = sk->sk_mark;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount
*/
- inet->cork.dst = NULL;
+ cork->dst = NULL;
skb_dst_set(skb, &rt->dst);
if (iph->protocol == IPPROTO_ICMP)
icmp_out_count(net, ((struct icmphdr *)
skb_transport_header(skb))->type);
- /* Netfilter gets whole the not fragmented skb. */
+ ip_cork_release(cork);
+out:
+ return skb;
+}
+
+int ip_send_skb(struct net *net, struct sk_buff *skb)
+{
+ int err;
+
err = ip_local_out(skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
if (err)
- goto error;
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
}
-out:
- ip_cork_release(inet);
return err;
+}
-error:
- IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
- goto out;
+int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
+{
+ struct sk_buff *skb;
+ /* Obtain the finished skb for @sk and hand it to ip_send_skb(); returns that call's result. */
+ skb = ip_finish_skb(sk, fl4); /* NOTE(review): presumably builds one skb from the socket's pending queue - helper not visible here */
+ if (!skb)
+ return 0; /* no skb was produced: nothing to send, reported as success */
+
+ /* Netfilter gets the whole, not yet fragmented skb. */
+ return ip_send_skb(sock_net(sk), skb);
+}
/*
* Throw away all pending data on the socket.
*/
-void ip_flush_pending_frames(struct sock *sk)
+static void __ip_flush_pending_frames(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
{
struct sk_buff *skb;
- while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+ while ((skb = __skb_dequeue_tail(queue)) != NULL)
kfree_skb(skb);
- ip_cork_release(inet_sk(sk));
+ ip_cork_release(cork);
}
+void ip_flush_pending_frames(struct sock *sk)
+{
+ __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); /* free every skb on the socket's own write queue and release its cork */
+}
+
+struct sk_buff *ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_cork cork;
+ struct sk_buff_head queue;
+ int err;
+ /* Build a datagram in one call using an on-stack queue and cork instead of the socket's; returns the result of __ip_make_skb(), NULL for MSG_PROBE, or ERR_PTR(). */
+ if (flags & MSG_PROBE)
+ return NULL;
+
+ __skb_queue_head_init(&queue);
+
+ cork.flags = 0;
+ cork.addr = 0;
+ cork.opt = NULL; /* preset the fields ip_setup_cork() reads or only conditionally updates */
+ err = ip_setup_cork(sk, &cork, ipc, rtp);
+ if (err)
+ return ERR_PTR(err); /* NOTE(review): cork is not released on this path - confirm ip_setup_cork() cannot fail after allocating cork.opt */
+
+ err = __ip_append_data(sk, fl4, &queue, &cork,
+ &current->task_frag, getfrag,
+ from, length, transhdrlen, flags);
+ if (err) {
+ __ip_flush_pending_frames(sk, &queue, &cork); /* free the partially built queue and release the cork */
+ return ERR_PTR(err);
+ }
+
+ return __ip_make_skb(sk, fl4, &queue, &cork);
+}
/*
* Fetch data from kernel space and fill in checksum if needed.
@@ -1369,74 +1500,88 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
/*
* Generic function to send a packet as reply to another packet.
- * Used to send TCP resets so far. ICMP should use this function too.
+ * Used to send some TCP resets/acks so far.
*
- * Should run single threaded per socket because it uses the sock
- * structure to pass arguments.
+ * Use a fake percpu inet socket to avoid false sharing and contention.
*/
-void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
- unsigned int len)
+static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { /* one preinitialised inet_sock per CPU, fetched by ip_send_unicast_reply() via get_cpu_var() */
+ .sk = {
+ .__sk_common = {
+ .skc_refcnt = ATOMIC_INIT(1),
+ },
+ .sk_wmem_alloc = ATOMIC_INIT(1),
+ .sk_allocation = GFP_ATOMIC,
+ .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
+ },
+ .pmtudisc = IP_PMTUDISC_WANT,
+ .uc_ttl = -1, /* NOTE(review): presumably "no per-socket TTL override" - confirm against ip_select_ttl() */
+};
+
+void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
+ __be32 saddr, const struct ip_reply_arg *arg,
+ unsigned int len)
{
- struct inet_sock *inet = inet_sk(sk);
- struct {
- struct ip_options opt;
- char data[40];
- } replyopts;
+ struct ip_options_data replyopts;
struct ipcm_cookie ipc;
- __be32 daddr;
+ struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
+ struct sk_buff *nskb;
+ struct sock *sk;
+ struct inet_sock *inet;
- if (ip_options_echo(&replyopts.opt, skb))
+ if (ip_options_echo(&replyopts.opt.opt, skb))
return;
- daddr = ipc.addr = rt->rt_src;
+ ipc.addr = daddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
- if (replyopts.opt.optlen) {
+ if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
- if (ipc.opt->srr)
- daddr = replyopts.opt.faddr;
+ if (replyopts.opt.opt.srr)
+ daddr = replyopts.opt.opt.faddr;
}
- {
- struct flowi fl = { .oif = arg->bound_dev_if,
- .fl4_dst = daddr,
- .fl4_src = rt->rt_spec_dst,
- .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
- .fl_ip_sport = tcp_hdr(skb)->dest,
- .fl_ip_dport = tcp_hdr(skb)->source,
- .proto = sk->sk_protocol,
- .flags = ip_reply_arg_flowi_flags(arg) };
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(sock_net(sk), &rt, &fl))
- return;
- }
+ flowi4_init_output(&fl4, arg->bound_dev_if,
+ IP4_REPLY_MARK(net, skb->mark),
+ RT_TOS(arg->tos),
+ RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
+ ip_reply_arg_flowi_flags(arg),
+ daddr, saddr,
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return;
- /* And let IP do all the hard work.
+ inet = &get_cpu_var(unicast_sock);
- This chunk is not reenterable, hence spinlock.
- Note that it uses the fact, that this function is called
- with locally disabled BH and that sk cannot be already spinlocked.
- */
- bh_lock_sock(sk);
- inet->tos = ip_hdr(skb)->tos;
+ inet->tos = arg->tos;
+ sk = &inet->sk;
sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
- ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+ sock_net_set(sk, net);
+ __skb_queue_head_init(&sk->sk_write_queue);
+ sk->sk_sndbuf = sysctl_wmem_default;
+ ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
&ipc, &rt, MSG_DONTWAIT);
- if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ nskb = skb_peek(&sk->sk_write_queue);
+ if (nskb) {
if (arg->csumoffset >= 0)
- *((__sum16 *)skb_transport_header(skb) +
- arg->csumoffset) = csum_fold(csum_add(skb->csum,
+ *((__sum16 *)skb_transport_header(nskb) +
+ arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
- skb->ip_summed = CHECKSUM_NONE;
- ip_push_pending_frames(sk);
+ nskb->ip_summed = CHECKSUM_NONE;
+ skb_orphan(nskb);
+ skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
+ ip_push_pending_frames(sk, &fl4);
}
- bh_unlock_sock(sk);
+ put_cpu_var(unicast_sock);
ip_rt_put(rt);
}
@@ -1446,7 +1591,7 @@ void __init ip_init(void)
ip_rt_init();
inet_initpeers();
-#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
- igmp_mc_proc_init();
+#if defined(CONFIG_IP_MULTICAST)
+ igmp_mc_init();
#endif
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 3948c86e59c..64741b93863 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -33,12 +33,14 @@
#include <linux/netfilter.h>
#include <linux/route.h>
#include <linux/mroute.h>
+#include <net/inet_ecn.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/compat.h>
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
#include <net/transp_v6.h>
#endif
+#include <net/ip_fib.h>
#include <linux/errqueue.h>
#include <asm/uaccess.h>
@@ -57,17 +59,9 @@
static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
- struct in_pktinfo info;
- struct rtable *rt = skb_rtable(skb);
+ struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
- if (rt) {
- info.ipi_ifindex = rt->rt_iif;
- info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
- } else {
- info.ipi_ifindex = 0;
- info.ipi_spec_dst.s_addr = 0;
- }
put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
}
@@ -96,7 +90,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
- struct ip_options * opt = (struct ip_options *)optbuf;
+ struct ip_options *opt = (struct ip_options *)optbuf;
if (IPCB(skb)->opt.optlen == 0)
return;
@@ -131,7 +125,7 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
struct sockaddr_in sin;
- struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
__be16 *ports = (__be16 *)skb_transport_header(skb);
if (skb_transport_offset(skb) + 4 > skb->len)
@@ -153,7 +147,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
struct inet_sock *inet = inet_sk(skb->sk);
- unsigned flags = inet->cmsg_flags;
+ unsigned int flags = inet->cmsg_flags;
/* Ordered by supposed usage frequency */
if (flags & 1)
@@ -192,14 +186,31 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
}
EXPORT_SYMBOL(ip_cmsg_recv);
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+ bool allow_ipv6)
{
- int err;
+ int err, val;
struct cmsghdr *cmsg;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
+#if IS_ENABLED(CONFIG_IPV6) /* also true for CONFIG_IPV6=m, unlike defined(CONFIG_IPV6) */
+ if (allow_ipv6 &&
+ cmsg->cmsg_level == SOL_IPV6 &&
+ cmsg->cmsg_type == IPV6_PKTINFO) {
+ struct in6_pktinfo *src_info;
+ /* Accept IPV6_PKTINFO when it carries a v4-mapped source: take oif and the IPv4 source address from it. */
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+ return -EINVAL;
+ src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+ return -EINVAL; /* any other IPv6 source address is rejected */
+ ipc->oif = src_info->ipi6_ifindex;
+ ipc->addr = src_info->ipi6_addr.s6_addr32[3]; /* last 32-bit word of a v4-mapped address is the IPv4 address */
+ continue;
+ }
+#endif
if (cmsg->cmsg_level != SOL_IP)
continue;
switch (cmsg->cmsg_type) {
@@ -220,6 +231,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
ipc->addr = info->ipi_spec_dst.s_addr;
break;
}
+ case IP_TTL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->ttl = val;
+ break;
+ case IP_TOS:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ ipc->tos = val;
+ ipc->priority = rt_tos2priority(ipc->tos);
+ break;
+
default:
return -EINVAL;
}
@@ -373,11 +402,11 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
/*
* Handle MSG_ERRQUEUE
*/
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
struct sock_exterr_skb *serr;
struct sk_buff *skb, *skb2;
- struct sockaddr_in *sin;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct {
struct sock_extended_err ee;
struct sockaddr_in offender;
@@ -403,13 +432,13 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
serr = SKB_EXT_ERR(skb);
- sin = (struct sockaddr_in *)msg->msg_name;
if (sin) {
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
serr->addr_offset);
sin->sin_port = serr->port;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
@@ -462,18 +491,28 @@ static int do_ip_setsockopt(struct sock *sk, int level,
struct inet_sock *inet = inet_sk(sk);
int val = 0, err;
- if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
- (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
- (1<<IP_RETOPTS) | (1<<IP_TOS) |
- (1<<IP_TTL) | (1<<IP_HDRINCL) |
- (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
- (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
- (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
- (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
- optname == IP_MULTICAST_TTL ||
- optname == IP_MULTICAST_ALL ||
- optname == IP_MULTICAST_LOOP ||
- optname == IP_RECVORIGDSTADDR) {
+ switch (optname) {
+ case IP_PKTINFO:
+ case IP_RECVTTL:
+ case IP_RECVOPTS:
+ case IP_RECVTOS:
+ case IP_RETOPTS:
+ case IP_TOS:
+ case IP_TTL:
+ case IP_HDRINCL:
+ case IP_MTU_DISCOVER:
+ case IP_RECVERR:
+ case IP_ROUTER_ALERT:
+ case IP_FREEBIND:
+ case IP_PASSSEC:
+ case IP_TRANSPARENT:
+ case IP_MINTTL:
+ case IP_NODEFRAG:
+ case IP_UNICAST_IF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_ALL:
+ case IP_MULTICAST_LOOP:
+ case IP_RECVORIGDSTADDR:
if (optlen >= sizeof(int)) {
if (get_user(val, (int __user *) optval))
return -EFAULT;
@@ -497,32 +536,36 @@ static int do_ip_setsockopt(struct sock *sk, int level,
switch (optname) {
case IP_OPTIONS:
{
- struct ip_options *opt = NULL;
+ struct ip_options_rcu *old, *opt = NULL;
+
if (optlen > 40)
goto e_inval;
err = ip_options_get_from_user(sock_net(sk), &opt,
optval, optlen);
if (err)
break;
+ old = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
if (inet->is_icsk) {
struct inet_connection_sock *icsk = inet_csk(sk);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == PF_INET ||
(!((1 << sk->sk_state) &
(TCPF_LISTEN | TCPF_CLOSE)) &&
inet->inet_daddr != LOOPBACK4_IPV6)) {
#endif
- if (inet->opt)
- icsk->icsk_ext_hdr_len -= inet->opt->optlen;
+ if (old)
+ icsk->icsk_ext_hdr_len -= old->opt.optlen;
if (opt)
- icsk->icsk_ext_hdr_len += opt->optlen;
+ icsk->icsk_ext_hdr_len += opt->opt.optlen;
icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
}
#endif
}
- opt = xchg(&inet->opt, opt);
- kfree(opt);
+ rcu_assign_pointer(inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
break;
}
case IP_PKTINFO:
@@ -569,8 +612,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
break;
case IP_TOS: /* This sets both TOS and Precedence */
if (sk->sk_type == SOCK_STREAM) {
- val &= ~3;
- val |= inet->tos & 3;
+ val &= ~INET_ECN_MASK;
+ val |= inet->tos & INET_ECN_MASK;
}
if (inet->tos != val) {
inet->tos = val;
@@ -581,7 +624,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_TTL:
if (optlen < 1)
goto e_inval;
- if (val != -1 && (val < 0 || val > 255))
+ if (val != -1 && (val < 1 || val > 255))
goto e_inval;
inet->uc_ttl = val;
break;
@@ -600,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
inet->nodefrag = val ? 1 : 0;
break;
case IP_MTU_DISCOVER:
- if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
goto e_inval;
inet->pmtudisc = val;
break;
@@ -625,6 +668,35 @@ static int do_ip_setsockopt(struct sock *sk, int level,
goto e_inval;
inet->mc_loop = !!val;
break;
+ case IP_UNICAST_IF:
+ {
+ struct net_device *dev = NULL;
+ int ifindex;
+
+ if (optlen != sizeof(int))
+ goto e_inval;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (ifindex == 0) {
+ inet->uc_index = 0;
+ err = 0;
+ break;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+ dev_put(dev);
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if)
+ break;
+
+ inet->uc_index = ifindex;
+ err = 0;
+ break;
+ }
case IP_MULTICAST_IF:
{
struct ip_mreqn mreq;
@@ -645,10 +717,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
break;
} else {
memset(&mreq, 0, sizeof(mreq));
- if (optlen >= sizeof(struct in_addr) &&
- copy_from_user(&mreq.imr_address, optval,
- sizeof(struct in_addr)))
- break;
+ if (optlen >= sizeof(struct ip_mreq)) {
+ if (copy_from_user(&mreq, optval,
+ sizeof(struct ip_mreq)))
+ break;
+ } else if (optlen >= sizeof(struct in_addr)) {
+ if (copy_from_user(&mreq.imr_address, optval,
+ sizeof(struct in_addr)))
+ break;
+ }
}
if (!mreq.imr_ifindex) {
@@ -946,13 +1023,14 @@ mc_msf_out:
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
break;
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
case IP_TRANSPARENT:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
err = -EPERM;
break;
}
@@ -982,20 +1060,29 @@ e_inval:
}
/**
- * ip_queue_rcv_skb - Queue an skb into sock receive queue
+ * ipv4_pktinfo_prepare - transfer some info from rtable to skb
* @sk: socket
* @skb: buffer
*
- * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
- * is not set, we drop skb dst entry now, while dst cache line is hot.
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
+ * This way, receiver doesn't make cache line misses to read rtable.
*/
-int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
{
- if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
- skb_dst_drop(skb);
- return sock_queue_rcv_skb(sk, skb);
+ struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+ bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
+ ipv6_sk_rxinfo(sk);
+
+ if (prepare && skb_rtable(skb)) {
+ pktinfo->ipi_ifindex = inet_iif(skb);
+ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
+ } else {
+ pktinfo->ipi_ifindex = 0;
+ pktinfo->ipi_spec_dst.s_addr = 0;
+ }
+ skb_dst_drop(skb);
}
-EXPORT_SYMBOL(ip_queue_rcv_skb);
int ip_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, unsigned int optlen)
@@ -1058,7 +1145,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
*/
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+ char __user *optval, int __user *optlen, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
int val;
@@ -1081,12 +1168,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_OPTIONS:
{
unsigned char optbuf[sizeof(struct ip_options)+40];
- struct ip_options * opt = (struct ip_options *)optbuf;
+ struct ip_options *opt = (struct ip_options *)optbuf;
+ struct ip_options_rcu *inet_opt;
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
opt->optlen = 0;
- if (inet->opt)
- memcpy(optbuf, inet->opt,
- sizeof(struct ip_options)+
- inet->opt->optlen);
+ if (inet_opt)
+ memcpy(optbuf, &inet_opt->opt,
+ sizeof(struct ip_options) +
+ inet_opt->opt.optlen);
release_sock(sk);
if (opt->optlen == 0)
@@ -1163,6 +1254,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MULTICAST_LOOP:
val = inet->mc_loop;
break;
+ case IP_UNICAST_IF:
+ val = (__force int)htonl((__u32) inet->uc_index);
+ break;
case IP_MULTICAST_IF:
{
struct in_addr addr;
@@ -1227,7 +1321,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
msg.msg_control = optval;
msg.msg_controllen = len;
- msg.msg_flags = 0;
+ msg.msg_flags = flags;
if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
struct in_pktinfo info;
@@ -1241,6 +1335,10 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
int hlim = inet->mc_ttl;
put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
}
+ if (inet->cmsg_flags & IP_CMSG_TOS) {
+ int tos = inet->rcv_tos;
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
len -= msg.msg_controllen;
return put_user(len, optlen);
}
@@ -1281,7 +1379,7 @@ int ip_getsockopt(struct sock *sk, int level,
{
int err;
- err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+ err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1314,7 +1412,8 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
return compat_mc_getsockopt(sk, level, optname, optval, optlen,
ip_getsockopt);
- err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+ err = do_ip_getsockopt(sk, level, optname, optval, optlen,
+ MSG_CMSG_COMPAT);
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 00000000000..6f9de61dce5
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1062 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+#include <linux/err.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
+{
+ return hash_32((__force u32)key ^ (__force u32)remote,
+ IP_TNL_HASH_BITS);
+}
+
+static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
+ struct dst_entry *dst)
+{
+ struct dst_entry *old_dst;
+
+ dst_clone(dst);
+ old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
+ dst_release(old_dst);
+}
+
+static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+{
+ __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
+}
+
+static void tunnel_dst_reset(struct ip_tunnel *t)
+{
+ tunnel_dst_set(t, NULL);
+}
+
+void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
+}
+EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
+
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
+{
+ struct dst_entry *dst;
+
+ rcu_read_lock();
+ dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
+ if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+ dst = NULL;
+ if (dst) {
+ if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ tunnel_dst_reset(t);
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return (struct rtable *)dst;
+}
+
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+ __be16 flags, __be32 key)
+{
+ if (p->i_flags & TUNNEL_KEY) {
+ if (flags & TUNNEL_KEY)
+ return key == p->i_key;
+ else
+ /* key expected, none present */
+ return false;
+ } else
+ return !(flags & TUNNEL_KEY);
+}
+
+/* Fallback tunnel: no source, no destination, no key, no options
+
+ Tunnel hash table:
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keyless packets, if not matched to configured keyless tunnels,
+ will match the fallback tunnel.
+ Given src, dst and key, find the appropriate tunnel for input.
+*/
+struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
+ int link, __be16 flags,
+ __be32 remote, __be32 local,
+ __be32 key)
+{
+ unsigned int hash;
+ struct ip_tunnel *t, *cand = NULL;
+ struct hlist_head *head;
+
+ hash = ip_tunnel_hash(key, remote);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local != t->parms.iph.saddr ||
+ remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else
+ cand = t;
+ }
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (remote != t->parms.iph.daddr ||
+ t->parms.iph.saddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ hash = ip_tunnel_hash(key, 0);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+ (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+ continue;
+
+ if (!(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ if (flags & TUNNEL_NO_KEY)
+ goto skip_key_lookup;
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (t->parms.i_key != key ||
+ t->parms.iph.saddr != 0 ||
+ t->parms.iph.daddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+skip_key_lookup:
+ if (cand)
+ return cand;
+
+ if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
+ return netdev_priv(itn->fb_tunnel_dev);
+
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
+
+static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ unsigned int h;
+ __be32 remote;
+ __be32 i_key = parms->i_key;
+
+ if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
+ remote = parms->iph.daddr;
+ else
+ remote = 0;
+
+ if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+ i_key = 0;
+
+ h = ip_tunnel_hash(i_key, remote);
+ return &itn->tunnels[h];
+}
+
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ struct hlist_head *head = ip_bucket(itn, &t->parms);
+
+ hlist_add_head_rcu(&t->hash_node, head);
+}
+
+static void ip_tunnel_del(struct ip_tunnel *t)
+{
+ hlist_del_init_rcu(&t->hash_node);
+}
+
+static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms,
+ int type)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ __be32 key = parms->i_key;
+ __be16 flags = parms->i_flags;
+ int link = parms->link;
+ struct ip_tunnel *t = NULL;
+ struct hlist_head *head = ip_bucket(itn, parms);
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ link == t->parms.link &&
+ type == t->dev->type &&
+ ip_tunnel_key_match(&t->parms, flags, key))
+ break;
+ }
+ return t;
+}
+
+static struct net_device *__ip_tunnel_create(struct net *net,
+ const struct rtnl_link_ops *ops,
+ struct ip_tunnel_parm *parms)
+{
+ int err;
+ struct ip_tunnel *tunnel;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
+ err = -E2BIG;
+ goto failed;
+ }
+ strlcpy(name, ops->kind, IFNAMSIZ);
+ strncat(name, "%d", 2);
+ }
+
+ ASSERT_RTNL();
+ dev = alloc_netdev(ops->priv_size, name, ops->setup);
+ if (!dev) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ dev_net_set(dev, net);
+
+ dev->rtnl_link_ops = ops;
+
+ tunnel = netdev_priv(dev);
+ tunnel->parms = *parms;
+ tunnel->net = net;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto failed_free;
+
+ return dev;
+
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static inline void init_tunnel_flow(struct flowi4 *fl4,
+ int proto,
+ __be32 daddr, __be32 saddr,
+ __be32 key, __u8 tos, int oif)
+{
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->flowi4_oif = oif;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->flowi4_tos = tos;
+ fl4->flowi4_proto = proto;
+ fl4->fl4_gre_key = key;
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and needed_headroom */
+ if (iph->daddr) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
+ iph->saddr, tunnel->parms.o_key,
+ RT_TOS(iph->tos), tunnel->parms.link);
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ tunnel_dst_set(tunnel, &rt->dst);
+ ip_rt_put(rt);
+ }
+ if (dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ mtu = tdev->mtu;
+ }
+ dev->iflink = tunnel->parms.link;
+
+ dev->needed_headroom = t_hlen + hlen;
+ mtu -= (dev->hard_header_len + t_hlen);
+
+ if (mtu < 68)
+ mtu = 68;
+
+ return mtu;
+}
+
+static struct ip_tunnel *ip_tunnel_create(struct net *net,
+ struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ struct ip_tunnel *nt;
+ struct net_device *dev;
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+ if (IS_ERR(dev))
+ return ERR_CAST(dev);
+
+ dev->mtu = ip_tunnel_bind_dev(dev);
+
+ nt = netdev_priv(dev);
+ ip_tunnel_add(itn, nt);
+ return nt;
+}
+
+int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi, bool log_ecn_error)
+{
+ struct pcpu_sw_netstats *tstats;
+ const struct iphdr *iph = ip_hdr(skb);
+ int err;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ tunnel->dev->stats.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
+ ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ if (tunnel->parms.i_flags&TUNNEL_SEQ) {
+ if (!(tpi->flags&TUNNEL_SEQ) ||
+ (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
+
+ skb_reset_network_header(skb);
+
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ } else {
+ skb->dev = tunnel->dev;
+ }
+
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+
+static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+ struct rtable *rt, __be16 df)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+ int mtu;
+
+ if (df)
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+ - sizeof(struct iphdr) - tunnel->hlen;
+ else
+ mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (!skb_is_gso(skb) &&
+ (df & htons(IP_DF)) && mtu < pkt_size) {
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ return -E2BIG;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+ if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+ mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr &&
+ !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+ }
+ }
+
+ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+ mtu < pkt_size) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -E2BIG;
+ }
+ }
+#endif
+ return 0;
+}
+
+void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params, const u8 protocol)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *inner_iph;
+ struct flowi4 fl4;
+ u8 tos, ttl;
+ __be16 df;
+ struct rtable *rt; /* Route to the other host */
+ unsigned int max_headroom; /* The extra header space needed */
+ __be32 dst;
+ int err;
+ bool connected;
+
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ connected = (tunnel->parms.iph.daddr != 0);
+
+ dst = tnl_params->daddr;
+ if (dst == 0) {
+ /* NBMA tunnel */
+
+ if (skb_dst(skb) == NULL) {
+ dev->stats.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ rt = skb_rtable(skb);
+ dst = rt_nexthop(rt, inner_iph->daddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ const struct in6_addr *addr6;
+ struct neighbour *neigh;
+ bool do_tx_error_icmp;
+ int addr_type;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (neigh == NULL)
+ goto tx_error;
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ do_tx_error_icmp = true;
+ else {
+ do_tx_error_icmp = false;
+ dst = addr6->s6_addr32[3];
+ }
+ neigh_release(neigh);
+ if (do_tx_error_icmp)
+ goto tx_error_icmp;
+ }
+#endif
+ else
+ goto tx_error;
+
+ connected = false;
+ }
+
+ tos = tnl_params->tos;
+ if (tos & 0x1) {
+ tos &= ~0x1;
+ if (skb->protocol == htons(ETH_P_IP)) {
+ tos = inner_iph->tos;
+ connected = false;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ connected = false;
+ }
+ }
+
+ init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+ tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+
+ rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
+
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (connected)
+ tunnel_dst_set(tunnel, &rt->dst);
+ }
+
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = tnl_params->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+
+ df = tnl_params->frag_off;
+ if (skb->protocol == htons(ETH_P_IP))
+ df |= (inner_iph->frag_off&htons(IP_DF));
+
+ max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ + rt->dst.header_len;
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ return;
+ }
+
+ err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
+ tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+ dst_link_failure(skb);
+#endif
+tx_error:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
+
+static void ip_tunnel_update(struct ip_tunnel_net *itn,
+ struct ip_tunnel *t,
+ struct net_device *dev,
+ struct ip_tunnel_parm *p,
+ bool set_mtu)
+{
+ ip_tunnel_del(t);
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p->iph.saddr, 4);
+ memcpy(dev->broadcast, &p->iph.daddr, 4);
+ }
+ ip_tunnel_add(itn, t);
+
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+
+ if (t->parms.link != p->link) {
+ int mtu;
+
+ t->parms.link = p->link;
+ mtu = ip_tunnel_bind_dev(dev);
+ if (set_mtu)
+ dev->mtu = mtu;
+ }
+ ip_tunnel_dst_reset_all(t);
+ netdev_state_change(dev);
+}
+
+int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == itn->fb_tunnel_dev) {
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (t == NULL)
+ t = netdev_priv(dev);
+ }
+ memcpy(p, &t->parms, sizeof(*p));
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ if (!(p->i_flags & VTI_ISVTI)) {
+ if (!(p->i_flags & TUNNEL_KEY))
+ p->i_key = 0;
+ if (!(p->o_flags & TUNNEL_KEY))
+ p->o_key = 0;
+ }
+
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+
+ if (!t && (cmd == SIOCADDTUNNEL)) {
+ t = ip_tunnel_create(net, itn, p);
+ err = PTR_ERR_OR_ZERO(t);
+ break;
+ }
+ if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t != NULL) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+
+ t = netdev_priv(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ ip_tunnel_update(itn, t, dev, p, true);
+ } else {
+ err = -ENOENT;
+ }
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == itn->fb_tunnel_dev) {
+ err = -ENOENT;
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (t == NULL)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(itn->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (new_mtu < 68 ||
+ new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ gro_cells_destroy(&tunnel->gro_cells);
+ free_percpu(tunnel->dst_cache);
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
+
+ if (itn->fb_tunnel_dev != dev) {
+ ip_tunnel_del(netdev_priv(dev));
+ unregister_netdevice_queue(dev, head);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
+
+int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+ struct ip_tunnel_parm parms;
+ unsigned int i;
+
+ for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&itn->tunnels[i]);
+
+ if (!ops) {
+ itn->fb_tunnel_dev = NULL;
+ return 0;
+ }
+
+ memset(&parms, 0, sizeof(parms));
+ if (devname)
+ strlcpy(parms.name, devname, IFNAMSIZ);
+
+ rtnl_lock();
+ itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ if (!IS_ERR(itn->fb_tunnel_dev)) {
+ itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
+ ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+ }
+ rtnl_unlock();
+
+ return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
+
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+ struct rtnl_link_ops *ops)
+{
+ struct net *net = dev_net(itn->fb_tunnel_dev);
+ struct net_device *dev, *aux;
+ int h;
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ struct hlist_node *n;
+ struct hlist_head *thead = &itn->tunnels[h];
+
+ hlist_for_each_entry_safe(t, n, thead, hash_node)
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, head);
+ }
+}
+
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ ip_tunnel_destroy(itn, &list, ops);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+
+int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_net *itn;
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ itn = net_generic(net, nt->ip_tnl_net_id);
+
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+
+ nt->net = net;
+ nt->parms = *p;
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+
+ ip_tunnel_add(itn, nt);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
+
+int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (dev == itn->fb_tunnel_dev)
+ return -EINVAL;
+
+ t = ip_tunnel_find(itn, p, dev->type);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = tunnel;
+
+ if (dev->type != ARPHRD_ETHER) {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags ^ nflags) &
+ (IFF_POINTOPOINT | IFF_BROADCAST))
+ return -EINVAL;
+ }
+ }
+
+ ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
+
+int ip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ int err;
+
+ dev->destructor = ip_tunnel_dev_free;
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
+ if (!tunnel->dst_cache) {
+ free_percpu(dev->tstats);
+ return -ENOMEM;
+ }
+
+ err = gro_cells_init(&tunnel->gro_cells, dev);
+ if (err) {
+ free_percpu(tunnel->dst_cache);
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+ iph->version = 4;
+ iph->ihl = 5;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init);
+
+void ip_tunnel_uninit(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+ /* fb_tunnel_dev will be unregistered in net-exit call. */
+ if (itn->fb_tunnel_dev != dev)
+ ip_tunnel_del(netdev_priv(dev));
+
+ ip_tunnel_dst_reset_all(tunnel);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
+
+/* Do least required initialization, rest of init is done in tunnel_init call */
+void ip_tunnel_setup(struct net_device *dev, int net_id)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ tunnel->ip_tnl_net_id = net_id;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
new file mode 100644
index 00000000000..f4c987bb7e9
--- /dev/null
+++ b/net/ipv4/ip_tunnel_core.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+/*
+ * iptunnel_xmit - prepend an outer IPv4 header and transmit the packet.
+ * @sk:    socket passed through to ip_local_out_sk()
+ * @rt:    route whose dst is attached to @skb
+ * @skb:   packet to encapsulate
+ * @src:   outer source address
+ * @dst:   outer destination address
+ * @proto: outer IP protocol number
+ * @tos:   outer TOS byte
+ * @ttl:   outer TTL
+ * @df:    value stored in the outer frag_off field
+ * @xnet:  passed to skb_scrub_packet()
+ *
+ * Returns the length @skb had on entry (before the outer header was
+ * pushed), or 0 when net_xmit_eval() reports the send as failed.
+ */
+int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
+		  __be32 src, __be32 dst, __u8 proto,
+		  __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+	struct iphdr *iph;
+	int pkt_len = skb->len;	/* sampled before the skb is handed off */
+	int err;
+
+	skb_scrub_packet(skb, xnet);
+	skb_clear_hash(skb);
+	skb_dst_set(skb, &rt->dst);
+	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+	/* Push down and install the IP header. */
+	skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+
+	iph->version = 4;
+	iph->ihl = sizeof(*iph) >> 2;
+	iph->tos = tos;
+	iph->frag_off = df;
+	iph->ttl = ttl;
+	iph->protocol = proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
+
+	err = ip_local_out_sk(sk, skb);
+
+	return unlikely(net_xmit_eval(err)) ? 0 : pkt_len;
+}
+EXPORT_SYMBOL_GPL(iptunnel_xmit);
+
+/*
+ * iptunnel_pull_header - strip the outer tunnel header on receive.
+ * @skb:         received packet, data pointing at the tunnel header
+ * @hdr_len:     number of bytes of tunnel header to remove
+ * @inner_proto: protocol of the payload; ETH_P_TEB means the payload is
+ *               an Ethernet frame whose own header selects skb->protocol
+ *
+ * After pulling the header the per-packet state inherited from the outer
+ * packet (netfilter, secpath, dst, vlan tag, queue mapping) is reset and
+ * the packet is marked PACKET_HOST.
+ *
+ * Returns 0 on success, -ENOMEM if the required bytes cannot be pulled.
+ */
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+{
+	if (unlikely(!pskb_may_pull(skb, hdr_len)))
+		return -ENOMEM;
+
+	skb_pull_rcsum(skb, hdr_len);
+
+	if (inner_proto == htons(ETH_P_TEB)) {
+		struct ethhdr *eh;
+
+		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+			return -ENOMEM;
+
+		/*
+		 * pskb_may_pull() may reallocate the skb head, so the
+		 * Ethernet header pointer must only be taken afterwards.
+		 */
+		eh = (struct ethhdr *)skb->data;
+
+		if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+			skb->protocol = eh->h_proto;
+		else
+			skb->protocol = htons(ETH_P_802_2);
+
+	} else {
+		skb->protocol = inner_proto;
+	}
+
+	nf_reset(skb);
+	secpath_reset(skb);
+	skb_clear_hash_if_not_l4(skb);
+	skb_dst_drop(skb);
+	skb->vlan_tci = 0;
+	skb_set_queue_mapping(skb, 0);
+	skb->pkt_type = PACKET_HOST;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+
+/*
+ * iptunnel_handle_offloads - prepare offload state before encapsulation.
+ * @skb:           packet about to be encapsulated
+ * @csum_help:     resolve a pending CHECKSUM_PARTIAL in software
+ * @gso_type_mask: GSO type bits OR-ed into the packet when it is GSO
+ *
+ * Returns @skb on success.  On failure the skb is freed and an ERR_PTR()
+ * carrying the error is returned.
+ */
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
+					 bool csum_help,
+					 int gso_type_mask)
+{
+	int err;
+
+	/* First encapsulation layer: remember where the inner headers are. */
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
+	if (skb_is_gso(skb)) {
+		/* shinfo is about to be written, so it must not be shared */
+		err = skb_unclone(skb, GFP_ATOMIC);
+		if (unlikely(err))
+			goto error;
+
+		skb_shinfo(skb)->gso_type |= gso_type_mask;
+		return skb;
+	}
+
+	/* If packet is not gso and we are resolving any partial checksum,
+	 * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
+	 * on the outer header without confusing devices that implement
+	 * NETIF_F_IP_CSUM with encapsulation.
+	 */
+	if (csum_help)
+		skb->encapsulation = 0;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		skb->ip_summed = CHECKSUM_NONE;
+	} else if (csum_help) {
+		err = skb_checksum_help(skb);
+		if (unlikely(err))
+			goto error;
+	}
+
+	return skb;
+
+error:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
+
+/*
+ * ip_tunnel_get_stats64 - ndo_get_stats64 implementation for IP tunnels.
+ * @dev: tunnel device
+ * @tot: statistics structure to fill in
+ *
+ * Often modified stats are per cpu, others are shared (netdev->stats).
+ * The packet/byte counters are summed over all possible CPUs; each CPU's
+ * values are read inside a u64_stats fetch/retry loop.  Error and drop
+ * counters are copied from dev->stats.
+ *
+ * Returns @tot.
+ */
+struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
+						struct rtnl_link_stats64 *tot)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_sw_netstats *tstats =
+						   per_cpu_ptr(dev->tstats, i);
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+		unsigned int start;
+
+		do {
+			start = u64_stats_fetch_begin_irq(&tstats->syncp);
+			rx_packets = tstats->rx_packets;
+			tx_packets = tstats->tx_packets;
+			rx_bytes = tstats->rx_bytes;
+			tx_bytes = tstats->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+
+		tot->rx_packets += rx_packets;
+		tot->tx_packets += tx_packets;
+		tot->rx_bytes   += rx_bytes;
+		tot->tx_bytes   += tx_bytes;
+	}
+
+	tot->multicast = dev->stats.multicast;
+
+	tot->rx_crc_errors = dev->stats.rx_crc_errors;
+	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+	tot->rx_length_errors = dev->stats.rx_length_errors;
+	tot->rx_frame_errors = dev->stats.rx_frame_errors;
+	tot->rx_errors = dev->stats.rx_errors;
+	/* Tunnel users (e.g. vti_rcv_cb) count drops in dev->stats.rx_dropped;
+	 * without this copy those drops would never be reported.
+	 */
+	tot->rx_dropped = dev->stats.rx_dropped;
+
+	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+	tot->tx_dropped = dev->stats.tx_dropped;
+	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+	tot->tx_errors = dev->stats.tx_errors;
+
+	tot->collisions = dev->stats.collisions;
+
+	return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 00000000000..b8960f3527f
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,603 @@
+/*
+ * Linux NET3: IP/IP protocol decoder modified to support
+ * virtual tunnel interface
+ *
+ * Authors:
+ * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/*
+ This version of net/ipv4/ip_vti.c is a clone of net/ipv4/ipip.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+#include <linux/icmpv6.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static struct rtnl_link_ops vti_link_ops __read_mostly;
+
+static int vti_net_id __read_mostly;
+static int vti_tunnel_init(struct net_device *dev);
+
+/*
+ * vti_input - xfrm4 input handler for VTI.
+ *
+ * Looks up the tunnel matching the outer addresses of @skb.  If none is
+ * found, returns -EINVAL without touching the packet.  Otherwise the
+ * packet must pass the inbound policy check (it is freed and 0 is
+ * returned if it does not), the tunnel is recorded in the skb control
+ * block, the skb mark is set from the tunnel's input key, and the packet
+ * is passed to xfrm_input().
+ */
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+		     int encap_type)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	struct ip_tunnel *tunnel;
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->saddr, iph->daddr, 0);
+	if (tunnel == NULL)
+		return -EINVAL;
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+	skb->mark = be32_to_cpu(tunnel->parms.i_key);
+
+	return xfrm_input(skb, nexthdr, spi, encap_type);
+}
+
+/*
+ * vti_rcv - receive handler: records the address family and the offset
+ * of the destination address in the skb control block, then hands the
+ * packet to vti_input() with the outer protocol as next header.
+ */
+static int vti_rcv(struct sk_buff *skb)
+{
+	int nexthdr = ip_hdr(skb)->protocol;
+
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	return vti_input(skb, nexthdr, 0, 0);
+}
+
+/*
+ * vti_rcv_cb - xfrm4 callback invoked with the result of input processing.
+ * @skb: packet; its control block holds the tunnel stored by vti_input()
+ * @err: non-zero if input processing failed
+ *
+ * Returns 1 if the packet does not belong to a VTI tunnel, -EPERM if the
+ * inner packet fails the policy check, and 0 otherwise (including the
+ * error case, which only bumps the rx_errors/rx_dropped counters).
+ */
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+	struct pcpu_sw_netstats *stats;
+	struct net_device *tdev;
+	unsigned short family;
+	bool xnet;
+
+	if (!tunnel)
+		return 1;
+
+	tdev = tunnel->dev;
+
+	if (err) {
+		tdev->stats.rx_errors++;
+		tdev->stats.rx_dropped++;
+
+		return 0;
+	}
+
+	/* The policy check is done for the family of the inner packet. */
+	family = xfrm_input_state(skb)->inner_mode->afinfo->family;
+	if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+		return -EPERM;
+
+	xnet = !net_eq(tunnel->net, dev_net(skb->dev));
+	skb_scrub_packet(skb, xnet);
+	skb->dev = tdev;
+
+	stats = this_cpu_ptr(tdev->tstats);
+
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	return 0;
+}
+
+/*
+ * vti_state_check - does xfrm state @x fit the tunnel endpoints?
+ * @x:   transform found for the flow, may be NULL
+ * @dst: tunnel remote address (0 if the tunnel has none)
+ * @src: tunnel local address
+ *
+ * Returns true only for an IPv4 tunnel-mode state whose addresses match:
+ * with a remote address both endpoints are checked, without one only the
+ * local address is compared against the state's source address.
+ */
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+	xfrm_address_t *saddr = (xfrm_address_t *)&src;
+
+	/* Without a transform this tunnel is not functional. */
+	if (!x)
+		return false;
+
+	/* Only IPv4 tunnel-mode transforms are usable. */
+	if (x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET)
+		return false;
+
+	if (!dst)
+		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+	return xfrm_state_addr_check(x, daddr, saddr, AF_INET);
+}
+
+/*
+ * vti_xmit - transmit a packet through the xfrm bundle of a VTI tunnel.
+ * @skb: packet to send; it is always consumed
+ * @dev: VTI device
+ * @fl:  flow decoded from the packet by the caller
+ *
+ * Resolves the xfrm route for the flow, verifies that the selected state
+ * matches the tunnel endpoints and that the route does not loop back to
+ * @dev, then sends the packet with dst_output().  Failures are accounted
+ * in dev->stats and the packet is dropped.
+ *
+ * Always returns NETDEV_TX_OK.
+ */
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+			    struct flowi *fl)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ip_tunnel_parm *parms = &tunnel->parms;
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *tdev;	/* Device to other host */
+	int pkt_len;
+	int err;
+
+	if (!dst) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+
+	dst_hold(dst);
+	dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+	if (IS_ERR(dst)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+
+	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
+		dev->stats.tx_carrier_errors++;
+		dst_release(dst);
+		goto tx_error_icmp;
+	}
+
+	tdev = dst->dev;
+
+	/* Routing back into ourselves would loop forever. */
+	if (tdev == dev) {
+		dst_release(dst);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	if (tunnel->err_count > 0) {
+		if (time_before(jiffies,
+				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+			tunnel->err_count--;
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+	skb_dst_set(skb, dst);
+	skb->dev = skb_dst(skb)->dev;
+
+	/* dst_output() consumes the skb, so the length used for the
+	 * statistics has to be sampled before the call.
+	 */
+	pkt_len = skb->len;
+	err = dst_output(skb);
+	if (net_xmit_eval(err) == 0)
+		err = pkt_len;
+	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+	return NETDEV_TX_OK;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev->stats.tx_errors++;
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ *
+ * The skb mark is set from the tunnel's output key, the flow is decoded
+ * for IPv4 or IPv6 payloads and the packet is handed to vti_xmit().
+ * Any other protocol is counted as a tx error and dropped.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+
+	skb->mark = be32_to_cpu(tunnel->parms.o_key);
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		xfrm_decode_session(skb, &fl, AF_INET);
+		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		xfrm_decode_session(skb, &fl, AF_INET6);
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+	} else {
+		dev->stats.tx_errors++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return vti_xmit(skb, dev, &fl);
+}
+
+/*
+ * vti4_err - ICMP error handler for ESP/AH/IPcomp packets sent by VTI.
+ * @skb:  ICMP error; skb->data points at the embedded IPv4 header
+ * @info: passed on to ipv4_update_pmtu()
+ *
+ * Returns -1 if the embedded header does not match a VTI tunnel (the
+ * lookup uses the embedded destination as remote and the embedded source
+ * as local address).  Otherwise returns 0; only "fragmentation needed"
+ * and redirect messages for a known xfrm state have any effect.
+ */
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	/* start of the IPsec header that follows the embedded IP header */
+	const void *xhdr = skb->data + (iph->ihl << 2);
+	int protocol = iph->protocol;
+	struct ip_tunnel *tunnel;
+	struct xfrm_state *x;
+	bool is_unreach;
+	__be32 spi;
+	__u32 mark;
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->daddr, iph->saddr, 0);
+	if (!tunnel)
+		return -1;
+
+	mark = be32_to_cpu(tunnel->parms.o_key);
+
+	switch (protocol) {
+	case IPPROTO_ESP:
+		spi = ((const struct ip_esp_hdr *)xhdr)->spi;
+		break;
+	case IPPROTO_AH:
+		spi = ((const struct ip_auth_hdr *)xhdr)->spi;
+		break;
+	case IPPROTO_COMP:
+		/* the 16-bit CPI is widened to a 32-bit SPI */
+		spi = htonl(ntohs(((const struct ip_comp_hdr *)xhdr)->cpi));
+		break;
+	default:
+		return 0;
+	}
+
+	/* Only FRAG_NEEDED unreachables and redirects are of interest. */
+	is_unreach = icmp_hdr(skb)->type == ICMP_DEST_UNREACH;
+	if (is_unreach) {
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+	} else if (icmp_hdr(skb)->type != ICMP_REDIRECT) {
+		return 0;
+	}
+
+	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, protocol, AF_INET);
+	if (!x)
+		return 0;
+
+	if (is_unreach)
+		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, protocol, 0);
+	xfrm_state_put(x);
+
+	return 0;
+}
+
+/*
+ * vti_tunnel_ioctl - tunnel ioctl entry point for VTI devices.
+ *
+ * Copies the ip_tunnel_parm from user space, validates the template
+ * header for add/change requests, clears keys that are not flagged with
+ * GRE_KEY, forces i_flags to VTI_ISVTI and calls ip_tunnel_ioctl().  On
+ * success the parameters are copied back to user space; except for
+ * SIOCDELTUNNEL, GRE_KEY is set in both flag fields before copying.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for an
+ * invalid template header, or the error from ip_tunnel_ioctl().
+ */
+static int
+vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct ip_tunnel_parm p;
+	int err;
+
+	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		if (p.iph.version != 4 || p.iph.ihl != 5 ||
+		    p.iph.protocol != IPPROTO_IPIP)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+
+	if (!(p.i_flags & GRE_KEY))
+		p.i_key = 0;
+	if (!(p.o_flags & GRE_KEY))
+		p.o_key = 0;
+
+	p.i_flags = VTI_ISVTI;
+
+	err = ip_tunnel_ioctl(dev, &p, cmd);
+	if (err)
+		return err;
+
+	if (cmd != SIOCDELTUNNEL) {
+		p.i_flags |= GRE_KEY;
+		p.o_flags |= GRE_KEY;
+	}
+
+	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static const struct net_device_ops vti_netdev_ops = { /* installed on each device by vti_tunnel_setup() */
+ .ndo_init = vti_tunnel_init, /* VTI defaults, then the common ip_tunnel_init() */
+ .ndo_uninit = ip_tunnel_uninit, /* shared ip_tunnel teardown */
+ .ndo_start_xmit = vti_tunnel_xmit, /* decodes the flow, then calls vti_xmit() */
+ .ndo_do_ioctl = vti_tunnel_ioctl, /* wraps ip_tunnel_ioctl() with VTI flag handling */
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64, /* per-cpu packet/byte counters plus dev->stats */
+};
+
+/*
+ * rtnl_link_ops setup callback: record the VTI pernet id in the tunnel
+ * and install the VTI netdev operations and link type.
+ */
+static void vti_tunnel_setup(struct net_device *dev)
+{
+	ip_tunnel_setup(dev, vti_net_id);
+
+	dev->type = ARPHRD_TUNNEL;
+	dev->netdev_ops = &vti_netdev_ops;
+}
+
+/*
+ * ndo_init for VTI devices: derive the device and broadcast addresses
+ * from the tunnel endpoints, apply the VTI defaults and finish with the
+ * common ip_tunnel_init(), whose result is returned.
+ */
+static int vti_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct iphdr *tiph = &tunnel->parms.iph;
+
+	/* The "hardware" addresses are the 4-byte IPv4 endpoints. */
+	dev->addr_len = 4;
+	memcpy(dev->dev_addr, &tiph->saddr, 4);
+	memcpy(dev->broadcast, &tiph->daddr, 4);
+
+	dev->flags = IFF_NOARP;
+	dev->iflink = 0;
+	dev->mtu = ETH_DATA_LEN;
+	dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
+	dev->features |= NETIF_F_LLTX;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+
+	return ip_tunnel_init(dev);
+}
+
+/*
+ * Fill in the template IPv4 header of the per-netns fallback device:
+ * version 4, header length 5 words, protocol IPIP.
+ */
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *fb = netdev_priv(dev);
+
+	fb->parms.iph.version = 4;
+	fb->parms.iph.ihl = 5;
+	fb->parms.iph.protocol = IPPROTO_IPIP;
+}
+
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { /* registered for IPPROTO_ESP in vti_init() */
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100, /* NOTE(review): presumably ordered against other handlers for this protocol - confirm */
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { /* registered for IPPROTO_AH in vti_init() */
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100, /* NOTE(review): presumably ordered against other handlers for this protocol - confirm */
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { /* registered for IPPROTO_COMP in vti_init() */
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100, /* NOTE(review): presumably ordered against other handlers for this protocol - confirm */
+};
+
+/*
+ * Per-netns init: create the tunnel state and the "ip_vti0" fallback
+ * device through ip_tunnel_init_net(), then fill in the fallback
+ * device's template header.  Returns 0 or the ip_tunnel_init_net() error.
+ */
+static int __net_init vti_init_net(struct net *net)
+{
+	struct ip_tunnel_net *itn;
+	int err;
+
+	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
+	if (err)
+		return err;
+
+	itn = net_generic(net, vti_net_id);
+	vti_fb_tunnel_init(itn->fb_tunnel_dev);
+
+	return 0;
+}
+
+/* Per-netns exit: hand this namespace's VTI state to ip_tunnel_delete_net(). */
+static void __net_exit vti_exit_net(struct net *net)
+{
+	ip_tunnel_delete_net(net_generic(net, vti_net_id), &vti_link_ops);
+}
+
+static struct pernet_operations vti_net_ops = { /* registered with register_pernet_device() in vti_init() */
+ .init = vti_init_net,
+ .exit = vti_exit_net,
+ .id = &vti_net_id, /* id used with net_generic() throughout this file */
+ .size = sizeof(struct ip_tunnel_net), /* size of the per-netns ip_tunnel_net state */
+};
+
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ return 0; /* no VTI-specific attribute checks: every request is accepted here */
+}
+
+/*
+ * vti_netlink_parms - translate IFLA_VTI_* attributes into tunnel parms.
+ * @data:  parsed link attributes, may be NULL
+ * @parms: output; zeroed first, protocol is always IPPROTO_IPIP
+ *
+ * With no attribute array only the protocol is set.  Otherwise i_flags
+ * is set to VTI_ISVTI and every attribute that is present overrides the
+ * corresponding zeroed field.
+ */
+static void vti_netlink_parms(struct nlattr *data[],
+			      struct ip_tunnel_parm *parms)
+{
+	const struct nlattr *attr;
+
+	memset(parms, 0, sizeof(*parms));
+	parms->iph.protocol = IPPROTO_IPIP;
+
+	if (!data)
+		return;
+
+	parms->i_flags = VTI_ISVTI;
+
+	attr = data[IFLA_VTI_LINK];
+	if (attr)
+		parms->link = nla_get_u32(attr);
+
+	attr = data[IFLA_VTI_IKEY];
+	if (attr)
+		parms->i_key = nla_get_be32(attr);
+
+	attr = data[IFLA_VTI_OKEY];
+	if (attr)
+		parms->o_key = nla_get_be32(attr);
+
+	attr = data[IFLA_VTI_LOCAL];
+	if (attr)
+		parms->iph.saddr = nla_get_be32(attr);
+
+	attr = data[IFLA_VTI_REMOTE];
+	if (attr)
+		parms->iph.daddr = nla_get_be32(attr);
+}
+
+/*
+ * rtnl newlink callback: build the tunnel parameters from the netlink
+ * attributes and create the tunnel through ip_tunnel_newlink().
+ */
+static int vti_newlink(struct net *src_net, struct net_device *dev,
+		       struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ip_tunnel_parm p;
+
+	vti_netlink_parms(data, &p);
+
+	return ip_tunnel_newlink(dev, tb, &p);
+}
+
+/*
+ * rtnl changelink callback: build the tunnel parameters from the netlink
+ * attributes and apply them through ip_tunnel_changelink().
+ */
+static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
+			  struct nlattr *data[])
+{
+	struct ip_tunnel_parm parms;
+
+	vti_netlink_parms(data, &parms);
+
+	return ip_tunnel_changelink(dev, tb, &parms);
+}
+
+/*
+ * Netlink space needed by vti_fill_info(): five 4-byte attributes
+ * (IFLA_VTI_LINK, _IKEY, _OKEY, _LOCAL and _REMOTE).
+ */
+static size_t vti_get_size(const struct net_device *dev)
+{
+	return 5 * nla_total_size(4);
+}
+
+/*
+ * vti_fill_info - dump the tunnel parameters as IFLA_VTI_* attributes.
+ * @skb: netlink message being built
+ * @dev: VTI device to describe
+ *
+ * Returns 0 on success or -EMSGSIZE if any attribute does not fit into
+ * @skb, so that a truncated dump is reported instead of passing silently.
+ */
+static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm *p = &t->parms;
+
+	if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
+	    nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
+	    nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) ||
+	    nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr) ||
+	    nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { /* attribute policy referenced by vti_link_ops.policy */
+ [IFLA_VTI_LINK] = { .type = NLA_U32 }, /* read with nla_get_u32() into parms->link */
+ [IFLA_VTI_IKEY] = { .type = NLA_U32 }, /* read with nla_get_be32() into parms->i_key */
+ [IFLA_VTI_OKEY] = { .type = NLA_U32 }, /* read with nla_get_be32() into parms->o_key */
+ [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, /* IPv4 local address */
+ [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, /* IPv4 remote address */
+};
+
+static struct rtnl_link_ops vti_link_ops __read_mostly = { /* registered with rtnl_link_register() in vti_init() */
+ .kind = "vti", /* matches MODULE_ALIAS_RTNL_LINK("vti") at the end of the file */
+ .maxtype = IFLA_VTI_MAX,
+ .policy = vti_policy,
+ .priv_size = sizeof(struct ip_tunnel), /* netdev_priv() of these devices is a struct ip_tunnel */
+ .setup = vti_tunnel_setup,
+ .validate = vti_tunnel_validate, /* currently accepts everything */
+ .newlink = vti_newlink,
+ .changelink = vti_changelink,
+ .get_size = vti_get_size, /* space needed by vti_fill_info() */
+ .fill_info = vti_fill_info,
+};
+
+/*
+ * Module init: register the pernet operations, the ESP, AH and IPcomp
+ * xfrm4 handlers and finally the rtnl link ops.  If a step fails, all
+ * earlier registrations are undone in reverse order and the error is
+ * returned.  A failed handler registration is additionally logged.
+ */
+static int __init vti_init(void)
+{
+	bool report = true;	/* only the rtnl_link failure is silent */
+	int err;
+
+	pr_info("IPv4 over IPSec tunneling driver\n");
+
+	err = register_pernet_device(&vti_net_ops);
+	if (err < 0)
+		return err;
+
+	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+	if (err < 0)
+		goto undo_pernet;
+
+	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+	if (err < 0)
+		goto undo_esp;
+
+	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	if (err < 0)
+		goto undo_ah;
+
+	err = rtnl_link_register(&vti_link_ops);
+	if (err >= 0)
+		return err;
+
+	report = false;
+	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+undo_ah:
+	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+undo_esp:
+	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+undo_pernet:
+	unregister_pernet_device(&vti_net_ops);
+	if (report)
+		pr_info("vti init: can't register tunnel\n");
+
+	return err;
+}
+
+/*
+ * Module exit: undo vti_init() in reverse order.  A handler that cannot
+ * be deregistered is only logged; teardown continues regardless.
+ */
+static void __exit vti_fini(void)
+{
+	int err;
+
+	rtnl_link_unregister(&vti_link_ops);
+
+	err = xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	if (err)
+		pr_info("vti close: can't deregister tunnel\n");
+
+	err = xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+	if (err)
+		pr_info("vti close: can't deregister tunnel\n");
+
+	err = xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+	if (err)
+		pr_info("vti close: can't deregister tunnel\n");
+
+	unregister_pernet_device(&vti_net_ops);
+}
+
+module_init(vti_init);
+module_exit(vti_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti");
+MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 629067571f0..c0855d50a3f 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -23,26 +23,37 @@
#include <net/protocol.h>
#include <net/sock.h>
-static void ipcomp4_err(struct sk_buff *skb, u32 info)
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
__be32 spi;
- struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
spi = htonl(ntohs(ipch->cpi));
- x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr,
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
spi, IPPROTO_COMP, AF_INET);
if (!x)
- return;
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
- spi, &iph->daddr);
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
xfrm_state_put(x);
+
+ return 0;
}
/* We always hold one tunnel user reference to indicate a tunnel */
@@ -63,6 +74,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t->props.mode = x->props.mode;
t->props.saddr.a4 = x->props.saddr.a4;
t->props.flags = x->props.flags;
+ t->props.extra_flags = x->props.extra_flags;
memcpy(&t->mark, &x->mark, sizeof(t->mark));
if (xfrm_init_state(t))
@@ -137,6 +149,11 @@ out:
return err;
}
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
static const struct xfrm_type ipcomp_type = {
.description = "IPCOMP4",
.owner = THIS_MODULE,
@@ -147,20 +164,22 @@ static const struct xfrm_type ipcomp_type = {
.output = ipcomp_output
};
-static const struct net_protocol ipcomp4_protocol = {
+static struct xfrm4_protocol ipcomp4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ipcomp4_rcv_cb,
.err_handler = ipcomp4_err,
- .no_policy = 1,
+ .priority = 0,
};
static int __init ipcomp4_init(void)
{
if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
- printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
- printk(KERN_INFO "ipcomp init: can't add protocol\n");
+ if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ipcomp_type, AF_INET);
return -EAGAIN;
}
@@ -169,10 +188,10 @@ static int __init ipcomp4_init(void)
static void __exit ipcomp4_fini(void)
{
- if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
- printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
+ if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
- printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(ipcomp4_init);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 2b097752426..b3e86ea7b71 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -54,6 +54,7 @@
#include <linux/delay.h>
#include <linux/nfs_fs.h>
#include <linux/slab.h>
+#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
@@ -87,8 +88,8 @@
#endif
/* Define the friendly delay before and after opening net devices */
-#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */
-#define CONF_POST_OPEN 1 /* After opening: 1 second */
+#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
+#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
@@ -135,12 +136,14 @@ __be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
__be32 ic_gateway = NONE; /* Gateway IP address */
+__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+
__be32 ic_servaddr = NONE; /* Boot server IP address */
__be32 root_server_addr = NONE; /* Address of NFS server */
u8 root_server_path[256] = { 0, }; /* Path to mount as root */
-u32 ic_dev_xid; /* Device under configuration */
+__be32 ic_dev_xid; /* Device under configuration */
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
@@ -188,14 +191,14 @@ struct ic_device {
static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
static struct net_device *ic_dev __initdata = NULL; /* Selected device */
-static bool __init ic_device_match(struct net_device *dev)
+static bool __init ic_is_init_dev(struct net_device *dev)
{
- if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ if (dev->flags & IFF_LOOPBACK)
+ return false;
+ return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
(!(dev->flags & IFF_LOOPBACK) &&
(dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
- strncmp(dev->name, "dummy", 5)))
- return true;
- return false;
+ strncmp(dev->name, "dummy", 5));
}
static int __init ic_open_devs(void)
@@ -203,6 +206,7 @@ static int __init ic_open_devs(void)
struct ic_device *d, **last;
struct net_device *dev;
unsigned short oflags;
+ unsigned long start, next_msg;
last = &ic_first_dev;
rtnl_lock();
@@ -212,18 +216,17 @@ static int __init ic_open_devs(void)
if (!(dev->flags & IFF_LOOPBACK))
continue;
if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
- printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ pr_err("IP-Config: Failed to open %s\n", dev->name);
}
for_each_netdev(&init_net, dev) {
- if (dev->flags & IFF_LOOPBACK)
- continue;
- if (ic_device_match(dev)) {
+ if (ic_is_init_dev(dev)) {
int able = 0;
if (dev->mtu >= 364)
able |= IC_BOOTP;
else
- printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
+ pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small",
+ dev->name, dev->mtu);
if (!(dev->flags & IFF_NOARP))
able |= IC_RARP;
able &= ic_proto_enabled;
@@ -231,7 +234,8 @@ static int __init ic_open_devs(void)
continue;
oflags = dev->flags;
if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
- printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ pr_err("IP-Config: Failed to open %s\n",
+ dev->name);
continue;
}
if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
@@ -252,15 +256,42 @@ static int __init ic_open_devs(void)
dev->name, able, d->xid));
}
}
+
+ /* no point in waiting if we could not bring up at least one device */
+ if (!ic_first_dev)
+ goto have_carrier;
+
+ /* wait for a carrier on at least one device */
+ start = jiffies;
+ next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
+ int wait, elapsed;
+
+ for_each_netdev(&init_net, dev)
+ if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
+ goto have_carrier;
+
+ msleep(1);
+
+ if (time_before(jiffies, next_msg))
+ continue;
+
+ elapsed = jiffies_to_msecs(jiffies - start);
+ wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000;
+ pr_info("Waiting up to %d more seconds for network.\n", wait);
+ next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ }
+have_carrier:
rtnl_unlock();
*last = NULL;
if (!ic_first_dev) {
if (user_dev_name[0])
- printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+ pr_err("IP-Config: Device `%s' not found\n",
+ user_dev_name);
else
- printk(KERN_ERR "IP-Config: No network devices available.\n");
+ pr_err("IP-Config: No network devices available\n");
return -ENODEV;
}
return 0;
@@ -344,17 +375,20 @@ static int __init ic_setup_if(void)
strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
set_sockaddr(sin, ic_myaddr, 0);
if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
+ pr_err("IP-Config: Unable to set interface address (%d)\n",
+ err);
return -1;
}
set_sockaddr(sin, ic_netmask, 0);
if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
+ pr_err("IP-Config: Unable to set interface netmask (%d)\n",
+ err);
return -1;
}
set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
- printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
+ pr_err("IP-Config: Unable to set interface broadcast address (%d)\n",
+ err);
return -1;
}
/* Handle the case where we need non-standard MTU on the boot link (a network
@@ -365,8 +399,8 @@ static int __init ic_setup_if(void)
strcpy(ir.ifr_name, ic_dev->name);
ir.ifr_mtu = ic_dev_mtu;
if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
- printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n",
- ic_dev_mtu, err);
+ pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
+ ic_dev_mtu, err);
}
return 0;
}
@@ -381,7 +415,7 @@ static int __init ic_setup_routes(void)
memset(&rm, 0, sizeof(rm));
if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
- printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+ pr_err("IP-Config: Gateway not on directly connected network\n");
return -1;
}
set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
@@ -389,7 +423,8 @@ static int __init ic_setup_routes(void)
set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
rm.rt_flags = RTF_UP | RTF_GATEWAY;
if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
- printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
+ pr_err("IP-Config: Cannot add default route (%d)\n",
+ err);
return -1;
}
}
@@ -422,8 +457,8 @@ static int __init ic_defaults(void)
else if (IN_CLASSC(ntohl(ic_myaddr)))
ic_netmask = htonl(IN_CLASSC_NET);
else {
- printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n",
- &ic_myaddr);
+ pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
+ &ic_myaddr);
return -1;
}
printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
@@ -536,6 +571,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (ic_myaddr == NONE)
ic_myaddr = tip;
ic_servaddr = sip;
+ ic_addrservaddr = sip;
ic_got_reply = IC_RARP;
drop_unlock:
@@ -561,6 +597,17 @@ static void __init ic_rarp_send_if(struct ic_device *d)
#endif
/*
+ * Predefine Nameservers
+ */
+static inline void __init ic_nameservers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ ic_nameservers[i] = NONE;
+}
+
+/*
* DHCP/BOOTP support.
*/
@@ -673,8 +720,8 @@ ic_dhcp_init_options(u8 *options)
e += len;
}
if (*vendor_class_identifier) {
- printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
- vendor_class_identifier);
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
*e++ = 60; /* Class-identifier */
len = strlen(vendor_class_identifier);
*e++ = len;
@@ -725,10 +772,7 @@ static void __init ic_bootp_init_ext(u8 *e)
*/
static inline void __init ic_bootp_init(void)
{
- int i;
-
- for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
- ic_nameservers[i] = NONE;
+ ic_nameservers_predef();
dev_add_pack(&bootp_packet_type);
}
@@ -752,13 +796,15 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
struct sk_buff *skb;
struct bootp_pkt *b;
struct iphdr *h;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
/* Allocate packet */
- skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15,
+ skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15,
GFP_KERNEL);
if (!skb)
return;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
memset(b, 0, sizeof(struct bootp_pkt));
@@ -784,8 +830,6 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
b->op = BOOTP_REQUEST;
if (dev->type < 256) /* check for false types */
b->htype = dev->type;
- else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
- b->htype = ARPHRD_IEEE802;
else if (dev->type == ARPHRD_FDDI)
b->htype = ARPHRD_ETHER;
else {
@@ -811,8 +855,13 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
if (dev_hard_header(skb, dev, ntohs(skb->protocol),
- dev->broadcast, dev->dev_addr, skb->len) < 0 ||
- dev_queue_xmit(skb) < 0)
+ dev->broadcast, dev->dev_addr, skb->len) < 0) {
+ kfree_skb(skb);
+ printk("E");
+ return;
+ }
+
+ if (dev_queue_xmit(skb) < 0)
printk("E");
}
@@ -837,9 +886,9 @@ static int __init ic_bootp_string(char *dest, char *src, int len, int max)
*/
static void __init ic_do_bootp_ext(u8 *ext)
{
- u8 servers;
- int i;
- u16 mtu;
+ u8 servers;
+ int i;
+ __be16 mtu;
#ifdef IPCONFIG_DEBUG
u8 *c;
@@ -851,41 +900,44 @@ static void __init ic_do_bootp_ext(u8 *ext)
#endif
switch (*ext++) {
- case 1: /* Subnet mask */
- if (ic_netmask == NONE)
- memcpy(&ic_netmask, ext+1, 4);
- break;
- case 3: /* Default gateway */
- if (ic_gateway == NONE)
- memcpy(&ic_gateway, ext+1, 4);
- break;
- case 6: /* DNS server */
- servers= *ext/4;
- if (servers > CONF_NAMESERVERS_MAX)
- servers = CONF_NAMESERVERS_MAX;
- for (i = 0; i < servers; i++) {
- if (ic_nameservers[i] == NONE)
- memcpy(&ic_nameservers[i], ext+1+4*i, 4);
- }
- break;
- case 12: /* Host name */
- ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN);
- ic_host_name_set = 1;
- break;
- case 15: /* Domain name (DNS) */
- ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
- break;
- case 17: /* Root path */
- if (!root_server_path[0])
- ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
- break;
- case 26: /* Interface MTU */
- memcpy(&mtu, ext+1, sizeof(mtu));
- ic_dev_mtu = ntohs(mtu);
- break;
- case 40: /* NIS Domain name (_not_ DNS) */
- ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN);
- break;
+ case 1: /* Subnet mask */
+ if (ic_netmask == NONE)
+ memcpy(&ic_netmask, ext+1, 4);
+ break;
+ case 3: /* Default gateway */
+ if (ic_gateway == NONE)
+ memcpy(&ic_gateway, ext+1, 4);
+ break;
+ case 6: /* DNS server */
+ servers= *ext/4;
+ if (servers > CONF_NAMESERVERS_MAX)
+ servers = CONF_NAMESERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_nameservers[i] == NONE)
+ memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+ }
+ break;
+ case 12: /* Host name */
+ ic_bootp_string(utsname()->nodename, ext+1, *ext,
+ __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ break;
+ case 15: /* Domain name (DNS) */
+ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+ break;
+ case 17: /* Root path */
+ if (!root_server_path[0])
+ ic_bootp_string(root_server_path, ext+1, *ext,
+ sizeof(root_server_path));
+ break;
+ case 26: /* Interface MTU */
+ memcpy(&mtu, ext+1, sizeof(mtu));
+ ic_dev_mtu = ntohs(mtu);
+ break;
+ case 40: /* NIS Domain name (_not_ DNS) */
+ ic_bootp_string(utsname()->domainname, ext+1, *ext,
+ __NEW_UTS_LEN);
+ break;
}
}
@@ -922,10 +974,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
goto drop;
/* Fragments are not supported */
- if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
- "reply.\n");
+ if (ip_is_fragment(h)) {
+ net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
goto drop;
}
@@ -973,17 +1023,14 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Is it a reply to our BOOTP request? */
if (b->op != BOOTP_REPLY ||
b->xid != d->xid) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
- "op[%x] xid[%x]\n",
- b->op, b->xid);
+ net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
+ b->op, b->xid);
goto drop_unlock;
}
/* Is it a reply for the device we are configuring? */
if (b->xid != ic_dev_xid) {
- if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
+ net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
goto drop_unlock;
}
@@ -1035,7 +1082,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
ic_servaddr = server_id;
#ifdef IPCONFIG_DEBUG
printk("DHCP: Offered address %pI4 by server %pI4\n",
- &ic_myaddr, &ic_servaddr);
+ &ic_myaddr, &b->iph.saddr);
#endif
/* The DHCP indicated server address takes
* precedence over the bootp header one if
@@ -1080,6 +1127,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
ic_dev = dev;
ic_myaddr = b->your_ip;
ic_servaddr = b->server_ip;
+ ic_addrservaddr = b->iph.saddr;
if (ic_gateway == NONE && b->relay_ip)
ic_gateway = b->relay_ip;
if (ic_nameservers[0] == NONE)
@@ -1121,17 +1169,17 @@ static int __init ic_dynamic(void)
* are missing, and without DHCP/BOOTP/RARP we are unable to get it.
*/
if (!ic_proto_enabled) {
- printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ pr_err("IP-Config: Incomplete network configuration information\n");
return -1;
}
#ifdef IPCONFIG_BOOTP
if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
- printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
+ pr_err("DHCP/BOOTP: No suitable device found\n");
#endif
#ifdef IPCONFIG_RARP
if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
- printk(KERN_ERR "RARP: No suitable device found.\n");
+ pr_err("RARP: No suitable device found\n");
#endif
if (!ic_proto_have_if)
@@ -1158,17 +1206,17 @@ static int __init ic_dynamic(void)
* [Actually we could now, but the nothing else running note still
* applies.. - AC]
*/
- printk(KERN_NOTICE "Sending %s%s%s requests .",
- do_bootp
- ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
- (do_bootp && do_rarp) ? " and " : "",
- do_rarp ? "RARP" : "");
+ pr_notice("Sending %s%s%s requests .",
+ do_bootp
+ ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+ (do_bootp && do_rarp) ? " and " : "",
+ do_rarp ? "RARP" : "");
start_jiffies = jiffies;
d = ic_first_dev;
retries = CONF_SEND_RETRIES;
get_random_bytes(&timeout, sizeof(timeout));
- timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
for (;;) {
/* Track the device we are configuring */
ic_dev_xid = d->xid;
@@ -1191,13 +1239,13 @@ static int __init ic_dynamic(void)
(ic_proto_enabled & IC_USE_DHCP) &&
ic_dhcp_msgtype != DHCPACK) {
ic_got_reply = 0;
- printk(KERN_CONT ",");
+ pr_cont(",");
continue;
}
#endif /* IPCONFIG_DHCP */
if (ic_got_reply) {
- printk(KERN_CONT " OK\n");
+ pr_cont(" OK\n");
break;
}
@@ -1205,7 +1253,7 @@ static int __init ic_dynamic(void)
continue;
if (! --retries) {
- printk(KERN_CONT " timed out!\n");
+ pr_cont(" timed out!\n");
break;
}
@@ -1215,7 +1263,7 @@ static int __init ic_dynamic(void)
if (timeout > CONF_TIMEOUT_MAX)
timeout = CONF_TIMEOUT_MAX;
- printk(KERN_CONT ".");
+ pr_cont(".");
}
#ifdef IPCONFIG_BOOTP
@@ -1235,8 +1283,8 @@ static int __init ic_dynamic(void)
printk("IP-Config: Got %s answer from %pI4, ",
((ic_got_reply & IC_RARP) ? "RARP"
: (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
- &ic_servaddr);
- printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
+ &ic_addrservaddr);
+ pr_cont("my address is %pI4\n", &ic_myaddr);
return 0;
}
@@ -1324,14 +1372,13 @@ static int __init wait_for_devices(void)
{
int i;
- msleep(CONF_PRE_OPEN);
for (i = 0; i < DEVICE_WAIT_MAX; i++) {
struct net_device *dev;
int found = 0;
rtnl_lock();
for_each_netdev(&init_net, dev) {
- if (ic_device_match(dev)) {
+ if (ic_is_init_dev(dev)) {
found = 1;
break;
}
@@ -1355,9 +1402,10 @@ static int __init ip_auto_config(void)
int retries = CONF_OPEN_RETRIES;
#endif
int err;
+ unsigned int i;
#ifdef CONFIG_PROC_FS
- proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
+ proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1378,7 +1426,7 @@ static int __init ip_auto_config(void)
return err;
/* Give drivers a chance to settle */
- ssleep(CONF_POST_OPEN);
+ msleep(CONF_POST_OPEN);
/*
* If the config information is insufficient (e.g., our IP address or
@@ -1413,24 +1461,22 @@ static int __init ip_auto_config(void)
*/
#ifdef CONFIG_ROOT_NFS
if (ROOT_DEV == Root_NFS) {
- printk(KERN_ERR
- "IP-Config: Retrying forever (NFS root)...\n");
+ pr_err("IP-Config: Retrying forever (NFS root)...\n");
goto try_try_again;
}
#endif
if (--retries) {
- printk(KERN_ERR
- "IP-Config: Reopening network devices...\n");
+ pr_err("IP-Config: Reopening network devices...\n");
goto try_try_again;
}
/* Oh, well. At least we tried. */
- printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
+ pr_err("IP-Config: Auto-configuration of network failed\n");
return -1;
}
#else /* !DYNAMIC */
- printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+ pr_err("IP-Config: Incomplete network configuration information\n");
ic_close_devs();
return -1;
#endif /* IPCONFIG_DYNAMIC */
@@ -1444,7 +1490,7 @@ static int __init ip_auto_config(void)
root_server_addr = addr;
/*
- * Use defaults whereever applicable.
+ * Use defaults wherever applicable.
*/
if (ic_defaults() < 0)
return -1;
@@ -1468,19 +1514,27 @@ static int __init ip_auto_config(void)
/*
* Clue in the operator.
*/
- printk("IP-Config: Complete:\n");
- printk(" device=%s", ic_dev->name);
- printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
- printk(KERN_CONT ", mask=%pI4", &ic_netmask);
- printk(KERN_CONT ", gw=%pI4", &ic_gateway);
- printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
- utsname()->nodename, ic_domain, utsname()->domainname);
- printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
- printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
- printk(KERN_CONT ", rootpath=%s", root_server_path);
+ pr_info("IP-Config: Complete:\n");
+
+ pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
+ ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr,
+ &ic_myaddr, &ic_netmask, &ic_gateway);
+ pr_info(" host=%s, domain=%s, nis-domain=%s\n",
+ utsname()->nodename, ic_domain, utsname()->domainname);
+ pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s",
+ &ic_servaddr, &root_server_addr, root_server_path);
if (ic_dev_mtu)
- printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
- printk(KERN_CONT "\n");
+ pr_cont(", mtu=%d", ic_dev_mtu);
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ if (ic_nameservers[i] != NONE) {
+ pr_info(" nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+ break;
+ }
+ for (i++; i < CONF_NAMESERVERS_MAX; i++)
+ if (ic_nameservers[i] != NONE)
+ pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
+ pr_cont("\n");
#endif /* !SILENT */
return 0;
@@ -1551,6 +1605,8 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+ ic_nameservers_predef();
+
/* Parse string for static IP assignment. */
ip = addrs;
while (ip && *ip) {
@@ -1594,6 +1650,20 @@ static int __init ip_auto_config_setup(char *addrs)
ic_enable = 0;
}
break;
+ case 7:
+ if (CONF_NAMESERVERS_MAX >= 1) {
+ ic_nameservers[0] = in_aton(ip);
+ if (ic_nameservers[0] == ANY)
+ ic_nameservers[0] = NONE;
+ }
+ break;
+ case 8:
+ if (CONF_NAMESERVERS_MAX >= 2) {
+ ic_nameservers[1] = in_aton(ip);
+ if (ic_nameservers[1] == ANY)
+ ic_nameservers[1] = NONE;
+ }
+ break;
}
}
ip = cp;
@@ -1602,22 +1672,21 @@ static int __init ip_auto_config_setup(char *addrs)
return 1;
}
+__setup("ip=", ip_auto_config_setup);
static int __init nfsaddrs_config_setup(char *addrs)
{
return ip_auto_config_setup(addrs);
}
+__setup("nfsaddrs=", nfsaddrs_config_setup);
static int __init vendor_class_identifier_setup(char *addrs)
{
if (strlcpy(vendor_class_identifier, addrs,
sizeof(vendor_class_identifier))
>= sizeof(vendor_class_identifier))
- printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"",
- vendor_class_identifier);
+ pr_warn("DHCP: vendorclass too long, truncated to \"%s\"",
+ vendor_class_identifier);
return 1;
}
-
-__setup("ip=", ip_auto_config_setup);
-__setup("nfsaddrs=", nfsaddrs_config_setup);
__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 988f52fba54..62eaa005e14 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -111,206 +111,20 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
-#include <net/ipip.h>
+#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#define HASH_SIZE 16
-#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static int ipip_net_id __read_mostly;
-struct ipip_net {
- struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
- struct ip_tunnel __rcu *tunnels_wc[1];
- struct ip_tunnel __rcu **tunnels[4];
-
- struct net_device *fb_tunnel_dev;
-};
static int ipip_tunnel_init(struct net_device *dev);
-static void ipip_tunnel_setup(struct net_device *dev);
-static void ipip_dev_free(struct net_device *dev);
-
-/*
- * Locking : hash tables are protected by RCU and RTNL
- */
-
-#define for_each_ip_tunnel_rcu(start) \
- for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
-
-/* often modified stats are per cpu, other are shared (netdev->stats) */
-struct pcpu_tstats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long tx_packets;
- unsigned long tx_bytes;
-};
-
-static struct net_device_stats *ipip_get_stats(struct net_device *dev)
-{
- struct pcpu_tstats sum = { 0 };
- int i;
-
- for_each_possible_cpu(i) {
- const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
-
- sum.rx_packets += tstats->rx_packets;
- sum.rx_bytes += tstats->rx_bytes;
- sum.tx_packets += tstats->tx_packets;
- sum.tx_bytes += tstats->tx_bytes;
- }
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
-}
-
-static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
- __be32 remote, __be32 local)
-{
- unsigned int h0 = HASH(remote);
- unsigned int h1 = HASH(local);
- struct ip_tunnel *t;
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
- if (local == t->parms.iph.saddr &&
- remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
- if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
- return t;
-
- for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
- if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
- return t;
-
- t = rcu_dereference(ipn->tunnels_wc[0]);
- if (t && (t->dev->flags&IFF_UP))
- return t;
- return NULL;
-}
-
-static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
- struct ip_tunnel_parm *parms)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- unsigned int h = 0;
- int prio = 0;
-
- if (remote) {
- prio |= 2;
- h ^= HASH(remote);
- }
- if (local) {
- prio |= 1;
- h ^= HASH(local);
- }
- return &ipn->tunnels[prio][h];
-}
-
-static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
- struct ip_tunnel *t)
-{
- return __ipip_bucket(ipn, &t->parms);
-}
-
-static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp;
- struct ip_tunnel *iter;
-
- for (tp = ipip_bucket(ipn, t);
- (iter = rtnl_dereference(*tp)) != NULL;
- tp = &iter->next) {
- if (t == iter) {
- rcu_assign_pointer(*tp, t->next);
- break;
- }
- }
-}
-
-static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
-{
- struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
-
- rcu_assign_pointer(t->next, rtnl_dereference(*tp));
- rcu_assign_pointer(*tp, t);
-}
-
-static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms, int create)
-{
- __be32 remote = parms->iph.daddr;
- __be32 local = parms->iph.saddr;
- struct ip_tunnel *t, *nt;
- struct ip_tunnel __rcu **tp;
- struct net_device *dev;
- char name[IFNAMSIZ];
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- for (tp = __ipip_bucket(ipn, parms);
- (t = rtnl_dereference(*tp)) != NULL;
- tp = &t->next) {
- if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
- return t;
- }
- if (!create)
- return NULL;
-
- if (parms->name[0])
- strlcpy(name, parms->name, IFNAMSIZ);
- else
- strcpy(name, "tunl%d");
-
- dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
- if (dev == NULL)
- return NULL;
-
- dev_net_set(dev, net);
-
- if (strchr(name, '%')) {
- if (dev_alloc_name(dev, name) < 0)
- goto failed_free;
- }
-
- nt = netdev_priv(dev);
- nt->parms = *parms;
-
- if (ipip_tunnel_init(dev) < 0)
- goto failed_free;
-
- if (register_netdevice(dev) < 0)
- goto failed_free;
-
- dev_hold(dev);
- ipip_tunnel_link(ipn, nt);
- return nt;
-
-failed_free:
- ipip_dev_free(dev);
- return NULL;
-}
-
-/* called with RTNL */
-static void ipip_tunnel_uninit(struct net_device *dev)
-{
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- if (dev == ipn->fb_tunnel_dev)
- rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
- else
- ipip_tunnel_unlink(ipn, netdev_priv(dev));
- dev_put(dev);
-}
+static struct rtnl_link_ops ipip_link_ops __read_mostly;
static int ipip_err(struct sk_buff *skb, u32 info)
{
@@ -319,45 +133,35 @@ static int ipip_err(struct sk_buff *skb, u32 info)
8 bytes of packet payload. It means, that precise relaying of
ICMP in the real Internet is absolutely infeasible.
*/
- struct iphdr *iph = (struct iphdr *)skb->data;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct ip_tunnel *t;
int err;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
- switch (type) {
- default:
- case ICMP_PARAMETERPROB:
- return 0;
-
- case ICMP_DEST_UNREACH:
- switch (code) {
- case ICMP_SR_FAILED:
- case ICMP_PORT_UNREACH:
- /* Impossible event. */
- return 0;
- case ICMP_FRAG_NEEDED:
- /* Soft state for pmtu is maintained by IP core. */
- return 0;
- default:
- /* All others are translated to HOST_UNREACH.
- rfc2003 contains "deep thoughts" about NET_UNREACH,
- I believe they are just ether pollution. --ANK
- */
- break;
- }
- break;
- case ICMP_TIME_EXCEEDED:
- if (code != ICMP_EXC_TTL)
- return 0;
- break;
+ err = -ENOENT;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (t == NULL)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, 0, IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
}
- err = -ENOENT;
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
+ IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
- rcu_read_lock();
- t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
- if (t == NULL || t->parms.iph.daddr == 0)
+ if (t->parms.iph.daddr == 0)
goto out;
err = 0;
@@ -369,543 +173,312 @@ static int ipip_err(struct sk_buff *skb, u32 info)
else
t->err_count = 1;
t->err_time = jiffies;
+
out:
- rcu_read_unlock();
return err;
}
-static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
- struct sk_buff *skb)
-{
- struct iphdr *inner_iph = ip_hdr(skb);
-
- if (INET_ECN_is_ce(outer_iph->tos))
- IP_ECN_set_ce(inner_iph);
-}
+static const struct tnl_ptk_info tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
static int ipip_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
struct ip_tunnel *tunnel;
- const struct iphdr *iph = ip_hdr(skb);
-
- rcu_read_lock();
- tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
- if (tunnel != NULL) {
- struct pcpu_tstats *tstats;
-
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- rcu_read_unlock();
- kfree_skb(skb);
- return 0;
- }
-
- secpath_reset(skb);
-
- skb->mac_header = skb->network_header;
- skb_reset_network_header(skb);
- skb->protocol = htons(ETH_P_IP);
- skb->pkt_type = PACKET_HOST;
-
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
-
- __skb_tunnel_rx(skb, tunnel->dev);
-
- ipip_ecn_decapsulate(iph, skb);
-
- netif_rx(skb);
-
- rcu_read_unlock();
- return 0;
+ const struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ if (iptunnel_pull_header(skb, 0, tpi.proto))
+ goto drop;
+ return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
}
- rcu_read_unlock();
return -1;
+
+drop:
+ kfree_skb(skb);
+ return 0;
}
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
*/
-
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct pcpu_tstats *tstats;
- struct iphdr *tiph = &tunnel->parms.iph;
- u8 tos = tunnel->parms.iph.tos;
- __be16 df = tiph->frag_off;
- struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
- struct iphdr *old_iph = ip_hdr(skb);
- struct iphdr *iph; /* Our new IP header */
- unsigned int max_headroom; /* The extra header space needed */
- __be32 dst = tiph->daddr;
- int mtu;
-
- if (skb->protocol != htons(ETH_P_IP))
- goto tx_error;
-
- if (tos & 1)
- tos = old_iph->tos;
-
- if (!dst) {
- /* NBMA tunnel */
- if ((rt = skb_rtable(skb)) == NULL) {
- dev->stats.tx_fifo_errors++;
- goto tx_error;
- }
- if ((dst = rt->rt_gateway) == 0)
- goto tx_error_icmp;
- }
+ const struct iphdr *tiph = &tunnel->parms.iph;
- {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = dst,
- .fl4_src= tiph->saddr,
- .fl4_tos = RT_TOS(tos),
- .proto = IPPROTO_IPIP
- };
-
- if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
- }
- }
- tdev = rt->dst.dev;
-
- if (tdev == dev) {
- ip_rt_put(rt);
- dev->stats.collisions++;
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- }
-
- df |= old_iph->frag_off & htons(IP_DF);
-
- if (df) {
- mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
-
- if (mtu < 68) {
- dev->stats.collisions++;
- ip_rt_put(rt);
- goto tx_error;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
-
- if ((old_iph->frag_off & htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
- }
-
- if (tunnel->err_count > 0) {
- if (time_before(jiffies,
- tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
- tunnel->err_count--;
- dst_link_failure(skb);
- } else
- tunnel->err_count = 0;
- }
-
- /*
- * Okay, now see if we can stuff it in the buffer as-is.
- */
- max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
-
- if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
- (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
- struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- dev->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
- }
- if (skb->sk)
- skb_set_owner_w(new_skb, skb->sk);
- dev_kfree_skb(skb);
- skb = new_skb;
- old_iph = ip_hdr(skb);
- }
+ skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ if (IS_ERR(skb))
+ goto out;
- skb->transport_header = skb->network_header;
- skb_push(skb, sizeof(struct iphdr));
- skb_reset_network_header(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
- /*
- * Push down and install the IPIP header.
- */
-
- iph = ip_hdr(skb);
- iph->version = 4;
- iph->ihl = sizeof(struct iphdr)>>2;
- iph->frag_off = df;
- iph->protocol = IPPROTO_IPIP;
- iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
-
- if ((iph->ttl = tiph->ttl) == 0)
- iph->ttl = old_iph->ttl;
-
- nf_reset(skb);
- tstats = this_cpu_ptr(dev->tstats);
- __IPTUNNEL_XMIT(tstats, &dev->stats);
+ ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
return NETDEV_TX_OK;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
+ kfree_skb(skb);
+out:
dev->stats.tx_errors++;
- dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
-static void ipip_tunnel_bind_dev(struct net_device *dev)
-{
- struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
- struct iphdr *iph;
-
- tunnel = netdev_priv(dev);
- iph = &tunnel->parms.iph;
-
- if (iph->daddr) {
- struct flowi fl = {
- .oif = tunnel->parms.link,
- .fl4_dst = iph->daddr,
- .fl4_src = iph->saddr,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_IPIP
- };
- struct rtable *rt;
-
- if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
- tdev = rt->dst.dev;
- ip_rt_put(rt);
- }
- dev->flags |= IFF_POINTOPOINT;
- }
-
- if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
-
- if (tdev) {
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - sizeof(struct iphdr);
- }
- dev->iflink = tunnel->parms.link;
-}
-
static int
-ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
- struct ip_tunnel *t;
- struct net *net = dev_net(dev);
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
-
- switch (cmd) {
- case SIOCGETTUNNEL:
- t = NULL;
- if (dev == ipn->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipip_tunnel_locate(net, &p, 0);
- }
- if (t == NULL)
- t = netdev_priv(dev);
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- err = -EFAULT;
- break;
-
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
+
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t != NULL) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- ipip_tunnel_unlink(ipn, t);
- synchronize_net();
- t->parms.iph.saddr = p.iph.saddr;
- t->parms.iph.daddr = p.iph.daddr;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
- ipip_tunnel_link(ipn, t);
- netdev_state_change(dev);
- }
- }
-
- if (t) {
- err = 0;
- if (cmd == SIOCCHGTUNNEL) {
- t->parms.iph.ttl = p.iph.ttl;
- t->parms.iph.tos = p.iph.tos;
- t->parms.iph.frag_off = p.iph.frag_off;
- if (t->parms.link != p.link) {
- t->parms.link = p.link;
- ipip_tunnel_bind_dev(dev);
- netdev_state_change(dev);
- }
- }
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
-
- case SIOCDELTUNNEL:
- err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
- goto done;
-
- if (dev == ipn->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
- goto done;
- err = -EPERM;
- if (t->dev == ipn->fb_tunnel_dev)
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
-
- default:
- err = -EINVAL;
+ return -EINVAL;
}
-done:
- return err;
-}
+ p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
-static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
-{
- if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
- return -EINVAL;
- dev->mtu = new_mtu;
return 0;
}
static const struct net_device_ops ipip_netdev_ops = {
- .ndo_uninit = ipip_tunnel_uninit,
+ .ndo_init = ipip_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
- .ndo_change_mtu = ipip_tunnel_change_mtu,
- .ndo_get_stats = ipip_get_stats,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
};
-static void ipip_dev_free(struct net_device *dev)
-{
- free_percpu(dev->tstats);
- free_netdev(dev);
-}
+#define IPIP_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
- dev->destructor = ipip_dev_free;
dev->type = ARPHRD_TUNNEL;
- dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
- dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr);
dev->flags = IFF_NOARP;
dev->iflink = 0;
dev->addr_len = 4;
- dev->features |= NETIF_F_NETNS_LOCAL;
dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+
+ dev->features |= IPIP_FEATURES;
+ dev->hw_features |= IPIP_FEATURES;
+ ip_tunnel_setup(dev, ipip_net_id);
}
static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
-
memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
- ipip_tunnel_bind_dev(dev);
-
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
-
- return 0;
+ tunnel->hlen = 0;
+ tunnel->parms.iph.protocol = IPPROTO_IPIP;
+ return ip_tunnel_init(dev);
}
-static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
+static void ipip_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
{
- struct ip_tunnel *tunnel = netdev_priv(dev);
- struct iphdr *iph = &tunnel->parms.iph;
- struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
+ memset(parms, 0, sizeof(*parms));
- tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ parms->iph.version = 4;
+ parms->iph.protocol = IPPROTO_IPIP;
+ parms->iph.ihl = 5;
- iph->version = 4;
- iph->protocol = IPPROTO_IPIP;
- iph->ihl = 5;
+ if (!data)
+ return;
- dev->tstats = alloc_percpu(struct pcpu_tstats);
- if (!dev->tstats)
- return -ENOMEM;
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
- dev_hold(dev);
- rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
- return 0;
-}
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
-static struct xfrm_tunnel ipip_handler __read_mostly = {
- .handler = ipip_rcv,
- .err_handler = ipip_err,
- .priority = 1,
-};
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
-static const char banner[] __initconst =
- KERN_INFO "IPv4 over IPv4 tunneling driver\n";
+ if (data[IFLA_IPTUN_TTL]) {
+ parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+ if (parms->iph.ttl)
+ parms->iph.frag_off = htons(IP_DF);
+ }
-static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
+ if (data[IFLA_IPTUN_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+ if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+}
+
+static int ipip_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
{
- int prio;
-
- for (prio = 1; prio < 4; prio++) {
- int h;
- for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
-
- t = rtnl_dereference(ipn->tunnels[prio][h]);
- while (t != NULL) {
- unregister_netdevice_queue(t->dev, head);
- t = rtnl_dereference(t->next);
- }
- }
- }
+ struct ip_tunnel_parm p;
+
+ ipip_netlink_parms(data, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
}
-static int __net_init ipip_init_net(struct net *net)
+static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
{
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- int err;
+ struct ip_tunnel_parm p;
- ipn->tunnels[0] = ipn->tunnels_wc;
- ipn->tunnels[1] = ipn->tunnels_l;
- ipn->tunnels[2] = ipn->tunnels_r;
- ipn->tunnels[3] = ipn->tunnels_r_l;
-
- ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
- "tunl0",
- ipip_tunnel_setup);
- if (!ipn->fb_tunnel_dev) {
- err = -ENOMEM;
- goto err_alloc_dev;
- }
- dev_net_set(ipn->fb_tunnel_dev, net);
+ ipip_netlink_parms(data, &p);
- err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
- if (err)
- goto err_reg_dev;
+ if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
+ return -EINVAL;
+
+ return ip_tunnel_changelink(dev, tb, &p);
+}
- if ((err = register_netdev(ipn->fb_tunnel_dev)))
- goto err_reg_dev;
+static size_t ipip_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_TOS */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PMTUDISC */
+ nla_total_size(1) +
+ 0;
+}
+static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+ nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
+ !!(parm->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
return 0;
-err_reg_dev:
- ipip_dev_free(ipn->fb_tunnel_dev);
-err_alloc_dev:
- /* nothing */
- return err;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
+ [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+};
+
+static struct rtnl_link_ops ipip_link_ops __read_mostly = {
+ .kind = "ipip",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ipip_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipip_tunnel_setup,
+ .newlink = ipip_newlink,
+ .changelink = ipip_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipip_get_size,
+ .fill_info = ipip_fill_info,
+};
+
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+ .handler = ipip_rcv,
+ .err_handler = ipip_err,
+ .priority = 1,
+};
+
+static int __net_init ipip_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
static void __net_exit ipip_exit_net(struct net *net)
{
- struct ipip_net *ipn = net_generic(net, ipip_net_id);
- LIST_HEAD(list);
-
- rtnl_lock();
- ipip_destroy_tunnels(ipn, &list);
- unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ ip_tunnel_delete_net(itn, &ipip_link_ops);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
.exit = ipip_exit_net,
.id = &ipip_net_id,
- .size = sizeof(struct ipip_net),
+ .size = sizeof(struct ip_tunnel_net),
};
static int __init ipip_init(void)
{
int err;
- printk(banner);
+ pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
err = register_pernet_device(&ipip_net_ops);
if (err < 0)
return err;
err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
if (err < 0) {
- unregister_pernet_device(&ipip_net_ops);
- printk(KERN_INFO "ipip init: can't register tunnel\n");
+ pr_info("%s: can't register tunnel\n", __func__);
+ goto xfrm_tunnel_failed;
}
+ err = rtnl_link_register(&ipip_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+out:
return err;
+
+rtnl_link_failed:
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel_failed:
+ unregister_pernet_device(&ipip_net_ops);
+ goto out;
}
static void __exit ipip_fini(void)
{
+ rtnl_link_unregister(&ipip_link_ops);
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
- printk(KERN_INFO "ipip close: can't deregister tunnel\n");
+ pr_info("%s: can't deregister tunnel\n", __func__);
unregister_pernet_device(&ipip_net_ops);
}
@@ -913,4 +486,5 @@ static void __exit ipip_fini(void)
module_init(ipip_init);
module_exit(ipip_fini);
MODULE_LICENSE("GPL");
-MODULE_ALIAS("tunl0");
+MODULE_ALIAS_RTNL_LINK("ipip");
+MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3f3a9afd73e..65bcaa78904 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,7 +26,6 @@
*
*/
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
@@ -60,10 +59,13 @@
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
-#include <net/ipip.h>
+#include <linux/compat.h>
+#include <linux/export.h>
+#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
+#include <linux/netconf.h>
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
@@ -82,8 +84,8 @@ struct mr_table {
struct vif_device vif_table[MAXVIFS];
int maxvif;
atomic_t cache_resolve_queue_len;
- int mroute_do_assert;
- int mroute_do_pim;
+ bool mroute_do_assert;
+ bool mroute_do_pim;
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
int mroute_reg_vif_num;
#endif
@@ -123,13 +125,18 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
static struct kmem_cache *mrt_cachep __read_mostly;
static struct mr_table *ipmr_new_table(struct net *net, u32 id);
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local);
+static void ipmr_free_table(struct mr_table *mrt);
+
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local);
static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd);
+static void mroute_clean_tables(struct mr_table *mrt);
static void ipmr_expire_process(unsigned long arg);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -147,14 +154,18 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
return NULL;
}
-static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
struct mr_table **mrt)
{
- struct ipmr_result res;
- struct fib_lookup_arg arg = { .result = &res, };
int err;
+ struct ipmr_result res;
+ struct fib_lookup_arg arg = {
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
- err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
+ err = fib_rules_lookup(net->ipv4.mr_rules_ops,
+ flowi4_to_flowi(flp4), 0, &arg);
if (err < 0)
return err;
*mrt = res.mrt;
@@ -216,7 +227,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
return 0;
}
-static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
.family = RTNL_FAMILY_IPMR,
.rule_size = sizeof(struct ipmr_rule),
.addr_size = sizeof(u32),
@@ -269,7 +280,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
list_del(&mrt->list);
- kfree(mrt);
+ ipmr_free_table(mrt);
}
fib_rules_unregister(net->ipv4.mr_rules_ops);
}
@@ -282,7 +293,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
return net->ipv4.mrt;
}
-static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
struct mr_table **mrt)
{
*mrt = net->ipv4.mrt;
@@ -297,7 +308,7 @@ static int __net_init ipmr_rules_init(struct net *net)
static void __net_exit ipmr_rules_exit(struct net *net)
{
- kfree(net->ipv4.mrt);
+ ipmr_free_table(net->ipv4.mrt);
}
#endif
@@ -334,6 +345,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
return mrt;
}
+static void ipmr_free_table(struct mr_table *mrt)
+{
+ del_timer_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt);
+ kfree(mrt);
+}
+
/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
@@ -410,6 +428,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
goto failure;
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
if (dev_open(dev))
@@ -434,14 +453,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct net *net = dev_net(dev);
struct mr_table *mrt;
- struct flowi fl = {
- .oif = dev->ifindex,
- .iif = skb->skb_iif,
- .mark = skb->mark,
+ struct flowi4 fl4 = {
+ .flowi4_oif = dev->ifindex,
+ .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi4_mark = skb->mark,
};
int err;
- err = ipmr_fib_lookup(net, &fl, &mrt);
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
if (err < 0) {
kfree_skb(skb);
return err;
@@ -465,7 +484,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->type = ARPHRD_PIMREG;
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
dev->flags = IFF_NOARP;
- dev->netdev_ops = &reg_vif_netdev_ops,
+ dev->netdev_ops = &reg_vif_netdev_ops;
dev->destructor = free_netdev;
dev->features |= NETIF_F_NETNS_LOCAL;
}
@@ -502,6 +521,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
}
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
rcu_read_unlock();
@@ -522,8 +542,8 @@ failure:
}
#endif
-/*
- * Delete a VIF entry
+/**
+ * vif_delete - Delete a VIF entry
* @notify: Set to 1, if the caller is a notifier_call
*/
@@ -570,6 +590,9 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
in_dev = __in_dev_get_rtnl(dev);
if (in_dev) {
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(dev_net(dev),
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
ip_rt_multicast_event(in_dev);
}
@@ -608,13 +631,13 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -ETIMEDOUT;
memset(&e->msg, 0, sizeof(e->msg));
- rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else {
kfree_skb(skb);
}
@@ -653,6 +676,7 @@ static void ipmr_expire_process(unsigned long arg)
}
list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_destroy_unres(mrt, c);
}
@@ -760,6 +784,8 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EADDRNOTAVAIL;
}
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
+ &in_dev->cnf);
ip_rt_multicast_event(in_dev);
/* Fill in the VIF structures */
@@ -807,6 +833,49 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
return NULL;
}
+/* Look for a (*,*,oif) entry */
+static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
+ int vifi)
+{
+ int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache *c;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY) &&
+ c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ return NULL;
+}
+
+/* Look for a (*,G) entry */
+static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
+ __be32 mcastgrp, int vifi)
+{
+ int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache *c, *proxy;
+
+ if (mcastgrp == htonl(INADDR_ANY))
+ goto skip;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == mcastgrp) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt,
+ c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
+
+skip:
+ return ipmr_cache_find_any_parent(mrt, vifi);
+}
+
/*
* Allocate a multicast cache entry
*/
@@ -846,19 +915,19 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
- if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
+ if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
nlh->nlmsg_len = skb_tail_pointer(skb) -
(u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- e = NLMSG_DATA(nlh);
+ e = nlmsg_data(nlh);
e->error = -EMSGSIZE;
memset(&e->msg, 0, sizeof(e->msg));
}
- rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else {
ip_mr_forward(net, mrt, skb, c, 0);
}
@@ -916,7 +985,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
/* Copy the IP header */
- skb->network_header = skb->tail;
+ skb_set_network_header(skb, skb->len);
skb_put(skb, ihl);
skb_copy_to_linear_data(skb, pkt->data, ihl);
ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
@@ -947,8 +1016,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
ret = sock_queue_rcv_skb(mroute_sk, skb);
rcu_read_unlock();
if (ret < 0) {
- if (net_ratelimit())
- printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
+ net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
kfree_skb(skb);
}
@@ -1009,6 +1077,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
atomic_inc(&mrt->cache_resolve_queue_len);
list_add(&c->list, &mrt->mfc_unres_queue);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
@@ -1032,7 +1101,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
* MFC cache manipulation by user space mroute daemon
*/
-static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
int line;
struct mfc_cache *c, *next;
@@ -1041,9 +1110,10 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
list_del_rcu(&c->list);
-
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_cache_free(c);
return 0;
}
@@ -1052,7 +1122,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
}
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
- struct mfcctl *mfc, int mrtsock)
+ struct mfcctl *mfc, int mrtsock, int parent)
{
bool found = false;
int line;
@@ -1065,7 +1135,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
found = true;
break;
}
@@ -1078,10 +1149,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
- if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
+ if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
+ !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
return -EINVAL;
c = ipmr_cache_alloc();
@@ -1120,6 +1193,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
ipmr_cache_resolve(net, mrt, uc, c);
ipmr_cache_free(uc);
}
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
return 0;
}
@@ -1148,6 +1222,7 @@ static void mroute_clean_tables(struct mr_table *mrt)
if (c->mfc_flags & MFC_STATIC)
continue;
list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_cache_free(c);
}
}
@@ -1156,6 +1231,7 @@ static void mroute_clean_tables(struct mr_table *mrt)
spin_lock_bh(&mfc_unres_lock);
list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_destroy_unres(mrt, c);
}
spin_unlock_bh(&mfc_unres_lock);
@@ -1174,7 +1250,10 @@ static void mrtsock_destruct(struct sock *sk)
ipmr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
- rcu_assign_pointer(mrt->mroute_sk, NULL);
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ RCU_INIT_POINTER(mrt->mroute_sk, NULL);
mroute_clean_tables(mrt);
}
}
@@ -1190,29 +1269,30 @@ static void mrtsock_destruct(struct sock *sk)
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
{
- int ret;
+ int ret, parent = 0;
struct vifctl vif;
struct mfcctl mfc;
struct net *net = sock_net(sk);
struct mr_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
if (mrt == NULL)
return -ENOENT;
if (optname != MRT_INIT) {
- if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
- !capable(CAP_NET_ADMIN))
+ if (sk != rcu_access_pointer(mrt->mroute_sk) &&
+ !ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EACCES;
}
switch (optname) {
case MRT_INIT:
- if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->inet_num != IPPROTO_IGMP)
- return -EOPNOTSUPP;
if (optlen != sizeof(int))
- return -ENOPROTOOPT;
+ return -EINVAL;
rtnl_lock();
if (rtnl_dereference(mrt->mroute_sk)) {
@@ -1224,11 +1304,14 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (ret == 0) {
rcu_assign_pointer(mrt->mroute_sk, sk);
IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
}
rtnl_unlock();
return ret;
case MRT_DONE:
- if (sk != rcu_dereference_raw(mrt->mroute_sk))
+ if (sk != rcu_access_pointer(mrt->mroute_sk))
return -EACCES;
return ip_ra_control(sk, 0, NULL);
case MRT_ADD_VIF:
@@ -1255,16 +1338,22 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
*/
case MRT_ADD_MFC:
case MRT_DEL_MFC:
+ parent = -1;
+ case MRT_ADD_MFC_PROXY:
+ case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc))
return -EINVAL;
if (copy_from_user(&mfc, optval, sizeof(mfc)))
return -EFAULT;
+ if (parent == 0)
+ parent = mfc.mfcc_parent;
rtnl_lock();
- if (optname == MRT_DEL_MFC)
- ret = ipmr_mfc_delete(mrt, &mfc);
+ if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
+ ret = ipmr_mfc_delete(mrt, &mfc, parent);
else
ret = ipmr_mfc_add(net, mrt, &mfc,
- sk == rtnl_dereference(mrt->mroute_sk));
+ sk == rtnl_dereference(mrt->mroute_sk),
+ parent);
rtnl_unlock();
return ret;
/*
@@ -1273,9 +1362,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
case MRT_ASSERT:
{
int v;
+ if (optlen != sizeof(v))
+ return -EINVAL;
if (get_user(v, (int __user *)optval))
return -EFAULT;
- mrt->mroute_do_assert = (v) ? 1 : 0;
+ mrt->mroute_do_assert = v;
return 0;
}
#ifdef CONFIG_IP_PIMSM
@@ -1283,9 +1374,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
{
int v;
+ if (optlen != sizeof(v))
+ return -EINVAL;
if (get_user(v, (int __user *)optval))
return -EFAULT;
- v = (v) ? 1 : 0;
+ v = !!v;
rtnl_lock();
ret = 0;
@@ -1307,6 +1400,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (get_user(v, (u32 __user *)optval))
return -EFAULT;
+ /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (v != RT_TABLE_DEFAULT && v >= 1000000000)
+ return -EINVAL;
+
rtnl_lock();
ret = 0;
if (sk == rtnl_dereference(mrt->mroute_sk)) {
@@ -1314,7 +1411,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
} else {
if (!ipmr_new_table(net, v))
ret = -ENOMEM;
- raw_sk(sk)->ipmr_table = v;
+ else
+ raw_sk(sk)->ipmr_table = v;
}
rtnl_unlock();
return ret;
@@ -1340,6 +1438,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
struct net *net = sock_net(sk);
struct mr_table *mrt;
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
if (mrt == NULL)
return -ENOENT;
@@ -1434,15 +1536,89 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
}
}
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_vif_req {
+ vifi_t vifi; /* Which iface */
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req sr;
+ struct compat_sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
struct mr_table *mrt;
struct vif_device *v;
int ct;
- LIST_HEAD(list);
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
@@ -1451,10 +1627,9 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
v = &mrt->vif_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
if (v->dev == dev)
- vif_delete(mrt, ct, 1, &list);
+ vif_delete(mrt, ct, 1, NULL);
}
}
- unregister_netdevice_many(&list);
return NOTIFY_DONE;
}
@@ -1472,7 +1647,7 @@ static struct notifier_block ip_mr_notifier = {
static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
struct iphdr *iph;
- struct iphdr *old_iph = ip_hdr(skb);
+ const struct iphdr *old_iph = ip_hdr(skb);
skb_push(skb, sizeof(struct iphdr));
skb->transport_header = skb->network_header;
@@ -1488,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
iph->tot_len = htons(skb->len);
- ip_select_ident(iph, skb_dst(skb), NULL);
+ ip_select_ident(skb, NULL);
ip_send_check(iph);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1500,6 +1675,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -1518,6 +1694,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
struct vif_device *vif = &mrt->vif_table[vifi];
struct net_device *dev;
struct rtable *rt;
+ struct flowi4 fl4;
int encap = 0;
if (vif->dev == NULL)
@@ -1535,26 +1712,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
#endif
if (vif->flags & VIFF_TUNNEL) {
- struct flowi fl = {
- .oif = vif->link,
- .fl4_dst = vif->remote,
- .fl4_src = vif->local,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_IPIP
- };
-
- if (ip_route_output_key(net, &rt, &fl))
+ rt = ip_route_output_ports(net, &fl4, NULL,
+ vif->remote, vif->local,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
goto out_free;
encap = sizeof(struct iphdr);
} else {
- struct flowi fl = {
- .oif = vif->link,
- .fl4_dst = iph->daddr,
- .fl4_tos = RT_TOS(iph->tos),
- .proto = IPPROTO_IPIP
- };
-
- if (ip_route_output_key(net, &rt, &fl))
+ rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
goto out_free;
}
@@ -1629,23 +1800,34 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
/* "local" means that we should preserve one skb (for local delivery) */
-static int ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local)
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local)
{
int psend = -1;
int vif, ct;
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
cache->mfc_un.res.bytes += skb->len;
+ if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+ struct mfc_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incoming
+ * interface is part of the static tree.
+ */
+ cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
+
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
if (mrt->vif_table[vif].dev != skb->dev) {
- int true_vifi;
-
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -1662,7 +1844,6 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
}
cache->mfc_un.res.wrong_if++;
- true_vifi = ipmr_find_vif(mrt, skb->dev);
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
@@ -1680,15 +1861,34 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
goto dont_forward;
}
+forward:
mrt->vif_table[vif].pkt_in++;
mrt->vif_table[vif].bytes_in += skb->len;
/*
* Forward the frame
*/
+ if (cache->mfc_origin == htonl(INADDR_ANY) &&
+ cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (true_vifi >= 0 &&
+ true_vifi != cache->mfc_parent &&
+ ip_hdr(skb)->ttl >
+ cache->mfc_un.res.ttls[cache->mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = cache->mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
for (ct = cache->mfc_un.res.maxvif - 1;
ct >= cache->mfc_un.res.minvif; ct--) {
- if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((cache->mfc_origin != htonl(INADDR_ANY) ||
+ ct != true_vifi) &&
+ ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -1699,6 +1899,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
psend = ct;
}
}
+last_forward:
if (psend != -1) {
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -1707,16 +1908,38 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
ipmr_queue_xmit(net, mrt, skb2, cache, psend);
} else {
ipmr_queue_xmit(net, mrt, skb, cache, psend);
- return 0;
+ return;
}
}
dont_forward:
if (!local)
kfree_skb(skb);
- return 0;
}
+static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_oif = (rt_is_output_route(rt) ?
+ skb->dev->ifindex : 0),
+ .flowi4_iif = (rt_is_output_route(rt) ?
+ LOOPBACK_IFINDEX :
+ skb->dev->ifindex),
+ .flowi4_mark = skb->mark,
+ };
+ struct mr_table *mrt;
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err)
+ return ERR_PTR(err);
+ return mrt;
+}
/*
* Multicast packets for forwarding arrive here
@@ -1729,7 +1952,6 @@ int ip_mr_input(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
struct mr_table *mrt;
- int err;
/* Packet is looped back after forward, it should not be
* forwarded second time, but still can be delivered locally.
@@ -1737,12 +1959,11 @@ int ip_mr_input(struct sk_buff *skb)
if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto dont_forward;
- err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
- if (err < 0) {
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt)) {
kfree_skb(skb);
- return err;
+ return PTR_ERR(mrt);
}
-
if (!local) {
if (IPCB(skb)->opt.router_alert) {
if (ip_call_ra_chain(skb))
@@ -1767,6 +1988,13 @@ int ip_mr_input(struct sk_buff *skb)
/* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (cache == NULL) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
/*
* No usable cache entry
@@ -1844,9 +2072,8 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
skb_reset_network_header(skb);
skb->protocol = htons(ETH_P_IP);
skb->ip_summed = CHECKSUM_NONE;
- skb->pkt_type = PACKET_HOST;
- skb_tunnel_rx(skb, reg_dev);
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
netif_rx(skb);
@@ -1870,9 +2097,9 @@ int pim_rcv_v1(struct sk_buff *skb)
pim = igmp_hdr(skb);
- if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
goto drop;
-
if (!mrt->mroute_do_pim ||
pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
goto drop;
@@ -1902,9 +2129,9 @@ static int pim_rcv(struct sk_buff *skb)
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
goto drop;
- if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
goto drop;
-
if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
kfree_skb(skb);
@@ -1918,54 +2145,66 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
{
int ct;
struct rtnexthop *nhp;
- u8 *b = skb_tail_pointer(skb);
- struct rtattr *mp_head;
+ struct nlattr *mp_attr;
+ struct rta_mfc_stats mfcs;
/* If cache is unresolved, don't try to parse IIF and OIF */
if (c->mfc_parent >= MAXVIFS)
return -ENOENT;
- if (VIF_EXISTS(mrt, c->mfc_parent))
- RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
+ if (VIF_EXISTS(mrt, c->mfc_parent) &&
+ nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+ return -EMSGSIZE;
- mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
+ if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
+ return -EMSGSIZE;
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
- if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
- goto rtattr_failure;
- nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+ if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
+ nla_nest_cancel(skb, mp_attr);
+ return -EMSGSIZE;
+ }
+
nhp->rtnh_flags = 0;
nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
}
}
- mp_head->rta_type = RTA_MULTIPATH;
- mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
+
+ nla_nest_end(skb, mp_attr);
+
+ mfcs.mfcs_packets = c->mfc_un.res.pkt;
+ mfcs.mfcs_bytes = c->mfc_un.res.bytes;
+ mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ return -EMSGSIZE;
+
rtm->rtm_type = RTN_MULTICAST;
return 1;
-
-rtattr_failure:
- nlmsg_trim(skb, b);
- return -EMSGSIZE;
}
-int ipmr_get_route(struct net *net,
- struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+int ipmr_get_route(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr,
+ struct rtmsg *rtm, int nowait)
{
- int err;
- struct mr_table *mrt;
struct mfc_cache *cache;
- struct rtable *rt = skb_rtable(skb);
+ struct mr_table *mrt;
+ int err;
mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
if (mrt == NULL)
return -ENOENT;
rcu_read_lock();
- cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
+ cache = ipmr_cache_find(mrt, saddr, daddr);
+ if (cache == NULL && skb->dev) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, daddr, vif);
+ }
if (cache == NULL) {
struct sk_buff *skb2;
struct iphdr *iph;
@@ -1997,8 +2236,8 @@ int ipmr_get_route(struct net *net,
skb_reset_network_header(skb2);
iph = ip_hdr(skb2);
iph->ihl = sizeof(struct iphdr) >> 2;
- iph->saddr = rt->rt_src;
- iph->daddr = rt->rt_dst;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
iph->version = 0;
err = ipmr_cache_unresolved(mrt, vif, skb2);
read_unlock(&mrt_lock);
@@ -2016,12 +2255,14 @@ int ipmr_get_route(struct net *net,
}
static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
- u32 pid, u32 seq, struct mfc_cache *c)
+ u32 portid, u32 seq, struct mfc_cache *c, int cmd,
+ int flags)
{
struct nlmsghdr *nlh;
struct rtmsg *rtm;
+ int err;
- nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -2031,16 +2272,22 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
rtm->rtm_src_len = 32;
rtm->rtm_tos = 0;
rtm->rtm_table = mrt->id;
- NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
rtm->rtm_type = RTN_MULTICAST;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
- rtm->rtm_protocol = RTPROT_UNSPEC;
+ if (c->mfc_flags & MFC_STATIC)
+ rtm->rtm_protocol = RTPROT_STATIC;
+ else
+ rtm->rtm_protocol = RTPROT_MROUTED;
rtm->rtm_flags = 0;
- NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
- NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
-
- if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
+ if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) ||
+ nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp))
+ goto nla_put_failure;
+ err = __ipmr_fill_mroute(mrt, skb, c, rtm);
+ /* do not break the dump if cache is unresolved */
+ if (err < 0 && err != -ENOENT)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -2050,6 +2297,52 @@ nla_put_failure:
return -EMSGSIZE;
}
+static size_t mroute_msgsize(bool unresolved, int maxvif)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_SRC */
+ + nla_total_size(4) /* RTA_DST */
+ ;
+
+ if (!unresolved)
+ len = len
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(0) /* RTA_MULTIPATH */
+ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+ /* RTA_MFC_STATS */
+ + nla_total_size(sizeof(struct rta_mfc_stats))
+ ;
+
+ return len;
+}
+
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
+ GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
+ if (err < 0)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ kfree_skb(skb);
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
+}
+
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -2074,15 +2367,33 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
if (e < s_e)
goto next_entry;
if (ipmr_fill_mroute(mrt, skb,
- NETLINK_CB(cb->skb).pid,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- mfc) < 0)
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
goto done;
next_entry:
e++;
}
e = s_e = 0;
}
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0) {
+ spin_unlock_bh(&mfc_unres_lock);
+ goto done;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ e = s_e = 0;
s_h = 0;
next_table:
t++;
@@ -2398,16 +2709,16 @@ static int __net_init ipmr_net_init(struct net *net)
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
- if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
+ if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
goto proc_vif_fail;
- if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
+ if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
goto proc_cache_fail;
#endif
return 0;
#ifdef CONFIG_PROC_FS
proc_cache_fail:
- proc_net_remove(net, "ip_mr_vif");
+ remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
ipmr_rules_exit(net);
#endif
@@ -2418,8 +2729,8 @@ fail:
static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
- proc_net_remove(net, "ip_mr_cache");
- proc_net_remove(net, "ip_mr_vif");
+ remove_proc_entry("ip_mr_cache", net->proc_net);
+ remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_rules_exit(net);
}
@@ -2449,12 +2760,13 @@ int __init ip_mr_init(void)
goto reg_notif_fail;
#ifdef CONFIG_IP_PIMSM_V2
if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
- printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
+ pr_err("%s: can't add PIM protocol\n", __func__);
err = -EAGAIN;
goto add_proto_fail;
}
#endif
- rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
+ NULL, ipmr_rtm_dumproute, NULL);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 994a1f29ebb..7ebd6e37875 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -1,76 +1,67 @@
-/* IPv4 specific functions of netfilter core */
+/*
+ * IPv4 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
+#include <linux/export.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
+int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
{
struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- struct flowi fl = {};
- unsigned long orefdst;
+ struct flowi4 fl4 = {};
+ __be32 saddr = iph->saddr;
+ __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
unsigned int hh_len;
- unsigned int type;
- type = inet_addr_type(net, iph->saddr);
- if (skb->sk && inet_sk(skb->sk)->transparent)
- type = RTN_LOCAL;
if (addr_type == RTN_UNSPEC)
- addr_type = type;
+ addr_type = inet_addr_type(net, saddr);
+ if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+ flags |= FLOWI_FLAG_ANYSRC;
+ else
+ saddr = 0;
/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
* packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
*/
- if (addr_type == RTN_LOCAL) {
- fl.fl4_dst = iph->daddr;
- if (type == RTN_LOCAL)
- fl.fl4_src = iph->saddr;
- fl.fl4_tos = RT_TOS(iph->tos);
- fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
- fl.mark = skb->mark;
- fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
- if (ip_route_output_key(net, &rt, &fl) != 0)
- return -1;
-
- /* Drop old route. */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* non-local src, find valid iif to satisfy
- * rp-filter when calling ip_route_input. */
- fl.fl4_dst = iph->saddr;
- if (ip_route_output_key(net, &rt, &fl) != 0)
- return -1;
-
- orefdst = skb->_skb_refdst;
- if (ip_route_input(skb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->dst.dev) != 0) {
- dst_release(&rt->dst);
- return -1;
- }
- dst_release(&rt->dst);
- refdst_drop(orefdst);
- }
+ fl4.daddr = iph->daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_flags = flags;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ /* Drop old route. */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
if (skb_dst(skb)->error)
- return -1;
+ return skb_dst(skb)->error;
#ifdef CONFIG_XFRM
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- xfrm_decode_session(skb, &fl, AF_INET) == 0) {
+ xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
skb_dst_set(skb, NULL);
- if (xfrm_lookup(net, &dst, &fl, skb->sk, 0))
- return -1;
+ dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
skb_dst_set(skb, dst);
}
#endif
@@ -78,49 +69,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
/* Change in oif may mean change in hh_len. */
hh_len = skb_dst(skb)->dev->hard_header_len;
if (skb_headroom(skb) < hh_len &&
- pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
- return -1;
+ pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+ 0, GFP_ATOMIC))
+ return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(ip_route_me_harder);
-#ifdef CONFIG_XFRM
-int ip_xfrm_me_harder(struct sk_buff *skb)
-{
- struct flowi fl;
- unsigned int hh_len;
- struct dst_entry *dst;
-
- if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
- return 0;
- if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
- return -1;
-
- dst = skb_dst(skb);
- if (dst->xfrm)
- dst = ((struct xfrm_dst *)dst)->route;
- dst_hold(dst);
-
- if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0)
- return -1;
-
- skb_dst_drop(skb);
- skb_dst_set(skb, dst);
-
- /* Change in oif may mean change in hh_len. */
- hh_len = skb_dst(skb)->dev->hard_header_len;
- if (skb_headroom(skb) < hh_len &&
- pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
- return -1;
- return 0;
-}
-EXPORT_SYMBOL(ip_xfrm_me_harder);
-#endif
-
-void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
-EXPORT_SYMBOL(ip_nat_decode_session);
-
/*
* Extra routing may needed on local out, as the QUEUE target never
* returns control to the table.
@@ -217,9 +173,14 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
return csum;
}
-static int nf_ip_route(struct dst_entry **dst, struct flowi *fl)
+static int nf_ip_route(struct net *net, struct dst_entry **dst,
+ struct flowi *fl, bool strict __always_unused)
{
- return ip_route_output_key(&init_net, (struct rtable **)dst, fl);
+ struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ *dst = &rt->dst;
+ return 0;
}
static const struct nf_afinfo nf_ip_afinfo = {
@@ -232,25 +193,15 @@ static const struct nf_afinfo nf_ip_afinfo = {
.route_key_size = sizeof(struct ip_rt_info),
};
-static int ipv4_netfilter_init(void)
+static int __init ipv4_netfilter_init(void)
{
return nf_register_afinfo(&nf_ip_afinfo);
}
-static void ipv4_netfilter_fini(void)
+static void __exit ipv4_netfilter_fini(void)
{
nf_unregister_afinfo(&nf_ip_afinfo);
}
module_init(ipv4_netfilter_init);
module_exit(ipv4_netfilter_fini);
-
-#ifdef CONFIG_SYSCTL
-struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "netfilter", },
- { }
-};
-EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
-#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5..a26ce035e3f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -27,7 +27,7 @@ config NF_CONNTRACK_IPV4
config NF_CONNTRACK_PROC_COMPAT
bool "proc/sysctl compatibility with old connection tracking"
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
default y
help
This option enables /proc and sysctl compatibility with the old
@@ -36,18 +36,41 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
-config IP_NF_QUEUE
- tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
- depends on NETFILTER_ADVANCED
+config NF_TABLES_IPV4
+ depends on NF_TABLES
+ tristate "IPv4 nf_tables support"
help
- Netfilter has the ability to queue packets to user space: the
- netlink device can be used to access them using this driver.
+ This option enables the IPv4 support for nf_tables.
- This option enables the old IPv4-only "ip_queue" implementation
- which has been obsoleted by the new "nfnetlink_queue" code (see
- CONFIG_NETFILTER_NETLINK_QUEUE).
+config NFT_CHAIN_ROUTE_IPV4
+ depends on NF_TABLES_IPV4
+ tristate "IPv4 nf_tables route chain support"
+ help
+ This option enables the "route" chain for IPv4 in nf_tables. This
+ chain type is used to force packet re-routing after mangling header
+ fields such as the source, destination, type of service and
+ the packet mark.
+
+config NFT_CHAIN_NAT_IPV4
+ depends on NF_TABLES_IPV4
+ depends on NF_NAT_IPV4 && NFT_NAT
+ tristate "IPv4 nf_tables nat chain support"
+ help
+ This option enables the "nat" chain for IPv4 in nf_tables. This
+ chain type is used to perform Network Address Translation (NAT)
+ packet transformations such as the source, destination address and
+ source and destination ports.
+
+config NFT_REJECT_IPV4
+ depends on NF_TABLES_IPV4
+ default NFT_REJECT
+ tristate
- To compile it as a module, choose M here. If unsure, say N.
+config NF_TABLES_ARP
+ depends on NF_TABLES
+ tristate "ARP nf_tables support"
+ help
+ This option enables the ARP support for nf_tables.
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
@@ -64,16 +87,6 @@ config IP_NF_IPTABLES
if IP_NF_IPTABLES
# The matches.
-config IP_NF_MATCH_ADDRTYPE
- tristate '"addrtype" address type match support'
- depends on NETFILTER_ADVANCED
- help
- This option allows you to match what routing thinks of an address,
- eg. UNICAST, LOCAL, BROADCAST, ...
-
- If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
-
config IP_NF_MATCH_AH
tristate '"ah" match support'
depends on NETFILTER_ADVANCED
@@ -86,11 +99,21 @@ config IP_NF_MATCH_AH
config IP_NF_MATCH_ECN
tristate '"ecn" match support'
depends on NETFILTER_ADVANCED
- help
- This option adds a `ECN' match, which allows you to match against
- the IPv4 and TCP header ECN fields.
+ select NETFILTER_XT_MATCH_ECN
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_ECN.
+
+config IP_NF_MATCH_RPFILTER
+ tristate '"rpfilter" reverse path filter match support'
+ depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
+ ---help---
+ This option allows you to match packets whose replies would
+ go out via the interface the packet came in.
To compile it as a module, choose M here. If unsure, say N.
+ The module will be called ipt_rpfilter.
config IP_NF_MATCH_TTL
tristate '"ttl" match support'
@@ -123,17 +146,21 @@ config IP_NF_TARGET_REJECT
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_LOG
- tristate "LOG target support"
- default m if NETFILTER_ADVANCED=n
+config IP_NF_TARGET_SYNPROXY
+ tristate "SYNPROXY target support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
help
- This option adds a `LOG' target, which allows you to create rules in
- any iptables table which records the packet header to the syslog.
+ The SYNPROXY target allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
- To compile it as a module, choose M here. If unsure, say N.
+ To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_ULOG
- tristate "ULOG target support"
+ tristate "ULOG target support (obsolete)"
default m if NETFILTER_ADVANCED=n
---help---
@@ -152,25 +179,22 @@ config IP_NF_TARGET_ULOG
To compile it as a module, choose M here. If unsure, say N.
# NAT + specific targets: nf_conntrack
-config NF_NAT
- tristate "Full NAT"
+config NF_NAT_IPV4
+ tristate "IPv4 NAT"
depends on NF_CONNTRACK_IPV4
default m if NETFILTER_ADVANCED=n
+ select NF_NAT
help
- The Full NAT option allows masquerading, port forwarding and other
+ The IPv4 NAT option allows masquerading, port forwarding and other
forms of full Network Address Port Translation. It is controlled by
the `nat' table in iptables: see the man page for iptables(8).
To compile it as a module, choose M here. If unsure, say N.
-config NF_NAT_NEEDED
- bool
- depends on NF_NAT
- default y
+if NF_NAT_IPV4
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
- depends on NF_NAT
default m if NETFILTER_ADVANCED=n
help
Masquerading is a special case of NAT: all outgoing connections are
@@ -183,31 +207,29 @@ config IP_NF_TARGET_MASQUERADE
config IP_NF_TARGET_NETMAP
tristate "NETMAP target support"
- depends on NF_NAT
depends on NETFILTER_ADVANCED
- help
- NETMAP is an implementation of static 1:1 NAT mapping of network
- addresses. It maps the network address part, while keeping the host
- address part intact.
-
- To compile it as a module, choose M here. If unsure, say N.
+ select NETFILTER_XT_TARGET_NETMAP
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_NETMAP.
config IP_NF_TARGET_REDIRECT
tristate "REDIRECT target support"
- depends on NF_NAT
depends on NETFILTER_ADVANCED
- help
- REDIRECT is a special case of NAT: all incoming connections are
- mapped onto the incoming interface's address, causing the packets to
- come to the local machine instead of passing through. This is
- useful for transparent proxies.
+ select NETFILTER_XT_TARGET_REDIRECT
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_REDIRECT.
- To compile it as a module, choose M here. If unsure, say N.
+endif
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
- depends on NF_NAT
+ depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
---help---
This module implements an Application Layer Gateway (ALG) for
@@ -227,61 +249,21 @@ config NF_NAT_SNMP_BASIC
# <expr> '&&' <expr> (6)
#
# (6) Returns the result of min(/expr/, /expr/).
-config NF_NAT_PROTO_DCCP
- tristate
- depends on NF_NAT && NF_CT_PROTO_DCCP
- default NF_NAT && NF_CT_PROTO_DCCP
config NF_NAT_PROTO_GRE
tristate
- depends on NF_NAT && NF_CT_PROTO_GRE
-
-config NF_NAT_PROTO_UDPLITE
- tristate
- depends on NF_NAT && NF_CT_PROTO_UDPLITE
- default NF_NAT && NF_CT_PROTO_UDPLITE
-
-config NF_NAT_PROTO_SCTP
- tristate
- default NF_NAT && NF_CT_PROTO_SCTP
- depends on NF_NAT && NF_CT_PROTO_SCTP
- select LIBCRC32C
-
-config NF_NAT_FTP
- tristate
- depends on NF_CONNTRACK && NF_NAT
- default NF_NAT && NF_CONNTRACK_FTP
-
-config NF_NAT_IRC
- tristate
- depends on NF_CONNTRACK && NF_NAT
- default NF_NAT && NF_CONNTRACK_IRC
-
-config NF_NAT_TFTP
- tristate
- depends on NF_CONNTRACK && NF_NAT
- default NF_NAT && NF_CONNTRACK_TFTP
-
-config NF_NAT_AMANDA
- tristate
- depends on NF_CONNTRACK && NF_NAT
- default NF_NAT && NF_CONNTRACK_AMANDA
+ depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
config NF_NAT_PPTP
tristate
- depends on NF_CONNTRACK && NF_NAT
- default NF_NAT && NF_CONNTRACK_PPTP
+ depends on NF_CONNTRACK && NF_NAT_IPV4
+ default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
select NF_NAT_PROTO_GRE
config NF_NAT_H323
tristate
- depends on NF_CONNTRACK && NF_NAT
- default NF_NAT && NF_CONNTRACK_H323
-
-config NF_NAT_SIP
- tristate
- depends on NF_CONNTRACK && NF_NAT
- default NF_NAT && NF_CONNTRACK_SIP
+ depends on NF_CONNTRACK && NF_NAT_IPV4
+ default NF_NAT_IPV4 && NF_CONNTRACK_H323
# mangle + specific targets
config IP_NF_MANGLE
@@ -295,8 +277,8 @@ config IP_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_CLUSTERIP
- tristate "CLUSTERIP target support (EXPERIMENTAL)"
- depends on IP_NF_MANGLE && EXPERIMENTAL
+ tristate "CLUSTERIP target support"
+ depends on IP_NF_MANGLE
depends on NF_CONNTRACK_IPV4
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
@@ -334,7 +316,6 @@ config IP_NF_TARGET_TTL
# raw + specific targets
config IP_NF_RAW
tristate 'raw table support (required for NOTRACK/TRACE)'
- depends on NETFILTER_ADVANCED
help
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 19eb59d0103..90b82405331 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,32 +10,28 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
endif
endif
-nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
-iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
-
# connection tracking
obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
-obj-$(CONFIG_NF_NAT) += nf_nat.o
+nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
# NAT helpers (nf_conntrack)
-obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
-obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
-obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
-obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
-obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
-obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+
+obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
@@ -43,23 +39,20 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
# matches
-obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
-obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
+obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
# targets
obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
-obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
-obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
-obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
# generic ARP tables
@@ -68,6 +61,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
# just filtering instance of ARP tables for now
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
-
-obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
-
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340a28d..f95b6f93814 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -6,6 +6,7 @@
* Some ARP specific bits are:
*
* Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net>
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -76,7 +77,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
}
/*
- * Unfortunatly, _b and _mask are not aligned to an int (or long int)
+ * Unfortunately, _b and _mask are not aligned to an int (or long int)
* Some arches dont care, unrolling the loop is a win on them.
* For other arches, we only have a 16bit alignement.
*/
@@ -221,9 +222,8 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
static unsigned int
arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- pr_err("arp_tables: error: '%s'\n",
- (const char *)par->targinfo);
+ net_err_ratelimited("arp_tables: error: '%s'\n",
+ (const char *)par->targinfo);
return NF_DROP;
}
@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
void *table_base;
const struct xt_table_info *private;
struct xt_action_param acpar;
+ unsigned int addend;
if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
return NF_DROP;
@@ -267,8 +268,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
- xt_info_rdlock_bh();
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -301,7 +308,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
- verdict = (unsigned)(-v) - 1;
+ verdict = (unsigned int)(-v) - 1;
break;
}
e = back;
@@ -338,7 +345,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
/* Verdict */
break;
} while (!acpar.hotdrop);
- xt_info_rdunlock_bh();
+ xt_write_recseq_end(addend);
+ local_bh_enable();
if (acpar.hotdrop)
return NF_DROP;
@@ -710,42 +718,25 @@ static void get_counters(const struct xt_table_info *t,
struct arpt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu = get_cpu();
-
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- *
- * Bottom half has to be disabled to prevent deadlock
- * if new softirq were to run and call ipt_do_table
- */
- local_bh_disable();
- i = 0;
- xt_entry_foreach(iter, t->entries[curcpu], t->size) {
- SET_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
- ++i;
- }
- local_bh_enable();
- /* Processing counters from other cpus, we can let bottom half enabled,
- * (preemption is disabled)
- */
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
i = 0;
- local_bh_disable();
- xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
- ADD_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
++i;
}
- xt_info_wrunlock(cpu);
- local_bh_enable();
}
- put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +750,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
* about).
*/
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc(countersize);
+ counters = vzalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -883,6 +874,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -915,7 +907,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -972,7 +964,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
}
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
duprintf("t->private->number = %u\n",
@@ -1007,7 +999,7 @@ static int __do_replace(struct net *net, const char *name,
struct arpt_entry *iter;
ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
@@ -1015,7 +1007,7 @@ static int __do_replace(struct net *net, const char *name,
t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
"arptable_%s", name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1052,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
- sizeof(struct xt_counters) * num_counters) != 0)
- ret = -EFAULT;
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+ }
vfree(counters);
xt_table_unlock(t);
return ret;
@@ -1082,6 +1076,7 @@ static int do_replace(struct net *net, const void __user *user,
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
@@ -1130,6 +1125,7 @@ static int do_add_counters(struct net *net, const void __user *user,
int ret = 0;
void *loc_cpu_entry;
struct arpt_entry *iter;
+ unsigned int addend;
#ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp;
@@ -1170,7 +1166,7 @@ static int do_add_counters(struct net *net, const void __user *user,
}
t = xt_find_table_lock(net, NFPROTO_ARP, name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
}
@@ -1186,12 +1182,12 @@ static int do_add_counters(struct net *net, const void __user *user,
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
- xt_info_wrlock(curcpu);
+ addend = xt_write_recseq_begin();
xt_entry_foreach(iter, loc_cpu_entry, private->size) {
ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
- xt_info_wrunlock(curcpu);
+ xt_write_recseq_end(addend);
unlock_up_free:
local_bh_enable();
xt_table_unlock(t);
@@ -1350,6 +1346,7 @@ static int translate_compat_table(const char *name,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(NFPROTO_ARP);
+ xt_compat_init_offsets(NFPROTO_ARP, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1503,6 +1500,7 @@ static int compat_do_replace(struct net *net, void __user *user,
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
@@ -1543,7 +1541,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1656,7 +1654,7 @@ static int compat_get_entries(struct net *net,
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
@@ -1687,7 +1685,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1708,7 +1706,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1732,7 +1730,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1755,6 +1753,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
ret = -EFAULT;
break;
}
+ rev.name[sizeof(rev.name)-1] = 0;
try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
rev.revision, 1, &ret),
@@ -1886,7 +1885,7 @@ static int __init arp_tables_init(void)
if (ret < 0)
goto err1;
- /* Noone else will be downing sem now, so we won't sleep */
+ /* No one else will be downing sem now, so we won't sleep */
ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
if (ret < 0)
goto err2;
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index b8ddcc480ed..a5e52a9f0a1 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par)
if (mangle->flags & ~ARPT_MANGLE_MASK ||
!(mangle->flags & ARPT_MANGLE_MASK))
- return false;
+ return -EINVAL;
if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
mangle->target != XT_CONTINUE)
- return false;
- return true;
+ return -EINVAL;
+ return 0;
}
static struct xt_target arpt_mangle_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 79ca5e70d49..802ddecb30b 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -27,13 +27,14 @@ static const struct xt_table packet_filter = {
/* The work comes in here from netfilter.c */
static unsigned int
-arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net = dev_net((in != NULL) ? in : out);
- return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
+ return arpt_do_table(skb, ops->hooknum, in, out,
+ net->ipv4.arptable_filter);
}
static struct nf_hook_ops *arpfilter_ops __read_mostly;
@@ -48,9 +49,7 @@ static int __net_init arptable_filter_net_init(struct net *net)
net->ipv4.arptable_filter =
arpt_register_table(net, &packet_filter, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.arptable_filter))
- return PTR_ERR(net->ipv4.arptable_filter);
- return 0;
+ return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
}
static void __net_exit arptable_filter_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
deleted file mode 100644
index d2c1311cb28..00000000000
--- a/net/ipv4/netfilter/ip_queue.c
+++ /dev/null
@@ -1,637 +0,0 @@
-/*
- * This is a module which is used for queueing IPv4 packets and
- * communicating with userspace via netlink.
- *
- * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
- * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/notifier.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netlink.h>
-#include <linux/spinlock.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/security.h>
-#include <linux/net.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <net/net_namespace.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <net/netfilter/nf_queue.h>
-#include <net/ip.h>
-
-#define IPQ_QMAX_DEFAULT 1024
-#define IPQ_PROC_FS_NAME "ip_queue"
-#define NET_IPQ_QMAX 2088
-#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-
-typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
-
-static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
-static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_SPINLOCK(queue_lock);
-static int peer_pid __read_mostly;
-static unsigned int copy_range __read_mostly;
-static unsigned int queue_total;
-static unsigned int queue_dropped = 0;
-static unsigned int queue_user_dropped = 0;
-static struct sock *ipqnl __read_mostly;
-static LIST_HEAD(queue_list);
-static DEFINE_MUTEX(ipqnl_mutex);
-
-static inline void
-__ipq_enqueue_entry(struct nf_queue_entry *entry)
-{
- list_add_tail(&entry->list, &queue_list);
- queue_total++;
-}
-
-static inline int
-__ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status = 0;
-
- switch(mode) {
- case IPQ_COPY_NONE:
- case IPQ_COPY_META:
- copy_mode = mode;
- copy_range = 0;
- break;
-
- case IPQ_COPY_PACKET:
- if (range > 0xFFFF)
- range = 0xFFFF;
- copy_range = range;
- copy_mode = mode;
- break;
-
- default:
- status = -EINVAL;
-
- }
- return status;
-}
-
-static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
-
-static inline void
-__ipq_reset(void)
-{
- peer_pid = 0;
- net_disable_timestamp();
- __ipq_set_mode(IPQ_COPY_NONE, 0);
- __ipq_flush(NULL, 0);
-}
-
-static struct nf_queue_entry *
-ipq_find_dequeue_entry(unsigned long id)
-{
- struct nf_queue_entry *entry = NULL, *i;
-
- spin_lock_bh(&queue_lock);
-
- list_for_each_entry(i, &queue_list, list) {
- if ((unsigned long)i == id) {
- entry = i;
- break;
- }
- }
-
- if (entry) {
- list_del(&entry->list);
- queue_total--;
- }
-
- spin_unlock_bh(&queue_lock);
- return entry;
-}
-
-static void
-__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- struct nf_queue_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &queue_list, list) {
- if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue_total--;
- nf_reinject(entry, NF_DROP);
- }
- }
-}
-
-static void
-ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
-{
- spin_lock_bh(&queue_lock);
- __ipq_flush(cmpfn, data);
- spin_unlock_bh(&queue_lock);
-}
-
-static struct sk_buff *
-ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
-{
- sk_buff_data_t old_tail;
- size_t size = 0;
- size_t data_len = 0;
- struct sk_buff *skb;
- struct ipq_packet_msg *pmsg;
- struct nlmsghdr *nlh;
- struct timeval tv;
-
- switch (ACCESS_ONCE(copy_mode)) {
- case IPQ_COPY_META:
- case IPQ_COPY_NONE:
- size = NLMSG_SPACE(sizeof(*pmsg));
- break;
-
- case IPQ_COPY_PACKET:
- if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
- (*errp = skb_checksum_help(entry->skb)))
- return NULL;
-
- data_len = ACCESS_ONCE(copy_range);
- if (data_len == 0 || data_len > entry->skb->len)
- data_len = entry->skb->len;
-
- size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
- break;
-
- default:
- *errp = -EINVAL;
- return NULL;
- }
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- goto nlmsg_failure;
-
- old_tail = skb->tail;
- nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
- pmsg = NLMSG_DATA(nlh);
- memset(pmsg, 0, sizeof(*pmsg));
-
- pmsg->packet_id = (unsigned long )entry;
- pmsg->data_len = data_len;
- tv = ktime_to_timeval(entry->skb->tstamp);
- pmsg->timestamp_sec = tv.tv_sec;
- pmsg->timestamp_usec = tv.tv_usec;
- pmsg->mark = entry->skb->mark;
- pmsg->hook = entry->hook;
- pmsg->hw_protocol = entry->skb->protocol;
-
- if (entry->indev)
- strcpy(pmsg->indev_name, entry->indev->name);
- else
- pmsg->indev_name[0] = '\0';
-
- if (entry->outdev)
- strcpy(pmsg->outdev_name, entry->outdev->name);
- else
- pmsg->outdev_name[0] = '\0';
-
- if (entry->indev && entry->skb->dev) {
- pmsg->hw_type = entry->skb->dev->type;
- pmsg->hw_addrlen = dev_parse_header(entry->skb,
- pmsg->hw_addr);
- }
-
- if (data_len)
- if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
- BUG();
-
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-
-nlmsg_failure:
- *errp = -EINVAL;
- printk(KERN_ERR "ip_queue: error creating packet message\n");
- return NULL;
-}
-
-static int
-ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
-{
- int status = -EINVAL;
- struct sk_buff *nskb;
-
- if (copy_mode == IPQ_COPY_NONE)
- return -EAGAIN;
-
- nskb = ipq_build_packet_message(entry, &status);
- if (nskb == NULL)
- return status;
-
- spin_lock_bh(&queue_lock);
-
- if (!peer_pid)
- goto err_out_free_nskb;
-
- if (queue_total >= queue_maxlen) {
- queue_dropped++;
- status = -ENOSPC;
- if (net_ratelimit())
- printk (KERN_WARNING "ip_queue: full at %d entries, "
- "dropping packets(s). Dropped: %d\n", queue_total,
- queue_dropped);
- goto err_out_free_nskb;
- }
-
- /* netlink_unicast will either free the nskb or attach it to a socket */
- status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
- if (status < 0) {
- queue_user_dropped++;
- goto err_out_unlock;
- }
-
- __ipq_enqueue_entry(entry);
-
- spin_unlock_bh(&queue_lock);
- return status;
-
-err_out_free_nskb:
- kfree_skb(nskb);
-
-err_out_unlock:
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
-{
- int diff;
- struct iphdr *user_iph = (struct iphdr *)v->payload;
- struct sk_buff *nskb;
-
- if (v->data_len < sizeof(*user_iph))
- return 0;
- diff = v->data_len - e->skb->len;
- if (diff < 0) {
- if (pskb_trim(e->skb, v->data_len))
- return -ENOMEM;
- } else if (diff > 0) {
- if (v->data_len > 0xFFFF)
- return -EINVAL;
- if (diff > skb_tailroom(e->skb)) {
- nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
- diff, GFP_ATOMIC);
- if (!nskb) {
- printk(KERN_WARNING "ip_queue: error "
- "in mangle, dropping packet\n");
- return -ENOMEM;
- }
- kfree_skb(e->skb);
- e->skb = nskb;
- }
- skb_put(e->skb, diff);
- }
- if (!skb_make_writable(e->skb, v->data_len))
- return -ENOMEM;
- skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
- e->skb->ip_summed = CHECKSUM_NONE;
-
- return 0;
-}
-
-static int
-ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
-{
- struct nf_queue_entry *entry;
-
- if (vmsg->value > NF_MAX_VERDICT)
- return -EINVAL;
-
- entry = ipq_find_dequeue_entry(vmsg->id);
- if (entry == NULL)
- return -ENOENT;
- else {
- int verdict = vmsg->value;
-
- if (vmsg->data_len && vmsg->data_len == len)
- if (ipq_mangle_ipv4(vmsg, entry) < 0)
- verdict = NF_DROP;
-
- nf_reinject(entry, verdict);
- return 0;
- }
-}
-
-static int
-ipq_set_mode(unsigned char mode, unsigned int range)
-{
- int status;
-
- spin_lock_bh(&queue_lock);
- status = __ipq_set_mode(mode, range);
- spin_unlock_bh(&queue_lock);
- return status;
-}
-
-static int
-ipq_receive_peer(struct ipq_peer_msg *pmsg,
- unsigned char type, unsigned int len)
-{
- int status = 0;
-
- if (len < sizeof(*pmsg))
- return -EINVAL;
-
- switch (type) {
- case IPQM_MODE:
- status = ipq_set_mode(pmsg->msg.mode.value,
- pmsg->msg.mode.range);
- break;
-
- case IPQM_VERDICT:
- if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
- status = -EINVAL;
- else
- status = ipq_set_verdict(&pmsg->msg.verdict,
- len - sizeof(*pmsg));
- break;
- default:
- status = -EINVAL;
- }
- return status;
-}
-
-static int
-dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
-{
- if (entry->indev)
- if (entry->indev->ifindex == ifindex)
- return 1;
- if (entry->outdev)
- if (entry->outdev->ifindex == ifindex)
- return 1;
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (entry->skb->nf_bridge) {
- if (entry->skb->nf_bridge->physindev &&
- entry->skb->nf_bridge->physindev->ifindex == ifindex)
- return 1;
- if (entry->skb->nf_bridge->physoutdev &&
- entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
- return 1;
- }
-#endif
- return 0;
-}
-
-static void
-ipq_dev_drop(int ifindex)
-{
- ipq_flush(dev_cmp, ifindex);
-}
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
-
-static inline void
-__ipq_rcv_skb(struct sk_buff *skb)
-{
- int status, type, pid, flags, nlmsglen, skblen;
- struct nlmsghdr *nlh;
-
- skblen = skb->len;
- if (skblen < sizeof(*nlh))
- return;
-
- nlh = nlmsg_hdr(skb);
- nlmsglen = nlh->nlmsg_len;
- if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
- return;
-
- pid = nlh->nlmsg_pid;
- flags = nlh->nlmsg_flags;
-
- if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
- RCV_SKB_FAIL(-EINVAL);
-
- if (flags & MSG_TRUNC)
- RCV_SKB_FAIL(-ECOMM);
-
- type = nlh->nlmsg_type;
- if (type < NLMSG_NOOP || type >= IPQM_MAX)
- RCV_SKB_FAIL(-EINVAL);
-
- if (type <= IPQM_BASE)
- return;
-
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- spin_lock_bh(&queue_lock);
-
- if (peer_pid) {
- if (peer_pid != pid) {
- spin_unlock_bh(&queue_lock);
- RCV_SKB_FAIL(-EBUSY);
- }
- } else {
- net_enable_timestamp();
- peer_pid = pid;
- }
-
- spin_unlock_bh(&queue_lock);
-
- status = ipq_receive_peer(NLMSG_DATA(nlh), type,
- nlmsglen - NLMSG_LENGTH(0));
- if (status < 0)
- RCV_SKB_FAIL(status);
-
- if (flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
-}
-
-static void
-ipq_rcv_skb(struct sk_buff *skb)
-{
- mutex_lock(&ipqnl_mutex);
- __ipq_rcv_skb(skb);
- mutex_unlock(&ipqnl_mutex);
-}
-
-static int
-ipq_rcv_dev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = ptr;
-
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
- /* Drop any packets associated with the downed device */
- if (event == NETDEV_DOWN)
- ipq_dev_drop(dev->ifindex);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_dev_notifier = {
- .notifier_call = ipq_rcv_dev_event,
-};
-
-static int
-ipq_rcv_nl_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct netlink_notify *n = ptr;
-
- if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
- spin_lock_bh(&queue_lock);
- if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
- __ipq_reset();
- spin_unlock_bh(&queue_lock);
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ipq_nl_notifier = {
- .notifier_call = ipq_rcv_nl_event,
-};
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *ipq_sysctl_header;
-
-static ctl_table ipq_table[] = {
- {
- .procname = NET_IPQ_QMAX_NAME,
- .data = &queue_maxlen,
- .maxlen = sizeof(queue_maxlen),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- { }
-};
-#endif
-
-#ifdef CONFIG_PROC_FS
-static int ip_queue_show(struct seq_file *m, void *v)
-{
- spin_lock_bh(&queue_lock);
-
- seq_printf(m,
- "Peer PID : %d\n"
- "Copy mode : %hu\n"
- "Copy range : %u\n"
- "Queue length : %u\n"
- "Queue max. length : %u\n"
- "Queue dropped : %u\n"
- "Netlink dropped : %u\n",
- peer_pid,
- copy_mode,
- copy_range,
- queue_total,
- queue_maxlen,
- queue_dropped,
- queue_user_dropped);
-
- spin_unlock_bh(&queue_lock);
- return 0;
-}
-
-static int ip_queue_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ip_queue_show, NULL);
-}
-
-static const struct file_operations ip_queue_proc_fops = {
- .open = ip_queue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
-};
-#endif
-
-static const struct nf_queue_handler nfqh = {
- .name = "ip_queue",
- .outfn = &ipq_enqueue_packet,
-};
-
-static int __init ip_queue_init(void)
-{
- int status = -ENOMEM;
- struct proc_dir_entry *proc __maybe_unused;
-
- netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
- ipq_rcv_skb, NULL, THIS_MODULE);
- if (ipqnl == NULL) {
- printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
- goto cleanup_netlink_notifier;
- }
-
-#ifdef CONFIG_PROC_FS
- proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
- &ip_queue_proc_fops);
- if (!proc) {
- printk(KERN_ERR "ip_queue: failed to create proc entry\n");
- goto cleanup_ipqnl;
- }
-#endif
- register_netdevice_notifier(&ipq_dev_notifier);
-#ifdef CONFIG_SYSCTL
- ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table);
-#endif
- status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh);
- if (status < 0) {
- printk(KERN_ERR "ip_queue: failed to register queue handler\n");
- goto cleanup_sysctl;
- }
- return status;
-
-cleanup_sysctl:
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-cleanup_ipqnl: __maybe_unused
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
-cleanup_netlink_notifier:
- netlink_unregister_notifier(&ipq_nl_notifier);
- return status;
-}
-
-static void __exit ip_queue_fini(void)
-{
- nf_unregister_queue_handlers(&nfqh);
-
- ipq_flush(NULL, 0);
-
-#ifdef CONFIG_SYSCTL
- unregister_sysctl_table(ipq_sysctl_header);
-#endif
- unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
-
- netlink_kernel_release(ipqnl);
- mutex_lock(&ipqnl_mutex);
- mutex_unlock(&ipqnl_mutex);
-
- netlink_unregister_notifier(&ipq_nl_notifier);
-}
-
-MODULE_DESCRIPTION("IPv4 packet queue handler");
-MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL);
-
-module_init(ip_queue_init);
-module_exit(ip_queue_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d633b3b..99e810f8467 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -3,6 +3,7 @@
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -68,15 +69,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)
}
EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
-/*
- We keep a set of rules for each CPU, so we can avoid write-locking
- them in the softirq when updating the counters and therefore
- only need to read-lock in the softirq; doing a write_lock_bh() in user
- context stops packets coming through and allows user context to read
- the counters or update the rules.
-
- Hence the start of any table is given by get_table() below. */
-
/* Returns whether matches rule or not. */
/* Performance critical - called for every packet */
static inline bool
@@ -162,8 +154,7 @@ ip_checkentry(const struct ipt_ip *ip)
static unsigned int
ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (net_ratelimit())
- pr_info("error: `%s'\n", (const char *)par->targinfo);
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
return NF_DROP;
}
@@ -192,8 +183,7 @@ ipt_get_target_c(const struct ipt_entry *e)
return ipt_get_target((struct ipt_entry *)e);
}
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
static const char *const hooknames[] = {
[NF_INET_PRE_ROUTING] = "PREROUTING",
[NF_INET_LOCAL_IN] = "INPUT",
@@ -269,6 +259,7 @@ static void trace_packet(const struct sk_buff *skb,
const char *hookname, *chainname, *comment;
const struct ipt_entry *iter;
unsigned int rulenum = 0;
+ struct net *net = dev_net(in ? in : out);
table_base = private->entries[smp_processor_id()];
root = get_entry(table_base, private->hook_entry[hook]);
@@ -281,7 +272,7 @@ static void trace_packet(const struct sk_buff *skb,
&chainname, &comment, &rulenum) != 0)
break;
- nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
+ nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo,
"TRACE: %s:%s:%s:%u ",
tablename, chainname, comment, rulenum);
}
@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,
unsigned int *stackptr, origptr, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
+ unsigned int addend;
/* Initialization */
ip = ip_hdr(skb);
@@ -331,9 +323,15 @@ ipt_do_table(struct sk_buff *skb,
acpar.hooknum = hook;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- xt_info_rdlock_bh();
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
private = table->private;
cpu = smp_processor_id();
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
table_base = private->entries[cpu];
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
@@ -369,8 +367,7 @@ ipt_do_table(struct sk_buff *skb,
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
trace_packet(skb, hook, in, out,
@@ -384,10 +381,10 @@ ipt_do_table(struct sk_buff *skb,
if (v < 0) {
/* Pop from stack? */
if (v != XT_RETURN) {
- verdict = (unsigned)(-v) - 1;
+ verdict = (unsigned int)(-v) - 1;
break;
}
- if (*stackptr == 0) {
+ if (*stackptr <= origptr) {
e = get_entry(table_base,
private->underflow[hook]);
pr_debug("Underflow (this is normal) "
@@ -427,10 +424,12 @@ ipt_do_table(struct sk_buff *skb,
/* Verdict */
break;
} while (!acpar.hotdrop);
- xt_info_rdunlock_bh();
pr_debug("Exiting %s; resetting sp from %u to %u\n",
__func__, *stackptr, origptr);
*stackptr = origptr;
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
#else
@@ -571,7 +570,7 @@ check_entry(const struct ipt_entry *e, const char *name)
const struct xt_entry_target *t;
if (!ip_checkentry(&e->ip)) {
- duprintf("ip check failed %p %s.\n", e, par->match->name);
+ duprintf("ip check failed %p %s.\n", e, name);
return -EINVAL;
}
@@ -884,42 +883,25 @@ get_counters(const struct xt_table_info *t,
struct ipt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu = get_cpu();
-
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU.
- *
- * Bottom half has to be disabled to prevent deadlock
- * if new softirq were to run and call ipt_do_table
- */
- local_bh_disable();
- i = 0;
- xt_entry_foreach(iter, t->entries[curcpu], t->size) {
- SET_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
- ++i;
- }
- local_bh_enable();
- /* Processing counters from other cpus, we can let bottom half enabled,
- * (preemption is disabled)
- */
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
i = 0;
- local_bh_disable();
- xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
- ADD_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
++i; /* macro does multi eval of i */
}
- xt_info_wrunlock(cpu);
- local_bh_enable();
}
- put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -932,7 +914,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc(countersize);
+ counters = vzalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1080,6 +1062,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1112,7 +1095,7 @@ static int get_info(struct net *net, void __user *user,
#endif
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
#ifdef CONFIG_COMPAT
@@ -1171,7 +1154,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
}
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
duprintf("t->private->number = %u\n", private->number);
if (get.size == private->size)
@@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct ipt_entry *iter;
ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
@@ -1211,7 +1194,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
"iptable_%s", name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free_newinfo_counters_untrans;
}
@@ -1248,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
- sizeof(struct xt_counters) * num_counters) != 0)
- ret = -EFAULT;
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+ }
vfree(counters);
xt_table_unlock(t);
return ret;
@@ -1278,6 +1263,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
@@ -1327,6 +1313,7 @@ do_add_counters(struct net *net, const void __user *user,
int ret = 0;
void *loc_cpu_entry;
struct ipt_entry *iter;
+ unsigned int addend;
#ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp;
@@ -1367,7 +1354,7 @@ do_add_counters(struct net *net, const void __user *user,
}
t = xt_find_table_lock(net, AF_INET, name);
- if (!t || IS_ERR(t)) {
+ if (IS_ERR_OR_NULL(t)) {
ret = t ? PTR_ERR(t) : -ENOENT;
goto free;
}
@@ -1383,12 +1370,12 @@ do_add_counters(struct net *net, const void __user *user,
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
- xt_info_wrlock(curcpu);
+ addend = xt_write_recseq_begin();
xt_entry_foreach(iter, loc_cpu_entry, private->size) {
ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
++i;
}
- xt_info_wrunlock(curcpu);
+ xt_write_recseq_end(addend);
unlock_up_free:
local_bh_enable();
xt_table_unlock(t);
@@ -1681,6 +1668,7 @@ translate_compat_table(struct net *net,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET);
+ xt_compat_init_offsets(AF_INET, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1822,6 +1810,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
+ tmp.name[sizeof(tmp.name)-1] = 0;
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
@@ -1864,7 +1853,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1949,7 +1938,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
- if (t && !IS_ERR(t)) {
+ if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
duprintf("t->private->number = %u\n", private->number);
@@ -1979,7 +1968,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2001,7 +1990,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2026,7 +2015,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2051,6 +2040,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
ret = -EFAULT;
break;
}
+ rev.name[sizeof(rev.name)-1] = 0;
if (cmd == IPT_SO_GET_REVISION_TARGET)
target = 1;
@@ -2245,7 +2235,7 @@ static int __init ip_tables_init(void)
if (ret < 0)
goto err1;
- /* Noone else will be downing sem now, so we won't sleep */
+ /* No one else will be downing sem now, so we won't sleep */
ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
if (ret < 0)
goto err2;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a489765..2510c02c2d2 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -28,6 +28,7 @@
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/checksum.h>
#include <net/ip.h>
@@ -57,15 +58,21 @@ struct clusterip_config {
struct rcu_head rcu;
};
-static LIST_HEAD(clusterip_configs);
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+#endif
-/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_SPINLOCK(clusterip_lock);
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+ struct list_head configs;
+ /* lock protects the configs list */
+ spinlock_t lock;
#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
-static struct proc_dir_entry *clusterip_procdir;
+ struct proc_dir_entry *procdir;
#endif
+};
static inline void
clusterip_config_get(struct clusterip_config *c)
@@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
+ struct net *net = dev_net(c->dev);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
local_bh_disable();
- if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+ if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
list_del_rcu(&c->list);
- spin_unlock(&clusterip_lock);
+ spin_unlock(&cn->lock);
local_bh_enable();
dev_mc_del(c->dev, c->clustermac);
@@ -105,7 +115,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
* functions are also incrementing the refcount on their own,
* so it's safe to remove the entry even if it's in use. */
#ifdef CONFIG_PROC_FS
- remove_proc_entry(c->pde->name, c->pde->parent);
+ proc_remove(c->pde);
#endif
return;
}
@@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c)
}
static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
+__clusterip_config_find(struct net *net, __be32 clusterip)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- list_for_each_entry_rcu(c, &clusterip_configs, list) {
+ list_for_each_entry_rcu(c, &cn->configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip)
}
static inline struct clusterip_config *
-clusterip_config_find_get(__be32 clusterip, int entry)
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
{
struct clusterip_config *c;
rcu_read_lock_bh();
- c = __clusterip_config_find(clusterip);
+ c = __clusterip_config_find(net, clusterip);
if (c) {
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
@@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
struct net_device *dev)
{
struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
@@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
- clusterip_procdir,
+ cn->procdir,
&clusterip_proc_fops, c);
if (!c->pde) {
kfree(c);
@@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- spin_lock_bh(&clusterip_lock);
- list_add_rcu(&c->list, &clusterip_configs);
- spin_unlock_bh(&clusterip_lock);
+ spin_lock_bh(&cn->lock);
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
return c;
}
@@ -246,8 +258,7 @@ clusterip_hashfn(const struct sk_buff *skb,
dport = ports[1];
}
} else {
- if (net_ratelimit())
- pr_info("unknown protocol %u\n", iph->protocol);
+ net_info_ratelimited("unknown protocol %u\n", iph->protocol);
}
switch (config->hash_mode) {
@@ -300,19 +311,14 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
* that the ->target() function isn't called after ->destroy() */
ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL) {
- pr_info("no conntrack!\n");
- /* FIXME: need to drop invalid ones, since replies
- * to outgoing connections of other nodes will be
- * marked as INVALID */
+ if (ct == NULL)
return NF_DROP;
- }
/* special case: ICMP error handling. conntrack distinguishes between
* error messages (RELATED) and information requests (see below) */
if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
(ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY))
+ ctinfo == IP_CT_RELATED_REPLY))
return XT_CONTINUE;
/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -322,19 +328,19 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
hash = clusterip_hashfn(skb, cipinfo->config);
switch (ctinfo) {
- case IP_CT_NEW:
- ct->mark = hash;
- break;
- case IP_CT_RELATED:
- case IP_CT_RELATED+IP_CT_IS_REPLY:
- /* FIXME: we don't handle expectations at the
- * moment. they can arrive on a different node than
- * the master connection (e.g. FTP passive mode) */
- case IP_CT_ESTABLISHED:
- case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
- break;
- default:
- break;
+ case IP_CT_NEW:
+ ct->mark = hash;
+ break;
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ /* FIXME: we don't handle expectations at the moment.
+ * They can arrive on a different node than
+ * the master connection (e.g. FTP passive mode) */
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+ default: /* Prevent gcc warnings */
+ break;
}
#ifdef DEBUG
@@ -376,7 +382,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
/* FIXME: further sanity checks */
- config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+ config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
if (!config) {
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
pr_info("no config found for %pI4, need 'new'\n",
@@ -390,7 +396,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- dev = dev_get_by_name(&init_net, e->ip.iniface);
+ dev = dev_get_by_name(par->net, e->ip.iniface);
if (!dev) {
pr_info("no such interface %s\n",
e->ip.iniface);
@@ -400,7 +406,6 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
config = clusterip_config_init(cipinfo,
e->ip.dst.s_addr, dev);
if (!config) {
- pr_info("cannot allocate config\n");
dev_put(dev);
return -ENOMEM;
}
@@ -490,7 +495,7 @@ static void arp_print(struct arp_payload *payload)
#endif
static unsigned int
-arp_mangle(unsigned int hook,
+arp_mangle(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -499,6 +504,7 @@ arp_mangle(unsigned int hook,
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
+ struct net *net = dev_net(in ? in : out);
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -515,7 +521,7 @@ arp_mangle(unsigned int hook,
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
- c = clusterip_config_find_get(payload->src_ip, 0);
+ c = clusterip_config_find_get(net, payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
@@ -638,7 +644,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- struct clusterip_config *c = PDE(inode)->data;
+ struct clusterip_config *c = PDE_DATA(inode);
sf->private = c;
@@ -650,7 +656,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
static int clusterip_proc_release(struct inode *inode, struct file *file)
{
- struct clusterip_config *c = PDE(inode)->data;
+ struct clusterip_config *c = PDE_DATA(inode);
int ret;
ret = seq_release(inode, file);
@@ -664,20 +670,28 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *ofs)
{
- struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
+ struct clusterip_config *c = PDE_DATA(file_inode(file));
#define PROC_WRITELEN 10
char buffer[PROC_WRITELEN+1];
unsigned long nodenum;
+ int rc;
- if (copy_from_user(buffer, input, PROC_WRITELEN))
+ if (size > PROC_WRITELEN)
+ return -EIO;
+ if (copy_from_user(buffer, input, size))
return -EFAULT;
+ buffer[size] = 0;
if (*buffer == '+') {
- nodenum = simple_strtoul(buffer+1, NULL, 10);
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
if (clusterip_add_node(c, nodenum))
return -ENOMEM;
} else if (*buffer == '-') {
- nodenum = simple_strtoul(buffer+1, NULL,10);
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
if (clusterip_del_node(c, nodenum))
return -ENOENT;
} else
@@ -697,48 +711,75 @@ static const struct file_operations clusterip_proc_fops = {
#endif /* CONFIG_PROC_FS */
+static int clusterip_net_init(struct net *net)
+{
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ INIT_LIST_HEAD(&cn->configs);
+
+ spin_lock_init(&cn->lock);
+
+#ifdef CONFIG_PROC_FS
+ cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+ if (!cn->procdir) {
+ pr_err("Unable to proc dir entry\n");
+ return -ENOMEM;
+ }
+#endif /* CONFIG_PROC_FS */
+
+ return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+ .init = clusterip_net_init,
+ .exit = clusterip_net_exit,
+ .id = &clusterip_net_id,
+ .size = sizeof(struct clusterip_net),
+};
+
static int __init clusterip_tg_init(void)
{
int ret;
- ret = xt_register_target(&clusterip_tg_reg);
+ ret = register_pernet_subsys(&clusterip_net_ops);
if (ret < 0)
return ret;
+ ret = xt_register_target(&clusterip_tg_reg);
+ if (ret < 0)
+ goto cleanup_subsys;
+
ret = nf_register_hook(&cip_arp_ops);
if (ret < 0)
goto cleanup_target;
-#ifdef CONFIG_PROC_FS
- clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
- if (!clusterip_procdir) {
- pr_err("Unable to proc dir entry\n");
- ret = -ENOMEM;
- goto cleanup_hook;
- }
-#endif /* CONFIG_PROC_FS */
-
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
+
return 0;
-#ifdef CONFIG_PROC_FS
-cleanup_hook:
- nf_unregister_hook(&cip_arp_ops);
-#endif /* CONFIG_PROC_FS */
cleanup_target:
xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+ unregister_pernet_subsys(&clusterip_net_ops);
return ret;
}
static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
-#endif
+
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
+ unregister_pernet_subsys(&clusterip_net_ops);
/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
rcu_barrier_bh();
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
deleted file mode 100644
index 72ffc8fda2e..00000000000
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * This is a module which is used for logging packets.
- */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_LOG.h>
-#include <net/netfilter/nf_log.h>
-#include <net/netfilter/xt_log.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
-
-/* One level of recursion won't kill us */
-static void dump_packet(struct sbuff *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb,
- unsigned int iphoff)
-{
- struct iphdr _iph;
- const struct iphdr *ih;
- unsigned int logflags;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
- else
- logflags = NF_LOG_MASK;
-
- ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
- if (ih == NULL) {
- sb_add(m, "TRUNCATED");
- return;
- }
-
- /* Important fields:
- * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
- /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- sb_add(m, "SRC=%pI4 DST=%pI4 ",
- &ih->saddr, &ih->daddr);
-
- /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
- ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
- ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
-
- /* Max length: 6 "CE DF MF " */
- if (ntohs(ih->frag_off) & IP_CE)
- sb_add(m, "CE ");
- if (ntohs(ih->frag_off) & IP_DF)
- sb_add(m, "DF ");
- if (ntohs(ih->frag_off) & IP_MF)
- sb_add(m, "MF ");
-
- /* Max length: 11 "FRAG:65535 " */
- if (ntohs(ih->frag_off) & IP_OFFSET)
- sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
-
- if ((logflags & IPT_LOG_IPOPT) &&
- ih->ihl * 4 > sizeof(struct iphdr)) {
- const unsigned char *op;
- unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
- unsigned int i, optsize;
-
- optsize = ih->ihl * 4 - sizeof(struct iphdr);
- op = skb_header_pointer(skb, iphoff+sizeof(_iph),
- optsize, _opt);
- if (op == NULL) {
- sb_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- sb_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- sb_add(m, "%02X", op[i]);
- sb_add(m, ") ");
- }
-
- switch (ih->protocol) {
- case IPPROTO_TCP: {
- struct tcphdr _tcph;
- const struct tcphdr *th;
-
- /* Max length: 10 "PROTO=TCP " */
- sb_add(m, "PROTO=TCP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
- sizeof(_tcph), &_tcph);
- if (th == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- sb_add(m, "SPT=%u DPT=%u ",
- ntohs(th->source), ntohs(th->dest));
- /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & IPT_LOG_TCPSEQ)
- sb_add(m, "SEQ=%u ACK=%u ",
- ntohl(th->seq), ntohl(th->ack_seq));
- /* Max length: 13 "WINDOW=65535 " */
- sb_add(m, "WINDOW=%u ", ntohs(th->window));
- /* Max length: 9 "RES=0x3F " */
- sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
- if (th->cwr)
- sb_add(m, "CWR ");
- if (th->ece)
- sb_add(m, "ECE ");
- if (th->urg)
- sb_add(m, "URG ");
- if (th->ack)
- sb_add(m, "ACK ");
- if (th->psh)
- sb_add(m, "PSH ");
- if (th->rst)
- sb_add(m, "RST ");
- if (th->syn)
- sb_add(m, "SYN ");
- if (th->fin)
- sb_add(m, "FIN ");
- /* Max length: 11 "URGP=65535 " */
- sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
-
- if ((logflags & IPT_LOG_TCPOPT) &&
- th->doff * 4 > sizeof(struct tcphdr)) {
- unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
- const unsigned char *op;
- unsigned int i, optsize;
-
- optsize = th->doff * 4 - sizeof(struct tcphdr);
- op = skb_header_pointer(skb,
- iphoff+ih->ihl*4+sizeof(_tcph),
- optsize, _opt);
- if (op == NULL) {
- sb_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- sb_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- sb_add(m, "%02X", op[i]);
- sb_add(m, ") ");
- }
- break;
- }
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE: {
- struct udphdr _udph;
- const struct udphdr *uh;
-
- if (ih->protocol == IPPROTO_UDP)
- /* Max length: 10 "PROTO=UDP " */
- sb_add(m, "PROTO=UDP " );
- else /* Max length: 14 "PROTO=UDPLITE " */
- sb_add(m, "PROTO=UDPLITE ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_udph), &_udph);
- if (uh == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- sb_add(m, "SPT=%u DPT=%u LEN=%u ",
- ntohs(uh->source), ntohs(uh->dest),
- ntohs(uh->len));
- break;
- }
- case IPPROTO_ICMP: {
- struct icmphdr _icmph;
- const struct icmphdr *ich;
- static const size_t required_len[NR_ICMP_TYPES+1]
- = { [ICMP_ECHOREPLY] = 4,
- [ICMP_DEST_UNREACH]
- = 8 + sizeof(struct iphdr),
- [ICMP_SOURCE_QUENCH]
- = 8 + sizeof(struct iphdr),
- [ICMP_REDIRECT]
- = 8 + sizeof(struct iphdr),
- [ICMP_ECHO] = 4,
- [ICMP_TIME_EXCEEDED]
- = 8 + sizeof(struct iphdr),
- [ICMP_PARAMETERPROB]
- = 8 + sizeof(struct iphdr),
- [ICMP_TIMESTAMP] = 20,
- [ICMP_TIMESTAMPREPLY] = 20,
- [ICMP_ADDRESS] = 12,
- [ICMP_ADDRESSREPLY] = 12 };
-
- /* Max length: 11 "PROTO=ICMP " */
- sb_add(m, "PROTO=ICMP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
- sizeof(_icmph), &_icmph);
- if (ich == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 18 "TYPE=255 CODE=255 " */
- sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- if (ich->type <= NR_ICMP_TYPES &&
- required_len[ich->type] &&
- skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- switch (ich->type) {
- case ICMP_ECHOREPLY:
- case ICMP_ECHO:
- /* Max length: 19 "ID=65535 SEQ=65535 " */
- sb_add(m, "ID=%u SEQ=%u ",
- ntohs(ich->un.echo.id),
- ntohs(ich->un.echo.sequence));
- break;
-
- case ICMP_PARAMETERPROB:
- /* Max length: 14 "PARAMETER=255 " */
- sb_add(m, "PARAMETER=%u ",
- ntohl(ich->un.gateway) >> 24);
- break;
- case ICMP_REDIRECT:
- /* Max length: 24 "GATEWAY=255.255.255.255 " */
- sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
- /* Fall through */
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_TIME_EXCEEDED:
- /* Max length: 3+maxlen */
- if (!iphoff) { /* Only recurse once. */
- sb_add(m, "[");
- dump_packet(m, info, skb,
- iphoff + ih->ihl*4+sizeof(_icmph));
- sb_add(m, "] ");
- }
-
- /* Max length: 10 "MTU=65535 " */
- if (ich->type == ICMP_DEST_UNREACH &&
- ich->code == ICMP_FRAG_NEEDED)
- sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
- }
- break;
- }
- /* Max Length */
- case IPPROTO_AH: {
- struct ip_auth_hdr _ahdr;
- const struct ip_auth_hdr *ah;
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 9 "PROTO=AH " */
- sb_add(m, "PROTO=AH ");
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_ahdr), &_ahdr);
- if (ah == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
- break;
- }
- case IPPROTO_ESP: {
- struct ip_esp_hdr _esph;
- const struct ip_esp_hdr *eh;
-
- /* Max length: 10 "PROTO=ESP " */
- sb_add(m, "PROTO=ESP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_esph), &_esph);
- if (eh == NULL) {
- sb_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
- break;
- }
- /* Max length: 10 "PROTO 255 " */
- default:
- sb_add(m, "PROTO=%u ", ih->protocol);
- }
-
- /* Max length: 15 "UID=4294967295 " */
- if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- sb_add(m, "UID=%u GID=%u ",
- skb->sk->sk_socket->file->f_cred->fsuid,
- skb->sk->sk_socket->file->f_cred->fsgid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
- }
-
- /* Max length: 16 "MARK=0xFFFFFFFF " */
- if (!iphoff && skb->mark)
- sb_add(m, "MARK=0x%x ", skb->mark);
-
- /* Proto Max log string length */
- /* IP: 40+46+6+11+127 = 230 */
- /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
- /* UDP: 10+max(25,20) = 35 */
- /* UDPLITE: 14+max(25,20) = 39 */
- /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
- /* ESP: 10+max(25)+15 = 50 */
- /* AH: 9+max(25)+15 = 49 */
- /* unknown: 10 */
-
- /* (ICMP allows recursion one level deep) */
- /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
- /* maxlen = 230+ 91 + 230 + 252 = 803 */
-}
-
-static void dump_mac_header(struct sbuff *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- unsigned int logflags = 0;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
-
- if (!(logflags & IPT_LOG_MACDECODE))
- goto fallback;
-
- switch (dev->type) {
- case ARPHRD_ETHER:
- sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
- ntohs(eth_hdr(skb)->h_proto));
- return;
- default:
- break;
- }
-
-fallback:
- sb_add(m, "MAC=");
- if (dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- const unsigned char *p = skb_mac_header(skb);
- unsigned int i;
-
- sb_add(m, "%02x", *p++);
- for (i = 1; i < dev->hard_header_len; i++, p++)
- sb_add(m, ":%02x", *p);
- }
- sb_add(m, " ");
-}
-
-static struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = 5,
- .logflags = NF_LOG_MASK,
- },
- },
-};
-
-static void
-ipt_log_packet(u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- struct sbuff *m = sb_open();
-
- if (!loginfo)
- loginfo = &default_loginfo;
-
- sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
- prefix,
- in ? in->name : "",
- out ? out->name : "");
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- const struct net_device *physindev;
- const struct net_device *physoutdev;
-
- physindev = skb->nf_bridge->physindev;
- if (physindev && in != physindev)
- sb_add(m, "PHYSIN=%s ", physindev->name);
- physoutdev = skb->nf_bridge->physoutdev;
- if (physoutdev && out != physoutdev)
- sb_add(m, "PHYSOUT=%s ", physoutdev->name);
- }
-#endif
-
- /* MAC logging for input path only. */
- if (in && !out)
- dump_mac_header(m, loginfo, skb);
-
- dump_packet(m, loginfo, skb, 0);
-
- sb_close(m);
-}
-
-static unsigned int
-log_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- const struct ipt_log_info *loginfo = par->targinfo;
- struct nf_loginfo li;
-
- li.type = NF_LOG_TYPE_LOG;
- li.u.log.level = loginfo->level;
- li.u.log.logflags = loginfo->logflags;
-
- ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li,
- loginfo->prefix);
- return XT_CONTINUE;
-}
-
-static int log_tg_check(const struct xt_tgchk_param *par)
-{
- const struct ipt_log_info *loginfo = par->targinfo;
-
- if (loginfo->level >= 8) {
- pr_debug("level %u >= 8\n", loginfo->level);
- return -EINVAL;
- }
- if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
- pr_debug("prefix is not null-terminated\n");
- return -EINVAL;
- }
- return 0;
-}
-
-static struct xt_target log_tg_reg __read_mostly = {
- .name = "LOG",
- .family = NFPROTO_IPV4,
- .target = log_tg,
- .targetsize = sizeof(struct ipt_log_info),
- .checkentry = log_tg_check,
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ipt_log_logger __read_mostly = {
- .name = "ipt_LOG",
- .logfn = &ipt_log_packet,
- .me = THIS_MODULE,
-};
-
-static int __init log_tg_init(void)
-{
- int ret;
-
- ret = xt_register_target(&log_tg_reg);
- if (ret < 0)
- return ret;
- nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
- return 0;
-}
-
-static void __exit log_tg_exit(void)
-{
- nf_log_unregister(&ipt_log_logger);
- xt_unregister_target(&log_tg_reg);
-}
-
-module_init(log_tg_init);
-module_exit(log_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index d2ed9dc74eb..00352ce0f0d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,9 +19,9 @@
#include <net/ip.h>
#include <net/checksum.h>
#include <net/route.h>
-#include <net/netfilter/nf_nat_rule.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -30,9 +30,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
/* FIXME: Multiple targets. --RR */
static int masquerade_tg_check(const struct xt_tgchk_param *par)
{
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
- if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+ if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
pr_debug("bad MAP_IPS.\n");
return -EINVAL;
}
@@ -50,9 +50,9 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
struct nf_conn_nat *nat;
enum ip_conntrack_info ctinfo;
struct nf_nat_range newrange;
- const struct nf_nat_multi_range_compat *mr;
+ const struct nf_nat_ipv4_multi_range_compat *mr;
const struct rtable *rt;
- __be32 newsrc;
+ __be32 newsrc, nh;
NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
@@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
nat = nfct_nat(ct);
NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+ ctinfo == IP_CT_RELATED_REPLY));
/* Source address is 0.0.0.0 - locally generated packet that is
* probably not supposed to be masqueraded.
@@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
mr = par->targinfo;
rt = skb_rtable(skb);
- newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
+ newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
if (!newsrc) {
pr_info("%s ate my IP address\n", par->out->name);
return NF_DROP;
@@ -79,13 +80,16 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
nat->masq_index = par->out->ifindex;
/* Transfer from original range. */
- newrange = ((struct nf_nat_range)
- { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
- newsrc, newsrc,
- mr->range[0].min, mr->range[0].max });
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = newsrc;
+ newrange.max_addr.ip = newsrc;
+ newrange.min_proto = mr->range[0].min;
+ newrange.max_proto = mr->range[0].max;
/* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
static int
@@ -95,7 +99,8 @@ device_cmp(struct nf_conn *i, void *ifindex)
if (!nat)
return 0;
-
+ if (nf_ct_l3num(i) != NFPROTO_IPV4)
+ return 0;
return nat->masq_index == (int)(long)ifindex;
}
@@ -103,7 +108,7 @@ static int masq_device_event(struct notifier_block *this,
unsigned long event,
void *ptr)
{
- const struct net_device *dev = ptr;
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
if (event == NETDEV_DOWN) {
@@ -113,7 +118,7 @@ static int masq_device_event(struct notifier_block *this,
NF_CT_ASSERT(dev->ifindex != 0);
nf_ct_iterate_cleanup(net, device_cmp,
- (void *)(long)dev->ifindex);
+ (void *)(long)dev->ifindex, 0, 0);
}
return NOTIFY_DONE;
@@ -124,7 +129,10 @@ static int masq_inet_event(struct notifier_block *this,
void *ptr)
{
struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
- return masq_device_event(this, event, dev);
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, dev);
+ return masq_device_event(this, event, &info);
}
static struct notifier_block masq_dev_notifier = {
@@ -139,7 +147,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
.name = "MASQUERADE",
.family = NFPROTO_IPV4,
.target = masquerade_tg,
- .targetsize = sizeof(struct nf_nat_multi_range_compat),
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
deleted file mode 100644
index 6cdb298f103..00000000000
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/* NETMAP - static NAT mapping of IP network addresses (1:1).
- * The mapping can be applied to source (POSTROUTING),
- * destination (PREROUTING), or both (with separate rules).
- */
-
-/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/ip.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat_rule.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
-MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
-
-static int netmap_tg_check(const struct xt_tgchk_param *par)
-{
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
-
- if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
- pr_debug("bad MAP_IPS.\n");
- return -EINVAL;
- }
- if (mr->rangesize != 1) {
- pr_debug("bad rangesize %u.\n", mr->rangesize);
- return -EINVAL;
- }
- return 0;
-}
-
-static unsigned int
-netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- __be32 new_ip, netmask;
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range newrange;
-
- NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_POST_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT ||
- par->hooknum == NF_INET_LOCAL_IN);
- ct = nf_ct_get(skb, &ctinfo);
-
- netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
-
- if (par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT)
- new_ip = ip_hdr(skb)->daddr & ~netmask;
- else
- new_ip = ip_hdr(skb)->saddr & ~netmask;
- new_ip |= mr->range[0].min_ip & netmask;
-
- newrange = ((struct nf_nat_range)
- { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
- new_ip, new_ip,
- mr->range[0].min, mr->range[0].max });
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
-}
-
-static struct xt_target netmap_tg_reg __read_mostly = {
- .name = "NETMAP",
- .family = NFPROTO_IPV4,
- .target = netmap_tg,
- .targetsize = sizeof(struct nf_nat_multi_range_compat),
- .table = "nat",
- .hooks = (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_LOCAL_IN),
- .checkentry = netmap_tg_check,
- .me = THIS_MODULE
-};
-
-static int __init netmap_tg_init(void)
-{
- return xt_register_target(&netmap_tg_reg);
-}
-
-static void __exit netmap_tg_exit(void)
-{
- xt_unregister_target(&netmap_tg_reg);
-}
-
-module_init(netmap_tg_init);
-module_exit(netmap_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
deleted file mode 100644
index 18a0656505a..00000000000
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Redirect. Simple mapping which alters dst to a local IP address. */
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/netdevice.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <net/protocol.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat_rule.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
-
-/* FIXME: Take multiple ranges --RR */
-static int redirect_tg_check(const struct xt_tgchk_param *par)
-{
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
-
- if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
- pr_debug("bad MAP_IPS.\n");
- return -EINVAL;
- }
- if (mr->rangesize != 1) {
- pr_debug("bad rangesize %u.\n", mr->rangesize);
- return -EINVAL;
- }
- return 0;
-}
-
-static unsigned int
-redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- __be32 newdst;
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
- struct nf_nat_range newrange;
-
- NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT);
-
- ct = nf_ct_get(skb, &ctinfo);
- NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
-
- /* Local packets: make them go to loopback */
- if (par->hooknum == NF_INET_LOCAL_OUT)
- newdst = htonl(0x7F000001);
- else {
- struct in_device *indev;
- struct in_ifaddr *ifa;
-
- newdst = 0;
-
- rcu_read_lock();
- indev = __in_dev_get_rcu(skb->dev);
- if (indev && (ifa = indev->ifa_list))
- newdst = ifa->ifa_local;
- rcu_read_unlock();
-
- if (!newdst)
- return NF_DROP;
- }
-
- /* Transfer from original range. */
- newrange = ((struct nf_nat_range)
- { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
- newdst, newdst,
- mr->range[0].min, mr->range[0].max });
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST);
-}
-
-static struct xt_target redirect_tg_reg __read_mostly = {
- .name = "REDIRECT",
- .family = NFPROTO_IPV4,
- .target = redirect_tg,
- .targetsize = sizeof(struct nf_nat_multi_range_compat),
- .table = "nat",
- .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
- .checkentry = redirect_tg_check,
- .me = THIS_MODULE,
-};
-
-static int __init redirect_tg_init(void)
-{
- return xt_register_target(&redirect_tg_reg);
-}
-
-static void __exit redirect_tg_exit(void)
-{
- xt_unregister_target(&redirect_tg_reg);
-}
-
-module_init(redirect_tg_init);
-module_exit(redirect_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 43eec80c0e7..5b6e0df4ccf 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -17,10 +17,6 @@
#include <linux/udp.h>
#include <linux/icmp.h>
#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <net/route.h>
-#include <net/dst.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_REJECT.h>
@@ -28,114 +24,12 @@
#include <linux/netfilter_bridge.h>
#endif
+#include <net/netfilter/ipv4/nf_reject.h>
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
-/* Send RST reply */
-static void send_reset(struct sk_buff *oldskb, int hook)
-{
- struct sk_buff *nskb;
- const struct iphdr *oiph;
- struct iphdr *niph;
- const struct tcphdr *oth;
- struct tcphdr _otcph, *tcph;
- unsigned int addr_type;
-
- /* IP header checks: fragment. */
- if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
- return;
-
- oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
- sizeof(_otcph), &_otcph);
- if (oth == NULL)
- return;
-
- /* No RST for RST. */
- if (oth->rst)
- return;
-
- /* Check checksum */
- if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
- return;
- oiph = ip_hdr(oldskb);
-
- nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
- LL_MAX_HEADER, GFP_ATOMIC);
- if (!nskb)
- return;
-
- skb_reserve(nskb, LL_MAX_HEADER);
-
- skb_reset_network_header(nskb);
- niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
- niph->version = 4;
- niph->ihl = sizeof(struct iphdr) / 4;
- niph->tos = 0;
- niph->id = 0;
- niph->frag_off = htons(IP_DF);
- niph->protocol = IPPROTO_TCP;
- niph->check = 0;
- niph->saddr = oiph->daddr;
- niph->daddr = oiph->saddr;
-
- tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
- memset(tcph, 0, sizeof(*tcph));
- tcph->source = oth->dest;
- tcph->dest = oth->source;
- tcph->doff = sizeof(struct tcphdr) / 4;
-
- if (oth->ack)
- tcph->seq = oth->ack_seq;
- else {
- tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
- oldskb->len - ip_hdrlen(oldskb) -
- (oth->doff << 2));
- tcph->ack = 1;
- }
-
- tcph->rst = 1;
- tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
- niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)tcph - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- addr_type = RTN_UNSPEC;
- if (hook != NF_INET_FORWARD
-#ifdef CONFIG_BRIDGE_NETFILTER
- || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED)
-#endif
- )
- addr_type = RTN_LOCAL;
-
- /* ip_route_me_harder expects skb->dst to be set */
- skb_dst_set_noref(nskb, skb_dst(oldskb));
-
- nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(nskb, addr_type))
- goto free_nskb;
-
- niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
-
- /* "Never happens" */
- if (nskb->len > dst_mtu(skb_dst(nskb)))
- goto free_nskb;
-
- nf_ct_attach(nskb, oldskb);
-
- ip_local_out(nskb);
- return;
-
- free_nskb:
- kfree_skb(nskb);
-}
-
-static inline void send_unreach(struct sk_buff *skb_in, int code)
-{
- icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
-}
-
static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -143,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
- send_unreach(skb, ICMP_NET_UNREACH);
+ nf_send_unreach(skb, ICMP_NET_UNREACH);
break;
case IPT_ICMP_HOST_UNREACHABLE:
- send_unreach(skb, ICMP_HOST_UNREACH);
+ nf_send_unreach(skb, ICMP_HOST_UNREACH);
break;
case IPT_ICMP_PROT_UNREACHABLE:
- send_unreach(skb, ICMP_PROT_UNREACH);
+ nf_send_unreach(skb, ICMP_PROT_UNREACH);
break;
case IPT_ICMP_PORT_UNREACHABLE:
- send_unreach(skb, ICMP_PORT_UNREACH);
+ nf_send_unreach(skb, ICMP_PORT_UNREACH);
break;
case IPT_ICMP_NET_PROHIBITED:
- send_unreach(skb, ICMP_NET_ANO);
+ nf_send_unreach(skb, ICMP_NET_ANO);
break;
case IPT_ICMP_HOST_PROHIBITED:
- send_unreach(skb, ICMP_HOST_ANO);
+ nf_send_unreach(skb, ICMP_HOST_ANO);
break;
case IPT_ICMP_ADMIN_PROHIBITED:
- send_unreach(skb, ICMP_PKT_FILTERED);
+ nf_send_unreach(skb, ICMP_PKT_FILTERED);
break;
case IPT_TCP_RESET:
- send_reset(skb, par->hooknum);
+ nf_send_reset(skb, par->hooknum);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 00000000000..a313c3fbeb4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+static struct iphdr *
+synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) / 4;
+ iph->tos = 0;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = sysctl_ip_default_ttl;
+ iph->protocol = IPPROTO_TCP;
+ iph->check = 0;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct iphdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ skb_dst_set_noref(nskb, skb_dst(skb));
+ nskb->protocol = htons(ETH_P_IP);
+ if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ if (nfct) {
+ nskb->nfct = nfct;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nfct);
+ }
+
+ ip_local_out(nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+static void
+synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_syn(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(const struct synproxy_net *snet,
+ const struct ip_ct_tcp *state,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static bool
+synproxy_recv_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ int mss;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= XT_SYNPROXY_OPT_MSS;
+
+ if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+ return true;
+}
+
+static unsigned int
+synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_synproxy_info *info = par->targinfo;
+ struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_options opts = {};
+ struct tcphdr *th, _th;
+
+ if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ return NF_DROP;
+
+ th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
+
+ if (th->syn && !(th->ack || th->fin || th->rst)) {
+ /* Initial SYN from client */
+ this_cpu_inc(snet->stats->syn_received);
+
+ if (th->ece && th->cwr)
+ opts.options |= XT_SYNPROXY_OPT_ECN;
+
+ opts.options &= info->options;
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, &opts);
+ else
+ opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM |
+ XT_SYNPROXY_OPT_ECN);
+
+ synproxy_send_client_synack(skb, th, &opts);
+ return NF_DROP;
+
+ } else if (th->ack && !(th->fin || th->rst || th->syn)) {
+ /* ACK from client */
+ synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+ return NF_DROP;
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ unsigned int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (synproxy == NULL)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb))
+ return NF_ACCEPT;
+
+ thoff = ip_hdrlen(skb);
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack(snet, skb, th, &opts,
+ ntohl(th->seq) + 1))
+ this_cpu_inc(snet->stats->cookie_retrans);
+
+ return NF_DROP;
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->tsoff = opts.tsval - synproxy->its;
+
+ opts.options &= ~(XT_SYNPROXY_OPT_MSS |
+ XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack(snet, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack(snet, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+
+static int synproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (e->ip.proto != IPPROTO_TCP ||
+ e->ip.invflags & XT_INV_PROTO)
+ return -EINVAL;
+
+ return nf_ct_l3proto_try_module_get(par->family);
+}
+
+static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg4_reg __read_mostly = {
+ .name = "SYNPROXY",
+ .family = NFPROTO_IPV4,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
+ .target = synproxy_tg4,
+ .targetsize = sizeof(struct xt_synproxy_info),
+ .checkentry = synproxy_tg4_check,
+ .destroy = synproxy_tg4_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
+ {
+ .hook = ipv4_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv4_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+static int __init synproxy_tg4_init(void)
+{
+ int err;
+
+ err = nf_register_hooks(ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+ if (err < 0)
+ goto err1;
+
+ err = xt_register_target(&synproxy_tg4_reg);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+
+err2:
+ nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+err1:
+ return err;
+}
+
+static void __exit synproxy_tg4_exit(void)
+{
+ xt_unregister_target(&synproxy_tg4_reg);
+ nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+}
+
+module_init(synproxy_tg4_init);
+module_exit(synproxy_tg4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 446e0f467a1..9cb993cd224 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -4,6 +4,7 @@
* (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005-2007 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -37,7 +38,7 @@
#include <linux/skbuff.h>
#include <linux/kernel.h>
#include <linux/timer.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netdevice.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
@@ -45,6 +46,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>
#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include <linux/bitops.h>
#include <asm/unaligned.h>
@@ -65,7 +67,7 @@ static unsigned int flushtimeout = 10;
module_param(flushtimeout, uint, 0600);
MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
-static int nflog = 1;
+static bool nflog = true;
module_param(nflog, bool, 0400);
MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
@@ -78,20 +80,26 @@ typedef struct {
struct timer_list timer; /* the timer function */
} ulog_buff_t;
-static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
+static int ulog_net_id __read_mostly;
+struct ulog_net {
+ unsigned int nlgroup[ULOG_MAXNLGROUPS];
+ ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
+ struct sock *nflognl;
+ spinlock_t lock;
+};
-static struct sock *nflognl; /* our socket */
-static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
+static struct ulog_net *ulog_pernet(struct net *net)
+{
+ return net_generic(net, ulog_net_id);
+}
/* send one ulog_buff_t to userspace */
-static void ulog_send(unsigned int nlgroupnum)
+static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
{
- ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
+ ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
- if (timer_pending(&ub->timer)) {
- pr_debug("ulog_send: timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
+ pr_debug("ulog_send: timer is deleting\n");
+ del_timer(&ub->timer);
if (!ub->skb) {
pr_debug("ulog_send: nothing to send\n");
@@ -105,7 +113,8 @@ static void ulog_send(unsigned int nlgroupnum)
NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
pr_debug("throwing %d packets to netlink group %u\n",
ub->qlen, nlgroupnum + 1);
- netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
+ netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
+ GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -116,13 +125,17 @@ static void ulog_send(unsigned int nlgroupnum)
/* timer function to flush queue in flushtimeout time */
static void ulog_timer(unsigned long data)
{
+ unsigned int groupnum = *((unsigned int *)data);
+ struct ulog_net *ulog = container_of((void *)data,
+ struct ulog_net,
+ nlgroup[groupnum]);
pr_debug("timer function called, calling ulog_send\n");
/* lock to protect against somebody modifying our structure
* from ipt_ulog_target at the same time */
- spin_lock_bh(&ulog_lock);
- ulog_send(data);
- spin_unlock_bh(&ulog_lock);
+ spin_lock_bh(&ulog->lock);
+ ulog_send(ulog, groupnum);
+ spin_unlock_bh(&ulog->lock);
}
static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -135,10 +148,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
* due to slab allocator restrictions */
n = max(size, nlbufsiz);
- skb = alloc_skb(n, GFP_ATOMIC);
+ skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
if (!skb) {
- pr_debug("cannot alloc whole buffer %ub!\n", n);
-
if (n > size) {
/* try to allocate only as much as we need for
* current packet */
@@ -152,7 +163,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
return skb;
}
-static void ipt_ulog_packet(unsigned int hooknum,
+static void ipt_ulog_packet(struct net *net,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -164,6 +176,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
size_t size, copy_len;
struct nlmsghdr *nlh;
struct timeval tv;
+ struct ulog_net *ulog = ulog_pernet(net);
/* ffs == find first bit set, necessary because userspace
* is already shifting groupnumber, but we need unshifted.
@@ -176,11 +189,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
else
copy_len = loginfo->copy_range;
- size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+ size = nlmsg_total_size(sizeof(*pm) + copy_len);
- ub = &ulog_buffers[groupnum];
+ ub = &ulog->ulog_buffers[groupnum];
- spin_lock_bh(&ulog_lock);
+ spin_lock_bh(&ulog->lock);
if (!ub->skb) {
if (!(ub->skb = ulog_alloc_skb(size)))
@@ -190,7 +203,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
/* either the queue len is too high or we don't have
* enough room in nlskb left. send it to userspace. */
- ulog_send(groupnum);
+ ulog_send(ulog, groupnum);
if (!(ub->skb = ulog_alloc_skb(size)))
goto alloc_failure;
@@ -198,12 +211,16 @@ static void ipt_ulog_packet(unsigned int hooknum,
pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
- /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
- nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
- sizeof(*pm)+copy_len);
+ nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
+ sizeof(*pm)+copy_len, 0);
+ if (!nlh) {
+ pr_debug("error during nlmsg_put\n");
+ goto out_unlock;
+ }
ub->qlen++;
- pm = NLMSG_DATA(nlh);
+ pm = nlmsg_data(nlh);
+ memset(pm, 0, sizeof(*pm));
/* We might not have a timestamp, get one */
if (skb->tstamp.tv64 == 0)
@@ -216,12 +233,12 @@ static void ipt_ulog_packet(unsigned int hooknum,
put_unaligned(tv.tv_usec, &pm->timestamp_usec);
put_unaligned(skb->mark, &pm->mark);
pm->hook = hooknum;
- if (prefix != NULL)
- strncpy(pm->prefix, prefix, sizeof(pm->prefix));
+ if (prefix != NULL) {
+ strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
+ pm->prefix[sizeof(pm->prefix) - 1] = '\0';
+ }
else if (loginfo->prefix[0] != '\0')
strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
- else
- *(pm->prefix) = '\0';
if (in && in->hard_header_len > 0 &&
skb->mac_header != skb->network_header &&
@@ -233,13 +250,9 @@ static void ipt_ulog_packet(unsigned int hooknum,
if (in)
strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
- else
- pm->indev_name[0] = '\0';
if (out)
strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
- else
- pm->outdev_name[0] = '\0';
/* copy_len <= skb->len, so can't fail. */
if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
@@ -261,29 +274,30 @@ static void ipt_ulog_packet(unsigned int hooknum,
if (ub->qlen >= loginfo->qthreshold) {
if (loginfo->qthreshold > 1)
nlh->nlmsg_type = NLMSG_DONE;
- ulog_send(groupnum);
+ ulog_send(ulog, groupnum);
}
-
- spin_unlock_bh(&ulog_lock);
+out_unlock:
+ spin_unlock_bh(&ulog->lock);
return;
-nlmsg_failure:
- pr_debug("error during NLMSG_PUT\n");
alloc_failure:
pr_debug("Error building netlink message\n");
- spin_unlock_bh(&ulog_lock);
+ spin_unlock_bh(&ulog->lock);
}
static unsigned int
ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
+ struct net *net = dev_net(par->in ? par->in : par->out);
+
+ ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
par->targinfo, NULL);
return XT_CONTINUE;
}
-static void ipt_logfn(u_int8_t pf,
+static void ipt_logfn(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -305,13 +319,19 @@ static void ipt_logfn(u_int8_t pf,
strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
}
- ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+ ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
}
static int ulog_tg_check(const struct xt_tgchk_param *par)
{
const struct ipt_ulog_info *loginfo = par->targinfo;
+ if (!par->net->xt.ulog_warn_deprecated) {
+ pr_info("ULOG is deprecated and it will be removed soon, "
+ "use NFLOG instead\n");
+ par->net->xt.ulog_warn_deprecated = true;
+ }
+
if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
pr_debug("prefix not null-terminated\n");
return -EINVAL;
@@ -379,57 +399,48 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
.me = THIS_MODULE,
};
-static int __init ulog_tg_init(void)
+static int __net_init ulog_tg_net_init(struct net *net)
{
- int ret, i;
-
- pr_debug("init module\n");
-
- if (nlbufsiz > 128*1024) {
- pr_warning("Netlink buffer has to be <= 128kB\n");
- return -EINVAL;
- }
+ int i;
+ struct ulog_net *ulog = ulog_pernet(net);
+ struct netlink_kernel_cfg cfg = {
+ .groups = ULOG_MAXNLGROUPS,
+ };
+ spin_lock_init(&ulog->lock);
/* initialize ulog_buffers */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++)
- setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
+ for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+ ulog->nlgroup[i] = i;
+ setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
+ (unsigned long)&ulog->nlgroup[i]);
+ }
- nflognl = netlink_kernel_create(&init_net,
- NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
- NULL, THIS_MODULE);
- if (!nflognl)
+ ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
+ if (!ulog->nflognl)
return -ENOMEM;
- ret = xt_register_target(&ulog_tg_reg);
- if (ret < 0) {
- netlink_kernel_release(nflognl);
- return ret;
- }
if (nflog)
- nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+ nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
return 0;
}
-static void __exit ulog_tg_exit(void)
+static void __net_exit ulog_tg_net_exit(struct net *net)
{
ulog_buff_t *ub;
int i;
-
- pr_debug("cleanup_module\n");
+ struct ulog_net *ulog = ulog_pernet(net);
if (nflog)
- nf_log_unregister(&ipt_ulog_logger);
- xt_unregister_target(&ulog_tg_reg);
- netlink_kernel_release(nflognl);
+ nf_log_unset(net, &ipt_ulog_logger);
+
+ netlink_kernel_release(ulog->nflognl);
/* remove pending timers and free allocated skb's */
for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ub = &ulog_buffers[i];
- if (timer_pending(&ub->timer)) {
- pr_debug("timer was pending, deleting\n");
- del_timer(&ub->timer);
- }
+ ub = &ulog->ulog_buffers[i];
+ pr_debug("timer is deleting\n");
+ del_timer(&ub->timer);
if (ub->skb) {
kfree_skb(ub->skb);
@@ -438,5 +449,50 @@ static void __exit ulog_tg_exit(void)
}
}
+static struct pernet_operations ulog_tg_net_ops = {
+ .init = ulog_tg_net_init,
+ .exit = ulog_tg_net_exit,
+ .id = &ulog_net_id,
+ .size = sizeof(struct ulog_net),
+};
+
+static int __init ulog_tg_init(void)
+{
+ int ret;
+ pr_debug("init module\n");
+
+ if (nlbufsiz > 128*1024) {
+ pr_warn("Netlink buffer has to be <= 128kB\n");
+ return -EINVAL;
+ }
+
+ ret = register_pernet_subsys(&ulog_tg_net_ops);
+ if (ret)
+ goto out_pernet;
+
+ ret = xt_register_target(&ulog_tg_reg);
+ if (ret < 0)
+ goto out_target;
+
+ if (nflog)
+ nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+
+ return 0;
+
+out_target:
+ unregister_pernet_subsys(&ulog_tg_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit ulog_tg_exit(void)
+{
+ pr_debug("cleanup_module\n");
+ if (nflog)
+ nf_log_unregister(&ipt_ulog_logger);
+ xt_unregister_target(&ulog_tg_reg);
+ unregister_pernet_subsys(&ulog_tg_net_ops);
+}
+
module_init(ulog_tg_init);
module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
deleted file mode 100644
index db8bff0fb86..00000000000
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * iptables module to match inet_addr_type() of an ip.
- *
- * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
- * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter_ipv4/ipt_addrtype.h>
-#include <linux/netfilter/x_tables.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("Xtables: address type match for IPv4");
-
-static inline bool match_type(struct net *net, const struct net_device *dev,
- __be32 addr, u_int16_t mask)
-{
- return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
-}
-
-static bool
-addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
-{
- struct net *net = dev_net(par->in ? par->in : par->out);
- const struct ipt_addrtype_info *info = par->matchinfo;
- const struct iphdr *iph = ip_hdr(skb);
- bool ret = true;
-
- if (info->source)
- ret &= match_type(net, NULL, iph->saddr, info->source) ^
- info->invert_source;
- if (info->dest)
- ret &= match_type(net, NULL, iph->daddr, info->dest) ^
- info->invert_dest;
-
- return ret;
-}
-
-static bool
-addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
-{
- struct net *net = dev_net(par->in ? par->in : par->out);
- const struct ipt_addrtype_info_v1 *info = par->matchinfo;
- const struct iphdr *iph = ip_hdr(skb);
- const struct net_device *dev = NULL;
- bool ret = true;
-
- if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
- dev = par->in;
- else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
- dev = par->out;
-
- if (info->source)
- ret &= match_type(net, dev, iph->saddr, info->source) ^
- (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
- if (ret && info->dest)
- ret &= match_type(net, dev, iph->daddr, info->dest) ^
- !!(info->flags & IPT_ADDRTYPE_INVERT_DEST);
- return ret;
-}
-
-static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
-{
- struct ipt_addrtype_info_v1 *info = par->matchinfo;
-
- if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
- info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
- pr_info("both incoming and outgoing "
- "interface limitation cannot be selected\n");
- return -EINVAL;
- }
-
- if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_LOCAL_IN)) &&
- info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
- pr_info("output interface limitation "
- "not valid in PREROUTING and INPUT\n");
- return -EINVAL;
- }
-
- if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT)) &&
- info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
- pr_info("input interface limitation "
- "not valid in POSTROUTING and OUTPUT\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static struct xt_match addrtype_mt_reg[] __read_mostly = {
- {
- .name = "addrtype",
- .family = NFPROTO_IPV4,
- .match = addrtype_mt_v0,
- .matchsize = sizeof(struct ipt_addrtype_info),
- .me = THIS_MODULE
- },
- {
- .name = "addrtype",
- .family = NFPROTO_IPV4,
- .revision = 1,
- .match = addrtype_mt_v1,
- .checkentry = addrtype_mt_checkentry_v1,
- .matchsize = sizeof(struct ipt_addrtype_info_v1),
- .me = THIS_MODULE
- }
-};
-
-static int __init addrtype_mt_init(void)
-{
- return xt_register_matches(addrtype_mt_reg,
- ARRAY_SIZE(addrtype_mt_reg));
-}
-
-static void __exit addrtype_mt_exit(void)
-{
- xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
-}
-
-module_init(addrtype_mt_init);
-module_exit(addrtype_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
deleted file mode 100644
index af6e9c77834..00000000000
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/* IP tables module for matching the value of the IPv4 and TCP ECN bits
- *
- * (C) 2002 by Harald Welte <laforge@gnumonks.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <net/ip.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/tcp.h>
-
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_ecn.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4");
-MODULE_LICENSE("GPL");
-
-static inline bool match_ip(const struct sk_buff *skb,
- const struct ipt_ecn_info *einfo)
-{
- return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect;
-}
-
-static inline bool match_tcp(const struct sk_buff *skb,
- const struct ipt_ecn_info *einfo,
- bool *hotdrop)
-{
- struct tcphdr _tcph;
- const struct tcphdr *th;
-
- /* In practice, TCP match does this, so can't fail. But let's
- * be good citizens.
- */
- th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
- if (th == NULL) {
- *hotdrop = false;
- return false;
- }
-
- if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
- if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
- if (th->ece == 1)
- return false;
- } else {
- if (th->ece == 0)
- return false;
- }
- }
-
- if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
- if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
- if (th->cwr == 1)
- return false;
- } else {
- if (th->cwr == 0)
- return false;
- }
- }
-
- return true;
-}
-
-static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
-{
- const struct ipt_ecn_info *info = par->matchinfo;
-
- if (info->operation & IPT_ECN_OP_MATCH_IP)
- if (!match_ip(skb, info))
- return false;
-
- if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
- if (ip_hdr(skb)->protocol != IPPROTO_TCP)
- return false;
- if (!match_tcp(skb, info, &par->hotdrop))
- return false;
- }
-
- return true;
-}
-
-static int ecn_mt_check(const struct xt_mtchk_param *par)
-{
- const struct ipt_ecn_info *info = par->matchinfo;
- const struct ipt_ip *ip = par->entryinfo;
-
- if (info->operation & IPT_ECN_OP_MATCH_MASK)
- return -EINVAL;
-
- if (info->invert & IPT_ECN_OP_MATCH_MASK)
- return -EINVAL;
-
- if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
- ip->proto != IPPROTO_TCP) {
- pr_info("cannot match TCP bits in rule for non-tcp packets\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static struct xt_match ecn_mt_reg __read_mostly = {
- .name = "ecn",
- .family = NFPROTO_IPV4,
- .match = ecn_mt,
- .matchsize = sizeof(struct ipt_ecn_info),
- .checkentry = ecn_mt_check,
- .me = THIS_MODULE,
-};
-
-static int __init ecn_mt_init(void)
-{
- return xt_register_match(&ecn_mt_reg);
-}
-
-static void __exit ecn_mt_exit(void)
-{
- xt_unregister_match(&ecn_mt_reg);
-}
-
-module_init(ecn_mt_init);
-module_exit(ecn_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
new file mode 100644
index 00000000000..4bfaedf9b34
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+#include <linux/netfilter/xt_rpfilter.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 rpfilter_get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
+ const struct net_device *dev, u8 flags)
+{
+ struct fib_result res;
+ bool dev_match;
+ struct net *net = dev_net(dev);
+ int ret __maybe_unused;
+
+ if (fib_lookup(net, fl4, &res))
+ return false;
+
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
+ return false;
+ }
+ dev_match = false;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
+#endif
+ if (dev_match || flags & XT_RPFILTER_LOOSE)
+ return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
+ return dev_match;
+}
+
+static bool rpfilter_is_local(const struct sk_buff *skb)
+{
+ const struct rtable *rt = skb_rtable(skb);
+ return rt && (rt->rt_flags & RTCF_LOCAL);
+}
+
+static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rpfilter_info *info;
+ const struct iphdr *iph;
+ struct flowi4 flow;
+ bool invert;
+
+ info = par->matchinfo;
+ invert = info->flags & XT_RPFILTER_INVERT;
+
+ if (rpfilter_is_local(skb))
+ return true ^ invert;
+
+ iph = ip_hdr(skb);
+ if (ipv4_is_multicast(iph->daddr)) {
+ if (ipv4_is_zeronet(iph->saddr))
+ return ipv4_is_local_multicast(iph->daddr) ^ invert;
+ }
+ flow.flowi4_iif = LOOPBACK_IFINDEX;
+ flow.daddr = iph->saddr;
+ flow.saddr = rpfilter_get_saddr(iph->daddr);
+ flow.flowi4_oif = 0;
+ flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+ flow.flowi4_tos = RT_TOS(iph->tos);
+ flow.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+ return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
+}
+
+static int rpfilter_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ unsigned int options = ~XT_RPFILTER_OPTION_MASK;
+ if (info->flags & options) {
+ pr_info("unknown options encountered");
+ return -EINVAL;
+ }
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "raw") != 0) {
+ pr_info("match only valid in the \'raw\' "
+ "or \'mangle\' tables, not \'%s\'.\n", par->table);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match rpfilter_mt_reg __read_mostly = {
+ .name = "rpfilter",
+ .family = NFPROTO_IPV4,
+ .checkentry = rpfilter_check,
+ .match = rpfilter_mt,
+ .matchsize = sizeof(struct xt_rpfilter_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING),
+ .me = THIS_MODULE
+};
+
+static int __init rpfilter_mt_init(void)
+{
+ return xt_register_match(&rpfilter_mt_reg);
+}
+
+static void __exit rpfilter_mt_exit(void)
+{
+ xt_unregister_match(&rpfilter_mt_reg);
+}
+
+module_init(rpfilter_mt_init);
+module_exit(rpfilter_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index c37641e819f..e08a74a243a 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -33,26 +33,27 @@ static const struct xt_table packet_filter = {
};
static unsigned int
-iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (hook == NF_INET_LOCAL_OUT &&
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
+ return ipt_do_table(skb, ops->hooknum, in, out,
+ net->ipv4.iptable_filter);
}
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
-static int forward = NF_ACCEPT;
+static bool forward = true;
module_param(forward, bool, 0000);
static int __net_init iptable_filter_net_init(struct net *net)
@@ -64,14 +65,12 @@ static int __net_init iptable_filter_net_init(struct net *net)
return -ENOMEM;
/* Entry 1 is the FORWARD hook */
((struct ipt_standard *)repl->entries)[1].target.verdict =
- -forward - 1;
+ forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
net->ipv4.iptable_filter =
ipt_register_table(net, &packet_filter, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.iptable_filter))
- return PTR_ERR(net->ipv4.iptable_filter);
- return 0;
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
}
static void __net_exit iptable_filter_net_exit(struct net *net)
@@ -88,11 +87,6 @@ static int __init iptable_filter_init(void)
{
int ret;
- if (forward < 0 || forward > NF_MAX_VERDICT) {
- pr_err("iptables forward must be 0 or 1\n");
- return -EINVAL;
- }
-
ret = register_pernet_subsys(&iptable_filter_net_ops);
if (ret < 0)
return ret;
@@ -101,14 +95,10 @@ static int __init iptable_filter_init(void)
filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
if (IS_ERR(filter_ops)) {
ret = PTR_ERR(filter_ops);
- goto cleanup_table;
+ unregister_pernet_subsys(&iptable_filter_net_ops);
}
return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&iptable_filter_net_ops);
- return ret;
}
static void __exit iptable_filter_fini(void)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f29..6a5079c34bb 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -44,6 +44,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
u_int8_t tos;
__be32 saddr, daddr;
u_int32_t mark;
+ int err;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -60,15 +61,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
dev_net(out)->ipv4.iptable_mangle);
/* Reroute for ANY change. */
- if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
- iph->tos != tos)
- if (ip_route_me_harder(skb, RTN_UNSPEC))
- ret = NF_DROP;
+ iph->tos != tos) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
}
return ret;
@@ -76,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_mangle_hook(unsigned int hook,
+iptable_mangle_hook(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- if (hook == NF_INET_LOCAL_OUT)
+ if (ops->hooknum == NF_INET_LOCAL_OUT)
return ipt_mangle_out(skb, out);
- if (hook == NF_INET_POST_ROUTING)
- return ipt_do_table(skb, hook, in, out,
+ if (ops->hooknum == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, ops->hooknum, in, out,
dev_net(out)->ipv4.iptable_mangle);
/* PREROUTING/INPUT/FORWARD: */
- return ipt_do_table(skb, hook, in, out,
+ return ipt_do_table(skb, ops->hooknum, in, out,
dev_net(in)->ipv4.iptable_mangle);
}
@@ -104,9 +107,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
net->ipv4.iptable_mangle =
ipt_register_table(net, &packet_mangler, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.iptable_mangle))
- return PTR_ERR(net->ipv4.iptable_mangle);
- return 0;
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
}
static void __net_exit iptable_mangle_net_exit(struct net *net)
@@ -131,14 +132,10 @@ static int __init iptable_mangle_init(void)
mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
if (IS_ERR(mangle_ops)) {
ret = PTR_ERR(mangle_ops);
- goto cleanup_table;
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
}
return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&iptable_mangle_net_ops);
- return ret;
}
static void __exit iptable_mangle_fini(void)
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
new file mode 100644
index 00000000000..f1787c04a4d
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -0,0 +1,328 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv4_table = {
+ .name = "nat",
+ .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+};
+
+static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+ /* Force range to this IP; let proto decide mapping for
+ * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+ */
+ struct nf_nat_range range;
+
+ range.flags = 0;
+ pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+ HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
+ return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+}
+
+static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ unsigned int ret;
+
+ ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
+ if (ret == NF_ACCEPT) {
+ if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
+ ret = alloc_null_binding(ct, hooknum);
+ }
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ /* maniptype == SRC for postrouting. */
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+
+ /* We never see fragments: conntrack defrags on pre-routing
+ * and local-out, and nf_nat_out protects post-routing.
+ */
+ NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibility to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
+
+ /* Don't try to NAT if this packet is not conntracked */
+ if (nf_ct_is_untracked(ct))
+ return NF_ACCEPT;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ ops->hooknum))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ unsigned int ret;
+
+ ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else {
+ pr_debug("Already setup manip %s for ct %p\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ goto oif_changed;
+ }
+ break;
+
+ default:
+ /* ESTABLISHED */
+ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+ ctinfo == IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
+ goto oif_changed;
+ }
+
+ return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
+}
+
+static unsigned int
+nf_nat_ipv4_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int ret;
+ __be32 daddr = ip_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ daddr != ip_hdr(skb)->daddr)
+ skb_dst_drop(skb);
+
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int err;
+#endif
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+ ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+ (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+#endif
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+ int err;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+ ct->tuplehash[!dir].tuple.src.u3.ip) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#endif
+ }
+ return ret;
+}
+
+static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_out,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_local_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+static int __net_init iptable_nat_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.nat_table);
+}
+
+static struct pernet_operations iptable_nat_net_ops = {
+ .init = iptable_nat_net_init,
+ .exit = iptable_nat_net_exit,
+};
+
+static int __init iptable_nat_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&iptable_nat_net_ops);
+ if (err < 0)
+ goto err1;
+
+ err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+err1:
+ return err;
+}
+
+static void __exit iptable_nat_exit(void)
+{
+ nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+}
+
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 07fb710cd72..b2f7e8f9831 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -20,20 +20,20 @@ static const struct xt_table packet_raw = {
/* The work comes in here from netfilter.c. */
static unsigned int
-iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
+iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (hook == NF_INET_LOCAL_OUT &&
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* root is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
+ return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);
}
static struct nf_hook_ops *rawtable_ops __read_mostly;
@@ -48,9 +48,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
net->ipv4.iptable_raw =
ipt_register_table(net, &packet_raw, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.iptable_raw))
- return PTR_ERR(net->ipv4.iptable_raw);
- return 0;
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
}
static void __net_exit iptable_raw_net_exit(struct net *net)
@@ -75,14 +73,10 @@ static int __init iptable_raw_init(void)
rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
if (IS_ERR(rawtable_ops)) {
ret = PTR_ERR(rawtable_ops);
- goto cleanup_table;
+ unregister_pernet_subsys(&iptable_raw_net_ops);
}
return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&iptable_raw_net_ops);
- return ret;
}
static void __exit iptable_raw_fini(void)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index be45bdc4c60..c86647ed207 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -37,21 +37,22 @@ static const struct xt_table security_table = {
};
static unsigned int
-iptable_security_hook(unsigned int hook, struct sk_buff *skb,
+iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct net *net;
- if (hook == NF_INET_LOCAL_OUT &&
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
(skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr)))
/* Somebody is playing with raw sockets. */
return NF_ACCEPT;
net = dev_net((in != NULL) ? in : out);
- return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
+ return ipt_do_table(skb, ops->hooknum, in, out,
+ net->ipv4.iptable_security);
}
static struct nf_hook_ops *sectbl_ops __read_mostly;
@@ -66,10 +67,7 @@ static int __net_init iptable_security_net_init(struct net *net)
net->ipv4.iptable_security =
ipt_register_table(net, &security_table, repl);
kfree(repl);
- if (IS_ERR(net->ipv4.iptable_security))
- return PTR_ERR(net->ipv4.iptable_security);
-
- return 0;
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
}
static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5a03c02af99..8127dc80286 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -1,6 +1,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -24,16 +25,12 @@
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/netfilter/nf_log.h>
-int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo);
-EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
-
static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
struct nf_conntrack_tuple *tuple)
{
@@ -74,58 +71,73 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
if (iph == NULL)
- return -NF_DROP;
+ return -NF_ACCEPT;
/* Conntrack defragments packets, we might still see fragments
* inside ICMP packets though. */
if (iph->frag_off & htons(IP_OFFSET))
- return -NF_DROP;
+ return -NF_ACCEPT;
*dataoff = nhoff + (iph->ihl << 2);
*protonum = iph->protocol;
+ /* Check bogus IP headers */
+ if (*dataoff > skb->len) {
+ pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
+ "nhoff %u, ihl %u, skblen %u\n",
+ nhoff, iph->ihl << 2, skb->len);
+ return -NF_ACCEPT;
+ }
+
return NF_ACCEPT;
}
-static unsigned int ipv4_confirm(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
const struct nf_conn_help *help;
const struct nf_conntrack_helper *helper;
- unsigned int ret;
/* This is where we call the helper: as the packet goes out. */
ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
- goto out;
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
help = nfct_help(ct);
if (!help)
- goto out;
+ return NF_ACCEPT;
/* rcu_read_lock()ed by nf_hook_slow */
helper = rcu_dereference(help->helper);
if (!helper)
- goto out;
+ return NF_ACCEPT;
- ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
- if (ret != NF_ACCEPT) {
- nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
- "nf_ct_%s: dropping packet", helper->name);
- return ret;
- }
+ return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+ ct, ctinfo);
+}
+
+static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) {
- typeof(nf_nat_seq_adjust_hook) seq_adjust;
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
- seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
- if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) {
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
return NF_DROP;
}
@@ -135,16 +147,16 @@ out:
return nf_conntrack_confirm(skb);
}
-static unsigned int ipv4_conntrack_in(unsigned int hooknum,
+static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
+ return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
}
-static unsigned int ipv4_conntrack_local(unsigned int hooknum,
+static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -154,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
- return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb);
+ return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
}
/* Connection tracking may drop packets, but never alters them, so
@@ -175,6 +187,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
.priority = NF_IP_PRI_CONNTRACK,
},
{
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
@@ -182,6 +201,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
.hook = ipv4_confirm,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
@@ -194,38 +220,33 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
static int log_invalid_proto_min = 0;
static int log_invalid_proto_max = 255;
-static ctl_table ip_ct_sysctl_table[] = {
+static struct ctl_table ip_ct_sysctl_table[] = {
{
.procname = "ip_conntrack_max",
- .data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_count",
- .data = &init_net.ct.count,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_buckets",
- .data = &init_net.ct.htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_checksum",
- .data = &init_net.ct.sysctl_checksum,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ip_conntrack_log_invalid",
- .data = &init_net.ct.sysctl_log_invalid,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
@@ -301,8 +322,9 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip);
- NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip);
+ if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -340,6 +362,25 @@ static struct nf_sockopt_ops so_getorigdst = {
.owner = THIS_MODULE,
};
+static int ipv4_init_net(struct net *net)
+{
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ struct nf_ip_net *in = &net->ct.nf_ct_proto;
+ in->ctl_table = kmemdup(ip_ct_sysctl_table,
+ sizeof(ip_ct_sysctl_table),
+ GFP_KERNEL);
+ if (!in->ctl_table)
+ return -ENOMEM;
+
+ in->ctl_table[0].data = &nf_conntrack_max;
+ in->ctl_table[1].data = &net->ct.count;
+ in->ctl_table[2].data = &net->ct.htable_size;
+ in->ctl_table[3].data = &net->ct.sysctl_checksum;
+ in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
+#endif
+ return 0;
+}
+
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.l3proto = PF_INET,
.name = "ipv4",
@@ -354,9 +395,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.nla_policy = ipv4_nla_policy,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path,
- .ctl_table = ip_ct_sysctl_table,
+ .ctl_table_path = "net/ipv4/netfilter",
#endif
+ .init_net = ipv4_init_net,
.me = THIS_MODULE,
};
@@ -367,6 +408,54 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("ip_conntrack");
MODULE_LICENSE("GPL");
+static int ipv4_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_tcp4: pernet registration failed\n");
+ goto out_tcp;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udp4: pernet registration failed\n");
+ goto out_udp;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_icmp4: pernet registration failed\n");
+ goto out_icmp;
+ }
+ ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: pernet registration failed\n");
+ goto out_ipv4;
+ }
+ return 0;
+out_ipv4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+out_icmp:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+out_udp:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+out_tcp:
+ return ret;
+}
+
+static void ipv4_net_exit(struct net *net)
+{
+ nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+}
+
+static struct pernet_operations ipv4_net_ops = {
+ .init = ipv4_net_init,
+ .exit = ipv4_net_exit,
+};
+
static int __init nf_conntrack_l3proto_ipv4_init(void)
{
int ret = 0;
@@ -380,54 +469,63 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
return ret;
}
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
+ ret = register_pernet_subsys(&ipv4_net_ops);
if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register tcp.\n");
+ pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
goto cleanup_sockopt;
}
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
+ ret = nf_register_hooks(ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register udp.\n");
- goto cleanup_tcp;
+ pr_err("nf_conntrack_ipv4: can't register hooks.\n");
+ goto cleanup_pernet;
}
- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register icmp.\n");
- goto cleanup_udp;
+ pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
+ goto cleanup_hooks;
}
- ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register ipv4\n");
- goto cleanup_icmp;
+ pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
+ goto cleanup_tcp4;
}
- ret = nf_register_hooks(ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register hooks.\n");
- goto cleanup_ipv4;
+ pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
+ goto cleanup_udp4;
}
+
+ ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
+ goto cleanup_icmpv4;
+ }
+
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
ret = nf_conntrack_ipv4_compat_init();
if (ret < 0)
- goto cleanup_hooks;
+ goto cleanup_proto;
#endif
return ret;
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ cleanup_proto:
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+#endif
+ cleanup_icmpv4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ cleanup_udp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ cleanup_tcp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
cleanup_hooks:
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
-#endif
- cleanup_ipv4:
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- cleanup_icmp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- cleanup_udp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- cleanup_tcp:
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ cleanup_pernet:
+ unregister_pernet_subsys(&ipv4_net_ops);
cleanup_sockopt:
nf_unregister_sockopt(&so_getorigdst);
return ret;
@@ -439,19 +537,14 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
nf_conntrack_ipv4_compat_fini();
#endif
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ unregister_pernet_subsys(&ipv4_net_ops);
nf_unregister_sockopt(&so_getorigdst);
}
module_init(nf_conntrack_l3proto_ipv4_init);
module_exit(nf_conntrack_l3proto_ipv4_fini);
-
-void need_ipv4_conntrack(void)
-{
- return;
-}
-EXPORT_SYMBOL_GPL(need_ipv4_conntrack);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 37f8adb68c7..4c48e434bb1 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -2,6 +2,7 @@
*
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -20,6 +21,8 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
+#include <linux/export.h>
struct ct_iter_state {
struct seq_net_private p;
@@ -35,7 +38,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -48,13 +52,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
}
return head;
}
@@ -97,7 +102,7 @@ static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
if (ret)
- return ret;
+ return 0;
ret = seq_printf(s, "secctx=%s ", secctx);
@@ -217,7 +222,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -230,11 +236,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
@@ -411,12 +418,12 @@ static int __net_init ip_conntrack_net_init(struct net *net)
{
struct proc_dir_entry *proc, *proc_exp, *proc_stat;
- proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops);
+ proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
if (!proc)
goto err1;
- proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440,
- &ip_exp_file_ops);
+ proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
+ &ip_exp_file_ops);
if (!proc_exp)
goto err2;
@@ -427,9 +434,9 @@ static int __net_init ip_conntrack_net_init(struct net *net)
return 0;
err3:
- proc_net_remove(net, "ip_conntrack_expect");
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
err2:
- proc_net_remove(net, "ip_conntrack");
+ remove_proc_entry("ip_conntrack", net->proc_net);
err1:
return -ENOMEM;
}
@@ -437,8 +444,8 @@ err1:
static void __net_exit ip_conntrack_net_exit(struct net *net)
{
remove_proc_entry("ip_conntrack", net->proc_net_stat);
- proc_net_remove(net, "ip_conntrack_expect");
- proc_net_remove(net, "ip_conntrack");
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
+ remove_proc_entry("ip_conntrack", net->proc_net);
}
static struct pernet_operations ip_conntrack_net_ops = {
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 7404bde9599..a338dad41b7 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -1,5 +1,6 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -23,6 +24,11 @@
static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
+static inline struct nf_icmp_net *icmp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp;
+}
+
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct nf_conntrack_tuple *tuple)
{
@@ -75,25 +81,31 @@ static int icmp_print_tuple(struct seq_file *s,
ntohs(tuple->src.u.icmp.id));
}
+static unsigned int *icmp_get_timeouts(struct net *net)
+{
+ return &icmp_pernet(net)->timeout;
+}
+
/* Returns verdict for packet, or -1 for invalid. */
static int icmp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
u_int8_t pf,
- unsigned int hooknum)
+ unsigned int hooknum,
+ unsigned int *timeout)
{
/* Do not immediately delete the connection after the first
successful reply to avoid excessive conntrackd traffic
and also to handle correctly ICMP echo reply duplicates. */
- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, unsigned int *timeouts)
{
static const u_int8_t valid_new[] = {
[ICMP_ECHO] = 1,
@@ -160,7 +172,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
/* Update skb to refer to this connection */
skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
skb->nfctinfo = *ctinfo;
- return -NF_ACCEPT;
+ return NF_ACCEPT;
}
/* Small and modified version of icmp_rcv */
@@ -176,8 +188,8 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
if (icmph == NULL) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmp: short packet ");
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
+ NULL, "nf_ct_icmp: short packet ");
return -NF_ACCEPT;
}
@@ -185,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip_checksum(skb, hooknum, dataoff, 0)) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
"nf_ct_icmp: bad HW ICMP checksum ");
return -NF_ACCEPT;
}
@@ -198,7 +210,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
*/
if (icmph->type > NR_ICMP_TYPES) {
if (LOG_INVALID(net, IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
"nf_ct_icmp: invalid ICMP type ");
return -NF_ACCEPT;
}
@@ -222,10 +234,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
static int icmp_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *t)
{
- NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id);
- NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type);
- NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code);
-
+ if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
+ goto nla_put_failure;
return 0;
nla_put_failure:
@@ -263,12 +275,50 @@ static int icmp_nlattr_tuple_size(void)
}
#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeout = data;
+ struct nf_icmp_net *in = icmp_pernet(net);
+
+ if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
+ *timeout =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
+ } else {
+ /* Set default ICMP timeout. */
+ *timeout = in->timeout;
+ }
+ return 0;
+}
+
+static int
+icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeout = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
+ [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *icmp_sysctl_header;
static struct ctl_table icmp_sysctl_table[] = {
{
.procname = "nf_conntrack_icmp_timeout",
- .data = &nf_ct_icmp_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -279,7 +329,6 @@ static struct ctl_table icmp_sysctl_table[] = {
static struct ctl_table icmp_compat_sysctl_table[] = {
{
.procname = "ip_conntrack_icmp_timeout",
- .data = &nf_ct_icmp_timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -289,6 +338,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
+static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+ pn->ctl_table = kmemdup(icmp_sysctl_table,
+ sizeof(icmp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &in->timeout;
+#endif
+ return 0;
+}
+
+static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
+ sizeof(icmp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &in->timeout;
+#endif
+#endif
+ return 0;
+}
+
+static int icmp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_icmp_net *in = icmp_pernet(net);
+ struct nf_proto_net *pn = &in->pn;
+
+ in->timeout = nf_ct_icmp_timeout;
+
+ ret = icmp_kmemdup_compat_sysctl_table(pn, in);
+ if (ret < 0)
+ return ret;
+
+ ret = icmp_kmemdup_sysctl_table(pn, in);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+
+ return ret;
+}
+
+static struct nf_proto_net *icmp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp.pn;
+}
+
struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
{
.l3proto = PF_INET,
@@ -298,6 +403,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.invert_tuple = icmp_invert_tuple,
.print_tuple = icmp_print_tuple,
.packet = icmp_packet,
+ .get_timeouts = icmp_get_timeouts,
.new = icmp_new,
.error = icmp_error,
.destroy = NULL,
@@ -308,11 +414,15 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.nlattr_to_tuple = icmp_nlattr_to_tuple,
.nla_policy = icmp_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_header = &icmp_sysctl_header,
- .ctl_table = icmp_sysctl_table,
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
- .ctl_compat_table = icmp_compat_sysctl_table,
-#endif
-#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = icmp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = icmp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_ICMP_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = icmp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = icmp_init_net,
+ .get_net_proto = icmp_get_net_proto,
};
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index f3a9b42b16c..b8f6381c7d0 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,7 +22,6 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-/* Returns new sk_buff, or NULL */
static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
{
int err;
@@ -33,8 +32,10 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
err = ip_defrag(skb, user);
local_bh_enable();
- if (!err)
+ if (!err) {
ip_send_check(ip_hdr(skb));
+ skb->ignore_df = 1;
+ }
return err;
}
@@ -60,7 +61,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
return IP_DEFRAG_CONNTRACK_OUT + zone;
}
-static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
+static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -82,8 +83,10 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
#endif
#endif
/* Gather fragments. */
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
+ if (ip_is_fragment(ip_hdr(skb))) {
+ enum ip_defrag_users user =
+ nf_ct_defrag_user(ops->hooknum, skb);
+
if (nf_ct_ipv4_gather_frags(skb, user))
return NF_STOLEN;
}
@@ -94,14 +97,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
{
.hook = ipv4_conntrack_defrag,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_defrag,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
deleted file mode 100644
index 0f23b3f06df..00000000000
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Amanda extension for TCP NAT alteration.
- * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
- * based on a copy of HW's ip_nat_irc.c as well as other modules
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/udp.h>
-
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_amanda.h>
-
-MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
-MODULE_DESCRIPTION("Amanda NAT helper");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_amanda");
-
-static unsigned int help(struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- unsigned int matchoff,
- unsigned int matchlen,
- struct nf_conntrack_expect *exp)
-{
- char buffer[sizeof("65535")];
- u_int16_t port;
- unsigned int ret;
-
- /* Connection comes from client. */
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->dir = IP_CT_DIR_ORIGINAL;
-
- /* When you see the packet, we need to NAT it the same as the
- * this one (ie. same IP: it will be TCP and master is UDP). */
- exp->expectfn = nf_nat_follow_master;
-
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
- if (port == 0)
- return NF_DROP;
-
- sprintf(buffer, "%u", port);
- ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
- matchoff, matchlen,
- buffer, strlen(buffer));
- if (ret != NF_ACCEPT)
- nf_ct_unexpect_related(exp);
- return ret;
-}
-
-static void __exit nf_nat_amanda_fini(void)
-{
- rcu_assign_pointer(nf_nat_amanda_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init nf_nat_amanda_init(void)
-{
- BUG_ON(nf_nat_amanda_hook != NULL);
- rcu_assign_pointer(nf_nat_amanda_hook, help);
- return 0;
-}
-
-module_init(nf_nat_amanda_init);
-module_exit(nf_nat_amanda_fini);
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
deleted file mode 100644
index c04787ce1a7..00000000000
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ /dev/null
@@ -1,774 +0,0 @@
-/* NAT for netfilter; shared with compatibility layer. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/gfp.h>
-#include <net/checksum.h>
-#include <net/icmp.h>
-#include <net/ip.h>
-#include <net/tcp.h> /* For tcp_prot in getorigdst */
-#include <linux/icmp.h>
-#include <linux/udp.h>
-#include <linux/jhash.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-
-static DEFINE_SPINLOCK(nf_nat_lock);
-
-static struct nf_conntrack_l3proto *l3proto __read_mostly;
-
-#define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
- __read_mostly;
-
-static inline const struct nf_nat_protocol *
-__nf_nat_proto_find(u_int8_t protonum)
-{
- return rcu_dereference(nf_nat_protos[protonum]);
-}
-
-/* We keep an extra hash for each conntrack, for fast searching. */
-static inline unsigned int
-hash_by_src(const struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
-{
- unsigned int hash;
-
- /* Original src, to ensure we map it consistently if poss. */
- hash = jhash_3words((__force u32)tuple->src.u3.ip,
- (__force u32)tuple->src.u.all ^ zone,
- tuple->dst.protonum, 0);
- return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
-}
-
-/* Is this tuple already taken? (not by us) */
-int
-nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
- const struct nf_conn *ignored_conntrack)
-{
- /* Conntrack tracking doesn't keep track of outgoing tuples; only
- incoming ones. NAT means they don't have a fixed mapping,
- so we invert the tuple and look for the incoming reply.
-
- We could keep a separate hash if this proves too slow. */
- struct nf_conntrack_tuple reply;
-
- nf_ct_invert_tuplepr(&reply, tuple);
- return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
-}
-EXPORT_SYMBOL(nf_nat_used_tuple);
-
-/* If we source map this tuple so reply looks like reply_tuple, will
- * that meet the constraints of range. */
-static int
-in_range(const struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range)
-{
- const struct nf_nat_protocol *proto;
- int ret = 0;
-
- /* If we are supposed to map IPs, then we must be in the
- range specified, otherwise let this drag us onto a new src IP. */
- if (range->flags & IP_NAT_RANGE_MAP_IPS) {
- if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
- ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
- return 0;
- }
-
- rcu_read_lock();
- proto = __nf_nat_proto_find(tuple->dst.protonum);
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
- proto->in_range(tuple, IP_NAT_MANIP_SRC,
- &range->min, &range->max))
- ret = 1;
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline int
-same_src(const struct nf_conn *ct,
- const struct nf_conntrack_tuple *tuple)
-{
- const struct nf_conntrack_tuple *t;
-
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- return (t->dst.protonum == tuple->dst.protonum &&
- t->src.u3.ip == tuple->src.u3.ip &&
- t->src.u.all == tuple->src.u.all);
-}
-
-/* Only called for SRC manip */
-static int
-find_appropriate_src(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple,
- struct nf_conntrack_tuple *result,
- const struct nf_nat_range *range)
-{
- unsigned int h = hash_by_src(net, zone, tuple);
- const struct nf_conn_nat *nat;
- const struct nf_conn *ct;
- const struct hlist_node *n;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
- ct = nat->ct;
- if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
- /* Copy source part from reply tuple. */
- nf_ct_invert_tuplepr(result,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- result->dst = tuple->dst;
-
- if (in_range(result, range)) {
- rcu_read_unlock();
- return 1;
- }
- }
- }
- rcu_read_unlock();
- return 0;
-}
-
-/* For [FUTURE] fragmentation handling, we want the least-used
- src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
- if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
- 1-65535, we don't do pro-rata allocation based on ports; we choose
- the ip with the lowest src-ip/dst-ip/proto usage.
-*/
-static void
-find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- const struct nf_conn *ct,
- enum nf_nat_manip_type maniptype)
-{
- __be32 *var_ipp;
- /* Host order */
- u_int32_t minip, maxip, j;
-
- /* No IP mapping? Do nothing. */
- if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
- return;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- var_ipp = &tuple->src.u3.ip;
- else
- var_ipp = &tuple->dst.u3.ip;
-
- /* Fast path: only one choice. */
- if (range->min_ip == range->max_ip) {
- *var_ipp = range->min_ip;
- return;
- }
-
- /* Hashing source and destination IPs gives a fairly even
- * spread in practice (if there are a small number of IPs
- * involved, there usually aren't that many connections
- * anyway). The consistency means that servers see the same
- * client coming from the same IP (some Internet Banking sites
- * like this), even across reboots. */
- minip = ntohl(range->min_ip);
- maxip = ntohl(range->max_ip);
- j = jhash_2words((__force u32)tuple->src.u3.ip,
- range->flags & IP_NAT_RANGE_PERSISTENT ?
- 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
- j = ((u64)j * (maxip - minip + 1)) >> 32;
- *var_ipp = htonl(minip + j);
-}
-
-/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
- * we change the source to map into the range. For NF_INET_PRE_ROUTING
- * and NF_INET_LOCAL_OUT, we change the destination to map into the
- * range. It might not be possible to get a unique tuple, but we try.
- * At worst (or if we race), we will end up with a final duplicate in
- * __ip_conntrack_confirm and drop the packet. */
-static void
-get_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig_tuple,
- const struct nf_nat_range *range,
- struct nf_conn *ct,
- enum nf_nat_manip_type maniptype)
-{
- struct net *net = nf_ct_net(ct);
- const struct nf_nat_protocol *proto;
- u16 zone = nf_ct_zone(ct);
-
- /* 1) If this srcip/proto/src-proto-part is currently mapped,
- and that same mapping gives a unique tuple within the given
- range, use that.
-
- This is only required for source (ie. NAT/masq) mappings.
- So far, we don't do local source mappings, so multiple
- manips not an issue. */
- if (maniptype == IP_NAT_MANIP_SRC &&
- !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
- if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
- pr_debug("get_unique_tuple: Found current src map\n");
- if (!nf_nat_used_tuple(tuple, ct))
- return;
- }
- }
-
- /* 2) Select the least-used IP/proto combination in the given
- range. */
- *tuple = *orig_tuple;
- find_best_ips_proto(zone, tuple, range, ct, maniptype);
-
- /* 3) The per-protocol part of the manip is made to map into
- the range to make a unique tuple. */
-
- rcu_read_lock();
- proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
-
- /* Only bother mapping if it's not already in range and unique */
- if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
- if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
- if (proto->in_range(tuple, maniptype, &range->min,
- &range->max) &&
- (range->min.all == range->max.all ||
- !nf_nat_used_tuple(tuple, ct)))
- goto out;
- } else if (!nf_nat_used_tuple(tuple, ct)) {
- goto out;
- }
- }
-
- /* Last change: get protocol to try to obtain unique tuple. */
- proto->unique_tuple(tuple, range, maniptype, ct);
-out:
- rcu_read_unlock();
-}
-
-unsigned int
-nf_nat_setup_info(struct nf_conn *ct,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype)
-{
- struct net *net = nf_ct_net(ct);
- struct nf_conntrack_tuple curr_tuple, new_tuple;
- struct nf_conn_nat *nat;
- int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
-
- /* nat helper or nfctnetlink also setup binding */
- nat = nfct_nat(ct);
- if (!nat) {
- nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
- if (nat == NULL) {
- pr_debug("failed to add NAT extension\n");
- return NF_ACCEPT;
- }
- }
-
- NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC ||
- maniptype == IP_NAT_MANIP_DST);
- BUG_ON(nf_nat_initialized(ct, maniptype));
-
- /* What we've got will look like inverse of reply. Normally
- this is what is in the conntrack, except for prior
- manipulations (future optimization: if num_manips == 0,
- orig_tp =
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
- nf_ct_invert_tuplepr(&curr_tuple,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
- get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
-
- if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
- struct nf_conntrack_tuple reply;
-
- /* Alter conntrack table so will recognize replies. */
- nf_ct_invert_tuplepr(&reply, &new_tuple);
- nf_conntrack_alter_reply(ct, &reply);
-
- /* Non-atomic: we own this at the moment. */
- if (maniptype == IP_NAT_MANIP_SRC)
- ct->status |= IPS_SRC_NAT;
- else
- ct->status |= IPS_DST_NAT;
- }
-
- /* Place in source hash if this is the first time. */
- if (have_to_hash) {
- unsigned int srchash;
-
- srchash = hash_by_src(net, nf_ct_zone(ct),
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- spin_lock_bh(&nf_nat_lock);
- /* nf_conntrack_alter_reply might re-allocate exntension aera */
- nat = nfct_nat(ct);
- nat->ct = ct;
- hlist_add_head_rcu(&nat->bysource,
- &net->ipv4.nat_bysource[srchash]);
- spin_unlock_bh(&nf_nat_lock);
- }
-
- /* It's done. */
- if (maniptype == IP_NAT_MANIP_DST)
- set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
- else
- set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
-
- return NF_ACCEPT;
-}
-EXPORT_SYMBOL(nf_nat_setup_info);
-
-/* Returns true if succeeded. */
-static bool
-manip_pkt(u_int16_t proto,
- struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *target,
- enum nf_nat_manip_type maniptype)
-{
- struct iphdr *iph;
- const struct nf_nat_protocol *p;
-
- if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
- return false;
-
- iph = (void *)skb->data + iphdroff;
-
- /* Manipulate protcol part. */
-
- /* rcu_read_lock()ed by nf_hook_slow */
- p = __nf_nat_proto_find(proto);
- if (!p->manip_pkt(skb, iphdroff, target, maniptype))
- return false;
-
- iph = (void *)skb->data + iphdroff;
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
- iph->saddr = target->src.u3.ip;
- } else {
- csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
- iph->daddr = target->dst.u3.ip;
- }
- return true;
-}
-
-/* Do packet manipulations according to nf_nat_setup_info. */
-unsigned int nf_nat_packet(struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum,
- struct sk_buff *skb)
-{
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- unsigned long statusbit;
- enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
-
- if (mtype == IP_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply dir. */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- /* Non-atomic: these bits don't change. */
- if (ct->status & statusbit) {
- struct nf_conntrack_tuple target;
-
- /* We are aiming to look like inverse of other direction. */
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-
- if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
- return NF_DROP;
- }
- return NF_ACCEPT;
-}
-EXPORT_SYMBOL_GPL(nf_nat_packet);
-
-/* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int nf_nat_icmp_reply_translation(struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum,
- struct sk_buff *skb)
-{
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } *inside;
- const struct nf_conntrack_l4proto *l4proto;
- struct nf_conntrack_tuple inner, target;
- int hdrlen = ip_hdrlen(skb);
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- unsigned long statusbit;
- enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
-
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
- return 0;
-
- inside = (void *)skb->data + hdrlen;
-
- /* We're actually going to mangle it beyond trivial checksum
- adjustment, so make sure the current checksum is correct. */
- if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
- return 0;
-
- /* Must be RELATED */
- NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
- skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
-
- /* Redirects on non-null nats must be dropped, else they'll
- start talking to each other without our translation, and be
- confused... --RR */
- if (inside->icmp.type == ICMP_REDIRECT) {
- /* If NAT isn't finished, assume it and drop. */
- if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
- return 0;
-
- if (ct->status & IPS_NAT_MASK)
- return 0;
- }
-
- if (manip == IP_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply dir. */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- if (!(ct->status & statusbit))
- return 1;
-
- pr_debug("icmp_reply_translation: translating error %p manip %u "
- "dir %s\n", skb, manip,
- dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
-
- /* rcu_read_lock()ed by nf_hook_slow */
- l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
-
- if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
- (hdrlen +
- sizeof(struct icmphdr) + inside->ip.ihl * 4),
- (u_int16_t)AF_INET, inside->ip.protocol,
- &inner, l3proto, l4proto))
- return 0;
-
- /* Change inner back to look like incoming packet. We do the
- opposite manip on this hook to normal, because it might not
- pass all hooks (locally-generated ICMP). Consider incoming
- packet: PREROUTING (DST manip), routing produces ICMP, goes
- through POSTROUTING (which must correct the DST manip). */
- if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
- &ct->tuplehash[!dir].tuple, !manip))
- return 0;
-
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- /* Reloading "inside" here since manip_pkt inner. */
- inside = (void *)skb->data + hdrlen;
- inside->icmp.checksum = 0;
- inside->icmp.checksum =
- csum_fold(skb_checksum(skb, hdrlen,
- skb->len - hdrlen, 0));
- }
-
- /* Change outer to look the reply to an incoming packet
- * (proto 0 means don't invert per-proto part). */
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- if (!manip_pkt(0, skb, 0, &target, manip))
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
-
-/* Protocol registration. */
-int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
-{
- int ret = 0;
-
- spin_lock_bh(&nf_nat_lock);
- if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
- ret = -EBUSY;
- goto out;
- }
- rcu_assign_pointer(nf_nat_protos[proto->protonum], proto);
- out:
- spin_unlock_bh(&nf_nat_lock);
- return ret;
-}
-EXPORT_SYMBOL(nf_nat_protocol_register);
-
-/* Noone stores the protocol anywhere; simply delete it. */
-void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
-{
- spin_lock_bh(&nf_nat_lock);
- rcu_assign_pointer(nf_nat_protos[proto->protonum],
- &nf_nat_unknown_protocol);
- spin_unlock_bh(&nf_nat_lock);
- synchronize_rcu();
-}
-EXPORT_SYMBOL(nf_nat_protocol_unregister);
-
-/* Noone using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
- struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
-
- if (nat == NULL || nat->ct == NULL)
- return;
-
- NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);
-
- spin_lock_bh(&nf_nat_lock);
- hlist_del_rcu(&nat->bysource);
- spin_unlock_bh(&nf_nat_lock);
-}
-
-static void nf_nat_move_storage(void *new, void *old)
-{
- struct nf_conn_nat *new_nat = new;
- struct nf_conn_nat *old_nat = old;
- struct nf_conn *ct = old_nat->ct;
-
- if (!ct || !(ct->status & IPS_NAT_DONE_MASK))
- return;
-
- spin_lock_bh(&nf_nat_lock);
- new_nat->ct = ct;
- hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
- spin_unlock_bh(&nf_nat_lock);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
- .len = sizeof(struct nf_conn_nat),
- .align = __alignof__(struct nf_conn_nat),
- .destroy = nf_nat_cleanup_conntrack,
- .move = nf_nat_move_storage,
- .id = NF_CT_EXT_NAT,
- .flags = NF_CT_EXT_F_PREALLOC,
-};
-
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static const struct nf_nat_protocol *
-nf_nat_proto_find_get(u_int8_t protonum)
-{
- const struct nf_nat_protocol *p;
-
- rcu_read_lock();
- p = __nf_nat_proto_find(protonum);
- if (!try_module_get(p->me))
- p = &nf_nat_unknown_protocol;
- rcu_read_unlock();
-
- return p;
-}
-
-static void
-nf_nat_proto_put(const struct nf_nat_protocol *p)
-{
- module_put(p->me);
-}
-
-static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
- [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
- [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
-};
-
-static int nfnetlink_parse_nat_proto(struct nlattr *attr,
- const struct nf_conn *ct,
- struct nf_nat_range *range)
-{
- struct nlattr *tb[CTA_PROTONAT_MAX+1];
- const struct nf_nat_protocol *npt;
- int err;
-
- err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
- if (err < 0)
- return err;
-
- npt = nf_nat_proto_find_get(nf_ct_protonum(ct));
- if (npt->nlattr_to_range)
- err = npt->nlattr_to_range(tb, range);
- nf_nat_proto_put(npt);
- return err;
-}
-
-static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
- [CTA_NAT_MINIP] = { .type = NLA_U32 },
- [CTA_NAT_MAXIP] = { .type = NLA_U32 },
-};
-
-static int
-nfnetlink_parse_nat(const struct nlattr *nat,
- const struct nf_conn *ct, struct nf_nat_range *range)
-{
- struct nlattr *tb[CTA_NAT_MAX+1];
- int err;
-
- memset(range, 0, sizeof(*range));
-
- err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
- if (err < 0)
- return err;
-
- if (tb[CTA_NAT_MINIP])
- range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
-
- if (!tb[CTA_NAT_MAXIP])
- range->max_ip = range->min_ip;
- else
- range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
-
- if (range->min_ip)
- range->flags |= IP_NAT_RANGE_MAP_IPS;
-
- if (!tb[CTA_NAT_PROTO])
- return 0;
-
- err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
- if (err < 0)
- return err;
-
- return 0;
-}
-
-static int
-nfnetlink_parse_nat_setup(struct nf_conn *ct,
- enum nf_nat_manip_type manip,
- const struct nlattr *attr)
-{
- struct nf_nat_range range;
-
- if (nfnetlink_parse_nat(attr, ct, &range) < 0)
- return -EINVAL;
- if (nf_nat_initialized(ct, manip))
- return -EEXIST;
-
- return nf_nat_setup_info(ct, &range, manip);
-}
-#else
-static int
-nfnetlink_parse_nat_setup(struct nf_conn *ct,
- enum nf_nat_manip_type manip,
- const struct nlattr *attr)
-{
- return -EOPNOTSUPP;
-}
-#endif
-
-static int __net_init nf_nat_net_init(struct net *net)
-{
- /* Leave them the same for the moment. */
- net->ipv4.nat_htable_size = net->ct.htable_size;
- net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
- &net->ipv4.nat_vmalloced, 0);
- if (!net->ipv4.nat_bysource)
- return -ENOMEM;
- return 0;
-}
-
-/* Clear NAT section of all conntracks, in case we're loaded again. */
-static int clean_nat(struct nf_conn *i, void *data)
-{
- struct nf_conn_nat *nat = nfct_nat(i);
-
- if (!nat)
- return 0;
- memset(nat, 0, sizeof(*nat));
- i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
- return 0;
-}
-
-static void __net_exit nf_nat_net_exit(struct net *net)
-{
- nf_ct_iterate_cleanup(net, &clean_nat, NULL);
- synchronize_rcu();
- nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
- net->ipv4.nat_htable_size);
-}
-
-static struct pernet_operations nf_nat_net_ops = {
- .init = nf_nat_net_init,
- .exit = nf_nat_net_exit,
-};
-
-static int __init nf_nat_init(void)
-{
- size_t i;
- int ret;
-
- need_ipv4_conntrack();
-
- ret = nf_ct_extend_register(&nat_extend);
- if (ret < 0) {
- printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
- return ret;
- }
-
- ret = register_pernet_subsys(&nf_nat_net_ops);
- if (ret < 0)
- goto cleanup_extend;
-
- /* Sew in builtin protocols. */
- spin_lock_bh(&nf_nat_lock);
- for (i = 0; i < MAX_IP_NAT_PROTO; i++)
- rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol);
- rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
- rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
- rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
- spin_unlock_bh(&nf_nat_lock);
-
- /* Initialize fake conntrack so that NAT will skip it */
- nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
-
- l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
-
- BUG_ON(nf_nat_seq_adjust_hook != NULL);
- rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
- BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
- rcu_assign_pointer(nfnetlink_parse_nat_setup_hook,
- nfnetlink_parse_nat_setup);
- BUG_ON(nf_ct_nat_offset != NULL);
- rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset);
- return 0;
-
- cleanup_extend:
- nf_ct_extend_unregister(&nat_extend);
- return ret;
-}
-
-static void __exit nf_nat_cleanup(void)
-{
- unregister_pernet_subsys(&nf_nat_net_ops);
- nf_ct_l3proto_put(l3proto);
- nf_ct_extend_unregister(&nat_extend);
- rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
- rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL);
- rcu_assign_pointer(nf_ct_nat_offset, NULL);
- synchronize_net();
-}
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("nf-nat-ipv4");
-
-module_init(nf_nat_init);
-module_exit(nf_nat_cleanup);
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
deleted file mode 100644
index dc73abb3fe2..00000000000
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/* FTP extension for TCP NAT alteration. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_ftp.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
-MODULE_DESCRIPTION("ftp NAT helper");
-MODULE_ALIAS("ip_nat_ftp");
-
-/* FIXME: Time out? --RR */
-
-static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
- char *buffer, size_t buflen,
- __be32 addr, u16 port)
-{
- switch (type) {
- case NF_CT_FTP_PORT:
- case NF_CT_FTP_PASV:
- return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
- ((unsigned char *)&addr)[0],
- ((unsigned char *)&addr)[1],
- ((unsigned char *)&addr)[2],
- ((unsigned char *)&addr)[3],
- port >> 8,
- port & 0xFF);
- case NF_CT_FTP_EPRT:
- return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
- case NF_CT_FTP_EPSV:
- return snprintf(buffer, buflen, "|||%u|", port);
- }
-
- return 0;
-}
-
-/* So, this packet has hit the connection tracking matching code.
- Mangle it, and change the expectation to match the new version. */
-static unsigned int nf_nat_ftp(struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- enum nf_ct_ftp_type type,
- unsigned int matchoff,
- unsigned int matchlen,
- struct nf_conntrack_expect *exp)
-{
- __be32 newip;
- u_int16_t port;
- int dir = CTINFO2DIR(ctinfo);
- struct nf_conn *ct = exp->master;
- char buffer[sizeof("|1|255.255.255.255|65535|")];
- unsigned int buflen;
-
- pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
-
- /* Connection will come from wherever this packet goes, hence !dir */
- newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->dir = !dir;
-
- /* When you see the packet, we need to NAT it the same as the
- * this one. */
- exp->expectfn = nf_nat_follow_master;
-
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
- if (port == 0)
- return NF_DROP;
-
- buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
- if (!buflen)
- goto out;
-
- pr_debug("calling nf_nat_mangle_tcp_packet\n");
-
- if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
- matchlen, buffer, buflen))
- goto out;
-
- return NF_ACCEPT;
-
-out:
- nf_ct_unexpect_related(exp);
- return NF_DROP;
-}
-
-static void __exit nf_nat_ftp_fini(void)
-{
- rcu_assign_pointer(nf_nat_ftp_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init nf_nat_ftp_init(void)
-{
- BUG_ON(nf_nat_ftp_hook != NULL);
- rcu_assign_pointer(nf_nat_ftp_hook, nf_nat_ftp);
- return 0;
-}
-
-/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
-{
- printk(KERN_INFO KBUILD_MODNAME
- ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
- return 0;
-}
-module_param_call(ports, warn_set, NULL, NULL, 0);
-
-module_init(nf_nat_ftp_init);
-module_exit(nf_nat_ftp_fini);
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 790f3160e01..574f7ebba0b 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -2,6 +2,7 @@
* H.323 extension for NAT alteration.
*
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This source code is licensed under General Public License version 2.
*
@@ -15,13 +16,12 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_h323.h>
/****************************************************************************/
-static int set_addr(struct sk_buff *skb,
+static int set_addr(struct sk_buff *skb, unsigned int protoff,
unsigned char **data, int dataoff,
unsigned int addroff, __be32 ip, __be16 port)
{
@@ -40,11 +40,9 @@ static int set_addr(struct sk_buff *skb,
if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
- addroff, sizeof(buf),
+ protoff, addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
- " error\n");
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
return -1;
}
@@ -56,11 +54,9 @@ static int set_addr(struct sk_buff *skb,
*data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
} else {
if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
- addroff, sizeof(buf),
+ protoff, addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
- " error\n");
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
return -1;
}
/* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
@@ -73,22 +69,22 @@ static int set_addr(struct sk_buff *skb,
}
/****************************************************************************/
-static int set_h225_addr(struct sk_buff *skb,
+static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
unsigned char **data, int dataoff,
TransportAddress *taddr,
union nf_inet_addr *addr, __be16 port)
{
- return set_addr(skb, data, dataoff, taddr->ipAddress.ip,
+ return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
addr->ip, port);
}
/****************************************************************************/
-static int set_h245_addr(struct sk_buff *skb,
+static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr,
union nf_inet_addr *addr, __be16 port)
{
- return set_addr(skb, data, dataoff,
+ return set_addr(skb, protoff, data, dataoff,
taddr->unicastAddress.iPAddress.network,
addr->ip, port);
}
@@ -96,10 +92,10 @@ static int set_h245_addr(struct sk_buff *skb,
/****************************************************************************/
static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data,
+ unsigned int protoff, unsigned char **data,
TransportAddress *taddr, int count)
{
- const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ const struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
int i;
__be16 port;
@@ -122,7 +118,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
&addr.ip, port,
&ct->tuplehash[!dir].tuple.dst.u3.ip,
info->sig_port[!dir]);
- return set_h225_addr(skb, data, 0, &taddr[i],
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
&ct->tuplehash[!dir].
tuple.dst.u3,
info->sig_port[!dir]);
@@ -133,7 +130,8 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
&addr.ip, port,
&ct->tuplehash[!dir].tuple.src.u3.ip,
info->sig_port[!dir]);
- return set_h225_addr(skb, data, 0, &taddr[i],
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
&ct->tuplehash[!dir].
tuple.src.u3,
info->sig_port[!dir]);
@@ -147,7 +145,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data,
+ unsigned int protoff, unsigned char **data,
TransportAddress *taddr, int count)
{
int dir = CTINFO2DIR(ctinfo);
@@ -163,7 +161,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
&addr.ip, ntohs(port),
&ct->tuplehash[!dir].tuple.dst.u3.ip,
ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
- return set_h225_addr(skb, data, 0, &taddr[i],
+ return set_h225_addr(skb, protoff, data, 0, &taddr[i],
&ct->tuplehash[!dir].tuple.dst.u3,
ct->tuplehash[!dir].tuple.
dst.u.udp.port);
@@ -176,13 +174,13 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
+ unsigned int protoff, unsigned char **data, int dataoff,
H245_TransportAddress *taddr,
__be16 port, __be16 rtp_port,
struct nf_conntrack_expect *rtp_exp,
struct nf_conntrack_expect *rtcp_exp)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
int i;
u_int16_t nated_port;
@@ -214,8 +212,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
/* Run out of expectations */
if (i >= H323_RTP_CHANNEL_MAX) {
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of expectations\n");
+ net_notice_ratelimited("nf_nat_h323: out of expectations\n");
return 0;
}
@@ -232,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
ret = nf_ct_expect_related(rtcp_exp);
if (ret == 0)
break;
- else if (ret != -EBUSY) {
+ else if (ret == -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
nf_ct_unexpect_related(rtp_exp);
nated_port = 0;
break;
@@ -244,13 +244,12 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of RTP ports\n");
+ net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
return 0;
}
/* Modify signal */
- if (set_h245_addr(skb, data, dataoff, taddr,
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
&ct->tuplehash[!dir].tuple.dst.u3,
htons((port & htons(1)) ? nated_port + 1 :
nated_port)) == 0) {
@@ -281,7 +280,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
+ unsigned int protoff, unsigned char **data, int dataoff,
H245_TransportAddress *taddr, __be16 port,
struct nf_conntrack_expect *exp)
{
@@ -308,13 +307,12 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_h323: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
return 0;
}
/* Modify signal */
- if (set_h245_addr(skb, data, dataoff, taddr,
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
&ct->tuplehash[!dir].tuple.dst.u3,
htons(nated_port)) < 0) {
nf_ct_unexpect_related(exp);
@@ -333,11 +331,11 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
/****************************************************************************/
static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, int dataoff,
+ unsigned int protoff, unsigned char **data, int dataoff,
TransportAddress *taddr, __be16 port,
struct nf_conntrack_expect *exp)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
u_int16_t nated_port = ntohs(port);
@@ -365,13 +363,12 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_q931: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
}
/* Modify signal */
- if (set_h225_addr(skb, data, dataoff, taddr,
+ if (set_h225_addr(skb, protoff, data, dataoff, taddr,
&ct->tuplehash[!dir].tuple.dst.u3,
htons(nated_port)) == 0) {
/* Save ports */
@@ -409,25 +406,27 @@ static void ip_nat_q931_expect(struct nf_conn *new,
BUG_ON(new->status & IPS_NAT_DONE_MASK);
/* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
- nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
- range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
- range.min = range.max = this->saved_proto;
- range.min_ip = range.max_ip =
- new->master->tuplehash[!this->dir].tuple.src.u3.ip;
- nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr =
+ new->master->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
}
/****************************************************************************/
static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- unsigned char **data, TransportAddress *taddr, int idx,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int idx,
__be16 port, struct nf_conntrack_expect *exp)
{
- struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
int dir = CTINFO2DIR(ctinfo);
u_int16_t nated_port = ntohs(port);
union nf_inet_addr addr;
@@ -456,13 +455,12 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_ras: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
return 0;
}
/* Modify signal */
- if (set_h225_addr(skb, data, 0, &taddr[idx],
+ if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
&ct->tuplehash[!dir].tuple.dst.u3,
htons(nated_port)) == 0) {
/* Save ports */
@@ -473,7 +471,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
if (idx > 0 &&
get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
(ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
- set_h225_addr(skb, data, 0, &taddr[0],
+ set_h225_addr(skb, protoff, data, 0, &taddr[0],
&ct->tuplehash[!dir].tuple.dst.u3,
info->sig_port[!dir]);
}
@@ -502,20 +500,22 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new,
BUG_ON(new->status & IPS_NAT_DONE_MASK);
/* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
- nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
- range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
- range.min = range.max = this->saved_proto;
- range.min_ip = range.max_ip = this->saved_ip;
- nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr = this->saved_addr;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
}
/****************************************************************************/
static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
unsigned char **data, int dataoff,
TransportAddress *taddr, __be16 port,
struct nf_conntrack_expect *exp)
@@ -524,7 +524,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
u_int16_t nated_port;
/* Set expectations for NAT */
- exp->saved_ip = exp->tuple.dst.u3.ip;
+ exp->saved_addr = exp->tuple.dst.u3;
exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
exp->expectfn = ip_nat_callforwarding_expect;
@@ -545,13 +545,12 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
}
if (nated_port == 0) { /* No port available */
- if (net_ratelimit())
- pr_notice("nf_nat_q931: out of TCP ports\n");
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
}
/* Modify signal */
- if (!set_h225_addr(skb, data, dataoff, taddr,
+ if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
&ct->tuplehash[!dir].tuple.dst.u3,
htons(nated_port)) == 0) {
nf_ct_unexpect_related(exp);
@@ -568,6 +567,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
return 0;
}
+static struct nf_ct_helper_expectfn q931_nat = {
+ .name = "Q.931",
+ .expectfn = ip_nat_q931_expect,
+};
+
+static struct nf_ct_helper_expectfn callforwarding_nat = {
+ .name = "callforwarding",
+ .expectfn = ip_nat_callforwarding_expect,
+};
+
/****************************************************************************/
static int __init init(void)
{
@@ -581,30 +590,34 @@ static int __init init(void)
BUG_ON(nat_callforwarding_hook != NULL);
BUG_ON(nat_q931_hook != NULL);
- rcu_assign_pointer(set_h245_addr_hook, set_h245_addr);
- rcu_assign_pointer(set_h225_addr_hook, set_h225_addr);
- rcu_assign_pointer(set_sig_addr_hook, set_sig_addr);
- rcu_assign_pointer(set_ras_addr_hook, set_ras_addr);
- rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp);
- rcu_assign_pointer(nat_t120_hook, nat_t120);
- rcu_assign_pointer(nat_h245_hook, nat_h245);
- rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding);
- rcu_assign_pointer(nat_q931_hook, nat_q931);
+ RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr);
+ RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr);
+ RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr);
+ RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr);
+ RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp);
+ RCU_INIT_POINTER(nat_t120_hook, nat_t120);
+ RCU_INIT_POINTER(nat_h245_hook, nat_h245);
+ RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding);
+ RCU_INIT_POINTER(nat_q931_hook, nat_q931);
+ nf_ct_helper_expectfn_register(&q931_nat);
+ nf_ct_helper_expectfn_register(&callforwarding_nat);
return 0;
}
/****************************************************************************/
static void __exit fini(void)
{
- rcu_assign_pointer(set_h245_addr_hook, NULL);
- rcu_assign_pointer(set_h225_addr_hook, NULL);
- rcu_assign_pointer(set_sig_addr_hook, NULL);
- rcu_assign_pointer(set_ras_addr_hook, NULL);
- rcu_assign_pointer(nat_rtp_rtcp_hook, NULL);
- rcu_assign_pointer(nat_t120_hook, NULL);
- rcu_assign_pointer(nat_h245_hook, NULL);
- rcu_assign_pointer(nat_callforwarding_hook, NULL);
- rcu_assign_pointer(nat_q931_hook, NULL);
+ RCU_INIT_POINTER(set_h245_addr_hook, NULL);
+ RCU_INIT_POINTER(set_h225_addr_hook, NULL);
+ RCU_INIT_POINTER(set_sig_addr_hook, NULL);
+ RCU_INIT_POINTER(set_ras_addr_hook, NULL);
+ RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL);
+ RCU_INIT_POINTER(nat_t120_hook, NULL);
+ RCU_INIT_POINTER(nat_h245_hook, NULL);
+ RCU_INIT_POINTER(nat_callforwarding_hook, NULL);
+ RCU_INIT_POINTER(nat_q931_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&q931_nat);
+ nf_ct_helper_expectfn_unregister(&callforwarding_nat);
synchronize_rcu();
}
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
deleted file mode 100644
index 31427fb57aa..00000000000
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/* ip_nat_helper.c - generic support functions for NAT helpers
- *
- * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
- * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/gfp.h>
-#include <linux/kmod.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/skbuff.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <net/checksum.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-
-#define DUMP_OFFSET(x) \
- pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
- x->offset_before, x->offset_after, x->correction_pos);
-
-static DEFINE_SPINLOCK(nf_nat_seqofs_lock);
-
-/* Setup TCP sequence correction given this change at this sequence */
-static inline void
-adjust_tcp_sequence(u32 seq,
- int sizediff,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- struct nf_conn_nat *nat = nfct_nat(ct);
- struct nf_nat_seq *this_way = &nat->seq[dir];
-
- pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
- seq, sizediff);
-
- pr_debug("adjust_tcp_sequence: Seq_offset before: ");
- DUMP_OFFSET(this_way);
-
- spin_lock_bh(&nf_nat_seqofs_lock);
-
- /* SYN adjust. If it's uninitialized, or this is after last
- * correction, record it: we don't handle more than one
- * adjustment in the window, but do deal with common case of a
- * retransmit */
- if (this_way->offset_before == this_way->offset_after ||
- before(this_way->correction_pos, seq)) {
- this_way->correction_pos = seq;
- this_way->offset_before = this_way->offset_after;
- this_way->offset_after += sizediff;
- }
- spin_unlock_bh(&nf_nat_seqofs_lock);
-
- pr_debug("adjust_tcp_sequence: Seq_offset after: ");
- DUMP_OFFSET(this_way);
-}
-
-/* Get the offset value, for conntrack */
-s16 nf_nat_get_offset(const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- u32 seq)
-{
- struct nf_conn_nat *nat = nfct_nat(ct);
- struct nf_nat_seq *this_way;
- s16 offset;
-
- if (!nat)
- return 0;
-
- this_way = &nat->seq[dir];
- spin_lock_bh(&nf_nat_seqofs_lock);
- offset = after(seq, this_way->correction_pos)
- ? this_way->offset_after : this_way->offset_before;
- spin_unlock_bh(&nf_nat_seqofs_lock);
-
- return offset;
-}
-EXPORT_SYMBOL_GPL(nf_nat_get_offset);
-
-/* Frobs data inside this packet, which is linear. */
-static void mangle_contents(struct sk_buff *skb,
- unsigned int dataoff,
- unsigned int match_offset,
- unsigned int match_len,
- const char *rep_buffer,
- unsigned int rep_len)
-{
- unsigned char *data;
-
- BUG_ON(skb_is_nonlinear(skb));
- data = skb_network_header(skb) + dataoff;
-
- /* move post-replacement */
- memmove(data + match_offset + rep_len,
- data + match_offset + match_len,
- skb->tail - (skb->network_header + dataoff +
- match_offset + match_len));
-
- /* insert data from buffer */
- memcpy(data + match_offset, rep_buffer, rep_len);
-
- /* update skb info */
- if (rep_len > match_len) {
- pr_debug("nf_nat_mangle_packet: Extending packet by "
- "%u from %u bytes\n", rep_len - match_len, skb->len);
- skb_put(skb, rep_len - match_len);
- } else {
- pr_debug("nf_nat_mangle_packet: Shrinking packet from "
- "%u from %u bytes\n", match_len - rep_len, skb->len);
- __skb_trim(skb, skb->len + rep_len - match_len);
- }
-
- /* fix IP hdr checksum information */
- ip_hdr(skb)->tot_len = htons(skb->len);
- ip_send_check(ip_hdr(skb));
-}
-
-/* Unusual, but possible case. */
-static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
-{
- if (skb->len + extra > 65535)
- return 0;
-
- if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
- return 0;
-
- return 1;
-}
-
-void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- __be32 seq, s16 off)
-{
- if (!off)
- return;
- set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
- adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
- nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
-}
-EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
-
-static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
- int datalen, __sum16 *check, int oldlen)
-{
- struct rtable *rt = skb_rtable(skb);
-
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- skb->dev->features & NETIF_F_V4_CSUM) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- iph->ihl * 4;
- skb->csum_offset = (void *)check - data;
- *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, iph->protocol, 0);
- } else {
- *check = 0;
- *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, iph->protocol,
- csum_partial(data, datalen,
- 0));
- if (iph->protocol == IPPROTO_UDP && !*check)
- *check = CSUM_MANGLED_0;
- }
- } else
- inet_proto_csum_replace2(check, skb,
- htons(oldlen), htons(datalen), 1);
-}
-
-/* Generic function for mangling variable-length address changes inside
- * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
- * command in FTP).
- *
- * Takes care about all the nasty sequence number changes, checksumming,
- * skb enlargement, ...
- *
- * */
-int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int match_offset,
- unsigned int match_len,
- const char *rep_buffer,
- unsigned int rep_len, bool adjust)
-{
- struct iphdr *iph;
- struct tcphdr *tcph;
- int oldlen, datalen;
-
- if (!skb_make_writable(skb, skb->len))
- return 0;
-
- if (rep_len > match_len &&
- rep_len - match_len > skb_tailroom(skb) &&
- !enlarge_skb(skb, rep_len - match_len))
- return 0;
-
- SKB_LINEAR_ASSERT(skb);
-
- iph = ip_hdr(skb);
- tcph = (void *)iph + iph->ihl*4;
-
- oldlen = skb->len - iph->ihl*4;
- mangle_contents(skb, iph->ihl*4 + tcph->doff*4,
- match_offset, match_len, rep_buffer, rep_len);
-
- datalen = skb->len - iph->ihl*4;
- nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
-
- if (adjust && rep_len != match_len)
- nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
- (int)rep_len - (int)match_len);
-
- return 1;
-}
-EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
-
-/* Generic function for mangling variable-length address changes inside
- * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
- * command in the Amanda protocol)
- *
- * Takes care about all the nasty sequence number changes, checksumming,
- * skb enlargement, ...
- *
- * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
- * should be fairly easy to do.
- */
-int
-nf_nat_mangle_udp_packet(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int match_offset,
- unsigned int match_len,
- const char *rep_buffer,
- unsigned int rep_len)
-{
- struct iphdr *iph;
- struct udphdr *udph;
- int datalen, oldlen;
-
- /* UDP helpers might accidentally mangle the wrong packet */
- iph = ip_hdr(skb);
- if (skb->len < iph->ihl*4 + sizeof(*udph) +
- match_offset + match_len)
- return 0;
-
- if (!skb_make_writable(skb, skb->len))
- return 0;
-
- if (rep_len > match_len &&
- rep_len - match_len > skb_tailroom(skb) &&
- !enlarge_skb(skb, rep_len - match_len))
- return 0;
-
- iph = ip_hdr(skb);
- udph = (void *)iph + iph->ihl*4;
-
- oldlen = skb->len - iph->ihl*4;
- mangle_contents(skb, iph->ihl*4 + sizeof(*udph),
- match_offset, match_len, rep_buffer, rep_len);
-
- /* update the length of the UDP packet */
- datalen = skb->len - iph->ihl*4;
- udph->len = htons(datalen);
-
- /* fix udp checksum if udp checksum was previously calculated */
- if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
- return 1;
-
- nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
-
- return 1;
-}
-EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
-
-/* Adjust one found SACK option including checksum correction */
-static void
-sack_adjust(struct sk_buff *skb,
- struct tcphdr *tcph,
- unsigned int sackoff,
- unsigned int sackend,
- struct nf_nat_seq *natseq)
-{
- while (sackoff < sackend) {
- struct tcp_sack_block_wire *sack;
- __be32 new_start_seq, new_end_seq;
-
- sack = (void *)skb->data + sackoff;
- if (after(ntohl(sack->start_seq) - natseq->offset_before,
- natseq->correction_pos))
- new_start_seq = htonl(ntohl(sack->start_seq)
- - natseq->offset_after);
- else
- new_start_seq = htonl(ntohl(sack->start_seq)
- - natseq->offset_before);
-
- if (after(ntohl(sack->end_seq) - natseq->offset_before,
- natseq->correction_pos))
- new_end_seq = htonl(ntohl(sack->end_seq)
- - natseq->offset_after);
- else
- new_end_seq = htonl(ntohl(sack->end_seq)
- - natseq->offset_before);
-
- pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
- ntohl(sack->start_seq), new_start_seq,
- ntohl(sack->end_seq), new_end_seq);
-
- inet_proto_csum_replace4(&tcph->check, skb,
- sack->start_seq, new_start_seq, 0);
- inet_proto_csum_replace4(&tcph->check, skb,
- sack->end_seq, new_end_seq, 0);
- sack->start_seq = new_start_seq;
- sack->end_seq = new_end_seq;
- sackoff += sizeof(*sack);
- }
-}
-
-/* TCP SACK sequence number adjustment */
-static inline unsigned int
-nf_nat_sack_adjust(struct sk_buff *skb,
- struct tcphdr *tcph,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- unsigned int dir, optoff, optend;
- struct nf_conn_nat *nat = nfct_nat(ct);
-
- optoff = ip_hdrlen(skb) + sizeof(struct tcphdr);
- optend = ip_hdrlen(skb) + tcph->doff * 4;
-
- if (!skb_make_writable(skb, optend))
- return 0;
-
- dir = CTINFO2DIR(ctinfo);
-
- while (optoff < optend) {
- /* Usually: option, length. */
- unsigned char *op = skb->data + optoff;
-
- switch (op[0]) {
- case TCPOPT_EOL:
- return 1;
- case TCPOPT_NOP:
- optoff++;
- continue;
- default:
- /* no partial options */
- if (optoff + 1 == optend ||
- optoff + op[1] > optend ||
- op[1] < 2)
- return 0;
- if (op[0] == TCPOPT_SACK &&
- op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
- ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
- sack_adjust(skb, tcph, optoff+2,
- optoff+op[1], &nat->seq[!dir]);
- optoff += op[1];
- }
- }
- return 1;
-}
-
-/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
-int
-nf_nat_seq_adjust(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct tcphdr *tcph;
- int dir;
- __be32 newseq, newack;
- s16 seqoff, ackoff;
- struct nf_conn_nat *nat = nfct_nat(ct);
- struct nf_nat_seq *this_way, *other_way;
-
- dir = CTINFO2DIR(ctinfo);
-
- this_way = &nat->seq[dir];
- other_way = &nat->seq[!dir];
-
- if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
- return 0;
-
- tcph = (void *)skb->data + ip_hdrlen(skb);
- if (after(ntohl(tcph->seq), this_way->correction_pos))
- seqoff = this_way->offset_after;
- else
- seqoff = this_way->offset_before;
-
- if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
- other_way->correction_pos))
- ackoff = other_way->offset_after;
- else
- ackoff = other_way->offset_before;
-
- newseq = htonl(ntohl(tcph->seq) + seqoff);
- newack = htonl(ntohl(tcph->ack_seq) - ackoff);
-
- inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
- inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
-
- pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
- ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
- ntohl(newack));
-
- tcph->seq = newseq;
- tcph->ack_seq = newack;
-
- return nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
-}
-
-/* Setup NAT on this expected conntrack so it follows master. */
-/* If we fail to get a free NAT slot, we'll get dropped on confirm */
-void nf_nat_follow_master(struct nf_conn *ct,
- struct nf_conntrack_expect *exp)
-{
- struct nf_nat_range range;
-
- /* This must be a fresh one. */
- BUG_ON(ct->status & IPS_NAT_DONE_MASK);
-
- /* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
- nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
-
- /* For DST manip, map port here to where it's expected. */
- range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
- range.min = range.max = exp->saved_proto;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
- nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
-}
-EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
deleted file mode 100644
index 535e1a80235..00000000000
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/* IRC extension for TCP NAT alteration.
- *
- * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
- * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
- * based on a copy of RR's ip_nat_ftp.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/tcp.h>
-#include <linux/kernel.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_irc.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("IRC (DCC) NAT helper");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_irc");
-
-static unsigned int help(struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- unsigned int matchoff,
- unsigned int matchlen,
- struct nf_conntrack_expect *exp)
-{
- char buffer[sizeof("4294967296 65635")];
- u_int32_t ip;
- u_int16_t port;
- unsigned int ret;
-
- /* Reply comes from server. */
- exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
- exp->dir = IP_CT_DIR_REPLY;
- exp->expectfn = nf_nat_follow_master;
-
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
- if (port == 0)
- return NF_DROP;
-
- ip = ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip);
- sprintf(buffer, "%u %u", ip, port);
- pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
- buffer, &ip, port);
-
- ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
- matchoff, matchlen, buffer,
- strlen(buffer));
- if (ret != NF_ACCEPT)
- nf_ct_unexpect_related(exp);
- return ret;
-}
-
-static void __exit nf_nat_irc_fini(void)
-{
- rcu_assign_pointer(nf_nat_irc_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init nf_nat_irc_init(void)
-{
- BUG_ON(nf_nat_irc_hook != NULL);
- rcu_assign_pointer(nf_nat_irc_hook, help);
- return 0;
-}
-
-/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
-static int warn_set(const char *val, struct kernel_param *kp)
-{
- printk(KERN_INFO KBUILD_MODNAME
- ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
- return 0;
-}
-module_param_call(ports, warn_set, NULL, NULL, 0);
-
-module_init(nf_nat_irc_init);
-module_exit(nf_nat_irc_fini);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
new file mode 100644
index 00000000000..d8b2e14efdd
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -0,0 +1,281 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/secure_seq.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
+
+#ifdef CONFIG_XFRM
+static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ unsigned long statusbit,
+ struct flowi *fl)
+{
+ const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+ struct flowi4 *fl4 = &fl->u.ip4;
+
+ if (ct->status & statusbit) {
+ fl4->daddr = t->dst.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_dport = t->dst.u.all;
+ }
+
+ statusbit ^= IPS_NAT_MASK;
+
+ if (ct->status & statusbit) {
+ fl4->saddr = t->src.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_sport = t->src.u.all;
+ }
+}
+#endif /* CONFIG_XFRM */
+
+static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
+ const struct nf_nat_range *range)
+{
+ return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
+ ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
+}
+
+static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
+ __be16 dport)
+{
+ return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
+}
+
+static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
+ unsigned int iphdroff,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *target,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph;
+ unsigned int hdroff;
+
+ if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
+ return false;
+
+ iph = (void *)skb->data + iphdroff;
+ hdroff = iphdroff + iph->ihl * 4;
+
+ if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
+ target, maniptype))
+ return false;
+ iph = (void *)skb->data + iphdroff;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
+ iph->saddr = target->src.u3.ip;
+ } else {
+ csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
+ iph->daddr = target->dst.u3.ip;
+ }
+ return true;
+}
+
+static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+ __be32 oldip, newip;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ oldip = iph->saddr;
+ newip = t->src.u3.ip;
+ } else {
+ oldip = iph->daddr;
+ newip = t->dst.u3.ip;
+ }
+ inet_proto_csum_replace4(check, skb, oldip, newip, 1);
+}
+
+static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
+ u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt_flags & RTCF_LOCAL) &&
+ (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ ip_hdrlen(skb);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, proto, 0);
+ } else {
+ *check = 0;
+ *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, proto,
+ csum_partial(data, datalen,
+ 0));
+ if (proto == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
+static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range)
+{
+ if (tb[CTA_NAT_V4_MINIP]) {
+ range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (tb[CTA_NAT_V4_MAXIP])
+ range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
+ else
+ range->max_addr.ip = range->min_addr.ip;
+
+ return 0;
+}
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
+ .l3proto = NFPROTO_IPV4,
+ .in_range = nf_nat_ipv4_in_range,
+ .secure_port = nf_nat_ipv4_secure_port,
+ .manip_pkt = nf_nat_ipv4_manip_pkt,
+ .csum_update = nf_nat_ipv4_csum_update,
+ .csum_recalc = nf_nat_ipv4_csum_recalc,
+ .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
+#ifdef CONFIG_XFRM
+ .decode_session = nf_nat_ipv4_decode_session,
+#endif
+};
+
+int nf_nat_icmp_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum)
+{
+ struct {
+ struct icmphdr icmp;
+ struct iphdr ip;
+ } *inside;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+ unsigned int hdrlen = ip_hdrlen(skb);
+ const struct nf_nat_l4proto *l4proto;
+ struct nf_conntrack_tuple target;
+ unsigned long statusbit;
+
+ NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+
+ if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ return 0;
+ if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
+ return 0;
+
+ inside = (void *)skb->data + hdrlen;
+ if (inside->icmp.type == ICMP_REDIRECT) {
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ if (manip == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply direction */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
+ if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
+ l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ return 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ /* Reloading "inside" here since manip_pkt may reallocate */
+ inside = (void *)skb->data + hdrlen;
+ inside->icmp.checksum = 0;
+ inside->icmp.checksum =
+ csum_fold(skb_checksum(skb, hdrlen,
+ skb->len - hdrlen, 0));
+ }
+
+ /* Change outer to look like the reply to an incoming packet */
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
+ if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
+
+static int __init nf_nat_l3proto_ipv4_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
+ if (err < 0)
+ goto err2;
+ return err;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_l3proto_ipv4_exit(void)
+{
+ nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
+
+module_init(nf_nat_l3proto_ipv4_init);
+module_exit(nf_nat_l3proto_ipv4_exit);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 4c060038d29..657d2307f03 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -13,6 +13,8 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
* TODO: - NAT to a unique tuple, not to TCP source port
* (needs netfilter tuple reservation)
*/
@@ -22,7 +24,6 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_zones.h>
@@ -49,7 +50,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
const struct nf_nat_pptp *nat_pptp_info;
struct nf_nat_range range;
- ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
+ ct_pptp_info = nfct_help_data(master);
nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
/* And here goes the grand finale of corrosion... */
@@ -88,24 +89,24 @@ static void pptp_nat_expected(struct nf_conn *ct,
BUG_ON(ct->status & IPS_NAT_DONE_MASK);
/* Change src to where master sends to */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
if (exp->dir == IP_CT_DIR_ORIGINAL) {
- range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
- range.min = range.max = exp->saved_proto;
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
}
- nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
/* For DST manip, map port here to where it's expected. */
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.src.u3;
if (exp->dir == IP_CT_DIR_REPLY) {
- range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
- range.min = range.max = exp->saved_proto;
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
}
- nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
}
/* outbound packets == from PNS to PAC */
@@ -113,6 +114,7 @@ static int
pptp_outbound_pkt(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
struct PptpControlHeader *ctlh,
union pptp_ctrl_union *pptpReq)
@@ -123,7 +125,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
__be16 new_callid;
unsigned int cid_off;
- ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
+ ct_pptp_info = nfct_help_data(ct);
nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
new_callid = ct_pptp_info->pns_call_id;
@@ -175,7 +177,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
/* mangle packet */
- if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
cid_off + sizeof(struct pptp_pkt_hdr) +
sizeof(struct PptpControlHeader),
sizeof(new_callid), (char *)&new_callid,
@@ -192,7 +194,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
struct nf_ct_pptp_master *ct_pptp_info;
struct nf_nat_pptp *nat_pptp_info;
- ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
+ ct_pptp_info = nfct_help_data(ct);
nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
/* save original PAC call ID in nat_info */
@@ -216,6 +218,7 @@ static int
pptp_inbound_pkt(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
struct PptpControlHeader *ctlh,
union pptp_ctrl_union *pptpReq)
{
@@ -268,7 +271,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
- if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
pcid_off + sizeof(struct pptp_pkt_hdr) +
sizeof(struct PptpControlHeader),
sizeof(new_pcid), (char *)&new_pcid,
@@ -282,25 +285,25 @@ static int __init nf_nat_helper_pptp_init(void)
nf_nat_need_gre();
BUG_ON(nf_nat_pptp_hook_outbound != NULL);
- rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
BUG_ON(nf_nat_pptp_hook_inbound != NULL);
- rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
- rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
- rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
return 0;
}
static void __exit nf_nat_helper_pptp_fini(void)
{
- rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL);
- rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL);
- rcu_assign_pointer(nf_nat_pptp_hook_inbound, NULL);
- rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
synchronize_rcu();
}
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
deleted file mode 100644
index 3e61faf23a9..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/random.h>
-#include <linux/ip.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- __be16 port;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- port = tuple->src.u.all;
- else
- port = tuple->dst.u.all;
-
- return ntohs(port) >= ntohs(min->all) &&
- ntohs(port) <= ntohs(max->all);
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
-
-void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct,
- u_int16_t *rover)
-{
- unsigned int range_size, min, i;
- __be16 *portptr;
- u_int16_t off;
-
- if (maniptype == IP_NAT_MANIP_SRC)
- portptr = &tuple->src.u.all;
- else
- portptr = &tuple->dst.u.all;
-
- /* If no range specified... */
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
- /* If it's dst rewrite, can't change port */
- if (maniptype == IP_NAT_MANIP_DST)
- return;
-
- if (ntohs(*portptr) < 1024) {
- /* Loose convention: >> 512 is credential passing */
- if (ntohs(*portptr) < 512) {
- min = 1;
- range_size = 511 - min + 1;
- } else {
- min = 600;
- range_size = 1023 - min + 1;
- }
- } else {
- min = 1024;
- range_size = 65535 - 1024 + 1;
- }
- } else {
- min = ntohs(range->min.all);
- range_size = ntohs(range->max.all) - min + 1;
- }
-
- if (range->flags & IP_NAT_RANGE_PROTO_RANDOM)
- off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
- maniptype == IP_NAT_MANIP_SRC
- ? tuple->dst.u.all
- : tuple->src.u.all);
- else
- off = *rover;
-
- for (i = 0; ; ++off) {
- *portptr = htons(min + off % range_size);
- if (++i != range_size && nf_nat_used_tuple(tuple, ct))
- continue;
- if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
- *rover = off;
- return;
- }
- return;
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
-
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-int nf_nat_proto_range_to_nlattr(struct sk_buff *skb,
- const struct nf_nat_range *range)
-{
- NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.all);
- NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.all);
- return 0;
-
-nla_put_failure:
- return -1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
-
-int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range *range)
-{
- if (tb[CTA_PROTONAT_PORT_MIN]) {
- range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
- range->max.all = range->min.tcp.port;
- range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
- }
- if (tb[CTA_PROTONAT_PORT_MAX]) {
- range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
- range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_range_to_nlattr);
-#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
deleted file mode 100644
index 570faf2667b..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * DCCP NAT protocol helper
- *
- * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/dccp.h>
-
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t dccp_port_rover;
-
-static void
-dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &dccp_port_rover);
-}
-
-static bool
-dccp_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct iphdr *iph = (const void *)(skb->data + iphdroff);
- struct dccp_hdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl * 4;
- __be32 oldip, newip;
- __be16 *portptr, oldport, newport;
- int hdrsize = 8; /* DCCP connection tracking guarantees this much */
-
- if (skb->len >= hdroff + sizeof(struct dccp_hdr))
- hdrsize = sizeof(struct dccp_hdr);
-
- if (!skb_make_writable(skb, hdroff + hdrsize))
- return false;
-
- iph = (struct iphdr *)(skb->data + iphdroff);
- hdr = (struct dccp_hdr *)(skb->data + hdroff);
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- oldip = iph->saddr;
- newip = tuple->src.u3.ip;
- newport = tuple->src.u.dccp.port;
- portptr = &hdr->dccph_sport;
- } else {
- oldip = iph->daddr;
- newip = tuple->dst.u3.ip;
- newport = tuple->dst.u.dccp.port;
- portptr = &hdr->dccph_dport;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1);
- inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
- 0);
- return true;
-}
-
-static const struct nf_nat_protocol nf_nat_protocol_dccp = {
- .protonum = IPPROTO_DCCP,
- .me = THIS_MODULE,
- .manip_pkt = dccp_manip_pkt,
- .in_range = nf_nat_proto_in_range,
- .unique_tuple = dccp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
- .nlattr_to_range = nf_nat_proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_dccp_init(void)
-{
- return nf_nat_protocol_register(&nf_nat_protocol_dccp);
-}
-
-static void __exit nf_nat_proto_dccp_fini(void)
-{
- nf_nat_protocol_unregister(&nf_nat_protocol_dccp);
-}
-
-module_init(nf_nat_proto_dccp_init);
-module_exit(nf_nat_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP NAT protocol helper");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index bc8d83a31c7..690d890111b 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -21,6 +21,8 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
*/
#include <linux/module.h>
@@ -28,8 +30,7 @@
#include <linux/ip.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_l4proto.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
MODULE_LICENSE("GPL");
@@ -38,7 +39,8 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
/* generate unique tuple ... */
static void
-gre_unique_tuple(struct nf_conntrack_tuple *tuple,
+gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
@@ -52,18 +54,18 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
if (!ct->master)
return;
- if (maniptype == IP_NAT_MANIP_SRC)
+ if (maniptype == NF_NAT_MANIP_SRC)
keyptr = &tuple->src.u.gre.key;
else
keyptr = &tuple->dst.u.gre.key;
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
pr_debug("%p: NATing GRE PPTP\n", ct);
min = 1;
range_size = 0xffff;
} else {
- min = ntohs(range->min.gre.key);
- range_size = ntohs(range->max.gre.key) - min + 1;
+ min = ntohs(range->min_proto.gre.key);
+ range_size = ntohs(range->max_proto.gre.key) - min + 1;
}
pr_debug("min = %u, range_size = %u\n", min, range_size);
@@ -80,14 +82,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
/* manipulate a GRE packet according to maniptype */
static bool
-gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
+gre_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
const struct gre_hdr *greh;
struct gre_hdr_pptp *pgreh;
- const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
- unsigned int hdroff = iphdroff + iph->ihl * 4;
/* pgreh includes two optional 32bit fields which are not required
* to be there. That's where the magic '8' comes from */
@@ -99,7 +101,7 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
/* we only have destination manip of a packet, since 'source key'
* is not present in the packet itself */
- if (maniptype != IP_NAT_MANIP_DST)
+ if (maniptype != NF_NAT_MANIP_DST)
return true;
switch (greh->version) {
case GRE_VERSION_1701:
@@ -117,26 +119,24 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
return true;
}
-static const struct nf_nat_protocol gre = {
- .protonum = IPPROTO_GRE,
- .me = THIS_MODULE,
+static const struct nf_nat_l4proto gre = {
+ .l4proto = IPPROTO_GRE,
.manip_pkt = gre_manip_pkt,
- .in_range = nf_nat_proto_in_range,
+ .in_range = nf_nat_l4proto_in_range,
.unique_tuple = gre_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
- .nlattr_to_range = nf_nat_proto_nlattr_to_range,
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
static int __init nf_nat_proto_gre_init(void)
{
- return nf_nat_protocol_register(&gre);
+ return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
}
static void __exit nf_nat_proto_gre_fini(void)
{
- nf_nat_protocol_unregister(&gre);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
}
module_init(nf_nat_proto_gre_init);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 5744c3ec847..eb303471bcf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -8,14 +8,14 @@
#include <linux/types.h>
#include <linux/init.h>
+#include <linux/export.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_l4proto.h>
static bool
icmp_in_range(const struct nf_conntrack_tuple *tuple,
@@ -28,7 +28,8 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
}
static void
-icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
+icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
@@ -37,13 +38,14 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
unsigned int range_size;
unsigned int i;
- range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
/* If no range specified... */
- if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
range_size = 0xFFFF;
for (i = 0; ; ++id) {
- tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
+ tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
(id % range_size));
if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
return;
@@ -53,13 +55,12 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
static bool
icmp_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
- const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
struct icmphdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
return false;
@@ -71,14 +72,12 @@ icmp_manip_pkt(struct sk_buff *skb,
return true;
}
-const struct nf_nat_protocol nf_nat_protocol_icmp = {
- .protonum = IPPROTO_ICMP,
- .me = THIS_MODULE,
+const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
+ .l4proto = IPPROTO_ICMP,
.manip_pkt = icmp_manip_pkt,
.in_range = icmp_in_range,
.unique_tuple = icmp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
- .nlattr_to_range = nf_nat_proto_nlattr_to_range,
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
deleted file mode 100644
index 756331d4266..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/sctp.h>
-#include <net/sctp/checksum.h>
-
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t nf_sctp_port_rover;
-
-static void
-sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &nf_sctp_port_rover);
-}
-
-static bool
-sctp_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
- struct sk_buff *frag;
- sctp_sctphdr_t *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
- __be32 oldip, newip;
- __be32 crc32;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- iph = (struct iphdr *)(skb->data + iphdroff);
- hdr = (struct sctphdr *)(skb->data + hdroff);
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- /* Get rid of src ip and src pt */
- oldip = iph->saddr;
- newip = tuple->src.u3.ip;
- hdr->source = tuple->src.u.sctp.port;
- } else {
- /* Get rid of dst ip and dst pt */
- oldip = iph->daddr;
- newip = tuple->dst.u3.ip;
- hdr->dest = tuple->dst.u.sctp.port;
- }
-
- crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
- skb_walk_frags(skb, frag)
- crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
- crc32);
- crc32 = sctp_end_cksum(crc32);
- hdr->checksum = crc32;
-
- return true;
-}
-
-static const struct nf_nat_protocol nf_nat_protocol_sctp = {
- .protonum = IPPROTO_SCTP,
- .me = THIS_MODULE,
- .manip_pkt = sctp_manip_pkt,
- .in_range = nf_nat_proto_in_range,
- .unique_tuple = sctp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
- .nlattr_to_range = nf_nat_proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_sctp_init(void)
-{
- return nf_nat_protocol_register(&nf_nat_protocol_sctp);
-}
-
-static void __exit nf_nat_proto_sctp_exit(void)
-{
- nf_nat_protocol_unregister(&nf_nat_protocol_sctp);
-}
-
-module_init(nf_nat_proto_sctp_init);
-module_exit(nf_nat_proto_sctp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SCTP NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
deleted file mode 100644
index aa460a595d5..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-
-static u_int16_t tcp_port_rover;
-
-static void
-tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
-}
-
-static bool
-tcp_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
- struct tcphdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
- __be32 oldip, newip;
- __be16 *portptr, newport, oldport;
- int hdrsize = 8; /* TCP connection tracking guarantees this much */
-
- /* this could be a inner header returned in icmp packet; in such
- cases we cannot update the checksum field since it is outside of
- the 8 bytes of transport layer headers we are guaranteed */
- if (skb->len >= hdroff + sizeof(struct tcphdr))
- hdrsize = sizeof(struct tcphdr);
-
- if (!skb_make_writable(skb, hdroff + hdrsize))
- return false;
-
- iph = (struct iphdr *)(skb->data + iphdroff);
- hdr = (struct tcphdr *)(skb->data + hdroff);
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- /* Get rid of src ip and src pt */
- oldip = iph->saddr;
- newip = tuple->src.u3.ip;
- newport = tuple->src.u.tcp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst ip and dst pt */
- oldip = iph->daddr;
- newip = tuple->dst.u3.ip;
- newport = tuple->dst.u.tcp.port;
- portptr = &hdr->dest;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
- inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
- return true;
-}
-
-const struct nf_nat_protocol nf_nat_protocol_tcp = {
- .protonum = IPPROTO_TCP,
- .me = THIS_MODULE,
- .manip_pkt = tcp_manip_pkt,
- .in_range = nf_nat_proto_in_range,
- .unique_tuple = tcp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
- .nlattr_to_range = nf_nat_proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
deleted file mode 100644
index dfe65c7e292..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t udp_port_rover;
-
-static void
-udp_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
-}
-
-static bool
-udp_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
- struct udphdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
- __be32 oldip, newip;
- __be16 *portptr, newport;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- iph = (struct iphdr *)(skb->data + iphdroff);
- hdr = (struct udphdr *)(skb->data + hdroff);
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- /* Get rid of src ip and src pt */
- oldip = iph->saddr;
- newip = tuple->src.u3.ip;
- newport = tuple->src.u.udp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst ip and dst pt */
- oldip = iph->daddr;
- newip = tuple->dst.u3.ip;
- newport = tuple->dst.u.udp.port;
- portptr = &hdr->dest;
- }
- if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
- inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
- 0);
- if (!hdr->check)
- hdr->check = CSUM_MANGLED_0;
- }
- *portptr = newport;
- return true;
-}
-
-const struct nf_nat_protocol nf_nat_protocol_udp = {
- .protonum = IPPROTO_UDP,
- .me = THIS_MODULE,
- .manip_pkt = udp_manip_pkt,
- .in_range = nf_nat_proto_in_range,
- .unique_tuple = udp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
- .nlattr_to_range = nf_nat_proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
deleted file mode 100644
index 3cc8c8af39e..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static u_int16_t udplite_port_rover;
-
-static void
-udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &udplite_port_rover);
-}
-
-static bool
-udplite_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
- struct udphdr *hdr;
- unsigned int hdroff = iphdroff + iph->ihl*4;
- __be32 oldip, newip;
- __be16 *portptr, newport;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- iph = (struct iphdr *)(skb->data + iphdroff);
- hdr = (struct udphdr *)(skb->data + hdroff);
-
- if (maniptype == IP_NAT_MANIP_SRC) {
- /* Get rid of src ip and src pt */
- oldip = iph->saddr;
- newip = tuple->src.u3.ip;
- newport = tuple->src.u.udp.port;
- portptr = &hdr->source;
- } else {
- /* Get rid of dst ip and dst pt */
- oldip = iph->daddr;
- newip = tuple->dst.u3.ip;
- newport = tuple->dst.u.udp.port;
- portptr = &hdr->dest;
- }
-
- inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
- inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
- if (!hdr->check)
- hdr->check = CSUM_MANGLED_0;
-
- *portptr = newport;
- return true;
-}
-
-static const struct nf_nat_protocol nf_nat_protocol_udplite = {
- .protonum = IPPROTO_UDPLITE,
- .me = THIS_MODULE,
- .manip_pkt = udplite_manip_pkt,
- .in_range = nf_nat_proto_in_range,
- .unique_tuple = udplite_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
- .nlattr_to_range = nf_nat_proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_udplite_init(void)
-{
- return nf_nat_protocol_register(&nf_nat_protocol_udplite);
-}
-
-static void __exit nf_nat_proto_udplite_fini(void)
-{
- nf_nat_protocol_unregister(&nf_nat_protocol_udplite);
-}
-
-module_init(nf_nat_proto_udplite_init);
-module_exit(nf_nat_proto_udplite_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
deleted file mode 100644
index a50f2bc1c73..00000000000
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/* The "unknown" protocol. This is what is used for protocols we
- * don't understand. It's returned by ip_ct_find_proto().
- */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-
-static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type manip_type,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- return true;
-}
-
-static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- /* Sorry: we can't help you; if it's not unique, we can't frob
- anything. */
- return;
-}
-
-static bool
-unknown_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- return true;
-}
-
-const struct nf_nat_protocol nf_nat_unknown_protocol = {
- /* .me isn't set: getting a ref to this cannot fail. */
- .manip_pkt = unknown_manip_pkt,
- .in_range = unknown_in_range,
- .unique_tuple = unknown_unique_tuple,
-};
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
deleted file mode 100644
index 21c30426480..00000000000
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/* Everything about the rules for NAT. */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/kmod.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/bitops.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_rule.h>
-
-#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
- (1 << NF_INET_POST_ROUTING) | \
- (1 << NF_INET_LOCAL_OUT) | \
- (1 << NF_INET_LOCAL_IN))
-
-static const struct xt_table nat_table = {
- .name = "nat",
- .valid_hooks = NAT_VALID_HOOKS,
- .me = THIS_MODULE,
- .af = NFPROTO_IPV4,
-};
-
-/* Source NAT */
-static unsigned int
-ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
-
- NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
- par->hooknum == NF_INET_LOCAL_IN);
-
- ct = nf_ct_get(skb, &ctinfo);
-
- /* Connection must be valid and new. */
- NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
- NF_CT_ASSERT(par->out != NULL);
-
- return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
-}
-
-static unsigned int
-ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
-
- NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT);
-
- ct = nf_ct_get(skb, &ctinfo);
-
- /* Connection must be valid and new. */
- NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
-
- return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
-}
-
-static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
-{
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
-
- /* Must be a valid range */
- if (mr->rangesize != 1) {
- pr_info("SNAT: multiple ranges no longer supported\n");
- return -EINVAL;
- }
- return 0;
-}
-
-static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
-{
- const struct nf_nat_multi_range_compat *mr = par->targinfo;
-
- /* Must be a valid range */
- if (mr->rangesize != 1) {
- pr_info("DNAT: multiple ranges no longer supported\n");
- return -EINVAL;
- }
- return 0;
-}
-
-static unsigned int
-alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
-{
- /* Force range to this IP; let proto decide mapping for
- per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
- */
- struct nf_nat_range range;
-
- range.flags = 0;
- pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
- HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
-
- return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
-}
-
-int nf_nat_rule_find(struct sk_buff *skb,
- unsigned int hooknum,
- const struct net_device *in,
- const struct net_device *out,
- struct nf_conn *ct)
-{
- struct net *net = nf_ct_net(ct);
- int ret;
-
- ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
-
- if (ret == NF_ACCEPT) {
- if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
- /* NUL mapping */
- ret = alloc_null_binding(ct, hooknum);
- }
- return ret;
-}
-
-static struct xt_target ipt_snat_reg __read_mostly = {
- .name = "SNAT",
- .target = ipt_snat_target,
- .targetsize = sizeof(struct nf_nat_multi_range_compat),
- .table = "nat",
- .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
- .checkentry = ipt_snat_checkentry,
- .family = AF_INET,
-};
-
-static struct xt_target ipt_dnat_reg __read_mostly = {
- .name = "DNAT",
- .target = ipt_dnat_target,
- .targetsize = sizeof(struct nf_nat_multi_range_compat),
- .table = "nat",
- .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
- .checkentry = ipt_dnat_checkentry,
- .family = AF_INET,
-};
-
-static int __net_init nf_nat_rule_net_init(struct net *net)
-{
- struct ipt_replace *repl;
-
- repl = ipt_alloc_initial_table(&nat_table);
- if (repl == NULL)
- return -ENOMEM;
- net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
- kfree(repl);
- if (IS_ERR(net->ipv4.nat_table))
- return PTR_ERR(net->ipv4.nat_table);
- return 0;
-}
-
-static void __net_exit nf_nat_rule_net_exit(struct net *net)
-{
- ipt_unregister_table(net, net->ipv4.nat_table);
-}
-
-static struct pernet_operations nf_nat_rule_net_ops = {
- .init = nf_nat_rule_net_init,
- .exit = nf_nat_rule_net_exit,
-};
-
-int __init nf_nat_rule_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&nf_nat_rule_net_ops);
- if (ret != 0)
- goto out;
- ret = xt_register_target(&ipt_snat_reg);
- if (ret != 0)
- goto unregister_table;
-
- ret = xt_register_target(&ipt_dnat_reg);
- if (ret != 0)
- goto unregister_snat;
-
- return ret;
-
- unregister_snat:
- xt_unregister_target(&ipt_snat_reg);
- unregister_table:
- unregister_pernet_subsys(&nf_nat_rule_net_ops);
- out:
- return ret;
-}
-
-void nf_nat_rule_cleanup(void)
-{
- xt_unregister_target(&ipt_dnat_reg);
- xt_unregister_target(&ipt_snat_reg);
- unregister_pernet_subsys(&nf_nat_rule_net_ops);
-}
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
deleted file mode 100644
index e40cf7816fd..00000000000
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ /dev/null
@@ -1,561 +0,0 @@
-/* SIP extension for NAT alteration.
- *
- * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
- * based on RR's ip_nat_ftp.c and other modules.
- * (C) 2007 United Security Providers
- * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/ip.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_sip.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
-MODULE_DESCRIPTION("SIP NAT helper");
-MODULE_ALIAS("ip_nat_sip");
-
-
-static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- unsigned int matchoff, unsigned int matchlen,
- const char *buffer, unsigned int buflen)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- struct tcphdr *th;
- unsigned int baseoff;
-
- if (nf_ct_protonum(ct) == IPPROTO_TCP) {
- th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
- baseoff = ip_hdrlen(skb) + th->doff * 4;
- matchoff += dataoff - baseoff;
-
- if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
- matchoff, matchlen,
- buffer, buflen, false))
- return 0;
- } else {
- baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
- matchoff += dataoff - baseoff;
-
- if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
- matchoff, matchlen,
- buffer, buflen))
- return 0;
- }
-
- /* Reload data pointer and adjust datalen value */
- *dptr = skb->data + dataoff;
- *datalen += buflen - matchlen;
- return 1;
-}
-
-static int map_addr(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- unsigned int matchoff, unsigned int matchlen,
- union nf_inet_addr *addr, __be16 port)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
- unsigned int buflen;
- __be32 newaddr;
- __be16 newport;
-
- if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip &&
- ct->tuplehash[dir].tuple.src.u.udp.port == port) {
- newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
- newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
- } else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip &&
- ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
- newaddr = ct->tuplehash[!dir].tuple.src.u3.ip;
- newport = ct->tuplehash[!dir].tuple.src.u.udp.port;
- } else
- return 1;
-
- if (newaddr == addr->ip && newport == port)
- return 1;
-
- buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
-
- return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
- buffer, buflen);
-}
-
-static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- enum sip_header_types type)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- unsigned int matchlen, matchoff;
- union nf_inet_addr addr;
- __be16 port;
-
- if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
- &matchoff, &matchlen, &addr, &port) <= 0)
- return 1;
- return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
- &addr, port);
-}
-
-static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- unsigned int coff, matchoff, matchlen;
- enum sip_header_types hdr;
- union nf_inet_addr addr;
- __be16 port;
- int request, in_header;
-
- /* Basic rules: requests and responses. */
- if (strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
- if (ct_sip_parse_request(ct, *dptr, *datalen,
- &matchoff, &matchlen,
- &addr, &port) > 0 &&
- !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
- &addr, port))
- return NF_DROP;
- request = 1;
- } else
- request = 0;
-
- if (nf_ct_protonum(ct) == IPPROTO_TCP)
- hdr = SIP_HDR_VIA_TCP;
- else
- hdr = SIP_HDR_VIA_UDP;
-
- /* Translate topmost Via header and parameters */
- if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
- hdr, NULL, &matchoff, &matchlen,
- &addr, &port) > 0) {
- unsigned int matchend, poff, plen, buflen, n;
- char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
-
- /* We're only interested in headers related to this
- * connection */
- if (request) {
- if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip ||
- port != ct->tuplehash[dir].tuple.src.u.udp.port)
- goto next;
- } else {
- if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip ||
- port != ct->tuplehash[dir].tuple.dst.u.udp.port)
- goto next;
- }
-
- if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
- &addr, port))
- return NF_DROP;
-
- matchend = matchoff + matchlen;
-
- /* The maddr= parameter (RFC 2361) specifies where to send
- * the reply. */
- if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
- "maddr=", &poff, &plen,
- &addr) > 0 &&
- addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
- addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
- buflen = sprintf(buffer, "%pI4",
- &ct->tuplehash[!dir].tuple.dst.u3.ip);
- if (!mangle_packet(skb, dataoff, dptr, datalen,
- poff, plen, buffer, buflen))
- return NF_DROP;
- }
-
- /* The received= parameter (RFC 2361) contains the address
- * from which the server received the request. */
- if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
- "received=", &poff, &plen,
- &addr) > 0 &&
- addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
- addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
- buflen = sprintf(buffer, "%pI4",
- &ct->tuplehash[!dir].tuple.src.u3.ip);
- if (!mangle_packet(skb, dataoff, dptr, datalen,
- poff, plen, buffer, buflen))
- return NF_DROP;
- }
-
- /* The rport= parameter (RFC 3581) contains the port number
- * from which the server received the request. */
- if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
- "rport=", &poff, &plen,
- &n) > 0 &&
- htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
- htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
- __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
- buflen = sprintf(buffer, "%u", ntohs(p));
- if (!mangle_packet(skb, dataoff, dptr, datalen,
- poff, plen, buffer, buflen))
- return NF_DROP;
- }
- }
-
-next:
- /* Translate Contact headers */
- coff = 0;
- in_header = 0;
- while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
- SIP_HDR_CONTACT, &in_header,
- &matchoff, &matchlen,
- &addr, &port) > 0) {
- if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
- &addr, port))
- return NF_DROP;
- }
-
- if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
- !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
- return NF_DROP;
-
- return NF_ACCEPT;
-}
-
-static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- const struct tcphdr *th;
-
- if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
- return;
-
- th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
- nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
-}
-
-/* Handles expected signalling connections and media streams */
-static void ip_nat_sip_expected(struct nf_conn *ct,
- struct nf_conntrack_expect *exp)
-{
- struct nf_nat_range range;
-
- /* This must be a fresh one. */
- BUG_ON(ct->status & IPS_NAT_DONE_MASK);
-
- /* For DST manip, map port here to where it's expected. */
- range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
- range.min = range.max = exp->saved_proto;
- range.min_ip = range.max_ip = exp->saved_ip;
- nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
-
- /* Change src to where master sends to, but only if the connection
- * actually came from the same source. */
- if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
- ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
- range.flags = IP_NAT_RANGE_MAP_IPS;
- range.min_ip = range.max_ip
- = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
- nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
- }
-}
-
-static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- struct nf_conntrack_expect *exp,
- unsigned int matchoff,
- unsigned int matchlen)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- __be32 newip;
- u_int16_t port;
- char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
- unsigned buflen;
-
- /* Connection will come from reply */
- if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
- newip = exp->tuple.dst.u3.ip;
- else
- newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
-
- /* If the signalling port matches the connection's source port in the
- * original direction, try to use the destination port in the opposite
- * direction. */
- if (exp->tuple.dst.u.udp.port ==
- ct->tuplehash[dir].tuple.src.u.udp.port)
- port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
- else
- port = ntohs(exp->tuple.dst.u.udp.port);
-
- exp->saved_ip = exp->tuple.dst.u3.ip;
- exp->tuple.dst.u3.ip = newip;
- exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
- exp->dir = !dir;
- exp->expectfn = ip_nat_sip_expected;
-
- for (; port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.udp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
- if (port == 0)
- return NF_DROP;
-
- if (exp->tuple.dst.u3.ip != exp->saved_ip ||
- exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
- buflen = sprintf(buffer, "%pI4:%u", &newip, port);
- if (!mangle_packet(skb, dataoff, dptr, datalen,
- matchoff, matchlen, buffer, buflen))
- goto err;
- }
- return NF_ACCEPT;
-
-err:
- nf_ct_unexpect_related(exp);
- return NF_DROP;
-}
-
-static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- unsigned int matchoff, matchlen;
- char buffer[sizeof("65536")];
- int buflen, c_len;
-
- /* Get actual SDP length */
- if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
- SDP_HDR_VERSION, SDP_HDR_UNSPEC,
- &matchoff, &matchlen) <= 0)
- return 0;
- c_len = *datalen - matchoff + strlen("v=");
-
- /* Now, update SDP length */
- if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
- &matchoff, &matchlen) <= 0)
- return 0;
-
- buflen = sprintf(buffer, "%u", c_len);
- return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
- buffer, buflen);
-}
-
-static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- unsigned int sdpoff,
- enum sdp_header_types type,
- enum sdp_header_types term,
- char *buffer, int buflen)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- unsigned int matchlen, matchoff;
-
- if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
- &matchoff, &matchlen) <= 0)
- return -ENOENT;
- return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
- buffer, buflen) ? 0 : -EINVAL;
-}
-
-static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- unsigned int sdpoff,
- enum sdp_header_types type,
- enum sdp_header_types term,
- const union nf_inet_addr *addr)
-{
- char buffer[sizeof("nnn.nnn.nnn.nnn")];
- unsigned int buflen;
-
- buflen = sprintf(buffer, "%pI4", &addr->ip);
- if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
- buffer, buflen))
- return 0;
-
- return mangle_content_len(skb, dataoff, dptr, datalen);
-}
-
-static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- unsigned int matchoff,
- unsigned int matchlen,
- u_int16_t port)
-{
- char buffer[sizeof("nnnnn")];
- unsigned int buflen;
-
- buflen = sprintf(buffer, "%u", port);
- if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
- buffer, buflen))
- return 0;
-
- return mangle_content_len(skb, dataoff, dptr, datalen);
-}
-
-static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- unsigned int sdpoff,
- const union nf_inet_addr *addr)
-{
- char buffer[sizeof("nnn.nnn.nnn.nnn")];
- unsigned int buflen;
-
- /* Mangle session description owner and contact addresses */
- buflen = sprintf(buffer, "%pI4", &addr->ip);
- if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
- SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
- buffer, buflen))
- return 0;
-
- switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
- SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
- buffer, buflen)) {
- case 0:
- /*
- * RFC 2327:
- *
- * Session description
- *
- * c=* (connection information - not required if included in all media)
- */
- case -ENOENT:
- break;
- default:
- return 0;
- }
-
- return mangle_content_len(skb, dataoff, dptr, datalen);
-}
-
-/* So, this packet has hit the connection tracking matching code.
- Mangle it, and change the expectation to match the new version. */
-static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
- const char **dptr, unsigned int *datalen,
- struct nf_conntrack_expect *rtp_exp,
- struct nf_conntrack_expect *rtcp_exp,
- unsigned int mediaoff,
- unsigned int medialen,
- union nf_inet_addr *rtp_addr)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- u_int16_t port;
-
- /* Connection will come from reply */
- if (ct->tuplehash[dir].tuple.src.u3.ip ==
- ct->tuplehash[!dir].tuple.dst.u3.ip)
- rtp_addr->ip = rtp_exp->tuple.dst.u3.ip;
- else
- rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
-
- rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip;
- rtp_exp->tuple.dst.u3.ip = rtp_addr->ip;
- rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
- rtp_exp->dir = !dir;
- rtp_exp->expectfn = ip_nat_sip_expected;
-
- rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip;
- rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip;
- rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
- rtcp_exp->dir = !dir;
- rtcp_exp->expectfn = ip_nat_sip_expected;
-
- /* Try to get same pair of ports: if not, try to change them. */
- for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
- port != 0; port += 2) {
- int ret;
-
- rtp_exp->tuple.dst.u.udp.port = htons(port);
- ret = nf_ct_expect_related(rtp_exp);
- if (ret == -EBUSY)
- continue;
- else if (ret < 0) {
- port = 0;
- break;
- }
- rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
- ret = nf_ct_expect_related(rtcp_exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- nf_ct_unexpect_related(rtp_exp);
- port = 0;
- break;
- }
- }
-
- if (port == 0)
- goto err1;
-
- /* Update media port. */
- if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
- !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
- mediaoff, medialen, port))
- goto err2;
-
- return NF_ACCEPT;
-
-err2:
- nf_ct_unexpect_related(rtp_exp);
- nf_ct_unexpect_related(rtcp_exp);
-err1:
- return NF_DROP;
-}
-
-static void __exit nf_nat_sip_fini(void)
-{
- rcu_assign_pointer(nf_nat_sip_hook, NULL);
- rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL);
- rcu_assign_pointer(nf_nat_sip_expect_hook, NULL);
- rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL);
- rcu_assign_pointer(nf_nat_sdp_port_hook, NULL);
- rcu_assign_pointer(nf_nat_sdp_session_hook, NULL);
- rcu_assign_pointer(nf_nat_sdp_media_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init nf_nat_sip_init(void)
-{
- BUG_ON(nf_nat_sip_hook != NULL);
- BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
- BUG_ON(nf_nat_sip_expect_hook != NULL);
- BUG_ON(nf_nat_sdp_addr_hook != NULL);
- BUG_ON(nf_nat_sdp_port_hook != NULL);
- BUG_ON(nf_nat_sdp_session_hook != NULL);
- BUG_ON(nf_nat_sdp_media_hook != NULL);
- rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip);
- rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
- rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect);
- rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
- rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port);
- rcu_assign_pointer(nf_nat_sdp_session_hook, ip_nat_sdp_session);
- rcu_assign_pointer(nf_nat_sdp_media_hook, ip_nat_sdp_media);
- return 0;
-}
-
-module_init(nf_nat_sip_init);
-module_exit(nf_nat_sip_fini);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a5..7c676671329 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -34,10 +34,11 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
* Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
@@ -54,6 +55,7 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -399,15 +401,12 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
*len = 0;
*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
- if (*octets == NULL) {
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ if (*octets == NULL)
return 0;
- }
ptr = *octets;
while (ctx->pointer < eoc) {
- if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
+ if (!asn1_octet_decode(ctx, ptr++)) {
kfree(*octets);
*octets = NULL;
return 0;
@@ -450,11 +449,8 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
return 0;
*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
- if (*oid == NULL) {
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
+ if (*oid == NULL)
return 0;
- }
optr = *oid;
@@ -465,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
}
if (subid < 40) {
- optr [0] = 0;
- optr [1] = subid;
+ optr[0] = 0;
+ optr[1] = subid;
} else if (subid < 80) {
- optr [0] = 1;
- optr [1] = subid - 40;
+ optr[0] = 1;
+ optr[1] = subid - 40;
} else {
- optr [0] = 2;
- optr [1] = subid - 80;
+ optr[0] = 2;
+ optr[1] = subid - 80;
}
*len = 2;
@@ -718,117 +714,103 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
l = 0;
switch (type) {
- case SNMP_INTEGER:
- len = sizeof(long);
- if (!asn1_long_decode(ctx, end, &l)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len,
- GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- (*obj)->syntax.l[0] = l;
- break;
- case SNMP_OCTETSTR:
- case SNMP_OPAQUE:
- if (!asn1_octets_decode(ctx, end, &p, &len)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len,
- GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(p);
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.c, p, len);
+ case SNMP_INTEGER:
+ len = sizeof(long);
+ if (!asn1_long_decode(ctx, end, &l)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ (*obj)->syntax.l[0] = l;
+ break;
+ case SNMP_OCTETSTR:
+ case SNMP_OPAQUE:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
kfree(p);
- break;
- case SNMP_NULL:
- case SNMP_NOSUCHOBJECT:
- case SNMP_NOSUCHINSTANCE:
- case SNMP_ENDOFMIBVIEW:
- len = 0;
- *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- if (!asn1_null_decode(ctx, end)) {
- kfree(id);
- kfree(*obj);
- *obj = NULL;
- return 0;
- }
- break;
- case SNMP_OBJECTID:
- if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
- kfree(id);
- return 0;
- }
- len *= sizeof(unsigned long);
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(lp);
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.ul, lp, len);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.c, p, len);
+ kfree(p);
+ break;
+ case SNMP_NULL:
+ case SNMP_NOSUCHOBJECT:
+ case SNMP_NOSUCHINSTANCE:
+ case SNMP_ENDOFMIBVIEW:
+ len = 0;
+ *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ if (!asn1_null_decode(ctx, end)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ break;
+ case SNMP_OBJECTID:
+ if (!asn1_oid_decode(ctx, end, &lp, &len)) {
+ kfree(id);
+ return 0;
+ }
+ len *= sizeof(unsigned long);
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
kfree(lp);
- break;
- case SNMP_IPADDR:
- if (!asn1_octets_decode(ctx, end, &p, &len)) {
- kfree(id);
- return 0;
- }
- if (len != 4) {
- kfree(p);
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(p);
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- memcpy((*obj)->syntax.uc, p, len);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.ul, lp, len);
+ kfree(lp);
+ break;
+ case SNMP_IPADDR:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ if (len != 4) {
kfree(p);
- break;
- case SNMP_COUNTER:
- case SNMP_GAUGE:
- case SNMP_TIMETICKS:
- len = sizeof(unsigned long);
- if (!asn1_ulong_decode(ctx, end, &ul)) {
- kfree(id);
- return 0;
- }
- *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
- if (*obj == NULL) {
- kfree(id);
- if (net_ratelimit())
- pr_notice("OOM in bsalg (%d)\n", __LINE__);
- return 0;
- }
- (*obj)->syntax.ul[0] = ul;
- break;
- default:
kfree(id);
return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.uc, p, len);
+ kfree(p);
+ break;
+ case SNMP_COUNTER:
+ case SNMP_GAUGE:
+ case SNMP_TIMETICKS:
+ len = sizeof(unsigned long);
+ if (!asn1_ulong_decode(ctx, end, &ul)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ (*obj)->syntax.ul[0] = ul;
+ break;
+ default:
+ kfree(id);
+ return 0;
}
(*obj)->syntax_len = len;
@@ -1216,8 +1198,8 @@ static int snmp_translate(struct nf_conn *ct,
map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
} else {
/* DNAT replies */
- map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip);
- map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
+ map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip);
+ map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip);
}
if (map.from == map.to)
@@ -1225,8 +1207,7 @@ static int snmp_translate(struct nf_conn *ct,
if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
paylen, &map, &udph->check)) {
- if (net_ratelimit())
- printk(KERN_WARNING "bsalg: parser failed\n");
+ net_warn_ratelimited("bsalg: parser failed\n");
return NF_DROP;
}
return NF_ACCEPT;
@@ -1260,9 +1241,8 @@ static int help(struct sk_buff *skb, unsigned int protoff,
* can mess around with the payload.
*/
if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
- if (net_ratelimit())
- printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
- &iph->saddr, &iph->daddr);
+ net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
+ &iph->saddr, &iph->daddr);
return NF_DROP;
}
@@ -1310,9 +1290,9 @@ static int __init nf_nat_snmp_basic_init(void)
{
int ret = 0;
- ret = nf_conntrack_helper_register(&snmp_helper);
- if (ret < 0)
- return ret;
+ BUG_ON(nf_nat_snmp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_snmp_hook, help);
+
ret = nf_conntrack_helper_register(&snmp_trap_helper);
if (ret < 0) {
nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1303,7 @@ static int __init nf_nat_snmp_basic_init(void)
static void __exit nf_nat_snmp_basic_fini(void)
{
- nf_conntrack_helper_unregister(&snmp_helper);
+ RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
nf_conntrack_helper_unregister(&snmp_trap_helper);
}
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
deleted file mode 100644
index 95481fee8bd..00000000000
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ /dev/null
@@ -1,325 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include <linux/icmp.h>
-#include <linux/gfp.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <linux/spinlock.h>
-
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_extend.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_nat_protocol.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-#ifdef CONFIG_XFRM
-static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
-{
- const struct nf_conn *ct;
- const struct nf_conntrack_tuple *t;
- enum ip_conntrack_info ctinfo;
- enum ip_conntrack_dir dir;
- unsigned long statusbit;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return;
- dir = CTINFO2DIR(ctinfo);
- t = &ct->tuplehash[dir].tuple;
-
- if (dir == IP_CT_DIR_ORIGINAL)
- statusbit = IPS_DST_NAT;
- else
- statusbit = IPS_SRC_NAT;
-
- if (ct->status & statusbit) {
- fl->fl4_dst = t->dst.u3.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl->fl_ip_dport = t->dst.u.tcp.port;
- }
-
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- fl->fl4_src = t->src.u3.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl->fl_ip_sport = t->src.u.tcp.port;
- }
-}
-#endif
-
-static unsigned int
-nf_nat_fn(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- struct nf_conn_nat *nat;
- /* maniptype == SRC for postrouting. */
- enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
-
- /* We never see fragments: conntrack defrags on pre-routing
- and local-out, and nf_nat_out protects post-routing. */
- NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
-
- ct = nf_ct_get(skb, &ctinfo);
- /* Can't track? It's not due to stress, or conntrack would
- have dropped it. Hence it's the user's responsibilty to
- packet filter it out, or implement conntrack/NAT for that
- protocol. 8) --RR */
- if (!ct)
- return NF_ACCEPT;
-
- /* Don't try to NAT if this packet is not conntracked */
- if (nf_ct_is_untracked(ct))
- return NF_ACCEPT;
-
- nat = nfct_nat(ct);
- if (!nat) {
- /* NAT module was loaded late. */
- if (nf_ct_is_confirmed(ct))
- return NF_ACCEPT;
- nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
- if (nat == NULL) {
- pr_debug("failed to add NAT extension\n");
- return NF_ACCEPT;
- }
- }
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED+IP_CT_IS_REPLY:
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
- if (!nf_nat_icmp_reply_translation(ct, ctinfo,
- hooknum, skb))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
- case IP_CT_NEW:
-
- /* Seen it before? This can happen for loopback, retrans,
- or local packets.. */
- if (!nf_nat_initialized(ct, maniptype)) {
- unsigned int ret;
-
- ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
- if (ret != NF_ACCEPT)
- return ret;
- } else
- pr_debug("Already setup manip %s for ct %p\n",
- maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
- ct);
- break;
-
- default:
- /* ESTABLISHED */
- NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
- ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
- }
-
- return nf_nat_packet(ct, ctinfo, hooknum, skb);
-}
-
-static unsigned int
-nf_nat_in(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- unsigned int ret;
- __be32 daddr = ip_hdr(skb)->daddr;
-
- ret = nf_nat_fn(hooknum, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- daddr != ip_hdr(skb)->daddr)
- skb_dst_drop(skb);
-
- return ret;
-}
-
-static unsigned int
-nf_nat_out(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
-#ifdef CONFIG_XFRM
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
-#endif
- unsigned int ret;
-
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- ret = nf_nat_fn(hooknum, skb, in, out, okfn);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if ((ct->tuplehash[dir].tuple.src.u3.ip !=
- ct->tuplehash[!dir].tuple.dst.u3.ip) ||
- (ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all)
- )
- return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
- }
-#endif
- return ret;
-}
-
-static unsigned int
-nf_nat_local_fn(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned int ret;
-
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
- return NF_ACCEPT;
-
- ret = nf_nat_fn(hooknum, skb, in, out, okfn);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.dst.u3.ip !=
- ct->tuplehash[!dir].tuple.src.u3.ip) {
- if (ip_route_me_harder(skb, RTN_UNSPEC))
- ret = NF_DROP;
- }
-#ifdef CONFIG_XFRM
- else if (ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all)
- if (ip_xfrm_me_harder(skb))
- ret = NF_DROP;
-#endif
- }
- return ret;
-}
-
-/* We must be after connection tracking and before packet filtering. */
-
-static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_in,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_out,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC,
- },
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_local_fn,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_fn,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_NAT_SRC,
- },
-};
-
-static int __init nf_nat_standalone_init(void)
-{
- int ret = 0;
-
- need_ipv4_conntrack();
-
-#ifdef CONFIG_XFRM
- BUG_ON(ip_nat_decode_session != NULL);
- rcu_assign_pointer(ip_nat_decode_session, nat_decode_session);
-#endif
- ret = nf_nat_rule_init();
- if (ret < 0) {
- pr_err("nf_nat_init: can't setup rules.\n");
- goto cleanup_decode_session;
- }
- ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
- if (ret < 0) {
- pr_err("nf_nat_init: can't register hooks.\n");
- goto cleanup_rule_init;
- }
- return ret;
-
- cleanup_rule_init:
- nf_nat_rule_cleanup();
- cleanup_decode_session:
-#ifdef CONFIG_XFRM
- rcu_assign_pointer(ip_nat_decode_session, NULL);
- synchronize_net();
-#endif
- return ret;
-}
-
-static void __exit nf_nat_standalone_fini(void)
-{
- nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
- nf_nat_rule_cleanup();
-#ifdef CONFIG_XFRM
- rcu_assign_pointer(ip_nat_decode_session, NULL);
- synchronize_net();
-#endif
- /* Conntrack caches are unregistered in nf_conntrack_cleanup */
-}
-
-module_init(nf_nat_standalone_init);
-module_exit(nf_nat_standalone_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat");
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
deleted file mode 100644
index 7274a43c7a1..00000000000
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/udp.h>
-
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/nf_nat_rule.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <linux/netfilter/nf_conntrack_tftp.h>
-
-MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
-MODULE_DESCRIPTION("TFTP NAT helper");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_tftp");
-
-static unsigned int help(struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- struct nf_conntrack_expect *exp)
-{
- const struct nf_conn *ct = exp->master;
-
- exp->saved_proto.udp.port
- = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
- exp->dir = IP_CT_DIR_REPLY;
- exp->expectfn = nf_nat_follow_master;
- if (nf_ct_expect_related(exp) != 0)
- return NF_DROP;
- return NF_ACCEPT;
-}
-
-static void __exit nf_nat_tftp_fini(void)
-{
- rcu_assign_pointer(nf_nat_tftp_hook, NULL);
- synchronize_rcu();
-}
-
-static int __init nf_nat_tftp_init(void)
-{
- BUG_ON(nf_nat_tftp_hook != NULL);
- rcu_assign_pointer(nf_nat_tftp_hook, help);
- return 0;
-}
-
-module_init(nf_nat_tftp_init);
-module_exit(nf_nat_tftp_fini);
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
new file mode 100644
index 00000000000..19412a4063f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netfilter_arp.h>
+#include <net/netfilter/nf_tables.h>
+
+/*
+ * ARP hook entry point: build a packet-info descriptor from the hook
+ * arguments and hand it to the nf_tables chain evaluator.  @okfn is not
+ * used.  Returns whatever nft_do_chain() returns.
+ */
+static unsigned int nft_do_chain_arp(const struct nf_hook_ops *ops,
+				     struct sk_buff *skb,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo info;
+
+	nft_set_pktinfo(&info, ops, skb, in, out);
+	return nft_do_chain(&info, ops);
+}
+
+/*
+ * Template descriptor for the ARP family.  It is never registered directly:
+ * nf_tables_arp_init_net() copies it into a per-namespace allocation.
+ * All three ARP hooks (IN, OUT, FORWARD) share the same handler.
+ */
+static struct nft_af_info nft_af_arp __read_mostly = {
+ .family = NFPROTO_ARP,
+ .nhooks = NF_ARP_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_ARP_IN] = nft_do_chain_arp,
+ [NF_ARP_OUT] = nft_do_chain_arp,
+ [NF_ARP_FORWARD] = nft_do_chain_arp,
+ },
+};
+
+/*
+ * Per-namespace init: give @net its own copy of the nft_af_arp template and
+ * register that copy with nf_tables.
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated, or the
+ * error reported by nft_register_afinfo().  On failure net->nft.arp is reset
+ * to NULL so that no pointer to freed memory is left behind.
+ */
+static int nf_tables_arp_init_net(struct net *net)
+{
+	int err;
+
+	net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.arp == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
+
+	err = nft_register_afinfo(net, net->nft.arp);
+	if (err < 0)
+		goto err_free;
+
+	return 0;
+
+err_free:
+	kfree(net->nft.arp);
+	net->nft.arp = NULL;
+	/* Propagate the real cause instead of always claiming -ENOMEM. */
+	return err;
+}
+
+/*
+ * Per-namespace teardown: unregister this namespace's ARP family descriptor
+ * and free the copy allocated by nf_tables_arp_init_net().
+ */
+static void nf_tables_arp_exit_net(struct net *net)
+{
+	struct nft_af_info *afi = net->nft.arp;
+
+	nft_unregister_afinfo(afi);
+	kfree(afi);
+}
+
+/* Per-network-namespace init/exit callbacks for the ARP family. */
+static struct pernet_operations nf_tables_arp_net_ops = {
+ .init = nf_tables_arp_init_net,
+ .exit = nf_tables_arp_exit_net,
+};
+
+/*
+ * "filter" chain type for the ARP family; hook_mask allows it on the
+ * IN, OUT and FORWARD hooks.
+ */
+static const struct nf_chain_type filter_arp = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_ARP,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_ARP_IN) |
+ (1 << NF_ARP_OUT) |
+ (1 << NF_ARP_FORWARD),
+};
+
+/*
+ * Module init: register the ARP "filter" chain type, then the per-namespace
+ * operations.
+ *
+ * Returns 0 on success or a negative error.  A failed chain-type
+ * registration is reported immediately rather than ignored, so the module
+ * never ends up loaded with an unregistered chain type that the exit path
+ * would later try to unregister.
+ */
+static int __init nf_tables_arp_init(void)
+{
+	int ret;
+
+	ret = nft_register_chain_type(&filter_arp);
+	if (ret < 0)
+		return ret;
+
+	ret = register_pernet_subsys(&nf_tables_arp_net_ops);
+	if (ret < 0)
+		nft_unregister_chain_type(&filter_arp);
+
+	return ret;
+}
+
+/*
+ * Module exit: undo nf_tables_arp_init() in reverse order -- per-namespace
+ * operations first, then the "filter" chain type.
+ */
+static void __exit nf_tables_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_arp_net_ops);
+ nft_unregister_chain_type(&filter_arp);
+}
+
+module_init(nf_tables_arp_init);
+module_exit(nf_tables_arp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
new file mode 100644
index 00000000000..6820c8c4084
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+
+/*
+ * Generic IPv4 hook handler: populate an IPv4 packet-info descriptor from
+ * the hook arguments and evaluate the chain attached to @ops.  @okfn is not
+ * used.  Returns whatever nft_do_chain() returns.
+ */
+static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo info;
+
+	nft_set_pktinfo_ipv4(&info, ops, skb, in, out);
+	return nft_do_chain(&info, ops);
+}
+
+/*
+ * LOCAL_OUT handler.  Packets shorter than an IPv4 header, or whose header
+ * length field (ihl, in 32-bit words) is below the minimum header size, are
+ * accepted without chain evaluation and a rate-limited notice is logged.
+ * Everything else goes through nft_do_chain_ipv4().
+ */
+static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+				    struct sk_buff *skb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    int (*okfn)(struct sk_buff *))
+{
+	if (unlikely(skb->len < sizeof(struct iphdr) ||
+		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
+		/* Same net_ratelimit() + pr_info() pair, via the helper. */
+		net_info_ratelimited("nf_tables_ipv4: ignoring short SOCK_RAW packet\n");
+		return NF_ACCEPT;
+	}
+
+	return nft_do_chain_ipv4(ops, skb, in, out, okfn);
+}
+
+/*
+ * Template descriptor for the IPv4 family, exported for other modules.
+ * nf_tables_ipv4_init_net() copies it into a per-namespace allocation.
+ * LOCAL_OUT goes through nft_ipv4_output(), which screens short packets
+ * first; the other hooks call nft_do_chain_ipv4() directly.
+ */
+struct nft_af_info nft_af_ipv4 __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
+ [NF_INET_LOCAL_OUT] = nft_ipv4_output,
+ [NF_INET_FORWARD] = nft_do_chain_ipv4,
+ [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
+ [NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
+ },
+};
+EXPORT_SYMBOL_GPL(nft_af_ipv4);
+
+/*
+ * Per-namespace init: give @net its own copy of the nft_af_ipv4 template and
+ * register that copy with nf_tables.
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated, or the
+ * error reported by nft_register_afinfo().  On failure net->nft.ipv4 is
+ * reset to NULL so that no pointer to freed memory is left behind.
+ */
+static int nf_tables_ipv4_init_net(struct net *net)
+{
+	int err;
+
+	net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.ipv4 == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
+
+	err = nft_register_afinfo(net, net->nft.ipv4);
+	if (err < 0)
+		goto err_free;
+
+	return 0;
+
+err_free:
+	kfree(net->nft.ipv4);
+	net->nft.ipv4 = NULL;
+	/* Propagate the real cause instead of always claiming -ENOMEM. */
+	return err;
+}
+
+/*
+ * Per-namespace teardown: unregister this namespace's IPv4 family descriptor
+ * and free the copy allocated by nf_tables_ipv4_init_net().
+ */
+static void nf_tables_ipv4_exit_net(struct net *net)
+{
+	struct nft_af_info *afi = net->nft.ipv4;
+
+	nft_unregister_afinfo(afi);
+	kfree(afi);
+}
+
+/* Per-network-namespace init/exit callbacks for the IPv4 family. */
+static struct pernet_operations nf_tables_ipv4_net_ops = {
+ .init = nf_tables_ipv4_init_net,
+ .exit = nf_tables_ipv4_exit_net,
+};
+
+/*
+ * "filter" chain type for the IPv4 family; hook_mask allows it on all five
+ * inet hooks listed below.
+ */
+static const struct nf_chain_type filter_ipv4 = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+/*
+ * Module init: register the IPv4 "filter" chain type, then the
+ * per-namespace operations.
+ *
+ * Returns 0 on success or a negative error.  A failed chain-type
+ * registration is reported immediately rather than ignored, so the module
+ * never ends up loaded with an unregistered chain type that the exit path
+ * would later try to unregister.
+ */
+static int __init nf_tables_ipv4_init(void)
+{
+	int ret;
+
+	ret = nft_register_chain_type(&filter_ipv4);
+	if (ret < 0)
+		return ret;
+
+	ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
+	if (ret < 0)
+		nft_unregister_chain_type(&filter_ipv4);
+
+	return ret;
+}
+
+/*
+ * Module exit: undo nf_tables_ipv4_init() in reverse order -- per-namespace
+ * operations first, then the "filter" chain type.
+ */
+static void __exit nf_tables_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
+ nft_unregister_chain_type(&filter_ipv4);
+}
+
+module_init(nf_tables_ipv4_init);
+module_exit(nf_tables_ipv4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
new file mode 100644
index 00000000000..3964157d826
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+/*
+ * NAT chains
+ */
+
+/*
+ * Core NAT hook, shared by all NAT chain hooks.
+ *
+ * Packets without a conntrack entry, untracked packets, and packets whose
+ * conntrack cannot get a NAT extension are accepted untouched.  Related ICMP
+ * packets are handled by nf_nat_icmp_reply_translation() alone.  For new (or
+ * related non-ICMP) connections whose manipulation for this hook is not yet
+ * initialized, the nf_tables chain is evaluated; if it accepts without
+ * setting up a binding, a null binding is allocated.  Finally the packet is
+ * passed to nf_nat_packet().
+ */
+static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	const unsigned int hooknum = ops->hooknum;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+	enum ip_conntrack_info ctinfo;
+	struct nft_pktinfo pktinfo;
+	unsigned int verdict;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+
+	if (nf_ct_nat_ext_add(ct) == NULL)
+		return NF_ACCEPT;
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP)
+			return nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							     hooknum) ?
+				NF_ACCEPT : NF_DROP;
+		/* Fall through */
+	case IP_CT_NEW:
+		/* Binding for this direction already set up: nothing to do. */
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		nft_set_pktinfo_ipv4(&pktinfo, ops, skb, in, out);
+
+		verdict = nft_do_chain(&pktinfo, ops);
+		if (verdict != NF_ACCEPT)
+			return verdict;
+
+		/* The chain accepted but installed no mapping. */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			verdict = nf_nat_alloc_null_binding(ct, hooknum);
+			if (verdict != NF_ACCEPT)
+				return verdict;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+}
+
+/*
+ * PRE_ROUTING NAT hook: run nf_nat_fn() and, if the packet survives and its
+ * destination address was changed, drop the dst attached to the skb.
+ */
+static unsigned int nf_nat_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	const __be32 orig_daddr = ip_hdr(skb)->daddr;
+	unsigned int verdict = nf_nat_fn(ops, skb, in, out, okfn);
+
+	if (verdict == NF_DROP || verdict == NF_STOLEN)
+		return verdict;
+
+	if (ip_hdr(skb)->daddr != orig_daddr)
+		skb_dst_drop(skb);
+
+	return verdict;
+}
+
+/*
+ * POST_ROUTING NAT hook: run nf_nat_fn().  With CONFIG_XFRM, if the packet
+ * survives and the source address or source protocol part of the current
+ * direction's tuple differs from the destination of the opposite direction,
+ * call nf_xfrm_me_harder(); the packet is dropped if that fails.
+ */
+static unsigned int nf_nat_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int verdict;
+
+	verdict = nf_nat_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (verdict == NF_DROP || verdict == NF_STOLEN)
+		return verdict;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+		const struct nf_conntrack_tuple *cur = &ct->tuplehash[dir].tuple;
+		const struct nf_conntrack_tuple *inv = &ct->tuplehash[!dir].tuple;
+
+		if (cur->src.u3.ip != inv->dst.u3.ip ||
+		    cur->src.u.all != inv->dst.u.all) {
+			if (nf_xfrm_me_harder(skb, AF_INET) != 0)
+				return NF_DROP;
+			return verdict;
+		}
+	}
+#endif
+	return verdict;
+}
+
+/*
+ * LOCAL_OUT NAT hook: run nf_nat_fn().  If the packet survives and the
+ * destination address of the current direction's tuple differs from the
+ * source of the opposite direction, re-route via ip_route_me_harder().
+ * Otherwise, with CONFIG_XFRM, a differing protocol part triggers
+ * nf_xfrm_me_harder().  The packet is dropped if either call fails.
+ */
+static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	const struct nf_conntrack_tuple *cur, *inv;
+	enum ip_conntrack_info ctinfo;
+	enum ip_conntrack_dir dir;
+	const struct nf_conn *ct;
+	unsigned int verdict;
+
+	verdict = nf_nat_fn(ops, skb, in, out, okfn);
+	if (verdict == NF_DROP || verdict == NF_STOLEN)
+		return verdict;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return verdict;
+
+	dir = CTINFO2DIR(ctinfo);
+	cur = &ct->tuplehash[dir].tuple;
+	inv = &ct->tuplehash[!dir].tuple;
+
+	if (cur->dst.u3.ip != inv->src.u3.ip) {
+		if (ip_route_me_harder(skb, RTN_UNSPEC))
+			verdict = NF_DROP;
+	}
+#ifdef CONFIG_XFRM
+	else if (cur->dst.u.all != inv->src.u.all) {
+		if (nf_xfrm_me_harder(skb, AF_INET))
+			verdict = NF_DROP;
+	}
+#endif
+	return verdict;
+}
+
+/*
+ * "nat" chain type for IPv4.  Each permitted hook has its own handler:
+ * the pre-routing, post-routing and output wrappers add their fix-ups
+ * around nf_nat_fn(), while LOCAL_IN calls nf_nat_fn() directly.
+ */
+static const struct nf_chain_type nft_chain_nat_ipv4 = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nf_nat_prerouting,
+ [NF_INET_POST_ROUTING] = nf_nat_postrouting,
+ [NF_INET_LOCAL_OUT] = nf_nat_output,
+ [NF_INET_LOCAL_IN] = nf_nat_fn,
+ },
+};
+
+/*
+ * Module init: register the IPv4 "nat" chain type.  Returns the negative
+ * error from nft_register_chain_type() on failure, 0 otherwise.
+ */
+static int __init nft_chain_nat_init(void)
+{
+	int err = nft_register_chain_type(&nft_chain_nat_ipv4);
+
+	return err < 0 ? err : 0;
+}
+
+/* Module exit: unregister the IPv4 "nat" chain type. */
+static void __exit nft_chain_nat_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_nat_ipv4);
+}
+
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
new file mode 100644
index 00000000000..125b66766c0
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+/*
+ * LOCAL_OUT hook for "route" chains.
+ *
+ * Packets too short to hold an IPv4 header are accepted untouched.
+ * Otherwise the source address, destination address, TOS and skb mark are
+ * saved, the chain is evaluated, and -- unless the verdict is NF_DROP or
+ * NF_QUEUE -- the packet is re-routed with ip_route_me_harder() if any of
+ * those fields changed.  A re-routing failure turns the verdict into
+ * NF_DROP.
+ */
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
+					int (*okfn)(struct sk_buff *))
+{
+	__be32 old_saddr, old_daddr;
+	const struct iphdr *iph;
+	struct nft_pktinfo pkt;
+	unsigned int verdict;
+	u_int8_t old_tos;
+	u32 old_mark;
+
+	/* root is playing with raw sockets. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+	old_mark = skb->mark;
+	iph = ip_hdr(skb);
+	old_saddr = iph->saddr;
+	old_daddr = iph->daddr;
+	old_tos = iph->tos;
+
+	verdict = nft_do_chain(&pkt, ops);
+	if (verdict == NF_DROP || verdict == NF_QUEUE)
+		return verdict;
+
+	/* Re-read the header pointer after chain evaluation. */
+	iph = ip_hdr(skb);
+	if ((iph->saddr != old_saddr ||
+	     iph->daddr != old_daddr ||
+	     skb->mark != old_mark ||
+	     iph->tos != old_tos) &&
+	    ip_route_me_harder(skb, RTN_UNSPEC))
+		verdict = NF_DROP;
+
+	return verdict;
+}
+
+/* "route" chain type for IPv4; only permitted on the LOCAL_OUT hook. */
+static const struct nf_chain_type nft_chain_route_ipv4 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook,
+ },
+};
+
+/* Module init: register the IPv4 "route" chain type and return the result. */
+static int __init nft_chain_route_init(void)
+{
+ return nft_register_chain_type(&nft_chain_route_ipv4);
+}
+
+/* Module exit: unregister the IPv4 "route" chain type. */
+static void __exit nft_chain_route_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_route_ipv4);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 00000000000..e79718a382f
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/icmp.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/nft_reject.h>
+
+/*
+ * Evaluate the IPv4 "reject" expression: depending on the configured type,
+ * send an ICMP unreachable with the configured code or a TCP reset (any
+ * other type sends nothing), then set the verdict register to NF_DROP.
+ */
+void nft_reject_ipv4_eval(const struct nft_expr *expr,
+			  struct nft_data data[NFT_REG_MAX + 1],
+			  const struct nft_pktinfo *pkt)
+{
+	struct nft_reject *priv = nft_expr_priv(expr);
+
+	if (priv->type == NFT_REJECT_ICMP_UNREACH)
+		nf_send_unreach(pkt->skb, priv->icmp_code);
+	else if (priv->type == NFT_REJECT_TCP_RST)
+		nf_send_reset(pkt->skb, pkt->ops->hooknum);
+
+	data[NFT_REG_VERDICT].verdict = NF_DROP;
+}
+EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval);
+
+/*
+ * The ops table and the expression type point at each other, hence the
+ * forward declaration of nft_reject_ipv4_type.
+ */
+static struct nft_expr_type nft_reject_ipv4_type;
+/* Operations for the IPv4 "reject" expression. */
+static const struct nft_expr_ops nft_reject_ipv4_ops = {
+ .type = &nft_reject_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_ipv4_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+};
+
+/* Expression type descriptor: the "reject" expression for the IPv4 family. */
+static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "reject",
+ .ops = &nft_reject_ipv4_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+/* Module init: register the IPv4 "reject" expression and return the result. */
+static int __init nft_reject_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_reject_ipv4_type);
+}
+
+/* Module exit: unregister the IPv4 "reject" expression. */
+static void __exit nft_reject_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_ipv4_type);
+}
+
+module_init(nft_reject_ipv4_module_init);
+module_exit(nft_reject_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 00000000000..044a0ddf6a7
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,1218 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * "Ping" sockets
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6),
+ * Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ *
+ */
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#endif
+
+struct ping_table { /* hash table of ping sockets, keyed by ICMP echo identifier */
+ struct hlist_nulls_head hash[PING_HTABLE_SIZE]; /* bucket chosen by ping_hashslot() */
+ rwlock_t lock; /* write-held while (un)hashing, read-held for lookups and /proc walks */
+};
+
+static struct ping_table ping_table; /* the single table used by every function in this file */
+struct pingv6_ops pingv6_ops; /* IPv6 callbacks invoked from the AF_INET6 branches below */
+EXPORT_SYMBOL_GPL(pingv6_ops);
+
+static u16 ping_port_rover; /* last identifier handed out automatically by ping_get_port() */
+
+/* Combine @num with net_hash_mix(@net) and mask the sum down to a bucket index. */
+static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask)
+{
+	int bucket;
+
+	bucket = (net_hash_mix(net) + num) & mask;
+	pr_debug("hash(%d) = %d\n", num, bucket);
+
+	return bucket;
+}
+EXPORT_SYMBOL_GPL(ping_hash);
+
+/* Return the bucket of @table that identifier @num hashes to in namespace @net. */
+static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
+						     struct net *net, unsigned int num)
+{
+	int idx = ping_hashfn(net, num, PING_HTABLE_MASK);
+
+	return table->hash + idx;
+}
+/*
+ * Assign an ICMP echo identifier to @sk and hash the socket if needed.
+ *
+ * @ident == 0: search up to 65536 candidates, starting after
+ * ping_port_rover and skipping zero, for one that no socket in the
+ * candidate's bucket already uses.
+ * @ident != 0: claim exactly that identifier; this fails if another socket
+ * in the bucket has it, unless both sockets have sk_reuse set.
+ *
+ * The whole operation runs with ping_table.lock write-held.
+ * Returns 0 on success and 1 (not a negative errno) on failure.
+ */
+int ping_get_port(struct sock *sk, unsigned short ident)
+{
+ struct hlist_nulls_node *node;
+ struct hlist_nulls_head *hlist;
+ struct inet_sock *isk, *isk2;
+ struct sock *sk2 = NULL;
+
+ isk = inet_sk(sk);
+ write_lock_bh(&ping_table.lock);
+ if (ident == 0) {
+ u32 i;
+ u16 result = ping_port_rover + 1; /* u16: wraps around after 0xffff */
+
+ for (i = 0; i < (1L << 16); i++, result++) {
+ if (!result)
+ result++; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, sock_net(sk),
+ result);
+ ping_portaddr_for_each_entry(sk2, node, hlist) {
+ isk2 = inet_sk(sk2);
+
+ if (isk2->inet_num == result)
+ goto next_port;
+ }
+
+ /* found: remember it so the next automatic search starts after it */
+ ping_port_rover = ident = result;
+ break;
+next_port:
+ ;
+ }
+ if (i >= (1L << 16)) /* loop ran to completion: every candidate was in use */
+ goto fail;
+ } else {
+ hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+ ping_portaddr_for_each_entry(sk2, node, hlist) {
+ isk2 = inet_sk(sk2);
+
+ /* BUG? Why is this reuse and not reuseaddr? ping.c
+ * doesn't turn off SO_REUSEADDR, and it doesn't expect
+ * that other ping processes can steal its packets.
+ */
+ if ((isk2->inet_num == ident) &&
+ (sk2 != sk) &&
+ (!sk2->sk_reuse || !sk->sk_reuse))
+ goto fail;
+ }
+ }
+
+ pr_debug("found port/ident = %d\n", ident);
+ isk->inet_num = ident;
+ if (sk_unhashed(sk)) {
+ pr_debug("was not hashed\n");
+ sock_hold(sk); /* reference held on behalf of the table; ping_unhash() drops it */
+ hlist_nulls_add_head(&sk->sk_nulls_node, hlist); /* hlist is the bucket computed for ident above */
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ }
+ write_unlock_bh(&ping_table.lock);
+ return 0;
+
+fail:
+ write_unlock_bh(&ping_table.lock);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ping_get_port);
+/*
+ * .hash hook of ping_prot. Sockets are inserted by ping_get_port(), so this
+ * is not expected to run: it logs the identifier and then hits BUG().
+ */
+void ping_hash(struct sock *sk)
+{
+ pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+ BUG(); /* "Please do not press this button again." */
+}
+
+/*
+ * Remove @sk from the ping hash table, if it is hashed, and drop the
+ * reference that ping_get_port() took when inserting it.
+ *
+ * The hashed test is made under ping_table.lock so that two concurrent
+ * callers cannot both see the socket as hashed.  The node is re-initialised
+ * after removal: hlist_nulls_del() alone leaves it looking hashed, so a
+ * second unhash of the same socket (e.g. disconnect followed by close)
+ * would otherwise delete a stale node and put the socket a second time.
+ */
+void ping_unhash(struct sock *sk)
+{
+	struct inet_sock *isk = inet_sk(sk);
+
+	pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+	write_lock_bh(&ping_table.lock);
+	if (sk_hashed(sk)) {
+		hlist_nulls_del(&sk->sk_nulls_node);
+		/* make sk_hashed() false again so a repeated call is a no-op */
+		sk_nulls_node_init(&sk->sk_nulls_node);
+		sock_put(sk);
+		isk->inet_num = 0;
+		isk->inet_sport = 0;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	}
+	write_unlock_bh(&ping_table.lock);
+}
+EXPORT_SYMBOL_GPL(ping_unhash);
+/*
+ * Find the socket that should receive @skb for echo identifier @ident.
+ *
+ * Walks the identifier's bucket with ping_table.lock read-held. A socket
+ * matches when its inet_num equals @ident, its bound local address (if any)
+ * equals the packet's destination address for the matching address family,
+ * and its bound device (if any) equals the receiving interface.
+ * Returns the socket with an extra reference held, or NULL.
+ */
+static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
+{
+ struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
+ struct sock *sk = NULL;
+ struct inet_sock *isk;
+ struct hlist_nulls_node *hnode;
+ int dif = skb->dev->ifindex;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
+ (int)ident, &ip_hdr(skb)->daddr, dif);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
+ (int)ident, &ipv6_hdr(skb)->daddr, dif);
+#endif
+ }
+
+ read_lock_bh(&ping_table.lock);
+
+ ping_portaddr_for_each_entry(sk, hnode, hslot) {
+ isk = inet_sk(sk);
+
+ pr_debug("iterate\n");
+ if (isk->inet_num != ident)
+ continue;
+
+ if (skb->protocol == htons(ETH_P_IP) &&
+ sk->sk_family == AF_INET) {
+ pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk,
+ (int) isk->inet_num, &isk->inet_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (isk->inet_rcv_saddr &&
+ isk->inet_rcv_saddr != ip_hdr(skb)->daddr)
+ continue;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6) &&
+ sk->sk_family == AF_INET6) {
+
+ pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
+ (int) isk->inet_num,
+ &sk->sk_v6_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
+ &ipv6_hdr(skb)->daddr))
+ continue;
+#endif
+ } /* NOTE(review): a socket whose family differs from the packet's skips both address checks */
+
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ continue;
+
+ sock_hold(sk); /* the caller must sock_put() the returned socket */
+ goto exit;
+ }
+
+ sk = NULL; /* loop ended without a match */
+exit:
+ read_unlock_bh(&ping_table.lock);
+
+ return sk;
+}
+
+/*
+ * Copy the namespace's ping group range into @low and @high, retrying the
+ * read until read_seqretry() reports that no writer interfered.
+ */
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+					  kgid_t *high)
+{
+	kgid_t *range = net->ipv4.ping_group_range.range;
+	unsigned int start;
+
+	for (;;) {
+		start = read_seqbegin(&net->ipv4.ping_group_range.lock);
+		*low = range[0];
+		*high = range[1];
+		if (!read_seqretry(&net->ipv4.ping_group_range.lock, start))
+			break;
+	}
+}
+
+/*
+ * .init hook of ping_prot: permit the socket only if the caller's effective
+ * GID, or one of its supplementary groups, lies inside the namespace's ping
+ * group range [low, high]. Returns 0 when permitted, -EACCES otherwise.
+ */
+int ping_init_sock(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ kgid_t group = current_egid();
+ struct group_info *group_info;
+ int i, j, count;
+ kgid_t low, high;
+ int ret = 0;
+
+ inet_get_ping_group_range_net(net, &low, &high);
+ if (gid_lte(low, group) && gid_lte(group, high))
+ return 0; /* effective GID is in range; group_info has not been taken yet */
+
+ group_info = get_current_groups(); /* released at out_release_group */
+ count = group_info->ngroups;
+ for (i = 0; i < group_info->nblocks; i++) {
+ int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); /* entries to scan in this block */
+ for (j = 0; j < cp_count; j++) {
+ kgid_t gid = group_info->blocks[i][j];
+ if (gid_lte(low, gid) && gid_lte(gid, high))
+ goto out_release_group; /* ret is still 0 here */
+ }
+
+ count -= cp_count;
+ }
+
+ ret = -EACCES;
+
+out_release_group:
+ put_group_info(group_info);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ping_init_sock);
+
+/* .close hook of ping_prot: log the socket state, then sk_common_release() it. */
+void ping_close(struct sock *sk, long timeout)
+{
+	struct inet_sock *isk = inet_sk(sk);
+
+	pr_debug("ping_close(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+	pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
+
+	sk_common_release(sk);
+}
+EXPORT_SYMBOL_GPL(ping_close);
+
+/*
+ * Check the address passed to bind() on a ping socket; for a scoped IPv6
+ * address this also sets sk->sk_bound_dev_if to the scope id.
+ *
+ * Returns 0 if the address is acceptable, otherwise:
+ *   -EINVAL        address too short, wrong IPv6 family, non-unicast IPv6
+ *                  address, or scoped IPv6 address without a scope id
+ *   -EAFNOSUPPORT  socket family is neither AF_INET nor AF_INET6, or the
+ *                  IPv4 sockaddr does not carry AF_INET
+ *   -EADDRNOTAVAIL address is not local (and non-local binding is not
+ *                  enabled), or is an IPv4 multicast/broadcast address
+ *   -ENODEV        the IPv6 scope id names no device
+ */
+static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
+				struct sockaddr *uaddr, int addr_len)
+{
+	struct net *net = sock_net(sk);
+
+	if (sk->sk_family == AF_INET) {
+		struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+		int chk_addr_ret;
+
+		if (addr_len < sizeof(*addr))
+			return -EINVAL;
+
+		/*
+		 * ping_set_saddr() dispatches on sa_family, so a mismatching
+		 * family must not get past this point.  AF_UNSPEC with
+		 * INADDR_ANY is still accepted for compatibility with
+		 * applications that leave the family unset.
+		 */
+		if (addr->sin_family != AF_INET &&
+		    !(addr->sin_family == AF_UNSPEC &&
+		      addr->sin_addr.s_addr == htonl(INADDR_ANY)))
+			return -EAFNOSUPPORT;
+
+		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
+			 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+		chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+		/* the wildcard address is always treated as local */
+		if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+			chk_addr_ret = RTN_LOCAL;
+
+		if ((sysctl_ip_nonlocal_bind == 0 &&
+		    isk->freebind == 0 && isk->transparent == 0 &&
+		     chk_addr_ret != RTN_LOCAL) ||
+		    chk_addr_ret == RTN_MULTICAST ||
+		    chk_addr_ret == RTN_BROADCAST)
+			return -EADDRNOTAVAIL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (sk->sk_family == AF_INET6) {
+		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+		int addr_type, scoped, has_addr;
+		struct net_device *dev = NULL;
+
+		if (addr_len < sizeof(*addr))
+			return -EINVAL;
+
+		if (addr->sin6_family != AF_INET6)
+			return -EINVAL;
+
+		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
+			 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
+
+		addr_type = ipv6_addr_type(&addr->sin6_addr);
+		scoped = __ipv6_addr_needs_scope_id(addr_type);
+		if ((addr_type != IPV6_ADDR_ANY &&
+		     !(addr_type & IPV6_ADDR_UNICAST)) ||
+		    (scoped && !addr->sin6_scope_id))
+			return -EINVAL;
+
+		/* dev is only valid inside this RCU read-side section */
+		rcu_read_lock();
+		if (addr->sin6_scope_id) {
+			dev = dev_get_by_index_rcu(net, addr->sin6_scope_id);
+			if (!dev) {
+				rcu_read_unlock();
+				return -ENODEV;
+			}
+		}
+		has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
+						    scoped);
+		rcu_read_unlock();
+
+		if (!(isk->freebind || isk->transparent || has_addr ||
+		      addr_type == IPV6_ADDR_ANY))
+			return -EADDRNOTAVAIL;
+
+		if (scoped)
+			sk->sk_bound_dev_if = addr->sin6_scope_id;
+#endif
+	} else {
+		return -EAFNOSUPPORT;
+	}
+	return 0;
+}
+
+/*
+ * Record the address in @saddr as both the receive and the source address of
+ * @sk.  The branch is chosen by @saddr's family; other families do nothing.
+ */
+static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+{
+	switch (saddr->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin = (struct sockaddr_in *) saddr;
+		struct inet_sock *inet = inet_sk(sk);
+
+		inet->inet_saddr = sin->sin_addr.s_addr;
+		inet->inet_rcv_saddr = inet->inet_saddr;
+		break;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6: {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) saddr;
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		np->saddr = sin6->sin6_addr;
+		sk->sk_v6_rcv_saddr = np->saddr;
+		break;
+	}
+#endif
+	default:
+		break;
+	}
+}
+
+/*
+ * Roll back a failed bind: put the bound device back to @dif and zero the
+ * socket's receive and source addresses for its own address family.
+ */
+static void ping_clear_saddr(struct sock *sk, int dif)
+{
+	sk->sk_bound_dev_if = dif;
+
+	switch (sk->sk_family) {
+	case AF_INET: {
+		struct inet_sock *inet = inet_sk(sk);
+
+		inet->inet_saddr = 0;
+		inet->inet_rcv_saddr = 0;
+		break;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6: {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
+		memset(&np->saddr, 0, sizeof(np->saddr));
+		break;
+	}
+#endif
+	default:
+		break;
+	}
+}
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ *
+ * The address is validated first, outside the socket lock. With the lock
+ * held the socket must not already have an identifier (-EINVAL); the local
+ * address is then recorded and an identifier obtained from ping_get_port()
+ * (-EADDRINUSE on failure, after undoing the address and device changes).
+ * Returns 0 on success or a negative errno.
+ */
+
+int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ unsigned short snum;
+ int err;
+ int dif = sk->sk_bound_dev_if; /* saved: ping_check_bind_addr() may change it */
+
+ err = ping_check_bind_addr(sk, isk, uaddr, addr_len);
+ if (err)
+ return err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (isk->inet_num != 0) /* already has an identifier */
+ goto out;
+
+ err = -EADDRINUSE;
+ ping_set_saddr(sk, uaddr);
+ snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); /* the "port" field carries the requested identifier; also read this way for IPv6 addresses */
+ if (ping_get_port(sk, snum) != 0) {
+ ping_clear_saddr(sk, dif); /* roll back address and bound device */
+ goto out;
+ }
+
+ pr_debug("after bind(): num = %d, dif = %d\n",
+ (int)isk->inet_num,
+ (int)sk->sk_bound_dev_if);
+
+ err = 0;
+ if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK; /* only when a specific address was bound */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#endif
+
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; /* only when the caller chose the identifier */
+ isk->inet_sport = htons(isk->inet_num);
+ isk->inet_daddr = 0;
+ isk->inet_dport = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+#endif
+
+ sk_dst_reset(sk);
+out:
+ release_sock(sk);
+ pr_debug("ping_v4_bind -> %d\n", err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ping_bind);
+
+/*
+ * Is this a supported type of ICMP message?
+ * Only echo requests with code 0 qualify: ICMP_ECHO for AF_INET and
+ * ICMPV6_ECHO_REQUEST for AF_INET6.  Returns 1 if supported, 0 otherwise.
+ */
+
+static inline int ping_supported(int family, int type, int code)
+{
+	if (code != 0)
+		return 0;
+
+	if (family == AF_INET)
+		return type == ICMP_ECHO;
+	if (family == AF_INET6)
+		return type == ICMPV6_ECHO_REQUEST;
+
+	return 0;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ *
+ * @offset locates the embedded ICMP header of the original echo request
+ * inside @skb; its echo id selects the socket. The outer error type/code is
+ * translated to an errno, which is queued on the socket's error queue when
+ * the socket has recverr set, and reported through sk_err otherwise only
+ * for hard errors on an established socket.
+ */
+
+void ping_err(struct sk_buff *skb, int offset, u32 info)
+{
+ int family;
+ struct icmphdr *icmph;
+ struct inet_sock *inet_sock;
+ int type;
+ int code;
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk;
+ int harderr;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ family = AF_INET;
+ type = icmp_hdr(skb)->type; /* type/code of the error message itself */
+ code = icmp_hdr(skb)->code;
+ icmph = (struct icmphdr *)(skb->data + offset); /* header of the echo that triggered the error */
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ family = AF_INET6;
+ type = icmp6_hdr(skb)->icmp6_type;
+ code = icmp6_hdr(skb)->icmp6_code;
+ icmph = (struct icmphdr *) (skb->data + offset);
+ } else {
+ BUG();
+ }
+
+ /* We assume the packet has already been checked by icmp_unreach */
+
+ if (!ping_supported(family, icmph->type, icmph->code))
+ return;
+
+ pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n",
+ skb->protocol, type, code, ntohs(icmph->un.echo.id),
+ ntohs(icmph->un.echo.sequence));
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); /* returns with a reference held; dropped at out */
+ if (sk == NULL) {
+ pr_debug("no socket, dropping\n");
+ return; /* No socket for error */
+ }
+ pr_debug("err on socket %p\n", sk);
+
+ err = 0;
+ harderr = 0;
+ inet_sock = inet_sk(sk);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ /* This is not a real error but ping wants to see it.
+ * Report it with some fake errno.
+ */
+ err = EREMOTEIO;
+ break;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out; /* PMTU discovery disabled on this socket: nothing is reported */
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ case ICMP_REDIRECT:
+ /* See ICMP_SOURCE_QUENCH */
+ ipv4_sk_redirect(skb, sk);
+ err = EREMOTEIO;
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ harderr = pingv6_ops.icmpv6_err_convert(type, code, &err);
+#endif
+ }
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3.
+ */
+ if ((family == AF_INET && !inet_sock->recverr) ||
+ (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out; /* without recverr, only hard errors on an established socket are reported */
+ } else {
+ if (family == AF_INET) {
+ ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+ info, (u8 *)icmph);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ pingv6_ops.ipv6_icmp_error(sk, skb, err, 0,
+ info, (u8 *)icmph);
+#endif
+ }
+ }
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+out:
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(ping_err);
+
+/*
+ * Copy and checksum an ICMP Echo packet from user space into a buffer
+ * starting from the payload.
+ *
+ * @from is a struct pingfakehdr. For the first fragment (@offset == 0) the
+ * first sizeof(struct icmphdr) bytes of @to are left for the header and only
+ * the payload is copied; for later fragments @offset is rebased to the
+ * payload. The running checksum accumulates in pfh->wcheck.
+ * Returns 0 on success or -EFAULT if the user copy fails.
+ */
+
+int ping_getfrag(void *from, char *to,
+ int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+ struct pingfakehdr *pfh = (struct pingfakehdr *)from;
+
+ if (offset == 0) {
+ if (fraglen < sizeof(struct icmphdr))
+ BUG(); /* the first fragment must have room for the whole ICMP header */
+ if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr),
+ pfh->iov, 0, fraglen - sizeof(struct icmphdr),
+ &pfh->wcheck))
+ return -EFAULT;
+ } else if (offset < sizeof(struct icmphdr)) {
+ BUG(); /* a fragment may not start inside the header */
+ } else {
+ if (csum_partial_copy_fromiovecend
+ (to, pfh->iov, offset - sizeof(struct icmphdr),
+ fraglen, &pfh->wcheck))
+ return -EFAULT;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* For IPv6, checksum each skb as we go along, as expected by
+ * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in
+ * wcheck, it will be finalized in ping_v4_push_pending_frames.
+ */
+ if (pfh->family == AF_INET6) {
+ skb->csum = pfh->wcheck;
+ skb->ip_summed = CHECKSUM_NONE;
+ pfh->wcheck = 0;
+ }
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_getfrag);
+
+/*
+ * Finish the IPv4 echo request queued on @sk and transmit it.
+ *
+ * The payload checksum accumulated in pfh->wcheck by ping_getfrag() is
+ * extended over the ICMP header, folded into the header's checksum field,
+ * and the completed header is copied into the first queued skb.
+ *
+ * Returns 0 without sending if nothing was queued (skb_peek() returns NULL
+ * on an empty write queue), otherwise the result of
+ * ip_push_pending_frames().
+ */
+static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
+				       struct flowi4 *fl4)
+{
+	struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+
+	/* nothing queued: there is no header to fill in and nothing to send */
+	if (!skb)
+		return 0;
+
+	pfh->wcheck = csum_partial((char *)&pfh->icmph,
+				   sizeof(struct icmphdr), pfh->wcheck);
+	pfh->icmph.checksum = csum_fold(pfh->wcheck);
+	memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
+	skb->ip_summed = CHECKSUM_NONE;
+	return ip_push_pending_frames(sk, fl4);
+}
+
+/*
+ * Family-independent checks at the start of sendmsg() on a ping socket.
+ *
+ * Validates the message length and flags, copies the first @icmph_len bytes
+ * of the message into @user_icmph (this consumes them from msg->msg_iov) and
+ * verifies that type/code describe a supported echo request.
+ *
+ * Returns 0 on success, or
+ *   -EMSGSIZE    @len is larger than 0xFFFF
+ *   -EINVAL      @len is shorter than the ICMP header, or the type/code is
+ *                not a supported echo request
+ *   -EOPNOTSUPP  MSG_OOB was requested
+ *   -EFAULT      the header could not be copied from user space
+ */
+int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
+			void *user_icmph, size_t icmph_len)
+{
+	u8 type, code;
+
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+
+	/*
+	 * Must have at least a full ICMP header: the copy below always takes
+	 * icmph_len bytes from the iovec, and ping_getfrag() BUG()s on a
+	 * first fragment shorter than the header.
+	 */
+	if (len < icmph_len)
+		return -EINVAL;
+
+	/*
+	 *	Check the flags.
+	 */
+
+	/* Mirror BSD error message compatibility */
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/*
+	 *	Fetch the ICMP header provided by the userland.
+	 *	iovec is modified! The ICMP header is consumed.
+	 */
+	if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len))
+		return -EFAULT;
+
+	if (family == AF_INET) {
+		type = ((struct icmphdr *) user_icmph)->type;
+		code = ((struct icmphdr *) user_icmph)->code;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (family == AF_INET6) {
+		type = ((struct icmp6hdr *) user_icmph)->icmp6_type;
+		code = ((struct icmp6hdr *) user_icmph)->icmp6_code;
+#endif
+	} else {
+		BUG();
+	}
+
+	if (!ping_supported(family, type, code))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+
+/*
+ * .sendmsg hook of ping_prot: send one ICMP echo request over IPv4.
+ *
+ * The caller supplies the ICMP header followed by the payload; type, code
+ * and sequence number are taken from it, while the echo id is always the
+ * socket's own identifier (inet_sport) and the checksum is recomputed.
+ * The destination comes from msg_name, or from the connected address when
+ * msg_name is absent.
+ *
+ * IP options allocated by ip_cmsg_send() are owned by this function and are
+ * freed on every exit path.
+ *
+ * Returns @len on success or a negative errno.
+ */
+static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+			   size_t len)
+{
+	struct net *net = sock_net(sk);
+	struct flowi4 fl4;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct icmphdr user_icmph;
+	struct pingfakehdr pfh;
+	struct rtable *rt = NULL;
+	struct ip_options_data opt_copy;
+	int free = 0;
+	__be32 saddr, daddr, faddr;
+	u8 tos;
+	int err;
+
+	pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+	err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph,
+				  sizeof(user_icmph));
+	if (err)
+		return err;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (msg->msg_name) {
+		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET)
+			return -EINVAL;
+		daddr = usin->sin_addr.s_addr;
+		/* no remote port */
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = inet->inet_daddr;
+		/* no remote port */
+	}
+
+	ipc.addr = inet->inet_saddr;
+	ipc.opt = NULL;
+	ipc.oif = sk->sk_bound_dev_if;
+	ipc.tx_flags = 0;
+	ipc.ttl = 0;
+	ipc.tos = -1;
+
+	sock_tx_timestamp(sk, &ipc.tx_flags);
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+		if (err) {
+			/*
+			 * Options may already have been attached to ipc.opt
+			 * before a later control message failed; kfree(NULL)
+			 * is a no-op when none were.
+			 */
+			kfree(ipc.opt);
+			return err;
+		}
+		if (ipc.opt)
+			free = 1;
+	}
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		/* no per-message options: use a stack copy of the socket's */
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = faddr = daddr;
+
+	if (ipc.opt && ipc.opt->opt.srr) {
+		if (!daddr) {
+			/* leave through out so cmsg-allocated options are freed */
+			err = -EINVAL;
+			goto out;
+		}
+		faddr = ipc.opt->opt.faddr;
+	}
+	tos = get_rttos(&ipc, inet);
+	if (sock_flag(sk, SOCK_LOCALROUTE) ||
+	    (msg->msg_flags & MSG_DONTROUTE) ||
+	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
+		tos |= RTO_ONLINK;
+	}
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	} else if (!ipc.oif)
+		ipc.oif = inet->uc_index;
+
+	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
+			   inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+
+	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+	rt = ip_route_output_flow(net, &fl4, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;
+		if (err == -ENETUNREACH)
+			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+
+	err = -EACCES;
+	if ((rt->rt_flags & RTCF_BROADCAST) &&
+	    !sock_flag(sk, SOCK_BROADCAST))
+		goto out;
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (!ipc.addr)
+		ipc.addr = fl4.daddr;
+
+	lock_sock(sk);
+
+	pfh.icmph.type = user_icmph.type; /* already checked */
+	pfh.icmph.code = user_icmph.code; /* ditto */
+	pfh.icmph.checksum = 0;
+	pfh.icmph.un.echo.id = inet->inet_sport;
+	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+	pfh.iov = msg->msg_iov;
+	pfh.wcheck = 0;
+	pfh.family = AF_INET;
+
+	err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
+			0, &ipc, &rt, msg->msg_flags);
+	if (err)
+		ip_flush_pending_frames(sk);
+	else
+		err = ping_v4_push_pending_frames(sk, &pfh, &fl4);
+	release_sock(sk);
+
+out:
+	/* rt may still be NULL here; the no-route path above relies on that too */
+	ip_rt_put(rt);
+	if (free)
+		kfree(ipc.opt);
+	if (!err) {
+		icmp_out_count(sock_net(sk), user_icmph.type);
+		return len;
+	}
+	return err;
+
+do_confirm:
+	dst_confirm(&rt->dst);
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+/*
+ * .recvmsg hook of ping_prot: receive one queued echo reply.
+ *
+ * MSG_OOB is rejected with -EOPNOTSUPP; MSG_ERRQUEUE is handed to the
+ * family's error-queue receiver. Otherwise one datagram is dequeued, copied
+ * to the caller (truncated to @len, with MSG_TRUNC set if it did not fit),
+ * and the sender address and control messages are filled in.
+ * Returns the number of bytes copied or a negative errno.
+ */
+int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ int family = sk->sk_family;
+ struct sk_buff *skb;
+ int copied, err;
+
+ pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+ err = -EOPNOTSUPP;
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (flags & MSG_ERRQUEUE) {
+ if (family == AF_INET) {
+ return ip_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ return pingv6_ops.ipv6_recv_error(sk, msg, len,
+ addr_len);
+#endif
+ }
+ }
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out; /* err was set by skb_recv_datagram() */
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ /* Don't bother checking the checksum */
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ if (err)
+ goto done;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address and add cmsg data. */
+ if (family == AF_INET) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+ if (sin) { /* msg_name is NULL when the caller did not ask for the address */
+ sin->sin_family = AF_INET;
+ sin->sin_port = 0 /* skb->h.uh->source */;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+
+ if (isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6hdr *ip6 = ipv6_hdr(skb);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+ if (sin6) {
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ip6->saddr;
+ sin6->sin6_flowinfo = 0;
+ if (np->sndflow)
+ sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ IP6CB(skb)->iif);
+ *addr_len = sizeof(*sin6);
+ }
+
+ if (inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+#endif
+ } else {
+ BUG();
+ }
+
+ err = copied;
+
+done:
+ skb_free_datagram(sk, skb);
+out:
+ pr_debug("ping_recvmsg -> %d\n", err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ping_recvmsg);
+
+/*
+ * Queue @skb on @sk's receive queue.  On failure the skb is freed here.
+ * Returns 0 on success and -1 on failure.
+ */
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int rc;
+
+	pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+		 inet_sk(sk), inet_sk(sk)->inet_num, skb);
+
+	rc = sock_queue_rcv_skb(sk, skb);
+	if (rc >= 0)
+		return 0;
+
+	kfree_skb(skb);
+	pr_debug("ping_queue_rcv_skb -> failed\n");
+	return -1;
+}
+EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+
+
+/*
+ * All we need to do is get the socket.
+ *
+ * Receive path for echo replies: the ICMP header is pushed back so that the
+ * socket sees it, the owning socket is looked up by echo id, and an extra
+ * reference to the skb is queued on it. The caller keeps its own reference.
+ */
+
+void ping_rcv(struct sk_buff *skb)
+{
+ struct sock *sk;
+ struct net *net = dev_net(skb->dev);
+ struct icmphdr *icmph = icmp_hdr(skb);
+
+ /* We assume the packet has already been checked by icmp_rcv */
+
+ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+ skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+ /* Push ICMP header back */
+ skb_push(skb, skb->data - (u8 *)icmph);
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+ if (sk != NULL) {
+ pr_debug("rcv on socket %p\n", sk);
+ ping_queue_rcv_skb(sk, skb_get(skb)); /* skb_get(): the queue gets its own reference */
+ sock_put(sk); /* drop the reference taken by ping_lookup() */
+ return;
+ }
+ pr_debug("no socket, dropping\n");
+
+ /* We're called from icmp_rcv(). kfree_skb() is done there. */
+}
+EXPORT_SYMBOL_GPL(ping_rcv);
+
+struct proto ping_prot = { /* protocol operations for IPv4 ping sockets */
+ .name = "PING",
+ .owner = THIS_MODULE,
+ .init = ping_init_sock, /* group-range permission check */
+ .close = ping_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .sendmsg = ping_v4_sendmsg,
+ .recvmsg = ping_recvmsg,
+ .bind = ping_bind,
+ .backlog_rcv = ping_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = ping_hash, /* BUG()s if called; hashing is done in ping_get_port() */
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
+ .obj_size = sizeof(struct inet_sock),
+};
+EXPORT_SYMBOL(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Starting at bucket @start, return the first socket that belongs to the
+ * seq file's namespace and to the iterator's address family, or NULL.
+ * state->bucket is left at the bucket of the returned socket, or at
+ * PING_HTABLE_SIZE when nothing was found.
+ */
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct hlist_nulls_node *pos;
+	struct sock *sk;
+
+	state->bucket = start;
+	while (state->bucket < PING_HTABLE_SIZE) {
+		struct hlist_nulls_head *head = &ping_table.hash[state->bucket];
+
+		if (!hlist_nulls_empty(head)) {
+			sk_nulls_for_each(sk, pos, head) {
+				if (net_eq(sock_net(sk), net) &&
+				    sk->sk_family == state->family)
+					return sk;
+			}
+		}
+		++state->bucket;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the socket after @sk in the /proc walk, or NULL at the end.
+ *
+ * Sockets are skipped unless they belong to the seq file's namespace and to
+ * the iterator's address family - the same two conditions ping_get_first()
+ * applies, so the other family's sockets sharing a bucket are not listed.
+ * When the current bucket is exhausted the walk continues with the next one.
+ */
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	do {
+		sk = sk_nulls_next(sk);
+	} while (sk && (!net_eq(sock_net(sk), net) ||
+			sk->sk_family != state->family));
+
+	if (!sk)
+		return ping_get_first(seq, state->bucket + 1);
+	return sk;
+}
+
+/* Return the socket at zero-based position @pos of the walk, or NULL if there are fewer. */
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk;
+
+	for (sk = ping_get_first(seq, 0); sk && pos; --pos)
+		sk = ping_get_next(seq, sk);
+
+	return pos ? NULL : sk;
+}
+
+/*
+ * seq_file start for address family @family: reset the iterator, take
+ * ping_table.lock for reading (released in ping_seq_stop()) and return the
+ * header token for position 0 or the socket at position *pos - 1.
+ */
+void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
+{
+	struct ping_iter_state *state = seq->private;
+
+	state->family = family;
+	state->bucket = 0;
+
+	read_lock_bh(&ping_table.lock);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+	return ping_get_idx(seq, *pos - 1);
+}
+EXPORT_SYMBOL_GPL(ping_seq_start);
+/* seq_file .start for the IPv4 listing: ping_seq_start() fixed to AF_INET. */
+static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return ping_seq_start(seq, pos, AF_INET);
+}
+
+/*
+ * seq_file .next: after the header token return the first socket, otherwise
+ * the socket following @v; *pos is advanced in both cases.
+ */
+void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk = (v == SEQ_START_TOKEN) ? ping_get_idx(seq, 0)
+						 : ping_get_next(seq, v);
+
+	++*pos;
+	return sk;
+}
+EXPORT_SYMBOL_GPL(ping_seq_next);
+/* seq_file .stop: release the read lock taken in ping_seq_start(). */
+void ping_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock_bh(&ping_table.lock);
+}
+EXPORT_SYMBOL_GPL(ping_seq_stop);
+/*
+ * Print one socket's row of the IPv4 listing into @f, without a trailing
+ * newline (ping_v4_seq_show() pads and terminates the line). Columns that
+ * have no meaning here are printed as constant zeros.
+ */
+static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
+ int bucket)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport); /* the echo identifier, shown in the port column */
+
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ atomic_read(&sp->sk_drops));
+}
+
+/*
+ * seq_file .show for the IPv4 listing: print the column header for the start
+ * token or one socket row otherwise, padded to 127 columns plus newline.
+ */
+static int ping_v4_seq_show(struct seq_file *seq, void *v)
+{
+	seq_setwidth(seq, 127);
+
+	if (v != SEQ_START_TOKEN) {
+		struct ping_iter_state *state = seq->private;
+
+		ping_v4_format_sock(v, seq, state->bucket);
+	} else {
+		seq_puts(seq, " sl local_address rem_address st tx_queue "
+			 "rx_queue tr tm->when retrnsmt uid timeout "
+			 "inode ref pointer drops");
+	}
+
+	seq_pad(seq, '\n');
+	return 0;
+}
+
+static const struct seq_operations ping_v4_seq_ops = { /* NOTE(review): not referenced in the visible part of this file; ping_v4_seq_afinfo carries its own copy of these ops */
+ .show = ping_v4_seq_show,
+ .start = ping_v4_seq_start,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+};
+/*
+ * open() for the proc file: the afinfo registered with the entry supplies
+ * the seq_operations; a struct ping_iter_state is requested as private data.
+ */
+static int ping_seq_open(struct inode *inode, struct file *file)
+{
+ struct ping_seq_afinfo *afinfo = PDE_DATA(inode); /* the pointer passed to proc_create_data() */
+ return seq_open_net(inode, file, &afinfo->seq_ops,
+ sizeof(struct ping_iter_state));
+}
+
+const struct file_operations ping_seq_fops = { /* file operations for the ping proc entries; exported for use outside this file */
+ .open = ping_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net, /* pairs with seq_open_net() in ping_seq_open() */
+};
+EXPORT_SYMBOL_GPL(ping_seq_fops);
+
+static struct ping_seq_afinfo ping_v4_seq_afinfo = { /* describes the IPv4 proc entry */
+ .name = "icmp", /* entry name under net->proc_net */
+ .family = AF_INET,
+ .seq_fops = &ping_seq_fops,
+ .seq_ops = { /* used by ping_seq_open() */
+ .start = ping_v4_seq_start,
+ .show = ping_v4_seq_show,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+ },
+};
+
+/*
+ * Create the proc entry described by @afinfo in @net's proc directory.
+ * Returns 0 on success or -ENOMEM if the entry could not be created.
+ */
+int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo)
+{
+	if (!proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+			      afinfo->seq_fops, afinfo))
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ping_proc_register);
+/* Remove the proc entry named by @afinfo from @net's proc directory. */
+void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo)
+{
+ remove_proc_entry(afinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL_GPL(ping_proc_unregister);
+
+static int __net_init ping_v4_proc_init_net(struct net *net) /* per-namespace init: create the IPv4 proc entry */
+{
+ return ping_proc_register(net, &ping_v4_seq_afinfo);
+}
+
+static void __net_exit ping_v4_proc_exit_net(struct net *net) /* per-namespace exit: remove it again */
+{
+ ping_proc_unregister(net, &ping_v4_seq_afinfo);
+}
+
+static struct pernet_operations ping_v4_net_ops = { /* registered by ping_proc_init() */
+ .init = ping_v4_proc_init_net,
+ .exit = ping_v4_proc_exit_net,
+};
+
+int __init ping_proc_init(void) /* register the per-namespace proc hooks; returns register_pernet_subsys()'s result */
+{
+ return register_pernet_subsys(&ping_v4_net_ops);
+}
+
+void ping_proc_exit(void) /* counterpart of ping_proc_init() */
+{
+ unregister_pernet_subsys(&ping_v4_net_ops);
+}
+
+#endif
+
+/*
+ * Initialise the ping hash table: every bucket becomes an empty nulls list
+ * whose nulls value is the bucket index, then the table lock is set up.
+ */
+void __init ping_init(void)
+{
+	int bucket = 0;
+
+	while (bucket < PING_HTABLE_SIZE) {
+		INIT_HLIST_NULLS_HEAD(&ping_table.hash[bucket], bucket);
+		bucket++;
+	}
+
+	rwlock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 1b48eb1ed45..ae0af9386f7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -42,6 +42,7 @@
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/export.h>
#include <net/sock.h>
#include <net/raw.h>
@@ -55,17 +56,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
local_bh_disable();
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
- sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
+ sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
local_bh_enable();
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
tcp_death_row.tw_count, sockets,
- atomic_long_read(&tcp_memory_allocated));
+ proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
- atomic_long_read(&udp_memory_allocated));
+ proto_memory_allocated(&udp_prot));
seq_printf(seq, "UDPLITE: inuse %d\n",
sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n",
@@ -110,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_SENTINEL
};
-/* Following RFC4293 items are displayed in /proc/net/netstat */
+/* Following items are displayed in /proc/net/netstat */
static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
@@ -124,6 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ /* Non RFC4293 fields */
+ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+ SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_SENTINEL
};
@@ -161,6 +168,7 @@ static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+ SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
SNMP_MIB_SENTINEL
};
@@ -171,6 +179,7 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_SENTINEL
};
@@ -215,7 +224,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
- SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
@@ -224,6 +232,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+ SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+ SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
@@ -232,7 +242,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
- SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -253,6 +262,30 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+ SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
+ SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
+ SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
+ SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
+ SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+ SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
+ SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
+ SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+ SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
+ SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+ SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+ SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+ SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+ SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+ SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
+ SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+ SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+ SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
SNMP_MIB_SENTINEL
};
@@ -284,7 +317,7 @@ static void icmpmsg_put(struct seq_file *seq)
count = 0;
for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
- val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
+ val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
if (val) {
type[count] = i;
vals[count++] = val;
@@ -303,27 +336,27 @@ static void icmp_put(struct seq_file *seq)
{
int i;
struct net *net = seq->private;
+ atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
- seq_puts(seq, "\nIcmp: InMsgs InErrors");
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " In%s", icmpmibmap[i].name);
seq_printf(seq, " OutMsgs OutErrors");
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
- seq_printf(seq, "\nIcmp: %lu %lu",
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, "\nIcmp: %lu %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
- icmpmibmap[i].index));
+ atomic_long_read(ptr + icmpmibmap[i].index));
seq_printf(seq, " %lu %lu",
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
- snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
- for (i=0; icmpmibmap[i].name != NULL; i++)
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
- icmpmibmap[i].index | 0x100));
+ atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
}
/*
@@ -346,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
seq_printf(seq, " %llu",
- snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipstats_list[i].entry,
offsetof(struct ipstats_mib, syncp)));
@@ -362,11 +395,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld",
- snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+ snmp_fold_field(net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
else
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+ snmp_fold_field(net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
}
@@ -377,7 +410,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdp:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.udp_statistics,
+ snmp_fold_field(net->mib.udp_statistics,
snmp4_udp_list[i].entry));
/* the UDP and UDP-Lite MIBs are the same */
@@ -388,7 +421,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdpLite:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
+ snmp_fold_field(net->mib.udplite_statistics,
snmp4_udp_list[i].entry));
seq_putc(seq, '\n');
@@ -425,7 +458,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nTcpExt:");
for (i = 0; snmp4_net_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.net_statistics,
+ snmp_fold_field(net->mib.net_statistics,
snmp4_net_list[i].entry));
seq_puts(seq, "\nIpExt:");
@@ -435,7 +468,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nIpExt:");
for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
seq_printf(seq, " %llu",
- snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp_fold_field64(net->mib.ip_statistics,
snmp4_ipextstats_list[i].entry,
offsetof(struct ipstats_mib, syncp)));
@@ -458,28 +491,29 @@ static const struct file_operations netstat_seq_fops = {
static __net_init int ip_proc_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops))
+ if (!proc_create("sockstat", S_IRUGO, net->proc_net,
+ &sockstat_seq_fops))
goto out_sockstat;
- if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops))
+ if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
goto out_netstat;
- if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops))
+ if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
goto out_snmp;
return 0;
out_snmp:
- proc_net_remove(net, "netstat");
+ remove_proc_entry("netstat", net->proc_net);
out_netstat:
- proc_net_remove(net, "sockstat");
+ remove_proc_entry("sockstat", net->proc_net);
out_sockstat:
return -ENOMEM;
}
static __net_exit void ip_proc_exit_net(struct net *net)
{
- proc_net_remove(net, "snmp");
- proc_net_remove(net, "netstat");
- proc_net_remove(net, "sockstat");
+ remove_proc_entry("snmp", net->proc_net);
+ remove_proc_entry("netstat", net->proc_net);
+ remove_proc_entry("sockstat", net->proc_net);
}
static __net_initdata struct pernet_operations ip_proc_ops = {
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01cd0b..46d6a1c923a 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,29 +29,33 @@
#include <net/protocol.h>
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
-
-/*
- * Add a protocol handler to the hash tables
- */
+const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int hash = protocol & (MAX_INET_PROTOS - 1);
+ if (!prot->netns_ok) {
+ pr_err("Protocol %u is not namespace aware, cannot register.\n",
+ protocol);
+ return -EINVAL;
+ }
- return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);
-/*
- * Remove a protocol from the hash tables.
- */
+int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_offload);
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+ int ret;
- ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
prot, NULL) == prot) ? 0 : -1;
synchronize_net();
@@ -59,3 +63,16 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
return ret;
}
EXPORT_SYMBOL(inet_del_protocol);
+
+int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_del_offload);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a3d5ab786e8..2c65160565e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -38,7 +38,7 @@
*/
#include <linux/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/byteorder.h>
#include <asm/current.h>
#include <asm/uaccess.h>
@@ -48,6 +48,7 @@
#include <linux/errno.h>
#include <linux/aio.h>
#include <linux/kernel.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/sockios.h>
#include <linux/socket.h>
@@ -76,6 +77,7 @@
#include <linux/seq_file.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
static struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -109,9 +111,7 @@ EXPORT_SYMBOL_GPL(raw_unhash_sk);
static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
unsigned short num, __be32 raddr, __be32 laddr, int dif)
{
- struct hlist_node *node;
-
- sk_for_each_from(sk, node) {
+ sk_for_each_from(sk) {
struct inet_sock *inet = inet_sk(sk);
if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
@@ -129,18 +129,20 @@ found:
* 0 - deliver
* 1 - block
*/
-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
{
- int type;
+ struct icmphdr _hdr;
+ const struct icmphdr *hdr;
- if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+ hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_hdr), &_hdr);
+ if (!hdr)
return 1;
- type = icmp_hdr(skb)->type;
- if (type < 32) {
+ if (hdr->type < 32) {
__u32 data = raw_sk(sk)->filter.data;
- return ((1 << type) & data) != 0;
+ return ((1U << hdr->type) & data) != 0;
}
/* Do not block unknown ICMP types */
@@ -153,7 +155,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
{
struct sock *sk;
struct hlist_head *head;
@@ -214,6 +216,13 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
int err = 0;
int harderr = 0;
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_sk_update_pmtu(skb, sk, info);
+ else if (type == ICMP_REDIRECT) {
+ ipv4_sk_redirect(skb, sk);
+ return;
+ }
+
/* Report error on raw socket, if:
1. User requested ip_recverr.
2. Socket is connected (otherwise the error indication
@@ -246,7 +255,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
}
if (inet->recverr) {
- struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 *payload = skb->data + (iph->ihl << 2);
if (inet->hdrincl)
@@ -264,7 +273,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
{
int hash;
struct sock *raw_sk;
- struct iphdr *iph;
+ const struct iphdr *iph;
struct net *net;
hash = protocol & (RAW_HTABLE_SIZE - 1);
@@ -272,7 +281,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
read_lock(&raw_v4_hashinfo.lock);
raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
if (raw_sk != NULL) {
- iph = (struct iphdr *)skb->data;
+ iph = (const struct iphdr *)skb->data;
net = dev_net(skb->dev);
while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
@@ -280,17 +289,18 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
skb->dev->ifindex)) != NULL) {
raw_err(raw_sk, skb, info);
raw_sk = sk_next(raw_sk);
- iph = (struct iphdr *)skb->data;
+ iph = (const struct iphdr *)skb->data;
}
}
read_unlock(&raw_v4_hashinfo.lock);
}
-static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
/* Charge it to the socket. */
- if (ip_queue_rcv_skb(sk, skb) < 0) {
+ ipv4_pktinfo_prepare(sk, skb);
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -313,9 +323,10 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
- struct rtable **rtp,
- unsigned int flags)
+static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
+ void *from, size_t length,
+ struct rtable **rtp,
+ unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -324,21 +335,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
unsigned int iphlen;
int err;
struct rtable *rt = *rtp;
+ int hlen, tlen;
if (length > rt->dst.dev->mtu) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
rt->dst.dev->mtu);
return -EMSGSIZE;
}
if (flags&MSG_PROBE)
goto out;
+ hlen = LL_RESERVED_SPACE(rt->dst.dev);
+ tlen = rt->dst.dev->needed_tailroom;
skb = sock_alloc_send_skb(sk,
- length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
+ length + hlen + tlen + 15,
flags & MSG_DONTWAIT, &err);
if (skb == NULL)
goto error;
- skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
+ skb_reserve(skb, hlen);
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
@@ -371,11 +385,11 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
if (iphlen >= sizeof(*iph)) {
if (!iph->saddr)
- iph->saddr = rt->rt_src;
+ iph->saddr = fl4->saddr;
iph->check = 0;
iph->tot_len = htons(length);
if (!iph->id)
- ip_select_ident(iph, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
@@ -401,7 +415,7 @@ error:
return err;
}
-static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
+static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
{
struct iovec *iov;
u8 __user *type = NULL;
@@ -417,7 +431,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
if (!iov)
continue;
- switch (fl->proto) {
+ switch (fl4->flowi4_proto) {
case IPPROTO_ICMP:
/* check if one-byte field is readable or not. */
if (iov->iov_base && iov->iov_len < 1)
@@ -432,8 +446,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
code = iov->iov_base;
if (type && code) {
- if (get_user(fl->fl_icmp_type, type) ||
- get_user(fl->fl_icmp_code, code))
+ if (get_user(fl4->fl4_icmp_type, type) ||
+ get_user(fl4->fl4_icmp_code, code))
return -EFAULT;
probed = 1;
}
@@ -454,11 +468,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct inet_sock *inet = inet_sk(sk);
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
+ struct flowi4 fl4;
int free = 0;
__be32 daddr;
__be32 saddr;
u8 tos;
int err;
+ struct ip_options_data opt_copy;
err = -EMSGSIZE;
if (len > 0xFFFF)
@@ -477,16 +493,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
*/
if (msg->msg_namelen) {
- struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
err = -EINVAL;
if (msg->msg_namelen < sizeof(*usin))
goto out;
if (usin->sin_family != AF_INET) {
- static int complained;
- if (!complained++)
- printk(KERN_INFO "%s forgot to set AF_INET in "
- "raw sendmsg. Fix it!\n",
- current->comm);
+ pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n",
+ __func__, current->comm);
err = -EAFNOSUPPORT;
if (usin->sin_family)
goto out;
@@ -506,10 +519,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
if (err)
goto out;
if (ipc.opt)
@@ -519,8 +534,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
saddr = ipc.addr;
ipc.addr = daddr;
- if (!ipc.opt)
- ipc.opt = inet->opt;
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
if (ipc.opt) {
err = -EINVAL;
@@ -529,13 +554,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
*/
if (inet->hdrincl)
goto done;
- if (ipc.opt->srr) {
+ if (ipc.opt->opt.srr) {
if (!daddr)
goto done;
- daddr = ipc.opt->faddr;
+ daddr = ipc.opt->opt.faddr;
}
}
- tos = RT_CONN_FLAGS(sk);
+ tos = get_rtconn_flags(&ipc, sk);
if (msg->msg_flags & MSG_DONTROUTE)
tos |= RTO_ONLINK;
@@ -544,28 +569,29 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
+
+ flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE,
+ inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk) |
+ (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+ daddr, saddr, 0, 0);
+
+ if (!inet->hdrincl) {
+ err = raw_probe_proto_opt(&fl4, msg);
+ if (err)
+ goto done;
}
- {
- struct flowi fl = { .oif = ipc.oif,
- .mark = sk->sk_mark,
- .fl4_dst = daddr,
- .fl4_src = saddr,
- .fl4_tos = tos,
- .proto = inet->hdrincl ? IPPROTO_RAW :
- sk->sk_protocol,
- };
- if (!inet->hdrincl) {
- err = raw_probe_proto_opt(&fl, msg);
- if (err)
- goto done;
- }
-
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1);
- }
- if (err)
+ security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto done;
+ }
err = -EACCES;
if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
@@ -576,19 +602,20 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
back_from_confirm:
if (inet->hdrincl)
- err = raw_send_hdrinc(sk, msg->msg_iov, len,
- &rt, msg->msg_flags);
+ err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
+ &rt, msg->msg_flags);
else {
if (!ipc.addr)
- ipc.addr = rt->rt_dst;
+ ipc.addr = fl4.daddr;
lock_sock(sk);
- err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
- &ipc, &rt, msg->msg_flags);
+ err = ip_append_data(sk, &fl4, ip_generic_getfrag,
+ msg->msg_iov, len, 0,
+ &ipc, &rt, msg->msg_flags);
if (err)
ip_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE)) {
- err = ip_push_pending_frames(sk);
+ err = ip_push_pending_frames(sk, &fl4);
if (err == -ENOBUFS && !inet->recverr)
err = 0;
}
@@ -615,7 +642,7 @@ do_confirm:
static void raw_close(struct sock *sk, long timeout)
{
/*
- * Raw sockets may have direct kernel refereneces. Kill them.
+ * Raw sockets may have direct kernel references. Kill them.
*/
ip_ra_control(sk, 0, NULL);
@@ -663,17 +690,14 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
int err = -EOPNOTSUPP;
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
if (flags & MSG_OOB)
goto out;
- if (addr_len)
- *addr_len = sizeof(*sin);
-
if (flags & MSG_ERRQUEUE) {
- err = ip_recv_error(sk, msg, len);
+ err = ip_recv_error(sk, msg, len, addr_len);
goto out;
}
@@ -699,6 +723,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
sin->sin_port = 0;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
@@ -812,31 +837,48 @@ static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
switch (cmd) {
- case SIOCOUTQ: {
- int amount = sk_wmem_alloc_get(sk);
+ case SIOCOUTQ: {
+ int amount = sk_wmem_alloc_get(sk);
- return put_user(amount, (int __user *)arg);
- }
- case SIOCINQ: {
- struct sk_buff *skb;
- int amount = 0;
-
- spin_lock_bh(&sk->sk_receive_queue.lock);
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL)
- amount = skb->len;
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
- }
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL)
+ amount = skb->len;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
- default:
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_ioctl(sk, cmd, (void __user *)arg);
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
#ifdef CONFIG_IP_MROUTE
- return ipmr_ioctl(sk, cmd, (void __user *)arg);
+ return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
#else
- return -ENOIOCTLCMD;
+ return -ENOIOCTLCMD;
#endif
}
}
+#endif
struct proto raw_prot = {
.name = "RAW",
@@ -853,6 +895,7 @@ struct proto raw_prot = {
.recvmsg = raw_recvmsg,
.bind = raw_bind,
.backlog_rcv = raw_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
.hash = raw_hash_sk,
.unhash = raw_unhash_sk,
.obj_size = sizeof(struct raw_sock),
@@ -860,6 +903,7 @@ struct proto raw_prot = {
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_raw_setsockopt,
.compat_getsockopt = compat_raw_getsockopt,
+ .compat_ioctl = compat_raw_ioctl,
#endif
};
@@ -871,9 +915,7 @@ static struct sock *raw_get_first(struct seq_file *seq)
for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
++state->bucket) {
- struct hlist_node *node;
-
- sk_for_each(sk, node, &state->h->ht[state->bucket])
+ sk_for_each(sk, &state->h->ht[state->bucket])
if (sock_net(sk) == seq_file_net(seq))
goto found;
}
@@ -948,11 +990,13 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
srcp = inet->inet_num;
seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
i, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
- 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
}
@@ -1005,7 +1049,7 @@ static const struct file_operations raw_seq_fops = {
static __net_init int raw_init_net(struct net *net)
{
- if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops))
+ if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
return -ENOMEM;
return 0;
@@ -1013,7 +1057,7 @@ static __net_init int raw_init_net(struct net *net)
static __net_exit void raw_exit_net(struct net *net)
{
- proc_net_remove(net, "raw");
+ remove_proc_entry("raw", net->proc_net);
}
static __net_initdata struct pernet_operations raw_net_ops = {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3843c2dfde8..190199851c9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -62,14 +62,14 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/module.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -79,7 +79,6 @@
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
-#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
@@ -87,10 +86,10 @@
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
-#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
+#include <linux/jhash.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
@@ -107,67 +106,71 @@
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
+#include <linux/kmemleak.h>
#endif
+#include <net/secure_seq.h>
-#define RT_FL_TOS(oldflp) \
- ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
-
-#define IP_MAX_MTU 0xFFF0
+#define RT_FL_TOS(oldflp4) \
+ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
-static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
-static int ip_rt_gc_interval __read_mostly = 60 * HZ;
-static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
-static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
-static int rt_chain_length_max __read_mostly = 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
/*
* Interface to generic destination cache.
*/
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
-static void ipv4_dst_destroy(struct dst_entry *dst);
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
+static unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static int rt_garbage_collect(struct dst_ops *ops);
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu);
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
+static void ipv4_dst_destroy(struct dst_entry *dst);
-static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int how)
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
+ WARN_ON(1);
+ return NULL;
}
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr);
+
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
- .gc = rt_garbage_collect,
.check = ipv4_dst_check,
+ .default_advmss = ipv4_default_advmss,
+ .mtu = ipv4_mtu,
+ .cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
- .ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
+ .redirect = ip_do_redirect,
.local_out = __ip_local_out,
+ .neigh_lookup = ipv4_neigh_lookup,
};
#define ECN_OR_COST(class) TC_PRIO_##class
const __u8 ip_tos2prio[16] = {
TC_PRIO_BESTEFFORT,
- ECN_OR_COST(FILLER),
+ ECN_OR_COST(BESTEFFORT),
TC_PRIO_BESTEFFORT,
ECN_OR_COST(BESTEFFORT),
TC_PRIO_BULK,
@@ -183,186 +186,27 @@ const __u8 ip_tos2prio[16] = {
TC_PRIO_INTERACTIVE_BULK,
ECN_OR_COST(INTERACTIVE_BULK)
};
-
-
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- * as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- * they do so with atomic increments and with the
- * lock held.
- */
-
-struct rt_hash_bucket {
- struct rtable __rcu *chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
- defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ 256
-#else
-# if NR_CPUS >= 32
-# define RT_HASH_LOCK_SZ 4096
-# elif NR_CPUS >= 16
-# define RT_HASH_LOCK_SZ 2048
-# elif NR_CPUS >= 8
-# define RT_HASH_LOCK_SZ 1024
-# elif NR_CPUS >= 4
-# define RT_HASH_LOCK_SZ 512
-# else
-# define RT_HASH_LOCK_SZ 256
-# endif
-#endif
-
-static spinlock_t *rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
- int i;
-
- rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
- GFP_KERNEL);
- if (!rt_hash_locks)
- panic("IP: failed to allocate rt_hash_locks\n");
-
- for (i = 0; i < RT_HASH_LOCK_SZ; i++)
- spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket *rt_hash_table __read_mostly;
-static unsigned rt_hash_mask __read_mostly;
-static unsigned int rt_hash_log __read_mostly;
+EXPORT_SYMBOL(ip_tos2prio);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
-
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
- int genid)
-{
- return jhash_3words((__force u32)daddr, (__force u32)saddr,
- idx, genid)
- & rt_hash_mask;
-}
-
-static inline int rt_genid(struct net *net)
-{
- return atomic_read(&net->ipv4.rt_genid);
-}
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
#ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
- struct seq_net_private p;
- int bucket;
- int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
- struct rt_cache_iter_state *st = seq->private;
- struct rtable *r = NULL;
-
- for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
- if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
- continue;
- rcu_read_lock_bh();
- r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
- while (r) {
- if (dev_net(r->dst.dev) == seq_file_net(seq) &&
- r->rt_genid == st->genid)
- return r;
- r = rcu_dereference_bh(r->dst.rt_next);
- }
- rcu_read_unlock_bh();
- }
- return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
- struct rtable *r)
-{
- struct rt_cache_iter_state *st = seq->private;
-
- r = rcu_dereference_bh(r->dst.rt_next);
- while (!r) {
- rcu_read_unlock_bh();
- do {
- if (--st->bucket < 0)
- return NULL;
- } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
- rcu_read_lock_bh();
- r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
- struct rtable *r)
-{
- struct rt_cache_iter_state *st = seq->private;
- while ((r = __rt_cache_get_next(seq, r)) != NULL) {
- if (dev_net(r->dst.dev) != seq_file_net(seq))
- continue;
- if (r->rt_genid == st->genid)
- break;
- }
- return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct rtable *r = rt_cache_get_first(seq);
-
- if (r)
- while (pos && (r = rt_cache_get_next(seq, r)))
- --pos;
- return pos ? NULL : r;
-}
-
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct rt_cache_iter_state *st = seq->private;
if (*pos)
- return rt_cache_get_idx(seq, *pos - 1);
- st->genid = rt_genid(seq_file_net(seq));
+ return NULL;
return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct rtable *r;
-
- if (v == SEQ_START_TOKEN)
- r = rt_cache_get_first(seq);
- else
- r = rt_cache_get_next(seq, v);
++*pos;
- return r;
+ return NULL;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
- if (v && v != SEQ_START_TOKEN)
- rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -372,30 +216,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
"HHUptod\tSpecDst");
- else {
- struct rtable *r = v;
- int len;
-
- seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
- "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->dst.dev ? r->dst.dev->name : "*",
- (__force u32)r->rt_dst,
- (__force u32)r->rt_gateway,
- r->rt_flags, atomic_read(&r->dst.__refcnt),
- r->dst.__use, 0, (__force u32)r->rt_src,
- (dst_metric(&r->dst, RTAX_ADVMSS) ?
- (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
- dst_metric(&r->dst, RTAX_WINDOW),
- (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
- dst_metric(&r->dst, RTAX_RTTVAR)),
- r->fl.fl4_tos,
- r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
- r->dst.hh ? (r->dst.hh->hh_output ==
- dev_queue_xmit) : 0,
- r->rt_spec_dst, &len);
-
- seq_printf(seq, "%*s\n", 127 - len, "");
- }
return 0;
}
@@ -408,8 +228,7 @@ static const struct seq_operations rt_cache_seq_ops = {
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_net(inode, file, &rt_cache_seq_ops,
- sizeof(struct rt_cache_iter_state));
+ return seq_open(file, &rt_cache_seq_ops);
}
static const struct file_operations rt_cache_seq_fops = {
@@ -417,7 +236,7 @@ static const struct file_operations rt_cache_seq_fops = {
.open = rt_cache_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_net,
+ .release = seq_release,
};
@@ -468,7 +287,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
dst_entries_get_slow(&ipv4_dst_ops),
- st->in_hit,
+ 0, /* st->in_hit */
st->in_slow_tot,
st->in_slow_mc,
st->in_no_route,
@@ -476,16 +295,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
st->in_martian_dst,
st->in_martian_src,
- st->out_hit,
+ 0, /* st->out_hit */
st->out_slow_tot,
st->out_slow_mc,
- st->gc_total,
- st->gc_ignored,
- st->gc_goal_miss,
- st->gc_dst_overflow,
- st->in_hlist_search,
- st->out_hlist_search
+ 0, /* st->gc_total */
+ 0, /* st->gc_ignored */
+ 0, /* st->gc_goal_miss */
+ 0, /* st->gc_dst_overflow */
+ 0, /* st->in_hlist_search */
+ 0 /* st->out_hlist_search */
);
return 0;
}
@@ -511,7 +330,7 @@ static const struct file_operations rt_cpu_seq_fops = {
.release = seq_release,
};
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
struct ip_rt_acct *dst, *src;
@@ -554,8 +373,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
- &rt_cache_seq_fops);
+ pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
+ &rt_cache_seq_fops);
if (!pde)
goto err1;
@@ -564,14 +383,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
if (!pde)
goto err2;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
if (!pde)
goto err3;
#endif
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
@@ -585,7 +404,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
remove_proc_entry("rt_cache", net->proc_net_stat);
remove_proc_entry("rt_cache", net->proc_net);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
remove_proc_entry("rt_acct", net->proc_net);
#endif
}
@@ -607,770 +426,306 @@ static inline int ip_rt_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-static inline void rt_free(struct rtable *rt)
+static inline bool rt_is_expired(const struct rtable *rth)
{
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+ return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}
-static inline void rt_drop(struct rtable *rt)
+void rt_cache_flush(struct net *net)
{
- ip_rt_put(rt);
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+ rt_genid_bump_ipv4(net);
}
-static inline int rt_fast_clean(struct rtable *rth)
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
{
- /* Kill broadcast/multicast entries very aggresively, if they
- collide in hash table with more useful entries */
- return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
- rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
- return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
- unsigned long age;
- int ret = 0;
+ struct net_device *dev = dst->dev;
+ const __be32 *pkey = daddr;
+ const struct rtable *rt;
+ struct neighbour *n;
- if (atomic_read(&rth->dst.__refcnt))
- goto out;
+ rt = (const struct rtable *) dst;
+ if (rt->rt_gateway)
+ pkey = (const __be32 *) &rt->rt_gateway;
+ else if (skb)
+ pkey = &ip_hdr(skb)->daddr;
- ret = 1;
- if (rth->dst.expires &&
- time_after_eq(jiffies, rth->dst.expires))
- goto out;
-
- age = jiffies - rth->dst.lastuse;
- ret = 0;
- if ((age <= tmo1 && !rt_fast_clean(rth)) ||
- (age <= tmo2 && rt_valuable(rth)))
- goto out;
- ret = 1;
-out: return ret;
+ n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
+ if (n)
+ return n;
+ return neigh_create(&arp_tbl, pkey, dev);
}
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
- u32 score = jiffies - rt->dst.lastuse;
-
- score = ~score & ~(3<<30);
-
- if (rt_valuable(rt))
- score |= (1<<31);
-
- if (rt_is_output_route(rt) ||
- !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
- score |= (1<<30);
-
- return score;
-}
+#define IP_IDENTS_SZ 2048u
+struct ip_ident_bucket {
+ atomic_t id;
+ u32 stamp32;
+};
-static inline bool rt_caching(const struct net *net)
-{
- return net->ipv4.current_rt_cache_rebuild_count <=
- net->ipv4.sysctl_rt_cache_rebuild_count;
-}
+static struct ip_ident_bucket *ip_idents __read_mostly;
-static inline bool compare_hash_inputs(const struct flowi *fl1,
- const struct flowi *fl2)
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes hard for an attacker
+ * to infer how many packets were sent between two points in time.
+ */
+u32 ip_idents_reserve(u32 hash, int segs)
{
- return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
- ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
- (fl1->iif ^ fl2->iif)) == 0);
-}
+ struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 now = (u32)jiffies;
+ u32 delta = 0;
-static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
-{
- return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
- ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
- (fl1->mark ^ fl2->mark) |
- (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
- (fl1->oif ^ fl2->oif) |
- (fl1->iif ^ fl2->iif)) == 0;
-}
+ if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+ delta = prandom_u32_max(now - old);
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
- return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
+ return atomic_add_return(segs + delta, &bucket->id) - segs;
}
+EXPORT_SYMBOL(ip_idents_reserve);
-static inline int rt_is_expired(struct rtable *rth)
+void __ip_select_ident(struct iphdr *iph, int segs)
{
- return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
-}
+ static u32 ip_idents_hashrnd __read_mostly;
+ u32 hash, id;
-/*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(int process_context)
-{
- unsigned int i;
- struct rtable *rth, *next;
- struct rtable * tail;
-
- for (i = 0; i <= rt_hash_mask; i++) {
- if (process_context && need_resched())
- cond_resched();
- rth = rcu_dereference_raw(rt_hash_table[i].chain);
- if (!rth)
- continue;
+ net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
- spin_lock_bh(rt_hash_lock_addr(i));
-#ifdef CONFIG_NET_NS
- {
- struct rtable __rcu **prev;
- struct rtable *p;
-
- rth = rcu_dereference_protected(rt_hash_table[i].chain,
- lockdep_is_held(rt_hash_lock_addr(i)));
-
- /* defer releasing the head of the list after spin_unlock */
- for (tail = rth; tail;
- tail = rcu_dereference_protected(tail->dst.rt_next,
- lockdep_is_held(rt_hash_lock_addr(i))))
- if (!rt_is_expired(tail))
- break;
- if (rth != tail)
- rt_hash_table[i].chain = tail;
-
- /* call rt_free on entries after the tail requiring flush */
- prev = &rt_hash_table[i].chain;
- for (p = rcu_dereference_protected(*prev,
- lockdep_is_held(rt_hash_lock_addr(i)));
- p != NULL;
- p = next) {
- next = rcu_dereference_protected(p->dst.rt_next,
- lockdep_is_held(rt_hash_lock_addr(i)));
- if (!rt_is_expired(p)) {
- prev = &p->dst.rt_next;
- } else {
- *prev = next;
- rt_free(p);
- }
- }
- }
-#else
- rth = rcu_dereference_protected(rt_hash_table[i].chain,
- lockdep_is_held(rt_hash_lock_addr(i)));
- rcu_assign_pointer(rt_hash_table[i].chain, NULL);
- tail = NULL;
-#endif
- spin_unlock_bh(rt_hash_lock_addr(i));
-
- for (; rth != tail; rth = next) {
- next = rcu_dereference_protected(rth->dst.rt_next, 1);
- rt_free(rth);
- }
- }
+ hash = jhash_3words((__force u32)iph->daddr,
+ (__force u32)iph->saddr,
+ iph->protocol,
+ ip_idents_hashrnd);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
}
+EXPORT_SYMBOL(__ip_select_ident);
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- * rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
+static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct iphdr *iph,
+ int oif, u8 tos,
+ u8 prot, u32 mark, int flow_flags)
{
- const struct rtable *aux = head;
+ if (sk) {
+ const struct inet_sock *inet = inet_sk(sk);
- while (aux != rth) {
- if (compare_hash_inputs(&aux->fl, &rth->fl))
- return 0;
- aux = rcu_dereference_protected(aux->dst.rt_next, 1);
+ oif = sk->sk_bound_dev_if;
+ mark = sk->sk_mark;
+ tos = RT_CONN_FLAGS(sk);
+ prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
}
- return ONE;
+ flowi4_init_output(fl4, oif, mark, tos,
+ RT_SCOPE_UNIVERSE, prot,
+ flow_flags,
+ iph->daddr, iph->saddr, 0, 0);
}
-static void rt_check_expire(void)
+static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
+ const struct sock *sk)
{
- static unsigned int rover;
- unsigned int i = rover, goal;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long samples = 0;
- unsigned long sum = 0, sum2 = 0;
- unsigned long delta;
- u64 mult;
-
- delta = jiffies - expires_ljiffies;
- expires_ljiffies = jiffies;
- mult = ((u64)delta) << rt_hash_log;
- if (ip_rt_gc_timeout > 1)
- do_div(mult, ip_rt_gc_timeout);
- goal = (unsigned int)mult;
- if (goal > rt_hash_mask)
- goal = rt_hash_mask + 1;
- for (; goal > 0; goal--) {
- unsigned long tmo = ip_rt_gc_timeout;
- unsigned long length;
-
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
-
- if (need_resched())
- cond_resched();
-
- samples++;
-
- if (rcu_dereference_raw(*rthp) == NULL)
- continue;
- length = 0;
- spin_lock_bh(rt_hash_lock_addr(i));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
- prefetch(rth->dst.rt_next);
- if (rt_is_expired(rth)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
- if (rth->dst.expires) {
- /* Entry is expired even if it is in use */
- if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- /*
- * We only count entries on
- * a chain with equal hash inputs once
- * so that entries for different QOS
- * levels, and other non-hash input
- * attributes don't unfairly skew
- * the length computation
- */
- length += has_noalias(rt_hash_table[i].chain, rth);
- continue;
- }
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
- goto nofree;
+ const struct iphdr *iph = ip_hdr(skb);
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
- /* Cleanup aged off entries. */
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- }
- spin_unlock_bh(rt_hash_lock_addr(i));
- sum += length;
- sum2 += length*length;
- }
- if (samples) {
- unsigned long avg = sum / samples;
- unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
- rt_chain_length_max = max_t(unsigned long,
- ip_rt_gc_elasticity,
- (avg + 4*sd) >> FRACT_BITS);
- }
- rover = i;
+ __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
+static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
- rt_check_expire();
- schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
-/*
- * Pertubation of rt_genid by a small quantity [1..256]
- * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
- * many times (2^24) without giving recent rt_genid.
- * Jenkins hash is strong enough that litle changes of rt_genid are OK.
- */
-static void rt_cache_invalidate(struct net *net)
-{
- unsigned char shuffle;
-
- get_random_bytes(&shuffle, sizeof(shuffle));
- atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-}
-
-/*
- * delay < 0 : invalidate cache (fast : entries will be deleted later)
- * delay >= 0 : invalidate & flush cache (can be long)
- */
-void rt_cache_flush(struct net *net, int delay)
-{
- rt_cache_invalidate(net);
- if (delay >= 0)
- rt_do_flush(!in_softirq());
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ daddr, inet->inet_saddr, 0, 0);
+ rcu_read_unlock();
}
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(void)
+static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct sk_buff *skb)
{
- rt_do_flush(!in_softirq());
+ if (skb)
+ build_skb_flow_key(fl4, skb, sk);
+ else
+ build_sk_flow_key(fl4, sk);
}
-static void rt_emergency_hash_rebuild(struct net *net)
+static inline void rt_free(struct rtable *rt)
{
- if (net_ratelimit())
- printk(KERN_WARNING "Route hash chain too long!\n");
- rt_cache_invalidate(net);
+ call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
-/*
- Short description of GC goals.
-
- We want to build algorithm, which will keep routing cache
- at some equilibrium point, when number of aged off entries
- is kept approximately equal to newly generated ones.
-
- Current expiration strength is variable "expire".
- We try to adjust it dynamically, so that if networking
- is idle expires is large enough to keep enough of warm entries,
- and when load increases it reduces to limit cache size.
- */
+static DEFINE_SPINLOCK(fnhe_lock);
-static int rt_garbage_collect(struct dst_ops *ops)
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
- static unsigned long expire = RT_GC_TIMEOUT;
- static unsigned long last_gc;
- static int rover;
- static int equilibrium;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long now = jiffies;
- int goal;
- int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
- /*
- * Garbage collection is pretty expensive,
- * do not make it too frequently.
- */
-
- RT_CACHE_STAT_INC(gc_total);
+ struct rtable *rt;
- if (now - last_gc < ip_rt_gc_min_interval &&
- entries < ip_rt_max_size) {
- RT_CACHE_STAT_INC(gc_ignored);
- goto out;
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
+ rt_free(rt);
}
-
- entries = dst_entries_get_slow(&ipv4_dst_ops);
- /* Calculate number of entries, which we want to expire now. */
- goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
- if (goal <= 0) {
- if (equilibrium < ipv4_dst_ops.gc_thresh)
- equilibrium = ipv4_dst_ops.gc_thresh;
- goal = entries - equilibrium;
- if (goal > 0) {
- equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- goal = entries - equilibrium;
- }
- } else {
- /* We are in dangerous area. Try to reduce cache really
- * aggressively.
- */
- goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- equilibrium = entries - goal;
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
+ rt_free(rt);
}
+}
- if (now - last_gc >= ip_rt_gc_min_interval)
- last_gc = now;
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+{
+ struct fib_nh_exception *fnhe, *oldest;
- if (goal <= 0) {
- equilibrium += goal;
- goto work_done;
+ oldest = rcu_dereference(hash->chain);
+ for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ oldest = fnhe;
}
+ fnhe_flush_routes(oldest);
+ return oldest;
+}
- do {
- int i, k;
-
- for (i = rt_hash_mask, k = rover; i >= 0; i--) {
- unsigned long tmo = expire;
-
- k = (k + 1) & rt_hash_mask;
- rthp = &rt_hash_table[k].chain;
- spin_lock_bh(rt_hash_lock_addr(k));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
- if (!rt_is_expired(rth) &&
- !rt_may_expire(rth, tmo, expire)) {
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- continue;
- }
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- goal--;
- }
- spin_unlock_bh(rt_hash_lock_addr(k));
- if (goal <= 0)
- break;
- }
- rover = k;
-
- if (goal <= 0)
- goto work_done;
-
- /* Goal is not achieved. We stop process if:
-
- - if expire reduced to zero. Otherwise, expire is halfed.
- - if table is not full.
- - if we are called from interrupt.
- - jiffies check is just fallback/debug loop breaker.
- We will not spin here for long time in any case.
- */
-
- RT_CACHE_STAT_INC(gc_goal_miss);
-
- if (expire == 0)
- break;
-
- expire >>= 1;
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
- dst_entries_get_fast(&ipv4_dst_ops), goal, i);
-#endif
+static inline u32 fnhe_hashfun(__be32 daddr)
+{
+ u32 hval;
- if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
- goto out;
- } while (!in_softirq() && time_before_eq(jiffies, now));
+ hval = (__force u32) daddr;
+ hval ^= (hval >> 11) ^ (hval >> 22);
- if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
- goto out;
- if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
- goto out;
- if (net_ratelimit())
- printk(KERN_WARNING "dst cache overflow\n");
- RT_CACHE_STAT_INC(gc_dst_overflow);
- return 1;
-
-work_done:
- expire += ip_rt_gc_min_interval;
- if (expire > ip_rt_gc_timeout ||
- dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
- dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
- expire = ip_rt_gc_timeout;
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
- dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
-#endif
-out: return 0;
+ return hval & (FNHE_HASH_SIZE - 1);
}
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
+static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
- int length = 0;
- const struct rtable *rth = head;
+ rt->rt_pmtu = fnhe->fnhe_pmtu;
+ rt->dst.expires = fnhe->fnhe_expires;
- while (rth) {
- length += has_noalias(head, rth);
- rth = rcu_dereference_protected(rth->dst.rt_next, 1);
+ if (fnhe->fnhe_gw) {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_gateway = fnhe->fnhe_gw;
+ rt->rt_uses_gateway = 1;
}
- return length >> FRACT_BITS;
}
-static int rt_intern_hash(unsigned hash, struct rtable *rt,
- struct rtable **rp, struct sk_buff *skb, int ifindex)
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+ u32 pmtu, unsigned long expires)
{
- struct rtable *rth, *cand;
- struct rtable __rcu **rthp, **candp;
- unsigned long now;
- u32 min_score;
- int chain_length;
- int attempts = !in_softirq();
-
-restart:
- chain_length = 0;
- min_score = ~(u32)0;
- cand = NULL;
- candp = NULL;
- now = jiffies;
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe;
+ struct rtable *rt;
+ unsigned int i;
+ int depth;
+ u32 hval = fnhe_hashfun(daddr);
- if (!rt_caching(dev_net(rt->dst.dev))) {
- /*
- * If we're not caching, just tell the caller we
- * were successful and don't touch the route. The
- * caller hold the sole reference to the cache entry, and
- * it will be released when the caller is done with it.
- * If we drop it here, the callers have no way to resolve routes
- * when we're not caching. Instead, just point *rp at rt, so
- * the caller gets a single use out of the route
- * Note that we do rt_free on this new route entry, so that
- * once its refcount hits zero, we are still able to reap it
- * (Thanks Alexey)
- * Note: To avoid expensive rcu stuff for this uncached dst,
- * we set DST_NOCACHE so that dst_release() can free dst without
- * waiting a grace period.
- */
+ spin_lock_bh(&fnhe_lock);
- rt->dst.flags |= DST_NOCACHE;
- if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
- int err = arp_bind_neighbour(&rt->dst);
- if (err) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "Neighbour table failure & not caching routes.\n");
- ip_rt_put(rt);
- return err;
- }
- }
-
- goto skip_hashing;
+ hash = nh->nh_exceptions;
+ if (!hash) {
+ hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
+ if (!hash)
+ goto out_unlock;
+ nh->nh_exceptions = hash;
}
- rthp = &rt_hash_table[hash].chain;
-
- spin_lock_bh(rt_hash_lock_addr(hash));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
- if (rt_is_expired(rth)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
- if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
- /* Put it first */
- *rthp = rth->dst.rt_next;
- /*
- * Since lookup is lockfree, the deletion
- * must be visible to another weakly ordered CPU before
- * the insertion at the start of the hash chain.
- */
- rcu_assign_pointer(rth->dst.rt_next,
- rt_hash_table[hash].chain);
- /*
- * Since lookup is lockfree, the update writes
- * must be ordered for consistency on SMP.
- */
- rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
- dst_use(&rth->dst, now);
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
- rt_drop(rt);
- if (rp)
- *rp = rth;
- else
- skb_dst_set(skb, &rth->dst);
- return 0;
- }
+ hash += hval;
- if (!atomic_read(&rth->dst.__refcnt)) {
- u32 score = rt_score(rth);
-
- if (score <= min_score) {
- cand = rth;
- candp = rthp;
- min_score = score;
- }
- }
-
- chain_length++;
-
- rthp = &rth->dst.rt_next;
+ depth = 0;
+ for (fnhe = rcu_dereference(hash->chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ break;
+ depth++;
}
- if (cand) {
- /* ip_rt_gc_elasticity used to be average length of chain
- * length, when exceeded gc becomes really aggressive.
- *
- * The second limit is less certain. At the moment it allows
- * only 2 entries per bucket. We will see.
- */
- if (chain_length > ip_rt_gc_elasticity) {
- *candp = cand->dst.rt_next;
- rt_free(cand);
+ if (fnhe) {
+ if (gw)
+ fnhe->fnhe_gw = gw;
+ if (pmtu) {
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = max(1UL, expires);
}
+ /* Update all cached dsts too */
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
} else {
- if (chain_length > rt_chain_length_max &&
- slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
- struct net *net = dev_net(rt->dst.dev);
- int num = ++net->ipv4.current_rt_cache_rebuild_count;
- if (!rt_caching(net)) {
- printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
- rt->dst.dev->name, num);
- }
- rt_emergency_hash_rebuild(net);
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
- hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
- ifindex, rt_genid(net));
- goto restart;
+ if (depth > FNHE_RECLAIM_DEPTH)
+ fnhe = fnhe_oldest(hash);
+ else {
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
+
+ fnhe->fnhe_next = hash->chain;
+ rcu_assign_pointer(hash->chain, fnhe);
}
- }
-
- /* Try to bind route to arp only if it is output
- route or unicast forwarding path.
- */
- if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
- int err = arp_bind_neighbour(&rt->dst);
- if (err) {
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
- if (err != -ENOBUFS) {
- rt_drop(rt);
- return err;
- }
-
- /* Neighbour tables are full and nothing
- can be released. Try to shrink route cache,
- it is most likely it holds some neighbour records.
- */
- if (attempts-- > 0) {
- int saved_elasticity = ip_rt_gc_elasticity;
- int saved_int = ip_rt_gc_min_interval;
- ip_rt_gc_elasticity = 1;
- ip_rt_gc_min_interval = 0;
- rt_garbage_collect(&ipv4_dst_ops);
- ip_rt_gc_min_interval = saved_int;
- ip_rt_gc_elasticity = saved_elasticity;
- goto restart;
- }
-
- if (net_ratelimit())
- printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
- rt_drop(rt);
- return -ENOBUFS;
+ fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
+ fnhe->fnhe_daddr = daddr;
+ fnhe->fnhe_gw = gw;
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = expires;
+
+ /* Exception created; mark the cached routes for the nexthop
+ * stale, so anyone caching it rechecks if this exception
+ * applies to them.
+ */
+ rt = rcu_dereference(nh->nh_rth_input);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+
+ for_each_possible_cpu(i) {
+ struct rtable __rcu **prt;
+ prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+ rt = rcu_dereference(*prt);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
}
}
- rt->dst.rt_next = rt_hash_table[hash].chain;
-
-#if RT_CACHE_DEBUG >= 2
- if (rt->dst.rt_next) {
- struct rtable *trt;
- printk(KERN_DEBUG "rt_cache @%02x: %pI4",
- hash, &rt->rt_dst);
- for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
- printk(" . %pI4", &trt->rt_dst);
- printk("\n");
- }
-#endif
- /*
- * Since lookup is lockfree, we must make sure
- * previous writes to rt are comitted to memory
- * before making rt visible to other CPUS.
- */
- rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
- spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
- if (rp)
- *rp = rt;
- else
- skb_dst_set(skb, &rt->dst);
- return 0;
-}
-
-void rt_bind_peer(struct rtable *rt, int create)
-{
- struct inet_peer *peer;
-
- peer = inet_getpeer_v4(rt->rt_dst, create);
+ fnhe->fnhe_stamp = jiffies;
- if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
- inet_putpeer(peer);
+out_unlock:
+ spin_unlock_bh(&fnhe_lock);
}
-/*
- * Peer allocation may fail only in serious out-of-memory conditions. However
- * we still can generate some output.
- * Random ID selection looks a bit dangerous because we have no chances to
- * select ID being unique in a reasonable period of time.
- * But broken packet identifier may be better than no packet at all.
- */
-static void ip_select_fb_ident(struct iphdr *iph)
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+ bool kill_route)
{
- static DEFINE_SPINLOCK(ip_fb_id_lock);
- static u32 ip_fallback_id;
- u32 salt;
-
- spin_lock_bh(&ip_fb_id_lock);
- salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
- iph->id = htons(salt & 0xFFFF);
- ip_fallback_id = salt;
- spin_unlock_bh(&ip_fb_id_lock);
-}
-
-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
-{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt) {
- if (rt->peer == NULL)
- rt_bind_peer(rt, 1);
-
- /* If peer is attached to destination, it is never detached,
- so that we need not to grab a lock to dereference it.
- */
- if (rt->peer) {
- iph->id = htons(inet_getid(rt->peer, more));
- return;
- }
- } else
- printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
- __builtin_return_address(0));
-
- ip_select_fb_ident(iph);
-}
-EXPORT_SYMBOL(__ip_select_ident);
+ __be32 new_gw = icmp_hdr(skb)->un.gateway;
+ __be32 old_gw = ip_hdr(skb)->saddr;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct neighbour *n;
+ struct net *net;
-static void rt_del(unsigned hash, struct rtable *rt)
-{
- struct rtable __rcu **rthp;
- struct rtable *aux;
+ switch (icmp_hdr(skb)->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ break;
- rthp = &rt_hash_table[hash].chain;
- spin_lock_bh(rt_hash_lock_addr(hash));
- ip_rt_put(rt);
- while ((aux = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
- if (aux == rt || rt_is_expired(aux)) {
- *rthp = aux->dst.rt_next;
- rt_free(aux);
- continue;
- }
- rthp = &aux->dst.rt_next;
+ default:
+ return;
}
- spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-/* called in rcu_read_lock() section */
-void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
- __be32 saddr, struct net_device *dev)
-{
- int i, k;
- struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct rtable *rth;
- struct rtable __rcu **rthp;
- __be32 skeys[2] = { saddr, 0 };
- int ikeys[2] = { dev->ifindex, 0 };
- struct netevent_redirect netevent;
- struct net *net;
+ if (rt->rt_gateway != old_gw)
+ return;
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
return;
@@ -1380,9 +735,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
ipv4_is_zeronet(new_gw))
goto reject_redirect;
- if (!rt_caching(net))
- goto reject_redirect;
-
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
if (!inet_addr_onlink(in_dev, new_gw, old_gw))
goto reject_redirect;
@@ -1393,105 +745,57 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
goto reject_redirect;
}
- for (i = 0; i < 2; i++) {
- for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rthp = &rt_hash_table[hash].chain;
-
- while ((rth = rcu_dereference(*rthp)) != NULL) {
- struct rtable *rt;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- rt_is_expired(rth) ||
- !net_eq(dev_net(rth->dst.dev), net)) {
- rthp = &rth->dst.rt_next;
- continue;
- }
-
- if (rth->rt_dst != daddr ||
- rth->rt_src != saddr ||
- rth->dst.error ||
- rth->rt_gateway != old_gw ||
- rth->dst.dev != dev)
- break;
-
- dst_hold(&rth->dst);
-
- rt = dst_alloc(&ipv4_dst_ops);
- if (rt == NULL) {
- ip_rt_put(rth);
- return;
- }
+ n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+ if (n) {
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ } else {
+ if (fib_lookup(net, fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
- /* Copy all the information. */
- *rt = *rth;
- rt->dst.__use = 1;
- atomic_set(&rt->dst.__refcnt, 1);
- rt->dst.child = NULL;
- if (rt->dst.dev)
- dev_hold(rt->dst.dev);
- rt->dst.obsolete = -1;
- rt->dst.lastuse = jiffies;
- rt->dst.path = &rt->dst;
- rt->dst.neighbour = NULL;
- rt->dst.hh = NULL;
-#ifdef CONFIG_XFRM
- rt->dst.xfrm = NULL;
-#endif
- rt->rt_genid = rt_genid(net);
- rt->rt_flags |= RTCF_REDIRECTED;
-
- /* Gateway is different ... */
- rt->rt_gateway = new_gw;
-
- /* Redirect received -> path was valid */
- dst_confirm(&rth->dst);
-
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
-
- if (arp_bind_neighbour(&rt->dst) ||
- !(rt->dst.neighbour->nud_state &
- NUD_VALID)) {
- if (rt->dst.neighbour)
- neigh_event_send(rt->dst.neighbour, NULL);
- ip_rt_put(rth);
- rt_drop(rt);
- goto do_next;
- }
-
- netevent.old = &rth->dst;
- netevent.new = &rt->dst;
- call_netevent_notifiers(NETEVENT_REDIRECT,
- &netevent);
-
- rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
- ip_rt_put(rt);
- goto do_next;
+ update_or_create_fnhe(nh, fl4->daddr, new_gw,
+ 0, 0);
}
- do_next:
- ;
+ if (kill_route)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
+ neigh_release(n);
}
return;
reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
- " Advised path = %pI4 -> %pI4\n",
- &old_gw, dev->name, &new_gw,
- &saddr, &daddr);
+ if (IN_DEV_LOG_MARTIANS(in_dev)) {
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ __be32 daddr = iph->daddr;
+ __be32 saddr = iph->saddr;
+
+ net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+ " Advised path = %pI4 -> %pI4\n",
+ &old_gw, dev->name, &new_gw,
+ &saddr, &daddr);
+ }
#endif
;
}
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
+
+ rt = (struct rtable *) dst;
+
+ __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __ip_do_redirect(rt, skb, &fl4, true);
+}
+
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
struct rtable *rt = (struct rtable *)dst;
@@ -1502,16 +806,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
ip_rt_put(rt);
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- (rt->dst.expires &&
- time_after_eq(jiffies, rt->dst.expires))) {
- unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
- rt->fl.oif,
- rt_genid(dev_net(dst->dev)));
-#if RT_CACHE_DEBUG >= 1
- printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
- &rt->rt_dst, rt->fl.fl4_tos);
-#endif
- rt_del(hash, rt);
+ rt->dst.expires) {
+ ip_rt_put(rt);
ret = NULL;
}
}
@@ -1538,6 +834,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct in_device *in_dev;
+ struct inet_peer *peer;
+ struct net *net;
int log_martians;
rcu_read_lock();
@@ -1549,192 +847,291 @@ void ip_rt_send_redirect(struct sk_buff *skb)
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
rcu_read_unlock();
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+ if (!peer) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+ rt_nexthop(rt, ip_hdr(skb)->daddr));
+ return;
+ }
+
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
- rt->dst.rate_tokens = 0;
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ peer->rate_tokens = 0;
/* Too many ignored redirects; do not send anything
* set dst.rate_last to the last seen redirected packet.
*/
- if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
- rt->dst.rate_last = jiffies;
- return;
+ if (peer->rate_tokens >= ip_rt_redirect_number) {
+ peer->rate_last = jiffies;
+ goto out_put_peer;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (rt->dst.rate_tokens == 0 ||
+ if (peer->rate_tokens == 0 ||
time_after(jiffies,
- (rt->dst.rate_last +
- (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
- icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
- rt->dst.rate_last = jiffies;
- ++rt->dst.rate_tokens;
+ (peer->rate_last +
+ (ip_rt_redirect_load << peer->rate_tokens)))) {
+ __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
+ peer->rate_last = jiffies;
+ ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- rt->dst.rate_tokens == ip_rt_redirect_number &&
- net_ratelimit())
- printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
- &rt->rt_src, rt->rt_iif,
- &rt->rt_dst, &rt->rt_gateway);
+ peer->rate_tokens == ip_rt_redirect_number)
+ net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
+ &ip_hdr(skb)->saddr, inet_iif(skb),
+ &ip_hdr(skb)->daddr, &gw);
#endif
}
+out_put_peer:
+ inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
{
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
struct rtable *rt = skb_rtable(skb);
+ struct inet_peer *peer;
unsigned long now;
+ struct net *net;
+ bool send;
int code;
- switch (rt->dst.error) {
- case EINVAL:
- default:
- goto out;
+ net = dev_net(rt->dst.dev);
+ if (!IN_DEV_FORWARD(in_dev)) {
+ switch (rt->dst.error) {
case EHOSTUNREACH:
- code = ICMP_HOST_UNREACH;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
break;
+
case ENETUNREACH:
- code = ICMP_NET_UNREACH;
- IP_INC_STATS_BH(dev_net(rt->dst.dev),
- IPSTATS_MIB_INNOROUTES);
- break;
- case EACCES:
- code = ICMP_PKT_FILTERED;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
break;
+ }
+ goto out;
}
- now = jiffies;
- rt->dst.rate_tokens += now - rt->dst.rate_last;
- if (rt->dst.rate_tokens > ip_rt_error_burst)
- rt->dst.rate_tokens = ip_rt_error_burst;
- rt->dst.rate_last = now;
- if (rt->dst.rate_tokens >= ip_rt_error_cost) {
- rt->dst.rate_tokens -= ip_rt_error_cost;
- icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+ switch (rt->dst.error) {
+ case EINVAL:
+ default:
+ goto out;
+ case EHOSTUNREACH:
+ code = ICMP_HOST_UNREACH;
+ break;
+ case ENETUNREACH:
+ code = ICMP_NET_UNREACH;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ break;
+ case EACCES:
+ code = ICMP_PKT_FILTERED;
+ break;
+ }
+
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+
+ send = true;
+ if (peer) {
+ now = jiffies;
+ peer->rate_tokens += now - peer->rate_last;
+ if (peer->rate_tokens > ip_rt_error_burst)
+ peer->rate_tokens = ip_rt_error_burst;
+ peer->rate_last = now;
+ if (peer->rate_tokens >= ip_rt_error_cost)
+ peer->rate_tokens -= ip_rt_error_cost;
+ else
+ send = false;
+ inet_putpeer(peer);
}
+ if (send)
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
out: kfree_skb(skb);
return 0;
}
-/*
- * The last two values are not from the RFC but
- * are needed for AMPRnet AX.25 paths.
- */
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+{
+ struct dst_entry *dst = &rt->dst;
+ struct fib_result res;
+
+ if (dst_metric_locked(dst, RTAX_MTU))
+ return;
+
+ if (dst->dev->mtu < mtu)
+ return;
-static const unsigned short mtu_plateau[] =
-{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
-static inline unsigned short guess_mtu(unsigned short old_mtu)
+ if (rt->rt_pmtu == mtu &&
+ time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
+ return;
+
+ rcu_read_lock();
+ if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+ jiffies + ip_rt_mtu_expires);
+ }
+ rcu_read_unlock();
+}
+
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
{
- int i;
+ struct rtable *rt = (struct rtable *) dst;
+ struct flowi4 fl4;
- for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
- if (old_mtu > mtu_plateau[i])
- return mtu_plateau[i];
- return 68;
+ ip_rt_build_flow_key(&fl4, sk, skb);
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
}
-unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
- unsigned short new_mtu,
- struct net_device *dev)
+void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+ int oif, u32 mark, u8 protocol, int flow_flags)
{
- int i, k;
- unsigned short old_mtu = ntohs(iph->tot_len);
- struct rtable *rth;
- int ikeys[2] = { dev->ifindex, 0 };
- __be32 skeys[2] = { iph->saddr, 0, };
- __be32 daddr = iph->daddr;
- unsigned short est_mtu = 0;
-
- for (k = 0; k < 2; k++) {
- for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->dst.rt_next)) {
- unsigned short mtu = new_mtu;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->rt_dst != daddr ||
- rth->rt_src != iph->saddr ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- dst_metric_locked(&rth->dst, RTAX_MTU) ||
- !net_eq(dev_net(rth->dst.dev), net) ||
- rt_is_expired(rth))
- continue;
-
- if (new_mtu < 68 || new_mtu >= old_mtu) {
-
- /* BSD 4.2 compatibility hack :-( */
- if (mtu == 0 &&
- old_mtu >= dst_mtu(&rth->dst) &&
- old_mtu >= 68 + (iph->ihl << 2))
- old_mtu -= iph->ihl << 2;
-
- mtu = guess_mtu(old_mtu);
- }
- if (mtu <= dst_mtu(&rth->dst)) {
- if (mtu < dst_mtu(&rth->dst)) {
- dst_confirm(&rth->dst);
- if (mtu < ip_rt_min_pmtu) {
- mtu = ip_rt_min_pmtu;
- rth->dst.metrics[RTAX_LOCK-1] |=
- (1 << RTAX_MTU);
- }
- rth->dst.metrics[RTAX_MTU-1] = mtu;
- dst_set_expires(&rth->dst,
- ip_rt_mtu_expires);
- }
- est_mtu = mtu;
- }
- }
- rcu_read_unlock();
- }
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ if (!mark)
+ mark = IP4_REPLY_MARK(net, skb->mark);
+
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
}
- return est_mtu ? : new_mtu;
}
+EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- if (dst_mtu(dst) > mtu && mtu >= 68 &&
- !(dst_metric_locked(dst, RTAX_MTU))) {
- if (mtu < ip_rt_min_pmtu) {
- mtu = ip_rt_min_pmtu;
- dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
- }
- dst->metrics[RTAX_MTU-1] = mtu;
- dst_set_expires(dst, ip_rt_mtu_expires);
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ if (!fl4.flowi4_mark)
+ fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
+
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
}
}
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- if (rt_is_expired((struct rtable *)dst))
- return NULL;
- return dst;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ struct dst_entry *odst = NULL;
+ bool new = false;
+
+ bh_lock_sock(sk);
+
+ if (!ip_sk_accept_pmtu(sk))
+ goto out;
+
+ odst = sk_dst_get(sk);
+
+ if (sock_owned_by_user(sk) || !odst) {
+ __ipv4_sk_update_pmtu(skb, sk, mtu);
+ goto out;
+ }
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ rt = (struct rtable *)odst;
+ if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+
+ if (!dst_check(&rt->dst, 0)) {
+ if (new)
+ dst_release(&rt->dst);
+
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ if (new)
+ sk_dst_set(sk, &rt->dst);
+
+out:
+ bh_unlock_sock(sk);
+ dst_release(odst);
}
+EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
-static void ipv4_dst_destroy(struct dst_entry *dst)
+void ipv4_redirect(struct sk_buff *skb, struct net *net,
+ int oif, u32 mark, u8 protocol, int flow_flags)
{
- struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer = rt->peer;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
- if (peer) {
- rt->peer = NULL;
- inet_putpeer(peer);
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_redirect);
+
+void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
}
}
+EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ /* All IPV4 dsts are created with ->obsolete set to the value
+ * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+ * into this function always.
+ *
+ * When a PMTU/redirect information update invalidates a route,
+ * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
+ * DST_OBSOLETE_DEAD by dst_free().
+ */
+ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
+ return NULL;
+ return dst;
+}
static void ipv4_link_failure(struct sk_buff *skb)
{
@@ -1747,12 +1144,13 @@ static void ipv4_link_failure(struct sk_buff *skb)
dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sk_buff *skb)
+static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
{
- printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
- &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
- skb->dev ? skb->dev->name : "?");
+ pr_debug("%s: %pI4 -> %pI4, %s\n",
+ __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ skb->dev ? skb->dev->name : "?");
kfree_skb(skb);
+ WARN_ON(1);
return 0;
}
@@ -1765,26 +1163,40 @@ static int ip_rt_bug(struct sk_buff *skb)
in IP options!
*/
-void ip_rt_get_source(u8 *addr, struct rtable *rt)
+void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
__be32 src;
- struct fib_result res;
if (rt_is_output_route(rt))
- src = rt->rt_src;
+ src = ip_hdr(skb)->saddr;
else {
+ struct fib_result res;
+ struct flowi4 fl4;
+ struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = iph->daddr;
+ fl4.saddr = iph->saddr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_oif = rt->dst.dev->ifindex;
+ fl4.flowi4_iif = skb->dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+
rcu_read_lock();
- if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
- src = FIB_RES_PREFSRC(res);
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+ src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
else
- src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
- RT_SCOPE_UNIVERSE);
+ src = inet_select_addr(rt->dst.dev,
+ rt_nexthop(rt, iph->daddr),
+ RT_SCOPE_UNIVERSE);
rcu_read_unlock();
}
memcpy(addr, &src, 4);
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
if (!(rt->dst.tclassid & 0xFFFF))
@@ -1794,55 +1206,229 @@ static void set_class_tag(struct rtable *rt, u32 tag)
}
#endif
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
- struct fib_info *fi = res->fi;
+ unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
+
+ if (advmss == 0) {
+ advmss = max_t(unsigned int, dst->dev->mtu - 40,
+ ip_rt_min_advmss);
+ if (advmss > 65535 - 40)
+ advmss = 65535 - 40;
+ }
+ return advmss;
+}
+
+static unsigned int ipv4_mtu(const struct dst_entry *dst)
+{
+ const struct rtable *rt = (const struct rtable *) dst;
+ unsigned int mtu = rt->rt_pmtu;
+
+ if (!mtu || time_after_eq(jiffies, rt->dst.expires))
+ mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ if (mtu)
+ return mtu;
+
+ mtu = dst->dev->mtu;
+
+ if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+ if (rt->rt_uses_gateway && mtu > 576)
+ mtu = 576;
+ }
+
+ return min_t(unsigned int, mtu, IP_MAX_MTU);
+}
+
+static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+ struct fib_nh_exception *fnhe;
+ u32 hval;
+
+ if (!hash)
+ return NULL;
+
+ hval = fnhe_hashfun(daddr);
+
+ for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ return fnhe;
+ }
+ return NULL;
+}
+
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+ __be32 daddr)
+{
+ bool ret = false;
+
+ spin_lock_bh(&fnhe_lock);
+
+ if (daddr == fnhe->fnhe_daddr) {
+ struct rtable __rcu **porig;
+ struct rtable *orig;
+ int genid = fnhe_genid(dev_net(rt->dst.dev));
+
+ if (rt_is_input_route(rt))
+ porig = &fnhe->fnhe_rth_input;
+ else
+ porig = &fnhe->fnhe_rth_output;
+ orig = rcu_dereference(*porig);
+
+ if (fnhe->fnhe_genid != genid) {
+ fnhe->fnhe_genid = genid;
+ fnhe->fnhe_gw = 0;
+ fnhe->fnhe_pmtu = 0;
+ fnhe->fnhe_expires = 0;
+ fnhe_flush_routes(fnhe);
+ orig = NULL;
+ }
+ fill_route_from_fnhe(rt, fnhe);
+ if (!rt->rt_gateway)
+ rt->rt_gateway = daddr;
+
+ if (!(rt->dst.flags & DST_NOCACHE)) {
+ rcu_assign_pointer(*porig, rt);
+ if (orig)
+ rt_free(orig);
+ ret = true;
+ }
+
+ fnhe->fnhe_stamp = jiffies;
+ }
+ spin_unlock_bh(&fnhe_lock);
+
+ return ret;
+}
+
+static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+{
+ struct rtable *orig, *prev, **p;
+ bool ret = true;
+
+ if (rt_is_input_route(rt)) {
+ p = (struct rtable **)&nh->nh_rth_input;
+ } else {
+ p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
+ }
+ orig = *p;
+
+ prev = cmpxchg(p, orig, rt);
+ if (prev == orig) {
+ if (orig)
+ rt_free(orig);
+ } else
+ ret = false;
+
+ return ret;
+}
+
+static DEFINE_SPINLOCK(rt_uncached_lock);
+static LIST_HEAD(rt_uncached_list);
+
+static void rt_add_uncached_list(struct rtable *rt)
+{
+ spin_lock_bh(&rt_uncached_lock);
+ list_add_tail(&rt->rt_uncached, &rt_uncached_list);
+ spin_unlock_bh(&rt_uncached_lock);
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (!list_empty(&rt->rt_uncached)) {
+ spin_lock_bh(&rt_uncached_lock);
+ list_del(&rt->rt_uncached);
+ spin_unlock_bh(&rt_uncached_lock);
+ }
+}
+
+void rt_flush_dev(struct net_device *dev)
+{
+ if (!list_empty(&rt_uncached_list)) {
+ struct net *net = dev_net(dev);
+ struct rtable *rt;
+
+ spin_lock_bh(&rt_uncached_lock);
+ list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
+ if (rt->dst.dev != dev)
+ continue;
+ rt->dst.dev = net->loopback_dev;
+ dev_hold(rt->dst.dev);
+ dev_put(dev);
+ }
+ spin_unlock_bh(&rt_uncached_lock);
+ }
+}
+
+static bool rt_cache_valid(const struct rtable *rt)
+{
+ return rt &&
+ rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ !rt_is_expired(rt);
+}
+
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
+ const struct fib_result *res,
+ struct fib_nh_exception *fnhe,
+ struct fib_info *fi, u16 type, u32 itag)
+{
+ bool cached = false;
if (fi) {
- if (FIB_RES_GW(*res) &&
- FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- rt->rt_gateway = FIB_RES_GW(*res);
- memcpy(rt->dst.metrics, fi->fib_metrics,
- sizeof(rt->dst.metrics));
- if (fi->fib_mtu == 0) {
- rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
- if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
- rt->rt_gateway != rt->rt_dst &&
- rt->dst.dev->mtu > 576)
- rt->dst.metrics[RTAX_MTU-1] = 576;
+ struct fib_nh *nh = &FIB_RES_NH(*res);
+
+ if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
+ rt->rt_gateway = nh->nh_gw;
+ rt->rt_uses_gateway = 1;
}
-#ifdef CONFIG_NET_CLS_ROUTE
- rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rt->dst.tclassid = nh->nh_tclassid;
#endif
+ if (unlikely(fnhe))
+ cached = rt_bind_exception(rt, fnhe, daddr);
+ else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_cache_route(nh, rt);
+ if (unlikely(!cached)) {
+ /* Routes we intend to cache in nexthop exception or
+ * FIB nexthop have the DST_NOCACHE bit clear.
+ * However, if we are unsuccessful at storing this
+ * route into the cache we really need to set it.
+ */
+ rt->dst.flags |= DST_NOCACHE;
+ if (!rt->rt_gateway)
+ rt->rt_gateway = daddr;
+ rt_add_uncached_list(rt);
+ }
} else
- rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
-
- if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
- rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
- if (dst_mtu(&rt->dst) > IP_MAX_MTU)
- rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
- if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
- rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
- ip_rt_min_advmss);
- if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
- rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
-
-#ifdef CONFIG_NET_CLS_ROUTE
+ rt_add_uncached_list(rt);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
- set_class_tag(rt, fib_rules_tclass(res));
+ set_class_tag(rt, res->tclassid);
#endif
set_class_tag(rt, itag);
#endif
- rt->rt_type = res->type;
+}
+
+static struct rtable *rt_dst_alloc(struct net_device *dev,
+ bool nopolicy, bool noxfrm, bool will_cache)
+{
+ return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, int our)
{
- unsigned int hash;
struct rtable *rth;
- __be32 spec_dst;
struct in_device *in_dev = __in_dev_get_rcu(dev);
u32 itag = 0;
int err;
@@ -1853,49 +1439,41 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
return -EINVAL;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
- ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
+ skb->protocol != htons(ETH_P_IP))
goto e_inval;
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(saddr))
+ goto e_inval;
+
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
goto e_inval;
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
} else {
- err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
- &itag, 0);
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
if (err < 0)
goto e_err;
}
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
if (!rth)
goto e_nobufs;
- rth->dst.output = ip_rt_bug;
- rth->dst.obsolete = -1;
-
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- rth->fl.fl4_dst = daddr;
- rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
- rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
- rth->rt_iif =
- rth->fl.iif = dev->ifindex;
- rth->dst.dev = init_net.loopback_dev;
- dev_hold(rth->dst.dev);
- rth->fl.oif = 0;
- rth->rt_gateway = daddr;
- rth->rt_spec_dst= spec_dst;
- rth->rt_genid = rt_genid(dev_net(dev));
+ rth->dst.output = ip_rt_bug;
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev));
rth->rt_flags = RTCF_MULTICAST;
rth->rt_type = RTN_MULTICAST;
+ rth->rt_is_input= 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
if (our) {
rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
@@ -1907,8 +1485,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#endif
RT_CACHE_STAT_INC(in_slow_mc);
- hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
- return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
+ skb_dst_set(skb, &rth->dst);
+ return 0;
e_nobufs:
return -ENOBUFS;
@@ -1932,18 +1510,13 @@ static void ip_handle_martian_source(struct net_device *dev,
* RFC1812 recommendation, if source is martian,
* the only hint is MAC header.
*/
- printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
+ pr_warn("martian source %pI4 from %pI4, on dev %s\n",
&daddr, &saddr, dev->name);
if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
- int i;
- const unsigned char *p = skb_mac_header(skb);
- printk(KERN_WARNING "ll header: ");
- for (i = 0; i < dev->hard_header_len; i++, p++) {
- printk("%02x", *p);
- if (i < (dev->hard_header_len - 1))
- printk(":");
- }
- printk("\n");
+ print_hex_dump(KERN_WARNING, "ll header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ skb_mac_header(skb),
+ dev->hard_header_len, true);
}
}
#endif
@@ -1951,30 +1524,27 @@ static void ip_handle_martian_source(struct net_device *dev,
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
- struct fib_result *res,
+ const struct fib_result *res,
struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos,
- struct rtable **result)
+ __be32 daddr, __be32 saddr, u32 tos)
{
+ struct fib_nh_exception *fnhe;
struct rtable *rth;
int err;
struct in_device *out_dev;
unsigned int flags = 0;
- __be32 spec_dst;
- u32 itag;
+ bool do_cache;
+ u32 itag = 0;
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
if (out_dev == NULL) {
- if (net_ratelimit())
- printk(KERN_CRIT "Bug in ip_route_input" \
- "_slow(). Please, report\n");
+ net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
return -EINVAL;
}
-
- err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
- in_dev->dev, &spec_dst, &itag, skb->mark);
+ err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
+ in_dev->dev, in_dev, &itag);
if (err < 0) {
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
@@ -1982,13 +1552,13 @@ static int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
- if (err)
- flags |= RTCF_DIRECTSRC;
-
- if (out_dev == in_dev && err &&
+ do_cache = res->fi && !itag;
+ if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
(IN_DEV_SHARED_MEDIA(out_dev) ||
- inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+ inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
flags |= RTCF_DOREDIRECT;
+ do_cache = false;
+ }
if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create route, if it is
@@ -2005,43 +1575,44 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
+ fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+ if (do_cache) {
+ if (fnhe != NULL)
+ rth = rcu_dereference(fnhe->fnhe_rth_input);
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ goto out;
+ }
+ }
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(out_dev->dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- if (IN_DEV_CONF_GET(out_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
- rth->fl.fl4_dst = daddr;
- rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
- rth->rt_src = saddr;
- rth->rt_gateway = daddr;
- rth->rt_iif =
- rth->fl.iif = in_dev->dev->ifindex;
- rth->dst.dev = (out_dev)->dev;
- dev_hold(rth->dst.dev);
- rth->fl.oif = 0;
- rth->rt_spec_dst= spec_dst;
-
- rth->dst.obsolete = -1;
+ rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
+ rth->rt_flags = flags;
+ rth->rt_type = res->type;
+ rth->rt_is_input = 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
+
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
- rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
- rt_set_nexthop(rth, res, itag);
-
- rth->rt_flags = flags;
-
- *result = rth;
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ skb_dst_set(skb, &rth->dst);
+out:
err = 0;
cleanup:
return err;
@@ -2049,28 +1620,17 @@ static int __mkroute_input(struct sk_buff *skb,
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
- const struct flowi *fl,
+ const struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
- struct rtable* rth = NULL;
- int err;
- unsigned hash;
-
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
- fib_select_multipath(fl, res);
+ if (res->fi && res->fi->fib_nhs > 1)
+ fib_select_multipath(res);
#endif
/* create a routing cache entry */
- err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
- if (err)
- return err;
-
- /* put it into the cache */
- hash = rt_hash(daddr, saddr, fl->iif,
- rt_genid(dev_net(rth->dst.dev)));
- return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
+ return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
/*
@@ -2089,19 +1649,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct flowi fl = { .fl4_dst = daddr,
- .fl4_src = saddr,
- .fl4_tos = tos,
- .fl4_scope = RT_SCOPE_UNIVERSE,
- .mark = skb->mark,
- .iif = dev->ifindex };
- unsigned flags = 0;
+ struct flowi4 fl4;
+ unsigned int flags = 0;
u32 itag = 0;
- struct rtable * rth;
- unsigned hash;
- __be32 spec_dst;
+ struct rtable *rth;
int err = -EINVAL;
- struct net * net = dev_net(dev);
+ struct net *net = dev_net(dev);
+ bool do_cache;
/* IP on this device is disabled. */
@@ -2112,10 +1666,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
by fib_lookup.
*/
- if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
- ipv4_is_loopback(saddr))
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
+ res.fi = NULL;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
@@ -2125,105 +1679,124 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_zeronet(saddr))
goto martian_source;
- if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
+ if (ipv4_is_zeronet(daddr))
goto martian_destination;
+ /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
+ * and call it once if daddr or/and saddr are loopback addresses
+ */
+ if (ipv4_is_loopback(daddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_destination;
+ } else if (ipv4_is_loopback(saddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_source;
+ }
+
/*
* Now we are ready to route packet.
*/
- err = fib_lookup(net, &fl, &res);
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ err = fib_lookup(net, &fl4, &res);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
- goto e_hostunreach;
+ err = -EHOSTUNREACH;
goto no_route;
}
- RT_CACHE_STAT_INC(in_slow_tot);
-
if (res.type == RTN_BROADCAST)
goto brd_input;
if (res.type == RTN_LOCAL) {
- err = fib_validate_source(saddr, daddr, tos,
- net->loopback_dev->ifindex,
- dev, &spec_dst, &itag, skb->mark);
+ err = fib_validate_source(skb, saddr, daddr, tos,
+ 0, dev, in_dev, &itag);
if (err < 0)
goto martian_source_keep_err;
- if (err)
- flags |= RTCF_DIRECTSRC;
- spec_dst = daddr;
goto local_input;
}
- if (!IN_DEV_FORWARD(in_dev))
- goto e_hostunreach;
+ if (!IN_DEV_FORWARD(in_dev)) {
+ err = -EHOSTUNREACH;
+ goto no_route;
+ }
if (res.type != RTN_UNICAST)
goto martian_destination;
- err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
+ err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out: return err;
brd_input:
if (skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (ipv4_is_zeronet(saddr))
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- else {
- err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
- &itag, skb->mark);
+ if (!ipv4_is_zeronet(saddr)) {
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
if (err < 0)
goto martian_source_keep_err;
- if (err)
- flags |= RTCF_DIRECTSRC;
}
flags |= RTCF_BROADCAST;
res.type = RTN_BROADCAST;
RT_CACHE_STAT_INC(in_brd);
local_input:
- rth = dst_alloc(&ipv4_dst_ops);
+ do_cache = false;
+ if (res.fi) {
+ if (!itag) {
+ rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ err = 0;
+ goto out;
+ }
+ do_cache = true;
+ }
+ }
+
+ rth = rt_dst_alloc(net->loopback_dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
+ rth->dst.input= ip_local_deliver;
rth->dst.output= ip_rt_bug;
- rth->dst.obsolete = -1;
- rth->rt_genid = rt_genid(net);
-
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- rth->fl.fl4_dst = daddr;
- rth->rt_dst = daddr;
- rth->fl.fl4_tos = tos;
- rth->fl.mark = skb->mark;
- rth->fl.fl4_src = saddr;
- rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
- rth->rt_iif =
- rth->fl.iif = dev->ifindex;
- rth->dst.dev = net->loopback_dev;
- dev_hold(rth->dst.dev);
- rth->rt_gateway = daddr;
- rth->rt_spec_dst= spec_dst;
- rth->dst.input= ip_local_deliver;
+
+ rth->rt_genid = rt_genid_ipv4(net);
rth->rt_flags = flags|RTCF_LOCAL;
+ rth->rt_type = res.type;
+ rth->rt_is_input = 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
- rth->rt_type = res.type;
- hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
- err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
+ if (do_cache) {
+ if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ rth->dst.flags |= DST_NOCACHE;
+ rt_add_uncached_list(rth);
+ }
+ }
+ skb_dst_set(skb, &rth->dst);
+ err = 0;
goto out;
no_route:
RT_CACHE_STAT_INC(in_no_route);
- spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
res.type = RTN_UNREACHABLE;
if (err == -ESRCH)
err = -ENETUNREACH;
@@ -2235,15 +1808,11 @@ no_route:
martian_destination:
RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
- &daddr, &saddr, dev->name);
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
#endif
-e_hostunreach:
- err = -EHOSTUNREACH;
- goto out;
-
e_inval:
err = -EINVAL;
goto out;
@@ -2259,50 +1828,13 @@ martian_source_keep_err:
goto out;
}
-int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, bool noref)
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev)
{
- struct rtable * rth;
- unsigned hash;
- int iif = dev->ifindex;
- struct net *net;
int res;
- net = dev_net(dev);
-
rcu_read_lock();
- if (!rt_caching(net))
- goto skip_cache;
-
- tos &= IPTOS_RT_MASK;
- hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->dst.rt_next)) {
- if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
- ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
- (rth->fl.iif ^ iif) |
- rth->fl.oif |
- (rth->fl.fl4_tos ^ tos)) == 0 &&
- rth->fl.mark == skb->mark &&
- net_eq(dev_net(rth->dst.dev), net) &&
- !rt_is_expired(rth)) {
- if (noref) {
- dst_use_noref(&rth->dst, jiffies);
- skb_dst_set_noref(skb, &rth->dst);
- } else {
- dst_use(&rth->dst, jiffies);
- skb_dst_set(skb, &rth->dst);
- }
- RT_CACHE_STAT_INC(in_hit);
- rcu_read_unlock();
- return 0;
- }
- RT_CACHE_STAT_INC(in_hlist_search);
- }
-
-skip_cache:
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As result the host on multicasting
@@ -2318,8 +1850,8 @@ skip_cache:
struct in_device *in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
- int our = ip_check_mc(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
+ int our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
||
@@ -2340,101 +1872,118 @@ skip_cache:
rcu_read_unlock();
return res;
}
-EXPORT_SYMBOL(ip_route_input_common);
+EXPORT_SYMBOL(ip_route_input_noref);
/* called with rcu_read_lock() */
-static int __mkroute_output(struct rtable **result,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static struct rtable *__mkroute_output(const struct fib_result *res,
+ const struct flowi4 *fl4, int orig_oif,
+ struct net_device *dev_out,
+ unsigned int flags)
{
- struct rtable *rth;
+ struct fib_info *fi = res->fi;
+ struct fib_nh_exception *fnhe;
struct in_device *in_dev;
- u32 tos = RT_FL_TOS(oldflp);
+ u16 type = res->type;
+ struct rtable *rth;
+ bool do_cache;
- if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
- return -EINVAL;
+ in_dev = __in_dev_get_rcu(dev_out);
+ if (!in_dev)
+ return ERR_PTR(-EINVAL);
- if (ipv4_is_lbcast(fl->fl4_dst))
- res->type = RTN_BROADCAST;
- else if (ipv4_is_multicast(fl->fl4_dst))
- res->type = RTN_MULTICAST;
- else if (ipv4_is_zeronet(fl->fl4_dst))
- return -EINVAL;
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ return ERR_PTR(-EINVAL);
+
+ if (ipv4_is_lbcast(fl4->daddr))
+ type = RTN_BROADCAST;
+ else if (ipv4_is_multicast(fl4->daddr))
+ type = RTN_MULTICAST;
+ else if (ipv4_is_zeronet(fl4->daddr))
+ return ERR_PTR(-EINVAL);
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- in_dev = __in_dev_get_rcu(dev_out);
- if (!in_dev)
- return -EINVAL;
-
- if (res->type == RTN_BROADCAST) {
+ do_cache = true;
+ if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- res->fi = NULL;
- } else if (res->type == RTN_MULTICAST) {
+ fi = NULL;
+ } else if (type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
- if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
- oldflp->proto))
+ if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
+ fl4->flowi4_proto))
flags &= ~RTCF_LOCAL;
+ else
+ do_cache = false;
/* If multicast route do not exist use
* default one, but do not gateway in this case.
* Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4)
- res->fi = NULL;
+ if (fi && res->prefixlen < 4)
+ fi = NULL;
+ }
+
+ fnhe = NULL;
+ do_cache &= fi != NULL;
+ if (do_cache) {
+ struct rtable __rcu **prth;
+ struct fib_nh *nh = &FIB_RES_NH(*res);
+
+ fnhe = find_exception(nh, fl4->daddr);
+ if (fnhe)
+ prth = &fnhe->fnhe_rth_output;
+ else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
+ }
+ prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
+ }
+ rth = rcu_dereference(*prth);
+ if (rt_cache_valid(rth)) {
+ dst_hold(&rth->dst);
+ return rth;
+ }
}
-
- rth = dst_alloc(&ipv4_dst_ops);
+add:
+ rth = rt_dst_alloc(dev_out,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(in_dev, NOXFRM),
+ do_cache);
if (!rth)
- return -ENOBUFS;
-
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
-
- rth->fl.fl4_dst = oldflp->fl4_dst;
- rth->fl.fl4_tos = tos;
- rth->fl.fl4_src = oldflp->fl4_src;
- rth->fl.oif = oldflp->oif;
- rth->fl.mark = oldflp->mark;
- rth->rt_dst = fl->fl4_dst;
- rth->rt_src = fl->fl4_src;
- rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
- /* get references to the devices that are to be hold by the routing
- cache entry */
- rth->dst.dev = dev_out;
- dev_hold(dev_out);
- rth->rt_gateway = fl->fl4_dst;
- rth->rt_spec_dst= fl->fl4_src;
-
- rth->dst.output=ip_output;
- rth->dst.obsolete = -1;
- rth->rt_genid = rt_genid(dev_net(dev_out));
+ return ERR_PTR(-ENOBUFS);
+
+ rth->dst.output = ip_output;
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
+ rth->rt_flags = flags;
+ rth->rt_type = type;
+ rth->rt_is_input = 0;
+ rth->rt_iif = orig_oif ? : 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
RT_CACHE_STAT_INC(out_slow_tot);
- if (flags & RTCF_LOCAL) {
+ if (flags & RTCF_LOCAL)
rth->dst.input = ip_local_deliver;
- rth->rt_spec_dst = fl->fl4_dst;
- }
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
- rth->rt_spec_dst = fl->fl4_src;
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
rth->dst.output = ip_mc_output;
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
- if (res->type == RTN_MULTICAST) {
+ if (type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
- !ipv4_is_local_multicast(oldflp->fl4_dst)) {
+ !ipv4_is_local_multicast(fl4->daddr)) {
rth->dst.input = ip_mr_input;
rth->dst.output = ip_mc_output;
}
@@ -2442,66 +1991,41 @@ static int __mkroute_output(struct rtable **result,
#endif
}
- rt_set_nexthop(rth, res, 0);
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
- rth->rt_flags = flags;
- *result = rth;
- return 0;
-}
-
-/* called with rcu_read_lock() */
-static int ip_mkroute_output(struct rtable **rp,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
-{
- struct rtable *rth = NULL;
- int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
- unsigned hash;
- if (err == 0) {
- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
- rt_genid(dev_net(dev_out)));
- err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
- }
-
- return err;
+ return rth;
}
/*
* Major route resolver routine.
- * called with rcu_read_lock();
*/
-static int ip_route_output_slow(struct net *net, struct rtable **rp,
- const struct flowi *oldflp)
-{
- u32 tos = RT_FL_TOS(oldflp);
- struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
- .fl4_src = oldflp->fl4_src,
- .fl4_tos = tos & IPTOS_RT_MASK,
- .fl4_scope = ((tos & RTO_ONLINK) ?
- RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
- .mark = oldflp->mark,
- .iif = net->loopback_dev->ifindex,
- .oif = oldflp->oif };
- struct fib_result res;
- unsigned int flags = 0;
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+{
struct net_device *dev_out = NULL;
- int err;
-
+ __u8 tos = RT_FL_TOS(fl4);
+ unsigned int flags = 0;
+ struct fib_result res;
+ struct rtable *rth;
+ int orig_oif;
+ res.tclassid = 0;
res.fi = NULL;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
+ res.table = NULL;
+
+ orig_oif = fl4->flowi4_oif;
+
+ fl4->flowi4_iif = LOOPBACK_IFINDEX;
+ fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+ fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
- if (oldflp->fl4_src) {
- err = -EINVAL;
- if (ipv4_is_multicast(oldflp->fl4_src) ||
- ipv4_is_lbcast(oldflp->fl4_src) ||
- ipv4_is_zeronet(oldflp->fl4_src))
+ rcu_read_lock();
+ if (fl4->saddr) {
+ rth = ERR_PTR(-EINVAL);
+ if (ipv4_is_multicast(fl4->saddr) ||
+ ipv4_is_lbcast(fl4->saddr) ||
+ ipv4_is_zeronet(fl4->saddr))
goto out;
/* I removed check for oif == dev_out->oif here.
@@ -2512,11 +2036,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
of another iface. --ANK
*/
- if (oldflp->oif == 0 &&
- (ipv4_is_multicast(oldflp->fl4_dst) ||
- ipv4_is_lbcast(oldflp->fl4_dst))) {
+ if (fl4->flowi4_oif == 0 &&
+ (ipv4_is_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
+ dev_out = __ip_dev_find(net, fl4->saddr, false);
if (dev_out == NULL)
goto out;
@@ -2535,59 +2059,61 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
Luckily, this hack is good workaround.
*/
- fl.oif = dev_out->ifindex;
+ fl4->flowi4_oif = dev_out->ifindex;
goto make_route;
}
- if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+ if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- if (!__ip_dev_find(net, oldflp->fl4_src, false))
+ if (!__ip_dev_find(net, fl4->saddr, false))
goto out;
}
}
- if (oldflp->oif) {
- dev_out = dev_get_by_index_rcu(net, oldflp->oif);
- err = -ENODEV;
+ if (fl4->flowi4_oif) {
+ dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
+ rth = ERR_PTR(-ENODEV);
if (dev_out == NULL)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
- if (rcu_dereference(dev_out->ip_ptr) == NULL)
- goto out; /* Wrong error code */
-
- if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
- ipv4_is_lbcast(oldflp->fl4_dst)) {
- if (!fl.fl4_src)
- fl.fl4_src = inet_select_addr(dev_out, 0,
+ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+ rth = ERR_PTR(-ENETUNREACH);
+ goto out;
+ }
+ if (ipv4_is_local_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr)) {
+ if (!fl4->saddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
goto make_route;
}
- if (!fl.fl4_src) {
- if (ipv4_is_multicast(oldflp->fl4_dst))
- fl.fl4_src = inet_select_addr(dev_out, 0,
- fl.fl4_scope);
- else if (!oldflp->fl4_dst)
- fl.fl4_src = inet_select_addr(dev_out, 0,
+ if (!fl4->saddr) {
+ if (ipv4_is_multicast(fl4->daddr))
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ fl4->flowi4_scope);
+ else if (!fl4->daddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
}
- if (!fl.fl4_dst) {
- fl.fl4_dst = fl.fl4_src;
- if (!fl.fl4_dst)
- fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
+ if (!fl4->daddr) {
+ fl4->daddr = fl4->saddr;
+ if (!fl4->daddr)
+ fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
dev_out = net->loopback_dev;
- fl.oif = net->loopback_dev->ifindex;
+ fl4->flowi4_oif = LOOPBACK_IFINDEX;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
}
- if (fib_lookup(net, &fl, &res)) {
+ if (fib_lookup(net, fl4, &res)) {
res.fi = NULL;
- if (oldflp->oif) {
+ res.table = NULL;
+ if (fl4->flowi4_oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2606,190 +2132,161 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
likely IPv6, but we do not.
*/
- if (fl.fl4_src == 0)
- fl.fl4_src = inet_select_addr(dev_out, 0,
+ if (fl4->saddr == 0)
+ fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
res.type = RTN_UNICAST;
goto make_route;
}
- err = -ENETUNREACH;
+ rth = ERR_PTR(-ENETUNREACH);
goto out;
}
if (res.type == RTN_LOCAL) {
- if (!fl.fl4_src)
- fl.fl4_src = fl.fl4_dst;
+ if (!fl4->saddr) {
+ if (res.fi->fib_prefsrc)
+ fl4->saddr = res.fi->fib_prefsrc;
+ else
+ fl4->saddr = fl4->daddr;
+ }
dev_out = net->loopback_dev;
- fl.oif = dev_out->ifindex;
- res.fi = NULL;
+ fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl.oif == 0)
- fib_select_multipath(&fl, &res);
+ if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+ fib_select_multipath(&res);
else
#endif
- if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
- fib_select_default(net, &fl, &res);
+ if (!res.prefixlen &&
+ res.table->tb_num_default > 1 &&
+ res.type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(&res);
- if (!fl.fl4_src)
- fl.fl4_src = FIB_RES_PREFSRC(res);
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, res);
dev_out = FIB_RES_DEV(res);
- fl.oif = dev_out->ifindex;
+ fl4->flowi4_oif = dev_out->ifindex;
make_route:
- err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
+ rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
-out: return err;
+out:
+ rcu_read_unlock();
+ return rth;
}
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
-int __ip_route_output_key(struct net *net, struct rtable **rp,
- const struct flowi *flp)
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
- unsigned int hash;
- int res;
- struct rtable *rth;
+ return NULL;
+}
- if (!rt_caching(net))
- goto slow_output;
-
- hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
-
- rcu_read_lock_bh();
- for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference_bh(rth->dst.rt_next)) {
- if (rth->fl.fl4_dst == flp->fl4_dst &&
- rth->fl.fl4_src == flp->fl4_src &&
- rt_is_output_route(rth) &&
- rth->fl.oif == flp->oif &&
- rth->fl.mark == flp->mark &&
- !((rth->fl.fl4_tos ^ flp->fl4_tos) &
- (IPTOS_RT_MASK | RTO_ONLINK)) &&
- net_eq(dev_net(rth->dst.dev), net) &&
- !rt_is_expired(rth)) {
- dst_use(&rth->dst, jiffies);
- RT_CACHE_STAT_INC(out_hit);
- rcu_read_unlock_bh();
- *rp = rth;
- return 0;
- }
- RT_CACHE_STAT_INC(out_hlist_search);
- }
- rcu_read_unlock_bh();
+static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-slow_output:
- rcu_read_lock();
- res = ip_route_output_slow(net, rp, flp);
- rcu_read_unlock();
- return res;
+ return mtu ? : dst->dev->mtu;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
-static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
{
- return NULL;
}
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
{
}
+static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
+ unsigned long old)
+{
+ return NULL;
+}
+
static struct dst_ops ipv4_dst_blackhole_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
- .destroy = ipv4_dst_destroy,
.check = ipv4_blackhole_dst_check,
+ .mtu = ipv4_blackhole_mtu,
+ .default_advmss = ipv4_default_advmss,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
+ .redirect = ipv4_rt_blackhole_redirect,
+ .cow_metrics = ipv4_rt_blackhole_cow_metrics,
+ .neigh_lookup = ipv4_neigh_lookup,
};
-
-static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
+struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
- struct rtable *ort = *rp;
- struct rtable *rt = (struct rtable *)
- dst_alloc(&ipv4_dst_blackhole_ops);
+ struct rtable *ort = (struct rtable *) dst_orig;
+ struct rtable *rt;
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
if (rt) {
struct dst_entry *new = &rt->dst;
- atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard;
- memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
+ new->output = dst_discard_sk;
new->dev = ort->dst.dev;
if (new->dev)
dev_hold(new->dev);
- rt->fl = ort->fl;
+ rt->rt_is_input = ort->rt_is_input;
+ rt->rt_iif = ort->rt_iif;
+ rt->rt_pmtu = ort->rt_pmtu;
- rt->rt_genid = rt_genid(net);
+ rt->rt_genid = rt_genid_ipv4(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
- rt->rt_dst = ort->rt_dst;
- rt->rt_src = ort->rt_src;
- rt->rt_iif = ort->rt_iif;
rt->rt_gateway = ort->rt_gateway;
- rt->rt_spec_dst = ort->rt_spec_dst;
- rt->peer = ort->peer;
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
+ rt->rt_uses_gateway = ort->rt_uses_gateway;
+
+ INIT_LIST_HEAD(&rt->rt_uncached);
dst_free(new);
}
- dst_release(&(*rp)->dst);
- *rp = rt;
- return rt ? 0 : -ENOMEM;
+ dst_release(dst_orig);
+
+ return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
-int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
- struct sock *sk, int flags)
+struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
+ struct sock *sk)
{
- int err;
-
- if ((err = __ip_route_output_key(net, rp, flp)) != 0)
- return err;
+ struct rtable *rt = __ip_route_output_key(net, flp4);
- if (flp->proto) {
- if (!flp->fl4_src)
- flp->fl4_src = (*rp)->rt_src;
- if (!flp->fl4_dst)
- flp->fl4_dst = (*rp)->rt_dst;
- err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
- flags ? XFRM_LOOKUP_WAIT : 0);
- if (err == -EREMOTE)
- err = ipv4_dst_blackhole(net, rp, flp);
+ if (IS_ERR(rt))
+ return rt;
- return err;
- }
+ if (flp4->flowi4_proto)
+ rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0);
- return 0;
+ return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
-int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
-{
- return ip_route_output_flow(net, rp, flp, NULL, 0);
-}
-EXPORT_SYMBOL(ip_route_output_key);
-
-static int rt_fill_info(struct net *net,
- struct sk_buff *skb, u32 pid, u32 seq, int event,
- int nowait, unsigned int flags)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
+ u32 seq, int event, int nowait, unsigned int flags)
{
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
struct nlmsghdr *nlh;
- long expires;
- u32 id = 0, ts = 0, tsage = 0, error;
+ unsigned long expires = 0;
+ u32 error;
+ u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -2797,9 +2294,10 @@ static int rt_fill_info(struct net *net,
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = rt->fl.fl4_tos;
+ r->rtm_tos = fl4->flowi4_tos;
r->rtm_table = RT_TABLE_MAIN;
- NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
+ if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ goto nla_put_failure;
r->rtm_type = rt->rt_type;
r->rtm_scope = RT_SCOPE_UNIVERSE;
r->rtm_protocol = RTPROT_UNSPEC;
@@ -2807,50 +2305,59 @@ static int rt_fill_info(struct net *net,
if (rt->rt_flags & RTCF_NOTIFY)
r->rtm_flags |= RTM_F_NOTIFY;
- NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
-
- if (rt->fl.fl4_src) {
+ if (nla_put_be32(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (src) {
r->rtm_src_len = 32;
- NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
+ if (nla_put_be32(skb, RTA_SRC, src))
+ goto nla_put_failure;
}
- if (rt->dst.dev)
- NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (rt->dst.tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rt->dst.tclassid &&
+ nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
+ goto nla_put_failure;
#endif
- if (rt_is_input_route(rt))
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
- else if (rt->rt_src != rt->fl.fl4_src)
- NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
+ if (!rt_is_input_route(rt) &&
+ fl4->saddr != src) {
+ if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
+ goto nla_put_failure;
+ }
+ if (rt->rt_uses_gateway &&
+ nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
+ goto nla_put_failure;
- if (rt->rt_dst != rt->rt_gateway)
- NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
+ expires = rt->dst.expires;
+ if (expires) {
+ unsigned long now = jiffies;
- if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
+ if (time_before(now, expires))
+ expires -= now;
+ else
+ expires = 0;
+ }
+
+ memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+ if (rt->rt_pmtu && expires)
+ metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+ if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (rt->fl.mark)
- NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
error = rt->dst.error;
- expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
- if (rt->peer) {
- inet_peer_refcheck(rt->peer);
- id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
- if (rt->peer->tcp_ts_stamp) {
- ts = rt->peer->tcp_ts;
- tsage = get_seconds() - rt->peer->tcp_ts_stamp;
- }
- }
if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
- __be32 dst = rt->rt_dst;
-
if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
- int err = ipmr_get_route(net, skb, r, nowait);
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, nowait);
if (err <= 0) {
if (!nowait) {
if (err == 0)
@@ -2864,11 +2371,11 @@ static int rt_fill_info(struct net *net,
}
} else
#endif
- NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
+ if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+ goto nla_put_failure;
}
- if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
- expires, error) < 0)
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -2878,12 +2385,13 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
struct rtable *rt = NULL;
+ struct flowi4 fl4;
__be32 dst = 0;
__be32 src = 0;
u32 iif;
@@ -2918,6 +2426,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst;
+ fl4.saddr = src;
+ fl4.flowi4_tos = rtm->rtm_tos;
+ fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+ fl4.flowi4_mark = mark;
+
if (iif) {
struct net_device *dev;
@@ -2938,14 +2453,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
if (err == 0 && rt->dst.error)
err = -rt->dst.error;
} else {
- struct flowi fl = {
- .fl4_dst = dst,
- .fl4_src = src,
- .fl4_tos = rtm->rtm_tos,
- .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
- .mark = mark,
- };
- err = ip_route_output_key(net, &rt, &fl);
+ rt = ip_route_output_key(net, &fl4);
+
+ err = 0;
+ if (IS_ERR(rt))
+ err = PTR_ERR(rt);
}
if (err)
@@ -2955,12 +2467,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+ err = rt_fill_info(net, dst, src, &fl4, skb,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
RTM_NEWROUTE, 0, 0);
if (err <= 0)
goto errout_free;
- err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
return err;
@@ -2969,76 +2482,33 @@ errout_free:
goto errout;
}
-int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct rtable *rt;
- int h, s_h;
- int idx, s_idx;
- struct net *net;
-
- net = sock_net(skb->sk);
-
- s_h = cb->args[0];
- if (s_h < 0)
- s_h = 0;
- s_idx = idx = cb->args[1];
- for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
- if (!rt_hash_table[h].chain)
- continue;
- rcu_read_lock_bh();
- for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
- rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
- if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
- continue;
- if (rt_is_expired(rt))
- continue;
- skb_dst_set_noref(skb, &rt->dst);
- if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- 1, NLM_F_MULTI) <= 0) {
- skb_dst_drop(skb);
- rcu_read_unlock_bh();
- goto done;
- }
- skb_dst_drop(skb);
- }
- rcu_read_unlock_bh();
- }
-
-done:
- cb->args[0] = h;
- cb->args[1] = idx;
- return skb->len;
-}
-
void ip_rt_multicast_event(struct in_device *in_dev)
{
- rt_cache_flush(dev_net(in_dev->dev), 0);
+ rt_cache_flush(dev_net(in_dev->dev));
}
#ifdef CONFIG_SYSCTL
-static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
+static int ip_rt_gc_elasticity __read_mostly = 8;
+
+static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
- if (write) {
- int flush_delay;
- ctl_table ctl;
- struct net *net;
-
- memcpy(&ctl, __ctl, sizeof(ctl));
- ctl.data = &flush_delay;
- proc_dointvec(&ctl, write, buffer, lenp, ppos);
+ struct net *net = (struct net *)__ctl->extra1;
- net = (struct net *)__ctl->extra1;
- rt_cache_flush(net, flush_delay);
+ if (write) {
+ rt_cache_flush(net);
+ fnhe_genid_bump(net);
return 0;
}
return -EINVAL;
}
-static ctl_table ipv4_route_table[] = {
+static struct ctl_table ipv4_route_table[] = {
{
.procname = "gc_thresh",
.data = &ipv4_dst_ops.gc_thresh,
@@ -3149,23 +2619,6 @@ static ctl_table ipv4_route_table[] = {
{ }
};
-static struct ctl_table empty[1];
-
-static struct ctl_table ipv4_skeleton[] =
-{
- { .procname = "route",
- .mode = 0555, .child = ipv4_route_table},
- { .procname = "neigh",
- .mode = 0555, .child = empty},
- { }
-};
-
-static __net_initdata struct ctl_path ipv4_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
-
static struct ctl_table ipv4_route_flush_table[] = {
{
.procname = "flush",
@@ -3176,13 +2629,6 @@ static struct ctl_table ipv4_route_flush_table[] = {
{ },
};
-static __net_initdata struct ctl_path ipv4_route_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "route", },
- { },
-};
-
static __net_init int sysctl_route_net_init(struct net *net)
{
struct ctl_table *tbl;
@@ -3192,11 +2638,14 @@ static __net_init int sysctl_route_net_init(struct net *net)
tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
}
tbl[0].extra1 = net;
- net->ipv4.route_hdr =
- register_net_sysctl_table(net, ipv4_route_path, tbl);
+ net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
if (net->ipv4.route_hdr == NULL)
goto err_reg;
return 0;
@@ -3226,8 +2675,10 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
static __net_init int rt_genid_init(struct net *net)
{
- get_random_bytes(&net->ipv4.rt_genid,
- sizeof(net->ipv4.rt_genid));
+ atomic_set(&net->ipv4.rt_genid, 0);
+ atomic_set(&net->fnhe_genid, 0);
+ get_random_bytes(&net->ipv4.dev_addr_genid,
+ sizeof(net->ipv4.dev_addr_genid));
return 0;
}
@@ -3235,26 +2686,46 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
.init = rt_genid_init,
};
+static int __net_init ipv4_inetpeer_init(struct net *net)
+{
+ struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
-#ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
-#endif /* CONFIG_NET_CLS_ROUTE */
+ if (!bp)
+ return -ENOMEM;
+ inet_peer_base_init(bp);
+ net->ipv4.peers = bp;
+ return 0;
+}
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
- if (!str)
- return 0;
- rhash_entries = simple_strtoul(str, &str, 0);
- return 1;
+ struct inet_peer_base *bp = net->ipv4.peers;
+
+ net->ipv4.peers = NULL;
+ inetpeer_invalidate_tree(bp);
+ kfree(bp);
}
-__setup("rhash_entries=", set_rhash_entries);
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+ .init = ipv4_inetpeer_init,
+ .exit = ipv4_inetpeer_exit,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
+#endif /* CONFIG_IP_ROUTE_CLASSID */
int __init ip_rt_init(void)
{
int rc = 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+ ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+ if (!ip_idents)
+ panic("IP: failed to allocate ip_idents\n");
+
+ prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
@@ -3272,45 +2743,25 @@ int __init ip_rt_init(void)
if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
- rt_hash_table = (struct rt_hash_bucket *)
- alloc_large_system_hash("IP route cache",
- sizeof(struct rt_hash_bucket),
- rhash_entries,
- (totalram_pages >= 128 * 1024) ?
- 15 : 17,
- 0,
- &rt_hash_log,
- &rt_hash_mask,
- rhash_entries ? 0 : 512 * 1024);
- memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
- rt_hash_lock_init();
-
- ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
- ip_rt_max_size = (rt_hash_mask + 1) * 16;
+ ipv4_dst_ops.gc_thresh = ~0;
+ ip_rt_max_size = INT_MAX;
devinet_init();
ip_fib_init();
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
- expires_ljiffies = jiffies;
- schedule_delayed_work(&expires_work,
- net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
if (ip_rt_proc_init())
- printk(KERN_ERR "Unable to create route proc files\n");
+ pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
xfrm_init();
- xfrm4_init(ip_rt_max_size);
+ xfrm4_init();
#endif
- rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
#endif
register_pernet_subsys(&rt_genid_ops);
+ register_pernet_subsys(&ipv4_inetpeer_ops);
return rc;
}
@@ -3321,6 +2772,6 @@ int __init ip_rt_init(void)
*/
void __init ip_static_sysctl_init(void)
{
- register_sysctl_paths(ipv4_path, ipv4_skeleton);
+ register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 47519205a01..c86624b36a6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/cryptohash.h>
#include <linux/kernel.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <net/route.h>
@@ -24,15 +25,7 @@
extern int sysctl_tcp_syncookies;
-__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
-EXPORT_SYMBOL(syncookie_secret);
-
-static __init int init_syncookies(void)
-{
- get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
- return 0;
-}
-__initcall(init_syncookies);
+static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -43,8 +36,11 @@ static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
{
- __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
+ __u32 *tmp;
+
+ net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
+ tmp = __get_cpu_var(ipv4_cookie_scratch);
memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
tmp[0] = (__force u32)saddr;
tmp[1] = (__force u32)daddr;
@@ -88,8 +84,7 @@ __u32 cookie_init_timestamp(struct request_sock *req)
static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
- __be16 dport, __u32 sseq, __u32 count,
- __u32 data)
+ __be16 dport, __u32 sseq, __u32 data)
{
/*
* Compute the secure sequence number.
@@ -101,7 +96,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
* As an extra hack, we add a small "data" value that encodes the
* MSS into the second hash value.
*/
-
+ u32 count = tcp_cookie_time();
return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
sseq + (count << COOKIEBITS) +
((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
@@ -113,22 +108,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
* If the syncookie is bad, the data returned will be out of
* range. This must be checked by the caller.
*
- * The count value used to generate the cookie must be within
- * "maxdiff" if the current (passed-in) "count". The return value
- * is (__u32)-1 if this test fails.
+ * The count value used to generate the cookie must be less than
+ * MAX_SYNCOOKIE_AGE minutes in the past.
+ * The return value is (__u32)-1 if this test fails.
*/
static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport, __u32 sseq,
- __u32 count, __u32 maxdiff)
+ __be16 sport, __be16 dport, __u32 sseq)
{
- __u32 diff;
+ u32 diff, count = tcp_cookie_time();
/* Strip away the layers from the cookie */
cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
- diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
- if (diff >= maxdiff)
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= MAX_SYNCOOKIE_AGE)
return (__u32)-1;
return (cookie -
@@ -137,72 +131,70 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
}
/*
- * MSS Values are taken from the 2009 paper
- * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
- * - values 1440 to 1460 accounted for 80% of observed mss values
- * - values outside the 536-1460 range are rare (<0.2%).
+ * MSS Values are chosen based on the 2011 paper
+ * 'An Analysis of TCP Maximum Segment Sizes' by S. Alcock and R. Nelson.
+ * Values ..
+ * .. lower than 536 are rare (< 0.2%)
+ * .. between 537 and 1299 account for less than 1.5% of observed values
+ * .. in the 1300-1349 range account for about 15 to 20% of observed mss values
+ * .. exceeding 1460 are very rare (< 0.04%)
*
- * Table must be sorted.
+ * 1460 is the single most frequently announced mss value (30 to 46% depending
+ * on monitor location). Table must be sorted.
*/
static __u16 const msstab[] = {
- 64,
- 512,
536,
- 1024,
- 1440,
+ 1300,
+ 1440, /* 1440, 1452: PPPoE */
1460,
- 4312,
- 8960,
};
/*
* Generate a syncookie. mssp points to the mss, which is returned
* rounded down to the value encoded in the cookie.
*/
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ u16 *mssp)
{
- const struct iphdr *iph = ip_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
int mssind;
const __u16 mss = *mssp;
- tcp_synq_overflow(sk);
-
for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
if (mss >= msstab[mssind])
break;
*mssp = msstab[mssind];
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
th->source, th->dest, ntohl(th->seq),
- jiffies / (HZ * 60), mssind);
+ mssind);
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
+
+__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ tcp_synq_overflow(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+ return __cookie_v4_init_sequence(iph, th, mssp);
}
-/*
- * This (misnamed) value is the age of syncookie which is permitted.
- * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
- * sysctl_tcp_retries1. It's a rather complicated formula (exponential
- * backoff) to compute at runtime so it's currently hardcoded here.
- */
-#define COUNTER_TRIES 4
/*
* Check if a ack sequence number is a valid syncookie.
* Return the decoded mss if it is, or 0 if not.
*/
-static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
+ u32 cookie)
{
- const struct iphdr *iph = ip_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
__u32 seq = ntohl(th->seq) - 1;
__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
- th->source, th->dest, seq,
- jiffies / (HZ * 60),
- COUNTER_TRIES);
+ th->source, th->dest, seq);
return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
+EXPORT_SYMBOL_GPL(__cookie_v4_check);
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
@@ -231,7 +223,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
*
* return false if we decode an option that should not be.
*/
-bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
+bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
+ struct net *net, bool *ecn_ok)
{
/* echoed timestamp, lowest bits contain options */
u32 options = tcp_opt->rcv_tsecr & TSMASK;
@@ -244,9 +237,9 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
if (!sysctl_tcp_timestamps)
return false;
- tcp_opt->sack_ok = (options >> 4) & 0x1;
+ tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;
*ecn_ok = (options >> 5) & 1;
- if (*ecn_ok && !sysctl_tcp_ecn)
+ if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)
return false;
if (tcp_opt->sack_ok && !sysctl_tcp_sack)
@@ -265,7 +258,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt)
{
struct tcp_options_received tcp_opt;
- u8 *hash_location;
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -276,13 +268,14 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
int mss;
struct rtable *rt;
__u8 rcv_wscale;
- bool ecn_ok;
+ bool ecn_ok = false;
+ struct flowi4 fl4;
if (!sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
if (tcp_synq_no_recent_overflow(sk) ||
- (mss = cookie_check(skb, cookie)) == 0) {
+ (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
@@ -291,9 +284,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tcp_opt, 0, NULL);
- if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+ if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
goto out;
ret = NULL;
@@ -306,25 +299,28 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = cookie;
req->mss = mss;
- ireq->loc_port = th->dest;
- ireq->rmt_port = th->source;
- ireq->loc_addr = ip_hdr(skb)->daddr;
- ireq->rmt_addr = ip_hdr(skb)->saddr;
+ ireq->ir_num = ntohs(th->dest);
+ ireq->ir_rmt_port = th->source;
+ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ ireq->ir_mark = inet_request_mark(sk, skb);
ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+ treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->listener = NULL;
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
*/
if (opt && opt->optlen) {
- int opt_size = sizeof(struct ip_options) + opt->optlen;
+ int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
- if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
+ if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
kfree(ireq->opt);
ireq->opt = NULL;
}
@@ -336,7 +332,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
}
req->expires = 0UL;
- req->retrans = 0;
+ req->num_retrans = 0;
/*
* We need to lookup the route here to get at the correct
@@ -344,21 +340,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
* hasn't changed since we received the original syn, but I see
* no easy way to do this.
*/
- {
- struct flowi fl = { .mark = sk->sk_mark,
- .fl4_dst = ((opt && opt->srr) ?
- opt->faddr : ireq->rmt_addr),
- .fl4_src = ireq->loc_addr,
- .fl4_tos = RT_CONN_FLAGS(sk),
- .proto = IPPROTO_TCP,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = th->dest,
- .fl_ip_dport = th->source };
- security_req_classify_flow(req, &fl);
- if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
- reqsk_free(req);
- goto out;
- }
+ flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
+ inet_sk_flowi_flags(sk),
+ (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, th->source, th->dest);
+ security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(sock_net(sk), &fl4);
+ if (IS_ERR(rt)) {
+ reqsk_free(req);
+ goto out;
}
/* Try to redo what tcp_v4_send_synack did. */
@@ -372,5 +363,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->rcv_wscale = rcv_wscale;
ret = get_cookie_sock(sk, skb, req, &rt->dst);
+ /* ip_queue_xmit() depends on our flow being setup
+ * Normal sockets get it right from inet_csk_route_child_sock()
+ */
+ if (ret)
+ inet_sk(ret)->cork.fl.u.ip4 = fl4;
out: return ret;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e91911d7aae..79a007c5255 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -13,6 +13,8 @@
#include <linux/seqlock.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/swap.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -21,29 +23,44 @@
#include <net/udp.h>
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
+#include <net/ping.h>
+#include <net/tcp_memcontrol.h>
static int zero;
+static int one = 1;
+static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
+static int ip_ttl_min = 1;
+static int ip_ttl_max = 255;
+static int tcp_syn_retries_min = 1;
+static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
+static int ip_ping_group_range_min[] = { 0, 0 };
+static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* Update system visible IP port range */
-static void set_local_port_range(int range[2])
+static void set_local_port_range(struct net *net, int range[2])
{
- write_seqlock(&sysctl_local_ports.lock);
- sysctl_local_ports.range[0] = range[0];
- sysctl_local_ports.range[1] = range[1];
- write_sequnlock(&sysctl_local_ports.lock);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = range[0];
+ net->ipv4.ip_local_ports.range[1] = range[1];
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
-static int ipv4_local_port_range(ctl_table *table, int write,
+static int ipv4_local_port_range(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ip_local_ports.range);
int ret;
int range[2];
- ctl_table tmp = {
+ struct ctl_table tmp = {
.data = &range,
.maxlen = sizeof(range),
.mode = table->mode,
@@ -51,24 +68,88 @@ static int ipv4_local_port_range(ctl_table *table, int write,
.extra2 = &ip_local_port_range_max,
};
- inet_get_local_port_range(range, range + 1);
+ inet_get_local_port_range(net, &range[0], &range[1]);
+
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0) {
if (range[1] < range[0])
ret = -EINVAL;
else
- set_local_port_range(range);
+ set_local_port_range(net, range);
+ }
+
+ return ret;
+}
+
+
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ unsigned int seq;
+ do {
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+
+ *low = data[0];
+ *high = data[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
+}
+
+/* Update system visible ping group range */
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ data[0] = low;
+ data[1] = high;
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_ping_group_range(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int ret;
+ gid_t urange[2];
+ kgid_t low, high;
+ struct ctl_table tmp = {
+ .data = &urange,
+ .maxlen = sizeof(urange),
+ .mode = table->mode,
+ .extra1 = &ip_ping_group_range_min,
+ .extra2 = &ip_ping_group_range_max,
+ };
+
+ inet_get_ping_group_range_table(table, &low, &high);
+ urange[0] = from_kgid_munged(user_ns, low);
+ urange[1] = from_kgid_munged(user_ns, high);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ low = make_kgid(user_ns, urange[0]);
+ high = make_kgid(user_ns, urange[1]);
+ if (!gid_valid(low) || !gid_valid(high) ||
+ (urange[1] < urange[0]) || gid_lt(high, low)) {
+ low = make_kgid(&init_user_ns, 1);
+ high = make_kgid(&init_user_ns, 0);
+ }
+ set_ping_group_range(table, low, high);
}
return ret;
}
-static int proc_tcp_congestion_control(ctl_table *ctl, int write,
+static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
char val[TCP_CA_NAME_MAX];
- ctl_table tbl = {
+ struct ctl_table tbl = {
.data = val,
.maxlen = TCP_CA_NAME_MAX,
};
@@ -82,12 +163,12 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write,
return ret;
}
-static int proc_tcp_available_congestion_control(ctl_table *ctl,
+static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
int ret;
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
@@ -99,12 +180,12 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,
return ret;
}
-static int proc_allowed_congestion_control(ctl_table *ctl,
+static int proc_allowed_congestion_control(struct ctl_table *ctl,
int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
int ret;
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
@@ -119,6 +200,53 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
return ret;
}
+static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+ struct tcp_fastopen_context *ctxt;
+ int ret;
+ u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ ctxt = rcu_dereference(tcp_fastopen_ctx);
+ if (ctxt)
+ memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+ else
+ memset(user_key, 0, sizeof(user_key));
+ rcu_read_unlock();
+
+ snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+ user_key[0], user_key[1], user_key[2], user_key[3]);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+ user_key + 2, user_key + 3) != 4) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ /* Generate a dummy secret but don't publish it. This
+ * is needed so we don't regenerate a new key on the
+ * first invocation of tcp_fastopen_cookie_gen
+ */
+ tcp_fastopen_init_key_once(false);
+ tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+ }
+
+bad_key:
+ pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+ user_key[0], user_key[1], user_key[2], user_key[3],
+ (char *)tbl.data, ret);
+ kfree(tbl.data);
+ return ret;
+}
+
static struct ctl_table ipv4_table[] = {
{
.procname = "tcp_timestamps",
@@ -153,15 +281,9 @@ static struct ctl_table ipv4_table[] = {
.data = &sysctl_ip_default_ttl,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = ipv4_doint_and_flush,
- .extra2 = &init_net,
- },
- {
- .procname = "ip_no_pmtu_disc",
- .data = &ipv4_config.no_pmtu_disc,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_ttl_min,
+ .extra2 = &ip_ttl_max,
},
{
.procname = "ip_nonlocal_bind",
@@ -175,7 +297,9 @@ static struct ctl_table ipv4_table[] = {
.data = &sysctl_tcp_syn_retries,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_syn_retries_min,
+ .extra2 = &tcp_syn_retries_max
},
{
.procname = "tcp_synack_retries",
@@ -199,6 +323,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_early_demux",
+ .data = &sysctl_ip_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "ip_dynaddr",
.data = &sysctl_ip_dynaddr,
.maxlen = sizeof(int),
@@ -258,6 +389,19 @@ static struct ctl_table ipv4_table[] = {
},
#endif
{
+ .procname = "tcp_fastopen",
+ .data = &sysctl_tcp_fastopen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fastopen_key",
+ .mode = 0600,
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ .proc_handler = proc_tcp_fastopen_key,
+ },
+ {
.procname = "tcp_tw_recycle",
.data = &tcp_death_row.sysctl_tw_recycle,
.maxlen = sizeof(int),
@@ -293,29 +437,12 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_local_port_range",
- .data = &sysctl_local_ports.range,
- .maxlen = sizeof(sysctl_local_ports.range),
- .mode = 0644,
- .proc_handler = ipv4_local_port_range,
- },
- {
- .procname = "ip_local_reserved_ports",
- .data = NULL, /* initialized in sysctl_ipv4_init */
- .maxlen = 65536,
- .mode = 0644,
- .proc_handler = proc_do_large_bitmap,
- },
-#ifdef CONFIG_IP_MULTICAST
- {
.procname = "igmp_max_memberships",
.data = &sysctl_igmp_max_memberships,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
-
-#endif
{
.procname = "igmp_max_msf",
.data = &sysctl_igmp_max_msf,
@@ -345,20 +472,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
- .procname = "inet_peer_gc_mintime",
- .data = &inet_peer_gc_mintime,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "inet_peer_gc_maxtime",
- .data = &inet_peer_gc_maxtime,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
.procname = "tcp_orphan_retries",
.data = &sysctl_tcp_orphan_retries,
.maxlen = sizeof(int),
@@ -380,13 +493,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_ecn",
- .data = &sysctl_tcp_ecn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_dsack",
.data = &sysctl_tcp_dsack,
.maxlen = sizeof(int),
@@ -395,24 +501,33 @@ static struct ctl_table ipv4_table[] = {
},
{
.procname = "tcp_mem",
- .data = &sysctl_tcp_mem,
.maxlen = sizeof(sysctl_tcp_mem),
+ .data = &sysctl_tcp_mem,
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax
+ .proc_handler = proc_doulongvec_minmax,
},
{
.procname = "tcp_wmem",
.data = &sysctl_tcp_wmem,
.maxlen = sizeof(sysctl_tcp_wmem),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "tcp_notsent_lowat",
+ .data = &sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(sysctl_tcp_notsent_lowat),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
{
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "tcp_app_win",
@@ -426,7 +541,9 @@ static struct ctl_table ipv4_table[] = {
.data = &sysctl_tcp_adv_win_scale,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_adv_win_scale_min,
+ .extra2 = &tcp_adv_win_scale_max,
},
{
.procname = "tcp_tw_reuse",
@@ -443,13 +560,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "tcp_frto_response",
- .data = &sysctl_tcp_frto_response,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
@@ -484,13 +594,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_congestion_control,
},
{
- .procname = "tcp_abc",
- .data = &sysctl_tcp_abc,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_mtu_probing",
.data = &sysctl_tcp_mtu_probing,
.maxlen = sizeof(int),
@@ -511,6 +614,20 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "tcp_limit_output_bytes",
+ .data = &sysctl_tcp_limit_output_bytes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_challenge_ack_limit",
+ .data = &sysctl_tcp_challenge_ack_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
#ifdef CONFIG_NET_DMA
{
.procname = "tcp_dma_copybreak",
@@ -570,27 +687,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_allowed_congestion_control,
},
{
- .procname = "tcp_max_ssthresh",
- .data = &sysctl_tcp_max_ssthresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tcp_cookie_size",
- .data = &sysctl_tcp_cookie_size,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_thin_linear_timeouts",
.data = &sysctl_tcp_thin_linear_timeouts,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
- {
+ {
.procname = "tcp_thin_dupack",
.data = &sysctl_tcp_thin_dupack,
.maxlen = sizeof(int),
@@ -598,6 +701,33 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_early_retrans",
+ .data = &sysctl_tcp_early_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &four,
+ },
+ {
+ .procname = "tcp_min_tso_segs",
+ .data = &sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &gso_max_segs,
+ },
+ {
+ .procname = "tcp_autocorking",
+ .data = &sysctl_tcp_autocorking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
@@ -610,7 +740,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_rmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
+ .extra1 = &one
},
{
.procname = "udp_wmem_min",
@@ -618,7 +748,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = sizeof(sysctl_udp_wmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
+ .extra1 = &one
},
{ }
};
@@ -667,57 +797,93 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "rt_cache_rebuild_count",
- .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
+ .procname = "ping_group_range",
+ .data = &init_net.ipv4.ping_group_range.range,
+ .maxlen = sizeof(gid_t)*2,
+ .mode = 0644,
+ .proc_handler = ipv4_ping_group_range,
+ },
+ {
+ .procname = "tcp_ecn",
+ .data = &init_net.ipv4.sysctl_tcp_ecn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_local_port_range",
+ .maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
+ .data = &init_net.ipv4.ip_local_ports.range,
+ .mode = 0644,
+ .proc_handler = ipv4_local_port_range,
+ },
+ {
+ .procname = "ip_local_reserved_ports",
+ .data = &init_net.ipv4.sysctl_local_reserved_ports,
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
+ .procname = "ip_no_pmtu_disc",
+ .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "ip_forward_use_pmtu",
+ .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv4.sysctl_fwmark_reflect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fwmark_accept",
+ .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
-struct ctl_path net_ipv4_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { },
-};
-EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
-
static __net_init int ipv4_sysctl_init_net(struct net *net)
{
struct ctl_table *table;
table = ipv4_net_table;
if (!net_eq(net, &init_net)) {
+ int i;
+
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
- table[0].data =
- &net->ipv4.sysctl_icmp_echo_ignore_all;
- table[1].data =
- &net->ipv4.sysctl_icmp_echo_ignore_broadcasts;
- table[2].data =
- &net->ipv4.sysctl_icmp_ignore_bogus_error_responses;
- table[3].data =
- &net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr;
- table[4].data =
- &net->ipv4.sysctl_icmp_ratelimit;
- table[5].data =
- &net->ipv4.sysctl_icmp_ratemask;
- table[6].data =
- &net->ipv4.sysctl_rt_cache_rebuild_count;
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
+ table[i].data += (void *)net - (void *)&init_net;
}
- net->ipv4.sysctl_rt_cache_rebuild_count = 4;
-
- net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
- net_ipv4_ctl_path, table);
+ net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (net->ipv4.ipv4_hdr == NULL)
goto err_reg;
+ net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!net->ipv4.sysctl_local_reserved_ports)
+ goto err_ports;
+
return 0;
+err_ports:
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
err_reg:
if (!net_eq(net, &init_net))
kfree(table);
@@ -729,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
{
struct ctl_table *table;
+ kfree(net->ipv4.sysctl_local_reserved_ports);
table = net->ipv4.ipv4_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
kfree(table);
@@ -742,23 +909,13 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
static __init int sysctl_ipv4_init(void)
{
struct ctl_table_header *hdr;
- struct ctl_table *i;
-
- for (i = ipv4_table; i->procname; i++) {
- if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
- i->data = sysctl_local_reserved_ports;
- break;
- }
- }
- if (!i->procname)
- return -EINVAL;
- hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
+ hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
if (hdr == NULL)
return -ENOMEM;
if (register_pernet_subsys(&ipv4_sysctl_ops)) {
- unregister_sysctl_table(hdr);
+ unregister_net_sysctl_table(hdr);
return -ENOMEM;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2bb46d55f40..9d2118e5fbc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -245,6 +245,8 @@
* TCP_CLOSE socket is finished
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -268,6 +270,7 @@
#include <linux/slab.h>
#include <net/icmp.h>
+#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
@@ -276,9 +279,14 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <net/busy_poll.h>
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
+int sysctl_tcp_autocorking __read_mostly = 1;
+
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -363,6 +371,61 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
return period;
}
+/* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ *
+ * In order, this initializes the out-of-order queue, the transmit
+ * timers, the prequeue and the tsq_node list head; seeds icsk_rto and
+ * mdev_us from TCP_TIMEOUT_INIT; sets the congestion window, slow-start
+ * threshold, window clamp, cached MSS and reordering defaults; installs
+ * tcp_init_congestion_ops; puts the socket in TCP_CLOSE; hooks up
+ * sk_stream_write_space and tcp_sync_mss; and takes the default send
+ * and receive buffer sizes from sysctl_tcp_wmem[1] / sysctl_tcp_rmem[1].
+ * The closing sock_update_memcg() / sk_sockets_allocated_inc() calls
+ * are made with bottom halves disabled.
+ */
+void tcp_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ __skb_queue_head_init(&tp->out_of_order_queue);
+ tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
+ INIT_LIST_HEAD(&tp->tsq_node);
+
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tp->snd_cwnd = TCP_INIT_CWND;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = TCP_MSS_DEFAULT;
+
+ tp->reordering = sysctl_tcp_reordering;
+ tcp_enable_early_retrans(tp);
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+
+ tp->tsoffset = 0;
+
+ sk->sk_state = TCP_CLOSE;
+
+ sk->sk_write_space = sk_stream_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ icsk->icsk_sync_mss = tcp_sync_mss;
+
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+ local_bh_disable();
+ sock_update_memcg(sk);
+ sk_sockets_allocated_inc(sk);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(tcp_init_sock);
+
/*
* Wait for a TCP event.
*
@@ -374,7 +437,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
unsigned int mask;
struct sock *sk = sock->sk;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ sock_rps_record_flow(sk);
sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == TCP_LISTEN)
@@ -419,8 +484,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLIN | POLLRDNORM | POLLRDHUP;
- /* Connected? */
- if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ /* Connected or passive Fast Open socket? */
+ if (sk->sk_state != TCP_SYN_SENT &&
+ (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
if (tp->urg_seq == tp->copied_seq &&
@@ -435,7 +501,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
mask |= POLLIN | POLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
set_bit(SOCK_ASYNC_NOSPACE,
@@ -446,7 +512,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* wspace test but before the flags are set,
* IO signal will be lost.
*/
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ if (sk_stream_is_writeable(sk))
mask |= POLLOUT | POLLWRNORM;
}
} else
@@ -468,30 +534,29 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
struct tcp_sock *tp = tcp_sk(sk);
int answ;
+ bool slow;
switch (cmd) {
case SIOCINQ:
if (sk->sk_state == TCP_LISTEN)
return -EINVAL;
- lock_sock(sk);
+ slow = lock_sock_fast(sk);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else if (sock_flag(sk, SOCK_URGINLINE) ||
!tp->urg_data ||
before(tp->urg_seq, tp->copied_seq) ||
!before(tp->urg_seq, tp->rcv_nxt)) {
- struct sk_buff *skb;
answ = tp->rcv_nxt - tp->copied_seq;
- /* Subtract 1, if FIN is in queue. */
- skb = skb_peek_tail(&sk->sk_receive_queue);
- if (answ && skb)
- answ -= tcp_hdr(skb)->fin;
+ /* Subtract 1, if FIN was received */
+ if (answ && sock_flag(sk, SOCK_DONE))
+ answ--;
} else
answ = tp->urg_seq - tp->copied_seq;
- release_sock(sk);
+ unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
@@ -505,6 +570,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
else
answ = tp->write_seq - tp->snd_una;
break;
+ case SIOCOUTQNSD:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_nxt;
+ break;
default:
return -ENOIOCTLCMD;
}
@@ -515,11 +589,11 @@ EXPORT_SYMBOL(tcp_ioctl);
static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
- TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
}
-static inline int forced_push(struct tcp_sock *tp)
+static inline bool forced_push(const struct tcp_sock *tp)
{
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
@@ -531,7 +605,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
skb->csum = 0;
tcb->seq = tcb->end_seq = tp->write_seq;
- tcb->flags = TCPHDR_ACK;
+ tcb->tcp_flags = TCPHDR_ACK;
tcb->sacked = 0;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
@@ -547,19 +621,58 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
tp->snd_up = tp->write_seq;
}
-static inline void tcp_push(struct sock *sk, int flags, int mss_now,
- int nonagle)
+/* If a not yet filled skb is pushed, do not send it if
+ * we have data packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
+ *
+ * Returns true only when all four conditions hold: skb->len is below
+ * size_goal, sysctl_tcp_autocorking is non-zero, skb is not the head
+ * of the write queue, and sk->sk_wmem_alloc exceeds skb->truesize.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+ int size_goal)
+{
+ return skb->len < size_goal &&
+ sysctl_tcp_autocorking &&
+ skb != tcp_write_queue_head(sk) &&
+ atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+}
+/* Push whatever is pending on the write queue.
+ *
+ * Does nothing when tcp_send_head() reports no pending skb. Otherwise
+ * the tail skb is marked for push unless MSG_MORE is set and no forced
+ * push is due, tcp_mark_urg() is called, and the data is handed to
+ * __tcp_push_pending_frames(). MSG_MORE replaces the caller's nonagle
+ * with TCP_NAGLE_CORK. When tcp_should_autocork() accepts the tail skb
+ * and sk_wmem_alloc still exceeds its truesize after TSQ_THROTTLED has
+ * been set, transmission is skipped and the function returns early.
+ */
+static void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
{
- if (tcp_send_head(sk)) {
- struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
- if (!(flags & MSG_MORE) || forced_push(tp))
- tcp_mark_push(tp, tcp_write_queue_tail(sk));
+ if (!tcp_send_head(sk)) /* no send head: nothing to push */
+ return;
- tcp_mark_urg(tp, flags);
- __tcp_push_pending_frames(sk, mss_now,
- (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
+ skb = tcp_write_queue_tail(sk);
+ if (!(flags & MSG_MORE) || forced_push(tp))
+ tcp_mark_push(tp, skb);
+
+ tcp_mark_urg(tp, flags);
+
+ if (tcp_should_autocork(sk, skb, size_goal)) {
+
+ /* avoid atomic op if TSQ_THROTTLED bit is already set */
+ if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ }
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+ return;
}
+
+ if (flags & MSG_MORE) /* MSG_MORE overrides the caller's nonagle */
+ nonagle = TCP_NAGLE_CORK;
+
+ __tcp_push_pending_frames(sk, mss_now, nonagle);
}
static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
@@ -692,11 +805,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
if (skb) {
if (sk_wmem_schedule(sk, skb->truesize)) {
+ skb_reserve(skb, sk->sk_prot->max_header);
/*
* Make sure that we have exactly size bytes
* available to the caller, no more, no less.
*/
- skb_reserve(skb, skb_tailroom(skb) - size);
+ skb->reserved_tailroom = skb->end - skb->tail - size;
return skb;
}
__kfree_skb(skb);
@@ -716,10 +830,24 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
xmit_size_goal = mss_now;
if (large_allowed && sk_can_gso(sk)) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
+ u32 gso_size, hlen;
+
+ /* Maybe we should/could use sk->sk_prot->max_header here ? */
+ hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+ inet_csk(sk)->icsk_ext_hdr_len +
+ tp->tcp_header_len;
+
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+ gso_size = max_t(u32, gso_size,
+ sysctl_tcp_min_tso_segs * mss_now);
+
+ xmit_size_goal = min_t(u32, gso_size,
+ sk->sk_gso_max_size - 1 - hlen);
xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
@@ -730,7 +858,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
old_size_goal + mss_now > xmit_size_goal)) {
xmit_size_goal = old_size_goal;
} else {
- tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
+ tp->xmit_size_goal_segs =
+ min_t(u16, xmit_size_goal / mss_now,
+ sk->sk_gso_max_segs);
xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
}
}
@@ -748,8 +878,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
- size_t psize, int flags)
+static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -757,10 +887,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
ssize_t copied;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- /* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ /* Wait for a connection to finish. One exception is TCP Fast Open
+ * (passive side) where data is allowed to be sent before a connection
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+ !tcp_passive_fastopen(sk)) {
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
+ }
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
@@ -771,12 +906,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
- while (psize > 0) {
+ while (size > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
- struct page *page = pages[poffset / PAGE_SIZE];
- int copy, i, can_coalesce;
- int offset = poffset % PAGE_SIZE;
- int size = min_t(size_t, psize, PAGE_SIZE - offset);
+ int copy, i;
+ bool can_coalesce;
if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
@@ -804,11 +937,12 @@ new_segment:
goto wait_for_memory;
if (can_coalesce) {
- skb_shinfo(skb)->frags[i - 1].size += copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
get_page(page);
skb_fill_page_desc(skb, i, page, offset, copy);
}
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
skb->len += copy;
skb->data_len += copy;
@@ -821,11 +955,11 @@ new_segment:
skb_shinfo(skb)->gso_segs = 0;
if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
copied += copy;
- poffset += copy;
- if (!(psize -= copy))
+ offset += copy;
+ if (!(size -= copy))
goto out;
if (skb->len < size_goal || (flags & MSG_OOB))
@@ -841,8 +975,8 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
- if (copied)
- tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -851,8 +985,8 @@ wait_for_memory:
}
out:
- if (copied)
- tcp_push(sk, flags, mss_now, tp->nonagle);
+ if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
return copied;
do_error:
@@ -873,26 +1007,24 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
flags);
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
- res = do_tcp_sendpages(sk, &page, offset, size, flags);
- TCP_CHECK_TIMER(sk);
+ res = do_tcp_sendpages(sk, page, offset, size, flags);
release_sock(sk);
return res;
}
EXPORT_SYMBOL(tcp_sendpage);
-#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
-#define TCP_OFF(sk) (sk->sk_sndmsg_off)
-
-static inline int select_size(struct sock *sk, int sg)
+static inline int select_size(const struct sock *sk, bool sg)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
if (sg) {
- if (sk_can_gso(sk))
- tmp = 0;
- else {
+ if (sk_can_gso(sk)) {
+ /* Small frames wont use a full page:
+ * Payload will immediately follow tcp header.
+ */
+ tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ } else {
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
if (tmp >= pgbreak &&
@@ -904,28 +1036,88 @@ static inline int select_size(struct sock *sk, int sg)
return tmp;
}
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{ /* Free the Fast Open request attached to tp, if there is one, and
+ * reset tp->fastopen_req to NULL. Does nothing when the pointer is
+ * already NULL.
+ */
+ if (tp->fastopen_req != NULL) {
+ kfree(tp->fastopen_req);
+ tp->fastopen_req = NULL;
+ }
+}
+/* Start a connect that carries the sendmsg() payload (Fast Open).
+ *
+ * Allocates a tcp_fastopen_request recording msg and size, attaches it
+ * to the socket, calls __inet_stream_connect() on msg->msg_name (non
+ * blocking when MSG_DONTWAIT is set in msg->msg_flags), then stores the
+ * request's copied count in *copied and frees the request.
+ *
+ * Returns the __inet_stream_connect() result, or, before any connect
+ * attempt: -EOPNOTSUPP when TFO_CLIENT_ENABLE is clear in
+ * sysctl_tcp_fastopen, -EALREADY when a request is already attached,
+ * -ENOBUFS when the allocation fails. *copied is left untouched on
+ * those three early returns.
+ */
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
+ int *copied, size_t size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err, flags;
+
+ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+ return -EOPNOTSUPP;
+ if (tp->fastopen_req != NULL)
+ return -EALREADY; /* Another Fast Open is in progress */
+
+ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+ sk->sk_allocation);
+ if (unlikely(tp->fastopen_req == NULL))
+ return -ENOBUFS;
+ tp->fastopen_req->data = msg;
+ tp->fastopen_req->size = size;
+
+ flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+ err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, flags);
+ *copied = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ return err;
+}
+
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int iovlen, flags;
- int mss_now, size_goal;
- int sg, err, copied;
+ int iovlen, flags, err, copied = 0;
+ int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
+ bool sg;
long timeo;
lock_sock(sk);
- TCP_CHECK_TIMER(sk);
flags = msg->msg_flags;
+ if (flags & MSG_FASTOPEN) {
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
+ if (err == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (err)
+ goto out_err;
+ offset = copied_syn;
+ }
+
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- /* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ /* Wait for a connection to finish. One exception is TCP Fast Open
+ * (passive side) where data is allowed to be sent before a connection
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+ !tcp_passive_fastopen(sk)) {
if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto do_error;
+ }
+
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+ goto out_nopush;
+ }
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
goto out_err;
+ /* 'common' sending to sendq */
+ }
+
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
@@ -940,13 +1132,22 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
- sg = sk->sk_route_caps & NETIF_F_SG;
+ sg = !!(sk->sk_route_caps & NETIF_F_SG);
while (--iovlen >= 0) {
size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
+ if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
+ if (offset >= seglen) {
+ offset -= seglen;
+ continue;
+ }
+ seglen -= offset;
+ from += offset;
+ offset = 0;
+ }
while (seglen > 0) {
int copy = 0;
@@ -974,6 +1175,13 @@ new_segment:
goto wait_for_memory;
/*
+ * All packets are restored as if they have
+ * already been sent.
+ */
+ if (tp->repair)
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+ /*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
@@ -989,85 +1197,54 @@ new_segment:
copy = seglen;
/* Where to copy to? */
- if (skb_tailroom(skb) > 0) {
+ if (skb_availroom(skb) > 0) {
/* We have some space in skb head. Superb! */
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
- if ((err = skb_add_data(skb, from, copy)) != 0)
+ copy = min_t(int, copy, skb_availroom(skb));
+ err = skb_add_data_nocache(sk, skb, from, copy);
+ if (err)
goto do_fault;
} else {
- int merge = 0;
+ bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
- struct page *page = TCP_PAGE(sk);
- int off = TCP_OFF(sk);
-
- if (skb_can_coalesce(skb, i, page, off) &&
- off != PAGE_SIZE) {
- /* We can extend the last page
- * fragment. */
- merge = 1;
- } else if (i == MAX_SKB_FRAGS || !sg) {
- /* Need to add new fragment and cannot
- * do this because interface is non-SG,
- * or because all the page slots are
- * busy. */
- tcp_mark_push(tp, skb);
- goto new_segment;
- } else if (page) {
- if (off == PAGE_SIZE) {
- put_page(page);
- TCP_PAGE(sk) = page = NULL;
- off = 0;
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ if (i == MAX_SKB_FRAGS || !sg) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
}
- } else
- off = 0;
+ merge = false;
+ }
- if (copy > PAGE_SIZE - off)
- copy = PAGE_SIZE - off;
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
- if (!page) {
- /* Allocate new cache page. */
- if (!(page = sk_stream_alloc_page(sk)))
- goto wait_for_memory;
- }
-
- /* Time to copy data. We are close to
- * the end! */
- err = skb_copy_to_page(sk, from, skb, page,
- off, copy);
- if (err) {
- /* If this page was new, give it to the
- * socket so it does not get leaked.
- */
- if (!TCP_PAGE(sk)) {
- TCP_PAGE(sk) = page;
- TCP_OFF(sk) = 0;
- }
+ err = skb_copy_to_page_nocache(sk, from, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
goto do_error;
- }
/* Update the skb. */
if (merge) {
- skb_shinfo(skb)->frags[i - 1].size +=
- copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
- skb_fill_page_desc(skb, i, page, off, copy);
- if (TCP_PAGE(sk)) {
- get_page(page);
- } else if (off + copy < PAGE_SIZE) {
- get_page(page);
- TCP_PAGE(sk) = page;
- }
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
}
-
- TCP_OFF(sk) = off + copy;
+ pfrag->offset += copy;
}
if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
@@ -1078,7 +1255,7 @@ new_segment:
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
- if (skb->len < max || (flags & MSG_OOB))
+ if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
if (forced_push(tp)) {
@@ -1092,7 +1269,8 @@ wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
- tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
@@ -1103,10 +1281,10 @@ wait_for_memory:
out:
if (copied)
- tcp_push(sk, flags, mss_now, tp->nonagle);
- TCP_CHECK_TIMER(sk);
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+out_nopush:
release_sock(sk);
- return copied;
+ return copied + copied_syn;
do_fault:
if (!skb->len) {
@@ -1119,11 +1297,10 @@ do_fault:
}
do_error:
- if (copied)
+ if (copied + copied_syn)
goto out;
out_err:
err = sk_stream_error(sk, flags, err);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
}
@@ -1178,6 +1355,24 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
return -EAGAIN;
}
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{ /* Copy the payload of every skb on sk->sk_write_queue, in queue
+ * order and from offset 0 of each skb, into msg->msg_iov.
+ *
+ * Returns the total number of bytes copied, or the first non-zero
+ * value returned by skb_copy_datagram_iovec() (the walk stops there).
+ *
+ * NOTE(review): len is never read in this function, so the amount
+ * copied is not bounded by it here -- confirm the iovec copy helper
+ * or the caller enforces the limit.
+ */
+ struct sk_buff *skb;
+ int copied = 0, err = 0;
+
+ /* XXX -- need to support SO_PEEK_OFF */
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+ if (err)
+ break;
+
+ copied += skb->len;
+ }
+
+ return err ?: copied;
+}
+
/* Clean up the receive buffer for full frames taken by the user,
* then send an ACK if necessary. COPIED is the number of bytes
* tcp_recvmsg has given to the user so far, it speeds up the
@@ -1187,15 +1382,13 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
- int time_to_ack = 0;
+ bool time_to_ack = false;
-#if TCP_DEBUG
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
"cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
-#endif
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1215,7 +1408,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
!icsk->icsk_ack.pingpong)) &&
!atomic_read(&sk->sk_rmem_alloc)))
- time_to_ack = 1;
+ time_to_ack = true;
}
/* We send an ACK if we can now advertise a non-zero window
@@ -1237,7 +1430,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
* "Lots" means "at least twice" here.
*/
if (new_window && new_window >= 2 * rcv_window_now)
- time_to_ack = 1;
+ time_to_ack = true;
}
}
if (time_to_ack)
@@ -1273,12 +1466,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
return;
last_issued = tp->ucopy.dma_cookie;
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
do {
- if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+ if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
last_issued, &done,
- &used) == DMA_SUCCESS) {
+ &used) == DMA_COMPLETE) {
/* Safe to free early-copied skbs now */
__skb_queue_purge(&sk->sk_async_wait_queue);
break;
@@ -1286,7 +1479,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
struct sk_buff *skb;
while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
(dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_SUCCESS)) {
+ used) == DMA_COMPLETE)) {
__skb_dequeue(&sk->sk_async_wait_queue);
kfree_skb(skb);
}
@@ -1295,12 +1488,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
}
#endif
-static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
u32 offset;
- skb_queue_walk(&sk->sk_receive_queue, skb) {
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
offset = seq - TCP_SKB_CB(skb)->seq;
if (tcp_hdr(skb)->syn)
offset--;
@@ -1308,6 +1501,11 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
*off = offset;
return skb;
}
+ /* This looks weird, but this can happen if TCP collapsing
+ * splitted a fat GRO packet, while we released socket lock
+ * in skb_splice_bits()
+ */
+ sk_eat_skb(sk, skb, false);
}
return NULL;
}
@@ -1349,7 +1547,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
break;
}
used = recv_actor(desc, skb, offset, len);
- if (used < 0) {
+ if (used <= 0) {
if (!copied)
copied = used;
break;
@@ -1358,22 +1556,26 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
copied += used;
offset += used;
}
- /*
- * If recv_actor drops the lock (e.g. TCP splice
+ /* If recv_actor drops the lock (e.g. TCP splice
* receive) the skb pointer might be invalid when
* getting here: tcp_collapse might have deleted it
* while aggregating skbs from the socket queue.
*/
- skb = tcp_recv_skb(sk, seq-1, &offset);
- if (!skb || (offset+1 != skb->len))
+ skb = tcp_recv_skb(sk, seq - 1, &offset);
+ if (!skb)
break;
+ /* TCP coalescing might have appended data to the skb.
+ * Try to splice more frags
+ */
+ if (offset + 1 != skb->len)
+ continue;
}
if (tcp_hdr(skb)->fin) {
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
++seq;
break;
}
- sk_eat_skb(sk, skb, 0);
+ sk_eat_skb(sk, skb, false);
if (!desc->count)
break;
tp->copied_seq = seq;
@@ -1383,8 +1585,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
- if (copied > 0)
+ if (copied > 0) {
+ tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
+ }
return copied;
}
EXPORT_SYMBOL(tcp_read_sock);
@@ -1409,13 +1613,15 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
- int copied_early = 0;
+ bool copied_early = false;
struct sk_buff *skb;
u32 urg_hole = 0;
- lock_sock(sk);
+ if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+ (sk->sk_state == TCP_ESTABLISHED))
+ sk_busy_loop(sk, nonblock);
- TCP_CHECK_TIMER(sk);
+ lock_sock(sk);
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
@@ -1427,6 +1633,21 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto recv_urg;
+ if (unlikely(tp->repair)) {
+ err = -EPERM;
+ if (!(flags & MSG_PEEK))
+ goto out;
+
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ goto recv_sndq;
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out;
+
+ /* 'common' recv queue MSG_PEEK-ing */
+ }
+
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
peek_seq = tp->copied_seq;
@@ -1447,12 +1668,12 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if ((available < target) &&
(len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
!sysctl_tcp_low_latency &&
- dma_find_channel(DMA_MEMCPY)) {
- preempt_enable_no_resched();
+ net_dma_find_channel()) {
+ preempt_enable();
tp->ucopy.pinned_list =
dma_pin_iovec_pages(msg->msg_iov, len);
} else {
- preempt_enable_no_resched();
+ preempt_enable();
}
}
#endif
@@ -1588,8 +1809,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
}
#ifdef CONFIG_NET_DMA
- if (tp->ucopy.dma_chan)
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ if (tp->ucopy.dma_chan) {
+ if (tp->rcv_wnd == 0 &&
+ !skb_queue_empty(&sk->sk_async_wait_queue)) {
+ tcp_service_net_dma(sk, true);
+ tcp_cleanup_rbuf(sk, copied);
+ } else
+ dma_async_issue_pending(tp->ucopy.dma_chan);
+ }
#endif
if (copied >= target) {
/* Do not sleep, just process backlog. */
@@ -1628,9 +1855,9 @@ do_prequeue:
}
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
- if (net_ratelimit())
- printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
- current->comm, task_pid_nr(current));
+ net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
+ current->comm,
+ task_pid_nr(current));
peek_seq = tp->copied_seq;
}
continue;
@@ -1662,7 +1889,7 @@ do_prequeue:
if (!(flags & MSG_TRUNC)) {
#ifdef CONFIG_NET_DMA
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan) {
tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
@@ -1672,7 +1899,8 @@ do_prequeue:
if (tp->ucopy.dma_cookie < 0) {
- printk(KERN_ALERT "dma_cookie < 0\n");
+ pr_alert("%s: dma_cookie < 0\n",
+ __func__);
/* Exception. Bailout! */
if (!copied)
@@ -1680,10 +1908,10 @@ do_prequeue:
break;
}
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+ dma_async_issue_pending(tp->ucopy.dma_chan);
if ((offset + used) == skb->len)
- copied_early = 1;
+ copied_early = true;
} else
#endif
@@ -1717,7 +1945,7 @@ skip_copy:
goto found_fin_ok;
if (!(flags & MSG_PEEK)) {
sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
+ copied_early = false;
}
continue;
@@ -1726,7 +1954,7 @@ skip_copy:
++*seq;
if (!(flags & MSG_PEEK)) {
sk_eat_skb(sk, skb, copied_early);
- copied_early = 0;
+ copied_early = false;
}
break;
} while (len > 0);
@@ -1767,18 +1995,20 @@ skip_copy:
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return copied;
out:
- TCP_CHECK_TIMER(sk);
release_sock(sk);
return err;
recv_urg:
err = tcp_recv_urg(sk, msg, len, flags);
goto out;
+
+recv_sndq:
+ err = tcp_peek_sndq(sk, msg, len);
+ goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);
@@ -1875,6 +2105,20 @@ void tcp_shutdown(struct sock *sk, int how)
}
EXPORT_SYMBOL(tcp_shutdown);
+bool tcp_check_oom(struct sock *sk, int shift)
+{ /* Returns true when tcp_too_many_orphans(sk, shift) or
+ * tcp_out_of_memory(sk) reports a problem. Both checks are always
+ * evaluated, and each one that trips emits its own rate-limited
+ * info message before the combined result is returned.
+ */
+ bool too_many_orphans, out_of_socket_memory;
+
+ too_many_orphans = tcp_too_many_orphans(sk, shift);
+ out_of_socket_memory = tcp_out_of_memory(sk);
+
+ if (too_many_orphans)
+ net_info_ratelimited("too many orphaned sockets\n");
+ if (out_of_socket_memory)
+ net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
+ return too_many_orphans || out_of_socket_memory;
+}
+
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
@@ -1917,7 +2161,9 @@ void tcp_close(struct sock *sk, long timeout)
* advertise a zero window, then kill -9 the FTP client, wheee...
* Note: timeout is always zero in such a case.
*/
- if (data_was_unread) {
+ if (unlikely(tcp_sk(sk)->repair)) {
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
@@ -1951,6 +2197,10 @@ void tcp_close(struct sock *sk, long timeout)
* they look as CLOSING or LAST_ACK for Linux)
* Probably, I missed some more holelets.
* --ANK
+ * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+ * in a single packet! (May consider it later but will
+ * probably need API support or TCP_CORK SYN-ACK until
+ * data is written and socket is closed.)
*/
tcp_send_fin(sk);
}
@@ -1982,7 +2232,7 @@ adjudge_to_death:
/* This is a (useful) BSD violating of the RFC. There is a
* problem with TCP as specified in that the other end could
* keep a socket open forever with no application left this end.
- * We use a 3 minute timeout (about the same as BSD) then kill
+ * We use a 1 minute timeout (about the same as BSD) then kill
* our end. If they send after that then tough - BUT: long enough
* that we won't make the old 4*rto = almost no time - whoops
* reset mistake.
@@ -2014,10 +2264,7 @@ adjudge_to_death:
}
if (sk->sk_state != TCP_CLOSE) {
sk_mem_reclaim(sk);
- if (tcp_too_many_orphans(sk, 0)) {
- if (net_ratelimit())
- printk(KERN_INFO "TCP: too many of orphaned "
- "sockets\n");
+ if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(sock_net(sk),
@@ -2025,8 +2272,16 @@ adjudge_to_death:
}
}
- if (sk->sk_state == TCP_CLOSE)
+ if (sk->sk_state == TCP_CLOSE) {
+ struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ /* We could get here with a non-NULL req if the socket is
+ * aborted (e.g., closed with unread data) before 3WHS
+ * finishes.
+ */
+ if (req != NULL)
+ reqsk_fastopen_remove(sk, req, false);
inet_csk_destroy_sock(sk);
+ }
/* Otherwise, socket is reprieved until protocol close. */
out:
@@ -2038,7 +2293,7 @@ EXPORT_SYMBOL(tcp_close);
/* These states need RST on ABORT according to RFC793 */
-static inline int tcp_need_reset(int state)
+static inline bool tcp_need_reset(int state)
{
return (1 << state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
@@ -2059,6 +2314,8 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk);
+ } else if (unlikely(tp->repair)) {
+ sk->sk_err = ECONNABORTED;
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2085,7 +2342,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
- tp->srtt = 0;
+ tp->srtt_us = 0;
if ((tp->write_seq += tp->max_window + 2) == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
@@ -2094,7 +2351,6 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
- tp->bytes_acked = 0;
tp->window_clamp = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
@@ -2110,6 +2366,68 @@ int tcp_disconnect(struct sock *sk, int flags)
}
EXPORT_SYMBOL(tcp_disconnect);
+void tcp_sock_destruct(struct sock *sk)
+{
+ inet_sock_destruct(sk);
+ /* After the generic inet teardown above, release the fastopenq
+  * pointer stored in this socket's accept queue.
+  */
+ kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
+}
+/* Gate for switching TCP_REPAIR on or off: true only when ns_capable()
+ * grants CAP_NET_ADMIN in the user namespace of the socket's network
+ * namespace and the socket is in TCP_CLOSE or TCP_ESTABLISHED state.
+ */
+static inline bool tcp_can_repair_sock(const struct sock *sk)
+{
+ return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+}
+/* Apply a user-space array of struct tcp_repair_opt entries to the
+ * rx_opt state of @tp.
+ * @optbuf: user pointer to the first entry.
+ * @len: size of the user buffer in bytes; a tail shorter than one entry
+ *       is ignored.
+ * Returns 0 on success, -EFAULT when an entry cannot be copied in,
+ * -EFBIG when a window scale exceeds 14, -EINVAL when a SACK_PERM or
+ * TIMESTAMP entry carries a non-zero opt_val.  Entries whose opt_code
+ * matches no case below are skipped silently.  Entries processed before
+ * a failing one stay applied.
+ */
+static int tcp_repair_options_est(struct tcp_sock *tp,
+ struct tcp_repair_opt __user *optbuf, unsigned int len)
+{
+ struct tcp_repair_opt opt;
+
+ while (len >= sizeof(opt)) {
+ if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+ /* The entry is consumed here, before it is validated. */
+ optbuf++;
+ len -= sizeof(opt);
+
+ switch (opt.opt_code) {
+ case TCPOPT_MSS:
+ tp->rx_opt.mss_clamp = opt.opt_val;
+ break;
+ case TCPOPT_WINDOW:
+ {
+ u16 snd_wscale = opt.opt_val & 0xFFFF;
+ u16 rcv_wscale = opt.opt_val >> 16;
+ /* opt_val packs snd_wscale in its low 16 bits and
+  * rcv_wscale in the bits from 16 upwards.
+  */
+ if (snd_wscale > 14 || rcv_wscale > 14)
+ return -EFBIG;
+
+ tp->rx_opt.snd_wscale = snd_wscale;
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rx_opt.wscale_ok = 1;
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+ /* SACK_PERM carries no value; it only flips the flag. */
+ tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+ if (sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+ /* TIMESTAMP likewise only marks the option as negotiated. */
+ tp->rx_opt.tstamp_ok = 1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
/*
* Socket option code for TCP.
*/
@@ -2140,92 +2458,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
- case TCP_COOKIE_TRANSACTIONS: {
- struct tcp_cookie_transactions ctd;
- struct tcp_cookie_values *cvp = NULL;
-
- if (sizeof(ctd) > optlen)
- return -EINVAL;
- if (copy_from_user(&ctd, optval, sizeof(ctd)))
- return -EFAULT;
-
- if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
- ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
- return -EINVAL;
-
- if (ctd.tcpct_cookie_desired == 0) {
- /* default to global value */
- } else if ((0x1 & ctd.tcpct_cookie_desired) ||
- ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
- ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
- return -EINVAL;
- }
-
- if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
- /* Supercedes all other values */
- lock_sock(sk);
- if (tp->cookie_values != NULL) {
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- tp->cookie_values = NULL;
- }
- tp->rx_opt.cookie_in_always = 0; /* false */
- tp->rx_opt.cookie_out_never = 1; /* true */
- release_sock(sk);
- return err;
- }
-
- /* Allocate ancillary memory before locking.
- */
- if (ctd.tcpct_used > 0 ||
- (tp->cookie_values == NULL &&
- (sysctl_tcp_cookie_size > 0 ||
- ctd.tcpct_cookie_desired > 0 ||
- ctd.tcpct_s_data_desired > 0))) {
- cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
- GFP_KERNEL);
- if (cvp == NULL)
- return -ENOMEM;
-
- kref_init(&cvp->kref);
- }
- lock_sock(sk);
- tp->rx_opt.cookie_in_always =
- (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
- tp->rx_opt.cookie_out_never = 0; /* false */
-
- if (tp->cookie_values != NULL) {
- if (cvp != NULL) {
- /* Changed values are recorded by a changed
- * pointer, ensuring the cookie will differ,
- * without separately hashing each value later.
- */
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- } else {
- cvp = tp->cookie_values;
- }
- }
-
- if (cvp != NULL) {
- cvp->cookie_desired = ctd.tcpct_cookie_desired;
-
- if (ctd.tcpct_used > 0) {
- memcpy(cvp->s_data_payload, ctd.tcpct_value,
- ctd.tcpct_used);
- cvp->s_data_desired = ctd.tcpct_used;
- cvp->s_data_constant = 1; /* true */
- } else {
- /* No constant payload data. */
- cvp->s_data_desired = ctd.tcpct_s_data_desired;
- cvp->s_data_constant = 0; /* false */
- }
-
- tp->cookie_values = cvp;
- }
- release_sock(sk);
- return err;
- }
default:
/* fallthru */
break;
@@ -2244,7 +2476,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
/* Values greater than interface MTU won't take effect. However
* at the point when this call is done we typically don't yet
* know which interface is going to be used */
- if (val < 64 || val > MAX_TCP_WINDOW) {
+ if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
err = -EINVAL;
break;
}
@@ -2278,8 +2510,58 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK:
if (val < 0 || val > 1)
err = -EINVAL;
- else
+ else {
tp->thin_dupack = val;
+ if (tp->thin_dupack)
+ tcp_disable_early_retrans(tp);
+ }
+ break;
+
+ case TCP_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ err = -EPERM;
+ else if (val == 1) {
+ tp->repair = 1;
+ sk->sk_reuse = SK_FORCE_REUSE;
+ tp->repair_queue = TCP_NO_QUEUE;
+ } else if (val == 0) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
+ tcp_send_window_probe(sk);
+ } else
+ err = -EINVAL;
+
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (!tp->repair)
+ err = -EPERM;
+ else if (val < TCP_QUEUES_NR)
+ tp->repair_queue = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (sk->sk_state != TCP_CLOSE)
+ err = -EPERM;
+ else if (tp->repair_queue == TCP_SEND_QUEUE)
+ tp->write_seq = val;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ tp->rcv_nxt = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_REPAIR_OPTIONS:
+ if (!tp->repair)
+ err = -EINVAL;
+ else if (sk->sk_state == TCP_ESTABLISHED)
+ err = tcp_repair_options_est(tp,
+ (struct tcp_repair_opt __user *)optval,
+ optlen);
+ else
+ err = -EPERM;
break;
case TCP_CORK:
@@ -2394,7 +2676,28 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
/* Cap the max timeout in ms TCP will retry/retrans
* before giving up and aborting (ETIMEDOUT) a connection.
*/
- icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ if (val < 0)
+ err = -EINVAL;
+ else
+ icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ break;
+
+ case TCP_FASTOPEN:
+ if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+ TCPF_LISTEN)))
+ err = fastopen_init_queue(sk, val);
+ else
+ err = -EINVAL;
+ break;
+ case TCP_TIMESTAMP:
+ if (!tp->repair)
+ err = -EPERM;
+ else
+ tp->tsoffset = val - tcp_time_stamp;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
break;
default:
err = -ENOPROTOOPT;
@@ -2408,7 +2711,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
unsigned int optlen)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP)
return icsk->icsk_af_ops->setsockopt(sk, level, optname,
@@ -2430,9 +2733,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
/* Return information about state of tcp endpoint in API format. */
-void tcp_get_info(struct sock *sk, struct tcp_info *info)
+void tcp_get_info(const struct sock *sk, struct tcp_info *info)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
@@ -2454,8 +2757,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
}
- if (tp->ecn_flags&TCP_ECN_OK)
+ if (tp->ecn_flags & TCP_ECN_OK)
info->tcpi_options |= TCPI_OPT_ECN;
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+ if (tp->syn_data_acked)
+ info->tcpi_options |= TCPI_OPT_SYN_DATA;
info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
@@ -2479,8 +2786,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
- info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
- info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+ info->tcpi_rtt = tp->srtt_us >> 3;
+ info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss;
@@ -2490,6 +2797,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rcv_space = tp->rcvq_space.space;
info->tcpi_total_retrans = tp->total_retrans;
+
+ info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
+ sk->sk_pacing_rate : ~0ULL;
+ info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
+ sk->sk_max_pacing_rate : ~0ULL;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2513,6 +2825,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
+ if (tp->repair)
+ val = tp->rx_opt.mss_clamp;
break;
case TCP_NODELAY:
val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2573,41 +2887,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
- case TCP_COOKIE_TRANSACTIONS: {
- struct tcp_cookie_transactions ctd;
- struct tcp_cookie_values *cvp = tp->cookie_values;
-
- if (get_user(len, optlen))
- return -EFAULT;
- if (len < sizeof(ctd))
- return -EINVAL;
-
- memset(&ctd, 0, sizeof(ctd));
- ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
- TCP_COOKIE_IN_ALWAYS : 0)
- | (tp->rx_opt.cookie_out_never ?
- TCP_COOKIE_OUT_NEVER : 0);
-
- if (cvp != NULL) {
- ctd.tcpct_flags |= (cvp->s_data_in ?
- TCP_S_DATA_IN : 0)
- | (cvp->s_data_out ?
- TCP_S_DATA_OUT : 0);
-
- ctd.tcpct_cookie_desired = cvp->cookie_desired;
- ctd.tcpct_s_data_desired = cvp->s_data_desired;
-
- memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
- cvp->cookie_pair_size);
- ctd.tcpct_used = cvp->cookie_pair_size;
- }
-
- if (put_user(sizeof(ctd), optlen))
- return -EFAULT;
- if (copy_to_user(optval, &ctd, sizeof(ctd)))
- return -EFAULT;
- return 0;
- }
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
@@ -2615,9 +2894,43 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->thin_dupack;
break;
+ case TCP_REPAIR:
+ val = tp->repair;
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (tp->repair)
+ val = tp->repair_queue;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ val = tp->write_seq;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ val = tp->rcv_nxt;
+ else
+ return -EINVAL;
+ break;
+
case TCP_USER_TIMEOUT:
val = jiffies_to_msecs(icsk->icsk_user_timeout);
break;
+
+ case TCP_FASTOPEN:
+ if (icsk->icsk_accept_queue.fastopenq != NULL)
+ val = icsk->icsk_accept_queue.fastopenq->max_qlen;
+ else
+ val = 0;
+ break;
+
+ case TCP_TIMESTAMP:
+ val = tcp_time_stamp + tp->tsoffset;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ val = tp->notsent_lowat;
+ break;
default:
return -ENOPROTOOPT;
}
@@ -2653,313 +2966,62 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
-{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct tcphdr *th;
- unsigned thlen;
- unsigned int seq;
- __be32 delta;
- unsigned int oldlen;
- unsigned int mss;
-
- if (!pskb_may_pull(skb, sizeof(*th)))
- goto out;
-
- th = tcp_hdr(skb);
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- goto out;
-
- if (!pskb_may_pull(skb, thlen))
- goto out;
-
- oldlen = (u16)~skb->len;
- __skb_pull(skb, thlen);
-
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
-
- if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
- /* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCPV6 |
- 0) ||
- !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
- goto out;
-
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
-
- segs = NULL;
- goto out;
- }
-
- segs = skb_segment(skb, features);
- if (IS_ERR(segs))
- goto out;
-
- delta = htonl(oldlen + (thlen + mss));
-
- skb = segs;
- th = tcp_hdr(skb);
- seq = ntohl(th->seq);
-
- do {
- th->fin = th->psh = 0;
-
- th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check =
- csum_fold(csum_partial(skb_transport_header(skb),
- thlen, skb->csum));
-
- seq += mss;
- skb = skb->next;
- th = tcp_hdr(skb);
-
- th->seq = htonl(seq);
- th->cwr = 0;
- } while (skb->next);
-
- delta = htonl(oldlen + (skb->tail - skb->transport_header) +
- skb->data_len);
- th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
- if (skb->ip_summed != CHECKSUM_PARTIAL)
- th->check = csum_fold(csum_partial(skb_transport_header(skb),
- thlen, skb->csum));
-
-out:
- return segs;
-}
-EXPORT_SYMBOL(tcp_tso_segment);
-
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
-{
- struct sk_buff **pp = NULL;
- struct sk_buff *p;
- struct tcphdr *th;
- struct tcphdr *th2;
- unsigned int len;
- unsigned int thlen;
- __be32 flags;
- unsigned int mss = 1;
- unsigned int hlen;
- unsigned int off;
- int flush = 1;
- int i;
-
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*th);
- th = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- goto out;
- }
-
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- goto out;
-
- hlen = off + thlen;
- if (skb_gro_header_hard(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- goto out;
- }
-
- skb_gro_pull(skb, thlen);
-
- len = skb_gro_len(skb);
- flags = tcp_flag_word(th);
-
- for (; (p = *head); head = &p->next) {
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
-
- th2 = tcp_hdr(p);
-
- if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
-
- goto found;
- }
-
- goto out_check_final;
-
-found:
- flush = NAPI_GRO_CB(p)->flush;
- flush |= (__force int)(flags & TCP_FLAG_CWR);
- flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
- ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
- flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
- for (i = sizeof(*th); i < thlen; i += 4)
- flush |= *(u32 *)((u8 *)th + i) ^
- *(u32 *)((u8 *)th2 + i);
-
- mss = skb_shinfo(p)->gso_size;
-
- flush |= (len - 1) >= mss;
- flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
-
- if (flush || skb_gro_receive(head, skb)) {
- mss = 1;
- goto out_check_final;
- }
-
- p = *head;
- th2 = tcp_hdr(p);
- tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
-
-out_check_final:
- flush = len < mss;
- flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
- TCP_FLAG_RST | TCP_FLAG_SYN |
- TCP_FLAG_FIN));
-
- if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
- pp = head;
-
-out:
- NAPI_GRO_CB(skb)->flush |= flush;
-
- return pp;
-}
-EXPORT_SYMBOL(tcp_gro_receive);
-
-int tcp_gro_complete(struct sk_buff *skb)
-{
- struct tcphdr *th = tcp_hdr(skb);
-
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct tcphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
-
- skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
-
- if (th->cwr)
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
-
- return 0;
-}
-EXPORT_SYMBOL(tcp_gro_complete);
-
#ifdef CONFIG_TCP_MD5SIG
-static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
-static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
+static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_MUTEX(tcp_md5sig_mutex);
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
{
int cpu;
- for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
- if (p) {
- if (p->md5_desc.tfm)
- crypto_free_hash(p->md5_desc.tfm);
- kfree(p);
- }
- }
- free_percpu(pool);
-}
-void tcp_free_md5sig_pool(void)
-{
- struct tcp_md5sig_pool * __percpu *pool = NULL;
+ for_each_possible_cpu(cpu) {
+ struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
- spin_lock_bh(&tcp_md5sig_pool_lock);
- if (--tcp_md5sig_users == 0) {
- pool = tcp_md5sig_pool;
- tcp_md5sig_pool = NULL;
+ if (p->md5_desc.tfm)
+ crypto_free_hash(p->md5_desc.tfm);
}
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- if (pool)
- __tcp_free_md5sig_pool(pool);
+ free_percpu(pool);
}
-EXPORT_SYMBOL(tcp_free_md5sig_pool);
-static struct tcp_md5sig_pool * __percpu *
-__tcp_alloc_md5sig_pool(struct sock *sk)
+static void __tcp_alloc_md5sig_pool(void)
{
int cpu;
- struct tcp_md5sig_pool * __percpu *pool;
+ struct tcp_md5sig_pool __percpu *pool;
- pool = alloc_percpu(struct tcp_md5sig_pool *);
+ pool = alloc_percpu(struct tcp_md5sig_pool);
if (!pool)
- return NULL;
+ return;
for_each_possible_cpu(cpu) {
- struct tcp_md5sig_pool *p;
struct crypto_hash *hash;
- p = kzalloc(sizeof(*p), sk->sk_allocation);
- if (!p)
- goto out_free;
- *per_cpu_ptr(pool, cpu) = p;
-
hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (!hash || IS_ERR(hash))
+ if (IS_ERR_OR_NULL(hash))
goto out_free;
- p->md5_desc.tfm = hash;
+ per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
}
- return pool;
+ /* before setting tcp_md5sig_pool, we must commit all writes
+ * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+ */
+ smp_wmb();
+ tcp_md5sig_pool = pool;
+ return;
out_free:
__tcp_free_md5sig_pool(pool);
- return NULL;
}
-struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
+bool tcp_alloc_md5sig_pool(void)
{
- struct tcp_md5sig_pool * __percpu *pool;
- int alloc = 0;
-
-retry:
- spin_lock_bh(&tcp_md5sig_pool_lock);
- pool = tcp_md5sig_pool;
- if (tcp_md5sig_users++ == 0) {
- alloc = 1;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- } else if (!pool) {
- tcp_md5sig_users--;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- cpu_relax();
- goto retry;
- } else
- spin_unlock_bh(&tcp_md5sig_pool_lock);
-
- if (alloc) {
- /* we cannot hold spinlock here because this may sleep. */
- struct tcp_md5sig_pool * __percpu *p;
-
- p = __tcp_alloc_md5sig_pool(sk);
- spin_lock_bh(&tcp_md5sig_pool_lock);
- if (!p) {
- tcp_md5sig_users--;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- return NULL;
- }
- pool = tcp_md5sig_pool;
- if (pool) {
- /* oops, it has already been assigned. */
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- __tcp_free_md5sig_pool(p);
- } else {
- tcp_md5sig_pool = pool = p;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- }
+ if (unlikely(!tcp_md5sig_pool)) {
+ mutex_lock(&tcp_md5sig_mutex);
+
+ if (!tcp_md5sig_pool)
+ __tcp_alloc_md5sig_pool();
+
+ mutex_unlock(&tcp_md5sig_mutex);
}
- return pool;
+ return tcp_md5sig_pool != NULL;
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
@@ -2973,56 +3035,45 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
*/
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
- struct tcp_md5sig_pool * __percpu *p;
+ struct tcp_md5sig_pool __percpu *p;
local_bh_disable();
-
- spin_lock(&tcp_md5sig_pool_lock);
- p = tcp_md5sig_pool;
+ p = ACCESS_ONCE(tcp_md5sig_pool);
if (p)
- tcp_md5sig_users++;
- spin_unlock(&tcp_md5sig_pool_lock);
-
- if (p)
- return *this_cpu_ptr(p);
+ return __this_cpu_ptr(p);
local_bh_enable();
return NULL;
}
EXPORT_SYMBOL(tcp_get_md5sig_pool);
-void tcp_put_md5sig_pool(void)
-{
- local_bh_enable();
- tcp_free_md5sig_pool();
-}
-EXPORT_SYMBOL(tcp_put_md5sig_pool);
-
int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
- struct tcphdr *th)
+ const struct tcphdr *th)
{
struct scatterlist sg;
+ struct tcphdr hdr;
int err;
- __sum16 old_checksum = th->check;
- th->check = 0;
+ /* We are not allowed to change tcphdr, make a local copy */
+ memcpy(&hdr, th, sizeof(hdr));
+ hdr.check = 0;
+
/* options aren't included in the hash */
- sg_init_one(&sg, th, sizeof(struct tcphdr));
- err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
- th->check = old_checksum;
+ sg_init_one(&sg, &hdr, sizeof(hdr));
+ err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
return err;
}
EXPORT_SYMBOL(tcp_md5_hash_header);
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
- struct sk_buff *skb, unsigned header_len)
+ const struct sk_buff *skb, unsigned int header_len)
{
struct scatterlist sg;
const struct tcphdr *tp = tcp_hdr(skb);
struct hash_desc *desc = &hp->md5_desc;
- unsigned i;
- const unsigned head_data_len = skb_headlen(skb) > header_len ?
- skb_headlen(skb) - header_len : 0;
+ unsigned int i;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
const struct skb_shared_info *shi = skb_shinfo(skb);
struct sk_buff *frag_iter;
@@ -3034,8 +3085,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
for (i = 0; i < shi->nr_frags; ++i) {
const struct skb_frag_struct *f = &shi->frags[i];
- sg_set_page(&sg, f->page, f->size, f->page_offset);
- if (crypto_hash_update(desc, &sg, f->size))
+ unsigned int offset = f->page_offset;
+ struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+
+ sg_set_page(&sg, page, skb_frag_size(f),
+ offset_in_page(offset));
+ if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
return 1;
}
@@ -3047,7 +3102,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
}
EXPORT_SYMBOL(tcp_md5_hash_skb_data);
-int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
+int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
{
struct scatterlist sg;
@@ -3058,142 +3113,17 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
-/**
- * Each Responder maintains up to two secret values concurrently for
- * efficient secret rollover. Each secret value has 4 states:
- *
- * Generating. (tcp_secret_generating != tcp_secret_primary)
- * Generates new Responder-Cookies, but not yet used for primary
- * verification. This is a short-term state, typically lasting only
- * one round trip time (RTT).
- *
- * Primary. (tcp_secret_generating == tcp_secret_primary)
- * Used both for generation and primary verification.
- *
- * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
- * Used for verification, until the first failure that can be
- * verified by the newer Generating secret. At that time, this
- * cookie's state is changed to Secondary, and the Generating
- * cookie's state is changed to Primary. This is a short-term state,
- * typically lasting only one round trip time (RTT).
- *
- * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
- * Used for secondary verification, after primary verification
- * failures. This state lasts no more than twice the Maximum Segment
- * Lifetime (2MSL). Then, the secret is discarded.
- */
-struct tcp_cookie_secret {
- /* The secret is divided into two parts. The digest part is the
- * equivalent of previously hashing a secret and saving the state,
- * and serves as an initialization vector (IV). The message part
- * serves as the trailing secret.
- */
- u32 secrets[COOKIE_WORKSPACE_WORDS];
- unsigned long expires;
-};
-
-#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
-#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
-#define TCP_SECRET_LIFE (HZ * 600)
-
-static struct tcp_cookie_secret tcp_secret_one;
-static struct tcp_cookie_secret tcp_secret_two;
-
-/* Essentially a circular list, without dynamic allocation. */
-static struct tcp_cookie_secret *tcp_secret_generating;
-static struct tcp_cookie_secret *tcp_secret_primary;
-static struct tcp_cookie_secret *tcp_secret_retiring;
-static struct tcp_cookie_secret *tcp_secret_secondary;
-
-static DEFINE_SPINLOCK(tcp_secret_locker);
-
-/* Select a pseudo-random word in the cookie workspace.
- */
-static inline u32 tcp_cookie_work(const u32 *ws, const int n)
-{
- return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
-}
-
-/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
- * Called in softirq context.
- * Returns: 0 for success.
- */
-int tcp_cookie_generator(u32 *bakery)
-{
- unsigned long jiffy = jiffies;
-
- if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
- spin_lock_bh(&tcp_secret_locker);
- if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
- /* refreshed by another */
- memcpy(bakery,
- &tcp_secret_generating->secrets[0],
- COOKIE_WORKSPACE_WORDS);
- } else {
- /* still needs refreshing */
- get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
-
- /* The first time, paranoia assumes that the
- * randomization function isn't as strong. But,
- * this secret initialization is delayed until
- * the last possible moment (packet arrival).
- * Although that time is observable, it is
- * unpredictably variable. Mash in the most
- * volatile clock bits available, and expire the
- * secret extra quickly.
- */
- if (unlikely(tcp_secret_primary->expires ==
- tcp_secret_secondary->expires)) {
- struct timespec tv;
-
- getnstimeofday(&tv);
- bakery[COOKIE_DIGEST_WORDS+0] ^=
- (u32)tv.tv_nsec;
-
- tcp_secret_secondary->expires = jiffy
- + TCP_SECRET_1MSL
- + (0x0f & tcp_cookie_work(bakery, 0));
- } else {
- tcp_secret_secondary->expires = jiffy
- + TCP_SECRET_LIFE
- + (0xff & tcp_cookie_work(bakery, 1));
- tcp_secret_primary->expires = jiffy
- + TCP_SECRET_2MSL
- + (0x1f & tcp_cookie_work(bakery, 2));
- }
- memcpy(&tcp_secret_secondary->secrets[0],
- bakery, COOKIE_WORKSPACE_WORDS);
-
- rcu_assign_pointer(tcp_secret_generating,
- tcp_secret_secondary);
- rcu_assign_pointer(tcp_secret_retiring,
- tcp_secret_primary);
- /*
- * Neither call_rcu() nor synchronize_rcu() needed.
- * Retiring data is not freed. It is replaced after
- * further (locked) pointer updates, and a quiet time
- * (minimum 1MSL, maximum LIFE - 2MSL).
- */
- }
- spin_unlock_bh(&tcp_secret_locker);
- } else {
- rcu_read_lock_bh();
- memcpy(bakery,
- &rcu_dereference(tcp_secret_generating)->secrets[0],
- COOKIE_WORKSPACE_WORDS);
- rcu_read_unlock_bh();
- }
- return 0;
-}
-EXPORT_SYMBOL(tcp_cookie_generator);
-
void tcp_done(struct sock *sk)
{
+ struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
+ if (req != NULL)
+ reqsk_fastopen_remove(sk, req, false);
sk->sk_shutdown = SHUTDOWN_MASK;
@@ -3209,19 +3139,34 @@ extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- thash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &thash_entries);
+ if (ret)
+ return 0;
+
return 1;
}
__setup("thash_entries=", set_thash_entries);
+static void tcp_init_mem(void)
+{ /* Fill in the three sysctl_tcp_mem[] defaults from nr_free_buffer_pages(). */
+ unsigned long limit = nr_free_buffer_pages() / 8; /* one eighth of the reported count */
+ limit = max(limit, 128UL); /* never go below 128 */
+ sysctl_tcp_mem[0] = limit / 4 * 3; /* three quarters of limit; the division runs first */
+ sysctl_tcp_mem[1] = limit;
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* twice [0], i.e. roughly 1.5 * limit */
+}
+
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
- unsigned long nr_pages, limit;
- int i, max_share, cnt;
- unsigned long jiffy = jiffies;
+ unsigned long limit;
+ int max_rshare, max_wshare, cnt;
+ unsigned int i;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3241,29 +3186,28 @@ void __init tcp_init(void)
alloc_large_system_hash("TCP established",
sizeof(struct inet_ehash_bucket),
thash_entries,
- (totalram_pages >= 128 * 1024) ?
- 13 : 15,
+ 17, /* one slot per 128 KB of memory */
0,
NULL,
&tcp_hashinfo.ehash_mask,
+ 0,
thash_entries ? 0 : 512 * 1024);
- for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
- INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
- }
+
if (inet_ehash_locks_alloc(&tcp_hashinfo))
panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_mask + 1,
- (totalram_pages >= 128 * 1024) ?
- 13 : 15,
+ 17, /* one slot per 128 KB of memory */
0,
&tcp_hashinfo.bhash_size,
NULL,
+ 0,
64 * 1024);
- tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+ tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
@@ -3276,42 +3220,26 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans = cnt / 2;
sysctl_max_syn_backlog = max(128, cnt / 256);
- /* Set the pressure threshold to be a fraction of global memory that
- * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
- * memory, with a floor of 128 pages.
- */
- nr_pages = totalram_pages - totalhigh_pages;
- limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
- limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
- limit = max(limit, 128UL);
- sysctl_tcp_mem[0] = limit / 4 * 3;
- sysctl_tcp_mem[1] = limit;
- sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
-
+ tcp_init_mem();
/* Set per-socket limits to no more than 1/128 the pressure threshold */
- limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
- max_share = min(4UL*1024*1024, limit);
+ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+ max_wshare = min(4UL*1024*1024, limit);
+ max_rshare = min(6UL*1024*1024, limit);
sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_wmem[1] = 16*1024;
- sysctl_tcp_wmem[2] = max(64*1024, max_share);
+ sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_rmem[1] = 87380;
- sysctl_tcp_rmem[2] = max(87380, max_share);
+ sysctl_tcp_rmem[2] = max(87380, max_rshare);
+
+ pr_info("Hash tables configured (established %u bind %u)\n",
+ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
- printk(KERN_INFO "TCP: Hash tables configured "
- "(established %u bind %u)\n",
- tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+ tcp_metrics_init();
tcp_register_congestion_control(&tcp_reno);
- memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
- memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
- tcp_secret_one.expires = jiffy; /* past due */
- tcp_secret_two.expires = jiffy; /* past due */
- tcp_secret_generating = &tcp_secret_one;
- tcp_secret_primary = &tcp_secret_one;
- tcp_secret_retiring = &tcp_secret_two;
- tcp_secret_secondary = &tcp_secret_two;
+ tcp_tasklet_init();
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 3b53fd1af23..d5de69bc04f 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -63,7 +63,6 @@ static inline void bictcp_reset(struct bictcp *ca)
{
ca->cnt = 0;
ca->last_max_cwnd = 0;
- ca->loss_cwnd = 0;
ca->last_cwnd = 0;
ca->last_time = 0;
ca->epoch_start = 0;
@@ -72,7 +71,11 @@ static inline void bictcp_reset(struct bictcp *ca)
static void bictcp_init(struct sock *sk)
{
- bictcp_reset(inet_csk_ca(sk));
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
+
if (initial_ssthresh)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
@@ -127,7 +130,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
}
/* if in slow start or link utilization is very low */
- if (ca->loss_cwnd == 0) {
+ if (ca->last_max_cwnd == 0) {
if (ca->cnt > 20) /* increase cwnd 5% per RTT */
ca->cnt = 20;
}
@@ -137,16 +140,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
@@ -185,7 +188,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct bictcp *ca = inet_csk_ca(sk);
- return max(tp->snd_cwnd, ca->last_max_cwnd);
+ return max(tp->snd_cwnd, ca->loss_cwnd);
}
static void bictcp_state(struct sock *sk, u8 new_state)
@@ -209,7 +212,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
}
-static struct tcp_congestion_ops bictcp = {
+static struct tcp_congestion_ops bictcp __read_mostly = {
.init = bictcp_init,
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 850c737e08e..7b09d8b49fa 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,11 +1,13 @@
/*
* Plugable TCP congestion control support and newReno
* congestion control.
- * Based on ideas from I/O scheduler suport and Web100.
+ * Based on ideas from I/O scheduler support and Web100.
*
* Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
@@ -13,8 +15,6 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-int sysctl_tcp_max_ssthresh = 0;
-
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
@@ -41,18 +41,17 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
/* all algorithms must implement ssthresh and cong_avoid ops */
if (!ca->ssthresh || !ca->cong_avoid) {
- printk(KERN_ERR "TCP %s does not implement required ops\n",
- ca->name);
+ pr_err("%s does not implement required ops\n", ca->name);
return -EINVAL;
}
spin_lock(&tcp_cong_list_lock);
if (tcp_ca_find(ca->name)) {
- printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+ pr_notice("%s already registered\n", ca->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
- printk(KERN_INFO "TCP %s registered\n", ca->name);
+ pr_info("%s registered\n", ca->name);
}
spin_unlock(&tcp_cong_list_lock);
@@ -258,7 +257,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
if (!ca)
err = -ENOENT;
- else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
err = -EPERM;
else if (!try_module_get(ca->owner))
@@ -276,65 +276,24 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return err;
}
-/* RFC2861 Check whether we are limited by application or congestion window
- * This is the inverse of cwnd check in tcp_tso_should_defer
- */
-int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- u32 left;
-
- if (in_flight >= tp->snd_cwnd)
- return 1;
-
- left = tp->snd_cwnd - in_flight;
- if (sk_can_gso(sk) &&
- left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
- left * tp->mss_cache < sk->sk_gso_max_size)
- return 1;
- return left <= tcp_max_burst(tp);
-}
-EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-
-/*
- * Slow start is used when congestion window is less than slow start
- * threshold. This version implements the basic RFC2581 version
- * and optionally supports:
- * RFC3742 Limited Slow Start - growth limited to max_ssthresh
- * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend against the ACK attacks described in the RFC. Slow start processes a
+ * stretch ACK of degree N as if N acks of degree 1 are received back to back
+ * except ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-void tcp_slow_start(struct tcp_sock *tp)
+int tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- int cnt; /* increase in packets */
-
- /* RFC3465: ABC Slow start
- * Increase only after a full MSS of bytes is acked
- *
- * TCP sender SHOULD increase cwnd by the number of
- * previously unacknowledged bytes ACKed by each incoming
- * acknowledgment, provided the increase is not more than L
- */
- if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
- return;
+ u32 cwnd = tp->snd_cwnd + acked;
- if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
- cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
- else
- cnt = tp->snd_cwnd; /* exponential increase */
-
- /* RFC3465: ABC
- * We MAY increase by 2 if discovered delayed ack
- */
- if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
- cnt <<= 1;
- tp->bytes_acked = 0;
-
- tp->snd_cwnd_cnt += cnt;
- while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd_cnt -= tp->snd_cwnd;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
+ if (cwnd > tp->snd_ssthresh)
+ cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+ return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
@@ -358,30 +317,19 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In "safe" area, increase. */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
-
+ tcp_slow_start(tp, acked);
/* In dangerous area, increase slowly. */
- else if (sysctl_tcp_abc) {
- /* RFC3465: Appropriate Byte Count
- * increase once for each full cwnd acked
- */
- if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
- tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- }
- } else {
+ else
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
- }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
@@ -393,21 +341,12 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
-/* Lower bound on congestion window with halving. */
-u32 tcp_reno_min_cwnd(const struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- return tp->snd_ssthresh/2;
-}
-EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
-
struct tcp_congestion_ops tcp_reno = {
.flags = TCP_CONG_NON_RESTRICTED,
.name = "reno",
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
};
/* Initial congestion control used (until SYN)
@@ -419,6 +358,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = {
.owner = THIS_MODULE,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 71d5f2f29fa..a9bd8a4828a 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -39,7 +39,7 @@
/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
-#define HYSTART_DELAY_MIN (2U<<3)
+#define HYSTART_DELAY_MIN (4U<<3)
#define HYSTART_DELAY_MAX (16U<<3)
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
@@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1;
static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
+static int hystart_ack_delta __read_mostly = 2;
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
@@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
" 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
+module_param(hystart_ack_delta, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
/* BIC TCP Parameters */
struct bictcp {
@@ -85,17 +88,18 @@ struct bictcp {
u32 last_time; /* time when updated last_cwnd */
u32 bic_origin_point;/* origin point of bic function */
u32 bic_K; /* time to origin point from the beginning of the current epoch */
- u32 delay_min; /* min delay */
+ u32 delay_min; /* min delay (msec << 3) */
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
#define ACK_RATIO_SHIFT 4
+#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
u8 sample_cnt; /* number of samples to decide curr_rtt */
u8 found; /* the exit point is found? */
u32 round_start; /* beginning of each round */
u32 end_seq; /* end_seq of the round */
- u32 last_jiffies; /* last time when the ACK spacing is close */
+ u32 last_ack; /* last time when the ACK spacing is close */
u32 curr_rtt; /* the minimum rtt of current round */
};
@@ -103,7 +107,6 @@ static inline void bictcp_reset(struct bictcp *ca)
{
ca->cnt = 0;
ca->last_max_cwnd = 0;
- ca->loss_cwnd = 0;
ca->last_cwnd = 0;
ca->last_time = 0;
ca->bic_origin_point = 0;
@@ -116,12 +119,21 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->found = 0;
}
+static inline u32 bictcp_clock(void)
+{
+#if HZ < 1000
+ return ktime_to_ms(ktime_get_real());
+#else
+ return jiffies_to_msecs(jiffies);
+#endif
+}
+
static inline void bictcp_hystart_reset(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- ca->round_start = ca->last_jiffies = jiffies;
+ ca->round_start = ca->last_ack = bictcp_clock();
ca->end_seq = tp->snd_nxt;
ca->curr_rtt = 0;
ca->sample_cnt = 0;
@@ -129,7 +141,10 @@ static inline void bictcp_hystart_reset(struct sock *sk)
static void bictcp_init(struct sock *sk)
{
- bictcp_reset(inet_csk_ca(sk));
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
if (hystart)
bictcp_hystart_reset(sk);
@@ -191,8 +206,8 @@ static u32 cubic_root(u64 a)
*/
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
{
- u64 offs;
- u32 delta, t, bic_target, max_cnt;
+ u32 delta, bic_target, max_cnt;
+ u64 offs, t;
ca->ack_cnt++; /* count the number of ACKs */
@@ -235,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
* if the cwnd < 1 million packets !!!
*/
+ t = (s32)(tcp_time_stamp - ca->epoch_start);
+ t += msecs_to_jiffies(ca->delay_min >> 3);
/* change the unit from HZ to bictcp_HZ */
- t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start)
- << BICTCP_HZ) / HZ;
+ t <<= BICTCP_HZ;
+ do_div(t, HZ);
if (t < ca->bic_K) /* t - K */
offs = ca->bic_K - t;
@@ -258,6 +275,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 100 * cwnd; /* very small increment*/
}
+ /*
+ * The initial growth of cubic function may be too conservative
+ * when the available bandwidth is still unknown.
+ */
+ if (ca->last_max_cwnd == 0 && ca->cnt > 20)
+ ca->cnt = 20; /* increase cwnd 5% per RTT */
+
/* TCP Friendly */
if (tcp_friendliness) {
u32 scale = beta_scale;
@@ -280,18 +304,18 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
ca->cnt = 1;
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
bictcp_update(ca, tp->snd_cwnd);
tcp_cong_avoid_ai(tp, ca->cnt);
@@ -322,7 +346,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
{
struct bictcp *ca = inet_csk_ca(sk);
- return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}
static void bictcp_state(struct sock *sk, u8 new_state)
@@ -339,12 +363,12 @@ static void hystart_update(struct sock *sk, u32 delay)
struct bictcp *ca = inet_csk_ca(sk);
if (!(ca->found & hystart_detect)) {
- u32 curr_jiffies = jiffies;
+ u32 now = bictcp_clock();
/* first detection parameter - ack-train detection */
- if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) {
- ca->last_jiffies = curr_jiffies;
- if (curr_jiffies - ca->round_start >= ca->delay_min>>4)
+ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+ ca->last_ack = now;
+ if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
ca->found |= HYSTART_ACK_TRAIN;
}
@@ -379,8 +403,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
u32 delay;
if (icsk->icsk_ca_state == TCP_CA_Open) {
- cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
- ca->delayed_ack += cnt;
+ u32 ratio = ca->delayed_ack;
+
+ ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+ ratio += cnt;
+
+ ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
}
/* Some calls are for duplicates without timetamps */
@@ -388,10 +416,10 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
return;
/* Discard delay samples right after fast recovery */
- if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+ if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
return;
- delay = usecs_to_jiffies(rtt_us) << 3;
+ delay = (rtt_us << 3) / USEC_PER_MSEC;
if (delay == 0)
delay = 1;
@@ -405,7 +433,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
hystart_update(sk, delay);
}
-static struct tcp_congestion_ops cubictcp = {
+static struct tcp_congestion_ops cubictcp __read_mostly = {
.init = bictcp_init,
.ssthresh = bictcp_recalc_ssthresh,
.cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 939edb3b8e4..ed3f2ad42e0 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -34,11 +34,23 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
tcp_get_info(sk, info);
}
+static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
+}
+
+static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
+}
+
static const struct inet_diag_handler tcp_diag_handler = {
- .idiag_hashinfo = &tcp_hashinfo,
+ .dump = tcp_diag_dump,
+ .dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
- .idiag_type = TCPDIAG_GETSOCK,
- .idiag_info_size = sizeof(struct tcp_info),
+ .idiag_type = IPPROTO_TCP,
};
static int __init tcp_diag_init(void)
@@ -54,4 +66,4 @@ static void __exit tcp_diag_exit(void)
module_init(tcp_diag_init);
module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 00000000000..9771563ab56
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,295 @@
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+void tcp_fastopen_init_key_once(bool publish)
+{
+ static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+ /* tcp_fastopen_reset_cipher publishes the new context
+ * atomically, so we allow this race to happen here.
+ *
+ * All call sites of tcp_fastopen_cookie_gen also check
+ * for a valid cookie, so this is an acceptable risk.
+ */
+ if (net_get_random_once(key, sizeof(key)) && publish)
+ tcp_fastopen_reset_cipher(key, sizeof(key));
+}
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+ struct tcp_fastopen_context *ctx =
+ container_of(head, struct tcp_fastopen_context, rcu);
+ crypto_free_cipher(ctx->tfm);
+ kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+ int err;
+ struct tcp_fastopen_context *ctx, *octx;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+ if (IS_ERR(ctx->tfm)) {
+ err = PTR_ERR(ctx->tfm);
+error: kfree(ctx);
+ pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+ return err;
+ }
+ err = crypto_cipher_setkey(ctx->tfm, key, len);
+ if (err) {
+ pr_err("TCP: TFO cipher key error: %d\n", err);
+ crypto_free_cipher(ctx->tfm);
+ goto error;
+ }
+ memcpy(ctx->key, key, len);
+
+ spin_lock(&tcp_fastopen_ctx_lock);
+
+ octx = rcu_dereference_protected(tcp_fastopen_ctx,
+ lockdep_is_held(&tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+ spin_unlock(&tcp_fastopen_ctx_lock);
+
+ if (octx)
+ call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+ return err;
+}
+
+static bool __tcp_fastopen_cookie_gen(const void *path,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_fastopen_context *ctx;
+ bool ok = false;
+
+ tcp_fastopen_init_key_once(true);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(tcp_fastopen_ctx);
+ if (ctx) {
+ crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ ok = true;
+ }
+ rcu_read_unlock();
+ return ok;
+}
+
+/* Generate the fastopen cookie by doing aes128 encryption on both
+ * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
+ * addresses. For the longer IPv6 addresses use CBC-MAC.
+ *
+ * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
+ */
+static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ if (req->rsk_ops->family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(syn);
+
+ __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
+ return __tcp_fastopen_cookie_gen(path, foc);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (req->rsk_ops->family == AF_INET6) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(syn);
+ struct tcp_fastopen_cookie tmp;
+
+ if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+ struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ int i = 4;
+
+ for (i = 0; i < 4; i++)
+ buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
+ return __tcp_fastopen_cookie_gen(buf, foc);
+ }
+ }
+#endif
+ return false;
+}
+
+static bool tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct tcp_sock *tp;
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+ struct sock *child;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
+
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ return false;
+
+ spin_lock(&queue->fastopenq->lock);
+ queue->fastopenq->qlen++;
+ spin_unlock(&queue->fastopenq->lock);
+
+ /* Initialize the child socket. Have to fix some values to take
+ * into account the child is a Fast Open socket and is created
+ * only out of the bits carried in the SYN packet.
+ */
+ tp = tcp_sk(child);
+
+ tp->fastopen_rsk = req;
+ /* Do a hold on the listener sk so that if the listener is being
+ * closed, the child that has been accepted can live on and still
+ * access listen_lock.
+ */
+ sock_hold(sk);
+ tcp_rsk(req)->listener = sk;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never
+ * scaled. So correct it appropriately.
+ */
+ tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+ /* Activate the retrans timer so that SYNACK can be retransmitted.
+ * The request socket is not added to the SYN table of the parent
+ * because it's been added to the accept queue directly.
+ */
+ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, child);
+
+ /* Now finish processing the fastopen child socket. */
+ inet_csk(child)->icsk_af_ops->rebuild_header(child);
+ tcp_init_congestion_control(child);
+ tcp_mtup_init(child);
+ tcp_init_metrics(child);
+ tcp_init_buffer_space(child);
+
+ /* Queue the data carried in the SYN packet. We need to first
+ * bump skb's refcnt because the caller will attempt to free it.
+ *
+ * XXX (TFO) - we honor a zero-payload TFO request for now,
+ * (any reason not to?) but no need to queue the skb since
+ * there is no data. How about SYN+FIN?
+ */
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) {
+ skb = skb_get(skb);
+ skb_dst_drop(skb);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+ skb_set_owner_r(skb, child);
+ __skb_queue_tail(&child->sk_receive_queue, skb);
+ tp->syn_data_acked = 1;
+ }
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(child);
+ sock_put(child);
+ WARN_ON(req->sk == NULL);
+ return true;
+}
+EXPORT_SYMBOL(tcp_fastopen_create_child);
+
+static bool tcp_fastopen_queue_check(struct sock *sk)
+{
+ struct fastopen_queue *fastopenq;
+
+ /* Make sure the listener has enabled fastopen, and we don't
+ * exceed the max # of pending TFO requests allowed before trying
+ * to validate the cookie in order to avoid burning CPU cycles
+ * unnecessarily.
+ *
+ * XXX (TFO) - The implication of checking the max_qlen before
+ * processing a cookie request is that clients can't differentiate
+ * between qlen overflow causing Fast Open to be disabled
+ * temporarily vs a server not supporting Fast Open at all.
+ */
+ fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (fastopenq == NULL || fastopenq->max_qlen == 0)
+ return false;
+
+ if (fastopenq->qlen >= fastopenq->max_qlen) {
+ struct request_sock *req1;
+ spin_lock(&fastopenq->lock);
+ req1 = fastopenq->rskq_rst_head;
+ if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+ spin_unlock(&fastopenq->lock);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+ return false;
+ }
+ fastopenq->rskq_rst_head = req1->dl_next;
+ fastopenq->qlen--;
+ spin_unlock(&fastopenq->lock);
+ reqsk_free(req1);
+ }
+ return true;
+}
+
+/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
+ * may be updated and returned to the client in the SYN-ACK later. E.g.,
+ * Fast Open cookie request (foc->len == 0).
+ */
+bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
+{
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+ bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+
+ if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ (syn_data || foc->len >= 0) &&
+ tcp_fastopen_queue_check(sk))) {
+ foc->len = -1;
+ return false;
+ }
+
+ if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+ goto fastopen;
+
+ if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+ foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
+ foc->len == valid_foc.len &&
+ !memcmp(foc->val, valid_foc.val, foc->len)) {
+ /* Cookie is valid. Create a (full) child socket to accept
+ * the data in SYN before returning a SYN-ACK to ack the
+ * data. If we fail to create the socket, fall back and
+ * ack the ISN only but include the same cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to send
+ * data in SYN_RECV state.
+ */
+fastopen:
+ if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ foc->len = -1;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ return true;
+ }
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), foc->len ?
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
+ LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+ *foc = valid_foc;
+ return false;
+}
+EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b6caaf75bb..1c4908280d9 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -109,16 +109,16 @@ static void hstcp_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* Update AIMD parameters.
*
@@ -158,11 +158,10 @@ static u32 hstcp_ssthresh(struct sock *sk)
}
-static struct tcp_congestion_ops tcp_highspeed = {
+static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
.init = hstcp_init,
.ssthresh = hstcp_ssthresh,
.cong_avoid = hstcp_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.owner = THIS_MODULE,
.name = "highspeed"
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 7c94a495541..031361311a8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -227,16 +227,16 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
}
-static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
}
}
-static struct tcp_congestion_ops htcp = {
+static struct tcp_congestion_ops htcp __read_mostly = {
.init = htcp_init,
.ssthresh = htcp_recalc_ssthresh,
.cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 377bc934937..d8f8f05a495 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -15,17 +15,16 @@
/* Tcp Hybla structure. */
struct hybla {
- u8 hybla_en;
+ bool hybla_en;
u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
u32 rho; /* Rho parameter, integer part */
u32 rho2; /* Rho * Rho, integer part */
u32 rho_3ls; /* Rho parameter, <<3 */
u32 rho2_7ls; /* Rho^2, <<7 */
- u32 minrtt; /* Minimum smoothed round trip time value seen */
+ u32 minrtt_us; /* Minimum smoothed round trip time value seen */
};
-/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
- expressed in jiffies */
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
static int rtt0 = 25;
module_param(rtt0, int, 0644);
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
@@ -36,10 +35,12 @@ static inline void hybla_recalc_param (struct sock *sk)
{
struct hybla *ca = inet_csk_ca(sk);
- ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
+ ca->rho_3ls = max_t(u32,
+ tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
+ 8U);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
- ca->rho2 = ca->rho2_7ls >>7;
+ ca->rho2 = ca->rho2_7ls >> 7;
}
static void hybla_init(struct sock *sk)
@@ -52,7 +53,7 @@ static void hybla_init(struct sock *sk)
ca->rho_3ls = 0;
ca->rho2_7ls = 0;
ca->snd_cwnd_cents = 0;
- ca->hybla_en = 1;
+ ca->hybla_en = true;
tp->snd_cwnd = 2;
tp->snd_cwnd_clamp = 65535;
@@ -60,13 +61,14 @@ static void hybla_init(struct sock *sk)
hybla_recalc_param(sk);
/* set minimum rtt as this is the 1st ever seen */
- ca->minrtt = tp->srtt;
+ ca->minrtt_us = tp->srtt_us;
tp->snd_cwnd = ca->rho;
}
static void hybla_state(struct sock *sk, u8 ca_state)
{
struct hybla *ca = inet_csk_ca(sk);
+
ca->hybla_en = (ca_state == TCP_CA_Open);
}
@@ -85,7 +87,7 @@ static inline u32 hybla_fraction(u32 odds)
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
-static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
@@ -93,16 +95,16 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
int is_slowstart = 0;
/* Recalculate rho only if this srtt is the lowest */
- if (tp->srtt < ca->minrtt){
+ if (tp->srtt_us < ca->minrtt_us) {
hybla_recalc_param(sk);
- ca->minrtt = tp->srtt;
+ ca->minrtt_us = tp->srtt_us;
}
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (!ca->hybla_en) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
@@ -162,10 +164,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
}
-static struct tcp_congestion_ops tcp_hybla = {
+static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
- .min_cwnd = tcp_reno_min_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 00ca688d896..5999b3972e6 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -23,7 +23,6 @@
#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */
#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */
#define ALPHA_BASE ALPHA_SCALE /* 1.0 */
-#define U32_MAX ((u32)~0U)
#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */
#define BETA_SHIFT 6
@@ -256,7 +255,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state)
/*
* Increase window in response to successful acknowledgment.
*/
-static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
@@ -265,12 +264,12 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
update_params(sk);
/* RFC2861 only increase cwnd if fully utilized */
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* In slow start */
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else {
u32 delta;
@@ -313,20 +312,20 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
.tcpv_rttcnt = ca->cnt_rtt,
.tcpv_minrtt = ca->base_rtt,
};
- u64 t = ca->sum_rtt;
- do_div(t, ca->cnt_rtt);
- info.tcpv_rtt = t;
+ if (info.tcpv_rttcnt > 0) {
+ u64 t = ca->sum_rtt;
+ do_div(t, info.tcpv_rttcnt);
+ info.tcpv_rtt = t;
+ }
nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
}
}
-static struct tcp_congestion_ops tcp_illinois = {
- .flags = TCP_CONG_RTT_STAMP,
+static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
- .min_cwnd = tcp_reno_min_cwnd,
.cong_avoid = tcp_illinois_cong_avoid,
.set_state = tcp_illinois_state,
.get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6d8ab1c4efc..40639c288dc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,6 +61,8 @@
* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
@@ -79,24 +81,23 @@ int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
EXPORT_SYMBOL(sysctl_tcp_reordering);
-int sysctl_tcp_ecn __read_mostly = 2;
-EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 2;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challenge_ack_limit = 100;
+
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
-int sysctl_tcp_frto_response __read_mostly;
-int sysctl_tcp_nometrics_save __read_mostly;
int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_early_retrans __read_mostly = 3;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -105,19 +106,17 @@ int sysctl_tcp_abc __read_mostly;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
-#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
-#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
-#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
-#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
@@ -174,7 +173,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
static void tcp_incr_quickack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+ unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
@@ -194,9 +193,10 @@ static void tcp_enter_quickack_mode(struct sock *sk)
* and the session is not interactive.
*/
-static inline int tcp_in_quickack_mode(const struct sock *sk)
+static inline bool tcp_in_quickack_mode(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
@@ -206,7 +206,7 @@ static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
-static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
@@ -217,36 +217,49 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
- if (tp->ecn_flags & TCP_ECN_OK) {
- if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ if (!(tp->ecn_flags & TCP_ECN_OK))
+ return;
+
+ switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
- * it is surely retransmit. It is not in ECN RFC,
- * but Linux follows this rule. */
- else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
+ * and we already seen ECT on a previous segment,
+ * it is probably a retransmit.
+ */
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ tcp_enter_quickack_mode((struct sock *)tp);
+ break;
+ case INET_ECN_CE:
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ /* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode((struct sock *)tp);
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ }
+ /* fallinto */
+ default:
+ tp->ecn_flags |= TCP_ECN_SEEN;
}
}
-static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
+static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
+static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
+static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
- return 1;
- return 0;
+ return true;
+ return false;
}
/* Buffer size and advertised window tuning.
@@ -254,16 +267,33 @@ static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
*/
-static void tcp_fixup_sndbuf(struct sock *sk)
+static void tcp_sndbuf_expand(struct sock *sk)
{
- int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
- sizeof(struct sk_buff);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int sndmem, per_mss;
+ u32 nr_segs;
- if (sk->sk_sndbuf < 3 * sndmem) {
- sk->sk_sndbuf = 3 * sndmem;
- if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
- sk->sk_sndbuf = sysctl_tcp_wmem[2];
- }
+ /* Worst case is non GSO/TSO : each frame consumes one skb
+ * and skb->head is kmalloced using power of two area of memory
+ */
+ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+ MAX_TCP_HEADER +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ per_mss = roundup_pow_of_two(per_mss) +
+ SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+ /* Fast Recovery (RFC 5681 3.2) :
+ * Cubic needs 1.7 factor, rounded to 2 to include
+ * extra cushion (application might react slowly to POLLOUT)
+ */
+ sndmem = 2 * nr_segs * per_mss;
+
+ if (sk->sk_sndbuf < sndmem)
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -309,14 +339,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
return 0;
}
-static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
- !tcp_memory_pressure) {
+ !sk_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -328,6 +358,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
incr = __tcp_grow_window(sk, skb);
if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
tp->window_clamp);
inet_csk(sk)->icsk_ack.quick |= 1;
@@ -336,26 +367,28 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
}
/* 3. Tuning rcvbuf, when connection enters established state. */
-
static void tcp_fixup_rcvbuf(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+ u32 mss = tcp_sk(sk)->advmss;
+ int rcvmem;
+
+ rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
+ tcp_default_init_rwnd(mss);
- /* Try to select rcvbuf so that 4 mss-sized segments
- * will fit to window and corresponding skbs will fit to our rcvbuf.
- * (was 3; 4 is minimum to allow fast retransmit to work.)
+ /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+ * Allow enough cushion so that sender is not limited by our window
*/
- while (tcp_win_from_space(rcvmem) < tp->advmss)
- rcvmem += 128;
- if (sk->sk_rcvbuf < 4 * rcvmem)
- sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
+ if (sysctl_tcp_moderate_rcvbuf)
+ rcvmem <<= 2;
+
+ if (sk->sk_rcvbuf < rcvmem)
+ sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}
/* 4. Try to fixup all. It is made immediately after connection enters
* established state.
*/
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
@@ -363,9 +396,11 @@ static void tcp_init_buffer_space(struct sock *sk)
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
- tcp_fixup_sndbuf(sk);
+ tcp_sndbuf_expand(sk);
tp->rcvq_space.space = tp->rcv_wnd;
+ tp->rcvq_space.time = tcp_time_stamp;
+ tp->rcvq_space.seq = tp->copied_seq;
maxwin = tcp_full_space(sk);
@@ -398,8 +433,8 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !tcp_memory_pressure &&
- atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ !sk_under_memory_pressure(sk) &&
+ sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
}
@@ -416,7 +451,7 @@ static void tcp_clamp_window(struct sock *sk)
*/
void tcp_initialize_rcv_mss(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd / 2);
@@ -460,8 +495,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
if (!win_dep) {
m -= (new_sample >> 3);
new_sample += m;
- } else if (m < new_sample)
- new_sample = m << 3;
+ } else {
+ m <<= 3;
+ if (m < new_sample)
+ new_sample = m;
+ }
} else {
/* No previous measure. */
new_sample = m << 3;
@@ -477,7 +515,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
- tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
@@ -502,49 +540,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int time;
- int space;
-
- if (tp->rcvq_space.time == 0)
- goto new_measure;
+ int copied;
time = tcp_time_stamp - tp->rcvq_space.time;
if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
return;
- space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+ /* Number of bytes copied to user in last RTT */
+ copied = tp->copied_seq - tp->rcvq_space.seq;
+ if (copied <= tp->rcvq_space.space)
+ goto new_measure;
+
+ /* A bit of theory :
+ * copied = bytes received in previous RTT, our base window
+ * To cope with packet losses, we need a 2x factor
+ * To cope with slow start, and sender growing its cwin by 100 %
+ * every RTT, we need a 4x factor, because the ACK we are sending
+ * now is for the next RTT, not the current one :
+ * <prev RTT . ><current RTT .. ><next RTT .... >
+ */
+
+ if (sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvwin, rcvmem, rcvbuf;
- space = max(tp->rcvq_space.space, space);
+ /* minimal window to cope with packet losses, assuming
+ * steady state. Add some cushion because of small variations.
+ */
+ rcvwin = (copied << 1) + 16 * tp->advmss;
- if (tp->rcvq_space.space != space) {
- int rcvmem;
+ /* If rate increased by 25%,
+ * assume slow start, rcvwin = 3 * copied
+ * If rate increased by 50%,
+ * assume sender can use 2x growth, rcvwin = 4 * copied
+ */
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+ rcvwin <<= 1;
+ else
+ rcvwin += (rcvwin >> 1);
+ }
- tp->rcvq_space.space = space;
+ rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(rcvmem) < tp->advmss)
+ rcvmem += 128;
- if (sysctl_tcp_moderate_rcvbuf &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int new_clamp = space;
+ rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ sk->sk_rcvbuf = rcvbuf;
- /* Receive space grows, normalize in order to
- * take into account packet headers and sk_buff
- * structure overhead.
- */
- space /= tp->advmss;
- if (!space)
- space = 1;
- rcvmem = (tp->advmss + MAX_TCP_HEADER +
- 16 + sizeof(struct sk_buff));
- while (tcp_win_from_space(rcvmem) < tp->advmss)
- rcvmem += 128;
- space *= rcvmem;
- space = min(space, sysctl_tcp_rmem[2]);
- if (space > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = space;
-
- /* Make the window clamp follow along. */
- tp->window_clamp = new_clamp;
- }
+ /* Make the window clamp follow along. */
+ tp->window_clamp = rcvwin;
}
}
+ tp->rcvq_space.space = copied;
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
@@ -616,10 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
- long m = mrtt; /* RTT */
+ long m = mrtt_us; /* RTT */
+ u32 srtt = tp->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
@@ -637,14 +689,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
*/
- if (m == 0)
- m = 1;
- if (tp->srtt != 0) {
- m -= (tp->srtt >> 3); /* m is now error in rtt est */
- tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (srtt != 0) {
+ m -= (srtt >> 3); /* m is now error in rtt est */
+ srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
- m -= (tp->mdev >> 2); /* similar update on mdev */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
@@ -656,33 +706,62 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
if (m > 0)
m >>= 3;
} else {
- m -= (tp->mdev >> 2); /* similar update on mdev */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
}
- tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
- if (tp->mdev > tp->mdev_max) {
- tp->mdev_max = tp->mdev;
- if (tp->mdev_max > tp->rttvar)
- tp->rttvar = tp->mdev_max;
+ tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (tp->mdev_us > tp->mdev_max_us) {
+ tp->mdev_max_us = tp->mdev_us;
+ if (tp->mdev_max_us > tp->rttvar_us)
+ tp->rttvar_us = tp->mdev_max_us;
}
if (after(tp->snd_una, tp->rtt_seq)) {
- if (tp->mdev_max < tp->rttvar)
- tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
+ if (tp->mdev_max_us < tp->rttvar_us)
+ tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
- tp->mdev_max = tcp_rto_min(sk);
+ tp->mdev_max_us = tcp_rto_min_us(sk);
}
} else {
/* no previous measure. */
- tp->srtt = m << 3; /* take the measured time to be rtt */
- tp->mdev = m << 1; /* make sure rto = 3*rtt */
- tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+ srtt = m << 3; /* take the measured time to be rtt */
+ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+ tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
}
+ tp->srtt_us = max(1U, srtt);
+}
+
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u64 rate;
+
+ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+ rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+
+ rate *= max(tp->snd_cwnd, tp->packets_out);
+
+ if (likely(tp->srtt_us))
+ do_div(rate, tp->srtt_us);
+
+ /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+ * without any lock. We want to make sure compiler wont store
+ * intermediate values in this location.
+ */
+ ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+ sk->sk_max_pacing_rate);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct sock *sk)
+static void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -709,228 +788,31 @@ static inline void tcp_set_rto(struct sock *sk)
tcp_bound_rto(sk);
}
-/* Save metrics learned by this TCP session.
- This function is called only, when TCP finishes successfully
- i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
- */
-void tcp_update_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (sysctl_tcp_nometrics_save)
- return;
-
- dst_confirm(dst);
-
- if (dst && (dst->flags & DST_HOST)) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int m;
- unsigned long rtt;
-
- if (icsk->icsk_backoff || !tp->srtt) {
- /* This session failed to estimate rtt. Why?
- * Probably, no packets returned in time.
- * Reset our results.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT)))
- dst->metrics[RTAX_RTT - 1] = 0;
- return;
- }
-
- rtt = dst_metric_rtt(dst, RTAX_RTT);
- m = rtt - tp->srtt;
-
- /* If newly calculated rtt larger than stored one,
- * store new one. Otherwise, use EWMA. Remember,
- * rtt overestimation is always better than underestimation.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT))) {
- if (m <= 0)
- set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
- else
- set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
- }
-
- if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
- unsigned long var;
- if (m < 0)
- m = -m;
-
- /* Scale deviation to rttvar fixed point */
- m >>= 1;
- if (m < tp->mdev)
- m = tp->mdev;
-
- var = dst_metric_rtt(dst, RTAX_RTTVAR);
- if (m >= var)
- var = m;
- else
- var -= (var - m) >> 2;
-
- set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
- }
-
- if (tcp_in_initial_slowstart(tp)) {
- /* Slow start still did not finish. */
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
- if (!dst_metric_locked(dst, RTAX_CWND) &&
- tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd;
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
- icsk->icsk_ca_state == TCP_CA_Open) {
- /* Cong. avoidance phase, cwnd is reliable. */
- if (!dst_metric_locked(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] =
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1;
- } else {
- /* Else slow start did not finish, cwnd is non-sense,
- ssthresh may be also invalid.
- */
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1;
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
- dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
- }
-
- if (!dst_metric_locked(dst, RTAX_REORDERING)) {
- if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
- tp->reordering != sysctl_tcp_reordering)
- dst->metrics[RTAX_REORDERING-1] = tp->reordering;
- }
- }
-}
-
-__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
+__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd)
- cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
+ cwnd = TCP_INIT_CWND;
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
-/* Set slow start threshold and cwnd not falling to slow start */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- tp->prior_ssthresh = 0;
- tp->bytes_acked = 0;
- if (icsk->icsk_ca_state < TCP_CA_CWR) {
- tp->undo_marker = 0;
- if (set_ssthresh)
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + 1U);
- tp->snd_cwnd_cnt = 0;
- tp->high_seq = tp->snd_nxt;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- TCP_ECN_queue_cwr(tp);
-
- tcp_set_ca_state(sk, TCP_CA_CWR);
- }
-}
-
/*
* Packet counting of FACK is based on in-order assumptions, therefore TCP
* disables it when reordering is detected
*/
-static void tcp_disable_fack(struct tcp_sock *tp)
+void tcp_disable_fack(struct tcp_sock *tp)
{
/* RFC3517 uses different metric in lost marker => reset on change */
if (tcp_is_fack(tp))
tp->lost_skb_hint = NULL;
- tp->rx_opt.sack_ok &= ~2;
+ tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
}
/* Take a notice that peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
- tp->rx_opt.sack_ok |= 4;
-}
-
-/* Initialize metrics on socket. */
-
-static void tcp_init_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst == NULL)
- goto reset;
-
- dst_confirm(dst);
-
- if (dst_metric_locked(dst, RTAX_CWND))
- tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
- if (dst_metric(dst, RTAX_SSTHRESH)) {
- tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- }
- if (dst_metric(dst, RTAX_REORDERING) &&
- tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
- tcp_disable_fack(tp);
- tp->reordering = dst_metric(dst, RTAX_REORDERING);
- }
-
- if (dst_metric(dst, RTAX_RTT) == 0)
- goto reset;
-
- if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
- goto reset;
-
- /* Initial rtt is determined from SYN,SYN-ACK.
- * The segment is small and rtt may appear much
- * less than real one. Use per-dst memory
- * to make it more realistic.
- *
- * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal circumstances sending small
- * packets force peer to delay ACKs and calculation is correct too.
- * The algorithm is adaptive and, provided we follow specs, it
- * NEVER underestimate RTT. BUT! If peer tries to make some clever
- * tricks sort of "quick acks" for time long enough to decrease RTT
- * to low value, and then abruptly stops to do it and starts to delay
- * ACKs, wait for troubles.
- */
- if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
- tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
- tp->rtt_seq = tp->snd_nxt;
- }
- if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
- tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
- tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
- }
- tcp_set_rto(sk);
- if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
- goto reset;
-
-cwnd:
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
- tp->snd_cwnd_stamp = tcp_time_stamp;
- return;
-
-reset:
- /* Play conservative. If timestamps are not
- * supported, TCP will fail to recalculate correct
- * rtt, if initial rto is too small. FORGET ALL AND RESET!
- */
- if (!tp->rx_opt.saw_tstamp && tp->srtt) {
- tp->srtt = 0;
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
- }
- goto cwnd;
+ tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
}
static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -954,15 +836,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
- printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
- tp->reordering,
- tp->fackets_out,
- tp->sacked_out,
- tp->undo_marker ? tp->undo_retrans : 0);
+ pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+ tp->reordering,
+ tp->fackets_out,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
#endif
tcp_disable_fack(tp);
}
+
+ if (metric > 0)
+ tcp_disable_early_retrans(tp);
}
/* This must be called before lost_out is incremented */
@@ -1020,13 +905,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
* These 6 states form finite state machine, controlled by the following events:
* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
- * 3. Loss detection event of one of three flavors:
+ * 3. Loss detection event of two flavors:
* A. Scoreboard estimator decided the packet is lost.
* A'. Reno "three dupacks" marks head of queue lost.
- * A''. Its FACK modfication, head until snd.fack is lost.
- * B. SACK arrives sacking data transmitted after never retransmitted
- * hole was sent out.
- * C. SACK arrives sacking SND.NXT at the moment, when the
+ * A''. Its FACK modification, head until snd.fack is lost.
+ * B. SACK arrives sacking SND.NXT at the moment, when the
* segment was retransmitted.
* 4. D-SACK added new rule: D-SACK changes any tag to S.
*
@@ -1095,36 +978,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
* the exact amount is rather hard to quantify. However, tp->max_window can
* be used as an exaggerated estimate.
*/
-static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
- u32 start_seq, u32 end_seq)
+static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
+ u32 start_seq, u32 end_seq)
{
/* Too far in future, or reversed (interpretation is ambiguous) */
if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
- return 0;
+ return false;
/* Nasty start_seq wrap-around check (see comments above) */
if (!before(start_seq, tp->snd_nxt))
- return 0;
+ return false;
/* In outstanding window? ...This is valid exit for D-SACKs too.
* start_seq == snd_una is non-sensical (see comments above)
*/
if (after(start_seq, tp->snd_una))
- return 1;
+ return true;
if (!is_dsack || !tp->undo_marker)
- return 0;
+ return false;
/* ...Then it's D-SACK, and must reside below snd_una completely */
- if (!after(end_seq, tp->snd_una))
- return 0;
+ if (after(end_seq, tp->snd_una))
+ return false;
if (!before(start_seq, tp->undo_marker))
- return 1;
+ return true;
/* Too old */
if (!after(end_seq, tp->undo_marker))
- return 0;
+ return false;
/* Undo_marker boundary crossing (overestimates a lot). Known already:
* start_seq < undo_marker and end_seq >= undo_marker.
@@ -1133,7 +1016,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
}
/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "C". Later note: FACK people cheated me again 8), we have to account
+ * Event "B". Later note: FACK people cheated me again 8), we have to account
* for reordering! Ugly, but should help.
*
* Search retransmitted skbs from write_queue that were sent when snd_nxt was
@@ -1196,17 +1079,17 @@ static void tcp_mark_lost_retrans(struct sock *sk)
tp->lost_retrans_low = new_low_seq;
}
-static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
- struct tcp_sack_block_wire *sp, int num_sacks,
- u32 prior_snd_una)
+static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
+ struct tcp_sack_block_wire *sp, int num_sacks,
+ u32 prior_snd_una)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
- int dup_sack = 0;
+ bool dup_sack = false;
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
- dup_sack = 1;
+ dup_sack = true;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
@@ -1215,7 +1098,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
if (!after(end_seq_0, end_seq_1) &&
!before(start_seq_0, start_seq_1)) {
- dup_sack = 1;
+ dup_sack = true;
tcp_dsack_seen(tp);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
@@ -1223,7 +1106,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
}
/* D-SACK for already forgotten data... Do dumb counting. */
- if (dup_sack &&
+ if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
tp->undo_retrans--;
@@ -1232,9 +1115,10 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
}
struct tcp_sacktag_state {
- int reord;
- int fack_count;
- int flag;
+ int reord;
+ int fack_count;
+ long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ int flag;
};
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1246,9 +1130,10 @@ struct tcp_sacktag_state {
* FIXME: this could be merged to shift decision code
*/
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
- u32 start_seq, u32 end_seq)
+ u32 start_seq, u32 end_seq)
{
- int in_sack, err;
+ int err;
+ bool in_sack;
unsigned int pkt_len;
unsigned int mss;
@@ -1277,12 +1162,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
unsigned int new_len = (pkt_len / mss) * mss;
if (!in_sack && new_len < pkt_len) {
new_len += mss;
- if (new_len > skb->len)
+ if (new_len >= skb->len)
return 0;
}
pkt_len = new_len;
}
- err = tcp_fragment(sk, skb, pkt_len, mss);
+ err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
}
@@ -1290,24 +1175,27 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
return in_sack;
}
-static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
- struct tcp_sacktag_state *state,
- int dup_sack, int pcount)
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+ struct tcp_sacktag_state *state, u8 sacked,
+ u32 start_seq, u32 end_seq,
+ int dup_sack, int pcount,
+ const struct skb_mstamp *xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
- u8 sacked = TCP_SKB_CB(skb)->sacked;
int fack_count = state->fack_count;
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
- if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
+ after(end_seq, tp->undo_marker))
tp->undo_retrans--;
if (sacked & TCPCB_SACKED_ACKED)
state->reord = min(fack_count, state->reord);
}
/* Nothing to do; acked frame is about to be dropped (was ACKed). */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ if (!after(end_seq, tp->snd_una))
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
@@ -1326,14 +1214,20 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
/* New sack for not retransmitted frame,
* which was in hole. It is reordering.
*/
- if (before(TCP_SKB_CB(skb)->seq,
+ if (before(start_seq,
tcp_highest_sack_seq(tp)))
state->reord = min(fack_count,
state->reord);
-
- /* SACK enhanced F-RTO (RFC4138; Appendix B) */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
- state->flag |= FLAG_ONLY_ORIG_SACKED;
+ if (!after(end_seq, tp->high_seq))
+ state->flag |= FLAG_ORIG_SACK_ACKED;
+ /* Pick the earliest sequence sacked for RTT */
+ if (state->rtt_us < 0) {
+ struct skb_mstamp now;
+
+ skb_mstamp_get(&now);
+ state->rtt_us = skb_mstamp_us_delta(&now,
+ xmit_time);
+ }
}
if (sacked & TCPCB_LOST) {
@@ -1350,8 +1244,7 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->lost_skb_hint)->seq))
+ before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
if (fack_count > tp->fackets_out)
@@ -1370,19 +1263,32 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
return sacked;
}
-static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
- struct tcp_sacktag_state *state,
- unsigned int pcount, int shifted, int mss,
- int dup_sack)
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ unsigned int pcount, int shifted, int mss,
+ bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+ u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
+ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
BUG_ON(!pcount);
- /* Tweak before seqno plays */
- if (!tcp_is_fack(tp) && tcp_is_sack(tp) && tp->lost_skb_hint &&
- !before(TCP_SKB_CB(tp->lost_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
+ /* Adjust counters and hints for the newly sacked sequence
+ * range but discard the return value since prev is already
+ * marked. We must tag the range first because the seq
+ * advancement below implicitly advances
+ * tcp_highest_sack_seq() when skb is highest_sack.
+ */
+ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+ start_seq, end_seq, dup_sack, pcount,
+ &skb->skb_mstamp);
+
+ if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
TCP_SKB_CB(prev)->end_seq += shifted;
@@ -1408,30 +1314,28 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
skb_shinfo(skb)->gso_type = 0;
}
- /* We discard results */
- tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
-
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
- return 0;
+ return false;
}
/* Whole SKB was eaten :-) */
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
- if (skb == tp->scoreboard_skb_hint)
- tp->scoreboard_skb_hint = prev;
if (skb == tp->lost_skb_hint) {
tp->lost_skb_hint = prev;
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
}
- TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+ TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ TCP_SKB_CB(prev)->end_seq++;
+
if (skb == tcp_highest_sack(sk))
tcp_advance_highest_sack(sk, skb);
@@ -1440,19 +1344,19 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
- return 1;
+ return true;
}
/* I wish gso_size would have a bit more sane initialization than
* something-or-zero which complicates things
*/
-static int tcp_skb_seglen(struct sk_buff *skb)
+static int tcp_skb_seglen(const struct sk_buff *skb)
{
return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
}
/* Shifting pages past head area doesn't work */
-static int skb_can_shift(struct sk_buff *skb)
+static int skb_can_shift(const struct sk_buff *skb)
{
return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
@@ -1463,7 +1367,7 @@ static int skb_can_shift(struct sk_buff *skb)
static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
- int dup_sack)
+ bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev;
@@ -1558,6 +1462,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
}
}
+ /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+ goto fallback;
+
if (!skb_shift(prev, skb, len))
goto fallback;
if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
@@ -1598,14 +1506,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sack_block *next_dup,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
- int dup_sack_in)
+ bool dup_sack_in)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *tmp;
tcp_for_write_queue_from(skb, sk) {
int in_sack = 0;
- int dup_sack = dup_sack_in;
+ bool dup_sack = dup_sack_in;
if (skb == tcp_send_head(sk))
break;
@@ -1620,7 +1528,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
next_dup->start_seq,
next_dup->end_seq);
if (in_sack > 0)
- dup_sack = 1;
+ dup_sack = true;
}
/* skb reference here is a bit tricky to get right, since
@@ -1648,10 +1556,15 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
break;
if (in_sack) {
- TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
- state,
- dup_sack,
- tcp_skb_pcount(skb));
+ TCP_SKB_CB(skb)->sacked =
+ tcp_sacktag_one(sk,
+ state,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ dup_sack,
+ tcp_skb_pcount(skb),
+ &skb->skb_mstamp);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@@ -1701,19 +1614,18 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
return skb;
}
-static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
+static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
{
return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
}
static int
-tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
- u32 prior_snd_una)
+tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
+ u32 prior_snd_una, long *sack_rtt_us)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- unsigned char *ptr = (skb_transport_header(ack_skb) +
- TCP_SKB_CB(ack_skb)->sacked);
+ const unsigned char *ptr = (skb_transport_header(ack_skb) +
+ TCP_SKB_CB(ack_skb)->sacked);
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
@@ -1721,12 +1633,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
- int found_dup_sack = 0;
+ bool found_dup_sack = false;
int i, j;
int first_sack_index;
state.flag = 0;
state.reord = tp->packets_out;
+ state.rtt_us = -1L;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@@ -1752,7 +1665,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
used_sacks = 0;
first_sack_index = 0;
for (i = 0; i < num_sacks; i++) {
- int dup_sack = !i && found_dup_sack;
+ bool dup_sack = !i && found_dup_sack;
sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
@@ -1819,16 +1732,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
while (i < used_sacks) {
u32 start_seq = sp[i].start_seq;
u32 end_seq = sp[i].end_seq;
- int dup_sack = (found_dup_sack && (i == first_sack_index));
+ bool dup_sack = (found_dup_sack && (i == first_sack_index));
struct tcp_sack_block *next_dup = NULL;
if (found_dup_sack && ((i + 1) == first_sack_index))
next_dup = &sp[i + 1];
- /* Event "B" in the comment above. */
- if (after(end_seq, tp->high_seq))
- state.flag |= FLAG_DATA_LOST;
-
/* Skip too early cached blocks */
while (tcp_sack_cache_ok(tp, cache) &&
!before(start_seq, cache->end_seq))
@@ -1887,12 +1796,6 @@ walk:
start_seq, end_seq, dup_sack);
advance_sp:
- /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
- * due to in-order walk
- */
- if (after(end_seq, tp->frto_highmark))
- state.flag &= ~FLAG_ONLY_ORIG_SACKED;
-
i++;
}
@@ -1909,8 +1812,7 @@ advance_sp:
tcp_verify_left_out(tp);
if ((state.reord < tp->fackets_out) &&
- ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
- (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
+ ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
out:
@@ -1921,13 +1823,14 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
+ *sack_rtt_us = state.rtt_us;
return state.flag;
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
- * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
+ * packets_out. Returns false if sacked_out adjustment wasn't necessary.
*/
-static int tcp_limit_reno_sacked(struct tcp_sock *tp)
+static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
u32 holes;
@@ -1936,9 +1839,9 @@ static int tcp_limit_reno_sacked(struct tcp_sock *tp)
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/* If we receive more dupacks than we expected counting segments
@@ -1984,205 +1887,13 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
tp->sacked_out = 0;
}
-static int tcp_is_sackfrto(const struct tcp_sock *tp)
-{
- return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
-}
-
-/* F-RTO can only be used if TCP has never retransmitted anything other than
- * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
- */
-int tcp_use_frto(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb;
-
- if (!sysctl_tcp_frto)
- return 0;
-
- /* MTU probe and F-RTO won't really play nicely along currently */
- if (icsk->icsk_mtup.probe_size)
- return 0;
-
- if (tcp_is_sackfrto(tp))
- return 1;
-
- /* Avoid expensive walking of rexmit queue if possible */
- if (tp->retrans_out > 1)
- return 0;
-
- skb = tcp_write_queue_head(sk);
- if (tcp_skb_is_last(sk, skb))
- return 1;
- skb = tcp_write_queue_next(sk, skb); /* Skips head */
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- return 0;
- /* Short-circuit when first non-SACKed skb has been checked */
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- break;
- }
- return 1;
-}
-
-/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
- * recovery a bit and use heuristics in tcp_process_frto() to detect if
- * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
- * keep retrans_out counting accurate (with SACK F-RTO, other than head
- * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
- * bits are handled if the Loss state is really to be entered (in
- * tcp_enter_frto_loss).
- *
- * Do like tcp_enter_loss() would; when RTO expires the second time it
- * does:
- * "Reduce ssthresh if it has not yet been made inside this window."
- */
-void tcp_enter_frto(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
- tp->snd_una == tp->high_seq ||
- ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
- !icsk->icsk_retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- /* Our state is too optimistic in ssthresh() call because cwnd
- * is not reduced until tcp_enter_frto_loss() when previous F-RTO
- * recovery has not yet completed. Pattern would be this: RTO,
- * Cumulative ACK, RTO (2xRTO for the same segment does not end
- * up here twice).
- * RFC4138 should be more specific on what to do, even though
- * RTO is quite unlikely to occur after the first Cumulative ACK
- * due to back-off and complexity of triggering events ...
- */
- if (tp->frto_counter) {
- u32 stored_cwnd;
- stored_cwnd = tp->snd_cwnd;
- tp->snd_cwnd = 2;
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = stored_cwnd;
- } else {
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- }
- /* ... in theory, cong.control module could do "any tricks" in
- * ssthresh(), which means that ca_state, lost bits and lost_out
- * counter would have to be faked before the call occurs. We
- * consider that too expensive, unlikely and hacky, so modules
- * using these in ssthresh() must deal these incompatibility
- * issues if they receives CA_EVENT_FRTO and frto_counter != 0
- */
- tcp_ca_event(sk, CA_EVENT_FRTO);
- }
-
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = 0;
-
- skb = tcp_write_queue_head(sk);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- tp->undo_marker = 0;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- }
- tcp_verify_left_out(tp);
-
- /* Too bad if TCP was application limited */
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-
- /* Earlier loss recovery underway (see RFC4138; Appendix B).
- * The last condition is necessary at least in tp->frto_counter case.
- */
- if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
- ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
- after(tp->high_seq, tp->snd_una)) {
- tp->frto_highmark = tp->high_seq;
- } else {
- tp->frto_highmark = tp->snd_nxt;
- }
- tcp_set_ca_state(sk, TCP_CA_Disorder);
- tp->high_seq = tp->snd_nxt;
- tp->frto_counter = 1;
-}
-
-/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
- * which indicates that we should follow the traditional RTO recovery,
- * i.e. mark everything lost and do go-back-N retransmission.
- */
-static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- tp->lost_out = 0;
- tp->retrans_out = 0;
- if (tcp_is_reno(tp))
- tcp_reset_reno_sack(tp);
-
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
-
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- /*
- * Count the retransmission made on RTO correctly (only when
- * waiting for the first ACK and did not get it)...
- */
- if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
- /* For some reason this R-bit might get cleared? */
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out += tcp_skb_pcount(skb);
- /* ...enter this if branch just for the first segment */
- flag |= FLAG_DATA_ACKED;
- } else {
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
- tp->undo_marker = 0;
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- }
-
- /* Marking forward transmissions that were made after RTO lost
- * can cause unnecessary retransmissions in some scenarios,
- * SACK blocks will mitigate that in some but not in all cases.
- * We used to not mark them but it was causing break-ups with
- * receivers that do only in-order receival.
- *
- * TODO: we could detect presence of such receiver and select
- * different behavior per flow.
- */
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
- }
- }
- tcp_verify_left_out(tp);
-
- tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
- tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->frto_counter = 0;
- tp->bytes_acked = 0;
-
- tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
- tcp_set_ca_state(sk, TCP_CA_Loss);
- tp->high_seq = tp->snd_nxt;
- TCP_ECN_queue_cwr(tp);
-
- tcp_clear_all_retrans_hints(tp);
-}
-
static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
tp->retrans_out = 0;
tp->lost_out = 0;
tp->undo_marker = 0;
- tp->undo_retrans = 0;
+ tp->undo_retrans = -1;
}
void tcp_clear_retrans(struct tcp_sock *tp)
@@ -2202,10 +1913,13 @@ void tcp_enter_loss(struct sock *sk, int how)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
+ bool new_recovery = false;
/* Reduce ssthresh if it has not yet been made inside this window. */
- if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+ !after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ new_recovery = true;
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
@@ -2214,17 +1928,13 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->bytes_acked = 0;
tcp_clear_retrans_partial(tp);
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- if (!how) {
- /* Push undo marker, if it was plain RTO and nothing
- * was retransmitted. */
- tp->undo_marker = tp->snd_una;
- } else {
+ tp->undo_marker = tp->snd_una;
+ if (how) {
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -2234,8 +1944,9 @@ void tcp_enter_loss(struct sock *sk, int how)
if (skb == tcp_send_head(sk))
break;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->undo_marker = 0;
+
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
@@ -2246,13 +1957,24 @@ void tcp_enter_loss(struct sock *sk, int how)
}
tcp_verify_left_out(tp);
- tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
+ /* Timeout in disordered state after receiving substantial DUPACKs
+ * suggests that the degree of reordering is over-estimated.
+ */
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+ tp->sacked_out >= sysctl_tcp_reordering)
+ tp->reordering = min_t(unsigned int, tp->reordering,
+ sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
- /* Abort F-RTO algorithm if one is in progress */
- tp->frto_counter = 0;
+
+ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+ * loss recovery is underway except recurring timeout(s) on
+ * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ */
+ tp->frto = sysctl_tcp_frto &&
+ (new_recovery || icsk->icsk_retransmits) &&
+ !inet_csk(sk)->icsk_mtup.probe_size;
}
/* If ACK arrived pointing to a remembered SACK, it means that our
@@ -2261,7 +1983,7 @@ void tcp_enter_loss(struct sock *sk, int how)
*
* Do processing similar to RTO timeout.
*/
-static int tcp_check_sack_reneging(struct sock *sk, int flag)
+static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
if (flag & FLAG_SACK_RENEGING) {
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2272,12 +1994,12 @@ static int tcp_check_sack_reneging(struct sock *sk, int flag)
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, TCP_RTO_MAX);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-static inline int tcp_fackets_out(struct tcp_sock *tp)
+static inline int tcp_fackets_out(const struct tcp_sock *tp)
{
return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
}
@@ -2297,22 +2019,33 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
* they differ. Since neither occurs due to loss, TCP should really
* ignore them.
*/
-static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
+static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
-static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
-{
- return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
-}
-
-static inline int tcp_head_timedout(struct sock *sk)
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay;
+
+ /* Delay early retransmit and entering fast recovery for
+ * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+ * available, or RTO is scheduled to fire first.
+ */
+ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
+ (flag & FLAG_ECE) || !tp->srtt_us)
+ return false;
+
+ delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
+ msecs_to_jiffies(2));
- return tp->packets_out &&
- tcp_skb_timedout(sk, tcp_write_queue_head(sk));
+ if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+ return false;
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
+ TCP_RTO_MAX);
+ return true;
}
/* Linux NewReno/SACK/FACK/ECN state machine.
@@ -2408,28 +2141,18 @@ static inline int tcp_head_timedout(struct sock *sk)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static int tcp_time_to_recover(struct sock *sk)
+static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
- /* Do not perform any recovery during F-RTO algorithm */
- if (tp->frto_counter)
- return 0;
-
/* Trick#1: The loss is proven. */
if (tp->lost_out)
- return 1;
+ return true;
/* Not-A-Trick#2 : Classic rule... */
if (tcp_dupack_heuristics(tp) > tp->reordering)
- return 1;
-
- /* Trick#3 : when we use RFC2988 timer restart, fast
- * retransmit can be triggered by timeout of queue head.
- */
- if (tcp_is_fack(tp) && tcp_head_timedout(sk))
- return 1;
+ return true;
/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
@@ -2441,7 +2164,7 @@ static int tcp_time_to_recover(struct sock *sk)
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
*/
- return 1;
+ return true;
}
/* If a thin stream is detected, retransmit after first
@@ -2452,51 +2175,26 @@ static int tcp_time_to_recover(struct sock *sk)
if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
tcp_is_sack(tp) && !tcp_send_head(sk))
- return 1;
-
- return 0;
-}
-
-/* New heuristics: it is possible only after we switched to restart timer
- * each time when something is ACKed. Hence, we can detect timed out packets
- * during fast retransmit without falling to slow start.
- *
- * Usefulness of this as is very questionable, since we should know which of
- * the segments is the next to timeout which is relatively expensive to find
- * in general case unless we add some data structure just for that. The
- * current approach certainly won't find the right one too often and when it
- * finally does find _something_ it usually marks large part of the window
- * right away (because a retransmission with a larger timestamp blocks the
- * loop from advancing). -ij
- */
-static void tcp_timeout_skbs(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
-
- if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
- return;
-
- skb = tp->scoreboard_skb_hint;
- if (tp->scoreboard_skb_hint == NULL)
- skb = tcp_write_queue_head(sk);
-
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (!tcp_skb_timedout(sk, skb))
- break;
-
- tcp_skb_mark_lost(tp, skb);
- }
+ return true;
- tp->scoreboard_skb_hint = skb;
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+ * retransmissions due to small network reorderings, we implement
+ * Mitigation A.3 in the RFC and delay the retransmission for a short
+ * interval if appropriate.
+ */
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+ (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ !tcp_may_send_now(sk))
+ return !tcp_pause_early_retransmit(sk, flag);
- tcp_verify_left_out(tp);
+ return false;
}
-/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
- * is against sacked "cnt", otherwise it's against facked "cnt"
+/* Detect loss in event "A" above by marking head of queue up as lost.
+ * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * has at least tp->reordering SACKed segments above it; "packets" refers to
+ * the maximum SACKed segments to pass before reaching this limit.
*/
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
@@ -2505,6 +2203,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
int cnt, oldcnt;
int err;
unsigned int mss;
+ /* Use SACK to deduce losses of new sequences sent during recovery */
+ const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
@@ -2526,7 +2226,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
- if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+ if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
break;
oldcnt = cnt;
@@ -2536,11 +2236,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
if (cnt > packets) {
if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
(oldcnt >= packets))
break;
mss = skb_shinfo(skb)->gso_size;
- err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
+ err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
+ mss, GFP_ATOMIC);
if (err < 0)
break;
cnt = packets;
@@ -2574,8 +2276,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
else if (fast_rexmit)
tcp_mark_head_lost(sk, 1, 1);
}
-
- tcp_timeout_skbs(sk);
}
/* CWND moderation, preventing bursts due to too big ACKs
@@ -2588,39 +2288,10 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-/* Lower bound on congestion window is slow start threshold
- * unless congestion avoidance choice decides to overide it.
- */
-static inline u32 tcp_cwnd_min(const struct sock *sk)
-{
- const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-
- return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
-}
-
-/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct sock *sk, int flag)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int decr = tp->snd_cwnd_cnt + 1;
-
- if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
- (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
- tp->snd_cwnd_cnt = decr & 1;
- decr >>= 1;
-
- if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
- tp->snd_cwnd -= decr;
-
- tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
- tp->snd_cwnd_stamp = tcp_time_stamp;
- }
-}
-
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
-static inline int tcp_packet_delayed(struct tcp_sock *tp)
+static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2636,22 +2307,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &inet->inet_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
- }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
- msg,
- &np->daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
- tp->snd_ssthresh, tp->prior_ssthresh,
- tp->packets_out);
+ pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &np->daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
}
#endif
}
@@ -2659,10 +2330,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
#define DBGUNDO(x...) do { } while (0)
#endif
-static void tcp_undo_cwr(struct sock *sk, const int undo)
+static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (unmark_loss) {
+ struct sk_buff *skb;
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ }
+ tp->lost_out = 0;
+ tcp_clear_all_retrans_hints(tp);
+ }
+
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2671,24 +2354,24 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
- if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
+ if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
TCP_ECN_withdraw_cwr(tp);
}
} else {
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
- tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->undo_marker = 0;
}
-static inline int tcp_may_undo(struct tcp_sock *tp)
+static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
/* People celebrate: "We love our President!" */
-static int tcp_try_undo_recovery(struct sock *sk)
+static bool tcp_try_undo_recovery(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2699,37 +2382,37 @@ static int tcp_try_undo_recovery(struct sock *sk)
* or our original transmission succeeded.
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
- tcp_undo_cwr(sk, 1);
+ tcp_undo_cwnd_reduction(sk, false);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPLOSSUNDO;
else
mib_idx = LINUX_MIB_TCPFULLUNDO;
NET_INC_STATS_BH(sock_net(sk), mib_idx);
- tp->undo_marker = 0;
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
tcp_moderate_cwnd(tp);
- return 1;
+ return true;
}
tcp_set_ca_state(sk, TCP_CA_Open);
- return 0;
+ return false;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
-static void tcp_try_undo_dsack(struct sock *sk)
+static bool tcp_try_undo_dsack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, "D-SACK");
- tcp_undo_cwr(sk, 1);
- tp->undo_marker = 0;
+ tcp_undo_cwnd_reduction(sk, false);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ return true;
}
+ return false;
}
/* We can clear retrans_stamp when there are no retransmissions in the
@@ -2746,85 +2429,115 @@ static void tcp_try_undo_dsack(struct sock *sk)
* that successive retransmissions of a segment must not advance
* retrans_stamp under any conditions.
*/
-static int tcp_any_retrans_done(struct sock *sk)
+static bool tcp_any_retrans_done(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (tp->retrans_out)
- return 1;
+ return true;
skb = tcp_write_queue_head(sk);
if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
- return 1;
+ return true;
- return 0;
+ return false;
}
-/* Undo during fast recovery after partial ACK. */
-
-static int tcp_try_undo_partial(struct sock *sk, int acked)
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
struct tcp_sock *tp = tcp_sk(sk);
- /* Partial ACK arrived. Force Hoe's retransmit. */
- int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);
- if (tcp_may_undo(tp)) {
- /* Plain luck! Hole if filled with delayed
- * packet, rather than with a retransmit.
- */
- if (!tcp_any_retrans_done(sk))
- tp->retrans_stamp = 0;
+ if (frto_undo || tcp_may_undo(tp)) {
+ tcp_undo_cwnd_reduction(sk, true);
- tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+ DBGUNDO(sk, "partial loss");
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ if (frto_undo)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
+ inet_csk(sk)->icsk_retransmits = 0;
+ if (frto_undo || tcp_is_sack(tp))
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ return true;
+ }
+ return false;
+}
- DBGUNDO(sk, "Hoe");
- tcp_undo_cwr(sk, 0);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+/* The cwnd reduction in CWR and Recovery uses the PRR algorithm
+ * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ * 1) If the packets in flight is larger than ssthresh, PRR spreads the
+ * cwnd reductions across a full RTT.
+ * 2) If packets in flight is lower than ssthresh (such as due to excess
+ * losses and/or application stalls), do not perform any further cwnd
+ * reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
- /* So... Do not make Hoe's retransmit yet.
- * If the first packet was delayed, the rest
- * ones are most probably delayed as well.
- */
- failed = 0;
- }
- return failed;
+ tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
+ tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tp->snd_cwnd;
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ if (set_ssthresh)
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ TCP_ECN_queue_cwr(tp);
}
-/* Undo during loss recovery after partial ACK. */
-static int tcp_try_undo_loss(struct sock *sk)
+static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
+ int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int sndcnt = 0;
+ int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+ int newly_acked_sacked = prior_unsacked -
+ (tp->packets_out - tp->sacked_out);
+
+ tp->prr_delivered += newly_acked_sacked;
+ if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+ tp->prior_cwnd - 1;
+ sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+ } else {
+ sndcnt = min_t(int, delta,
+ max_t(int, tp->prr_delivered - tp->prr_out,
+ newly_acked_sacked) + 1);
+ }
- if (tcp_may_undo(tp)) {
- struct sk_buff *skb;
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
- }
+ sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
- tcp_clear_all_retrans_hints(tp);
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
- DBGUNDO(sk, "partial loss");
- tp->lost_out = 0;
- tcp_undo_cwr(sk, 1);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
- inet_csk(sk)->icsk_retransmits = 0;
- tp->undo_marker = 0;
- if (tcp_is_sack(tp))
- tcp_set_ca_state(sk, TCP_CA_Open);
- return 1;
+ /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+ (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
}
- return 0;
+ tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
-static inline void tcp_complete_cwr(struct sock *sk)
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
struct tcp_sock *tp = tcp_sk(sk);
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- tp->snd_cwnd_stamp = tcp_time_stamp;
- tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
+
+ tp->prior_ssthresh = 0;
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ tp->undo_marker = 0;
+ tcp_init_cwnd_reduction(sk, set_ssthresh);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ }
}
static void tcp_try_keep_open(struct sock *sk)
@@ -2832,7 +2545,7 @@ static void tcp_try_keep_open(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
int state = TCP_CA_Open;
- if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
+ if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2841,13 +2554,13 @@ static void tcp_try_keep_open(struct sock *sk)
}
}
-static void tcp_try_to_open(struct sock *sk, int flag)
+static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
{
struct tcp_sock *tp = tcp_sk(sk);
tcp_verify_left_out(tp);
- if (!tp->frto_counter && !tcp_any_retrans_done(sk))
+ if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
if (flag & FLAG_ECE)
@@ -2855,9 +2568,8 @@ static void tcp_try_to_open(struct sock *sk, int flag)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
- tcp_moderate_cwnd(tp);
} else {
- tcp_cwnd_down(sk, flag);
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
}
}
@@ -2939,6 +2651,115 @@ void tcp_simple_retransmit(struct sock *sk)
}
EXPORT_SYMBOL(tcp_simple_retransmit);
+static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mib_idx;
+
+ if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENORECOVERY;
+ else
+ mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = tp->snd_una;
+ tp->undo_retrans = tp->retrans_out ? : -1;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!ece_ack)
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tcp_init_cwnd_reduction(sk, true);
+ }
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool recovered = !before(tp->snd_una, tp->high_seq);
+
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+ /* Step 3.b. A timeout is spurious if not all data are
+ * lost, i.e., never-retransmitted data are (s)acked.
+ */
+ if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
+ return;
+
+ if (after(tp->snd_nxt, tp->high_seq) &&
+ (flag & FLAG_DATA_SACKED || is_dupack)) {
+ tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+ tp->high_seq = tp->snd_nxt;
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return; /* Step 2.b */
+ tp->frto = 0;
+ }
+ }
+
+ if (recovered) {
+ /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+ icsk->icsk_retransmits = 0;
+ tcp_try_undo_recovery(sk);
+ return;
+ }
+ if (flag & FLAG_DATA_ACKED)
+ icsk->icsk_retransmits = 0;
+ if (tcp_is_reno(tp)) {
+ /* A Reno DUPACK means new data in F-RTO step 2.b above are
+ * delivered. Lower inflight to clock out (re)transmissions.
+ */
+ if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+ tcp_add_reno_sack(sk);
+ else if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ }
+ if (tcp_try_undo_loss(sk, false))
+ return;
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* Undo during fast recovery after partial ACK. */
+static bool tcp_try_undo_partial(struct sock *sk, const int acked,
+ const int prior_unsacked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->undo_marker && tcp_packet_delayed(tp)) {
+ /* Plain luck! Hole is filled with a delayed
+ * packet, rather than with a retransmit.
+ */
+ tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+
+ /* We are getting evidence that the reordering degree is higher
+ * than we realized. If there are no retransmits out then we
+ * can undo. Otherwise we clock out new packets but do not
+ * mark more packets lost or retransmit more.
+ */
+ if (tp->retrans_out) {
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ return true;
+ }
+
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+
+ DBGUNDO(sk, "partial recovery");
+ tcp_undo_cwnd_reduction(sk, true);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ tcp_try_keep_open(sk);
+ return true;
+ }
+ return false;
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2950,14 +2771,15 @@ EXPORT_SYMBOL(tcp_simple_retransmit);
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
+static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+ const int prior_unsacked,
+ bool is_dupack, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+ bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int fast_rexmit = 0, mib_idx;
+ int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -2973,47 +2795,21 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
if (tcp_check_sack_reneging(sk, flag))
return;
- /* C. Process data loss notification, provided it is valid. */
- if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
- before(tp->snd_una, tp->high_seq) &&
- icsk->icsk_ca_state != TCP_CA_Open &&
- tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
- }
-
- /* D. Check consistency of the current state. */
+ /* C. Check consistency of the current state. */
tcp_verify_left_out(tp);
- /* E. Check state exit conditions. State can be terminated
+ /* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
- case TCP_CA_Loss:
- icsk->icsk_retransmits = 0;
- if (tcp_try_undo_recovery(sk))
- return;
- break;
-
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
- tcp_complete_cwr(sk);
- tcp_set_ca_state(sk, TCP_CA_Open);
- }
- break;
-
- case TCP_CA_Disorder:
- tcp_try_undo_dsack(sk);
- if (!tp->undo_marker ||
- /* For SACK case do not Open to allow to undo
- * catching for all duplicate ACKs. */
- tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
- tp->undo_marker = 0;
+ tcp_end_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
@@ -3023,33 +2819,34 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk))
return;
- tcp_complete_cwr(sk);
+ tcp_end_cwnd_reduction(sk);
break;
}
}
- /* F. Process state. */
+ /* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
- } else
- do_lost = tcp_try_undo_partial(sk, pkts_acked);
- break;
- case TCP_CA_Loss:
- if (flag & FLAG_DATA_ACKED)
- icsk->icsk_retransmits = 0;
- if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
- tcp_reset_reno_sack(tp);
- if (!tcp_try_undo_loss(sk)) {
- tcp_moderate_cwnd(tp);
- tcp_xmit_retransmit_queue(sk);
+ } else {
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ return;
+ /* Partial ACK arrived. Force fast retransmit. */
+ do_lost = tcp_is_reno(tp) ||
+ tcp_fackets_out(tp) > tp->reordering;
+ }
+ if (tcp_try_undo_dsack(sk)) {
+ tcp_try_keep_open(sk);
return;
}
+ break;
+ case TCP_CA_Loss:
+ tcp_process_loss(sk, flag, is_dupack);
if (icsk->icsk_ca_state != TCP_CA_Open)
return;
- /* Loss is undone; fall through to processing in Open state. */
+ /* Fall through to processing in Open state. */
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -3058,11 +2855,11 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
tcp_add_reno_sack(sk);
}
- if (icsk->icsk_ca_state == TCP_CA_Disorder)
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
- if (!tcp_time_to_recover(sk)) {
- tcp_try_to_open(sk, flag);
+ if (!tcp_time_to_recover(sk, flag)) {
+ tcp_try_to_open(sk, flag, prior_unsacked);
return;
}
@@ -3078,120 +2875,130 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
}
/* Otherwise enter Recovery state */
-
- if (tcp_is_reno(tp))
- mib_idx = LINUX_MIB_TCPRENORECOVERY;
- else
- mib_idx = LINUX_MIB_TCPSACKRECOVERY;
-
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
-
- tp->high_seq = tp->snd_nxt;
- tp->prior_ssthresh = 0;
- tp->undo_marker = tp->snd_una;
- tp->undo_retrans = tp->retrans_out;
-
- if (icsk->icsk_ca_state < TCP_CA_CWR) {
- if (!(flag & FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- TCP_ECN_queue_cwr(tp);
- }
-
- tp->bytes_acked = 0;
- tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(sk, TCP_CA_Recovery);
+ tcp_enter_recovery(sk, (flag & FLAG_ECE));
fast_rexmit = 1;
}
- if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
+ if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_down(sk, flag);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
tcp_xmit_retransmit_queue(sk);
}
-static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ long seq_rtt_us, long sack_rtt_us)
{
- tcp_rtt_estimator(sk, seq_rtt);
- tcp_set_rto(sk);
- inet_csk(sk)->icsk_backoff = 0;
-}
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+ * broken middle-boxes or peers may corrupt TS-ECR fields. But
+ * Karn's algorithm forbids taking RTT if some retransmitted data
+ * is acked (RFC6298).
+ */
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ seq_rtt_us = -1L;
+
+ if (seq_rtt_us < 0)
+ seq_rtt_us = sack_rtt_us;
-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
-{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
- *
* See draft-ietf-tcplw-high-performance-00, section 3.3.
- * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
- *
- * Changed: reset backoff as soon as we see the first valid sample.
- * If we do not, we get strongly overestimated rto. With timestamps
- * samples are accepted even from very old segments: f.e., when rtt=1
- * increases to 8, we retransmit 5 times and after 8 seconds delayed
- * answer arrives rto becomes 120 seconds! If at least one of segments
- * in window is lost... Voila. --ANK (010210)
*/
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-}
+ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ flag & FLAG_ACKED)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
-{
- /* We don't have a timestamp. Can only use
- * packets that are not retransmitted to determine
- * rtt estimates. Also, we must not reset the
- * backoff for rto until we get a non-retransmitted
- * packet. This allows us to deal with a situation
- * where the network delay has increased suddenly.
- * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
- */
+ if (seq_rtt_us < 0)
+ return false;
- if (flag & FLAG_RETRANS_DATA_ACKED)
- return;
+ tcp_rtt_estimator(sk, seq_rtt_us);
+ tcp_set_rto(sk);
- tcp_valid_rtt_meas(sk, seq_rtt);
+ /* RFC6298: only reset backoff on valid RTT measurement. */
+ inet_csk(sk)->icsk_backoff = 0;
+ return true;
}
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
- const s32 seq_rtt)
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
- if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(sk, flag);
- else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(sk, seq_rtt, flag);
+ struct tcp_sock *tp = tcp_sk(sk);
+ long seq_rtt_us = -1L;
+
+ if (synack_stamp && !tp->total_retrans)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
+
+ /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+ * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+ */
+ if (!tp->srtt_us)
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
}
-static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
+
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
-static void tcp_rearm_rto(struct sock *sk)
+void tcp_rearm_rto(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ /* If the retrans timer is currently being used by Fast Open
+ * for SYN-ACK retrans purpose, stay put.
+ */
+ if (tp->fastopen_rsk)
+ return;
+
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ u32 rto = inet_csk(sk)->icsk_rto;
+ /* Offset the time elapsed after installing regular RTO */
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ struct sk_buff *skb = tcp_write_queue_head(sk);
+ const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
+ s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+ /* delta may not be positive if the socket is locked
+ * when the retrans timer fires and is rescheduled.
+ */
+ if (delta > 0)
+ rto = delta;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX);
}
}
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_rearm_rto(sk);
+
+ /* Stop if ER is disabled after the delayed ER timer is scheduled */
+ if (!tp->do_early_retrans)
+ return;
+
+ tcp_enter_recovery(sk, false);
+ tcp_update_scoreboard(sk, 1);
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
@@ -3218,25 +3025,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una)
+ u32 prior_snd_una, long sack_rtt_us)
{
- struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct skb_mstamp first_ackt, last_ackt, now;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ u32 reord = tp->packets_out;
+ bool fully_acked = true;
+ long ca_seq_rtt_us = -1L;
+ long seq_rtt_us = -1L;
struct sk_buff *skb;
- u32 now = tcp_time_stamp;
- int fully_acked = 1;
- int flag = 0;
u32 pkts_acked = 0;
- u32 reord = tp->packets_out;
- u32 prior_sacked = tp->sacked_out;
- s32 seq_rtt = -1;
- s32 ca_seq_rtt = -1;
- ktime_t last_ackt = net_invalid_timestamp();
+ bool rtt_update;
+ int flag = 0;
+
+ first_ackt.v64 = 0;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u32 acked_pcount;
u8 sacked = scb->sacked;
+ u32 acked_pcount;
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
@@ -3248,7 +3057,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!acked_pcount)
break;
- fully_acked = 0;
+ fully_acked = false;
} else {
acked_pcount = tcp_skb_pcount(skb);
}
@@ -3257,18 +3066,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
- ca_seq_rtt = -1;
- seq_rtt = -1;
- if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
- flag |= FLAG_NONHEAD_RETRANS_ACKED;
} else {
- ca_seq_rtt = now - scb->when;
- last_ackt = skb->tstamp;
- if (seq_rtt < 0) {
- seq_rtt = ca_seq_rtt;
- }
+ last_ackt = skb->skb_mstamp;
+ WARN_ON_ONCE(last_ackt.v64 == 0);
+ if (!first_ackt.v64)
+ first_ackt = last_ackt;
+
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_SACKED_ACKED)
@@ -3286,7 +3093,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if (!(scb->flags & TCPHDR_SYN)) {
+ if (!(scb->tcp_flags & TCPHDR_SYN)) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
@@ -3298,7 +3105,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- tp->scoreboard_skb_hint = NULL;
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = NULL;
if (skb == tp->lost_skb_hint)
@@ -3311,18 +3117,24 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
+ skb_mstamp_get(&now);
+ if (first_ackt.v64) {
+ seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
+ ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
+ tcp_rearm_rto(sk);
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
}
- tcp_ack_update_rtt(sk, flag, seq_rtt);
- tcp_rearm_rto(sk);
-
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
@@ -3339,23 +3151,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
- if (ca_ops->pkts_acked) {
- s32 rtt_us = -1;
-
- /* Is the ACK triggering packet unambiguous? */
- if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
- /* High resolution needed and available? */
- if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
- !ktime_equal(last_ackt,
- net_invalid_timestamp()))
- rtt_us = ktime_us_delta(ktime_get_real(),
- last_ackt);
- else if (ca_seq_rtt > 0)
- rtt_us = jiffies_to_usecs(ca_seq_rtt);
- }
+ if (ca_ops->pkts_acked)
+ ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
- ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
- }
+ } else if (skb && rtt_update && sack_rtt_us >= 0 &&
+ sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ /* Do not re-arm RTO if the sack RTT is measured from data sent
+ * after when the head was last (re)transmitted. Otherwise the
+ * timeout may continue to extend in loss recovery.
+ */
+ tcp_rearm_rto(sk);
}
#if FASTRETRANS_DEBUG > 0
@@ -3365,18 +3170,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (!tp->packets_out && tcp_is_sack(tp)) {
icsk = inet_csk(sk);
if (tp->lost_out) {
- printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, icsk->icsk_ca_state);
+ pr_debug("Leak l=%u %d\n",
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
- printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, icsk->icsk_ca_state);
+ pr_debug("Leak s=%u %d\n",
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
- printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, icsk->icsk_ca_state);
+ pr_debug("Leak r=%u %d\n",
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
@@ -3404,23 +3209,34 @@ static void tcp_ack_probe(struct sock *sk)
}
}
-static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
+static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
-static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+/* Decide whether to run the increase function of congestion control. */
+static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
- !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
+ if (tcp_in_cwnd_reduction(sk))
+ return false;
+
+ /* If reordering is high then always grow cwnd whenever data is
+ * delivered regardless of its ordering. Otherwise stay conservative
+ * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
+ * new SACK or ECE mark may first advance cwnd here and later reduce
+ * cwnd in tcp_fastretrans_alert() based on more states.
+ */
+ if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ return flag & FLAG_FORWARD_PROGRESS;
+
+ return flag & FLAG_DATA_ACKED;
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
-static inline int tcp_may_update_window(const struct tcp_sock *tp,
+static inline bool tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
{
@@ -3434,7 +3250,7 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
* and in FreeBSD. NetBSD's one is even worse.) is wrong.
*/
-static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
+static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
u32 ack_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3469,164 +3285,103 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
return flag;
}
-/* A very conservative spurious RTO response algorithm: reduce cwnd and
- * continue in congestion avoidance.
- */
-static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk)
{
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
- tp->snd_cwnd_cnt = 0;
- tp->bytes_acked = 0;
- TCP_ECN_queue_cwr(tp);
- tcp_moderate_cwnd(tp);
+ /* unprotected vars, we don't care about overwrites */
+ static u32 challenge_timestamp;
+ static unsigned int challenge_count;
+ u32 now = jiffies / HZ;
+
+ if (now != challenge_timestamp) {
+ challenge_timestamp = now;
+ challenge_count = 0;
+ }
+ if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack(sk);
+ }
}
-/* A conservative spurious RTO response algorithm: reduce cwnd using
- * rate halving and continue in congestion avoidance.
- */
-static void tcp_ratehalving_spur_to_response(struct sock *sk)
+static void tcp_store_ts_recent(struct tcp_sock *tp)
{
- tcp_enter_cwr(sk, 0);
+ tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+ tp->rx_opt.ts_recent_stamp = get_seconds();
}
-static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
- if (flag & FLAG_ECE)
- tcp_ratehalving_spur_to_response(sk);
- else
- tcp_undo_cwr(sk, 1);
+ if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Not only, also it occurs for expired timestamps.
+ */
+
+ if (tcp_paws_check(&tp->rx_opt, 0))
+ tcp_store_ts_recent(tp);
+ }
}
-/* F-RTO spurious RTO detection algorithm (RFC4138)
- *
- * F-RTO affects during two new ACKs following RTO (well, almost, see inline
- * comments). State (ACK number) is kept in frto_counter. When ACK advances
- * window (but not to or beyond highest sequence sent before RTO):
- * On First ACK, send two new segments out.
- * On Second ACK, RTO was likely spurious. Do spurious response (response
- * algorithm is not part of the F-RTO detection algorithm
- * given in RFC4138 but can be selected separately).
- * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
- * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
- * of Nagle, this is done using frto_counter states 2 and 3, when a new data
- * segment of any size sent during F-RTO, state 2 is upgraded to 3.
- *
- * Rationale: if the RTO was spurious, new ACKs should arrive from the
- * original window even after we transmit two new data segments.
- *
- * SACK version:
- * on first step, wait until first cumulative ACK arrives, then move to
- * the second step. In second step, the next ACK decides.
- *
- * F-RTO is implemented (mainly) in four functions:
- * - tcp_use_frto() is used to determine if TCP is can use F-RTO
- * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
- * called when tcp_use_frto() showed green light
- * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
- * - tcp_enter_frto_loss() is called if there is not enough evidence
- * to prove that the RTO is indeed spurious. It transfers the control
- * from F-RTO to the conventional RTO recovery
+/* This routine deals with acks during a TLP episode.
+ * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
*/
-static int tcp_process_frto(struct sock *sk, int flag)
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
+ !(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DATA_SACKED));
- tcp_verify_left_out(tp);
-
- /* Duplicate the behavior from Loss state (fastretrans_alert) */
- if (flag & FLAG_DATA_ACKED)
- inet_csk(sk)->icsk_retransmits = 0;
-
- if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
- ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
- tp->undo_marker = 0;
-
- if (!before(tp->snd_una, tp->frto_highmark)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
- return 1;
- }
-
- if (!tcp_is_sackfrto(tp)) {
- /* RFC4138 shortcoming in step 2; should also have case c):
- * ACK isn't duplicate nor advances window, e.g., opposite dir
- * data, winupdate
- */
- if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
- return 1;
-
- if (!(flag & FLAG_DATA_ACKED)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
- flag);
- return 1;
- }
- } else {
- if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
- /* Prevent sending of new data. */
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp));
- return 1;
- }
-
- if ((tp->frto_counter >= 2) &&
- (!(flag & FLAG_FORWARD_PROGRESS) ||
- ((flag & FLAG_DATA_SACKED) &&
- !(flag & FLAG_ONLY_ORIG_SACKED)))) {
- /* RFC4138 shortcoming (see comment above) */
- if (!(flag & FLAG_FORWARD_PROGRESS) &&
- (flag & FLAG_NOT_DUP))
- return 1;
-
- tcp_enter_frto_loss(sk, 3, flag);
- return 1;
- }
+ /* Mark the end of TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
+ */
+ if (is_tlp_dupack) {
+ tp->tlp_high_seq = 0;
+ return;
}
- if (tp->frto_counter == 1) {
- /* tcp_may_send_now needs to see updated state */
- tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
- tp->frto_counter = 2;
-
- if (!tcp_may_send_now(sk))
- tcp_enter_frto_loss(sk, 2, flag);
-
- return 1;
- } else {
- switch (sysctl_tcp_frto_response) {
- case 2:
- tcp_undo_spur_to_response(sk, flag);
- break;
- case 1:
- tcp_conservative_spur_to_response(tp);
- break;
- default:
- tcp_ratehalving_spur_to_response(sk);
- break;
+ if (after(ack, tp->tlp_high_seq)) {
+ tp->tlp_high_seq = 0;
+ /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
+ if (!(flag & FLAG_DSACKING_ACK)) {
+ tcp_init_cwnd_reduction(sk, true);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ tcp_end_cwnd_reduction(sk);
+ tcp_try_keep_open(sk);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
}
- tp->frto_counter = 0;
- tp->undo_marker = 0;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
}
- return 0;
}
/* This routine deals with incoming acks, but not outgoing ones. */
-static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
- u32 prior_in_flight;
+ bool is_dupack = false;
u32 prior_fackets;
- int prior_packets;
- int frto_cwnd = 0;
+ int prior_packets = tp->packets_out;
+ const int prior_unsacked = tp->packets_out - tp->sacked_out;
+ int acked = 0; /* Number of packets newly acked */
+ long sack_rtt_us = -1L;
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
- if (before(ack, prior_snd_una))
+ if (before(ack, prior_snd_una)) {
+ /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
+ if (before(ack, prior_snd_una - tp->max_window)) {
+ tcp_send_challenge_ack(sk);
+ return -1;
+ }
goto old_ack;
+ }
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
@@ -3634,20 +3389,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (after(ack, tp->snd_nxt))
goto invalid_ack;
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ tcp_rearm_rto(sk);
+
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
- if (sysctl_tcp_abc) {
- if (icsk->icsk_ca_state < TCP_CA_CWR)
- tp->bytes_acked += ack - prior_snd_una;
- else if (icsk->icsk_ca_state == TCP_CA_Loss)
- /* we assume just one segment left network */
- tp->bytes_acked += min(ack - prior_snd_una,
- tp->mss_cache);
- }
-
prior_fackets = tp->fackets_out;
- prior_in_flight = tcp_packets_in_flight(tp);
+
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ if (flag & FLAG_UPDATE_TS_RECENT)
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
@@ -3670,7 +3425,8 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
- flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt_us);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
@@ -3684,43 +3440,52 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_time_stamp;
- prior_packets = tp->packets_out;
if (!prior_packets)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+ acked = tp->packets_out;
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+ sack_rtt_us);
+ acked -= tp->packets_out;
- if (tp->frto_counter)
- frto_cwnd = tcp_process_frto(sk, flag);
- /* Guarantee sacktag reordering detection against wrap-arounds */
- if (before(tp->frto_highmark, tp->snd_una))
- tp->frto_highmark = 0;
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
if (tcp_ack_is_dubious(sk, flag)) {
- /* Advance CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
- tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, prior_in_flight);
- tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
- flag);
- } else {
- if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
- tcp_cong_avoid(sk, ack, prior_in_flight);
+ is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
}
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
- if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
- dst_confirm(__sk_dst_get(sk));
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
+ struct dst_entry *dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
+ }
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+ tcp_schedule_loss_probe(sk);
+ tcp_update_pacing_rate(sk);
return 1;
no_queue:
+ /* If data was DSACKed, see if we can undo a cwnd reduction. */
+ if (flag & FLAG_DSACKING_ACK)
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
if (tcp_send_head(sk))
tcp_ack_probe(sk);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
return 1;
invalid_ack:
@@ -3728,10 +3493,14 @@ invalid_ack:
return -1;
old_ack:
+ /* If data was SACKed, tag it and see if we should send more data.
+ * If data was DSACKed, see if we can undo a cwnd reduction.
+ */
if (TCP_SKB_CB(skb)->sacked) {
- tcp_sacktag_write_queue(sk, skb, prior_snd_una);
- if (icsk->icsk_ca_state == TCP_CA_Open)
- tcp_try_keep_open(sk);
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt_us);
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
}
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3742,14 +3511,15 @@ old_ack:
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
-void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
- u8 **hvpp, int estab)
+void tcp_parse_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx, int estab,
+ struct tcp_fastopen_cookie *foc)
{
- unsigned char *ptr;
- struct tcphdr *th = tcp_hdr(skb);
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
- ptr = (unsigned char *)(th + 1);
+ ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
while (length > 0) {
@@ -3786,10 +3556,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
- if (net_ratelimit())
- printk(KERN_INFO "tcp_parse_options: Illegal window "
- "scaling value %d >14 received.\n",
- snd_wscale);
+ net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+ __func__,
+ snd_wscale);
snd_wscale = 14;
}
opt_rx->snd_wscale = snd_wscale;
@@ -3807,7 +3576,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
!estab && sysctl_tcp_sack) {
- opt_rx->sack_ok = 1;
+ opt_rx->sack_ok = TCP_SACK_SEEN;
tcp_sack_reset(opt_rx);
}
break;
@@ -3827,32 +3596,24 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
*/
break;
#endif
- case TCPOPT_COOKIE:
- /* This option is variable length.
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number. It's valid only in
+ * SYN or SYN-ACK with an even size.
*/
- switch (opsize) {
- case TCPOLEN_COOKIE_BASE:
- /* not yet implemented */
- break;
- case TCPOLEN_COOKIE_PAIR:
- /* not yet implemented */
- break;
- case TCPOLEN_COOKIE_MIN+0:
- case TCPOLEN_COOKIE_MIN+2:
- case TCPOLEN_COOKIE_MIN+4:
- case TCPOLEN_COOKIE_MIN+6:
- case TCPOLEN_COOKIE_MAX:
- /* 16-bit multiple */
- opt_rx->cookie_plus = opsize;
- *hvpp = ptr;
- break;
- default:
- /* ignore option */
+ if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
+ get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
+ foc == NULL || !th->syn || (opsize & 1))
break;
- }
+ foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
+ if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
+ foc->len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, ptr + 2, foc->len);
+ else if (foc->len != 0)
+ foc->len = -1;
break;
- }
+ }
ptr += opsize-2;
length -= opsize;
}
@@ -3860,9 +3621,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
}
EXPORT_SYMBOL(tcp_parse_options);
-static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
+static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
- __be32 *ptr = (__be32 *)(th + 1);
+ const __be32 *ptr = (const __be32 *)(th + 1);
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
@@ -3870,41 +3631,48 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
- tp->rx_opt.rcv_tsecr = ntohl(*ptr);
- return 1;
+ if (*ptr)
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
+ else
+ tp->rx_opt.rcv_tsecr = 0;
+ return true;
}
- return 0;
+ return false;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
- struct tcp_sock *tp, u8 **hvpp)
+static bool tcp_fast_parse_options(const struct sk_buff *skb,
+ const struct tcphdr *th, struct tcp_sock *tp)
{
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
- return 0;
+ return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
- return 1;
+ return true;
}
- tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
- return 1;
+
+ tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+ return true;
}
#ifdef CONFIG_TCP_MD5SIG
/*
* Parse MD5 Signature option
*/
-u8 *tcp_parse_md5sig_option(struct tcphdr *th)
+const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
- int length = (th->doff << 2) - sizeof (*th);
- u8 *ptr = (u8*)(th + 1);
+ int length = (th->doff << 2) - sizeof(*th);
+ const u8 *ptr = (const u8 *)(th + 1);
/* If the TCP option is too short, we can short cut */
if (length < TCPOLEN_MD5SIG)
@@ -3914,7 +3682,7 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
int opcode = *ptr++;
int opsize;
- switch(opcode) {
+ switch (opcode) {
case TCPOPT_EOL:
return NULL;
case TCPOPT_NOP:
@@ -3935,27 +3703,6 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
-static inline void tcp_store_ts_recent(struct tcp_sock *tp)
-{
- tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = get_seconds();
-}
-
-static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
-{
- if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
- /* PAWS bug workaround wrt. ACK frames, the PAWS discard
- * extra check below makes sure this can only happen
- * for pure ACK frames. -DaveM
- *
- * Not only, also it occurs for expired timestamps.
- */
-
- if (tcp_paws_check(&tp->rx_opt, 0))
- tcp_store_ts_recent(tp);
- }
-}
-
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
@@ -3981,8 +3728,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -3999,7 +3746,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}
-static inline int tcp_paws_discard(const struct sock *sk,
+static inline bool tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -4021,14 +3768,14 @@ static inline int tcp_paws_discard(const struct sock *sk,
* (borrowed from freebsd)
*/
-static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
return !before(end_seq, tp->rcv_wup) &&
!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}
/* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
{
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
@@ -4066,9 +3813,10 @@ static void tcp_reset(struct sock *sk)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+static void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst;
inet_csk_schedule_ack(sk);
@@ -4080,7 +3828,9 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst_metric(dst, RTAX_QUICKACK))
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
@@ -4110,7 +3860,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
- printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
+ pr_err("%s: Impossible, sk->sk_state=%d\n",
__func__, sk->sk_state);
break;
}
@@ -4135,7 +3885,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
}
}
-static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
@@ -4143,9 +3893,9 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
sp->start_seq = seq;
if (after(end_seq, sp->end_seq))
sp->end_seq = end_seq;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
@@ -4178,7 +3928,7 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
-static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
+static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4292,7 +4042,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
WARN_ON(before(tp->rcv_nxt, sp->end_seq));
/* Zap this SACK, by moving forward any other SACKS. */
- for (i=this_sack+1; i < num_sacks; i++)
+ for (i = this_sack+1; i < num_sacks; i++)
tp->selective_acks[i-1] = tp->selective_acks[i];
num_sacks--;
continue;
@@ -4337,37 +4087,261 @@ static void tcp_ofo_queue(struct sock *sk)
__skb_queue_tail(&sk->sk_receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tcp_hdr(skb)->fin)
- tcp_fin(skb, sk, tcp_hdr(skb));
+ tcp_fin(sk);
}
}
-static int tcp_prune_ofo_queue(struct sock *sk);
+static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_rmem_schedule(sk, size)) {
+ !sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk) < 0)
return -1;
- if (!sk_rmem_schedule(sk, size)) {
+ if (!sk_rmem_schedule(sk, skb, size)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
- if (!sk_rmem_schedule(sk, size))
+ if (!sk_rmem_schedule(sk, skb, size))
return -1;
}
}
return 0;
}
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean; cleared on entry, then passed on to
+ * skb_try_coalesce()
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ *
+ * No merge is attempted when @from carries FIN or when @from does not
+ * start exactly at @to's end_seq. On success the size delta reported by
+ * skb_try_coalesce() is added to sk_rmem_alloc and charged to the socket,
+ * LINUX_MIB_TCPRCVCOALESCE is incremented, and @to takes over @from's
+ * end_seq and ack_seq.
+ *
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+ if (tcp_hdr(from)->fin)
+ return false;
+
+ /* It's possible this segment overlaps with the prior segment in the
+ * queue; only merge when @from begins exactly where @to ends.
+ */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ /* Account the extra bytes now carried by @to against the socket. */
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ return true;
+}
+/* Queue @skb, a segment that arrived out of order, on
+ * tp->out_of_order_queue.
+ *
+ * If receive memory cannot be scheduled the skb is counted as
+ * LINUX_MIB_TCPOFODROP and freed. Otherwise the segment is either
+ * coalesced into / appended after the queue tail (when it starts exactly
+ * at the tail's end_seq), or inserted at the position found by walking
+ * backwards from the tail. A segment fully covered by an existing one is
+ * freed, and queued segments fully covered by the new one are unlinked
+ * and freed. DSACK blocks are recorded for the overlaps, and a SACK block
+ * for the new range is recorded when SACK is in use.
+ */
+static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb1;
+ u32 seq, end_seq;
+
+ TCP_ECN_check_ce(tp, skb);
+
+ if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+ inet_csk_schedule_ack(sk);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+ skb1 = skb_peek_tail(&tp->out_of_order_queue);
+ if (!skb1) {
+ /* Initial out of order segment, build 1 SACK. */
+ if (tcp_is_sack(tp)) {
+ tp->rx_opt.num_sacks = 1;
+ tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+ tp->selective_acks[0].end_seq =
+ TCP_SKB_CB(skb)->end_seq;
+ }
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ goto end;
+ }
+
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ /* Segment starts exactly where the queue tail ends. */
+ if (seq == TCP_SKB_CB(skb1)->end_seq) {
+ bool fragstolen;
+
+ if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ } else {
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL; /* merged into skb1: nothing left to own at end: */
+ }
+
+ if (!tp->rx_opt.num_sacks ||
+ tp->selective_acks[0].end_seq != seq)
+ goto add_sack;
+
+ /* Common case: data arrive in order after hole. */
+ tp->selective_acks[0].end_seq = end_seq;
+ goto end;
+ }
+
+ /* Find place to insert this segment: walk back from the tail until
+ * a segment starting at or before seq is found; skb1 becomes NULL
+ * when the new segment precedes everything queued.
+ */
+ while (1) {
+ if (!after(TCP_SKB_CB(skb1)->seq, seq))
+ break;
+ if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+ skb1 = NULL;
+ break;
+ }
+ skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+ }
+
+ /* Does skb overlap the previous one? */
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+ skb = NULL; /* freed: skip the owner setup at end: */
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ /* Same start as skb1: insert before it instead. */
+ if (skb_queue_is_first(&tp->out_of_order_queue,
+ skb1))
+ skb1 = NULL;
+ else
+ skb1 = skb_queue_prev(
+ &tp->out_of_order_queue,
+ skb1);
+ }
+ }
+ if (!skb1)
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ else
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+ /* And clean segments covered by new one as whole. */
+ while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+ skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ break;
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* Next segment is only partly covered: keep it. */
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ end_seq);
+ break;
+ }
+ __skb_unlink(skb1, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb1);
+ }
+
+add_sack:
+ if (tcp_is_sack(tp))
+ tcp_sack_new_ofo_skb(sk, seq, end_seq);
+end:
+ /* skb is NULL here when it was coalesced or dropped above. */
+ if (skb) {
+ tcp_grow_window(sk, skb);
+ skb_set_owner_r(skb, sk);
+ }
+}
+/* Pull @hdrlen bytes off the front of @skb and add it to
+ * sk->sk_receive_queue, first trying to coalesce it into the current
+ * queue tail. rcv_nxt is advanced to @skb's end_seq in both cases.
+ *
+ * *@fragstolen is only written when a tail skb exists (it is set by
+ * tcp_try_coalesce()).
+ *
+ * Returns 1 if @skb was merged into the tail and was therefore not
+ * queued, 0 if @skb itself was queued and charged to @sk.
+ */
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+ bool *fragstolen)
+{
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+ __skb_pull(skb, hdrlen);
+ eaten = (tail &&
+ tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+ tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (!eaten) {
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ }
+ return eaten;
+}
+/* Build an skb made of a zeroed TCP header followed by @size bytes copied
+ * from @msg->msg_iov and hand it to tcp_queue_rcv(), so the data lands on
+ * @sk's own receive queue. The skb is given seq = rcv_nxt,
+ * end_seq = seq + @size and ack_seq = snd_una - 1.
+ *
+ * Returns @size on success, 0 when @size is 0, and -ENOMEM when the skb
+ * allocation, the receive-memory scheduling or the copy from the iovec
+ * fails.
+ * NOTE(review): a failed memcpy_fromiovec() is also reported as -ENOMEM,
+ * and @size (size_t) is returned through an int - confirm both are
+ * intended.
+ */
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_buff *skb = NULL;
+ struct tcphdr *th;
+ bool fragstolen;
+
+ if (size == 0)
+ return 0;
+
+ skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
+ if (!skb)
+ goto err;
+
+ if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+ goto err_free;
+
+ th = (struct tcphdr *)skb_put(skb, sizeof(*th));
+ skb_reset_transport_header(skb);
+ memset(th, 0, sizeof(*th));
+
+ if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+ goto err_free;
+
+ TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+ TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+ /* The header is pulled off again inside tcp_queue_rcv(); a non-zero
+ * return means the data was merged into the queue tail, so this skb
+ * must be freed here.
+ */
+ if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
+ WARN_ON_ONCE(fragstolen); /* should not happen */
+ __kfree_skb(skb);
+ }
+ return size;
+
+err_free:
+ kfree_skb(skb);
+err:
+ return -ENOMEM;
+}
+
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
+ bool fragstolen = false;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
@@ -4400,7 +4374,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
- eaten = (chunk == skb->len && !th->fin);
+ eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
local_bh_disable();
@@ -4409,17 +4383,16 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb->truesize))
+ tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto drop;
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (skb->len)
tcp_event_data_recv(sk, skb);
if (th->fin)
- tcp_fin(skb, sk, th);
+ tcp_fin(sk);
if (!skb_queue_empty(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
@@ -4437,9 +4410,9 @@ queue_and_out:
tcp_fast_path_check(sk);
if (eaten > 0)
- __kfree_skb(skb);
- else if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ kfree_skb_partial(skb, fragstolen);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
return;
}
@@ -4478,105 +4451,7 @@ drop:
goto queue_and_out;
}
- TCP_ECN_check_ce(tp, skb);
-
- if (tcp_try_rmem_schedule(sk, skb->truesize))
- goto drop;
-
- /* Disable header prediction. */
- tp->pred_flags = 0;
- inet_csk_schedule_ack(sk);
-
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
-
- skb_set_owner_r(skb, sk);
-
- if (!skb_peek(&tp->out_of_order_queue)) {
- /* Initial out of order segment, build 1 SACK. */
- if (tcp_is_sack(tp)) {
- tp->rx_opt.num_sacks = 1;
- tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
- tp->selective_acks[0].end_seq =
- TCP_SKB_CB(skb)->end_seq;
- }
- __skb_queue_head(&tp->out_of_order_queue, skb);
- } else {
- struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
- u32 seq = TCP_SKB_CB(skb)->seq;
- u32 end_seq = TCP_SKB_CB(skb)->end_seq;
-
- if (seq == TCP_SKB_CB(skb1)->end_seq) {
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-
- if (!tp->rx_opt.num_sacks ||
- tp->selective_acks[0].end_seq != seq)
- goto add_sack;
-
- /* Common case: data arrive in order after hole. */
- tp->selective_acks[0].end_seq = end_seq;
- return;
- }
-
- /* Find place to insert this segment. */
- while (1) {
- if (!after(TCP_SKB_CB(skb1)->seq, seq))
- break;
- if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
- skb1 = NULL;
- break;
- }
- skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
- }
-
- /* Do skb overlap to previous one? */
- if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- /* All the bits are present. Drop. */
- __kfree_skb(skb);
- tcp_dsack_set(sk, seq, end_seq);
- goto add_sack;
- }
- if (after(seq, TCP_SKB_CB(skb1)->seq)) {
- /* Partial overlap. */
- tcp_dsack_set(sk, seq,
- TCP_SKB_CB(skb1)->end_seq);
- } else {
- if (skb_queue_is_first(&tp->out_of_order_queue,
- skb1))
- skb1 = NULL;
- else
- skb1 = skb_queue_prev(
- &tp->out_of_order_queue,
- skb1);
- }
- }
- if (!skb1)
- __skb_queue_head(&tp->out_of_order_queue, skb);
- else
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-
- /* And clean segments covered by new one as whole. */
- while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
- skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
-
- if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
- break;
- if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
- tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
- end_seq);
- break;
- }
- __skb_unlink(skb1, &tp->out_of_order_queue);
- tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
- TCP_SKB_CB(skb1)->end_seq);
- __kfree_skb(skb1);
- }
-
-add_sack:
- if (tcp_is_sack(tp))
- tcp_sack_new_ofo_skb(sk, seq, end_seq);
- }
+ tcp_data_queue_ofo(sk, skb);
}
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4755,10 +4630,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
* Purge the out-of-order queue.
* Return true if queue was pruned.
*/
-static int tcp_prune_ofo_queue(struct sock *sk)
+static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- int res = 0;
+ bool res = false;
if (!skb_queue_empty(&tp->out_of_order_queue)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
@@ -4772,7 +4647,7 @@ static int tcp_prune_ofo_queue(struct sock *sk)
if (tp->rx_opt.sack_ok)
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
- res = 1;
+ res = true;
}
return res;
}
@@ -4794,7 +4669,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
- else if (tcp_memory_pressure)
+ else if (sk_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
@@ -4827,51 +4702,29 @@ static int tcp_prune_queue(struct sock *sk)
return -1;
}
-/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
- * As additional protections, we do not touch cwnd in retransmission phases,
- * and if application hit its sndbuf limit recently.
- */
-void tcp_cwnd_application_limited(struct sock *sk)
+static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
- sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- /* Limited by application or receiver window. */
- u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
- u32 win_used = max(tp->snd_cwnd_used, init_win);
- if (win_used < tp->snd_cwnd) {
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
- }
- tp->snd_cwnd_used = 0;
- }
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
-static int tcp_should_expand_sndbuf(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* If the user specified a specific send buffer setting, do
* not modify it.
*/
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
- return 0;
+ return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (tcp_memory_pressure)
- return 0;
+ if (sk_under_memory_pressure(sk))
+ return false;
/* If we are under soft global TCP memory pressure, do not expand. */
- if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
- return 0;
+ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+ return false;
/* If we filled the congestion window, do not expand. */
if (tp->packets_out >= tp->snd_cwnd)
- return 0;
+ return false;
- return 1;
+ return true;
}
/* When incoming ACK allowed to free some skb from write_queue,
@@ -4885,13 +4738,7 @@ static void tcp_new_space(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_should_expand_sndbuf(sk)) {
- int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
- MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
- int demanded = max_t(unsigned int, tp->snd_cwnd,
- tp->reordering + 1);
- sndmem *= 2 * demanded;
- if (sndmem > sk->sk_sndbuf)
- sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+ tcp_sndbuf_expand(sk);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -4958,7 +4805,7 @@ static inline void tcp_ack_snd_check(struct sock *sk)
* either form (or just set the sysctl tcp_stdurg).
*/
-static void tcp_check_urg(struct sock *sk, struct tcphdr *th)
+static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
@@ -5024,7 +4871,7 @@ static void tcp_check_urg(struct sock *sk, struct tcphdr *th)
}
/* This is the 'fast' part of urgent handling. */
-static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -5044,7 +4891,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
BUG();
tp->urg_data = TCP_URG_VALID | tmp;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
}
}
@@ -5087,7 +4934,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,
return result;
}
-static inline int tcp_checksum_complete_user(struct sock *sk,
+static inline bool tcp_checksum_complete_user(struct sock *sk,
struct sk_buff *skb)
{
return !skb_csum_unnecessary(skb) &&
@@ -5095,19 +4942,19 @@ static inline int tcp_checksum_complete_user(struct sock *sk,
}
#ifdef CONFIG_NET_DMA
-static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
+static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
int hlen)
{
struct tcp_sock *tp = tcp_sk(sk);
int chunk = skb->len - hlen;
int dma_cookie;
- int copied_early = 0;
+ bool copied_early = false;
if (tp->ucopy.wakeup)
- return 0;
+ return false;
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
@@ -5120,7 +4967,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
goto out;
tp->ucopy.dma_cookie = dma_cookie;
- copied_early = 1;
+ copied_early = true;
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
@@ -5130,11 +4977,11 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
(tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
(atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
} else if (chunk > 0) {
tp->ucopy.wakeup = 1;
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
out:
return copied_early;
@@ -5144,15 +4991,13 @@ out:
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
-static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, int syn_inerr)
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, int syn_inerr)
{
- u8 *hash_location;
struct tcp_sock *tp = tcp_sk(sk);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
- tp->rx_opt.saw_tstamp &&
+ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5170,38 +5015,48 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* an acknowledgment should be sent in reply (unless the RST
* bit is set, if so drop the segment and return)".
*/
- if (!th->rst)
+ if (!th->rst) {
+ if (th->syn)
+ goto syn_challenge;
tcp_send_dupack(sk, skb);
+ }
goto discard;
}
/* Step 2: check RST bit */
if (th->rst) {
- tcp_reset(sk);
+ /* RFC 5961 3.2 :
+ * If sequence number exactly matches RCV.NXT, then
+ * RESET the connection
+ * else
+ * Send a challenge ACK
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+ tcp_reset(sk);
+ else
+ tcp_send_challenge_ack(sk);
goto discard;
}
- /* ts_recent update must be made after we are sure that the packet
- * is in window.
- */
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
/* step 3: check security and precedence [ignored] */
- /* step 4: Check for a SYN in window. */
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* step 4: Check for a SYN
+ * RFC 5961 4.2 : Send a challenge ack
+ */
+ if (th->syn) {
+syn_challenge:
if (syn_inerr)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
- tcp_reset(sk);
- return -1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ tcp_send_challenge_ack(sk);
+ goto discard;
}
- return 1;
+ return true;
discard:
__kfree_skb(skb);
- return 0;
+ return false;
}
/*
@@ -5227,12 +5082,13 @@ discard:
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
-int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
- int res;
+ if (unlikely(sk->sk_rx_dst == NULL))
+ inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
* The code loosely follows the one in the famous
@@ -5304,7 +5160,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
- return 0;
+ return;
} else { /* Header too small */
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -5312,11 +5168,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
} else {
int eaten = 0;
int copied_early = 0;
+ bool fragstolen = false;
if (tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len) {
#ifdef CONFIG_NET_DMA
- if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+ if (tp->ucopy.task == current &&
+ sock_owned_by_user(sk) &&
+ tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
copied_early = 1;
eaten = 1;
}
@@ -5352,6 +5211,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (tcp_checksum_complete_user(sk, skb))
goto csum_error;
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
@@ -5363,16 +5225,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
-
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- __skb_pull(skb, tcp_header_len);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
}
tcp_event_data_recv(sk, skb);
@@ -5394,10 +5251,9 @@ no_ack:
else
#endif
if (eaten)
- __kfree_skb(skb);
- else
- sk->sk_data_ready(sk, 0);
- return 0;
+ kfree_skb_partial(skb, fragstolen);
+ sk->sk_data_ready(sk);
+ return;
}
}
@@ -5405,16 +5261,18 @@ slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
goto csum_error;
+ if (!th->ack && !th->rst)
+ goto discard;
+
/*
* Standard slow path.
*/
- res = tcp_validate_incoming(sk, skb, th, 1);
- if (res <= 0)
- return -res;
+ if (!tcp_validate_incoming(sk, skb, th, 1))
+ return;
step5:
- if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5427,27 +5285,113 @@ step5:
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
- return 0;
+ return;
csum_error:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
discard:
__kfree_skb(skb);
- return 0;
}
EXPORT_SYMBOL(tcp_rcv_established);
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (skb != NULL) {
+ icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
+ security_inet_conn_established(sk, skb);
+ }
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ tcp_init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+}
+
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+ u16 mss = tp->rx_opt.mss_clamp;
+ bool syn_drop;
+
+ if (mss == tp->rx_opt.user_mss) {
+ struct tcp_options_received opt;
+
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+ tcp_parse_options(synack, &opt, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+ if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
+ cookie->len = -1;
+
+ /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
+ * the remote receives only the retransmitted (regular) SYNs: either
+ * the original SYN-data or the corresponding SYN-ACK is lost.
+ */
+ syn_drop = (cookie->len <= 0 && data && tp->total_retrans);
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
+
+ if (data) { /* Retransmit unacked data in SYN */
+ tcp_for_write_queue_from(data, sk) {
+ if (data == tcp_send_head(sk) ||
+ __tcp_retransmit_skb(sk, data))
+ break;
+ }
+ tcp_rearm_rto(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ return true;
+ }
+ tp->syn_data_acked = tp->syn_data;
+ if (tp->syn_data_acked)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ return false;
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+ const struct tcphdr *th, unsigned int len)
{
- u8 *hash_location;
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_cookie_values *cvp = tp->cookie_values;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
if (th->ack) {
/* rfc793:
@@ -5457,11 +5401,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
* a reset (unless the RST bit is set, if so drop
* the segment and return)"
- *
- * We do not send data with SYN, so that RFC-correct
- * test reduces to:
*/
- if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
goto reset_and_undo;
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5503,7 +5445,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
TCP_ECN_rcv_synack(tp, th);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_ack(sk, skb, FLAG_SLOWPATH);
/* Ok.. it's good. Set up sequence numbers and
@@ -5516,7 +5458,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* never scaled.
*/
tp->snd_wnd = ntohs(th->window);
- tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5545,61 +5486,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
- if (cvp != NULL &&
- cvp->cookie_pair_size > 0 &&
- tp->rx_opt.cookie_plus > 0) {
- int cookie_size = tp->rx_opt.cookie_plus
- - TCPOLEN_COOKIE_BASE;
- int cookie_pair_size = cookie_size
- + cvp->cookie_desired;
-
- /* A cookie extension option was sent and returned.
- * Note that each incoming SYNACK replaces the
- * Responder cookie. The initial exchange is most
- * fragile, as protection against spoofing relies
- * entirely upon the sequence and timestamp (above).
- * This replacement strategy allows the correct pair to
- * pass through, while any others will be filtered via
- * Responder verification later.
- */
- if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
- memcpy(&cvp->cookie_pair[cvp->cookie_desired],
- hash_location, cookie_size);
- cvp->cookie_pair_size = cookie_pair_size;
- }
- }
-
smp_mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
- security_inet_conn_established(sk, skb);
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- tcp_init_metrics(sk);
-
- tcp_init_congestion_control(sk);
-
- /* Prevent spurious tcp_cwnd_restart() on first data
- * packet.
- */
- tp->lsndtime = tcp_time_stamp;
-
- tcp_init_buffer_space(sk);
+ tcp_finish_connect(sk, skb);
- if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
-
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
+ if ((tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc))
+ return -1;
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
@@ -5613,8 +5506,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
inet_csk_schedule_ack(sk);
icsk->icsk_ack.lrcvtime = tcp_time_stamp;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(sk);
tcp_enter_quickack_mode(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5680,7 +5571,9 @@ discard:
tcp_send_synack(sk);
#if 0
/* Note, we could accept data and URG from this segment.
- * There are no obstacles to make this.
+ * There are no obstacles to make this (except that we must
+ * either change tcp_recvmsg() to prevent it from returning data
+ * before 3WHS completes per RFC793, or employ TCP Fast Open).
*
* However, if we ignore data in ACKless segments sometimes,
* we have no reasons to accept it sometimes.
@@ -5716,12 +5609,14 @@ reset_and_undo:
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+ const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *req;
int queued = 0;
- int res;
+ bool acceptable;
+ u32 synack_stamp;
tp->rx_opt.saw_tstamp = 0;
@@ -5737,6 +5632,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
if (th->syn) {
+ if (th->fin)
+ goto discard;
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
@@ -5774,124 +5671,167 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
return 0;
}
- res = tcp_validate_incoming(sk, skb, th, 0);
- if (res <= 0)
- return -res;
+ req = tp->fastopen_rsk;
+ if (req != NULL) {
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+
+ if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+ goto discard;
+ }
+
+ if (!th->ack && !th->rst)
+ goto discard;
+
+ if (!tcp_validate_incoming(sk, skb, th, 0))
+ return 0;
/* step 5: check the ACK field */
- if (th->ack) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
-
- switch (sk->sk_state) {
- case TCP_SYN_RECV:
- if (acceptable) {
- tp->copied_seq = tp->rcv_nxt;
- smp_mb();
- tcp_set_state(sk, TCP_ESTABLISHED);
- sk->sk_state_change(sk);
-
- /* Note, that this wakeup is only for marginal
- * crossed SYN case. Passively open sockets
- * are not waked up, because sk->sk_sleep ==
- * NULL and sk->sk_socket == NULL.
- */
- if (sk->sk_socket)
- sk_wake_async(sk,
- SOCK_WAKE_IO, POLL_OUT);
-
- tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
- tp->snd_wnd = ntohs(th->window) <<
- tp->rx_opt.snd_wscale;
- tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-
- /* tcp_ack considers this ACK as duplicate
- * and does not calculate rtt.
- * Force it here.
- */
- tcp_ack_update_rtt(sk, 0, 0);
+ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT) > 0;
- if (tp->rx_opt.tstamp_ok)
- tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ if (!acceptable)
+ return 1;
- /* Make sure socket is routed, for
- * correct metrics.
- */
- icsk->icsk_af_ops->rebuild_header(sk);
+ /* Once we leave TCP_SYN_RECV, we no longer need req
+ * so release it.
+ */
+ if (req) {
+ synack_stamp = tcp_rsk(req)->snt_synack;
+ tp->total_retrans = req->num_retrans;
+ reqsk_fastopen_remove(sk, req, false);
+ } else {
+ synack_stamp = tp->lsndtime;
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_congestion_control(sk);
+
+ tcp_mtup_init(sk);
+ tp->copied_seq = tp->rcv_nxt;
+ tcp_init_buffer_space(sk);
+ }
+ smp_mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->sk_state_change(sk);
- tcp_init_metrics(sk);
+ /* Note that this wakeup is only for the marginal crossed SYN case.
+ * Passively opened sockets are not woken up, because
+ * sk->sk_sleep == NULL and sk->sk_socket == NULL.
+ */
+ if (sk->sk_socket)
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- tcp_init_congestion_control(sk);
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_synack_rtt_meas(sk, synack_stamp);
- /* Prevent spurious tcp_cwnd_restart() on
- * first data packet.
- */
- tp->lsndtime = tcp_time_stamp;
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
- tcp_mtup_init(sk);
- tcp_initialize_rcv_mss(sk);
- tcp_init_buffer_space(sk);
- tcp_fast_path_on(tp);
- } else {
+ if (req) {
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmit any data sooner based on when they
+ * are sent out.
+ */
+ tcp_rearm_rto(sk);
+ } else
+ tcp_init_metrics(sk);
+
+ tcp_update_pacing_rate(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data packet */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_initialize_rcv_mss(sk);
+ tcp_fast_path_on(tp);
+ break;
+
+ case TCP_FIN_WAIT1: {
+ struct dst_entry *dst;
+ int tmo;
+
+ /* If we enter the TCP_FIN_WAIT1 state and we are a
+ * Fast Open socket and this is the first acceptable
+ * ACK we have received, this would have acknowledged
+ * our SYNACK so stop the SYNACK timer.
+ */
+ if (req != NULL) {
+ /* Return RST if ack_seq is invalid.
+ * Note that RFC793 only says to generate a
+ * DUPACK for it but for TCP Fast Open it seems
+ * better to treat this case like TCP_SYN_RECV
+ * above.
+ */
+ if (!acceptable)
return 1;
- }
+ /* We no longer need the request sock. */
+ reqsk_fastopen_remove(sk, req, false);
+ tcp_rearm_rto(sk);
+ }
+ if (tp->snd_una != tp->write_seq)
break;
- case TCP_FIN_WAIT1:
- if (tp->snd_una == tp->write_seq) {
- tcp_set_state(sk, TCP_FIN_WAIT2);
- sk->sk_shutdown |= SEND_SHUTDOWN;
- dst_confirm(__sk_dst_get(sk));
-
- if (!sock_flag(sk, SOCK_DEAD))
- /* Wake up lingering close() */
- sk->sk_state_change(sk);
- else {
- int tmo;
-
- if (tp->linger2 < 0 ||
- (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
- tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- return 1;
- }
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ sk->sk_shutdown |= SEND_SHUTDOWN;
- tmo = tcp_fin_time(sk);
- if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
- } else if (th->fin || sock_owned_by_user(sk)) {
- /* Bad case. We could lose such FIN otherwise.
- * It is not a big problem, but it looks confusing
- * and not so rare event. We still can lose it now,
- * if it spins in bh_lock_sock(), but it is really
- * marginal case.
- */
- inet_csk_reset_keepalive_timer(sk, tmo);
- } else {
- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
- goto discard;
- }
- }
- }
- break;
+ dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
- case TCP_CLOSING:
- if (tp->snd_una == tp->write_seq) {
- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- goto discard;
- }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Wake up lingering close() */
+ sk->sk_state_change(sk);
break;
+ }
- case TCP_LAST_ACK:
- if (tp->snd_una == tp->write_seq) {
- tcp_update_metrics(sk);
- tcp_done(sk);
- goto discard;
- }
- break;
+ if (tp->linger2 < 0 ||
+ (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return 1;
}
- } else
- goto discard;
+
+ tmo = tcp_fin_time(sk);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ } else if (th->fin || sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and not so rare event. We still can lose it now,
+ * if it spins in bh_lock_sock(), but it is really
+ * marginal case.
+ */
+ inet_csk_reset_keepalive_timer(sk, tmo);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ break;
+ }
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ goto discard;
+ }
+ break;
+
+ case TCP_LAST_ACK:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ goto discard;
+ }
+ break;
+ }
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dd555051ec8..77cccda1ad0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -50,6 +50,7 @@
* a single port at the same time.
*/
+#define pr_fmt(fmt) "TCP: " fmt
#include <linux/bottom_half.h>
#include <linux/types.h>
@@ -72,6 +73,9 @@
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
+#include <net/secure_seq.h>
+#include <net/tcp_memcontrol.h>
+#include <net/busy_poll.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
@@ -88,22 +92,14 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency);
#ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
- __be32 addr);
-static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, struct tcphdr *th);
-#else
-static inline
-struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
-{
- return NULL;
-}
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
+static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
@@ -146,13 +142,15 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
+ struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct rtable *rt;
+ __be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
- int tmp;
+ struct flowi4 *fl4;
+ struct rtable *rt;
int err;
+ struct ip_options_rcu *inet_opt;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -161,20 +159,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
- if (inet->opt && inet->opt->srr) {
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr) {
if (!daddr)
return -EINVAL;
- nexthop = inet->opt->faddr;
- }
-
- tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_TCP,
- inet->inet_sport, usin->sin_port, sk, 1);
- if (tmp < 0) {
- if (tmp == -ENETUNREACH)
- IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
- return tmp;
+ nexthop = inet_opt->opt.faddr;
+ }
+
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_TCP,
+ orig_sport, orig_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ return err;
}
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
@@ -182,44 +186,31 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -ENETUNREACH;
}
- if (!inet->opt || !inet->opt->srr)
- daddr = rt->rt_dst;
+ if (!inet_opt || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
if (!inet->inet_saddr)
- inet->inet_saddr = rt->rt_src;
+ inet->inet_saddr = fl4->saddr;
inet->inet_rcv_saddr = inet->inet_saddr;
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ if (likely(!tp->repair))
+ tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
- !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
- struct inet_peer *peer = rt_get_peer(rt);
- /*
- * VJ's idea. We save last timestamp seen from
- * the destination in peer table, when entering state
- * TIME-WAIT * and initialize rx_opt.ts_recent from it,
- * when trying new connection.
- */
- if (peer) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
- tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
- tp->rx_opt.ts_recent = peer->tcp_ts;
- }
- }
- }
+ !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+ tcp_fetch_timewait_stamp(sk, &rt->dst);
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet->opt)
- inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
@@ -233,16 +224,18 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err)
goto failure;
- err = ip_route_newports(&rt, IPPROTO_TCP,
- inet->inet_sport, inet->inet_dport, sk);
- if (err)
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto failure;
-
+ }
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
- if (!tp->write_seq)
+ if (!tp->write_seq && likely(!tp->repair))
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
@@ -251,6 +244,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_id = tp->write_seq ^ jiffies;
err = tcp_connect(sk);
+
rt = NULL;
if (err)
goto failure;
@@ -271,31 +265,20 @@ failure:
EXPORT_SYMBOL(tcp_v4_connect);
/*
- * This routine does path mtu discovery as defined in RFC1191.
+ * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
+ * It can be called through tcp_release_cb() if socket was owned by user
+ * at the time tcp_v4_err() was called to handle ICMP message.
*/
-static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
+static void tcp_v4_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
+ u32 mtu = tcp_sk(sk)->mtu_info;
- /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
- * send out by Linux are always <576bytes so they should go through
- * unfragmented).
- */
- if (sk->sk_state == TCP_LISTEN)
- return;
-
- /* We don't check in the destentry if pmtu discovery is forbidden
- * on this route. We just assume that no packet_to_big packets
- * are send back when pmtu discovery is not active.
- * There is a small race when the user changes this flag in the
- * route, but I think that's acceptable.
- */
- if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
return;
- dst->ops->update_pmtu(dst, mtu);
-
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
@@ -305,6 +288,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
tcp_sync_mss(sk, mtu);
@@ -317,6 +301,14 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
} /* else let the usual retransmit timer handle it */
}
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -335,7 +327,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
- struct iphdr *iph = (struct iphdr *)icmp_skb->data;
+ const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
struct inet_connection_sock *icsk;
struct tcp_sock *tp;
@@ -344,7 +336,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
const int code = icmp_hdr(icmp_skb)->code;
struct sock *sk;
struct sk_buff *skb;
- __u32 seq;
+ struct request_sock *fastopen;
+ __u32 seq, snd_una;
__u32 remaining;
int err;
struct net *net = dev_net(icmp_skb->dev);
@@ -368,10 +361,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
* servers this needs to be solved differently.
+ * We do take care of PMTU discovery (RFC1191) special case :
+ * we can receive locally generated ICMP messages while socket is held.
*/
- if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
+ if (sock_owned_by_user(sk)) {
+ if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ }
if (sk->sk_state == TCP_CLOSE)
goto out;
@@ -383,13 +379,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk = inet_csk(sk);
tp = tcp_sk(sk);
seq = ntohl(th->seq);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
+ fastopen = tp->fastopen_rsk;
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
- !between(seq, tp->snd_una, tp->snd_nxt)) {
+ !between(seq, snd_una, tp->snd_nxt)) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
switch (type) {
+ case ICMP_REDIRECT:
+ do_redirect(icmp_skb, sk);
+ goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
goto out;
@@ -401,8 +403,20 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
goto out;
if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
- if (!sock_owned_by_user(sk))
- do_pmtu_discovery(sk, iph, info);
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs sent out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ tp->mtu_info = info;
+ if (!sock_owned_by_user(sk)) {
+ tcp_v4_mtu_reduced(sk);
+ } else {
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ sock_hold(sk);
+ }
goto out;
}
@@ -412,15 +426,15 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
break;
if (seq != tp->snd_una || !icsk->icsk_retransmits ||
- !icsk->icsk_backoff)
+ !icsk->icsk_backoff || fastopen)
break;
if (sock_owned_by_user(sk))
break;
icsk->icsk_backoff--;
- inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
- icsk->icsk_backoff;
+ inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
+ TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
tcp_bound_rto(sk);
skb = tcp_write_queue_head(sk);
@@ -474,12 +488,17 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
* errors returned from accept().
*/
inet_csk_reqsk_queue_drop(sk, req, prev);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
goto out;
case TCP_SYN_SENT:
- case TCP_SYN_RECV: /* Cannot happen.
- It can f.e. if SYNs crossed.
- */
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * is already accepted it is treated as a connected one below.
+ */
+ if (fastopen && fastopen->sk == NULL)
+ break;
+
if (!sock_owned_by_user(sk)) {
sk->sk_err = err;
@@ -521,8 +540,7 @@ out:
sock_put(sk);
}
-static void __tcp_v4_send_check(struct sk_buff *skb,
- __be32 saddr, __be32 daddr)
+void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
struct tcphdr *th = tcp_hdr(skb);
@@ -541,29 +559,12 @@ static void __tcp_v4_send_check(struct sk_buff *skb,
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
-int tcp_v4_gso_send_check(struct sk_buff *skb)
-{
- const struct iphdr *iph;
- struct tcphdr *th;
-
- if (!pskb_may_pull(skb, sizeof(*th)))
- return -EINVAL;
-
- iph = ip_hdr(skb);
- th = tcp_hdr(skb);
-
- th->check = 0;
- skb->ip_summed = CHECKSUM_PARTIAL;
- __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
- return 0;
-}
-
/*
* This routine will send an RST to the other tcp.
*
@@ -579,7 +580,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
@@ -589,6 +590,10 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
+ const __u8 *hash_location = NULL;
+ unsigned char newhash[16];
+ int genhash;
+ struct sock *sk1 = NULL;
#endif
struct net *net;
@@ -619,7 +624,37 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_len = sizeof(rep.th);
#ifdef CONFIG_TCP_MD5SIG
- key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
+ hash_location = tcp_parse_md5sig_option(th);
+ if (!sk && hash_location) {
+ /*
+ * active side is lost. Try to find listening socket through
+ * source port, and then find md5 key through listening socket.
+ * we are not loose security here:
+ * Incoming packet is checked with md5 hash with finding key,
+ * no RST generated if md5 hash doesn't match.
+ */
+ sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
+ &tcp_hashinfo, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
+ ntohs(th->source), inet_iif(skb));
+ /* don't send rst if it can't find key */
+ if (!sk1)
+ return;
+ rcu_read_lock();
+ key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr, AF_INET);
+ if (!key)
+ goto release_sk1;
+
+ genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
+ if (genhash || memcmp(hash_location, newhash, 16) != 0)
+ goto release_sk1;
+ } else {
+ key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr,
+ AF_INET) : NULL;
+ }
+
if (key) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -639,13 +674,28 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+ /* When socket is gone, all binding information is lost.
+ * routing might fail in this case. No choice here, if we choose to force
+ * input interface, we will misroute in case of asymmetric route.
+ */
+ if (sk)
+ arg.bound_dev_if = sk->sk_bound_dev_if;
net = dev_net(skb_dst(skb)->dev);
- ip_send_reply(net->ipv4.tcp_sock, skb,
- &arg, arg.iov[0].iov_len);
+ arg.tos = ip_hdr(skb)->tos;
+ ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+
+#ifdef CONFIG_TCP_MD5SIG
+release_sk1:
+ if (sk1) {
+ rcu_read_unlock();
+ sock_put(sk1);
+ }
+#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
@@ -653,11 +703,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
*/
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
- u32 win, u32 ts, int oif,
+ u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
- int reply_flags)
+ int reply_flags, u8 tos)
{
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
@@ -674,12 +724,12 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- if (ts) {
+ if (tsecr) {
rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
- rep.opt[1] = htonl(tcp_time_stamp);
- rep.opt[2] = htonl(ts);
+ rep.opt[1] = htonl(tsval);
+ rep.opt[2] = htonl(tsecr);
arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
}
@@ -694,7 +744,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
#ifdef CONFIG_TCP_MD5SIG
if (key) {
- int offset = (ts) ? 3 : 0;
+ int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -715,9 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
if (oif)
arg.bound_dev_if = oif;
-
- ip_send_reply(net->ipv4.tcp_sock, skb,
- &arg, arg.iov[0].iov_len);
+ arg.tos = tos;
+ ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}
@@ -729,10 +779,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
tcp_twsk_md5_key(tcptw),
- tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
+ tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ tw->tw_tos
);
inet_twsk_put(tw);
@@ -741,12 +793,19 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
- tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
- tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+ tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_time_stamp,
req->ts_recent,
0,
- tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
- inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
+ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+ AF_INET),
+ inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ ip_hdr(skb)->tos);
}
/*
@@ -756,36 +815,44 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp)
+ u16 queue_mapping,
+ struct tcp_fastopen_cookie *foc)
{
const struct inet_request_sock *ireq = inet_rsk(req);
+ struct flowi4 fl4;
int err = -1;
- struct sk_buff * skb;
+ struct sk_buff *skb;
/* First, grab a route. */
- if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, rvp);
+ skb = tcp_make_synack(sk, dst, req, foc);
if (skb) {
- __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
+ __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
- ireq->rmt_addr,
+ skb_set_queue_mapping(skb, queue_mapping);
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
+ if (!tcp_rsk(req)->snt_synack && !err)
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
}
- dst_release(dst);
return err;
}
-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
- struct request_values *rvp)
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
{
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- return tcp_v4_send_synack(sk, NULL, req, rvp);
+ int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
+
+ if (!res) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ }
+ return res;
}
/*
@@ -796,35 +863,50 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-static void syn_flood_warning(const struct sk_buff *skb)
+/*
+ * Return true if a syncookie should be sent
+ */
+bool tcp_syn_flood_action(struct sock *sk,
+ const struct sk_buff *skb,
+ const char *proto)
{
- const char *msg;
+ const char *msg = "Dropping request";
+ bool want_cookie = false;
+ struct listen_sock *lopt;
#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies)
+ if (sysctl_tcp_syncookies) {
msg = "Sending cookies";
- else
+ want_cookie = true;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ } else
#endif
- msg = "Dropping request";
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
- ntohs(tcp_hdr(skb)->dest), msg);
+ lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+ if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
+ lopt->synflood_warned = 1;
+ pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
+ proto, ntohs(tcp_hdr(skb)->dest), msg);
+ }
+ return want_cookie;
}
+EXPORT_SYMBOL(tcp_syn_flood_action);
/*
* Save and compile IPv4 options into the request_sock if needed.
*/
-static struct ip_options *tcp_v4_save_options(struct sock *sk,
- struct sk_buff *skb)
+static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
{
- struct ip_options *opt = &(IPCB(skb)->opt);
- struct ip_options *dopt = NULL;
+ const struct ip_options *opt = &(IPCB(skb)->opt);
+ struct ip_options_rcu *dopt = NULL;
if (opt && opt->optlen) {
- int opt_size = optlength(opt);
+ int opt_size = sizeof(*dopt) + opt->optlen;
+
dopt = kmalloc(opt_size, GFP_ATOMIC);
if (dopt) {
- if (ip_options_echo(dopt, skb)) {
+ if (ip_options_echo(&dopt->opt, skb)) {
kfree(dopt);
dopt = NULL;
}
@@ -841,150 +923,129 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,
*/
/* Find the Key structure for an address. */
-static struct tcp_md5sig_key *
- tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
+struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family)
{
struct tcp_sock *tp = tcp_sk(sk);
- int i;
-
- if (!tp->md5sig_info || !tp->md5sig_info->entries4)
+ struct tcp_md5sig_key *key;
+ unsigned int size = sizeof(struct in_addr);
+ struct tcp_md5sig_info *md5sig;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
+ if (!md5sig)
return NULL;
- for (i = 0; i < tp->md5sig_info->entries4; i++) {
- if (tp->md5sig_info->keys4[i].addr == addr)
- return &tp->md5sig_info->keys4[i].base;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6)
+ size = sizeof(struct in6_addr);
+#endif
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ if (key->family != family)
+ continue;
+ if (!memcmp(&key->addr, addr, size))
+ return key;
}
return NULL;
}
+EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
struct sock *addr_sk)
{
- return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
+ union tcp_md5_addr *addr;
+
+ addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
+ return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
struct request_sock *req)
{
- return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
+ union tcp_md5_addr *addr;
+
+ addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr;
+ return tcp_md5_do_lookup(sk, addr, AF_INET);
}
/* This can be called on a newly created socket, from other files */
-int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
- u8 *newkey, u8 newkeylen)
+int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp4_md5sig_key *keys;
+ struct tcp_md5sig_info *md5sig;
- key = tcp_v4_md5_do_lookup(sk, addr);
+ key = tcp_md5_do_lookup(sk, addr, family);
if (key) {
/* Pre-existing entry - just update that one. */
- kfree(key->key);
- key->key = newkey;
+ memcpy(key->key, newkey, newkeylen);
key->keylen = newkeylen;
- } else {
- struct tcp_md5sig_info *md5sig;
-
- if (!tp->md5sig_info) {
- tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
- GFP_ATOMIC);
- if (!tp->md5sig_info) {
- kfree(newkey);
- return -ENOMEM;
- }
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
- }
- if (tcp_alloc_md5sig_pool(sk) == NULL) {
- kfree(newkey);
+ return 0;
+ }
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info,
+ sock_owned_by_user(sk));
+ if (!md5sig) {
+ md5sig = kmalloc(sizeof(*md5sig), gfp);
+ if (!md5sig)
return -ENOMEM;
- }
- md5sig = tp->md5sig_info;
-
- if (md5sig->alloced4 == md5sig->entries4) {
- keys = kmalloc((sizeof(*keys) *
- (md5sig->entries4 + 1)), GFP_ATOMIC);
- if (!keys) {
- kfree(newkey);
- tcp_free_md5sig_pool();
- return -ENOMEM;
- }
- if (md5sig->entries4)
- memcpy(keys, md5sig->keys4,
- sizeof(*keys) * md5sig->entries4);
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ INIT_HLIST_HEAD(&md5sig->head);
+ rcu_assign_pointer(tp->md5sig_info, md5sig);
+ }
- /* Free old key list, and reference new one */
- kfree(md5sig->keys4);
- md5sig->keys4 = keys;
- md5sig->alloced4++;
- }
- md5sig->entries4++;
- md5sig->keys4[md5sig->entries4 - 1].addr = addr;
- md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
- md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
+ key = sock_kmalloc(sk, sizeof(*key), gfp);
+ if (!key)
+ return -ENOMEM;
+ if (!tcp_alloc_md5sig_pool()) {
+ sock_kfree_s(sk, key, sizeof(*key));
+ return -ENOMEM;
}
+
+ memcpy(key->key, newkey, newkeylen);
+ key->keylen = newkeylen;
+ key->family = family;
+ memcpy(&key->addr, addr,
+ (family == AF_INET6) ? sizeof(struct in6_addr) :
+ sizeof(struct in_addr));
+ hlist_add_head_rcu(&key->node, &md5sig->head);
return 0;
}
-EXPORT_SYMBOL(tcp_v4_md5_do_add);
+EXPORT_SYMBOL(tcp_md5_do_add);
-static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
- u8 *newkey, u8 newkeylen)
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
- return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
- newkey, newkeylen);
-}
+ struct tcp_md5sig_key *key;
-int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int i;
-
- for (i = 0; i < tp->md5sig_info->entries4; i++) {
- if (tp->md5sig_info->keys4[i].addr == addr) {
- /* Free the key */
- kfree(tp->md5sig_info->keys4[i].base.key);
- tp->md5sig_info->entries4--;
-
- if (tp->md5sig_info->entries4 == 0) {
- kfree(tp->md5sig_info->keys4);
- tp->md5sig_info->keys4 = NULL;
- tp->md5sig_info->alloced4 = 0;
- } else if (tp->md5sig_info->entries4 != i) {
- /* Need to do some manipulation */
- memmove(&tp->md5sig_info->keys4[i],
- &tp->md5sig_info->keys4[i+1],
- (tp->md5sig_info->entries4 - i) *
- sizeof(struct tcp4_md5sig_key));
- }
- tcp_free_md5sig_pool();
- return 0;
- }
- }
- return -ENOENT;
+ key = tcp_md5_do_lookup(sk, addr, family);
+ if (!key)
+ return -ENOENT;
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
+ return 0;
}
-EXPORT_SYMBOL(tcp_v4_md5_do_del);
+EXPORT_SYMBOL(tcp_md5_do_del);
-static void tcp_v4_clear_md5_list(struct sock *sk)
+static void tcp_clear_md5_list(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ struct hlist_node *n;
+ struct tcp_md5sig_info *md5sig;
- /* Free each key, then the set of key keys,
- * the crypto element, and then decrement our
- * hold on the last resort crypto.
- */
- if (tp->md5sig_info->entries4) {
- int i;
- for (i = 0; i < tp->md5sig_info->entries4; i++)
- kfree(tp->md5sig_info->keys4[i].base.key);
- tp->md5sig_info->entries4 = 0;
- tcp_free_md5sig_pool();
- }
- if (tp->md5sig_info->keys4) {
- kfree(tp->md5sig_info->keys4);
- tp->md5sig_info->keys4 = NULL;
- tp->md5sig_info->alloced4 = 0;
+ md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
+
+ hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
}
}
@@ -993,7 +1054,6 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
- u8 *newkey;
if (optlen < sizeof(cmd))
return -EINVAL;
@@ -1004,32 +1064,16 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
if (sin->sin_family != AF_INET)
return -EINVAL;
- if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
- if (!tcp_sk(sk)->md5sig_info)
- return -ENOENT;
- return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
- }
+ if (!cmd.tcpm_key || !cmd.tcpm_keylen)
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- if (!tcp_sk(sk)->md5sig_info) {
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_md5sig_info *p;
-
- p = kzalloc(sizeof(*p), sk->sk_allocation);
- if (!p)
- return -EINVAL;
-
- tp->md5sig_info = p;
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
- }
-
- newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
- if (!newkey)
- return -ENOMEM;
- return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
- newkey, cmd.tcpm_keylen);
+ return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
+ GFP_KERNEL);
}
static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
@@ -1055,8 +1099,8 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}
-static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, struct tcphdr *th)
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
struct hash_desc *desc;
@@ -1088,20 +1132,20 @@ clear_hash_noput:
}
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
- struct sock *sk, struct request_sock *req,
- struct sk_buff *skb)
+ const struct sock *sk, const struct request_sock *req,
+ const struct sk_buff *skb)
{
struct tcp_md5sig_pool *hp;
struct hash_desc *desc;
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
if (sk) {
saddr = inet_sk(sk)->inet_saddr;
daddr = inet_sk(sk)->inet_daddr;
} else if (req) {
- saddr = inet_rsk(req)->loc_addr;
- daddr = inet_rsk(req)->rmt_addr;
+ saddr = inet_rsk(req)->ir_loc_addr;
+ daddr = inet_rsk(req)->ir_rmt_addr;
} else {
const struct iphdr *iph = ip_hdr(skb);
saddr = iph->saddr;
@@ -1138,7 +1182,7 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
-static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
+static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
/*
* This gets called for each TCP segment that arrives
@@ -1148,28 +1192,29 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
* o MD5 hash and we're not expecting one.
* o MD5 hash and its wrong.
*/
- __u8 *hash_location = NULL;
+ const __u8 *hash_location = NULL;
struct tcp_md5sig_key *hash_expected;
const struct iphdr *iph = ip_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
int genhash;
unsigned char newhash[16];
- hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
+ hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
+ AF_INET);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
if (!hash_expected && !hash_location)
- return 0;
+ return false;
if (hash_expected && !hash_location) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
- return 1;
+ return true;
}
if (!hash_expected && hash_location) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- return 1;
+ return true;
}
/* Okay, so this is hash_expected and hash_location -
@@ -1180,15 +1225,14 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
NULL, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- if (net_ratelimit()) {
- printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
- &iph->saddr, ntohs(th->source),
- &iph->daddr, ntohs(th->dest),
- genhash ? " tcp_v4_calc_md5_hash failed" : "");
- }
- return 1;
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest),
+ genhash ? " tcp_v4_calc_md5_hash failed"
+ : "");
+ return true;
}
- return 0;
+ return false;
}
#endif
@@ -1212,9 +1256,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
- u8 *hash_location;
struct request_sock *req;
struct inet_request_sock *ireq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -1222,11 +1264,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
-#ifdef CONFIG_SYN_COOKIES
- int want_cookie = 0;
-#else
-#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
-#endif
+ bool want_cookie = false, fastopen;
+ struct flowi4 fl4;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int err;
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1236,15 +1277,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
- if (net_ratelimit())
- syn_flood_warning(skb);
-#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies) {
- want_cookie = 1;
- } else
-#endif
- goto drop;
+ if ((sysctl_tcp_syncookies == 2 ||
+ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+ want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
+ if (!want_cookie)
+ goto drop;
}
/* Accept backlog is full. If we have already queued enough
@@ -1252,8 +1289,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
+ }
req = inet_reqsk_alloc(&tcp_request_sock_ops);
if (!req)
@@ -1266,43 +1305,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
-
- if (tmp_opt.cookie_plus > 0 &&
- tmp_opt.saw_tstamp &&
- !tp->rx_opt.cookie_out_never &&
- (sysctl_tcp_cookie_size > 0 ||
- (tp->cookie_values != NULL &&
- tp->cookie_values->cookie_desired > 0))) {
- u8 *c;
- u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
- int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
-
- if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
- goto drop_and_release;
-
- /* Secret recipe starts with IP addresses */
- *mess++ ^= (__force u32)daddr;
- *mess++ ^= (__force u32)saddr;
-
- /* plus variable length Initiator Cookie */
- c = (u8 *)mess;
- while (l-- > 0)
- *c++ ^= *hash_location++;
-
-#ifdef CONFIG_SYN_COOKIES
- want_cookie = 0; /* not our kind of cookie */
-#endif
- tmp_ext.cookie_out_never = 0; /* false */
- tmp_ext.cookie_plus = tmp_opt.cookie_plus;
- } else if (!tp->rx_opt.cookie_in_always) {
- /* redundant indications, but ensure initialization. */
- tmp_ext.cookie_out_never = 1; /* true */
- tmp_ext.cookie_plus = 0;
- } else {
- goto drop_and_release;
- }
- tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
+ tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
@@ -1311,23 +1314,22 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_openreq_init(req, &tmp_opt, skb);
ireq = inet_rsk(req);
- ireq->loc_addr = daddr;
- ireq->rmt_addr = saddr;
+ ireq->ir_loc_addr = daddr;
+ ireq->ir_rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
- ireq->opt = tcp_v4_save_options(sk, skb);
+ ireq->opt = tcp_v4_save_options(skb);
+ ireq->ir_mark = inet_request_mark(sk, skb);
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, tcp_hdr(skb));
+ TCP_ECN_create_request(req, skb, sock_net(sk));
if (want_cookie) {
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
- struct inet_peer *peer = NULL;
-
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
* state TIME-WAIT, and check against it before
@@ -1339,13 +1341,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
- (dst = inet_csk_route_req(sk, req)) != NULL &&
- (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
- peer->daddr.a4 == saddr) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
- (s32)(peer->tcp_ts - req->ts_recent) >
- TCP_PAWS_WINDOW) {
+ (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
+ fl4.daddr == saddr) {
+ if (!tcp_peer_is_proven(req, dst, true)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
@@ -1354,8 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
- (!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
+ !tcp_peer_is_proven(req, dst, false)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
@@ -1363,21 +1360,32 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* to destinations, already remembered
* to the moment of synflood.
*/
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
&saddr, ntohs(tcp_hdr(skb)->source));
goto drop_and_release;
}
isn = tcp_v4_init_sequence(skb);
}
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+ goto drop_and_free;
+
tcp_rsk(req)->snt_isn = isn;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_openreq_init_rwin(req, sk, dst);
+ fastopen = !want_cookie &&
+ tcp_try_fastopen(sk, skb, req, &foc, dst);
+ err = tcp_v4_send_synack(sk, dst, req,
+ skb_get_queue_mapping(skb), &foc);
+ if (!fastopen) {
+ if (err || want_cookie)
+ goto drop_and_free;
- if (tcp_v4_send_synack(sk, dst, req,
- (struct request_values *)&tmp_ext) ||
- want_cookie)
- goto drop_and_free;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->listener = NULL;
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ }
- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
return 0;
drop_and_release:
@@ -1385,6 +1393,7 @@ drop_and_release:
drop_and_free:
reqsk_free(req);
drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
@@ -1405,38 +1414,46 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
#endif
+ struct ip_options_rcu *inet_opt;
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto exit;
-
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit_nonewsk;
newsk->sk_gso_type = SKB_GSO_TCPV4;
- sk_setup_caps(newsk, dst);
+ inet_sk_rx_dst_set(newsk, skb);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->inet_daddr = ireq->rmt_addr;
- newinet->inet_rcv_saddr = ireq->loc_addr;
- newinet->inet_saddr = ireq->loc_addr;
- newinet->opt = ireq->opt;
+ newinet->inet_daddr = ireq->ir_rmt_addr;
+ newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ inet_opt = ireq->opt;
+ rcu_assign_pointer(newinet->inet_opt, inet_opt);
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
+ newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newinet->opt)
- inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
+ if (inet_opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
- tcp_mtup_init(newsk);
+ if (!dst) {
+ dst = inet_csk_route_child_sock(sk, newsk, req);
+ if (!dst)
+ goto put_and_exit;
+ } else {
+ /* syncookie case : see end of cookie_v4_check() */
+ }
+ sk_setup_caps(newsk, dst);
+
tcp_sync_mss(newsk, dst_mtu(dst));
- newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ newtp->advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)->rx_opt.user_mss &&
tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
@@ -1445,7 +1462,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
- key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET);
if (key != NULL) {
/*
* We're using one, so create a matching key
@@ -1453,18 +1471,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
* memory, then we end up not copying the key
* across. Shucks.
*/
- char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
- if (newkey != NULL)
- tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
- newkey, key->keylen);
+ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET, key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
- if (__inet_inherit_port(sk, newsk) < 0) {
- sock_put(newsk);
- goto exit;
- }
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
__inet_hash_nolisten(newsk, NULL);
return newsk;
@@ -1476,6 +1490,10 @@ exit_nonewsk:
exit:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -1489,7 +1507,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
iph->saddr, iph->daddr);
if (req)
- return tcp_check_req(sk, skb, req, prev);
+ return tcp_check_req(sk, skb, req, prev, false);
nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1510,28 +1528,6 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
return sk;
}
-static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
-
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!tcp_v4_check(skb->len, iph->saddr,
- iph->daddr, skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- return 0;
- }
- }
-
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len, IPPROTO_TCP, 0);
-
- if (skb->len <= 76) {
- return __skb_checksum_complete(skb);
- }
- return 0;
-}
-
-
/* The socket must have it's spinlock held when we get
* here.
*
@@ -1555,13 +1551,17 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- sock_rps_save_rxhash(sk, skb->rxhash);
- TCP_CHECK_TIMER(sk);
- if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
- rsk = sk;
- goto reset;
+ struct dst_entry *dst = sk->sk_rx_dst;
+
+ sock_rps_save_rxhash(sk, skb);
+ if (dst) {
+ if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+ dst->ops->check(dst, 0) == NULL) {
+ dst_release(dst);
+ sk->sk_rx_dst = NULL;
+ }
}
- TCP_CHECK_TIMER(sk);
+ tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
return 0;
}
@@ -1574,6 +1574,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
goto discard;
if (nsk != sk) {
+ sock_rps_save_rxhash(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
@@ -1581,15 +1582,12 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
} else
- sock_rps_save_rxhash(sk, skb->rxhash);
+ sock_rps_save_rxhash(sk, skb);
-
- TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
- TCP_CHECK_TIMER(sk);
return 0;
reset:
@@ -1604,11 +1602,94 @@ discard:
return 0;
csum_err:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
+void tcp_v4_early_demux(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct sock *sk;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ return;
+
+ sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ skb->skb_iif);
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ struct dst_entry *dst = sk->sk_rx_dst;
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst &&
+ inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ skb_dst_set_noref(skb, dst);
+ }
+ }
+}
+
+/* Packet is added to VJ-style prequeue for processing in process
+ * context, if a reader task is waiting. Apparently, this exciting
+ * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
+ * failed somewhere. Latency? Burstiness? Well, at least now we will
+ * see, why it failed. 8)8) --ANK
+ *
+ */
+bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sysctl_tcp_low_latency || !tp->ucopy.task)
+ return false;
+
+ if (skb->len <= tcp_hdrlen(skb) &&
+ skb_queue_len(&tp->ucopy.prequeue) == 0)
+ return false;
+
+ skb_dst_force(skb);
+ __skb_queue_tail(&tp->ucopy.prequeue, skb);
+ tp->ucopy.memory += skb->truesize;
+ if (tp->ucopy.memory > sk->sk_rcvbuf) {
+ struct sk_buff *skb1;
+
+ BUG_ON(sock_owned_by_user(sk));
+
+ while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
+ sk_backlog_rcv(sk, skb1);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPPREQUEUEDROPPED);
+ }
+
+ tp->ucopy.memory = 0;
+ } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
+ wake_up_interruptible_sync_poll(sk_sleep(sk),
+ POLLIN | POLLRDNORM | POLLRDBAND);
+ if (!inet_csk_ack_scheduled(sk))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ (3 * tcp_rto_min(sk)) / 4,
+ TCP_RTO_MAX);
+ }
+ return true;
+}
+EXPORT_SYMBOL(tcp_prequeue);
+
/*
* From tcp_input.c
*/
@@ -1616,7 +1697,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_rcv(struct sk_buff *skb)
{
const struct iphdr *iph;
- struct tcphdr *th;
+ const struct tcphdr *th;
struct sock *sk;
int ret;
struct net *net = dev_net(skb->dev);
@@ -1641,8 +1722,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
- if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
- goto bad_packet;
+
+ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
+ goto csum_error;
th = tcp_hdr(skb);
iph = ip_hdr(skb);
@@ -1651,7 +1733,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
TCP_SKB_CB(skb)->when = 0;
- TCP_SKB_CB(skb)->flags = iph->tos;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
@@ -1674,6 +1756,7 @@ process:
if (sk_filter(sk, skb))
goto discard_and_relse;
+ sk_mark_napi_id(sk, skb);
skb->dev = NULL;
bh_lock_sock_nested(sk);
@@ -1682,7 +1765,7 @@ process:
#ifdef CONFIG_NET_DMA
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+ tp->ucopy.dma_chan = net_dma_find_channel();
if (tp->ucopy.dma_chan)
ret = tcp_v4_do_rcv(sk, skb);
else
@@ -1691,7 +1774,8 @@ process:
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
}
- } else if (unlikely(sk_add_backlog(sk, skb))) {
+ } else if (unlikely(sk_add_backlog(sk, skb,
+ sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
@@ -1707,6 +1791,8 @@ no_tcp_socket:
goto discard_it;
if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+csum_error:
+ TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
bad_packet:
TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
} else {
@@ -1728,15 +1814,19 @@ do_time_wait:
goto discard_it;
}
- if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
- TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ if (skb->len < (th->doff << 2)) {
inet_twsk_put(inet_twsk(sk));
- goto discard_it;
+ goto bad_packet;
+ }
+ if (tcp_checksum_complete(skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
&tcp_hashinfo,
+ iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb));
if (sk2) {
@@ -1757,48 +1847,29 @@ do_time_wait:
goto discard_it;
}
-struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
-{
- struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_peer *peer;
-
- if (!rt || rt->rt_dst != inet->inet_daddr) {
- peer = inet_getpeer_v4(inet->inet_daddr, 1);
- *release_it = true;
- } else {
- if (!rt->peer)
- rt_bind_peer(rt, 1);
- peer = rt->peer;
- *release_it = false;
- }
-
- return peer;
-}
-EXPORT_SYMBOL(tcp_v4_get_peer);
-
-void *tcp_v4_tw_get_peer(struct sock *sk)
-{
- struct inet_timewait_sock *tw = inet_twsk(sk);
-
- return inet_getpeer_v4(tw->tw_daddr, 1);
-}
-EXPORT_SYMBOL(tcp_v4_tw_get_peer);
-
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
.twsk_unique = tcp_twsk_unique,
.twsk_destructor= tcp_twsk_destructor,
- .twsk_getpeer = tcp_v4_tw_get_peer,
};
+void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+
+ dst_hold(dst);
+ sk->sk_rx_dst = dst;
+ inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(inet_sk_rx_dst_set);
+
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
- .get_peer = tcp_v4_get_peer,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
@@ -1816,7 +1887,6 @@ EXPORT_SYMBOL(ipv4_specific);
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
.md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
- .md5_add = tcp_v4_md5_add_func,
.md5_parse = tcp_v4_parse_md5_keys,
};
#endif
@@ -1827,63 +1897,15 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- skb_queue_head_init(&tp->out_of_order_queue);
- tcp_init_xmit_timers(sk);
- tcp_prequeue_init(tp);
-
- icsk->icsk_rto = TCP_TIMEOUT_INIT;
- tp->mdev = TCP_TIMEOUT_INIT;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- tp->snd_cwnd = 2;
-
- /* See draft-stevens-tcpca-spec-01 for discussion of the
- * initialization of these values.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tp->snd_cwnd_clamp = ~0;
- tp->mss_cache = TCP_MSS_DEFAULT;
-
- tp->reordering = sysctl_tcp_reordering;
- icsk->icsk_ca_ops = &tcp_init_congestion_ops;
- sk->sk_state = TCP_CLOSE;
-
- sk->sk_write_space = sk_stream_write_space;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ tcp_init_sock(sk);
icsk->icsk_af_ops = &ipv4_specific;
- icsk->icsk_sync_mss = tcp_sync_mss;
+
#ifdef CONFIG_TCP_MD5SIG
- tp->af_specific = &tcp_sock_ipv4_specific;
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif
- /* TCP Cookie Transactions */
- if (sysctl_tcp_cookie_size > 0) {
- /* Default, cookies without s_data_payload. */
- tp->cookie_values =
- kzalloc(sizeof(*tp->cookie_values),
- sk->sk_allocation);
- if (tp->cookie_values != NULL)
- kref_init(&tp->cookie_values->kref);
- }
- /* Presumed zeroed, in order of appearance:
- * cookie_in_always, cookie_out_never,
- * s_data_constant, s_data_in, s_data_out
- */
- sk->sk_sndbuf = sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sysctl_tcp_rmem[1];
-
- local_bh_disable();
- percpu_counter_inc(&tcp_sockets_allocated);
- local_bh_enable();
-
return 0;
}
@@ -1904,8 +1926,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
#ifdef CONFIG_TCP_MD5SIG
/* Clean up the MD5 key list, if any */
if (tp->md5sig_info) {
- tcp_v4_clear_md5_list(sk);
- kfree(tp->md5sig_info);
+ tcp_clear_md5_list(sk);
+ kfree_rcu(tp->md5sig_info, rcu);
tp->md5sig_info = NULL;
}
#endif
@@ -1922,40 +1944,19 @@ void tcp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- /*
- * If sendmsg cached page exists, toss it.
- */
- if (sk->sk_sndmsg_page) {
- __free_page(sk->sk_sndmsg_page);
- sk->sk_sndmsg_page = NULL;
- }
+ BUG_ON(tp->fastopen_rsk != NULL);
- /* TCP Cookie Transactions */
- if (tp->cookie_values != NULL) {
- kref_put(&tp->cookie_values->kref,
- tcp_cookie_values_release);
- tp->cookie_values = NULL;
- }
+ /* If socket is aborted during connect operation */
+ tcp_free_fastopen_req(tp);
- percpu_counter_dec(&tcp_sockets_allocated);
+ sk_sockets_allocated_dec(sk);
+ sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
-{
- return hlist_nulls_empty(head) ? NULL :
- list_entry(head->first, struct inet_timewait_sock, tw_node);
-}
-
-static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
-{
- return !is_a_nulls(tw->tw_node.next) ?
- hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
-}
-
/*
* Get next listener socket follow cur. If cur is NULL, get first socket
* starting from bucket given in st->bucket; when st->bucket is zero the
@@ -1994,13 +1995,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
- st->offset = 0;
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
}
- sk = sk_next(st->syn_wait_sk);
+ sk = sk_nulls_next(st->syn_wait_sk);
st->state = TCP_SEQ_STATE_LISTENING;
read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
} else {
@@ -2009,11 +2009,13 @@ get_req:
if (reqsk_queue_len(&icsk->icsk_accept_queue))
goto start_req;
read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- sk = sk_next(sk);
+ sk = sk_nulls_next(sk);
}
get_sk:
sk_nulls_for_each_from(sk, node) {
- if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (sk->sk_family == st->family) {
cur = sk;
goto out;
}
@@ -2058,10 +2060,9 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
-static inline int empty_bucket(struct tcp_iter_state *st)
+static inline bool empty_bucket(const struct tcp_iter_state *st)
{
- return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
- hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+ return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}
/*
@@ -2078,7 +2079,6 @@ static void *established_get_first(struct seq_file *seq)
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
- struct inet_timewait_sock *tw;
spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
/* Lockless fast path for the common case of empty buckets */
@@ -2094,18 +2094,7 @@ static void *established_get_first(struct seq_file *seq)
rc = sk;
goto out;
}
- st->state = TCP_SEQ_STATE_TIME_WAIT;
- inet_twsk_for_each(tw, node,
- &tcp_hashinfo.ehash[st->bucket].twchain) {
- if (tw->tw_family != st->family ||
- !net_eq(twsk_net(tw), net)) {
- continue;
- }
- rc = tw;
- goto out;
- }
spin_unlock_bh(lock);
- st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
return rc;
@@ -2114,7 +2103,6 @@ out:
static void *established_get_next(struct seq_file *seq, void *cur)
{
struct sock *sk = cur;
- struct inet_timewait_sock *tw;
struct hlist_nulls_node *node;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
@@ -2122,45 +2110,16 @@ static void *established_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
- tw = cur;
- tw = tw_next(tw);
-get_tw:
- while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
- tw = tw_next(tw);
- }
- if (tw) {
- cur = tw;
- goto out;
- }
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
- st->state = TCP_SEQ_STATE_ESTABLISHED;
-
- /* Look for next non empty bucket */
- st->offset = 0;
- while (++st->bucket <= tcp_hashinfo.ehash_mask &&
- empty_bucket(st))
- ;
- if (st->bucket > tcp_hashinfo.ehash_mask)
- return NULL;
-
- spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
- sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
- } else
- sk = sk_nulls_next(sk);
+ sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
- goto found;
+ return sk;
}
- st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
- goto get_tw;
-found:
- cur = sk;
-out:
- return cur;
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ ++st->bucket;
+ return established_get_first(seq);
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
@@ -2213,10 +2172,9 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
if (rc)
break;
st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
/* Fallthrough */
case TCP_SEQ_STATE_ESTABLISHED:
- case TCP_SEQ_STATE_TIME_WAIT:
- st->state = TCP_SEQ_STATE_ESTABLISHED;
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
rc = established_get_first(seq);
@@ -2273,7 +2231,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
break;
case TCP_SEQ_STATE_ESTABLISHED:
- case TCP_SEQ_STATE_TIME_WAIT:
rc = established_get_next(seq, v);
break;
}
@@ -2297,7 +2254,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
- case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
@@ -2305,9 +2261,9 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
}
}
-static int tcp_seq_open(struct inode *inode, struct file *file)
+int tcp_seq_open(struct inode *inode, struct file *file)
{
- struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
+ struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
struct tcp_iter_state *s;
int err;
@@ -2321,23 +2277,19 @@ static int tcp_seq_open(struct inode *inode, struct file *file)
s->last_pos = 0;
return 0;
}
+EXPORT_SYMBOL(tcp_seq_open);
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
int rc = 0;
struct proc_dir_entry *p;
- afinfo->seq_fops.open = tcp_seq_open;
- afinfo->seq_fops.read = seq_read;
- afinfo->seq_fops.llseek = seq_lseek;
- afinfo->seq_fops.release = seq_release_net;
-
afinfo->seq_ops.start = tcp_seq_start;
afinfo->seq_ops.next = tcp_seq_next;
afinfo->seq_ops.stop = tcp_seq_stop;
p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
- &afinfo->seq_fops, afinfo);
+ afinfo->seq_fops, afinfo);
if (!p)
rc = -ENOMEM;
return rc;
@@ -2346,50 +2298,52 @@ EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
- proc_net_remove(net, afinfo->name);
+ remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
-static void get_openreq4(struct sock *sk, struct request_sock *req,
- struct seq_file *f, int i, int uid, int *len)
+static void get_openreq4(const struct sock *sk, const struct request_sock *req,
+ struct seq_file *f, int i, kuid_t uid)
{
const struct inet_request_sock *ireq = inet_rsk(req);
- int ttd = req->expires - jiffies;
+ long delta = req->expires - jiffies;
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
i,
- ireq->loc_addr,
+ ireq->ir_loc_addr,
ntohs(inet_sk(sk)->inet_sport),
- ireq->rmt_addr,
- ntohs(ireq->rmt_port),
+ ireq->ir_rmt_addr,
+ ntohs(ireq->ir_rmt_port),
TCP_SYN_RECV,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
- jiffies_to_clock_t(ttd),
- req->retrans,
- uid,
+ jiffies_delta_to_clock_t(delta),
+ req->num_timeout,
+ from_kuid_munged(seq_user_ns(f), uid),
0, /* non standard timer */
0, /* open_requests have no inode */
atomic_read(&sk->sk_refcnt),
- req,
- len);
+ req);
}
-static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
int timer_active;
unsigned long timer_expires;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+ struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
__be32 dest = inet->inet_daddr;
__be32 src = inet->inet_rcv_saddr;
__u16 destp = ntohs(inet->inet_dport);
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
@@ -2412,14 +2366,14 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
- "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
+ "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
i, src, srcp, dest, destp, sk->sk_state,
tp->write_seq - tp->snd_una,
rx_queue,
timer_active,
- jiffies_to_clock_t(timer_expires - jiffies),
+ jiffies_delta_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
- sock_i_uid(sk),
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
icsk->icsk_probes_out,
sock_i_ino(sk),
atomic_read(&sk->sk_refcnt), sk,
@@ -2427,19 +2381,17 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
- tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
- len);
+ sk->sk_state == TCP_LISTEN ?
+ (fastopenq ? fastopenq->max_qlen : 0) :
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
-static void get_timewait4_sock(struct inet_timewait_sock *tw,
- struct seq_file *f, int i, int *len)
+static void get_timewait4_sock(const struct inet_timewait_sock *tw,
+ struct seq_file *f, int i)
{
__be32 dest, src;
__u16 destp, srcp;
- int ttd = tw->tw_ttd - jiffies;
-
- if (ttd < 0)
- ttd = 0;
+ s32 delta = tw->tw_ttd - inet_tw_time_stamp();
dest = tw->tw_daddr;
src = tw->tw_rcv_saddr;
@@ -2447,10 +2399,10 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw,
srcp = ntohs(tw->tw_sport);
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
- 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
- atomic_read(&tw->tw_refcnt), tw, len);
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150
@@ -2458,11 +2410,11 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw,
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
struct tcp_iter_state *st;
- int len;
+ struct sock *sk = v;
+ seq_setwidth(seq, TMPSZ - 1);
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-*s\n", TMPSZ - 1,
- " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode");
goto out;
@@ -2472,26 +2424,32 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
case TCP_SEQ_STATE_ESTABLISHED:
- get_tcp4_sock(v, seq, st->num, &len);
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
break;
case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
- break;
- case TCP_SEQ_STATE_TIME_WAIT:
- get_timewait4_sock(v, seq, st->num, &len);
+ get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
break;
}
- seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
+ seq_pad(seq, '\n');
return 0;
}
+static const struct file_operations tcp_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = tcp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
.name = "tcp",
.family = AF_INET,
- .seq_fops = {
- .owner = THIS_MODULE,
- },
+ .seq_fops = &tcp_afinfo_seq_fops,
.seq_ops = {
.show = tcp4_seq_show,
},
@@ -2523,39 +2481,6 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
-{
- struct iphdr *iph = skb_gro_network_header(skb);
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
- skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
-
- /* fall through */
- case CHECKSUM_NONE:
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
-
- return tcp_gro_receive(head, skb);
-}
-
-int tcp4_gro_complete(struct sk_buff *skb)
-{
- struct iphdr *iph = ip_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
-
- th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
- iph->saddr, iph->daddr, 0);
- skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
-
- return tcp_gro_complete(skb);
-}
-
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
@@ -2573,10 +2498,13 @@ struct proto tcp_prot = {
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
+ .release_cb = tcp_release_cb,
+ .mtu_reduced = tcp_v4_mtu_reduced,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
@@ -2595,19 +2523,22 @@ struct proto tcp_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ .init_cgroup = tcp_init_cgroup,
+ .destroy_cgroup = tcp_destroy_cgroup,
+ .proto_cgroup = tcp_proto_cgroup,
+#endif
};
EXPORT_SYMBOL(tcp_prot);
-
static int __net_init tcp_sk_init(struct net *net)
{
- return inet_ctl_sock_create(&net->ipv4.tcp_sock,
- PF_INET, SOCK_RAW, IPPROTO_TCP, net);
+ net->ipv4.sysctl_tcp_ecn = 2;
+ return 0;
}
static void __net_exit tcp_sk_exit(struct net *net)
{
- inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index de870377fbb..1e70fa8fa79 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -12,7 +12,7 @@
* within cong_avoid.
* o Error correcting in remote HZ, therefore remote HZ will be keeped
* on checking and updating.
- * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne
+ * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
* OWD have a similar meaning as RTT. Also correct the buggy formular.
* o Handle reaction for Early Congestion Indication (ECI) within
* pkts_acked, as mentioned within pseudo code.
@@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk)
* Will only call newReno CA when away from inference.
* From TCP-LP's paper, this will be handled in additive increasement.
*/
-static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct lp *lp = inet_csk_ca(sk);
if (!(lp->flag & LP_WITHIN_INF))
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
}
/**
@@ -313,12 +313,10 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
lp->last_drop = tcp_time_stamp;
}
-static struct tcp_congestion_ops tcp_lp = {
- .flags = TCP_CONG_RTT_STAMP,
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
.init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_lp_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.pkts_acked = tcp_lp_pkts_acked,
.owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
new file mode 100644
index 00000000000..f7a2ec3ac58
--- /dev/null
+++ b/net/ipv4/tcp_memcontrol.c
@@ -0,0 +1,228 @@
+#include <net/tcp.h>
+#include <net/tcp_memcontrol.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <linux/nsproxy.h>
+#include <linux/memcontrol.h>
+#include <linux/module.h>
+
+int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+ /*
+ * The root cgroup does not use res_counters, but rather,
+ * rely on the data already collected by the network
+ * subsystem
+ */
+ struct res_counter *res_parent = NULL;
+ struct cg_proto *cg_proto, *parent_cg;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return 0;
+
+ cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
+ cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
+ cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
+ cg_proto->memory_pressure = 0;
+ cg_proto->memcg = memcg;
+
+ parent_cg = tcp_prot.proto_cgroup(parent);
+ if (parent_cg)
+ res_parent = &parent_cg->memory_allocated;
+
+ res_counter_init(&cg_proto->memory_allocated, res_parent);
+ percpu_counter_init(&cg_proto->sockets_allocated, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+void tcp_destroy_cgroup(struct mem_cgroup *memcg)
+{
+ struct cg_proto *cg_proto;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return;
+
+ percpu_counter_destroy(&cg_proto->sockets_allocated);
+}
+EXPORT_SYMBOL(tcp_destroy_cgroup);
+
+static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
+{
+ struct cg_proto *cg_proto;
+ int i;
+ int ret;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return -EINVAL;
+
+ if (val > RES_COUNTER_MAX)
+ val = RES_COUNTER_MAX;
+
+ ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < 3; i++)
+ cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT,
+ sysctl_tcp_mem[i]);
+
+ if (val == RES_COUNTER_MAX)
+ clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ else if (val != RES_COUNTER_MAX) {
+ /*
+ * The active bit needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ *
+ * The activated bit is used to guarantee that no two writers
+ * will do the update in the same memcg. Without that, we can't
+ * properly shutdown the static key.
+ */
+ if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_inc(&memcg_socket_limit_enabled);
+ set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ }
+
+ return 0;
+}
+
+static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long long val;
+ int ret = 0;
+
+ buf = strstrip(buf);
+
+ switch (of_cft(of)->private) {
+ case RES_LIMIT:
+ /* see memcontrol.c */
+ ret = res_counter_memparse_write_strategy(buf, &val);
+ if (ret)
+ break;
+ ret = tcp_update_limit(memcg, val);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret ?: nbytes;
+}
+
+static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
+{
+ struct cg_proto *cg_proto;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return default_val;
+
+ return res_counter_read_u64(&cg_proto->memory_allocated, type);
+}
+
+static u64 tcp_read_usage(struct mem_cgroup *memcg)
+{
+ struct cg_proto *cg_proto;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
+
+ return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
+}
+
+static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ u64 val;
+
+ switch (cft->private) {
+ case RES_LIMIT:
+ val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX);
+ break;
+ case RES_USAGE:
+ val = tcp_read_usage(memcg);
+ break;
+ case RES_FAILCNT:
+ case RES_MAX_USAGE:
+ val = tcp_read_stat(memcg, cft->private, 0);
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg;
+ struct cg_proto *cg_proto;
+
+ memcg = mem_cgroup_from_css(of_css(of));
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return nbytes;
+
+ switch (of_cft(of)->private) {
+ case RES_MAX_USAGE:
+ res_counter_reset_max(&cg_proto->memory_allocated);
+ break;
+ case RES_FAILCNT:
+ res_counter_reset_failcnt(&cg_proto->memory_allocated);
+ break;
+ }
+
+ return nbytes;
+}
+
+static struct cftype tcp_files[] = {
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .write = tcp_cgroup_write,
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_LIMIT,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_USAGE,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = RES_FAILCNT,
+ .write = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .write = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ { } /* terminate */
+};
+
+static int __init tcp_memcontrol_init(void)
+{
+ WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
+ return 0;
+}
+__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 00000000000..4fe04180598
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,1188 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+#include <linux/hash.h>
+#include <linux/tcp_metrics.h>
+#include <linux/vmalloc.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/genetlink.h>
+
+/* When non-zero, tcp_update_metrics() returns without saving anything. */
+int sysctl_tcp_nometrics_save __read_mostly;
+
+/* Forward declaration: tcpm_new() repeats the lookup under the lock. */
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash);
+
+/* TCP Fast Open state embedded in each tcp_metrics_block. Readers and
+ * writers in this file synchronize on fastopen_seqlock.
+ */
+struct tcp_fastopen_metrics {
+ u16 mss; /* MSS stored by tcp_fastopen_cache_set(), 0 if none */
+ u16 syn_loss:10; /* Recurring Fast Open SYN losses */
+ unsigned long last_syn_loss; /* Last Fast Open SYN loss */
+ struct tcp_fastopen_cookie cookie; /* cookie.len == 0 means no cookie */
+};
+
+/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
+ * Kernel only stores RTT and RTTVAR in usec resolution
+ */
+#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
+
+/* One cached metrics entry, keyed by (source, destination) address and
+ * linked into a per-netns hash chain that is walked under RCU.
+ */
+struct tcp_metrics_block {
+ struct tcp_metrics_block __rcu *tcpm_next; /* next entry in the chain */
+ struct inetpeer_addr tcpm_saddr; /* key: local address */
+ struct inetpeer_addr tcpm_daddr; /* key: remote address */
+ unsigned long tcpm_stamp; /* jiffies of the last refresh/update */
+ u32 tcpm_ts; /* saved timestamp value (ts_recent) */
+ u32 tcpm_ts_stamp; /* saved ts_recent_stamp, 0 if none */
+ u32 tcpm_lock; /* bitmask of locked metrics, bit = metric index */
+ u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; /* indexed by enum tcp_metric_index */
+ struct tcp_fastopen_metrics tcpm_fastopen;
+
+ struct rcu_head rcu_head; /* used by kfree_rcu() on deletion */
+};
+
+/* Tell whether metric @idx of @tm is locked, i.e. whether its bit is set
+ * in the tcpm_lock mask.
+ */
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+			      enum tcp_metric_index idx)
+{
+	u32 bit = 1 << idx;
+
+	return (tm->tcpm_lock & bit) != 0;
+}
+
+/* Return the cached value of metric @idx from @tm. */
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+			  enum tcp_metric_index idx)
+{
+	const u32 *vals = tm->tcpm_vals;
+
+	return vals[idx];
+}
+
+/* Store @val as the cached value of metric @idx in @tm. The lock mask is
+ * not consulted here; callers check tcp_metric_locked() themselves.
+ */
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+			   enum tcp_metric_index idx,
+			   u32 val)
+{
+	u32 *slot = &tm->tcpm_vals[idx];
+
+	*slot = val;
+}
+
+/* Compare two inetpeer addresses. They are equal only when the families
+ * match and the address bytes match; every family other than AF_INET is
+ * compared as an IPv6 address.
+ */
+static bool addr_same(const struct inetpeer_addr *a,
+		      const struct inetpeer_addr *b)
+{
+	if (a->family != b->family)
+		return false;
+
+	if (a->family != AF_INET)
+		return ipv6_addr_equal((const struct in6_addr *) &a->addr.a6[0],
+				       (const struct in6_addr *) &b->addr.a6[0]);
+
+	return a->addr.a4 == b->addr.a4;
+}
+
+/* One slot of the per-netns metrics hash table: head of an RCU chain. */
+struct tcpm_hash_bucket {
+ struct tcp_metrics_block __rcu *chain;
+};
+
+/* Taken (with BHs disabled) by the paths that add to or unlink from a
+ * chain: tcpm_new(), tcp_metrics_flush_all() and tcp_metrics_nl_cmd_del().
+ */
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+/* (Re)load @tm from the route metrics of @dst.
+ *
+ * Stamps the entry with the current jiffies, copies the per-metric lock
+ * bits and raw values from @dst, and clears the saved timestamp pair.
+ * The Fast Open fields are wiped only when @fastopen_clear is true.
+ */
+static void tcpm_suck_dst(struct tcp_metrics_block *tm,
+			  const struct dst_entry *dst,
+			  bool fastopen_clear)
+{
+	u32 locked = 0;
+
+	tm->tcpm_stamp = jiffies;
+
+	/* Mirror the dst's lock bits into the cache's own bit layout. */
+	if (dst_metric_locked(dst, RTAX_RTT))
+		locked |= 1 << TCP_METRIC_RTT;
+	if (dst_metric_locked(dst, RTAX_RTTVAR))
+		locked |= 1 << TCP_METRIC_RTTVAR;
+	if (dst_metric_locked(dst, RTAX_SSTHRESH))
+		locked |= 1 << TCP_METRIC_SSTHRESH;
+	if (dst_metric_locked(dst, RTAX_CWND))
+		locked |= 1 << TCP_METRIC_CWND;
+	if (dst_metric_locked(dst, RTAX_REORDERING))
+		locked |= 1 << TCP_METRIC_REORDERING;
+	tm->tcpm_lock = locked;
+
+	/* RTT and RTTVAR are scaled by USEC_PER_MSEC on the way in. */
+	tm->tcpm_vals[TCP_METRIC_RTT] =
+		dst_metric_raw(dst, RTAX_RTT) * USEC_PER_MSEC;
+	tm->tcpm_vals[TCP_METRIC_RTTVAR] =
+		dst_metric_raw(dst, RTAX_RTTVAR) * USEC_PER_MSEC;
+	tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+	tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+
+	tm->tcpm_ts = 0;
+	tm->tcpm_ts_stamp = 0;
+
+	if (!fastopen_clear)
+		return;
+
+	tm->tcpm_fastopen.mss = 0;
+	tm->tcpm_fastopen.syn_loss = 0;
+	tm->tcpm_fastopen.cookie.len = 0;
+}
+
+#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
+
+/* Refresh @tm from @dst once it has not been stamped for longer than
+ * TCP_METRICS_TIMEOUT. Fast Open state is preserved. A NULL @tm is a no-op.
+ */
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+	if (!tm)
+		return;
+
+	if (unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+		tcpm_suck_dst(tm, dst, false);
+}
+
+/* A failed lookup that walked more than TCP_METRICS_RECLAIM_DEPTH entries
+ * is reported by __tcp_get_metrics() as the sentinel TCP_METRICS_RECLAIM_PTR
+ * rather than NULL. The sentinel is never a valid pointer to dereference.
+ */
+#define TCP_METRICS_RECLAIM_DEPTH 5
+#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+
+/* Create, or recycle, the metrics entry for (@saddr, @daddr) in bucket
+ * @hash of the netns that owns dst->dev. Runs under tcp_metrics_lock.
+ *
+ * If the lookup repeated under the lock finds the entry, it is returned
+ * (after a staleness refresh). If the lookup reports an over-long chain,
+ * the entry with the oldest tcpm_stamp in that chain is overwritten in
+ * place instead of allocating; otherwise a new entry is allocated with
+ * GFP_ATOMIC and published at the head of the chain.
+ *
+ * Returns the entry, or NULL if the allocation fails.
+ */
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+ struct inetpeer_addr *saddr,
+ struct inetpeer_addr *daddr,
+ unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ struct net *net;
+ bool reclaim = false;
+
+ spin_lock_bh(&tcp_metrics_lock);
+ net = dev_net(dst->dev);
+
+ /* While waiting for the spin-lock the cache might have been populated
+ * with this entry and so we have to check again.
+ */
+ tm = __tcp_get_metrics(saddr, daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR) {
+ reclaim = true;
+ tm = NULL;
+ }
+ if (tm) {
+ tcpm_check_stamp(tm, dst);
+ goto out_unlock;
+ }
+
+ if (unlikely(reclaim)) {
+ struct tcp_metrics_block *oldest;
+
+ /* Chain is non-empty here: the lookup walked past the depth limit. */
+ oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
+ for (tm = rcu_dereference(oldest->tcpm_next); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ oldest = tm;
+ }
+ tm = oldest;
+ } else {
+ tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ if (!tm)
+ goto out_unlock;
+ }
+ tm->tcpm_saddr = *saddr;
+ tm->tcpm_daddr = *daddr;
+
+ tcpm_suck_dst(tm, dst, true);
+
+ /* A recycled entry is already linked; only new entries are published. */
+ if (likely(!reclaim)) {
+ tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+ }
+
+out_unlock:
+ spin_unlock_bh(&tcp_metrics_lock);
+ return tm;
+}
+
+/* Encode a chain lookup result: a found entry is returned unchanged; a
+ * miss becomes TCP_METRICS_RECLAIM_PTR when more than
+ * TCP_METRICS_RECLAIM_DEPTH entries were walked, and NULL otherwise.
+ */
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+	if (!tm && depth > TCP_METRICS_RECLAIM_DEPTH)
+		return TCP_METRICS_RECLAIM_PTR;
+
+	return tm;
+}
+
+/* Walk bucket @hash of @net looking for the entry keyed by (@saddr, @daddr).
+ *
+ * Returns the entry when found. On a miss the result is encoded by
+ * tcp_get_encode() from the number of entries walked, so it is either
+ * NULL or TCP_METRICS_RECLAIM_PTR; callers must test for the sentinel
+ * before dereferencing.
+ */
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+						   const struct inetpeer_addr *daddr,
+						   struct net *net, unsigned int hash)
+{
+	struct tcp_metrics_block *cur;
+	int walked = 0;
+
+	cur = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
+	while (cur) {
+		if (addr_same(&cur->tcpm_saddr, saddr) &&
+		    addr_same(&cur->tcpm_daddr, daddr))
+			break;
+		walked++;
+		cur = rcu_dereference(cur->tcpm_next);
+	}
+
+	return tcp_get_encode(cur, walked);
+}
+
+/* Look up the metrics entry matching the local/remote addresses of a
+ * request sock, in the netns that owns dst->dev. Never creates an entry.
+ *
+ * A found entry is refreshed from @dst if it is stale. Returns NULL when
+ * nothing matches or the request's address family is not handled.
+ * The chain is walked with rcu_dereference(); the caller in this file
+ * holds rcu_read_lock().
+ */
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ saddr.family = req->rsk_ops->family;
+ daddr.family = req->rsk_ops->family;
+ switch (daddr.family) {
+ case AF_INET:
+ saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
+ daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr;
+ *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
+ hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
+ break;
+#endif
+ default:
+ return NULL;
+ }
+
+ net = dev_net(dst->dev);
+ /* The bucket is chosen from the destination address only. */
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr))
+ break;
+ }
+ tcpm_check_stamp(tm, dst);
+ return tm;
+}
+
+/* Look up the metrics entry for a timewait socket in its own netns.
+ * Never creates an entry and does not refresh a stale one.
+ *
+ * An AF_INET6 socket whose destination is a v4-mapped address is looked
+ * up under its IPv4 addresses. Returns NULL when nothing matches or the
+ * family is not handled.
+ */
+static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ if (tw->tw_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (tw->tw_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr;
+ hash = ipv6_addr_hash(&tw->tw_v6_daddr);
+ }
+ }
+#endif
+ else
+ return NULL;
+
+ net = twsk_net(tw);
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr))
+ break;
+ }
+ return tm;
+}
+
+/* Find the metrics entry for a full socket, in the netns owning dst->dev.
+ *
+ * @dst is dereferenced unconditionally for AF_INET/AF_INET6 sockets, so
+ * it must not be NULL. With @create set, a missing entry is created via
+ * tcpm_new(); otherwise a found entry is refreshed if stale. An AF_INET6
+ * socket with a v4-mapped destination is keyed by its IPv4 addresses.
+ *
+ * Returns the entry, or NULL (unsupported family, miss without @create,
+ * or allocation failure).
+ */
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+ struct dst_entry *dst,
+ bool create)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ if (sk->sk_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr;
+ hash = ipv6_addr_hash(&sk->sk_v6_daddr);
+ }
+ }
+#endif
+ else
+ return NULL;
+
+ net = dev_net(dst->dev);
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+
+ tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
+ /* The reclaim sentinel only matters to tcpm_new(); treat it as a miss. */
+ if (tm == TCP_METRICS_RECLAIM_PTR)
+ tm = NULL;
+ if (!tm && create)
+ tm = tcpm_new(dst, &saddr, &daddr, hash);
+ else
+ tcpm_check_stamp(tm, dst);
+
+ return tm;
+}
+
+/* Save metrics learned by this TCP session. This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ *
+ * Does nothing when sysctl_tcp_nometrics_save is set or the socket has
+ * no cached dst. Metrics whose lock bit is set in the entry are never
+ * overwritten. The entry's stamp is renewed on a successful update.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ unsigned long rtt;
+ u32 val;
+ int m;
+
+ if (sysctl_tcp_nometrics_save || !dst)
+ return;
+
+ if (dst->flags & DST_HOST)
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ if (icsk->icsk_backoff || !tp->srtt_us) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time. Reset our
+ * results.
+ */
+ tm = tcp_get_metrics(sk, dst, false);
+ if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+ tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+ goto out_unlock;
+ } else
+ tm = tcp_get_metrics(sk, dst, true);
+
+ if (!tm)
+ goto out_unlock;
+
+ /* m > 0 means the cached RTT is larger than this session's srtt. */
+ rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ m = rtt - tp->srtt_us;
+
+ /* If newly calculated rtt larger than stored one, store new
+ * one. Otherwise, use EWMA. Remember, rtt overestimation is
+ * always better than underestimation.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+ if (m <= 0)
+ rtt = tp->srtt_us;
+ else
+ rtt -= (m >> 3);
+ tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
+ }
+
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+ unsigned long var;
+
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev_us)
+ m = tp->mdev_us;
+
+ /* Raise the cached variance at once, lower it gradually. */
+ var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
+ if (m >= var)
+ var = m;
+ else
+ var -= (var - m) >> 2;
+
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
+ }
+
+ if (tcp_in_initial_slowstart(tp)) {
+ /* Slow start still did not finish. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && (tp->snd_cwnd >> 1) > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_cwnd >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ if (tp->snd_cwnd > val)
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ tp->snd_cwnd);
+ }
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ icsk->icsk_ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+ }
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ * ssthresh may be also invalid.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ (val + tp->snd_ssthresh) >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && tp->snd_ssthresh > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_ssthresh);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ tp->reordering);
+ }
+ }
+ tm->tcpm_stamp = jiffies;
+out_unlock:
+ rcu_read_unlock();
+}
+
+/* Initialize metrics on socket.
+ *
+ * Seeds snd_cwnd_clamp, snd_ssthresh and reordering from the cached
+ * per-destination entry (created on demand), then derives the initial
+ * RTO and congestion window. Without a dst or a cache entry only the
+ * RTO/cwnd defaults below are applied.
+ */
+void tcp_init_metrics(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_metrics_block *tm;
+	u32 val, crtt = 0; /* cached RTT scaled by 8 */
+
+	if (dst == NULL)
+		goto reset;
+
+	dst_confirm(dst);
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, true);
+	if (!tm) {
+		rcu_read_unlock();
+		goto reset;
+	}
+
+	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+	if (val) {
+		tp->snd_ssthresh = val;
+		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	} else {
+		/* ssthresh may have been reduced unnecessarily during
+		 * 3WHS. Restore it back to its initial default.
+		 */
+		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	}
+	val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+	if (val && tp->reordering != val) {
+		tcp_disable_fack(tp);
+		tcp_disable_early_retrans(tp);
+		tp->reordering = val;
+	}
+
+	crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+	rcu_read_unlock();
+reset:
+	/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
+	 * to seed the RTO for later data packets because SYN packets are
+	 * small. Use the per-dst cached values to seed the RTO but keep
+	 * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
+	 * Later the RTO will be updated immediately upon obtaining the first
+	 * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
+	 * influences the first RTO but not later RTT estimation.
+	 *
+	 * But if RTT is not available from the SYN (due to retransmits or
+	 * syn cookies) or the cache, force a conservative 3secs timeout.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal circumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+	 */
+	if (crtt > tp->srtt_us) {
+		/* Set RTO like tcp_rtt_estimator(), but from cached RTT.
+		 * crtt is in usec scaled by 8 while icsk_rto is in jiffies,
+		 * so convert with HZ rather than assuming 1 jiffy == 1 msec.
+		 */
+		crtt /= 8 * USEC_PER_SEC / HZ;
+		inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
+	} else if (tp->srtt_us == 0) {
+		/* RFC6298: 5.7 We've failed to get a valid RTT sample from
+		 * 3WHS. This is most likely due to retransmission,
+		 * including spurious one. Reset the RTO back to 3secs
+		 * from the more aggressive 1sec to avoid more spurious
+		 * retransmission.
+		 */
+		tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
+		tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
+
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+	}
+	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+	 * retransmitted. In light of RFC6298 more aggressive 1sec
+	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+	 * retransmission has occurred.
+	 */
+	if (tp->total_retrans > 1)
+		tp->snd_cwnd = 1;
+	else
+		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Decide whether the peer of @req can be trusted, based on cached metrics.
+ *
+ * With @paws_check: returns false only when a cached entry exists whose
+ * saved timestamp is younger than TCP_PAWS_MSL and ahead of the request's
+ * ts_recent by more than TCP_PAWS_WINDOW; otherwise true.
+ * Without @paws_check: returns true only when a cached entry exists with
+ * both a non-zero RTT and a saved timestamp stamp.
+ * A NULL @dst always yields false.
+ */
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+{
+	struct tcp_metrics_block *tm;
+	bool ret;
+
+	if (!dst)
+		return false;
+
+	rcu_read_lock();
+	tm = __tcp_get_metrics_req(req, dst);
+	if (!tm) {
+		/* No history: passes the PAWS check, but proves nothing. */
+		ret = paws_check;
+	} else if (paws_check) {
+		ret = !((u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
+			(s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW);
+	} else {
+		ret = tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
+/* Load the cached timestamp pair into the socket's rx_opt, provided the
+ * cached stamp is not older than TCP_PAWS_MSL seconds. The metrics entry
+ * is created if missing. @dst is passed straight to tcp_get_metrics().
+ */
+void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
+ tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
+ tp->rx_opt.ts_recent = tm->tcpm_ts;
+ }
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
+
+/* VJ's idea. Save last timestamp seen from this destination and hold
+ * it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter
+ * synchronized state.
+ *
+ * The cached pair is replaced when it is not newer than the socket's
+ * ts_recent, or when it is older than TCP_PAWS_MSL and not stamped after
+ * the socket's ts_recent_stamp. Returns true if a metrics entry was
+ * available (found or created), false otherwise.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ bool ret = false;
+
+ if (dst) {
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+ tm->tcpm_ts = tp->rx_opt.ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+ }
+ return ret;
+}
+
+/* Timewait-socket counterpart of tcp_remember_stamp(): applies the same
+ * replacement rule using tw_ts_recent / tw_ts_recent_stamp. The entry is
+ * only looked up, never created. Returns true if an entry was found.
+ */
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ bool ret = false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_tw(tw);
+ if (tm) {
+ const struct tcp_timewait_sock *tcptw;
+ struct sock *sk = (struct sock *) tw;
+
+ tcptw = tcp_twsk(sk);
+ if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+ tm->tcpm_ts = tcptw->tw_ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* Guards the tcpm_fastopen member of every tcp_metrics_block. */
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+/* Copy the cached Fast Open state for @sk's destination to the caller.
+ *
+ * @mss is written only when a non-zero MSS is cached. @cookie and
+ * @syn_loss are always written when an entry exists, and @last_syn_loss
+ * is reported as 0 unless losses were recorded. When the socket has no
+ * cached dst or no metrics entry, none of the outputs are touched.
+ */
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+			    struct tcp_fastopen_cookie *cookie,
+			    int *syn_loss, unsigned long *last_syn_loss)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_metrics_block *tm;
+
+	/* tcp_get_metrics() dereferences dst->dev, so bail out early like
+	 * tcp_fastopen_cache_set() does.
+	 */
+	if (!dst)
+		return;
+
+	rcu_read_lock();
+	tm = tcp_get_metrics(sk, dst, false);
+	if (tm) {
+		struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+		unsigned int seq;
+
+		/* Retry until a snapshot is read without a concurrent write. */
+		do {
+			seq = read_seqbegin(&fastopen_seqlock);
+			if (tfom->mss)
+				*mss = tfom->mss;
+			*cookie = tfom->cookie;
+			*syn_loss = tfom->syn_loss;
+			*last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
+		} while (read_seqretry(&fastopen_seqlock, seq));
+	}
+	rcu_read_unlock();
+}
+
+/* Update the cached Fast Open state for @sk's destination, creating the
+ * metrics entry if needed. A zero @mss and a NULL or empty @cookie leave
+ * the corresponding cached field unchanged. @syn_lost increments the loss
+ * counter and records the time; otherwise the counter is reset to 0.
+ * Does nothing when the socket has no cached dst.
+ */
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie, bool syn_lost)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_metrics_block *tm;
+
+ if (!dst)
+ return;
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+
+ write_seqlock_bh(&fastopen_seqlock);
+ if (mss)
+ tfom->mss = mss;
+ if (cookie && cookie->len > 0)
+ tfom->cookie = *cookie;
+ if (syn_lost) {
+ ++tfom->syn_loss;
+ tfom->last_syn_loss = jiffies;
+ } else
+ tfom->syn_loss = 0;
+ write_sequnlock_bh(&fastopen_seqlock);
+ }
+ rcu_read_unlock();
+}
+
+/* Generic netlink family exposing the metrics cache; registered together
+ * with tcp_metrics_nl_ops in tcp_metrics_init().
+ */
+static struct genl_family tcp_metrics_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = TCP_METRICS_GENL_NAME,
+ .version = TCP_METRICS_GENL_VERSION,
+ .maxattr = TCP_METRICS_ATTR_MAX,
+ .netnsok = true,
+};
+
+/* Attribute policy for GET/DEL requests. Only the destination address
+ * attributes have an active entry here.
+ */
+static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
+ [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr), },
+ /* Following attributes are not received for GET/DEL,
+ * we keep them for reference
+ */
+#if 0
+ [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
+ [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
+ [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
+ .len = TCP_FASTOPEN_COOKIE_MAX, },
+#endif
+};
+
+/* Add attributes, caller cancels its header on failure
+ *
+ * Emits, in order: destination and source address, entry age, the saved
+ * timestamp pair (only if a stamp is set), a nest of the non-zero metric
+ * values, and a snapshot of the Fast Open data.
+ *
+ * Returns 0 on success, -EMSGSIZE when @msg runs out of room, or
+ * -EAFNOSUPPORT for an entry whose family is neither AF_INET nor AF_INET6.
+ */
+static int tcp_metrics_fill_info(struct sk_buff *msg,
+ struct tcp_metrics_block *tm)
+{
+ struct nlattr *nest;
+ int i;
+
+ switch (tm->tcpm_daddr.family) {
+ case AF_INET:
+ if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+ tm->tcpm_daddr.addr.a4) < 0)
+ goto nla_put_failure;
+ if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+ tm->tcpm_saddr.addr.a4) < 0)
+ goto nla_put_failure;
+ break;
+ case AF_INET6:
+ if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
+ tm->tcpm_daddr.addr.a6) < 0)
+ goto nla_put_failure;
+ if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16,
+ tm->tcpm_saddr.addr.a6) < 0)
+ goto nla_put_failure;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
+ jiffies - tm->tcpm_stamp) < 0)
+ goto nla_put_failure;
+ if (tm->tcpm_ts_stamp) {
+ if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
+ (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
+ goto nla_put_failure;
+ if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
+ tm->tcpm_ts) < 0)
+ goto nla_put_failure;
+ }
+
+ {
+ int n = 0;
+
+ nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
+ if (!nest)
+ goto nla_put_failure;
+ for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
+ u32 val = tm->tcpm_vals[i];
+
+ if (!val)
+ continue;
+ /* RTT and RTTVAR go out twice: the stored value under the
+ * *_US index, then divided by 1000 (minimum 1) under the
+ * original index.
+ */
+ if (i == TCP_METRIC_RTT) {
+ if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (i == TCP_METRIC_RTTVAR) {
+ if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ /* Nested attribute types are the metric index plus one. */
+ if (nla_put_u32(msg, i + 1, val) < 0)
+ goto nla_put_failure;
+ n++;
+ }
+ /* Drop the nest entirely when no metric was emitted. */
+ if (n)
+ nla_nest_end(msg, nest);
+ else
+ nla_nest_cancel(msg, nest);
+ }
+
+ {
+ struct tcp_fastopen_metrics tfom_copy[1], *tfom;
+ unsigned int seq;
+
+ /* Take a consistent copy so the puts below run outside the seqlock loop. */
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ tfom_copy[0] = tm->tcpm_fastopen;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+
+ tfom = tfom_copy;
+ if (tfom->mss &&
+ nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
+ tfom->mss) < 0)
+ goto nla_put_failure;
+ if (tfom->syn_loss &&
+ (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
+ tfom->syn_loss) < 0 ||
+ nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
+ jiffies - tfom->last_syn_loss) < 0))
+ goto nla_put_failure;
+ if (tfom->cookie.len > 0 &&
+ nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
+ tfom->cookie.len, tfom->cookie.val) < 0)
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* Append one NLM_F_MULTI TCP_METRICS_CMD_GET message describing @tm to the
+ * dump skb. On any failure the partially built message is cancelled and
+ * -EMSGSIZE is returned; otherwise the result of genlmsg_end() is returned.
+ */
+static int tcp_metrics_dump_info(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct tcp_metrics_block *tm)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &tcp_metrics_nl_family, NLM_F_MULTI,
+ TCP_METRICS_CMD_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (tcp_metrics_fill_info(skb, tm) < 0)
+ goto nla_put_failure;
+
+ return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+/* Netlink dump callback: walk the hash table of the requesting socket's
+ * netns and emit one message per entry. cb->args[0] and cb->args[1] hold
+ * the row and in-row position to resume from on the next invocation;
+ * they are updated when a message does not fit. Returns skb->len.
+ */
+static int tcp_metrics_nl_dump(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
+ unsigned int row, s_row = cb->args[0];
+ int s_col = cb->args[1], col = s_col;
+
+ /* s_col only applies to the first row visited; later rows start at 0. */
+ for (row = s_row; row < max_rows; row++, s_col = 0) {
+ struct tcp_metrics_block *tm;
+ struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
+
+ rcu_read_lock();
+ for (col = 0, tm = rcu_dereference(hb->chain); tm;
+ tm = rcu_dereference(tm->tcpm_next), col++) {
+ if (col < s_col)
+ continue;
+ if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = row;
+ cb->args[1] = col;
+ return skb->len;
+}
+
+/* Extract an address from request attribute @v4 or, failing that, @v6.
+ *
+ * Fills @addr and, when @hash is non-NULL, the unfolded hash seed for it.
+ * Returns 0 on success, -EINVAL if the IPv6 attribute has the wrong
+ * length, and when neither attribute is present: 1 if @optional is set,
+ * -EAFNOSUPPORT otherwise.
+ */
+static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional, int v4, int v6)
+{
+ struct nlattr *a;
+
+ a = info->attrs[v4];
+ if (a) {
+ addr->family = AF_INET;
+ addr->addr.a4 = nla_get_be32(a);
+ if (hash)
+ *hash = (__force unsigned int) addr->addr.a4;
+ return 0;
+ }
+ a = info->attrs[v6];
+ if (a) {
+ if (nla_len(a) != sizeof(struct in6_addr))
+ return -EINVAL;
+ addr->family = AF_INET6;
+ memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
+ if (hash)
+ *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
+ return 0;
+ }
+ return optional ? 1 : -EAFNOSUPPORT;
+}
+
+/* Parse the destination address attributes of a request; see
+ * __parse_nl_addr() for the return values.
+ */
+static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional)
+{
+ return __parse_nl_addr(info, addr, hash, optional,
+ TCP_METRICS_ATTR_ADDR_IPV4,
+ TCP_METRICS_ATTR_ADDR_IPV6);
+}
+
+/* Parse the source address attributes of a request. No hash is produced,
+ * and a missing address is reported as -EAFNOSUPPORT (not optional).
+ */
+static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
+{
+ return __parse_nl_addr(info, addr, NULL, 0,
+ TCP_METRICS_ATTR_SADDR_IPV4,
+ TCP_METRICS_ATTR_SADDR_IPV6);
+}
+
+/* doit handler for TCP_METRICS_CMD_GET: reply with the first entry in the
+ * destination's bucket that matches the destination address and, if one
+ * was supplied, the source address.
+ *
+ * Returns the genlmsg_reply() result, the destination parse error,
+ * -ENOMEM, -EMSGSIZE, or -ESRCH when no entry matches.
+ */
+static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct sk_buff *msg;
+ struct net *net = genl_info_net(info);
+ void *reply;
+ int ret;
+ bool src = true;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 0);
+ if (ret < 0)
+ return ret;
+
+ /* Any source parse failure, -EINVAL included, means "match any source". */
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply)
+ goto nla_put_failure;
+
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ ret = -ESRCH;
+ rcu_read_lock();
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+ ret = tcp_metrics_fill_info(msg, tm);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ret < 0)
+ goto out_free;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ ret = -EMSGSIZE;
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+/* Chain pointer access for updaters: the lockdep condition requires both
+ * the genl lock and tcp_metrics_lock to be held.
+ */
+#define deref_locked_genl(p) \
+ rcu_dereference_protected(p, lockdep_genl_is_held() && \
+ lockdep_is_held(&tcp_metrics_lock))
+
+/* Same, but the lockdep condition only requires the genl lock. */
+#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
+
+/* Remove every entry from @net's metrics table. Each bucket's chain is
+ * detached under tcp_metrics_lock, then its entries are handed to
+ * kfree_rcu() outside the lock. Always returns 0.
+ */
+static int tcp_metrics_flush_all(struct net *net)
+{
+ unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
+ struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
+ struct tcp_metrics_block *tm;
+ unsigned int row;
+
+ for (row = 0; row < max_rows; row++, hb++) {
+ spin_lock_bh(&tcp_metrics_lock);
+ tm = deref_locked_genl(hb->chain);
+ if (tm)
+ hb->chain = NULL;
+ spin_unlock_bh(&tcp_metrics_lock);
+ while (tm) {
+ struct tcp_metrics_block *next;
+
+ /* Read the link before the entry is queued for freeing. */
+ next = deref_genl(tm->tcpm_next);
+ kfree_rcu(tm, rcu_head);
+ tm = next;
+ }
+ }
+ return 0;
+}
+
+/* doit handler for TCP_METRICS_CMD_DEL.
+ *
+ * Without a destination address the whole table is flushed. Otherwise
+ * every entry in the destination's bucket matching the destination (and
+ * the source, if one was supplied) is unlinked under tcp_metrics_lock
+ * and freed via kfree_rcu().
+ *
+ * Returns 0 on success, the destination parse error, or -ESRCH when no
+ * entry matched.
+ */
+static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcpm_hash_bucket *hb;
+ struct tcp_metrics_block *tm;
+ struct tcp_metrics_block __rcu **pp;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net = genl_info_net(info);
+ int ret;
+ bool src = true, found = false;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 1);
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return tcp_metrics_flush_all(net);
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hb = net->ipv4.tcp_metrics_hash + hash;
+ pp = &hb->chain;
+ spin_lock_bh(&tcp_metrics_lock);
+ /* pp points at the link to rewrite; it advances only past kept entries. */
+ for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+ *pp = tm->tcpm_next;
+ kfree_rcu(tm, rcu_head);
+ found = true;
+ } else {
+ pp = &tm->tcpm_next;
+ }
+ }
+ spin_unlock_bh(&tcp_metrics_lock);
+ if (!found)
+ return -ESRCH;
+ return 0;
+}
+
+/* Generic netlink operations of the family. Both commands carry
+ * GENL_ADMIN_PERM; only GET supports dumping.
+ */
+static const struct genl_ops tcp_metrics_nl_ops[] = {
+ {
+ .cmd = TCP_METRICS_CMD_GET,
+ .doit = tcp_metrics_nl_cmd_get,
+ .dumpit = tcp_metrics_nl_dump,
+ .policy = tcp_metrics_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = TCP_METRICS_CMD_DEL,
+ .doit = tcp_metrics_nl_cmd_del,
+ .policy = tcp_metrics_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+/* Requested number of hash slots; 0 lets tcp_net_metrics_init() pick. */
+static unsigned int tcpmhash_entries;
+
+/* Parse the "tcpmhash_entries=" boot parameter. Returns 1 when the value
+ * was parsed, 0 when it is missing or not a valid unsigned integer.
+ */
+static int __init set_tcpmhash_entries(char *str)
+{
+	if (!str)
+		return 0;
+
+	return kstrtouint(str, 0, &tcpmhash_entries) ? 0 : 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+/* Per-netns init: size and allocate the metrics hash table.
+ *
+ * The slot count comes from the tcpmhash_entries boot parameter, or, if
+ * that is 0, is 16K when totalram_pages >= 128K and 8K otherwise. The
+ * table is sized to 2^order_base_2(slots) buckets and zero-allocated
+ * with kzalloc(), falling back to vzalloc(). Returns 0 or -ENOMEM.
+ */
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+ size_t size;
+ unsigned int slots;
+
+ slots = tcpmhash_entries;
+ if (!slots) {
+ if (totalram_pages >= 128 * 1024)
+ slots = 16 * 1024;
+ else
+ slots = 8 * 1024;
+ }
+
+ net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
+ size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
+
+ /* __GFP_NOWARN: a kzalloc() failure is expected to be handled below. */
+ net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!net->ipv4.tcp_metrics_hash)
+ net->ipv4.tcp_metrics_hash = vzalloc(size);
+
+ if (!net->ipv4.tcp_metrics_hash)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Per-netns exit: free every entry of every chain, then the table itself.
+ * Entries are released with plain kfree(), without taking
+ * tcp_metrics_lock or waiting for an RCU grace period.
+ * NOTE(review): presumably no reader can still reach the table at this
+ * point - confirm against the pernet exit ordering.
+ */
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+ unsigned int i;
+
+ for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
+ struct tcp_metrics_block *tm, *next;
+
+ tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
+ while (tm) {
+ next = rcu_dereference_protected(tm->tcpm_next, 1);
+ kfree(tm);
+ tm = next;
+ }
+ }
+ /* kvfree() handles both the kzalloc() and the vzalloc() case. */
+ kvfree(net->ipv4.tcp_metrics_hash);
+}
+
+/* Per-netns hooks that create and destroy the metrics hash table. */
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+ .init = tcp_net_metrics_init,
+ .exit = tcp_net_metrics_exit,
+};
+
+/* Boot-time setup of the metrics cache: register the per-netns hooks,
+ * then the generic netlink family. Failures are not reported to the
+ * caller; if the netlink registration fails, the per-netns hooks are
+ * unregistered again.
+ */
+void __init tcp_metrics_init(void)
+{
+	if (register_pernet_subsys(&tcp_net_metrics_ops) < 0)
+		return;
+
+	if (genl_register_family_with_ops(&tcp_metrics_nl_family,
+					  tcp_metrics_nl_ops) < 0)
+		unregister_pernet_subsys(&tcp_net_metrics_ops);
+}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3527b51d615..e68e0d4af6c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,62 +49,12 @@ struct inet_timewait_death_row tcp_death_row = {
};
EXPORT_SYMBOL_GPL(tcp_death_row);
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-static int tcp_remember_stamp(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_peer *peer;
- bool release_it;
-
- peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
- if (peer) {
- if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
- ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
- peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
- peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
- peer->tcp_ts = tp->rx_opt.ts_recent;
- }
- if (release_it)
- inet_putpeer(peer);
- return 1;
- }
-
- return 0;
-}
-
-static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
- struct sock *sk = (struct sock *) tw;
- struct inet_peer *peer;
-
- peer = twsk_getpeer(sk);
- if (peer) {
- const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
-
- if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
- ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
- peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
- peer->tcp_ts = tcptw->tw_ts_recent;
- }
- inet_putpeer(peer);
- return 1;
- }
- return 0;
-}
-
-static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
- return 1;
+ return true;
if (after(end_seq, s_win) && before(seq, e_win))
- return 1;
+ return true;
return seq == e_win && seq == end_seq;
}
@@ -135,21 +85,23 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
* spinlock it. I do not want! Well, probability of misbehaviour
* is ridiculously low and, seems, we could use some mb() tricks
* to avoid misread sequence numbers, states etc. --ANK
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
*/
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
const struct tcphdr *th)
{
struct tcp_options_received tmp_opt;
- u8 *hash_location;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
- int paws_reject = 0;
+ bool paws_reject = false;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
@@ -316,7 +268,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct inet_timewait_sock *tw = NULL;
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
- int recycle_ok = 0;
+ bool recycle_ok = false;
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
@@ -327,23 +279,25 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (tw != NULL) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+ struct inet_sock *inet = inet_sk(sk);
+ tw->tw_transparent = inet->transparent;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tcptw->tw_ts_offset = tp->tsoffset;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_timewait_sock *tw6;
- tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
- tw6 = inet6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ tw->tw_tclass = np->tclass;
+ tw->tw_flowlabel = np->flow_label >> 12;
tw->tw_ipv6only = np->ipv6only;
}
#endif
@@ -357,13 +311,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
*/
do {
struct tcp_md5sig_key *key;
- memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
- tcptw->tw_md5_keylen = 0;
+ tcptw->tw_md5_key = NULL;
key = tp->af_specific->md5_lookup(sk, sk);
if (key != NULL) {
- memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
- tcptw->tw_md5_keylen = key->keylen;
- if (tcp_alloc_md5sig_pool(sk) == NULL)
+ tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+ if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
BUG();
}
} while (0);
@@ -392,7 +344,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
@@ -403,12 +355,44 @@ void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_keylen)
- tcp_free_md5sig_pool();
+
+ if (twsk->tw_md5_key)
+ kfree_rcu(twsk->tw_md5_key, rcu);
#endif
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+/*
+ * Fill in the receive-window parameters of request @req (rcv_wnd,
+ * window_clamp and the window scale) from socket @sk and route @dst.
+ */
+void tcp_openreq_init_rwin(struct request_sock *req,
+			   struct sock *sk, struct dst_entry *dst)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss = dst_metric_advmss(dst);
+	__u8 wscale;
+
+	/* A non-zero user MSS may only lower the route's advertised MSS. */
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+		mss = tp->rx_opt.user_mss;
+
+	/* Set this up on the first call only */
+	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+	/* Limit the window selection if the user enforces a smaller rx buffer */
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) {
+		if (req->window_clamp == 0 ||
+		    req->window_clamp > tcp_full_space(sk))
+			req->window_clamp = tcp_full_space(sk);
+	}
+
+	/* Reserve room for the timestamp option when tstamp_ok is set. */
+	if (ireq->tstamp_ok)
+		mss -= TCPOLEN_TSTAMP_ALIGNED;
+
+	/* tcp_full_space because it is guaranteed to be the first packet */
+	tcp_select_initial_window(tcp_full_space(sk),
+				  mss,
+				  &req->rcv_wnd,
+				  &req->window_clamp,
+				  ireq->wscale_ok,
+				  &wscale,
+				  dst_metric(dst, RTAX_INITRWND));
+	ireq->rcv_wscale = wscale;
+}
+EXPORT_SYMBOL(tcp_openreq_init_rwin);
+
static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
struct request_sock *req)
{
@@ -423,39 +407,13 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
*/
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
- struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
if (newsk != NULL) {
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp = tcp_sk(newsk);
- struct tcp_sock *oldtp = tcp_sk(sk);
- struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
-
- /* TCP Cookie Transactions require space for the cookie pair,
- * as it differs for each connection. There is no need to
- * copy any s_data_payload stored at the original socket.
- * Failure will prevent resuming the connection.
- *
- * Presumed copied, in order of appearance:
- * cookie_in_always, cookie_out_never
- */
- if (oldcvp != NULL) {
- struct tcp_cookie_values *newcvp =
- kzalloc(sizeof(*newtp->cookie_values),
- GFP_ATOMIC);
-
- if (newcvp != NULL) {
- kref_init(&newcvp->kref);
- newcvp->cookie_desired =
- oldcvp->cookie_desired;
- newtp->cookie_values = newcvp;
- } else {
- /* Not Yet Implemented */
- newtp->cookie_values = NULL;
- }
- }
/* Now setup tcp_sock */
newtp->pred_flags = 0;
@@ -464,15 +422,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up =
- treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+ newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
+ INIT_LIST_HEAD(&newtp->tsq_node);
tcp_init_wl(newtp, treq->rcv_isn);
- newtp->srtt = 0;
- newtp->mdev = TCP_TIMEOUT_INIT;
+ newtp->srtt_us = 0;
+ newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -480,26 +438,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->sacked_out = 0;
newtp->fackets_out = 0;
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_enable_early_retrans(newtp);
+ newtp->tlp_high_seq = 0;
+ newtp->lsndtime = treq->snt_synack;
+ newtp->total_retrans = req->num_retrans;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
- newtp->snd_cwnd = 2;
+ newtp->snd_cwnd = TCP_INIT_CWND;
newtp->snd_cwnd_cnt = 0;
- newtp->bytes_acked = 0;
- newtp->frto_counter = 0;
- newtp->frto_highmark = 0;
-
- newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
+ !try_module_get(newicsk->icsk_ca_ops->owner))
+ newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
- skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->write_seq = newtp->pushed_seq =
- treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+ __skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
@@ -540,6 +499,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
+ newtp->tsoffset = 0;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -549,6 +509,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
TCP_ECN_openreq_child(newtp, req);
+ newtp->fastopen_rsk = NULL;
+ newtp->syn_data_acked = 0;
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
@@ -557,24 +519,32 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
EXPORT_SYMBOL(tcp_create_openreq_child);
/*
- * Process an incoming packet for SYN_RECV sockets represented
- * as a request_sock.
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
*/
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct request_sock **prev)
+ struct request_sock **prev,
+ bool fastopen)
{
struct tcp_options_received tmp_opt;
- u8 *hash_location;
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
- int paws_reject = 0;
+ bool paws_reject = false;
+
+ BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
@@ -582,7 +552,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
+ tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -607,8 +577,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*
* Enforce "SYN-ACK" according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
+ *
+ * Note that even if there is new data in the SYN packet
+ * they will be thrown away too.
+ *
+ * Reset timer after retransmitting SYNACK, similar to
+ * the idea of fast retransmit in recovery.
*/
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ if (!inet_rtx_syn_ack(sk, req))
+ req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX) + jiffies;
return NULL;
}
@@ -664,11 +642,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* sent (the segment carries an unacceptable ACK) ...
* a reset is sent."
*
- * Invalid ACK: reset will be sent by listening socket
+ * Invalid ACK: reset will be sent by listening socket.
+ * Note that the ACK validity check for a Fast Open socket is done
+ * elsewhere and is checked directly against the child socket rather
+ * than req because user data may have been sent out.
*/
- if ((flg & TCP_FLAG_ACK) &&
+ if ((flg & TCP_FLAG_ACK) && !fastopen &&
(TCP_SKB_CB(skb)->ack_seq !=
- tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
+ tcp_rsk(req)->snt_isn + 1))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -679,7 +660,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(sk, skb, req);
@@ -690,7 +671,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* In sequence, PAWS is OK. */
- if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
+ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
req->ts_recent = tmp_opt.rcv_tsval;
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -709,12 +690,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* ACK sequence verified above, just make sure ACK is
* set. If ACK not set, just silently drop the packet.
+ *
+ * XXX (TFO) - if we ever allow "data after SYN", the
+ * following check needs to be removed.
*/
if (!(flg & TCP_FLAG_ACK))
return NULL;
+ /* For Fast Open no more processing is needed (sk is the
+ * child socket).
+ */
+ if (fastopen)
+ return sk;
+
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
- if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
@@ -744,11 +734,21 @@ listen_overflow:
}
embryonic_reset:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
- if (!(flg & TCP_FLAG_RST))
+ if (!(flg & TCP_FLAG_RST)) {
+ /* Received a bad SYN pkt - for TFO We try not to reset
+ * the local connection unless it's really necessary to
+ * avoid becoming vulnerable to outside attack aiming at
+ * resetting legit local connections.
+ */
req->rsk_ops->send_reset(sk, skb);
-
- inet_csk_reqsk_queue_drop(sk, req, prev);
+ } else if (fastopen) { /* received a valid RST pkt */
+ reqsk_fastopen_remove(sk, req, true);
+ tcp_reset(sk);
+ }
+ if (!fastopen) {
+ inet_csk_reqsk_queue_drop(sk, req, prev);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ }
return NULL;
}
EXPORT_SYMBOL(tcp_check_req);
@@ -757,6 +757,12 @@ EXPORT_SYMBOL(tcp_check_req);
* Queue segment on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * lock is obtained, other packets cause the same connection to
+ * be created.
*/
int tcp_child_process(struct sock *parent, struct sock *child,
@@ -770,7 +776,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
skb->len);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
new file mode 100644
index 00000000000..55046ecd083
--- /dev/null
+++ b/net/ipv4/tcp_offload.c
@@ -0,0 +1,329 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * TCPv4 GSO/GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+
+/*
+ * tcp_gso_segment - split one large TCP skb into MSS-sized segments.
+ *
+ * After validating and pulling the TCP header this either
+ * (a) only recomputes gso_segs and returns NULL, when skb_gso_ok() accepts
+ * the skb for @features | NETIF_F_GSO_ROBUST, or
+ * (b) calls skb_segment() and then fixes up the sequence number, the
+ * FIN/PSH/CWR flags and the checksum of the resulting segments.
+ *
+ * Returns the head of the segment list, NULL in case (a), or an ERR_PTR:
+ * -EINVAL for a header, length or gso_type validation failure, otherwise
+ * whatever skb_segment() returned.
+ */
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
+ struct tcphdr *th;
+ unsigned int thlen;
+ unsigned int seq;
+ __be32 delta;
+ unsigned int oldlen;
+ unsigned int mss;
+ struct sk_buff *gso_skb = skb;
+ __sum16 newcheck;
+ bool ooo_okay, copy_destructor;
+
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ goto out;
+
+ th = tcp_hdr(skb);
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ if (!pskb_may_pull(skb, thlen))
+ goto out;
+
+ /* Complement of the original length, taken before the header is
+ * pulled; it is added to the new length below to form the delta
+ * applied to th->check.
+ */
+ oldlen = (u16)~skb->len;
+ __skb_pull(skb, thlen);
+
+ mss = tcp_skb_mss(skb);
+ /* Payload fits in one segment: nothing to split (segs is -EINVAL). */
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ /* Reject any gso_type bit outside the list below, and require
+ * at least one of TCPV4 / TCPV6 to be set.
+ */
+ if (unlikely(type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_MPLS |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ 0) ||
+ !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ /* No software segmentation performed: report that with NULL. */
+ segs = NULL;
+ goto out;
+ }
+
+ /* Remember whether the original skb uses tcp_wfree as destructor, so
+ * the destructor and socket can be propagated to the segments.
+ */
+ copy_destructor = gso_skb->destructor == tcp_wfree;
+ ooo_okay = gso_skb->ooo_okay;
+ /* All segments but the first should have ooo_okay cleared */
+ skb->ooo_okay = 0;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs))
+ goto out;
+
+ /* Only first segment might have ooo_okay set */
+ segs->ooo_okay = ooo_okay;
+
+ /* Length delta for a full-sized segment: header plus one mss. */
+ delta = htonl(oldlen + (thlen + mss));
+
+ skb = segs;
+ th = tcp_hdr(skb);
+ seq = ntohl(th->seq);
+
+ newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+
+ /* Each iteration finalises the current segment (clears FIN/PSH, sets
+ * the checksum), then advances and writes the sequence number and
+ * cleared CWR into the *next* segment.  The loop stops with skb and
+ * th pointing at the last segment, whose FIN/PSH bits and checksum
+ * have not been touched yet.
+ */
+ do {
+ th->fin = th->psh = 0;
+ th->check = newcheck;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check = gso_make_checksum(skb, ~th->check);
+
+ seq += mss;
+ if (copy_destructor) {
+ skb->destructor = gso_skb->destructor;
+ skb->sk = gso_skb->sk;
+ sum_truesize += skb->truesize;
+ }
+ skb = skb->next;
+ th = tcp_hdr(skb);
+
+ th->seq = htonl(seq);
+ th->cwr = 0;
+ } while (skb->next);
+
+ /* Following permits TCP Small Queues to work well with GSO :
+ * The callback to TCP stack will be called at the time last frag
+ * is freed at TX completion, and not right now when gso_skb
+ * is freed by GSO engine
+ */
+ if (copy_destructor) {
+ swap(gso_skb->sk, skb->sk);
+ swap(gso_skb->destructor, skb->destructor);
+ sum_truesize += skb->truesize;
+ /* Charge the socket for the truesize difference between the
+ * segments and the original skb.
+ */
+ atomic_add(sum_truesize - gso_skb->truesize,
+ &skb->sk->sk_wmem_alloc);
+ }
+
+ /* The last segment may be shorter than mss, so its delta is computed
+ * from its actual length (linear part from the transport header to
+ * the tail, plus data_len).
+ */
+ delta = htonl(oldlen + (skb_tail_pointer(skb) -
+ skb_transport_header(skb)) +
+ skb->data_len);
+ th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check = gso_make_checksum(skb, ~th->check);
+out:
+ return segs;
+}
+
+/*
+ * tcp_gro_receive - try to merge @skb into a held packet on the @head list.
+ *
+ * Pulls the TCP header of @skb in GRO terms, looks for a held packet that
+ * is still marked same_flow and has identical port numbers, and merges
+ * @skb into it through skb_gro_receive() when none of the flush conditions
+ * below is met.
+ *
+ * Returns the list slot of the matching held packet when that packet was
+ * found and either @skb is not same_flow or a flush condition is set;
+ * NULL otherwise.  NAPI_GRO_CB(skb)->flush is OR-ed with the final flush
+ * verdict.
+ * NOTE(review): the returned slot is presumably what the GRO core flushes;
+ * confirm against the caller.
+ */
+struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th;
+ struct tcphdr *th2;
+ unsigned int len;
+ unsigned int thlen;
+ __be32 flags;
+ unsigned int mss = 1;
+ unsigned int hlen;
+ unsigned int off;
+ /* Starts at 1 so every early "goto out" marks @skb for flushing. */
+ int flush = 1;
+ int i;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ /* Repeat the header access now that the full length with options is
+ * known.
+ */
+ hlen = off + thlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ skb_gro_pull(skb, thlen);
+
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
+
+ /* Walk the held packets; on exit without a match p is NULL. */
+ for (; (p = *head); head = &p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+
+ /* One 32-bit compare starting at the source port; in struct
+ * tcphdr this spans the source and destination ports.
+ */
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ goto found;
+ }
+
+ goto out_check_final;
+
+found:
+ /* Include the IP ID check below from the inner most IP hdr */
+ flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+ flush |= (__force int)(flags & TCP_FLAG_CWR);
+ /* Any flag difference other than CWR, FIN and PSH forces a flush. */
+ flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+ flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+ /* The option bytes must be identical, compared 32 bits at a time. */
+ for (i = sizeof(*th); i < thlen; i += 4)
+ flush |= *(u32 *)((u8 *)th + i) ^
+ *(u32 *)((u8 *)th2 + i);
+
+ mss = tcp_skb_mss(p);
+
+ /* len is unsigned, so len == 0 wraps: both an empty segment and one
+ * larger than mss set flush here.
+ */
+ flush |= (len - 1) >= mss;
+ /* @skb must start exactly where the held packet's data ends. */
+ flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+
+ if (flush || skb_gro_receive(head, skb)) {
+ /* Not merged: with mss reset to 1, "len < mss" below is true
+ * only for an empty payload.
+ */
+ mss = 1;
+ goto out_check_final;
+ }
+
+ p = *head;
+ th2 = tcp_hdr(p);
+ /* Carry FIN/PSH of the merged segment over to the held packet. */
+ tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
+
+out_check_final:
+ /* flush is reassigned here (not OR-ed): the final verdict depends only
+ * on a short segment and on the flags listed below.
+ */
+ flush = len < mss;
+ flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+ TCP_FLAG_RST | TCP_FLAG_SYN |
+ TCP_FLAG_FIN));
+
+ if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+ pp = head;
+
+out:
+ NAPI_GRO_CB(skb)->flush |= (flush != 0);
+
+ return pp;
+}
+
+/*
+ * Finish a GRO-aggregated TCP packet: set it up for CHECKSUM_PARTIAL with
+ * the checksum located at the TCP header's check field, record the number
+ * of aggregated segments in gso_segs, and flag SKB_GSO_TCP_ECN when the
+ * header has CWR set.  Always returns 0.
+ */
+int tcp_gro_complete(struct sk_buff *skb)
+{
+	struct tcphdr *tcph = tcp_hdr(skb);
+
+	/* Checksum is to be computed starting at the TCP header. */
+	skb->csum_start = (unsigned char *)tcph - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+	if (tcph->cwr)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_gro_complete);
+
+/*
+ * Prepare the TCP checksum of @skb for GSO: zero the check field, switch
+ * the skb to CHECKSUM_PARTIAL and let __tcp_v4_send_check() fill in the
+ * checksum from the IPv4 source and destination addresses.
+ *
+ * Returns 0 on success, -EINVAL when a basic TCP header cannot be pulled.
+ */
+static int tcp_v4_gso_send_check(struct sk_buff *skb)
+{
+	const struct iphdr *ip4h;
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, sizeof(*tcph)))
+		return -EINVAL;
+
+	/* Header pointers are fetched only after the pull above. */
+	ip4h = ip_hdr(skb);
+	tcph = tcp_hdr(skb);
+
+	tcph->check = 0;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	__tcp_v4_send_check(skb, ip4h->saddr, ip4h->daddr);
+
+	return 0;
+}
+
+/*
+ * IPv4 front end for tcp_gro_receive(): verify the TCP checksum of @skb
+ * first, unless the packet is already marked for flushing.
+ *
+ * Returns whatever tcp_gro_receive() returns, or NULL (with
+ * NAPI_GRO_CB(skb)->flush set) when the checksum does not verify.
+ */
+static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ /* Use the IP hdr immediately preceding this transport header */
+ const struct iphdr *iph = skb_gro_network_header(skb);
+ __wsum wsum;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip_csum;
+
+ wsum = NAPI_GRO_CB(skb)->csum;
+
+ /* Only CHECKSUM_NONE and CHECKSUM_COMPLETE are verified here; any other
+ * ip_summed value goes straight to tcp_gro_receive().
+ */
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ /* No checksum available: compute it over the GRO payload. */
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
+ 0);
+
+ /* fall through */
+
+ case CHECKSUM_COMPLETE:
+ /* tcp_v4_check() yields 0 when the checksum verifies. */
+ if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
+ wsum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+
+ /* Bad checksum: mark for flush and do not attempt to merge. */
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+skip_csum:
+ return tcp_gro_receive(head, skb);
+}
+
+/*
+ * IPv4 GRO completion: seed th->check with the complemented result of
+ * tcp_v4_check() over the TCP length (skb->len - @thoff) and the IPv4
+ * addresses, tag the packet as SKB_GSO_TCPV4, then finish in
+ * tcp_gro_complete().
+ */
+static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+{
+	const struct iphdr *ip4h = ip_hdr(skb);
+	struct tcphdr *tcph = tcp_hdr(skb);
+
+	tcph->check = ~tcp_v4_check(skb->len - thoff,
+				    ip4h->saddr, ip4h->daddr, 0);
+	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+
+	return tcp_gro_complete(skb);
+}
+
+/* GSO/GRO callback table for TCP over IPv4. */
+static const struct net_offload tcpv4_offload = {
+ .callbacks = {
+ .gso_send_check = tcp_v4_gso_send_check,
+ .gso_segment = tcp_gso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ },
+};
+
+/* Register the TCPv4 offload callbacks for IPPROTO_TCP; returns the result
+ * of inet_add_offload().
+ */
+int __init tcpv4_offload_init(void)
+{
+ return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 749b6498588..179b51e6bda 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -34,6 +34,8 @@
*
*/
+#define pr_fmt(fmt) "TCP: " fmt
+
#include <net/tcp.h>
#include <linux/compiler.h>
@@ -48,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
*/
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
@@ -60,27 +65,30 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
-int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
-EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
+unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp);
/* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
tcp_advance_send_head(sk, skb);
tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- /* Don't override Nagle indefinately with F-RTO */
- if (tp->frto_counter == 2)
- tp->frto_counter = 3;
-
tp->packets_out += tcp_skb_pcount(skb);
- if (!prior_packets)
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ tcp_rearm_rto(sk);
+ }
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+ tcp_skb_pcount(skb));
}
/* SND.NXT, if window was not shrunk.
@@ -89,9 +97,9 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
* invalid. OK, let's make this for now:
*/
-static inline __u32 tcp_acceptable_seq(struct sock *sk)
+static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
if (!before(tcp_wnd_end(tp), tp->snd_nxt))
return tp->snd_nxt;
@@ -116,12 +124,16 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk)
static __u16 tcp_advertise_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
int mss = tp->advmss;
- if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
- mss = dst_metric(dst, RTAX_ADVMSS);
- tp->advmss = mss;
+ if (dst) {
+ unsigned int metric = dst_metric_advmss(dst);
+
+ if (metric < mss) {
+ mss = metric;
+ tp->advmss = mss;
+ }
}
return (__u16)mss;
@@ -129,7 +141,7 @@ static __u16 tcp_advertise_mss(struct sock *sk)
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
* This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
{
struct tcp_sock *tp = tcp_sk(sk);
s32 delta = tcp_time_stamp - tp->lsndtime;
@@ -150,10 +162,11 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
- struct sk_buff *skb, struct sock *sk)
+ struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_time_stamp;
+ const struct dst_entry *dst = __sk_dst_get(sk);
if (sysctl_tcp_slow_start_after_idle &&
(!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
@@ -164,8 +177,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- icsk->icsk_ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
+ (!dst || !dst_metric(dst, RTAX_QUICKACK)))
+ icsk->icsk_ack.pingpong = 1;
}
/* Account for an ACK we sent. */
@@ -175,6 +189,21 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
+
+/*
+ * Default initial receive window, in segments, for a given @mss.
+ *
+ * Initial receive window should be twice of TCP_INIT_CWND to
+ * enable proper sending of new unsent data during fast recovery
+ * (RFC 3517, Section 4, NextSeg() rule (2)). For an mss larger than
+ * 1460 the segment count is scaled down by 1460 / mss, but never
+ * below 2.
+ */
+u32 tcp_default_init_rwnd(u32 mss)
+{
+	const u32 init_rwnd = TCP_INIT_CWND * 2;
+
+	if (mss <= 1460)
+		return init_rwnd;
+
+	return max((1460 * init_rwnd) / mss, 2U);
+}
+
/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
@@ -224,18 +253,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
}
}
- /* Set initial window to value enough for senders, following RFC5681. */
if (mss > (1 << *rcv_wscale)) {
- int init_cwnd = rfc3390_bytes_to_packets(mss);
-
- /* when initializing use the value from init_rcv_wnd
- * rather than the default from above
- */
- if (init_rcv_wnd &&
- (*rcv_wnd > init_rcv_wnd * mss))
- *rcv_wnd = init_rcv_wnd * mss;
- else if (*rcv_wnd > init_cwnd * mss)
- *rcv_wnd = init_cwnd * mss;
+ if (!init_rcv_wnd) /* Use default unless specified otherwise */
+ init_rcv_wnd = tcp_default_init_rwnd(mss);
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
}
/* Set the clamp no higher than max representable value */
@@ -251,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);
static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 old_win = tp->rcv_wnd;
u32 cur_win = tcp_receive_window(tp);
u32 new_win = __tcp_select_window(sk);
@@ -263,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)
*
* Relax Will Robinson.
*/
+ if (new_win == 0)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPWANTZEROWINDOWADV);
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
}
tp->rcv_wnd = new_win;
@@ -280,18 +305,24 @@ static u16 tcp_select_window(struct sock *sk)
new_win >>= tp->rx_opt.rcv_wscale;
/* If we advertise zero window, disable fast path. */
- if (new_win == 0)
+ if (new_win == 0) {
tp->pred_flags = 0;
+ if (old_win)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTOZEROWINDOWADV);
+ } else if (old_win == 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+ }
return new_win;
}
/* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
{
- TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
- TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
}
/* Packet ECN state for a SYN. */
@@ -300,14 +331,14 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sysctl_tcp_ecn == 1) {
- TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
}
static __inline__ void
-TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
+TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
{
if (inet_rsk(req)->ecn_ok)
th->ece = 1;
@@ -345,15 +376,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
- TCP_SKB_CB(skb)->flags = flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags;
TCP_SKB_CB(skb)->sacked = 0;
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
+ shinfo->gso_segs = 1;
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -361,7 +394,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->end_seq = seq;
}
-static inline int tcp_urg_mode(const struct tcp_sock *tp)
+static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
return tp->snd_una != tp->snd_up;
}
@@ -370,51 +403,25 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
-#define OPTION_COOKIE_EXTENSION (1 << 4)
+#define OPTION_FAST_OPEN_COOKIE (1 << 8)
struct tcp_out_options {
- u8 options; /* bit field of OPTION_* */
+ u16 options; /* bit field of OPTION_* */
+ u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
- u16 mss; /* 0 to disable */
- __u32 tsval, tsecr; /* need to include OPTION_TS */
__u8 *hash_location; /* temporary pointer, overloaded */
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
};
-/* The sysctl int routines are generic, so check consistency here.
- */
-static u8 tcp_cookie_size_check(u8 desired)
-{
- if (desired > 0) {
- /* previously specified */
- return desired;
- }
- if (sysctl_tcp_cookie_size <= 0) {
- /* no default specified */
- return 0;
- }
- if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
- /* value too small, specify minimum */
- return TCP_COOKIE_MIN;
- }
- if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
- /* value too large, specify maximum */
- return TCP_COOKIE_MAX;
- }
- if (0x1 & sysctl_tcp_cookie_size) {
- /* 8-bit multiple, illegal, fix it */
- return (u8)(sysctl_tcp_cookie_size + 0x1);
- }
- return (u8)sysctl_tcp_cookie_size;
-}
-
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
* TCP options, we learned this through the hard way, so be careful here.
* Luckily we can at least blame others for their non-compliance but from
- * inter-operatibility perspective it seems that we're somewhat stuck with
+ * inter-operability perspective it seems that we're somewhat stuck with
* the ordering which we have been using if we want to keep working with
* those broken things (not that it currently hurts anybody as there isn't
* particular reason why the ordering would need to be changed).
@@ -425,29 +432,11 @@ static u8 tcp_cookie_size_check(u8 desired)
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
struct tcp_out_options *opts)
{
- u8 options = opts->options; /* mungable copy */
+ u16 options = opts->options; /* mungable copy */
- /* Having both authentication and cookies for security is redundant,
- * and there's certainly not enough room. Instead, the cookie-less
- * extension variant is proposed.
- *
- * Consider the pessimal case with authentication. The options
- * could look like:
- * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
- */
if (unlikely(OPTION_MD5 & options)) {
- if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
- *ptr++ = htonl((TCPOPT_COOKIE << 24) |
- (TCPOLEN_COOKIE_BASE << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- } else {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- }
- options &= ~OPTION_COOKIE_EXTENSION;
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
/* overload cookie hash location */
opts->hash_location = (__u8 *)ptr;
ptr += 4;
@@ -476,44 +465,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
- /* Specification requires after timestamp, so do it now.
- *
- * Consider the pessimal case without authentication. The options
- * could look like:
- * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
- */
- if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
- __u8 *cookie_copy = opts->hash_location;
- u8 cookie_size = opts->hash_size;
-
- /* 8-bit multiple handled in tcp_cookie_size_check() above,
- * and elsewhere.
- */
- if (0x2 & cookie_size) {
- __u8 *p = (__u8 *)ptr;
-
- /* 16-bit multiple */
- *p++ = TCPOPT_COOKIE;
- *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
- *p++ = *cookie_copy++;
- *p++ = *cookie_copy++;
- ptr++;
- cookie_size -= 2;
- } else {
- /* 32-bit multiple */
- *ptr++ = htonl(((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_COOKIE << 8) |
- TCPOLEN_COOKIE_BASE) +
- cookie_size);
- }
-
- if (cookie_size > 0) {
- memcpy(ptr, cookie_copy, cookie_size);
- ptr += (cookie_size / 4);
- }
- }
-
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -547,20 +498,33 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
tp->rx_opt.dsack = 0;
}
+
+ if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+ struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+
+ *ptr++ = htonl((TCPOPT_EXP << 24) |
+ ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+
+ memcpy(ptr, foc->val, foc->len);
+ if ((foc->len & 3) == 2) {
+ u8 *align = ((u8 *)ptr) + foc->len;
+ align[0] = align[1] = TCPOPT_NOP;
+ }
+ ptr += (foc->len + 3) >> 2;
+ }
}
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
-static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5) {
+ struct tcp_md5sig_key **md5)
+{
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_cookie_values *cvp = tp->cookie_values;
- unsigned remaining = MAX_TCP_OPTION_SPACE;
- u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
- tcp_cookie_size_check(cvp->cookie_desired) :
- 0;
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_fastopen_request *fastopen = tp->fastopen_req;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -586,7 +550,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
opts->options |= OPTION_TS;
- opts->tsval = TCP_SKB_CB(skb)->when;
+ opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -601,68 +565,30 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- /* Note that timestamps are required by the specification.
- *
- * Odd numbers of bytes are prohibited by the specification, ensuring
- * that the cookie is 16-bit aligned, and the resulting cookie pair is
- * 32-bit aligned.
- */
- if (*md5 == NULL &&
- (OPTION_TS & opts->options) &&
- cookie_size > 0) {
- int need = TCPOLEN_COOKIE_BASE + cookie_size;
-
- if (0x2 & need) {
- /* 32-bit multiple */
- need += 2; /* NOPs */
-
- if (need > remaining) {
- /* try shrinking cookie to fit */
- cookie_size -= 2;
- need -= 4;
- }
- }
- while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
- cookie_size -= 4;
- need -= 4;
- }
- if (TCP_COOKIE_MIN <= cookie_size) {
- opts->options |= OPTION_COOKIE_EXTENSION;
- opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
- opts->hash_size = cookie_size;
-
- /* Remember for future incarnations. */
- cvp->cookie_desired = cookie_size;
-
- if (cvp->cookie_desired != cvp->cookie_pair_size) {
- /* Currently use random bytes as a nonce,
- * assuming these are completely unpredictable
- * by hostile users of the same system.
- */
- get_random_bytes(&cvp->cookie_pair[0],
- cookie_size);
- cvp->cookie_pair_size = cookie_size;
- }
-
+ if (fastopen && fastopen->cookie.len >= 0) {
+ u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = &fastopen->cookie;
remaining -= need;
+ tp->syn_fastopen = 1;
}
}
+
return MAX_TCP_OPTION_SPACE - remaining;
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned tcp_synack_options(struct sock *sk,
+static unsigned int tcp_synack_options(struct sock *sk,
struct request_sock *req,
- unsigned mss, struct sk_buff *skb,
+ unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5,
- struct tcp_extend_values *xvp)
+ struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
- unsigned remaining = MAX_TCP_OPTION_SPACE;
- u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
- xvp->cookie_plus :
- 0;
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
@@ -701,43 +627,33 @@ static unsigned tcp_synack_options(struct sock *sk,
if (unlikely(!ireq->tstamp_ok))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
-
- /* Similar rationale to tcp_syn_options() applies here, too.
- * If the <SYN> options fit, the same options should fit now!
- */
- if (*md5 == NULL &&
- ireq->tstamp_ok &&
- cookie_plus > TCPOLEN_COOKIE_BASE) {
- int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
-
- if (0x2 & need) {
- /* 32-bit multiple */
- need += 2; /* NOPs */
- }
- if (need <= remaining) {
- opts->options |= OPTION_COOKIE_EXTENSION;
- opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
+ if (foc != NULL && foc->len >= 0) {
+ u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = foc;
remaining -= need;
- } else {
- /* There's no error return, so flag it. */
- xvp->cookie_out_never = 1; /* true */
- opts->hash_size = 0;
}
}
+
return MAX_TCP_OPTION_SPACE - remaining;
}
/* Compute TCP options for ESTABLISHED sockets. This is not the
* final wire format yet.
*/
-static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
+static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5) {
+ struct tcp_md5sig_key **md5)
+{
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
- unsigned size = 0;
+ unsigned int size = 0;
unsigned int eff_sacks;
+ opts->options = 0;
+
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (unlikely(*md5)) {
@@ -750,16 +666,16 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
if (likely(tp->rx_opt.tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcb ? tcb->when : 0;
+ opts->tsval = tcb ? tcb->when + tp->tsoffset : 0;
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
- const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
+ const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
opts->num_sack_blocks =
- min_t(unsigned, eff_sacks,
+ min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -769,6 +685,172 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
return size;
}
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+ struct tasklet_struct tasklet;
+ struct list_head head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
+ tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ 0, GFP_ATOMIC);
+}
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+ struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+ LIST_HEAD(list);
+ unsigned long flags;
+ struct list_head *q, *n;
+ struct tcp_sock *tp;
+ struct sock *sk;
+
+ local_irq_save(flags);
+ list_splice_init(&tsq->head, &list);
+ local_irq_restore(flags);
+
+ list_for_each_safe(q, n, &list) {
+ tp = list_entry(q, struct tcp_sock, tsq_node);
+ list_del(&tp->tsq_node);
+
+ sk = (struct sock *)tp;
+ bh_lock_sock(sk);
+
+ if (!sock_owned_by_user(sk)) {
+ tcp_tsq_handler(sk);
+ } else {
+ /* defer the work to tcp_release_cb() */
+ set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ }
+ bh_unlock_sock(sk);
+
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+ sk_free(sk);
+ }
+}
+
+#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
+ (1UL << TCP_WRITE_TIMER_DEFERRED) | \
+ (1UL << TCP_DELACK_TIMER_DEFERRED) | \
+ (1UL << TCP_MTU_REDUCED_DEFERRED))
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nflags;
+
+ /* perform an atomic operation only if at least one flag is set */
+ do {
+ flags = tp->tsq_flags;
+ if (!(flags & TCP_DEFERRED_ALL))
+ return;
+ nflags = flags & ~TCP_DEFERRED_ALL;
+ } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+
+ if (flags & (1UL << TCP_TSQ_DEFERRED))
+ tcp_tsq_handler(sk);
+
+ /* Here begins the tricky part :
+ * We are called from release_sock() with :
+ * 1) BH disabled
+ * 2) sk_lock.slock spinlock held
+ * 3) socket owned by us (sk->sk_lock.owned == 1)
+ *
+ * But following code is meant to be called from BH handlers,
+ * so we should keep BH disabled, but early release socket ownership
+ */
+ sock_release_ownership(sk);
+
+ if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ tcp_write_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ tcp_delack_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ sk->sk_prot->mtu_reduced(sk);
+ __sock_put(sk);
+ }
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+ INIT_LIST_HEAD(&tsq->head);
+ tasklet_init(&tsq->tasklet,
+ tcp_tasklet_func,
+ (unsigned long)tsq);
+ }
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+ !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+ unsigned long flags;
+ struct tsq_tasklet *tsq;
+
+ /* Keep a ref on socket.
+ * This last ref will be released in tcp_tasklet_func()
+ */
+ atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+ /* queue this socket to tasklet queue */
+ local_irq_save(flags);
+ tsq = &__get_cpu_var(tsq_tasklet);
+ list_add(&tp->tsq_node, &tsq->head);
+ tasklet_schedule(&tsq->tasklet);
+ local_irq_restore(flags);
+ } else {
+ sock_wfree(skb);
+ }
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -788,26 +870,24 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
- unsigned tcp_options_size, tcp_header_size;
+ unsigned int tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
- /* If congestion control is doing timestamping, we must
- * take such a timestamp before we potentially clone/copy.
- */
- if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
- __net_timestamp(skb);
+ if (clone_it) {
+ skb_mstamp_get(&skb->skb_mstamp);
- if (likely(clone_it)) {
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
+ /* Our usage of tstamp should remain private */
+ skb->tstamp.tv64 = 0;
}
inet = inet_sk(sk);
@@ -815,22 +895,28 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->flags & TCPHDR_SYN))
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
else
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- if (tcp_packets_in_flight(tp) == 0) {
+ if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
- skb->ooo_okay = 1;
- } else
- skb->ooo_okay = 0;
+
+ /* if no packet is in qdisc/device queue, then allow XPS to select
+ * another queue.
+ */
+ skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
- skb_set_owner_w(skb, sk);
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = tcp_wfree;
+ atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
th = tcp_hdr(skb);
@@ -839,9 +925,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
- tcb->flags);
+ tcb->tcp_flags);
- if (unlikely(tcb->flags & TCPHDR_SYN)) {
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
@@ -864,7 +950,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
- if (likely((tcb->flags & TCPHDR_SYN) == 0))
+ if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
@@ -878,17 +964,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
icsk->icsk_af_ops->send_check(sk, skb);
- if (likely(tcb->flags & TCPHDR_ACK))
+ if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
- tcp_event_data_sent(tp, skb, sk);
+ tcp_event_data_sent(tp, sk);
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
- err = icsk->icsk_af_ops->queue_xmit(skb);
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
if (likely(err <= 0))
return err;
@@ -915,28 +1001,32 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
}
/* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
+static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
- if (skb->len <= mss_now || !sk_can_gso(sk) ||
- skb->ip_summed == CHECKSUM_NONE) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Make sure we own this skb before messing gso_size/gso_segs */
+ WARN_ON_ONCE(skb_cloned(skb));
+
+ if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- skb_shinfo(skb)->gso_segs = 1;
- skb_shinfo(skb)->gso_size = 0;
- skb_shinfo(skb)->gso_type = 0;
+ shinfo->gso_segs = 1;
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
} else {
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
- skb_shinfo(skb)->gso_size = mss_now;
- skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+ shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ shinfo->gso_size = mss_now;
+ shinfo->gso_type = sk->sk_gso_type;
}
}
/* When a modification to fackets out becomes necessary, we need to check
* skb is counted to fackets_out or not.
*/
-static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
+static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
int decr)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -951,7 +1041,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
/* Pcount in the middle of the write queue got changed, we need to do various
* tweaks to fix counters
*/
-static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
+static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -984,7 +1074,7 @@ static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
* Remember, these are still headerless SKBs at this point.
*/
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
- unsigned int mss_now)
+ unsigned int mss_now, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
@@ -992,19 +1082,18 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
int nlen;
u8 flags;
- BUG_ON(len > skb->len);
+ if (WARN_ON(len > skb->len))
+ return -EINVAL;
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
- if (skb_cloned(skb) &&
- skb_is_nonlinear(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, gfp))
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+ buff = sk_stream_alloc_skb(sk, nsize, gfp);
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
@@ -1020,9 +1109,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
- flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
- TCP_SKB_CB(buff)->flags = flags;
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -1077,25 +1166,36 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
*/
static void __pskb_trim_head(struct sk_buff *skb, int len)
{
+ struct skb_shared_info *shinfo;
int i, k, eat;
+ eat = min_t(int, len, skb_headlen(skb));
+ if (eat) {
+ __skb_pull(skb, eat);
+ len -= eat;
+ if (!len)
+ return;
+ }
eat = len;
k = 0;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size <= eat) {
- put_page(skb_shinfo(skb)->frags[i].page);
- eat -= skb_shinfo(skb)->frags[i].size;
+ shinfo = skb_shinfo(skb);
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ int size = skb_frag_size(&shinfo->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
} else {
- skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_shinfo(skb)->frags[k].size -= eat;
+ shinfo->frags[k].page_offset += eat;
+ skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
k++;
}
}
- skb_shinfo(skb)->nr_frags = k;
+ shinfo->nr_frags = k;
skb_reset_tail_pointer(skb);
skb->data_len -= len;
@@ -1105,14 +1205,10 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
- if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
return -ENOMEM;
- /* If len == headlen, we avoid __skb_pull to preserve alignment. */
- if (unlikely(len < skb_headlen(skb)))
- __skb_pull(skb, len);
- else
- __pskb_trim_head(skb, len - skb_headlen(skb));
+ __pskb_trim_head(skb, len);
TCP_SKB_CB(skb)->seq += len;
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -1122,20 +1218,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
sk_mem_uncharge(sk, len);
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
- /* Any change of skb->len requires recalculation of tso
- * factor and mss.
- */
+ /* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
return 0;
}
-/* Calculate MSS. Not accounting for SACKs here. */
-int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+/* Calculate MSS not accounting any TCP options. */
+static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int mss_now;
/* Calculate base mss without TCP options:
@@ -1143,6 +1237,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+ }
+
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
@@ -1153,18 +1255,22 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
/* Then reserve room for full set of TCP options and 8 bytes of data */
if (mss_now < 48)
mss_now = 48;
-
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
-
return mss_now;
}
+/* Calculate MSS. Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ /* Subtract TCP options size, not including SACKs */
+ return __tcp_mtu_to_mss(sk, pmtu) -
+ (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
+}
+
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int mtu;
mtu = mss +
@@ -1172,6 +1278,13 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len;
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mtu += icsk->icsk_af_ops->net_frag_header_len;
+ }
return mtu;
}
@@ -1238,10 +1351,10 @@ EXPORT_SYMBOL(tcp_sync_mss);
*/
unsigned int tcp_current_mss(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- unsigned header_len;
+ unsigned int header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
@@ -1267,12 +1380,43 @@ unsigned int tcp_current_mss(struct sock *sk)
return mss_now;
}
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ /* Limited by application or receiver window. */
+ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+ u32 win_used = max(tp->snd_cwnd_used, init_win);
+ if (win_used < tp->snd_cwnd) {
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+ }
+ tp->snd_cwnd_used = 0;
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->packets_out >= tp->snd_cwnd) {
+ /* Track the maximum number of outstanding packets in each
+ * window, and remember whether we were cwnd-limited then.
+ */
+ if (!before(tp->snd_una, tp->max_packets_seq) ||
+ tp->packets_out > tp->max_packets_out) {
+ tp->max_packets_out = tp->packets_out;
+ tp->max_packets_seq = tp->snd_nxt;
+ tp->is_cwnd_limited = is_cwnd_limited;
+ }
+
+ if (tcp_is_cwnd_limited(sk)) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1287,48 +1431,85 @@ static void tcp_cwnd_validate(struct sock *sk)
}
}
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+ return after(tp->snd_sml, tp->snd_una) &&
+ !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ * tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ * skb_pcount = skb->len / mss_now
*/
-static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
- unsigned int mss_now, unsigned int cwnd)
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+ const struct sk_buff *skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 needed, window, cwnd_len;
+ if (skb->len < tcp_skb_pcount(skb) * mss_now)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+ int nonagle)
+{
+ return partial &&
+ ((nonagle & TCP_NAGLE_CORK) ||
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+ const struct sk_buff *skb,
+ unsigned int mss_now,
+ unsigned int max_segs,
+ int nonagle)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 partial, needed, window, max_len;
window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
- cwnd_len = mss_now * cwnd;
+ max_len = mss_now * max_segs;
- if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
- return cwnd_len;
+ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+ return max_len;
needed = min(skb->len, window);
- if (cwnd_len <= needed)
- return cwnd_len;
+ if (max_len <= needed)
+ return max_len;
+
+ partial = needed % mss_now;
+ /* If last segment is not a full MSS, check if Nagle rules allow us
+ * to include this last segment in this skb.
+ * Otherwise, we'll split the skb at last MSS boundary
+ */
+ if (tcp_nagle_check(partial != 0, tp, nonagle))
+ return needed - partial;
- return needed - needed % mss_now;
+ return needed;
}
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
-static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
- struct sk_buff *skb)
+static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
{
u32 in_flight, cwnd;
/* Don't be strict about the congestion window for the final FIN. */
- if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+ tcp_skb_pcount(skb) == 1)
return 1;
in_flight = tcp_packets_in_flight(tp);
@@ -1339,11 +1520,11 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
return 0;
}
-/* Intialize TSO state of a skb.
+/* Initialize TSO state of a skb.
* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
-static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
+static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
@@ -1355,34 +1536,12 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
return tso_segs;
}
-/* Minshall's variant of the Nagle send check. */
-static inline int tcp_minshall_check(const struct tcp_sock *tp)
-{
- return after(tp->snd_sml, tp->snd_una) &&
- !after(tp->snd_sml, tp->snd_nxt);
-}
-/* Return 0, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_NODELAY was set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- * With Minshall's modification: all sent small packets are ACKed.
- */
-static inline int tcp_nagle_check(const struct tcp_sock *tp,
- const struct sk_buff *skb,
- unsigned mss_now, int nonagle)
-{
- return skb->len < mss_now &&
- ((nonagle & TCP_NAGLE_CORK) ||
- (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
-
-/* Return non-zero if the Nagle test allows this packet to be
+/* Return true if the Nagle test allows this packet to be
* sent now.
*/
-static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
- unsigned int cur_mss, int nonagle)
+static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
{
/* Nagle rule does not apply to frames, which sit in the middle of the
* write_queue (they have no chances to get new data).
@@ -1391,24 +1550,22 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
* argument based upon the location of SKB in the send queue.
*/
if (nonagle & TCP_NAGLE_PUSH)
- return 1;
+ return true;
- /* Don't use the nagle rule for urgent data (or for the final FIN).
- * Nagle can be ignored during F-RTO too (see RFC4138).
- */
- if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
- (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
- return 1;
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ return true;
- if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
- return 1;
+ if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
+ return true;
- return 0;
+ return false;
}
/* Does at least the first segment of SKB fit into the send window? */
-static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
- unsigned int cur_mss)
+static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -1422,10 +1579,10 @@ static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
* should be put on the wire right now. If so, it returns the number of
* packets allowed by the congestion window.
*/
-static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
unsigned int cur_mss, int nonagle)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
unsigned int cwnd_quota;
tcp_init_tso_segs(sk, skb, cur_mss);
@@ -1441,9 +1598,9 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
}
/* Test if sending is allowed right now. */
-int tcp_may_send_now(struct sock *sk)
+bool tcp_may_send_now(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
return skb &&
@@ -1468,7 +1625,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
- return tcp_fragment(sk, skb, len, mss_now);
+ return tcp_fragment(sk, skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp);
if (unlikely(buff == NULL))
@@ -1485,9 +1642,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
- flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
- TCP_SKB_CB(buff)->flags = flags;
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
/* This packet was never sent out yet, so no SACK bits. */
TCP_SKB_CB(buff)->sacked = 0;
@@ -1511,13 +1668,15 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
*
* This algorithm is from John Heffner.
*/
-static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+ bool *is_cwnd_limited)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 send_win, cong_win, limit, in_flight;
+ int win_divisor;
- if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1540,20 +1699,22 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
limit = min(send_win, cong_win);
/* If a full-sized TSO skb can be sent, do it. */
- if (limit >= sk->sk_gso_max_size)
+ if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
+ tp->xmit_size_goal_segs * tp->mss_cache))
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
- if (sysctl_tcp_tso_win_divisor) {
+ win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+ if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
/* If at least some fraction of a window is available,
* just use it.
*/
- chunk /= sysctl_tcp_tso_win_divisor;
+ chunk /= win_divisor;
if (limit >= chunk)
goto send_now;
} else {
@@ -1562,18 +1723,24 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
* frame, so if we have space for more than 3 frames
* then send now.
*/
- if (limit > tcp_max_burst(tp) * tp->mss_cache)
+ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
goto send_now;
}
- /* Ok, it looks like it is advisable to defer. */
- tp->tso_deferred = 1 | (jiffies << 1);
+ /* Ok, it looks like it is advisable to defer.
+ * Do not rearm the timer if already set to not break TCP ACK clocking.
+ */
+ if (!tp->tso_deferred)
+ tp->tso_deferred = 1 | (jiffies << 1);
+
+ if (cong_win < send_win && cong_win < skb->len)
+ *is_cwnd_limited = true;
- return 1;
+ return true;
send_now:
tp->tso_deferred = 0;
- return 0;
+ return false;
}
/* Create a new MTU probe if we are ready.
@@ -1643,7 +1810,7 @@ static int tcp_mtu_probe(struct sock *sk)
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
- TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
+ TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
TCP_SKB_CB(nskb)->sacked = 0;
nskb->csum = 0;
nskb->ip_summed = skb->ip_summed;
@@ -1663,11 +1830,11 @@ static int tcp_mtu_probe(struct sock *sk)
if (skb->len <= copy) {
/* We've eaten all the data from this skb.
* Throw it away. */
- TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
} else {
- TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
~(TCPHDR_FIN|TCPHDR_PSH);
if (!skb_shinfo(skb)->nr_frags) {
skb_pull(skb, copy);
@@ -1715,17 +1882,21 @@ static int tcp_mtu_probe(struct sock *sk)
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
- * Returns 1, if no segments are in flight and we have queued segments, but
- * cannot send anything now because of SWS or another problem.
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
*/
-static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
- int push_one, gfp_t gfp)
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
+ bool is_cwnd_limited = false;
sent_pkts = 0;
@@ -1733,7 +1904,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
- return 0;
+ return false;
} else if (result > 0) {
sent_pkts = 1;
}
@@ -1745,9 +1916,18 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+ goto repair; /* Skip network transmission */
+
cwnd_quota = tcp_cwnd_test(tp, skb);
- if (!cwnd_quota)
- break;
+ if (!cwnd_quota) {
+ is_cwnd_limited = true;
+ if (push_one == 2)
+ /* Force out a loss probe pkt. */
+ cwnd_quota = 1;
+ else
+ break;
+ }
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
@@ -1758,14 +1938,42 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
- if (!push_one && tcp_tso_should_defer(sk, skb))
+ if (!push_one &&
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+ break;
+ }
+
+ /* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+ limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
+ sk->sk_pacing_rate >> 10);
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
break;
}
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
- cwnd_quota);
+ min_t(unsigned int,
+ cwnd_quota,
+ sk->sk_gso_max_segs),
+ nonagle);
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -1776,23 +1984,165 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
+repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
tcp_event_new_data_sent(sk, skb);
tcp_minshall_update(tp, mss_now, skb);
- sent_pkts++;
+ sent_pkts += tcp_skb_pcount(skb);
if (push_one)
break;
}
if (likely(sent_pkts)) {
- tcp_cwnd_validate(sk);
- return 0;
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += sent_pkts;
+
+ /* Send one loss probe per tail loss episode. */
+ if (push_one != 2)
+ tcp_schedule_loss_probe(sk);
+ tcp_cwnd_validate(sk, is_cwnd_limited);
+ return false;
}
- return !tp->packets_out && tcp_send_head(sk);
+ return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout, tlp_time_stamp, rto_time_stamp;
+ u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+ return false;
+ /* No consecutive loss probes. */
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+ tcp_rearm_rto(sk);
+ return false;
+ }
+ /* Don't do any loss probe on a Fast Open connection before 3WHS
+ * finishes.
+ */
+ if (sk->sk_state == TCP_SYN_RECV)
+ return false;
+
+ /* TLP is only scheduled when next timer event is RTO. */
+ if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+ return false;
+
+ /* Schedule a loss probe in 2*RTT for SACK capable connections
+ * in Open state, that are either limited by cwnd or application.
+ */
+ if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
+ !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ return false;
+
+ if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+ tcp_send_head(sk))
+ return false;
+
+ /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+ * for delayed ack when there's one outstanding packet.
+ */
+ timeout = rtt << 1;
+ if (tp->packets_out == 1)
+ timeout = max_t(u32, timeout,
+ (rtt + (rtt >> 1) + TCP_DELACK_MAX));
+ timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+ /* If RTO is shorter, just schedule TLP in its place. */
+ tlp_time_stamp = tcp_time_stamp + timeout;
+ rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+ if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+ s32 delta = rto_time_stamp - tcp_time_stamp;
+ if (delta > 0)
+ timeout = delta;
+ }
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+ TCP_RTO_MAX);
+ return true;
+}
+
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct sk_buff *fclone = skb + 1;
+
+ if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+ fclone->fclone == SKB_FCLONE_CLONE)) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ return true;
+ }
+ return false;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int pcount;
+ int mss = tcp_current_mss(sk);
+ int err = -1;
+
+ if (tcp_send_head(sk) != NULL) {
+ err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ goto rearm_timer;
+ }
+
+ /* At most one outstanding TLP retransmission. */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ /* Retransmit last segment. */
+ skb = tcp_write_queue_tail(sk);
+ if (WARN_ON(!skb))
+ goto rearm_timer;
+
+ if (skb_still_in_host_queue(sk, skb))
+ goto rearm_timer;
+
+ pcount = tcp_skb_pcount(skb);
+ if (WARN_ON(!pcount))
+ goto rearm_timer;
+
+ if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+ if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+ GFP_ATOMIC)))
+ goto rearm_timer;
+ skb = tcp_write_queue_tail(sk);
+ }
+
+ if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+ goto rearm_timer;
+
+ err = __tcp_retransmit_skb(sk, skb);
+
+ /* Record snd_nxt for loss detection. */
+ if (likely(!err))
+ tp->tlp_high_seq = tp->snd_nxt;
+
+rearm_timer:
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
+
+ if (likely(!err))
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBES);
}
/* Push out any pending frames which were held back due to
@@ -1809,7 +2159,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
if (unlikely(sk->sk_state == TCP_CLOSE))
return;
- if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
+ if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+ sk_gfp_atomic(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
@@ -1889,7 +2240,8 @@ u32 __tcp_select_window(struct sock *sk)
*/
int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
- int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
+ int allowed_space = tcp_full_space(sk);
+ int full_space = min_t(int, tp->window_clamp, allowed_space);
int window;
if (mss > full_space)
@@ -1898,11 +2250,23 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
- if (tcp_memory_pressure)
+ if (sk_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
- if (free_space < mss)
+ /* free_space might become our new window, make sure we don't
+ * increase it due to wscale.
+ */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+ /* if free space is less than mss estimate, or is below 1/16th
+ * of the maximum allowed, try to move to zero-window, else
+ * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+ * new incoming data is dropped due to memory limits.
+ * With large window, mss test triggers way too late in order
+ * to announce zero window in time before rmem limit kicks in.
+ */
+ if (free_space < (allowed_space >> 4) || free_space < mss)
return 0;
}
@@ -1971,7 +2335,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
/* Merge over control information. This moves PSH/FIN etc. over */
- TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
+ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
@@ -1989,22 +2353,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
}
/* Check if coalescing SKBs is legal. */
-static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
+static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
- return 0;
+ return false;
/* TODO: SACK collapsing could be used to remove this condition */
if (skb_shinfo(skb)->nr_frags != 0)
- return 0;
+ return false;
if (skb_cloned(skb))
- return 0;
+ return false;
if (skb == tcp_send_head(sk))
- return 0;
+ return false;
/* Some heurestics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
- return 0;
+ return false;
- return 1;
+ return true;
}
/* Collapse packets in the retransmit queue to make to create
@@ -2015,11 +2379,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = to, *tmp;
- int first = 1;
+ bool first = true;
if (!sysctl_tcp_retrans_collapse)
return;
- if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2029,7 +2393,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
space -= skb->len;
if (first) {
- first = 0;
+ first = false;
continue;
}
@@ -2038,7 +2402,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
/* Punt if not enough space exists in the first SKB for
* the data in the second
*/
- if (skb->len > skb_tailroom(to))
+ if (skb->len > skb_availroom(to))
break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
@@ -2052,7 +2416,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
* state updates are done by the caller. Returns non-zero if an
* error occurred which prevented the send.
*/
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2071,6 +2435,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
return -EAGAIN;
+ if (skb_still_in_host_queue(sk, skb))
+ return -EBUSY;
+
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
@@ -2093,12 +2460,14 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
return -EAGAIN;
if (skb->len > cur_mss) {
- if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+ if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
int oldpcount = tcp_skb_pcount(skb);
if (unlikely(oldpcount > 1)) {
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
tcp_init_tso_segs(sk, skb, cur_mss);
tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
}
@@ -2106,38 +2475,45 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
tcp_retrans_try_collapse(sk, skb, cur_mss);
- /* Some Solaris stacks overoptimize and ignore the FIN on a
- * retransmit when old data is attached. So strip it off
- * since it is cheap to do so and saves bytes on the network.
- */
- if (skb->len > 0 &&
- (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
- tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
- if (!pskb_trim(skb, 0)) {
- /* Reuse, even though it does some unnecessary work */
- tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
- TCP_SKB_CB(skb)->flags);
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
-
/* Make a copy, if the first transmission SKB clone we made
* is still in somebody's hands, else make a clone.
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ /* make sure skb->data is aligned on arches that require it
+ * and check if ack-trimming & collapsing extended the headroom
+ * beyond what csum_start can cover.
+ */
+ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
+ skb_headroom(skb) >= 0xFFFF)) {
+ struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
+ } else {
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ }
- if (err == 0) {
+ if (likely(!err)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
/* Update global TCP statistics. */
TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
-
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
tp->total_retrans++;
+ }
+ return err;
+}
+
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = __tcp_retransmit_skb(sk, skb);
+ if (err == 0) {
#if FASTRETRANS_DEBUG > 0
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- if (net_ratelimit())
- printk(KERN_DEBUG "retrans_out leaked.\n");
+ net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
if (!tp->retrans_out)
@@ -2149,31 +2525,35 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = TCP_SKB_CB(skb)->when;
- tp->undo_retrans++;
-
/* snd_nxt is stored to detect loss of retransmitted segment,
* see tcp_input.c tcp_sacktag_write_queue().
*/
TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+ } else if (err != -EBUSY) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
+
+ if (tp->undo_retrans < 0)
+ tp->undo_retrans = 0;
+ tp->undo_retrans += tcp_skb_pcount(skb);
return err;
}
/* Check if we forward retransmits are possible in the current
* window/congestion state.
*/
-static int tcp_can_forward_retransmit(struct sock *sk)
+static bool tcp_can_forward_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Forward retransmissions are possible only during Recovery. */
if (icsk->icsk_ca_state != TCP_CA_Recovery)
- return 0;
+ return false;
/* No forward retransmissions in Reno are possible. */
if (tcp_is_reno(tp))
- return 0;
+ return false;
/* Yeah, we have to make difficult choice between forward transmission
* and retransmission... Both ways have their merits...
@@ -2184,9 +2564,9 @@ static int tcp_can_forward_retransmit(struct sock *sk)
*/
if (tcp_may_send_now(sk))
- return 0;
+ return false;
- return 1;
+ return true;
}
/* This gets called after a retransmit timeout, and the initially
@@ -2278,8 +2658,12 @@ begin_fwd:
if (tcp_retransmit_skb(sk, skb))
return;
+
NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += tcp_skb_pcount(skb);
+
if (skb == tcp_write_queue_head(sk))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
@@ -2303,7 +2687,7 @@ void tcp_send_fin(struct sock *sk)
mss_now = tcp_current_mss(sk);
if (tcp_send_head(sk) != NULL) {
- TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
} else {
@@ -2365,11 +2749,11 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *skb;
skb = tcp_write_queue_head(sk);
- if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
- printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+ if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_debug("%s: wrong queue state\n", __func__);
return -EFAULT;
}
- if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
if (nskb == NULL)
@@ -2383,66 +2767,50 @@ int tcp_send_synack(struct sock *sk)
skb = nskb;
}
- TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
TCP_ECN_send_synack(tcp_sk(sk), skb);
}
TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
-/* Prepare a SYN-ACK. */
+/**
+ * tcp_make_synack - Prepare a SYN-ACK.
+ * sk: listener socket
+ * dst: dst entry attached to the SYNACK
+ * req: request_sock pointer
+ *
+ * Allocate one skb and build a SYNACK packet.
+ * @dst is consumed : Caller should not use it again.
+ */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct request_values *rvp)
+ struct tcp_fastopen_cookie *foc)
{
struct tcp_out_options opts;
- struct tcp_extend_values *xvp = tcp_xv(rvp);
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
- const struct tcp_cookie_values *cvp = tp->cookie_values;
struct tcphdr *th;
struct sk_buff *skb;
struct tcp_md5sig_key *md5;
int tcp_header_size;
int mss;
- int s_data_desired = 0;
- if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
- s_data_desired = cvp->s_data_desired;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
- if (skb == NULL)
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ dst_release(dst);
return NULL;
-
+ }
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb_dst_set(skb, dst_clone(dst));
+ skb_dst_set(skb, dst);
+ security_skb_owned_by(skb, sk);
- mss = dst_metric(dst, RTAX_ADVMSS);
+ mss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
mss = tp->rx_opt.user_mss;
- if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
- __u8 rcv_wscale;
- /* Set this up on the first call only */
- req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
-
- /* limit the window selection if the user enforce a smaller rx buffer */
- if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
- req->window_clamp = tcp_full_space(sk);
-
- /* tcp_full_space because it is guaranteed to be the first packet */
- tcp_select_initial_window(tcp_full_space(sk),
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
- ireq->wscale_ok,
- &rcv_wscale,
- dst_metric(dst, RTAX_INITRWND));
- ireq->rcv_wscale = rcv_wscale;
- }
-
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
@@ -2450,9 +2818,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
else
#endif
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_header_size = tcp_synack_options(sk, req, mss,
- skb, &opts, &md5, xvp)
- + sizeof(*th);
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+ foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2462,56 +2829,23 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->syn = 1;
th->ack = 1;
TCP_ECN_make_synack(req, th);
- th->source = ireq->loc_port;
- th->dest = ireq->rmt_port;
+ th->source = htons(ireq->ir_num);
+ th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
* not even correctly set)
*/
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
TCPHDR_SYN | TCPHDR_ACK);
- if (OPTION_COOKIE_EXTENSION & opts.options) {
- if (s_data_desired) {
- u8 *buf = skb_put(skb, s_data_desired);
-
- /* copy data directly from the listening socket. */
- memcpy(buf, cvp->s_data_payload, s_data_desired);
- TCP_SKB_CB(skb)->end_seq += s_data_desired;
- }
-
- if (opts.hash_size > 0) {
- __u32 workspace[SHA_WORKSPACE_WORDS];
- u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
- u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
-
- /* Secret recipe depends on the Timestamp, (future)
- * Sequence and Acknowledgment Numbers, Initiator
- * Cookie, and others handled by IP variant caller.
- */
- *tail-- ^= opts.tsval;
- *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
- *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
-
- /* recommended */
- *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
- *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
-
- sha_transform((__u32 *)&xvp->cookie_bakery[0],
- (char *)mess,
- &workspace[0]);
- opts.hash_location =
- (__u8 *)&xvp->cookie_bakery[0];
- }
- }
-
th->seq = htonl(TCP_SKB_CB(skb)->seq);
- th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
+ /* XXX data is queued and acked as is. No buffer/window check */
+ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), tp, &opts);
th->doff = (tcp_header_size >> 2);
- TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
@@ -2528,7 +2862,7 @@ EXPORT_SYMBOL(tcp_make_synack);
/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
{
- struct dst_entry *dst = __sk_dst_get(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
@@ -2552,7 +2886,7 @@ static void tcp_connect_init(struct sock *sk)
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
- tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ tp->advmss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
tp->advmss = tp->rx_opt.user_mss;
@@ -2581,15 +2915,134 @@ static void tcp_connect_init(struct sock *sk)
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
- tp->rcv_nxt = 0;
- tp->rcv_wup = 0;
- tp->copied_seq = 0;
+ tp->snd_nxt = tp->write_seq;
+
+ if (likely(!tp->repair))
+ tp->rcv_nxt = 0;
+ else
+ tp->rcv_tstamp = tcp_time_stamp;
+ tp->rcv_wup = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ tcb->end_seq += skb->len;
+ skb_header_release(skb);
+ __tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+ tp->write_seq = tcb->end_seq;
+ tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If no cookie is cached or another error occurs, fall back to sending a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_request *fo = tp->fastopen_req;
+ int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
+ struct sk_buff *syn_data = NULL, *data;
+ unsigned long last_syn_loss = 0;
+
+ tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
+ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+ &syn_loss, &last_syn_loss);
+ /* Recurring FO SYN losses: revert to regular handshake temporarily */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ fo->cookie.len = -1;
+ goto fallback;
+ }
+
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+ fo->cookie.len = -1;
+ else if (fo->cookie.len <= 0)
+ goto fallback;
+
+ /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+ * user-MSS. Reserve maximum option space for middleboxes that add
+ * private TCP options. The cost is reduced data space in SYN :(
+ */
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ MAX_TCP_OPTION_SPACE;
+
+ space = min_t(size_t, space, fo->size);
+
+ /* limit to order-0 allocations */
+ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
+
+ syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
+ sk->sk_allocation);
+ if (syn_data == NULL)
+ goto fallback;
+
+ for (i = 0; i < iovlen && syn_data->len < space; ++i) {
+ struct iovec *iov = &fo->data->msg_iov[i];
+ unsigned char __user *from = iov->iov_base;
+ int len = iov->iov_len;
+
+ if (syn_data->len + len > space)
+ len = space - syn_data->len;
+ else if (i + 1 == iovlen)
+ /* No more data pending in inet_wait_for_connect() */
+ fo->data = NULL;
+
+ if (skb_add_data(syn_data, from, len))
+ goto fallback;
+ }
+
+ /* Queue a data-only packet after the regular SYN for retransmission */
+ data = pskb_copy(syn_data, sk->sk_allocation);
+ if (data == NULL)
+ goto fallback;
+ TCP_SKB_CB(data)->seq++;
+ TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
+ TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
+ tcp_connect_queue_skb(sk, data);
+ fo->copied = data->len;
+
+ /* syn_data is about to be sent; take the current time stamp for the
+ * packets that are in the write queue: the SYN packet and the DATA packet
+ */
+ skb_mstamp_get(&syn->skb_mstamp);
+ data->skb_mstamp = syn->skb_mstamp;
+
+ if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+ tp->syn_data = (fo->copied > 0);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
+ goto done;
+ }
+ syn_data = NULL;
+
+fallback:
+ /* Send a regular SYN with Fast Open cookie request option */
+ if (fo->cookie.len > 0)
+ fo->cookie.len = 0;
+ err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+ if (err)
+ tp->syn_fastopen = 0;
+ kfree_skb(syn_data);
+done:
+ fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
+ return err;
+}
+
/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
@@ -2599,6 +3052,11 @@ int tcp_connect(struct sock *sk)
tcp_connect_init(sk);
+ if (unlikely(tp->repair)) {
+ tcp_finish_connect(sk, NULL);
+ return 0;
+ }
+
buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
@@ -2606,19 +3064,14 @@ int tcp_connect(struct sock *sk)
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
- tp->snd_nxt = tp->write_seq;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+ tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ tcp_connect_queue_skb(sk, buff);
TCP_ECN_send_syn(sk, buff);
- /* Send it off. */
- TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tp->retrans_stamp = TCP_SKB_CB(buff)->when;
- skb_header_release(buff);
- __tcp_add_write_queue_tail(sk, buff);
- sk->sk_wmem_queued += buff->truesize;
- sk_mem_charge(sk, buff->truesize);
- tp->packets_out += tcp_skb_pcount(buff);
- err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+ /* Send off SYN; include data in Fast Open. */
+ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
if (err == -ECONNREFUSED)
return err;
@@ -2660,8 +3113,9 @@ void tcp_send_delayed_ack(struct sock *sk)
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly.
*/
- if (tp->srtt) {
- int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+ if (tp->srtt_us) {
+ int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+ TCP_DELACK_MIN);
if (rtt < max_ato)
max_ato = rtt;
@@ -2705,7 +3159,7 @@ void tcp_send_ack(struct sock *sk)
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
if (buff == NULL) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -2720,7 +3174,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+ tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
}
/* This routine sends a packet with an out of date sequence
@@ -2740,7 +3194,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
if (skb == NULL)
return -1;
@@ -2755,6 +3209,14 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
+void tcp_send_window_probe(struct sock *sk)
+{
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_xmit_probe_skb(sk, 0);
+ }
+}
+
/* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk)
{
@@ -2780,13 +3242,13 @@ int tcp_write_wakeup(struct sock *sk)
if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
skb->len > mss) {
seg_size = min(seg_size, mss);
- TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
- if (tcp_fragment(sk, skb, seg_size, mss))
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(sk, skb, mss);
- TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err)
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 85ee7eb7e38..3b66610d415 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/socket.h>
@@ -36,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.1");
-static int port __read_mostly = 0;
+static int port __read_mostly;
MODULE_PARM_DESC(port, "Port to match (0=all)");
module_param(port, int, 0);
@@ -44,6 +46,10 @@ static unsigned int bufsize __read_mostly = 4096;
MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
module_param(bufsize, uint, 0);
+static unsigned int fwmark __read_mostly;
+MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
+module_param(fwmark, uint, 0);
+
static int full __read_mostly;
MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
module_param(full, int, 0);
@@ -52,12 +58,16 @@ static const char procname[] = "tcpprobe";
struct tcp_log {
ktime_t tstamp;
- __be32 saddr, daddr;
- __be16 sport, dport;
+ union {
+ struct sockaddr raw;
+ struct sockaddr_in v4;
+ struct sockaddr_in6 v6;
+ } src, dst;
u16 length;
u32 snd_nxt;
u32 snd_una;
u32 snd_wnd;
+ u32 rcv_wnd;
u32 snd_cwnd;
u32 ssthresh;
u32 srtt;
@@ -84,19 +94,29 @@ static inline int tcp_probe_avail(void)
return bufsize - tcp_probe_used() - 1;
}
+#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
+ do { \
+ si4.sin_family = AF_INET; \
+ si4.sin_port = inet->inet_##mem##port; \
+ si4.sin_addr.s_addr = inet->inet_##mem##addr; \
+ } while (0) \
+
+
/*
* Hook inserted to be called before each receive packet.
* Note: arguments must match tcp_rcv_established()!
*/
-static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_sock *inet = inet_sk(sk);
- /* Only update if port matches */
- if ((port == 0 || ntohs(inet->inet_dport) == port ||
- ntohs(inet->inet_sport) == port) &&
+ /* Only update if port or skb mark matches */
+ if (((port == 0 && fwmark == 0) ||
+ ntohs(inet->inet_dport) == port ||
+ ntohs(inet->inet_sport) == port ||
+ (fwmark > 0 && skb->mark == fwmark)) &&
(full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
spin_lock(&tcp_probe.lock);
@@ -105,17 +125,36 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcp_log *p = tcp_probe.log + tcp_probe.head;
p->tstamp = ktime_get();
- p->saddr = inet->inet_saddr;
- p->sport = inet->inet_sport;
- p->daddr = inet->inet_daddr;
- p->dport = inet->inet_dport;
+ switch (sk->sk_family) {
+ case AF_INET:
+ tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
+ tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
+ break;
+ case AF_INET6:
+ memset(&p->src.v6, 0, sizeof(p->src.v6));
+ memset(&p->dst.v6, 0, sizeof(p->dst.v6));
+#if IS_ENABLED(CONFIG_IPV6)
+ p->src.v6.sin6_family = AF_INET6;
+ p->src.v6.sin6_port = inet->inet_sport;
+ p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
+
+ p->dst.v6.sin6_family = AF_INET6;
+ p->dst.v6.sin6_port = inet->inet_dport;
+ p->dst.v6.sin6_addr = sk->sk_v6_daddr;
+#endif
+ break;
+ default:
+ BUG();
+ }
+
p->length = skb->len;
p->snd_nxt = tp->snd_nxt;
p->snd_una = tp->snd_una;
p->snd_cwnd = tp->snd_cwnd;
p->snd_wnd = tp->snd_wnd;
+ p->rcv_wnd = tp->rcv_wnd;
p->ssthresh = tcp_current_ssthresh(sk);
- p->srtt = tp->srtt >> 3;
+ p->srtt = tp->srtt_us >> 3;
tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
}
@@ -126,7 +165,6 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
jprobe_return();
- return 0;
}
static struct jprobe tcp_jprobe = {
@@ -136,7 +174,7 @@ static struct jprobe tcp_jprobe = {
.entry = jtcp_rcv_established,
};
-static int tcpprobe_open(struct inode * inode, struct file * file)
+static int tcpprobe_open(struct inode *inode, struct file *file)
{
/* Reset (empty) log */
spin_lock_bh(&tcp_probe.lock);
@@ -155,13 +193,11 @@ static int tcpprobe_sprint(char *tbuf, int n)
= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
return scnprintf(tbuf, n,
- "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
+ "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
(unsigned long) tv.tv_sec,
(unsigned long) tv.tv_nsec,
- &p->saddr, ntohs(p->sport),
- &p->daddr, ntohs(p->dport),
- p->length, p->snd_nxt, p->snd_una,
- p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt);
+ &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
+ p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
}
static ssize_t tcpprobe_read(struct file *file, char __user *buf,
@@ -174,7 +210,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
return -EINVAL;
while (cnt < len) {
- char tbuf[164];
+ char tbuf[256];
int width;
/* Wait for data in buffer */
@@ -221,6 +257,13 @@ static __init int tcpprobe_init(void)
{
int ret = -ENOMEM;
+ /* Warning: if the function signature of tcp_rcv_established
+ * has been changed, you also have to change the signature of
+ * jtcp_rcv_established, otherwise you end up right here!
+ */
+ BUILD_BUG_ON(__same_type(tcp_rcv_established,
+ jtcp_rcv_established) == 0);
+
init_waitqueue_head(&tcp_probe.wait);
spin_lock_init(&tcp_probe.lock);
@@ -232,17 +275,18 @@ static __init int tcpprobe_init(void)
if (!tcp_probe.log)
goto err0;
- if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops))
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
goto err0;
ret = register_jprobe(&tcp_jprobe);
if (ret)
goto err1;
- pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
+ pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
+ port, fwmark, bufsize);
return 0;
err1:
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
err0:
kfree(tcp_probe.log);
return ret;
@@ -251,7 +295,7 @@ module_init(tcpprobe_init);
static __exit void tcpprobe_exit(void)
{
- proc_net_remove(&init_net, procname);
+ remove_proc_entry(procname, init_net.proc_net);
unregister_jprobe(&tcp_jprobe);
kfree(tcp_probe.log);
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index a76513779e2..8250949b885 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -15,15 +15,15 @@
#define TCP_SCALABLE_AI_CNT 50U
#define TCP_SCALABLE_MD_SCALE 3
-static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else
tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
@@ -35,10 +35,9 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
}
-static struct tcp_congestion_ops tcp_scalable = {
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
.ssthresh = tcp_scalable_ssthresh,
.cong_avoid = tcp_scalable_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.owner = THIS_MODULE,
.name = "scalable",
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74a6aa00365..286227abed1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;
int sysctl_tcp_thin_linear_timeouts __read_mostly;
-static void tcp_write_timer(unsigned long);
-static void tcp_delack_timer(unsigned long);
-static void tcp_keepalive_timer (unsigned long data);
-
-void tcp_init_xmit_timers(struct sock *sk)
-{
- inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
- &tcp_keepalive_timer);
-}
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-
static void tcp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -77,10 +66,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
if (sk->sk_err_soft)
shift++;
- if (tcp_too_many_orphans(sk, shift)) {
- if (net_ratelimit())
- printk(KERN_INFO "Out of socket memory\n");
-
+ if (tcp_check_oom(sk, shift)) {
/* Catch exceptional cases, when connection requires reset.
* 1. Last segment was sent recently. */
if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
@@ -170,14 +156,21 @@ static bool retransmits_timed_out(struct sock *sk,
static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
int retry_until;
- bool do_reset, syn_set = 0;
+ bool do_reset, syn_set = false;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (icsk->icsk_retransmits)
+ if (icsk->icsk_retransmits) {
dst_negative_advice(sk);
+ if (tp->syn_fastopen || tp->syn_data)
+ tcp_fastopen_cache_set(sk, 0, NULL, true);
+ if (tp->syn_data)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
- syn_set = 1;
+ syn_set = true;
} else {
if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
/* Black hole detection */
@@ -208,21 +201,11 @@ static int tcp_write_timeout(struct sock *sk)
return 0;
}
-static void tcp_delack_timer(unsigned long data)
+void tcp_delack_timer_handler(struct sock *sk)
{
- struct sock *sk = (struct sock *)data;
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- icsk->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
- goto out_unlock;
- }
-
sk_mem_reclaim_partial(sk);
if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -259,12 +242,26 @@ static void tcp_delack_timer(unsigned long data)
tcp_send_ack(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
- TCP_CHECK_TIMER(sk);
out:
- if (tcp_memory_pressure)
+ if (sk_under_memory_pressure(sk))
sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_delack_timer_handler(sk);
+ } else {
+ inet_csk(sk)->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ /* delegate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
+ }
bh_unlock_sock(sk);
sock_put(sk);
}
@@ -315,6 +312,35 @@ static void tcp_probe_timer(struct sock *sk)
}
/*
+ * Timer for Fast Open socket to retransmit SYNACK. Note that the
+ * sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int max_retries = icsk->icsk_syn_retries ? :
+ sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+ struct request_sock *req;
+
+ req = tcp_sk(sk)->fastopen_rsk;
+ req->rsk_ops->syn_ack_timeout(sk, req);
+
+ if (req->num_timeout >= max_retries) {
+ tcp_write_err(sk);
+ return;
+ }
+ /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+ * returned from rtx_syn_ack() to make it more persistent like
+ * regular retransmit because if the child socket has been accepted
+ * it's not good to give up too easily.
+ */
+ inet_rtx_syn_ack(sk, req);
+ req->num_timeout++;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+}
+
+/*
* The TCP retransmit timer.
*/
@@ -323,11 +349,22 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ if (tp->fastopen_rsk) {
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+ tcp_fastopen_synack_timer(sk);
+ /* Before we receive ACK to our SYN-ACK don't retransmit
+ * anything else (e.g., data or FIN segments).
+ */
+ return;
+ }
if (!tp->packets_out)
goto out;
WARN_ON(tcp_write_queue_empty(sk));
+ tp->tlp_high_seq = 0;
+
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
/* Receiver dastardly shrinks window. Our retransmits
@@ -335,22 +372,21 @@ void tcp_retransmit_timer(struct sock *sk)
* connection. If the socket is an orphan, time it out,
* we cannot allow such beasts to hang infinitely.
*/
-#ifdef TCP_DEBUG
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &inet->inet_daddr, ntohs(inet->inet_dport),
- inet->inet_num, tp->snd_una, tp->snd_nxt);
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
+ &inet->inet_daddr,
+ ntohs(inet->inet_dport), inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &np->daddr, ntohs(inet->inet_dport),
- inet->inet_num, tp->snd_una, tp->snd_nxt);
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
+ &sk->sk_v6_daddr,
+ ntohs(inet->inet_dport), inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
}
#endif
-#endif
if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
tcp_write_err(sk);
goto out;
@@ -386,11 +422,7 @@ void tcp_retransmit_timer(struct sock *sk)
NET_INC_STATS_BH(sock_net(sk), mib_idx);
}
- if (tcp_use_frto(sk)) {
- tcp_enter_frto(sk);
- } else {
- tcp_enter_loss(sk, 0);
- }
+ tcp_enter_loss(sk, 0);
if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
/* Retransmission failed because of local congestion,
@@ -449,19 +481,11 @@ out_reset_timer:
out:;
}
-static void tcp_write_timer(unsigned long data)
+void tcp_write_timer_handler(struct sock *sk)
{
- struct sock *sk = (struct sock *)data;
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later */
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
- goto out_unlock;
- }
-
if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
goto out;
@@ -471,21 +495,40 @@ static void tcp_write_timer(unsigned long data)
}
event = icsk->icsk_pending;
- icsk->icsk_pending = 0;
switch (event) {
+ case ICSK_TIME_EARLY_RETRANS:
+ tcp_resume_early_retransmit(sk);
+ break;
+ case ICSK_TIME_LOSS_PROBE:
+ tcp_send_loss_probe(sk);
+ break;
case ICSK_TIME_RETRANS:
+ icsk->icsk_pending = 0;
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
+ icsk->icsk_pending = 0;
tcp_probe_timer(sk);
break;
}
- TCP_CHECK_TIMER(sk);
out:
sk_mem_reclaim(sk);
-out_unlock:
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_write_timer_handler(sk);
+ } else {
+ /* delegate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
+ }
bh_unlock_sock(sk);
sock_put(sk);
}
@@ -589,7 +632,6 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_when(tp) - elapsed;
}
- TCP_CHECK_TIMER(sk);
sk_mem_reclaim(sk);
resched:
@@ -603,3 +645,10 @@ out:
bh_unlock_sock(sk);
sock_put(sk);
}
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
+}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c6743eec9b7..9a5e05f27f4 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -163,13 +163,13 @@ static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
return min(tp->snd_ssthresh, tp->snd_cwnd-1);
}
-static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct vegas *vegas = inet_csk_ca(sk);
if (!vegas->doing_vegas_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
@@ -194,7 +194,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
u32 rtt, diff;
u64 target_cwnd;
@@ -243,7 +243,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
@@ -283,7 +283,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
}
/* Use normal slow start */
else if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
}
@@ -304,12 +304,10 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
-static struct tcp_congestion_ops tcp_vegas = {
- .flags = TCP_CONG_RTT_STAMP,
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_vegas_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.pkts_acked = tcp_vegas_pkts_acked,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 6c0eea2f824..0531b99d863 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -15,10 +15,10 @@ struct vegas {
u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
};
-extern void tcp_vegas_init(struct sock *sk);
-extern void tcp_vegas_state(struct sock *sk, u8 ca_state);
-extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
-extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+void tcp_vegas_init(struct sock *sk);
+void tcp_vegas_state(struct sock *sk, u8 ca_state);
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 38bc0b52d74..27b9825753d 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -114,18 +114,18 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
tcp_veno_init(sk);
}
-static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct veno *veno = inet_csk_ca(sk);
if (!veno->doing_veno_now) {
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
return;
}
/* limited by applications */
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
/* We do the Veno calculations only if we got enough rtt samples */
@@ -133,7 +133,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* We don't have enough rtt samples to do the Veno
* calculation, so we'll behave like Reno.
*/
- tcp_reno_cong_avoid(sk, ack, in_flight);
+ tcp_reno_cong_avoid(sk, ack, acked);
} else {
u64 target_cwnd;
u32 rtt;
@@ -152,7 +152,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
} else {
/* Congestion avoidance. */
if (veno->diff < beta) {
@@ -201,8 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
return max(tp->snd_cwnd >> 1U, 2U);
}
-static struct tcp_congestion_ops tcp_veno = {
- .flags = TCP_CONG_RTT_STAMP,
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
.init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh,
.cong_avoid = tcp_veno_cong_avoid,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index a534dda5456..b94a04ae2ed 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
break;
- case CA_EVENT_FRTO:
+ case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
/* Update RTT_min when next ack arrives */
w->reset_rtt_min = 1;
@@ -272,11 +272,10 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
}
-static struct tcp_congestion_ops tcp_westwood = {
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
.init = tcp_westwood_init,
.ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_reno_cong_avoid,
- .min_cwnd = tcp_westwood_bw_rttmin,
.cwnd_event = tcp_westwood_event,
.get_info = tcp_westwood_info,
.pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a0f24035889..599b79b8eac 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -3,7 +3,7 @@
* YeAH TCP
*
* For further details look at:
- * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
*
*/
#include <linux/mm.h>
@@ -15,13 +15,13 @@
#include "tcp_vegas.h"
-#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck
-#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt
-#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
-#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
-#define TCP_YEAH_PHY 8 //lin maximum delta from base
-#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss
-#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count
+#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */
+#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */
+#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */
+#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */
+#define TCP_YEAH_PHY 8 /* maximum delta from base */
+#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */
+#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */
#define TCP_SCALABLE_AI_CNT 100U
@@ -69,16 +69,16 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
}
-static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct yeah *yeah = inet_csk_ca(sk);
- if (!tcp_is_cwnd_limited(sk, in_flight))
+ if (!tcp_is_cwnd_limited(sk))
return;
if (tp->snd_cwnd <= tp->snd_ssthresh)
- tcp_slow_start(tp);
+ tcp_slow_start(tp, acked);
else if (!yeah->doing_reno_now) {
/* Scalable */
@@ -213,9 +213,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
if (yeah->doing_reno_now < TCP_YEAH_RHO) {
reduction = yeah->lastQ;
- reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) );
+ reduction = min(reduction, max(tp->snd_cwnd>>1, 2U));
- reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+ reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
} else
reduction = max(tp->snd_cwnd>>1, 2U);
@@ -225,12 +225,10 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
return tp->snd_cwnd - reduction;
}
-static struct tcp_congestion_ops tcp_yeah = {
- .flags = TCP_CONG_RTT_STAMP,
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh,
.cong_avoid = tcp_yeah_cong_avoid,
- .min_cwnd = tcp_reno_min_cwnd,
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
.get_info = tcp_vegas_get_info,
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index ac3b3ee4b07..0d017183062 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -105,7 +105,7 @@ drop:
return 0;
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
static int tunnel64_rcv(struct sk_buff *skb)
{
struct xfrm_tunnel *handler;
@@ -134,7 +134,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
break;
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
static void tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
@@ -152,7 +152,7 @@ static const struct net_protocol tunnel4_protocol = {
.netns_ok = 1,
};
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
static const struct net_protocol tunnel64_protocol = {
.handler = tunnel64_rcv,
.err_handler = tunnel64_err,
@@ -164,12 +164,12 @@ static const struct net_protocol tunnel64_protocol = {
static int __init tunnel4_init(void)
{
if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
- printk(KERN_ERR "tunnel4 init: can't add protocol\n");
+ pr_err("%s: can't add protocol\n", __func__);
return -EAGAIN;
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
- printk(KERN_ERR "tunnel64 init: can't add protocol\n");
+ pr_err("tunnel64 init: can't add protocol\n");
inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
return -EAGAIN;
}
@@ -179,12 +179,12 @@ static int __init tunnel4_init(void)
static void __exit tunnel4_fini(void)
{
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
- printk(KERN_ERR "tunnel64 close: can't remove protocol\n");
+ pr_err("tunnel64 close: can't remove protocol\n");
#endif
if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
- printk(KERN_ERR "tunnel4 close: can't remove protocol\n");
+ pr_err("tunnel4 close: can't remove protocol\n");
}
module_init(tunnel4_init);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b37181da487..7d5a8661df7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -77,7 +77,8 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/system.h>
+#define pr_fmt(fmt) "UDP: " fmt
+
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <linux/bootmem.h>
@@ -102,9 +103,14 @@
#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
+#include <net/inet_hashtables.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
+#include <trace/events/udp.h>
+#include <linux/static_key.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
#include "udp_impl.h"
struct udp_table udp_table __read_mostly;
@@ -135,6 +141,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
{
struct sock *sk2;
struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
sk_nulls_for_each(sk2, node, &hslot->head)
if (net_eq(sock_net(sk2), net) &&
@@ -143,6 +150,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
(*saddr_comp)(sk, sk2)) {
if (bitmap)
__set_bit(udp_sk(sk2)->udp_port_hash >> log,
@@ -165,6 +174,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
{
struct sock *sk2;
struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
@@ -175,6 +185,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
(*saddr_comp)(sk, sk2)) {
res = 1;
break;
@@ -189,7 +201,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
* @sk: socket struct in question
* @snum: port number to look up
* @saddr_comp: AF-dependent comparison of bound local IP addresses
- * @hash2_nulladdr: AF-dependant hash value in secondary hash chains,
+ * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
* with NULL address
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
@@ -204,14 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
if (!snum) {
int low, high, remaining;
- unsigned rand;
+ unsigned int rand;
unsigned short first, last;
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
- inet_get_local_port_range(&low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- rand = net_random();
+ rand = prandom_u32();
first = (((u64)rand * remaining) >> 32) + low;
/*
* force rand to be an odd multiple of UDP_HTABLE_SIZE
@@ -234,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
do {
if (low <= snum && snum <= high &&
!test_bit(snum >> udptable->log, bitmap) &&
- !inet_is_reserved_local_port(snum))
+ !inet_is_local_reserved_port(net, snum))
goto found;
snum += rand;
} while (snum != first);
@@ -333,26 +345,26 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
!ipv6_only_sock(sk)) {
struct inet_sock *inet = inet_sk(sk);
- score = (sk->sk_family == PF_INET ? 1 : 0);
+ score = (sk->sk_family == PF_INET ? 2 : 1);
if (inet->inet_rcv_saddr) {
if (inet->inet_rcv_saddr != daddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_dport) {
if (inet->inet_dport != sport)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
@@ -361,7 +373,6 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
/*
* In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
*/
-#define SCORE2_MAX (1 + 2 + 2 + 2)
static inline int compute_score2(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif)
@@ -376,26 +387,38 @@ static inline int compute_score2(struct sock *sk, struct net *net,
if (inet->inet_num != hnum)
return -1;
- score = (sk->sk_family == PF_INET ? 1 : 0);
+ score = (sk->sk_family == PF_INET ? 2 : 1);
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
- score += 2;
+ score += 4;
}
if (inet->inet_dport) {
if (inet->inet_dport != sport)
return -1;
- score += 2;
+ score += 4;
}
if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif)
return -1;
- score += 2;
+ score += 4;
}
}
return score;
}
+static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 udp_ehash_secret __read_mostly;
+
+ net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ udp_ehash_secret + net_hash_mix(net));
+}
+
/* called with read_rcu_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
@@ -405,19 +428,29 @@ static struct sock *udp4_lib_lookup2(struct net *net,
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
- int score, badness;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
begin:
result = NULL;
- badness = -1;
+ badness = 0;
udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
score = compute_score2(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
result = sk;
badness = score;
- if (score == SCORE2_MAX)
- goto exact_match;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (((u64)hash * matches) >> 32 == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
}
}
/*
@@ -427,9 +460,7 @@ begin:
*/
if (get_nulls_value(node) != slot2)
goto begin;
-
if (result) {
-exact_match:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -444,7 +475,7 @@ exact_match:
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
* harder than this. -DaveM
*/
-static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport,
int dif, struct udp_table *udptable)
{
@@ -453,7 +484,8 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
- int score, badness;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
rcu_read_lock();
if (hslot->count > 10) {
@@ -482,13 +514,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
}
begin:
result = NULL;
- badness = -1;
+ badness = 0;
sk_nulls_for_each_rcu(sk, node, &hslot->head) {
score = compute_score(sk, net, saddr, hnum, sport,
daddr, dport, dif);
if (score > badness) {
result = sk;
badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (((u64)hash * matches) >> 32 == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
}
}
/*
@@ -511,20 +554,17 @@ begin:
rcu_read_unlock();
return result;
}
+EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport,
struct udp_table *udptable)
{
- struct sock *sk;
const struct iphdr *iph = ip_hdr(skb);
- if (unlikely(sk = skb_steal_sock(skb)))
- return sk;
- else
- return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
- iph->daddr, dport, inet_iif(skb),
- udptable);
+ return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+ iph->daddr, dport, inet_iif(skb),
+ udptable);
}
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
@@ -534,6 +574,26 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif, unsigned short hnum)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(sk) ||
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ return false;
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ return false;
+ return true;
+}
+
static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
@@ -544,20 +604,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
unsigned short hnum = ntohs(loc_port);
sk_nulls_for_each_from(s, node) {
- struct inet_sock *inet = inet_sk(s);
-
- if (!net_eq(sock_net(s), net) ||
- udp_sk(s)->udp_port_hash != hnum ||
- (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
- (inet->inet_dport != rmt_port && inet->inet_dport) ||
- (inet->inet_rcv_saddr &&
- inet->inet_rcv_saddr != loc_addr) ||
- ipv6_only_sock(s) ||
- (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
- continue;
- if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
- continue;
- goto found;
+ if (__udp_is_mcast_sock(net, s,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum))
+ goto found;
}
s = NULL;
found:
@@ -578,7 +629,7 @@ found:
void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
struct inet_sock *inet;
- struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
@@ -611,6 +662,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
break;
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
if (inet->pmtudisc != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
@@ -624,6 +676,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
err = icmp_err_convert[code].errno;
}
break;
+ case ICMP_REDIRECT:
+ ipv4_sk_redirect(skb, sk);
+ goto out;
}
/*
@@ -663,97 +718,132 @@ void udp_flush_pending_frames(struct sock *sk)
EXPORT_SYMBOL(udp_flush_pending_frames);
/**
- * udp4_hwcsum_outgoing - handle outgoing HW checksumming
- * @sk: socket we are sending on
+ * udp4_hwcsum - handle outgoing HW checksumming
* @skb: sk_buff containing the filled-in UDP header
* (checksum field must be zeroed out)
+ * @src: source IP address
+ * @dst: destination IP address
*/
-static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, int len)
+void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{
- unsigned int offset;
struct udphdr *uh = udp_hdr(skb);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ int hlen = len;
__wsum csum = 0;
- if (skb_queue_len(&sk->sk_write_queue) == 1) {
+ if (!skb_has_frag_list(skb)) {
/*
* Only one fragment on the socket.
*/
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
- uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
+ uh->check = ~csum_tcpudp_magic(src, dst, len,
+ IPPROTO_UDP, 0);
} else {
+ struct sk_buff *frags;
+
/*
* HW-checksum won't work as there are two or more
* fragments on the socket so that all csums of sk_buffs
* should be together
*/
- offset = skb_transport_offset(skb);
- skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ skb_walk_frags(skb, frags) {
+ csum = csum_add(csum, frags->csum);
+ hlen -= frags->len;
+ }
+ csum = skb_checksum(skb, offset, hlen, csum);
skb->ip_summed = CHECKSUM_NONE;
- skb_queue_walk(&sk->sk_write_queue, skb) {
- csum = csum_add(csum, skb->csum);
- }
-
uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
}
}
+EXPORT_SYMBOL_GPL(udp4_hwcsum);
-/*
- * Push out all pending data as one UDP datagram. Socket is locked.
+/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
+ * for the simple case like when setting the checksum for a UDP tunnel.
*/
-static int udp_push_pending_frames(struct sock *sk)
+void udp_set_csum(bool nocheck, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr, int len)
{
- struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck)
+ uh->check = 0;
+ else if (skb_is_gso(skb))
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ else if (skb_dst(skb) && skb_dst(skb)->dev &&
+ (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ } else {
+ __wsum csum;
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, len, 0);
+ uh->check = udp_v4_check(len, saddr, daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+}
+EXPORT_SYMBOL(udp_set_csum);
+
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+{
+ struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
- struct flowi *fl = &inet->cork.fl;
- struct sk_buff *skb;
struct udphdr *uh;
int err = 0;
int is_udplite = IS_UDPLITE(sk);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
__wsum csum = 0;
- /* Grab the skbuff where UDP header space exists. */
- if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
- goto out;
-
/*
* Create a UDP header
*/
uh = udp_hdr(skb);
- uh->source = fl->fl_ip_sport;
- uh->dest = fl->fl_ip_dport;
- uh->len = htons(up->len);
+ uh->source = inet->inet_sport;
+ uh->dest = fl4->fl4_dport;
+ uh->len = htons(len);
uh->check = 0;
if (is_udplite) /* UDP-Lite */
- csum = udplite_csum_outgoing(sk, skb);
+ csum = udplite_csum(skb);
- else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
+ else if (sk->sk_no_check_tx) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
- udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len);
+ udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
goto send;
- } else /* `normal' UDP */
- csum = udp_csum_outgoing(sk, skb);
+ } else
+ csum = udp_csum(skb);
/* add protocol-dependent pseudo-header */
- uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
+ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
sk->sk_protocol, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
send:
- err = ip_push_pending_frames(sk);
+ err = ip_send_skb(sock_net(sk), skb);
if (err) {
if (err == -ENOBUFS && !inet->recverr) {
UDP_INC_STATS_USER(sock_net(sk),
@@ -763,17 +853,40 @@ send:
} else
UDP_INC_STATS_USER(sock_net(sk),
UDP_MIB_OUTDATAGRAMS, is_udplite);
+ return err;
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+int udp_push_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = ip_finish_skb(sk, fl4);
+ if (!skb)
+ goto out;
+
+ err = udp_send_skb(skb, fl4);
+
out:
up->len = 0;
up->pending = 0;
return err;
}
+EXPORT_SYMBOL(udp_push_pending_frames);
int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
+ struct flowi4 fl4_stack;
+ struct flowi4 *fl4;
int ulen = len;
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
@@ -785,6 +898,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int err, is_udplite = IS_UDPLITE(sk);
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ struct sk_buff *skb;
+ struct ip_options_data opt_copy;
if (len > 0xFFFF)
return -EMSGSIZE;
@@ -798,7 +913,12 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.opt = NULL;
ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+ getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+
+ fl4 = &inet->cork.fl.u.ip4;
if (up->pending) {
/*
* There are pending frames.
@@ -820,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
* Get and verify the address.
*/
if (msg->msg_name) {
- struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
if (usin->sin_family != AF_INET) {
@@ -845,33 +965,44 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
- err = sock_tx_timestamp(sk, &ipc.tx_flags);
- if (err)
- return err;
+
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc,
+ sk->sk_family == AF_INET6);
if (err)
return err;
if (ipc.opt)
free = 1;
connected = 0;
}
- if (!ipc.opt)
- ipc.opt = inet->opt;
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
saddr = ipc.addr;
ipc.addr = faddr = daddr;
- if (ipc.opt && ipc.opt->srr) {
+ if (ipc.opt && ipc.opt->opt.srr) {
if (!daddr)
return -EINVAL;
- faddr = ipc.opt->faddr;
+ faddr = ipc.opt->opt.faddr;
connected = 0;
}
- tos = RT_TOS(inet->tos);
+ tos = get_rttos(&ipc, inet);
if (sock_flag(sk, SOCK_LOCALROUTE) ||
(msg->msg_flags & MSG_DONTROUTE) ||
- (ipc.opt && ipc.opt->is_strictroute)) {
+ (ipc.opt && ipc.opt->opt.is_strictroute)) {
tos |= RTO_ONLINK;
connected = 0;
}
@@ -882,28 +1013,28 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (!saddr)
saddr = inet->mc_addr;
connected = 0;
- }
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
if (connected)
rt = (struct rtable *)sk_dst_check(sk, 0);
if (rt == NULL) {
- struct flowi fl = { .oif = ipc.oif,
- .mark = sk->sk_mark,
- .fl4_dst = faddr,
- .fl4_src = saddr,
- .fl4_tos = tos,
- .proto = sk->sk_protocol,
- .flags = inet_sk_flowi_flags(sk),
- .fl_ip_sport = inet->inet_sport,
- .fl_ip_dport = dport };
struct net *net = sock_net(sk);
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(net, &rt, &fl, sk, 1);
- if (err) {
+ fl4 = &fl4_stack;
+ flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE, sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ faddr, saddr, dport, inet->inet_sport);
+
+ security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
@@ -919,9 +1050,20 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
goto do_confirm;
back_from_confirm:
- saddr = rt->rt_src;
+ saddr = fl4->saddr;
if (!ipc.addr)
- daddr = ipc.addr = rt->rt_dst;
+ daddr = ipc.addr = fl4->daddr;
+
+ /* Lockless fast path for the non-corking case. */
+ if (!corkreq) {
+ skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ msg->msg_flags);
+ err = PTR_ERR(skb);
+ if (!IS_ERR_OR_NULL(skb))
+ err = udp_send_skb(skb, fl4);
+ goto out;
+ }
lock_sock(sk);
if (unlikely(up->pending)) {
@@ -929,25 +1071,25 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n"));
err = -EINVAL;
goto out;
}
/*
* Now cork the socket to pend data.
*/
- inet->cork.fl.fl4_dst = daddr;
- inet->cork.fl.fl_ip_dport = dport;
- inet->cork.fl.fl4_src = saddr;
- inet->cork.fl.fl_ip_sport = inet->inet_sport;
+ fl4 = &inet->cork.fl.u.ip4;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->fl4_dport = dport;
+ fl4->fl4_sport = inet->inet_sport;
up->pending = AF_INET;
do_append_data:
up->len += ulen;
- getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
- err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
- sizeof(struct udphdr), &ipc, &rt,
- corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+ err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_flush_pending_frames(sk);
else if (!corkreq)
@@ -987,9 +1129,13 @@ EXPORT_SYMBOL(udp_sendmsg);
int udp_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
+ struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
int ret;
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
if (!up->pending) {
struct msghdr msg = { .msg_flags = flags|MSG_MORE };
@@ -1007,11 +1153,12 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
if (unlikely(!up->pending)) {
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n"));
return -EINVAL;
}
- ret = ip_append_page(sk, page, offset, size, flags);
+ ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
+ page, offset, size, flags);
if (ret == -EOPNOTSUPP) {
release_sock(sk);
return sock_no_sendpage(sk->sk_socket, page, offset,
@@ -1051,6 +1198,8 @@ static unsigned int first_packet_length(struct sock *sk)
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
@@ -1116,33 +1265,28 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int noblock, int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
- unsigned int ulen;
- int peeked;
+ unsigned int ulen, copied;
+ int peeked, off = 0;
int err;
int is_udplite = IS_UDPLITE(sk);
bool slow;
- /*
- * Check any passed addresses
- */
- if (addr_len)
- *addr_len = sizeof(*sin);
-
if (flags & MSG_ERRQUEUE)
- return ip_recv_error(sk, msg, len);
+ return ip_recv_error(sk, msg, len, addr_len);
try_again:
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &err);
+ &peeked, &off, &err);
if (!skb)
goto out;
ulen = skb->len - sizeof(struct udphdr);
- if (len > ulen)
- len = ulen;
- else if (len < ulen)
+ copied = len;
+ if (copied > ulen)
+ copied = ulen;
+ else if (copied < ulen)
msg->msg_flags |= MSG_TRUNC;
/*
@@ -1151,14 +1295,14 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
if (udp_lib_checksum_complete(skb))
goto csum_copy_err;
}
if (skb_csum_unnecessary(skb))
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
- msg->msg_iov, len);
+ msg->msg_iov, copied);
else {
err = skb_copy_and_csum_datagram_iovec(skb,
sizeof(struct udphdr),
@@ -1168,8 +1312,15 @@ try_again:
goto csum_copy_err;
}
- if (err)
+ if (unlikely(err)) {
+ trace_kfree_skb(skb, udp_recvmsg);
+ if (!peeked) {
+ atomic_inc(&sk->sk_drops);
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
+ }
goto out_free;
+ }
if (!peeked)
UDP_INC_STATS_USER(sock_net(sk),
@@ -1183,11 +1334,12 @@ try_again:
sin->sin_port = udp_hdr(skb)->source;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
- err = len;
+ err = copied;
if (flags & MSG_TRUNC)
err = ulen;
@@ -1198,12 +1350,17 @@ out:
csum_copy_err:
slow = lock_sock_fast(sk);
- if (!skb_kill_datagram(sk, skb, flags))
+ if (!skb_kill_datagram(sk, skb, flags)) {
+ UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ }
unlock_sock_fast(sk, slow);
if (noblock)
return -EAGAIN;
+
+ /* starting over for a new packet */
+ msg->msg_flags &= ~MSG_TRUNC;
goto try_again;
}
@@ -1218,7 +1375,7 @@ int udp_disconnect(struct sock *sk, int flags)
sk->sk_state = TCP_CLOSE;
inet->inet_daddr = 0;
inet->inet_dport = 0;
- sock_rps_save_rxhash(sk, 0);
+ sock_rps_reset_rxhash(sk);
sk->sk_bound_dev_if = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
@@ -1305,10 +1462,12 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
- if (inet_sk(sk)->inet_daddr)
- sock_rps_save_rxhash(sk, skb->rxhash);
+ if (inet_sk(sk)->inet_daddr) {
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ }
- rc = ip_queue_rcv_skb(sk, skb);
+ rc = sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1318,6 +1477,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
is_udplite);
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
+ trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
}
@@ -1325,6 +1485,14 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
+static struct static_key udp_encap_needed __read_mostly;
+void udp_encap_enable(void)
+{
+ if (!static_key_enabled(&udp_encap_needed))
+ static_key_slow_inc(&udp_encap_needed);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
/* returns:
* -1: error
* 0: success
@@ -1346,7 +1514,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
nf_reset(skb);
- if (up->encap_type) {
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
/*
* This is an encapsulation socket so pass the skb to
* the socket's udp_encap_rcv() hook. Otherwise, just
@@ -1359,11 +1529,15 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
/* if we're overly short, let UDP handle it */
- if (skb->len > sizeof(struct udphdr) &&
- up->encap_rcv != NULL) {
+ encap_rcv = ACCESS_ONCE(up->encap_rcv);
+ if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) {
int ret;
- ret = (*up->encap_rcv)(sk, skb);
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ ret = encap_rcv(sk, skb);
if (ret <= 0) {
UDP_INC_STATS_BH(sock_net(sk),
UDP_MIB_INDATAGRAMS,
@@ -1392,9 +1566,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* provided by the application."
*/
if (up->pcrlen == 0) { /* full coverage was set */
- LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
- "%d while full coverage %d requested\n",
- UDP_SKB_CB(skb)->cscov, skb->len);
+ LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
/* The next case involves violating the min. coverage requested
@@ -1404,28 +1577,30 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* Therefore the above ...()->partial_cov statement is essential.
*/
if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
- LIMIT_NETDEBUG(KERN_WARNING
- "UDPLITE: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, up->pcrlen);
goto drop;
}
}
- if (rcu_dereference_raw(sk->sk_filter)) {
- if (udp_lib_checksum_complete(skb))
- goto drop;
- }
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
- if (sk_rcvqueues_full(sk, skb))
+ if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
goto drop;
+ }
rc = 0;
+ ipv4_pktinfo_prepare(sk, skb);
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
- else if (sk_add_backlog(sk, skb)) {
+ else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
bh_unlock_sock(sk);
goto drop;
}
@@ -1433,6 +1608,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return rc;
+csum_error:
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
@@ -1468,6 +1645,18 @@ static void flush_stack(struct sock **stack, unsigned int count,
kfree_skb(skb1);
}
+/* For TCP sockets, sk_rx_dst is protected by socket lock
+ * For UDP, we use xchg() to guard against concurrent changes.
+ */
+static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+ struct dst_entry *old;
+
+ dst_hold(dst);
+ old = xchg(&sk->sk_rx_dst, dst);
+ dst_release(old);
+}
+
/*
* Multicasts and broadcasts go to each listener.
*
@@ -1528,7 +1717,6 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
int proto)
{
- const struct iphdr *iph;
int err;
UDP_SKB_CB(skb)->partial_cov = 0;
@@ -1540,22 +1728,8 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return err;
}
- iph = ip_hdr(skb);
- if (uh->check == 0) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
- if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- proto, skb->csum))
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
- if (!skb_csum_unnecessary(skb))
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len, proto, 0);
- /* Probably, we should checksum udp header (it should be in cache
- * in any case) and data in tiny packets (< rx copybreak).
- */
-
- return 0;
+ return skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
}
/*
@@ -1596,14 +1770,34 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
- return __udp4_lib_mcast_deliver(net, skb, uh,
- saddr, daddr, udptable);
+ sk = skb_steal_sock(skb);
+ if (sk) {
+ struct dst_entry *dst = skb_dst(skb);
+ int ret;
- sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ if (unlikely(sk->sk_rx_dst != dst))
+ udp_sk_rx_dst_set(sk, dst);
+
+ ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+ } else {
+ if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return __udp4_lib_mcast_deliver(net, skb, uh,
+ saddr, daddr, udptable);
+
+ sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ }
if (sk != NULL) {
- int ret = udp_queue_rcv_skb(sk, skb);
+ int ret;
+
+ ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
@@ -1634,13 +1828,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
short_packet:
LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
- proto == IPPROTO_UDPLITE ? "-Lite" : "",
- &saddr,
- ntohs(uh->source),
- ulen,
- skb->len,
- &daddr,
- ntohs(uh->dest));
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source),
+ ulen, skb->len,
+ &daddr, ntohs(uh->dest));
goto drop;
csum_error:
@@ -1649,18 +1840,152 @@ csum_error:
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
- proto == IPPROTO_UDPLITE ? "-Lite" : "",
- &saddr,
- ntohs(uh->source),
- &daddr,
- ntohs(uh->dest),
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
ulen);
+ UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
kfree_skb(skb);
return 0;
}
+/* We can only early demux multicast if there is a single matching socket.
+ * If more than one socket is found, return NULL.
+ */
+static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+ struct udp_hslot *hslot = &udp_table.hash[slot];
+
+ /* Do not bother scanning a too big list */
+ if (hslot->count > 10)
+ return NULL;
+
+ rcu_read_lock();
+begin:
+ count = 0;
+ result = NULL;
+ sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum)) {
+ result = sk;
+ ++count;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ if (result) {
+ if (count != 1 ||
+ unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!__udp_is_mcast_sock(net, result,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+/* For unicast we should only early demux connected sockets or we can
+ * break forwarding setups. The chains here can be long so only check
+ * if the first socket is an exact match and if not move on.
+ */
+static struct sock *__udp4_lib_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+ unsigned int slot2 = hash2 & udp_table.mask;
+ struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+ INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
+ const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+ rcu_read_lock();
+ result = NULL;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ if (INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr, ports, dif))
+ result = sk;
+ /* Only check first socket in chain */
+ break;
+ }
+
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr,
+ ports, dif))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+void udp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph;
+ const struct udphdr *uh;
+ struct sock *sk;
+ struct dst_entry *dst;
+ int dif = skb->dev->ifindex;
+
+ /* validate the packet */
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ if (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST)
+ sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ else if (skb->pkt_type == PACKET_HOST)
+ sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ else
+ return;
+
+ if (!sk)
+ return;
+
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ dst = sk->sk_rx_dst;
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst)
+ skb_dst_set_noref(skb, dst);
+}
+
int udp_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
@@ -1668,9 +1993,16 @@ int udp_rcv(struct sk_buff *skb)
void udp_destroy_sock(struct sock *sk)
{
+ struct udp_sock *up = udp_sk(sk);
bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
}
/*
@@ -1681,7 +2013,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
int (*push_pending_frames)(struct sock *))
{
struct udp_sock *up = udp_sk(sk);
- int val;
+ int val, valbool;
int err = 0;
int is_udplite = IS_UDPLITE(sk);
@@ -1691,6 +2023,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
if (get_user(val, (int __user *)optval))
return -EFAULT;
+ valbool = val ? 1 : 0;
+
switch (optname) {
case UDP_CORK:
if (val != 0) {
@@ -1712,6 +2046,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
+ udp_encap_enable();
break;
default:
err = -ENOPROTOOPT;
@@ -1719,6 +2054,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
}
break;
+ case UDP_NO_CHECK6_TX:
+ up->no_check6_tx = valbool;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ up->no_check6_rx = valbool;
+ break;
+
/*
* UDP-Lite's partial checksum coverage (RFC 3828).
*/
@@ -1801,6 +2144,14 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
val = up->encap_type;
break;
+ case UDP_NO_CHECK6_TX:
+ val = up->no_check6_tx;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ val = up->no_check6_rx;
+ break;
+
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
@@ -1858,6 +2209,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
unsigned int mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* Check for false positives due to checksum errors */
if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
!(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
@@ -1882,6 +2235,7 @@ struct proto udp_prot = {
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.backlog_rcv = __udp_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
@@ -1897,6 +2251,7 @@ struct proto udp_prot = {
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
+ .clear_sk = sk_prot_clear_portaddr_nulls,
};
EXPORT_SYMBOL(udp_prot);
@@ -1987,9 +2342,9 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
}
-static int udp_seq_open(struct inode *inode, struct file *file)
+int udp_seq_open(struct inode *inode, struct file *file)
{
- struct udp_seq_afinfo *afinfo = PDE(inode)->data;
+ struct udp_seq_afinfo *afinfo = PDE_DATA(inode);
struct udp_iter_state *s;
int err;
@@ -2003,6 +2358,7 @@ static int udp_seq_open(struct inode *inode, struct file *file)
s->udp_table = afinfo->udp_table;
return err;
}
+EXPORT_SYMBOL(udp_seq_open);
/* ------------------------------------------------------------------------ */
int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
@@ -2010,17 +2366,12 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
struct proc_dir_entry *p;
int rc = 0;
- afinfo->seq_fops.open = udp_seq_open;
- afinfo->seq_fops.read = seq_read;
- afinfo->seq_fops.llseek = seq_lseek;
- afinfo->seq_fops.release = seq_release_net;
-
afinfo->seq_ops.start = udp_seq_start;
afinfo->seq_ops.next = udp_seq_next;
afinfo->seq_ops.stop = udp_seq_stop;
p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
- &afinfo->seq_fops, afinfo);
+ afinfo->seq_fops, afinfo);
if (!p)
rc = -ENOMEM;
return rc;
@@ -2029,13 +2380,13 @@ EXPORT_SYMBOL(udp_proc_register);
void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
{
- proc_net_remove(net, afinfo->name);
+ remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(udp_proc_unregister);
/* ------------------------------------------------------------------------ */
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
- int bucket, int *len)
+ int bucket)
{
struct inet_sock *inet = inet_sk(sp);
__be32 dest = inet->inet_daddr;
@@ -2044,40 +2395,47 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
- 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops), len);
+ atomic_read(&sp->sk_drops));
}
int udp4_seq_show(struct seq_file *seq, void *v)
{
+ seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_printf(seq, "%-127s\n",
- " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode ref pointer drops");
else {
struct udp_iter_state *state = seq->private;
- int len;
- udp4_format_sock(v, seq, state->bucket, &len);
- seq_printf(seq, "%*s\n", 127 - len, "");
+ udp4_format_sock(v, seq, state->bucket);
}
+ seq_pad(seq, '\n');
return 0;
}
+static const struct file_operations udp_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
/* ------------------------------------------------------------------------ */
static struct udp_seq_afinfo udp4_seq_afinfo = {
.name = "udp",
.family = AF_INET,
.udp_table = &udp_table,
- .seq_fops = {
- .owner = THIS_MODULE,
- },
+ .seq_fops = &udp_afinfo_seq_fops,
.seq_ops = {
.show = udp4_seq_show,
},
@@ -2112,9 +2470,15 @@ void udp4_proc_exit(void)
static __initdata unsigned long uhash_entries;
static int __init set_uhash_entries(char *str)
{
+ ssize_t ret;
+
if (!str)
return 0;
- uhash_entries = simple_strtoul(str, &str, 0);
+
+ ret = kstrtoul(str, 0, &uhash_entries);
+ if (ret)
+ return 0;
+
if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
uhash_entries = UDP_HTABLE_SIZE_MIN;
return 1;
@@ -2125,26 +2489,16 @@ void __init udp_table_init(struct udp_table *table, const char *name)
{
unsigned int i;
- if (!CONFIG_BASE_SMALL)
- table->hash = alloc_large_system_hash(name,
- 2 * sizeof(struct udp_hslot),
- uhash_entries,
- 21, /* one slot per 2 MB */
- 0,
- &table->log,
- &table->mask,
- 64 * 1024);
- /*
- * Make sure hash table has the minimum size
- */
- if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
- table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
- 2 * sizeof(struct udp_hslot), GFP_KERNEL);
- if (!table->hash)
- panic(name);
- table->log = ilog2(UDP_HTABLE_SIZE_MIN);
- table->mask = UDP_HTABLE_SIZE_MIN - 1;
- }
+ table->hash = alloc_large_system_hash(name,
+ 2 * sizeof(struct udp_hslot),
+ uhash_entries,
+ 21, /* one slot per 2 MB */
+ 0,
+ &table->log,
+ &table->mask,
+ UDP_HTABLE_SIZE_MIN,
+ 64 * 1024);
+
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
@@ -2160,16 +2514,10 @@ void __init udp_table_init(struct udp_table *table, const char *name)
void __init udp_init(void)
{
- unsigned long nr_pages, limit;
+ unsigned long limit;
udp_table_init(&udp_table, "UDP");
- /* Set the pressure threshold up by the same strategy of TCP. It is a
- * fraction of global memory that is up to 1/2 at 256 MB, decreasing
- * toward zero with the amount of memory, with a floor of 128 pages.
- */
- nr_pages = totalram_pages - totalhigh_pages;
- limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
- limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+ limit = nr_free_buffer_pages() / 8;
limit = max(limit, 128UL);
sysctl_udp_mem[0] = limit / 4 * 3;
sysctl_udp_mem[1] = limit;
@@ -2179,64 +2527,78 @@ void __init udp_init(void)
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
}
-int udp4_ufo_send_check(struct sk_buff *skb)
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
- const struct iphdr *iph;
- struct udphdr *uh;
-
- if (!pskb_may_pull(skb, sizeof(*uh)))
- return -EINVAL;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ __be16 protocol = skb->protocol;
+ netdev_features_t enc_features;
+ int udp_offset, outer_hlen;
+ unsigned int oldlen;
+ bool need_csum;
+
+ oldlen = (u16)~skb->len;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
- iph = ip_hdr(skb);
- uh = udp_hdr(skb);
+ skb->encapsulation = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = htons(ETH_P_TEB);
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
+ if (need_csum)
+ skb->encap_hdr_csum = 1;
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
- uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
- IPPROTO_UDP, 0);
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- skb->ip_summed = CHECKSUM_PARTIAL;
- return 0;
-}
+ outer_hlen = skb_tnl_header_len(skb);
+ udp_offset = outer_hlen - tnl_hlen;
+ skb = segs;
+ do {
+ struct udphdr *uh;
+ int len;
-struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
-{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- unsigned int mss;
- int offset;
- __wsum csum;
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
+ skb->mac_len = mac_len;
- if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
- /* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
+ skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ len = skb->len - udp_offset;
+ uh = udp_hdr(skb);
+ uh->len = htons(len);
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
- !(type & (SKB_GSO_UDP))))
- goto out;
+ if (need_csum) {
+ __be32 delta = htonl(oldlen + len);
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+ uh->check = ~csum_fold((__force __wsum)
+ ((__force u32)uh->check +
+ (__force u32)delta));
+ uh->check = gso_make_checksum(skb, ~uh->check);
- segs = NULL;
- goto out;
- }
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
- /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
- * do checksum of UDP packets sent as multiple IP fragments.
- */
- offset = skb->csum_start - skb_headroom(skb);
- csum = skb_checksum(skb, offset, skb->len - offset, 0);
- offset += skb->csum_offset;
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
- skb->ip_summed = CHECKSUM_NONE;
-
- /* Fragment the skb. IP headers of the fragments are updated in
- * inet_gso_segment()
- */
- segs = skb_segment(skb, features);
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
out:
return segs;
}
-
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
new file mode 100644
index 00000000000..7927db0a927
--- /dev/null
+++ b/net/ipv4/udp_diag.c
@@ -0,0 +1,216 @@
+/*
+ * udp_diag.c Module for monitoring UDP and UDP-Lite transport protocol sockets.
+ *
+ * Authors: Pavel Emelyanov, <xemul@parallels.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/udp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/sock_diag.h>
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb, struct inet_diag_req_v2 *req,
+ struct nlattr *bc)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, req,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
+{
+ int err = -EINVAL;
+ struct sock *sk;
+ struct sk_buff *rep;
+ struct net *net = sock_net(in_skb->sk);
+
+ if (req->sdiag_family == AF_INET)
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_dst[0], req->id.idiag_dport,
+ req->id.idiag_if, tbl);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6)
+ sk = __udp6_lib_lookup(net,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ req->id.idiag_if, tbl);
+#endif
+ else
+ goto out_nosk;
+
+ err = -ENOENT;
+ if (sk == NULL)
+ goto out_nosk;
+
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
+ goto out;
+
+ err = -ENOMEM;
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = inet_sk_diag_fill(sk, NULL, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out:
+ if (sk)
+ sock_put(sk);
+out_nosk:
+ return err;
+}
+
+static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ int num, s_num, slot, s_slot;
+ struct net *net = sock_net(skb->sk);
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) {
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ struct udp_hslot *hslot = &table->hash[slot];
+
+ if (hlist_nulls_empty(&hslot->head))
+ continue;
+
+ spin_lock_bh(&hslot->lock);
+ sk_nulls_for_each(sk, node, &hslot->head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ spin_unlock_bh(&hslot->lock);
+ goto done;
+ }
+next:
+ num++;
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+done:
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ udp_dump(&udp_table, skb, cb, r, bc);
+}
+
+static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(&udp_table, in_skb, nlh, req);
+}
+
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+static const struct inet_diag_handler udp_diag_handler = {
+ .dump = udp_diag_dump,
+ .dump_one = udp_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDP,
+};
+
+static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ udp_dump(&udplite_table, skb, cb, r, bc);
+}
+
+static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(&udplite_table, in_skb, nlh, req);
+}
+
+static const struct inet_diag_handler udplite_diag_handler = {
+ .dump = udplite_diag_dump,
+ .dump_one = udplite_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDPLITE,
+};
+
+static int __init udp_diag_init(void)
+{
+ int err;
+
+ err = inet_diag_register(&udp_diag_handler);
+ if (err)
+ goto out;
+ err = inet_diag_register(&udplite_diag_handler);
+ if (err)
+ goto out_lite;
+out:
+ return err;
+out_lite:
+ inet_diag_unregister(&udp_diag_handler);
+ goto out;
+}
+
+static void __exit udp_diag_exit(void)
+{
+ inet_diag_unregister(&udplite_diag_handler);
+ inet_diag_unregister(&udp_diag_handler);
+}
+
+module_init(udp_diag_init);
+module_exit(udp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index aaad650d47d..f3c27899f62 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,30 +5,30 @@
#include <net/protocol.h>
#include <net/inet_common.h>
-extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
-extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
+void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
-extern int udp_v4_get_port(struct sock *sk, unsigned short snum);
+int udp_v4_get_port(struct sock *sk, unsigned short snum);
-extern int udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-extern int udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
+int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
-extern int compat_udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-extern int compat_udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
#endif
-extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len);
-extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags);
-extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
-extern void udp_destroy_sock(struct sock *sk);
+int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len);
+int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
+ int flags);
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
-extern int udp4_seq_show(struct seq_file *seq, void *v);
+int udp4_seq_show(struct seq_file *seq, void *v);
#endif
#endif /* _UDP4_IMPL_H */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
new file mode 100644
index 00000000000..546d2d439dd
--- /dev/null
+++ b/net/ipv4/udp_offload.c
@@ -0,0 +1,250 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * UDPv4 GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/udp.h>
+#include <net/protocol.h>
+
+static DEFINE_SPINLOCK(udp_offload_lock);
+static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
+
+#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
+
+/* One node of the singly linked list of registered UDP offloads that is
+ * headed by udp_offload_base.
+ */
+struct udp_offload_priv {
+ struct udp_offload *offload; /* descriptor pointer as passed to udp_add_offload() */
+ struct rcu_head rcu; /* lets udp_del_offload() defer the kfree() through call_rcu() */
+ struct udp_offload_priv __rcu *next; /* following node; list walks stop at NULL */
+};
+
+/* gso_send_check callback for UDP over IPv4.
+ *
+ * Returns -EINVAL when pskb_may_pull() cannot provide a full UDP header.
+ * For a non-encapsulated skb the UDP check field is seeded with the
+ * complemented pseudo-header sum and the skb is switched to
+ * CHECKSUM_PARTIAL; an encapsulated skb is left untouched.
+ * Returns 0 in both of those cases.
+ */
+static int udp4_ufo_send_check(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	struct udphdr *uh;
+
+	if (!pskb_may_pull(skb, sizeof(*uh)))
+		return -EINVAL;
+
+	/* Encapsulated packets get no checksum preparation here. */
+	if (unlikely(skb->encapsulation))
+		return 0;
+
+	iph = ip_hdr(skb);
+	uh = udp_hdr(skb);
+
+	uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
+				       IPPROTO_UDP, 0);
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	return 0;
+}
+
+/* gso_segment callback for UDP over IPv4.
+ *
+ * Returns:
+ *  - the result of skb_udp_tunnel_segment() for encapsulated skbs whose
+ *    gso_type carries one of the UDP tunnel flags;
+ *  - NULL when skb_gso_ok() accepts the skb, in which case only gso_segs
+ *    is recomputed;
+ *  - ERR_PTR(-EINVAL) when skb->len <= gso_size or the gso_type check
+ *    fails;
+ *  - otherwise the result of skb_segment() after the UDP checksum has
+ *    been computed in software.
+ */
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int mss;
+ int offset;
+ __wsum csum;
+
+ /* Tunnelled UDP is handed to the tunnel segmentation helper. */
+ if (skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type &
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
+ segs = skb_udp_tunnel_segment(skb, features);
+ goto out;
+ }
+
+ /* Nothing to split: leave segs at ERR_PTR(-EINVAL). */
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ /* Reject any gso_type bit outside the listed set, and any
+ * type that does not include SKB_GSO_UDP.
+ */
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
+ SKB_GSO_MPLS) ||
+ !(type & (SKB_GSO_UDP))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+ offset = skb_checksum_start_offset(skb);
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ /* offset now addresses the checksum field itself. */
+ offset += skb->csum_offset;
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Fragment the skb. IP headers of the fragments are updated in
+ * inet_gso_segment()
+ */
+ segs = skb_segment(skb, features);
+out:
+ return segs;
+}
+
+/* Register a UDP offload descriptor.
+ *
+ * Allocates a list node with GFP_ATOMIC, stores @uo in it and publishes
+ * the node at the head of the udp_offload_base list while holding
+ * udp_offload_lock.
+ *
+ * Returns 0 on success or -ENOMEM when the node cannot be allocated.
+ */
+int udp_add_offload(struct udp_offload *uo)
+{
+	struct udp_offload_priv *node;
+
+	node = kzalloc(sizeof(*node), GFP_ATOMIC);
+	if (node == NULL)
+		return -ENOMEM;
+
+	node->offload = uo;
+
+	spin_lock(&udp_offload_lock);
+	/* Link to the current head first, then publish the new head. */
+	node->next = udp_offload_base;
+	rcu_assign_pointer(udp_offload_base, node);
+	spin_unlock(&udp_offload_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(udp_add_offload);
+
+/* call_rcu() callback: free the udp_offload_priv node that embeds @head. */
+static void udp_offload_free_routine(struct rcu_head *head)
+{
+	kfree(container_of(head, struct udp_offload_priv, rcu));
+}
+
+/* Unregister a UDP offload descriptor.
+ *
+ * Walks the udp_offload_base list under udp_offload_lock and unlinks the
+ * first node whose ->offload equals @uo.  The node is freed through
+ * call_rcu() after the lock has been dropped.  When no node matches, a
+ * warning naming uo->port is printed and nothing is freed.
+ */
+void udp_del_offload(struct udp_offload *uo)
+{
+ /* head always points at the link that leads to uo_priv. */
+ struct udp_offload_priv __rcu **head = &udp_offload_base;
+ struct udp_offload_priv *uo_priv;
+
+ spin_lock(&udp_offload_lock);
+
+ uo_priv = udp_deref_protected(*head);
+ for (; uo_priv != NULL;
+ uo_priv = udp_deref_protected(*head)) {
+ if (uo_priv->offload == uo) {
+ /* Bypass the node by pointing its predecessor at its successor. */
+ rcu_assign_pointer(*head,
+ udp_deref_protected(uo_priv->next));
+ goto unlock;
+ }
+ head = &uo_priv->next;
+ }
+ /* Reached only when the loop ran off the end, i.e. uo_priv == NULL. */
+ pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
+unlock:
+ spin_unlock(&udp_offload_lock);
+ /* Non-NULL only on the "found" path. */
+ if (uo_priv != NULL)
+ call_rcu(&uo_priv->rcu, udp_offload_free_routine);
+}
+EXPORT_SYMBOL(udp_del_offload);
+
+/* gro_receive callback for UDP.
+ *
+ * Looks up a registered offload whose port equals the UDP destination
+ * port and that provides a gro_receive callback, then delegates to it
+ * after pulling the UDP header.  The value returned by that callback is
+ * passed back; on every other path NULL is returned and the skb is
+ * flagged for flush.
+ */
+static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct udp_offload_priv *uo_priv;
+ struct sk_buff *p, **pp = NULL;
+ struct udphdr *uh, *uh2;
+ unsigned int hlen, off;
+ int flush = 1;
+
+ /* Skip skbs that already went through here, and non-encapsulated
+ * skbs whose ip_summed is not CHECKSUM_COMPLETE.
+ */
+ if (NAPI_GRO_CB(skb)->udp_mark ||
+ (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE))
+ goto out;
+
+ /* mark that this skb passed once through the udp gro layer */
+ NAPI_GRO_CB(skb)->udp_mark = 1;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*uh);
+ uh = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ uh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!uh))
+ goto out;
+ }
+
+ rcu_read_lock();
+ uo_priv = rcu_dereference(udp_offload_base);
+ for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+ if (uo_priv->offload->port == uh->dest &&
+ uo_priv->offload->callbacks.gro_receive)
+ goto unflush;
+ }
+ /* No offload registered for this destination port. */
+ goto out_unlock;
+
+unflush:
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = (struct udphdr *)(p->data + off);
+ /* 32-bit compare starting at ->source; presumably covers the
+ * source and destination ports together (depends on the
+ * struct udphdr layout -- TODO confirm).
+ */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+ /* uo_priv is still the matching node; the RCU read lock is held. */
+ pp = uo_priv->offload->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+ return pp;
+}
+
+/* gro_complete callback for UDP.
+ *
+ * Rewrites the UDP length field at @nhoff to cover the rest of the skb,
+ * then hands the skb to the gro_complete callback of the first registered
+ * offload whose port equals the UDP destination port and that provides
+ * such a callback.
+ *
+ * Returns that callback's result, or -ENOSYS when no offload matches.
+ */
+static int udp_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+	struct udp_offload_priv *entry;
+	int err = -ENOSYS;
+
+	uh->len = htons(skb->len - nhoff);
+
+	rcu_read_lock();
+
+	for (entry = rcu_dereference(udp_offload_base);
+	     entry != NULL;
+	     entry = rcu_dereference(entry->next)) {
+		if (entry->offload->port != uh->dest)
+			continue;
+		if (!entry->offload->callbacks.gro_complete)
+			continue;
+
+		/* First match wins; the inner header starts after the UDP header. */
+		err = entry->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr));
+		break;
+	}
+
+	rcu_read_unlock();
+	return err;
+}
+
+/* Offload callbacks registered for IPPROTO_UDP by udpv4_offload_init(). */
+static const struct net_offload udpv4_offload = {
+ .callbacks = {
+ .gso_send_check = udp4_ufo_send_check,
+ .gso_segment = udp4_ufo_fragment,
+ .gro_receive = udp_gro_receive,
+ .gro_complete = udp_gro_complete,
+ },
+};
+
+/* Register udpv4_offload for IPPROTO_UDP; returns inet_add_offload()'s result. */
+int __init udpv4_offload_init(void)
+{
+ return inet_add_offload(&udpv4_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ab76aa928fa..3b3efbda48e 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -10,6 +10,10 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+
+#define pr_fmt(fmt) "UDPLite: " fmt
+
+#include <linux/export.h>
#include "udp_impl.h"
struct udp_table udplite_table __read_mostly;
@@ -57,6 +61,7 @@ struct proto udplite_prot = {
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
+ .clear_sk = sk_prot_clear_portaddr_nulls,
};
EXPORT_SYMBOL(udplite_prot);
@@ -65,18 +70,24 @@ static struct inet_protosw udplite4_protosw = {
.protocol = IPPROTO_UDPLITE,
.prot = &udplite_prot,
.ops = &inet_dgram_ops,
- .no_check = 0, /* must checksum (RFC 3828) */
.flags = INET_PROTOSW_PERMANENT,
};
#ifdef CONFIG_PROC_FS
+
+static const struct file_operations udplite_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
static struct udp_seq_afinfo udplite4_seq_afinfo = {
.name = "udplite",
.family = AF_INET,
.udp_table = &udplite_table,
- .seq_fops = {
- .owner = THIS_MODULE,
- },
+ .seq_fops = &udplite_afinfo_seq_fops,
.seq_ops = {
.show = udp4_seq_show,
},
@@ -120,11 +131,11 @@ void __init udplite4_register(void)
inet_register_protosw(&udplite4_protosw);
if (udplite4_proc_init())
- printk(KERN_ERR "%s: Cannot register /proc!\n", __func__);
+ pr_err("%s: Cannot register /proc!\n", __func__);
return;
out_unregister_proto:
proto_unregister(&udplite_prot);
out_register_err:
- printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
+ pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
}
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216d..aac6197b7a7 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -37,15 +37,6 @@ drop:
return NET_RX_DROP;
}
-int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
- int encap_type)
-{
- XFRM_SPI_SKB_CB(skb)->family = AF_INET;
- XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
- return xfrm_input(skb, nexthdr, spi, encap_type);
-}
-EXPORT_SYMBOL(xfrm4_rcv_encap);
-
int xfrm4_transport_finish(struct sk_buff *skb, int async)
{
struct iphdr *iph = ip_hdr(skb);
@@ -132,7 +123,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
* header and optional ESP marker bytes) and then modify the
* protocol to ESP, and then call into the transform receiver.
*/
- if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto drop;
/* Now we can update and verify the packet length... */
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index 63418185f52..71acd0014f2 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
skb_set_network_header(skb, -x->props.header_len -
- hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
+ hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
if (x->sel.family != AF_INET6)
skb->network_header += IPV4_BEET_PHMAXLEN;
skb->mac_header = skb->network_header +
@@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
skb_push(skb, sizeof(*iph));
skb_reset_network_header(skb);
-
- memmove(skb->data - skb->mac_len, skb_mac_header(skb),
- skb->mac_len);
- skb_set_mac_header(skb, -skb->mac_len);
+ skb_mac_header_rebuild(skb);
xfrm4_beet_make_header(skb);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 6f368413eb0..91771a7c802 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -44,8 +44,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
- /* DS disclosed */
- top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos,
+ /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */
+ if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
+ top_iph->tos = 0;
+ else
+ top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+ top_iph->tos = INET_ECN_encapsulate(top_iph->tos,
XFRM_MODE_SKB_CB(skb)->tos);
flags = x->props.flags;
@@ -54,19 +58,18 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
- ip_select_ident(top_iph, dst->child, NULL);
- top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+ top_iph->ttl = ip4_dst_hoplimit(dst->child);
top_iph->saddr = x->props.saddr.a4;
top_iph->daddr = x->id.daddr.a4;
+ ip_select_ident(skb, NULL);
return 0;
}
static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
{
- const unsigned char *old_mac;
int err = -EINVAL;
if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -75,8 +78,8 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto out;
- if (skb_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
goto out;
if (x->props.flags & XFRM_STATE_DECAP_DSCP)
@@ -84,10 +87,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
if (!(x->props.flags & XFRM_STATE_NOECN))
ipip_ecn_decapsulate(skb);
- old_mac = skb_mac_header(skb);
- skb_set_mac_header(skb, -skb->mac_len);
- memmove(skb_mac_header(skb), old_mac, skb->mac_len);
skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
err = 0;
out:
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 571aa96a175..d5f6bd9a210 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -21,18 +21,20 @@
static int xfrm4_tunnel_check_size(struct sk_buff *skb)
{
int mtu, ret = 0;
- struct dst_entry *dst;
if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
goto out;
- if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
goto out;
- dst = skb_dst(skb);
- mtu = dst_mtu(dst);
+ mtu = dst_mtu(skb_dst(skb));
if (skb->len > mtu) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ if (skb->sk)
+ xfrm_local_error(skb, mtu);
+ else
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
ret = -EMSGSIZE;
}
out:
@@ -60,33 +62,50 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
if (err)
return err;
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
- IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED;
-
- skb->protocol = htons(ETH_P_IP);
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
return x->outer_mode->output2(x, skb);
}
EXPORT_SYMBOL(xfrm4_prepare_output);
-static int xfrm4_output_finish(struct sk_buff *skb)
+int xfrm4_output_finish(struct sk_buff *skb)
{
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ skb->protocol = htons(ETH_P_IP);
+
#ifdef CONFIG_NETFILTER
- if (!skb_dst(skb)->xfrm) {
+ IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
+#endif
+
+ return xfrm_output(skb);
+}
+
+static int __xfrm4_output(struct sk_buff *skb)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+#ifdef CONFIG_NETFILTER
+ if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
-
- IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
#endif
- skb->protocol = htons(ETH_P_IP);
- return xfrm_output(skb);
+ return x->outer_mode->afinfo->output_finish(skb);
}
-int xfrm4_output(struct sk_buff *skb)
+int xfrm4_output(struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
- NULL, skb_dst(skb)->dev, xfrm4_output_finish,
+ NULL, skb_dst(skb)->dev, __xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
+
+void xfrm4_local_error(struct sk_buff *skb, u32 mtu)
+{
+ struct iphdr *hdr;
+
+ hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
+ ip_local_error(skb->sk, EMSGSIZE, hdr->daddr,
+ inet_sk(skb->sk)->inet_dport, mtu);
+}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40adde..6156f68a1e9 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -18,47 +18,53 @@
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
-static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
- xfrm_address_t *saddr,
- xfrm_address_t *daddr)
+static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
+ int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
{
- struct flowi fl = {
- .fl4_dst = daddr->a4,
- .fl4_tos = tos,
- };
- struct dst_entry *dst;
struct rtable *rt;
- int err;
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = daddr->a4;
+ fl4->flowi4_tos = tos;
if (saddr)
- fl.fl4_src = saddr->a4;
+ fl4->saddr = saddr->a4;
+
+ rt = __ip_route_output_key(net, fl4);
+ if (!IS_ERR(rt))
+ return &rt->dst;
+
+ return ERR_CAST(rt);
+}
- err = __ip_route_output_key(net, &rt, &fl);
- dst = &rt->dst;
- if (err)
- dst = ERR_PTR(err);
- return dst;
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct flowi4 fl4;
+
+ return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
}
static int xfrm4_get_saddr(struct net *net,
xfrm_address_t *saddr, xfrm_address_t *daddr)
{
struct dst_entry *dst;
- struct rtable *rt;
+ struct flowi4 fl4;
- dst = xfrm4_dst_lookup(net, 0, NULL, daddr);
+ dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
if (IS_ERR(dst))
return -EHOSTUNREACH;
- rt = (struct rtable *)dst;
- saddr->a4 = rt->rt_src;
+ saddr->a4 = fl4.saddr;
dst_release(dst);
return 0;
}
-static int xfrm4_get_tos(struct flowi *fl)
+static int xfrm4_get_tos(const struct flowi *fl)
{
- return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
+ return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
}
static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -68,28 +74,26 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
}
static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
- struct flowi *fl)
+ const struct flowi *fl)
{
struct rtable *rt = (struct rtable *)xdst->route;
+ const struct flowi4 *fl4 = &fl->u.ip4;
- xdst->u.rt.fl = *fl;
+ xdst->u.rt.rt_iif = fl4->flowi4_iif;
xdst->u.dst.dev = dev;
dev_hold(dev);
- xdst->u.rt.peer = rt->peer;
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
-
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
+ xdst->u.rt.rt_is_input = rt->rt_is_input;
xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
RTCF_LOCAL);
xdst->u.rt.rt_type = rt->rt_type;
- xdst->u.rt.rt_src = rt->rt_src;
- xdst->u.rt.rt_dst = rt->rt_dst;
xdst->u.rt.rt_gateway = rt->rt_gateway;
- xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
+ xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+ xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
return 0;
}
@@ -97,13 +101,19 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
static void
_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
{
- struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ int oif = 0;
- memset(fl, 0, sizeof(struct flowi));
- fl->mark = skb->mark;
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
- if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
+ memset(fl4, 0, sizeof(struct flowi4));
+ fl4->flowi4_mark = skb->mark;
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
+
+ if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
case IPPROTO_UDP:
case IPPROTO_UDPLITE:
@@ -114,8 +124,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
pskb_may_pull(skb, xprth + 4 - skb->data)) {
__be16 *ports = (__be16 *)xprth;
- fl->fl_ip_sport = ports[!!reverse];
- fl->fl_ip_dport = ports[!reverse];
+ fl4->fl4_sport = ports[!!reverse];
+ fl4->fl4_dport = ports[!reverse];
}
break;
@@ -123,8 +133,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
u8 *icmp = xprth;
- fl->fl_icmp_type = icmp[0];
- fl->fl_icmp_code = icmp[1];
+ fl4->fl4_icmp_type = icmp[0];
+ fl4->fl4_icmp_code = icmp[1];
}
break;
@@ -132,15 +142,15 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
__be32 *ehdr = (__be32 *)xprth;
- fl->fl_ipsec_spi = ehdr[0];
+ fl4->fl4_ipsec_spi = ehdr[0];
}
break;
case IPPROTO_AH:
if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr = (__be32*)xprth;
+ __be32 *ah_hdr = (__be32 *)xprth;
- fl->fl_ipsec_spi = ah_hdr[1];
+ fl4->fl4_ipsec_spi = ah_hdr[1];
}
break;
@@ -148,7 +158,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
__be16 *ipcomp_hdr = (__be16 *)xprth;
- fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+ fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
}
break;
@@ -160,20 +170,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
if (greflags[0] & GRE_KEY) {
if (greflags[0] & GRE_CSUM)
gre_hdr++;
- fl->fl_gre_key = gre_hdr[1];
+ fl4->fl4_gre_key = gre_hdr[1];
}
}
break;
default:
- fl->fl_ipsec_spi = 0;
+ fl4->fl4_ipsec_spi = 0;
break;
}
}
- fl->proto = iph->protocol;
- fl->fl4_dst = reverse ? iph->saddr : iph->daddr;
- fl->fl4_src = reverse ? iph->daddr : iph->saddr;
- fl->fl4_tos = iph->tos;
+ fl4->flowi4_proto = iph->protocol;
+ fl4->daddr = reverse ? iph->saddr : iph->daddr;
+ fl4->saddr = reverse ? iph->daddr : iph->saddr;
+ fl4->flowi4_tos = iph->tos;
}
static inline int xfrm4_garbage_collect(struct dst_ops *ops)
@@ -184,20 +194,30 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
-static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->update_pmtu(path, sk, skb, mtu);
+}
+
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, mtu);
+ path->ops->redirect(path, sk, skb);
}
static void xfrm4_dst_destroy(struct dst_entry *dst)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- if (likely(xdst->u.rt.peer))
- inet_putpeer(xdst->u.rt.peer);
+ dst_destroy_metrics_generic(dst);
+
xfrm_dst_destroy(xdst);
}
@@ -215,10 +235,12 @@ static struct dst_ops xfrm4_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
+ .redirect = xfrm4_redirect,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
- .gc_thresh = 1024,
+ .gc_thresh = 32768,
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -230,6 +252,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.get_tos = xfrm4_get_tos,
.init_path = xfrm4_init_path,
.fill_dst = xfrm4_fill_dst,
+ .blackhole_route = ipv4_blackhole_route,
};
#ifdef CONFIG_SYSCTL
@@ -244,43 +267,67 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static struct ctl_table_header *sysctl_hdr;
-#endif
-
-static void __init xfrm4_policy_init(void)
+static int __net_init xfrm4_net_init(struct net *net)
{
- xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = xfrm4_policy_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.xfrm4_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
}
-static void __exit xfrm4_policy_fini(void)
+static void __net_exit xfrm4_net_exit(struct net *net)
{
-#ifdef CONFIG_SYSCTL
- if (sysctl_hdr)
- unregister_net_sysctl_table(sysctl_hdr);
+ struct ctl_table *table;
+
+ if (net->ipv4.xfrm4_hdr == NULL)
+ return;
+
+ table = net->ipv4.xfrm4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations __net_initdata xfrm4_net_ops = {
+ .init = xfrm4_net_init,
+ .exit = xfrm4_net_exit,
+};
#endif
- xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
+
+static void __init xfrm4_policy_init(void)
+{
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
}
-void __init xfrm4_init(int rt_max_size)
+void __init xfrm4_init(void)
{
- /*
- * Select a default value for the gc_thresh based on the main route
- * table hash size. It seems to me the worst case scenario is when
- * we have ipsec operating in transport mode, in which we create a
- * dst_entry per socket. The xfrm gc algorithm starts trying to remove
- * entries at gc_thresh, and prevents new allocations as 2*gc_thresh
- * so lets set an initial xfrm gc_thresh value at the rt_max_size/2.
- * That will let us store an ipsec connection per route table entry,
- * and start cleaning when were 1/2 full
- */
- xfrm4_dst_ops.gc_thresh = rt_max_size/2;
dst_entries_init(&xfrm4_dst_ops);
xfrm4_state_init();
xfrm4_policy_init();
+ xfrm4_protocol_init();
#ifdef CONFIG_SYSCTL
- sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
- xfrm4_policy_table);
+ register_pernet_subsys(&xfrm4_net_ops);
#endif
}
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 00000000000..a2ce0101eaa
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,301 @@
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);
+
+/* Map an IP protocol number to the head of its xfrm4 handler list.
+ * Only IPPROTO_ESP, IPPROTO_AH and IPPROTO_COMP have a list; any other
+ * value yields NULL.
+ */
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+	if (protocol == IPPROTO_ESP)
+		return &esp4_handlers;
+	if (protocol == IPPROTO_AH)
+		return &ah4_handlers;
+	if (protocol == IPPROTO_COMP)
+		return &ipcomp4_handlers;
+
+	return NULL;
+}
+
+/* Iterate @handler over the RCU-protected handler list starting at @head. */
+#define for_each_protocol_rcu(head, handler)			\
+	for ((handler) = rcu_dereference(head);			\
+	     (handler) != NULL;					\
+	     (handler) = rcu_dereference((handler)->next))
+
+/* Run the cb_handler of each handler registered for @protocol, in list
+ * order, until one returns a value <= 0; that value is returned.
+ * Returns 0 when @protocol has no handler list or when every handler
+ * returned a positive value.
+ */
+int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+	struct xfrm4_protocol __rcu **head = proto_handlers(protocol);
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	if (head == NULL)
+		return 0;
+
+	for_each_protocol_rcu(*head, handler) {
+		ret = handler->cb_handler(skb, err);
+		if (ret <= 0)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_cb);
+
+/* Offer @skb to the input_handler of each handler registered for
+ * @nexthdr.  The first result other than -EINVAL is returned.  When
+ * @nexthdr has no handler list, or every handler answered -EINVAL, an
+ * ICMP port-unreachable is sent, the skb is freed and 0 is returned.
+ */
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+		    int encap_type)
+{
+	struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	if (head != NULL) {
+		for_each_protocol_rcu(*head, handler) {
+			ret = handler->input_handler(skb, nexthdr, spi, encap_type);
+			if (ret != -EINVAL)
+				return ret;
+		}
+	}
+
+	/* Nobody took the packet. */
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+/* Receive hook for ESP: try each registered ESP handler in list order and
+ * return the first result other than -EINVAL.  If none accepts the skb,
+ * send an ICMP port-unreachable, free the skb and return 0.
+ */
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(esp4_handlers, handler) {
+		ret = handler->handler(skb);
+		if (ret != -EINVAL)
+			return ret;
+	}
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Error hook for ESP: call each registered ESP err_handler in list order
+ * and stop after the first one that returns 0.
+ */
+static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(esp4_handlers, handler) {
+		if (handler->err_handler(skb, info) == 0)
+			return;
+	}
+}
+
+/* Receive hook for AH: try each registered AH handler in list order and
+ * return the first result other than -EINVAL.  If none accepts the skb,
+ * send an ICMP port-unreachable, free the skb and return 0.
+ */
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(ah4_handlers, handler) {
+		ret = handler->handler(skb);
+		if (ret != -EINVAL)
+			return ret;
+	}
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Error hook for AH: call each registered AH err_handler in list order
+ * and stop after the first one that returns 0.
+ */
+static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(ah4_handlers, handler) {
+		if (handler->err_handler(skb, info) == 0)
+			return;
+	}
+}
+
+/* Receive hook for IPComp: try each registered IPComp handler in list
+ * order and return the first result other than -EINVAL.  If none accepts
+ * the skb, send an ICMP port-unreachable, free the skb and return 0.
+ */
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+	struct xfrm4_protocol *handler;
+	int ret;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(ipcomp4_handlers, handler) {
+		ret = handler->handler(skb);
+		if (ret != -EINVAL)
+			return ret;
+	}
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Error hook for IPComp: call each registered IPComp err_handler in list
+ * order and stop after the first one that returns 0.
+ */
+static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(ipcomp4_handlers, handler) {
+		if (handler->err_handler(skb, info) == 0)
+			return;
+	}
+}
+
+/* net_protocol entry for ESP; selected by netproto(IPPROTO_ESP). */
+static const struct net_protocol esp4_protocol = {
+ .handler = xfrm4_esp_rcv,
+ .err_handler = xfrm4_esp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+/* net_protocol entry for AH; selected by netproto(IPPROTO_AH). */
+static const struct net_protocol ah4_protocol = {
+ .handler = xfrm4_ah_rcv,
+ .err_handler = xfrm4_ah_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+/* net_protocol entry for IPComp; selected by netproto(IPPROTO_COMP). */
+static const struct net_protocol ipcomp4_protocol = {
+ .handler = xfrm4_ipcomp_rcv,
+ .err_handler = xfrm4_ipcomp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+/* AF_INET input afinfo registered by xfrm4_protocol_init(); its callback
+ * is xfrm4_rcv_cb().
+ */
+static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+ .family = AF_INET,
+ .owner = THIS_MODULE,
+ .callback = xfrm4_rcv_cb,
+};
+
+/* Map an IP protocol number to the net_protocol entry defined in this
+ * file for it.  Only IPPROTO_ESP, IPPROTO_AH and IPPROTO_COMP are known;
+ * any other value yields NULL.
+ */
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+	if (protocol == IPPROTO_ESP)
+		return &esp4_protocol;
+	if (protocol == IPPROTO_AH)
+		return &ah4_protocol;
+	if (protocol == IPPROTO_COMP)
+		return &ipcomp4_protocol;
+
+	return NULL;
+}
+
+/* Add @handler to the handler list of @protocol.
+ *
+ * The list is kept sorted by descending ->priority: the handler is
+ * inserted in front of the first entry with a lower priority.  If the
+ * list was empty before the insertion, the matching net_protocol entry
+ * is registered with inet_add_protocol() as well.
+ *
+ * Returns 0 on success, -EINVAL when @protocol is not ESP, AH or IPComp,
+ * -EEXIST when an entry with the same priority is already present, and
+ * -EAGAIN when inet_add_protocol() fails.
+ */
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ bool add_netproto = false;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ /* An empty list means the net_protocol entry must be added too. */
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex)))
+ add_netproto = true;
+
+ /* Find the insertion point; pprev ends up at the link to rewrite. */
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority < priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ /* Set ->next first, then publish the handler. */
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ /* NOTE(review): this runs after the mutex has been dropped, and a
+ * failure here returns -EAGAIN while the handler stays linked in the
+ * list -- confirm that this is intended.
+ */
+ if (add_netproto) {
+ if (inet_add_protocol(netproto(protocol), protocol)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+/* Remove @handler from the handler list of @protocol.
+ *
+ * If the list is empty once the walk is done, the matching net_protocol
+ * entry is removed with inet_del_protocol().  synchronize_net() is called
+ * before returning.
+ *
+ * Returns 0 when the handler was unlinked, -ENOENT when it was not in
+ * the list, -EINVAL when @protocol is not ESP, AH or IPComp, and -EAGAIN
+ * when inet_del_protocol() reports an error.
+ */
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ int ret = -ENOENT;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ /* Unlink by pointing the previous link at the successor. */
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ /* NOTE(review): this check also fires when the handler was not found
+ * and the list was already empty -- confirm that this is intended.
+ */
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex))) {
+ if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+ pr_err("%s: can't remove protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+/* Register xfrm4_input_afinfo with the xfrm input layer. */
+void __init xfrm4_protocol_init(void)
+{
+ xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 47947624ecc..542074c00c7 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -12,33 +12,36 @@
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/export.h>
static int xfrm4_init_flags(struct xfrm_state *x)
{
- if (ipv4_config.no_pmtu_disc)
+ if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
x->props.flags |= XFRM_STATE_NOPMTUDISC;
return 0;
}
+/*
+ * Fill @sel from the IPv4 member (fl->u.ip4) of @fl: source and
+ * destination address, source and destination port with both port masks
+ * set to htons(0xffff), family AF_INET, both prefix lengths set to 32,
+ * plus the flow's protocol and output interface index.
+ */
static void
-__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
{
- sel->daddr.a4 = fl->fl4_dst;
- sel->saddr.a4 = fl->fl4_src;
- sel->dport = xfrm_flowi_dport(fl);
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ sel->daddr.a4 = fl4->daddr;
+ sel->saddr.a4 = fl4->saddr;
+ sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl);
+ sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
sel->sport_mask = htons(0xffff);
sel->family = AF_INET;
sel->prefixlen_d = 32;
sel->prefixlen_s = 32;
- sel->proto = fl->proto;
- sel->ifindex = fl->oif;
+ sel->proto = fl4->flowi4_proto;
+ sel->ifindex = fl4->flowi4_oif;
}
static void
-xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
+xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
x->id = tmpl->id;
if (x->id.daddr.a4 == 0)
@@ -53,7 +56,7 @@ xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
int xfrm4_extract_header(struct sk_buff *skb)
{
- struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
XFRM_MODE_SKB_CB(skb)->id = iph->id;
@@ -76,9 +79,11 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.init_tempsel = __xfrm4_init_tempsel,
.init_temprop = xfrm4_init_temprop,
.output = xfrm4_output,
+ .output_finish = xfrm4_output_finish,
.extract_input = xfrm4_extract_input,
.extract_output = xfrm4_extract_output,
.transport_finish = xfrm4_transport_finish,
+ .local_error = xfrm4_local_error,
};
void __init xfrm4_state_init(void)
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 82806455e85..06347dbd32c 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -3,6 +3,8 @@
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
*/
+#define pr_fmt(fmt) "IPsec: " fmt
+
#include <linux/skbuff.h>
#include <linux/module.h>
#include <linux/mutex.h>
@@ -61,10 +63,10 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
+/*
+ * Tunnel handler entry that ipip_init() registers for AF_INET via
+ * xfrm4_tunnel_register(): receive path is xfrm_tunnel_rcv, error path
+ * is xfrm_tunnel_err.
+ */
static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
@@ -75,18 +77,18 @@ static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
static int __init ipip_init(void)
{
if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
- printk(KERN_INFO "ipip init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
- printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n");
+ pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);
xfrm_unregister_type(&ipip_type, AF_INET);
return -EAGAIN;
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
- printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n");
+ pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);
xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
xfrm_unregister_type(&ipip_type, AF_INET);
return -EAGAIN;
@@ -97,14 +99,16 @@ static int __init ipip_init(void)
+/*
+ * Teardown counterpart of ipip_init(): undo its registrations in reverse
+ * order - the AF_INET6 handler (only when IPv6 is enabled), then the
+ * AF_INET handler, then the ipip_type xfrm type. Failures are only
+ * logged with pr_info(); nothing is returned to the caller.
+ */
static void __exit ipip_fini(void)
{
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
- printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n");
+ pr_info("%s: can't remove xfrm handler for AF_INET6\n",
+ __func__);
#endif
if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
- printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n");
+ pr_info("%s: can't remove xfrm handler for AF_INET\n",
+ __func__);
if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
- printk(KERN_INFO "ipip close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(ipip_init);