aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/include/asm/tlb.h
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/include/asm/tlb.h')
-rw-r--r--arch/x86/include/asm/tlb.h9
1 files changed, 8 insertions, 1 deletions
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 829215fef9e..c7797307fc2 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,14 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
+#define tlb_flush(tlb) \
+{ \
+ if (!tlb->fullmm && !tlb->need_flush_all) \
+ flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
+ else \
+ flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
+}
#include <asm-generic/tlb.h>
s='right'>85
-rw-r--r--net/ipv4/devinet.c1595
-rw-r--r--net/ipv4/esp4.c724
-rw-r--r--net/ipv4/fib_frontend.c740
-rw-r--r--net/ipv4/fib_hash.c1065
-rw-r--r--net/ipv4/fib_lookup.h52
-rw-r--r--net/ipv4/fib_rules.c231
-rw-r--r--net/ipv4/fib_semantics.c916
-rw-r--r--net/ipv4/fib_trie.c1650
-rw-r--r--net/ipv4/gre_demux.c364
-rw-r--r--net/ipv4/gre_offload.c298
-rw-r--r--net/ipv4/icmp.c826
-rw-r--r--net/ipv4/igmp.c967
-rw-r--r--net/ipv4/inet_connection_sock.c547
-rw-r--r--net/ipv4/inet_diag.c928
-rw-r--r--net/ipv4/inet_fragment.c279
-rw-r--r--net/ipv4/inet_hashtables.c538
-rw-r--r--net/ipv4/inet_lro.c284
-rw-r--r--net/ipv4/inet_timewait_sock.c257
-rw-r--r--net/ipv4/inetpeer.c696
-rw-r--r--net/ipv4/ip_forward.c62
-rw-r--r--net/ipv4/ip_fragment.c604
-rw-r--r--net/ipv4/ip_gre.c1530
-rw-r--r--net/ipv4/ip_input.c206
-rw-r--r--net/ipv4/ip_options.c281
-rw-r--r--net/ipv4/ip_output.c1015
-rw-r--r--net/ipv4/ip_sockglue.c532
-rw-r--r--net/ipv4/ip_tunnel.c1062
-rw-r--r--net/ipv4/ip_tunnel_core.c204
-rw-r--r--net/ipv4/ip_vti.c603
-rw-r--r--net/ipv4/ipcomp.c398
-rw-r--r--net/ipv4/ipconfig.c550
-rw-r--r--net/ipv4/ipip.c939
-rw-r--r--net/ipv4/ipmr.c2170
-rw-r--r--net/ipv4/ipvs/Kconfig224
-rw-r--r--net/ipv4/ipvs/Makefile34
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c629
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c944
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c1144
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c2382
-rw-r--r--net/ipv4/ipvs/ip_vs_dh.c260
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c202
-rw-r--r--net/ipv4/ipvs/ip_vs_ftp.c395
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c600
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c865
-rw-r--r--net/ipv4/ipvs/ip_vs_lc.c123
-rw-r--r--net/ipv4/ipvs/ip_vs_nq.c161
-rw-r--r--net/ipv4/ipvs/ip_vs_proto.c235
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah.c179
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c177
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c615
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_udp.c429
-rw-r--r--net/ipv4/ipvs/ip_vs_rr.c118
-rw-r--r--net/ipv4/ipvs/ip_vs_sched.c251
-rw-r--r--net/ipv4/ipvs/ip_vs_sed.c163
-rw-r--r--net/ipv4/ipvs/ip_vs_sh.c257
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c942
-rw-r--r--net/ipv4/ipvs/ip_vs_wlc.c151
-rw-r--r--net/ipv4/ipvs/ip_vs_wrr.c235
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c563
-rw-r--r--net/ipv4/netfilter.c205
-rw-r--r--net/ipv4/netfilter/Kconfig339
-rw-r--r--net/ipv4/netfilter/Makefile47
-rw-r--r--net/ipv4/netfilter/arp_tables.c1681
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c35
-rw-r--r--net/ipv4/netfilter/arptable_filter.c110
-rw-r--r--net/ipv4/netfilter/ip_queue.c718
-rw-r--r--net/ipv4/netfilter/ip_tables.c1800
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c416
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c77
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c493
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c131
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c104
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c120
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c224
-rw-r--r--net/ipv4/netfilter/ipt_SAME.c179
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c482
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c87
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c104
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c289
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c66
-rw-r--r--net/ipv4/netfilter/ipt_ah.c73
-rw-r--r--net/ipv4/netfilter/ipt_ecn.c134
-rw-r--r--net/ipv4/netfilter/ipt_iprange.c79
-rw-r--r--net/ipv4/netfilter/ipt_owner.c92
-rw-r--r--net/ipv4/netfilter/ipt_recent.c499
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c144
-rw-r--r--net/ipv4/netfilter/ipt_tos.c55
-rw-r--r--net/ipv4/netfilter/ipt_ttl.c66
-rw-r--r--net/ipv4/netfilter/iptable_filter.c155
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c189
-rw-r--r--net/ipv4/netfilter/iptable_nat.c328
-rw-r--r--net/ipv4/netfilter/iptable_raw.c125
-rw-r--r--net/ipv4/netfilter/iptable_security.c111
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c424
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c255
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c296
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c131
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c78
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c701
-rw-r--r--net/ipv4/netfilter/nf_nat_ftp.c165
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c324
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c444
-rw-r--r--net/ipv4/netfilter/nf_nat_irc.c92
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c281
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c81
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c92
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c50
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_tcp.c151
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udp.c141
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_unknown.c54
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c275
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c304
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c343
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c374
-rw-r--r--net/ipv4/netfilter/nf_nat_tftp.c52
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c104
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c129
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c199
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c90
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c75
-rw-r--r--net/ipv4/ping.c1218
-rw-r--r--net/ipv4/proc.c286
-rw-r--r--net/ipv4/protocol.c88
-rw-r--r--net/ipv4/raw.c553
-rw-r--r--net/ipv4/route.c3373
-rw-r--r--net/ipv4/syncookies.c299
-rw-r--r--net/ipv4/sysctl_net_ipv4.c904
-rw-r--r--net/ipv4/tcp.c1843
-rw-r--r--net/ipv4/tcp_bic.c36
-rw-r--r--net/ipv4/tcp_cong.c128
-rw-r--r--net/ipv4/tcp_cubic.c221
-rw-r--r--net/ipv4/tcp_diag.c25
-rw-r--r--net/ipv4/tcp_fastopen.c295
-rw-r--r--net/ipv4/tcp_highspeed.c10
-rw-r--r--net/ipv4/tcp_htcp.c36
-rw-r--r--net/ipv4/tcp_hybla.c46
-rw-r--r--net/ipv4/tcp_illinois.c24
-rw-r--r--net/ipv4/tcp_input.c4978
-rw-r--r--net/ipv4/tcp_ipv4.c2062
-rw-r--r--net/ipv4/tcp_lp.c14
-rw-r--r--net/ipv4/tcp_memcontrol.c228
-rw-r--r--net/ipv4/tcp_metrics.c1188
-rw-r--r--net/ipv4/tcp_minisocks.c378
-rw-r--r--net/ipv4/tcp_offload.c329
-rw-r--r--net/ipv4/tcp_output.c2708
-rw-r--r--net/ipv4/tcp_probe.c133
-rw-r--r--net/ipv4/tcp_scalable.c22
-rw-r--r--net/ipv4/tcp_timer.c352
-rw-r--r--net/ipv4/tcp_vegas.c141
-rw-r--r--net/ipv4/tcp_vegas.h10
-rw-r--r--net/ipv4/tcp_veno.c40
-rw-r--r--net/ipv4/tcp_westwood.c7
-rw-r--r--net/ipv4/tcp_yeah.c48
-rw-r--r--net/ipv4/tunnel4.c87
-rw-r--r--net/ipv4/udp.c1957
-rw-r--r--net/ipv4/udp_diag.c216
-rw-r--r--net/ipv4/udp_impl.h40
-rw-r--r--net/ipv4/udp_offload.c250
-rw-r--r--net/ipv4/udplite.c91
-rw-r--r--net/ipv4/xfrm4_input.c155
-rw-r--r--net/ipv4/xfrm4_mode_beet.c77
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c115
-rw-r--r--net/ipv4/xfrm4_output.c128
-rw-r--r--net/ipv4/xfrm4_policy.c422
-rw-r--r--net/ipv4/xfrm4_protocol.c301
-rw-r--r--net/ipv4/xfrm4_state.c64
-rw-r--r--net/ipv4/xfrm4_tunnel.c41
174 files changed, 39095 insertions, 41316 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d894f616c3d..05c57f0fcab 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -9,10 +9,7 @@ config IP_MULTICAST
intend to participate in the MBONE, a high bandwidth network on top
of the Internet which carries audio and video broadcasts. More
information about the MBONE is on the WWW at
- <http://www.savetz.com/mbone/>. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. For most people, it's
- safe to say N.
+ <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
config IP_ADVANCED_ROUTER
bool "IP: advanced router"
@@ -35,7 +32,7 @@ config IP_ADVANCED_ROUTER
at boot time after the /proc file system has been mounted.
- If you turn on IP forwarding, you will also get the rp_filter, which
+ If you turn on IP forwarding, you should consider the rp_filter, which
automatically rejects incoming packets if the routing table entry
for their source address doesn't match the network interface they're
arriving on. This has security advantages because it prevents the
@@ -46,44 +43,21 @@ config IP_ADVANCED_ROUTER
rp_filter on use:
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- or
+ or
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
If unsure, say N here.
-choice
- prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
+config IP_FIB_TRIE_STATS
+ bool "FIB TRIE statistics"
depends on IP_ADVANCED_ROUTER
- default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
- bool "FIB_HASH"
- ---help---
- Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
- bool "FIB_TRIE"
---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
-endchoice
-
-config IP_FIB_HASH
- def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
+ Keep track of statistics on structure of FIB TRIE table.
+ Useful for testing and measuring TRIE performance.
config IP_MULTIPLE_TABLES
bool "IP: policy routing"
@@ -127,6 +101,9 @@ config IP_ROUTE_VERBOSE
handled by the klogd daemon which is responsible for kernel messages
("man klogd").
+config IP_ROUTE_CLASSID
+ bool
+
config IP_PNP
bool "IP: kernel level autoconfiguration"
help
@@ -153,7 +130,7 @@ config IP_PNP_DHCP
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
must be operating on your network. Read
- <file:Documentation/nfsroot.txt> for details.
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
config IP_PNP_BOOTP
bool "IP: BOOTP support"
@@ -168,7 +145,7 @@ config IP_PNP_BOOTP
does BOOTP itself, providing all necessary information on the kernel
command line, you can say N here. If unsure, say Y. Note that if you
want to use BOOTP, a BOOTP server must be operating on your network.
- Read <file:Documentation/nfsroot.txt> for details.
+ Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
config IP_PNP_RARP
bool "IP: RARP support"
@@ -180,14 +157,13 @@ config IP_PNP_RARP
discovered automatically at boot time using the RARP protocol (an
older protocol which is being obsoleted by BOOTP and DHCP), say Y
here. Note that if you want to use RARP, a RARP server must be
- operating on your network. Read <file:Documentation/nfsroot.txt> for
- details.
+ operating on your network. Read
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
-# not yet ready..
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
+ select NET_IP_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -202,8 +178,20 @@ config NET_IPIP
be inserted in and removed from the running kernel whenever you
want). Most people won't need this and can say N.
+config NET_IPGRE_DEMUX
+ tristate "IP: GRE demultiplexer"
+ help
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
+
+config NET_IP_TUNNEL
+ tristate
+ default n
+
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ select NET_IP_TUNNEL
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -232,10 +220,22 @@ config IP_MROUTE
packets that have several destination addresses. It is needed on the
MBONE, a high bandwidth network on top of the Internet which carries
audio and video broadcasts. In order to do that, you would most
- likely run the program mrouted. Information about the multicast
- capabilities of the various network cards is contained in
- <file:Documentation/networking/multicast.txt>. If you haven't heard
- about it, you don't need it.
+ likely run the program mrouted. If you haven't heard about it, you
+ don't need it.
+
+config IP_MROUTE_MULTIPLE_TABLES
+ bool "IP: multicast policy routing"
+ depends on IP_MROUTE && IP_ADVANCED_ROUTER
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
config IP_PIMSM_V1
bool "IP: PIM-SM version 1 support"
@@ -259,33 +259,8 @@ config IP_PIMSM_V2
gated-5). This routing protocol is not used widely, so say N unless
you want to play with it.
-config ARPD
- bool "IP: ARP daemon support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- ---help---
- Normally, the kernel maintains an internal cache which maps IP
- addresses to hardware addresses on the local network, so that
- Ethernet/Token Ring/ etc. frames are sent to the proper address on
- the physical networking layer. For small networks having a few
- hundred directly connected hosts or less, keeping this address
- resolution (ARP) cache inside the kernel works well. However,
- maintaining an internal ARP cache does not work well for very large
- switched networks, and will use a lot of kernel memory if TCP/IP
- connections are made to many machines on the network.
-
- If you say Y here, the kernel's internal ARP cache will never grow
- to more than 256 entries (the oldest entries are expired in a LIFO
- manner) and communication will be attempted with the user space ARP
- daemon arpd. Arpd then answers the address resolution request either
- from its own cache or by asking the net.
-
- This code is experimental and also obsolete. If you want to use it,
- you need to find a version of the daemon arpd on the net somewhere,
- and you should also say Y to "Kernel/User network link driver",
- below. If unsure, say N.
-
config SYN_COOKIES
- bool "IP: TCP syncookie support (disabled per default)"
+ bool "IP: TCP syncookie support"
---help---
Normal TCP/IP networking is open to an attack known as "SYN
flooding". This denial-of-service attack prevents legitimate remote
@@ -310,19 +285,31 @@ config SYN_COOKIES
server is really overloaded. If this happens frequently better turn
them off.
- If you say Y here, note that SYN cookies aren't enabled by default;
- you can enable them by saying Y to "/proc file system support" and
+ If you say Y here, you can disable SYN cookies at run time by
+ saying Y to "/proc file system support" and
"Sysctl support" below and executing the command
- echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+ echo 0 > /proc/sys/net/ipv4/tcp_syncookies
- at boot time after the /proc file system has been mounted.
+ after the /proc file system has been mounted.
If unsure, say N.
+config NET_IPVTI
+ tristate "Virtual (secure) IP: tunneling"
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ depends on INET_XFRM_MODE_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use routing protocol
+ on top.
+
config INET_AH
tristate "IP: AH transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_MD5
@@ -334,8 +321,9 @@ config INET_AH
config INET_ESP
tristate "IP: ESP transformation"
- select XFRM
+ select XFRM_ALGO
select CRYPTO
+ select CRYPTO_AUTHENC
select CRYPTO_HMAC
select CRYPTO_MD5
select CRYPTO_CBC
@@ -348,14 +336,12 @@ config INET_ESP
config INET_IPCOMP
tristate "IP: IPComp transformation"
- select XFRM
select INET_XFRM_TUNNEL
- select CRYPTO
- select CRYPTO_DEFLATE
+ select XFRM_IPCOMP
---help---
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
-
+
If unsure, say Y.
config INET_XFRM_TUNNEL
@@ -396,7 +382,7 @@ config INET_XFRM_MODE_BEET
config INET_LRO
tristate "Large Receive Offload (ipv4/tcp)"
-
+ default y
---help---
Support for Large Receive Offload (ipv4/tcp).
@@ -408,14 +394,24 @@ config INET_DIAG
---help---
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
- downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
+ downloadable at:
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
+
If unsure, say Y.
config INET_TCP_DIAG
depends on INET_DIAG
def_tristate INET_DIAG
+config INET_UDP_DIAG
+ tristate "UDP: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for UDP socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
menuconfig TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
---help---
@@ -478,7 +474,6 @@ config TCP_CONG_HTCP
config TCP_CONG_HSTCP
tristate "High Speed TCP"
- depends on EXPERIMENTAL
default n
---help---
Sally Floyd's High Speed TCP (RFC 3649) congestion control.
@@ -489,7 +484,6 @@ config TCP_CONG_HSTCP
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
- depends on EXPERIMENTAL
default n
---help---
TCP-Hybla is a sender-side only change that eliminates penalization of
@@ -499,7 +493,6 @@ config TCP_CONG_HYBLA
config TCP_CONG_VEGAS
tristate "TCP Vegas"
- depends on EXPERIMENTAL
default n
---help---
TCP Vegas is a sender-side only change to TCP that anticipates
@@ -510,7 +503,6 @@ config TCP_CONG_VEGAS
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
- depends on EXPERIMENTAL
default n
---help---
Scalable TCP is a sender-side only change to TCP which uses a
@@ -520,7 +512,6 @@ config TCP_CONG_SCALABLE
config TCP_CONG_LP
tristate "TCP Low Priority"
- depends on EXPERIMENTAL
default n
---help---
TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
@@ -530,7 +521,6 @@ config TCP_CONG_LP
config TCP_CONG_VENO
tristate "TCP Veno"
- depends on EXPERIMENTAL
default n
---help---
TCP Veno is a sender-side only enhancement of TCP to obtain better
@@ -538,11 +528,10 @@ config TCP_CONG_VENO
distinguishing to circumvent the difficult judgment of the packet loss
type. TCP Veno cuts down less congestion window in response to random
loss packets.
- See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
- depends on EXPERIMENTAL
select TCP_CONG_VEGAS
default n
---help---
@@ -557,10 +546,9 @@ config TCP_CONG_YEAH
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
- depends on EXPERIMENTAL
default n
---help---
- TCP-Illinois is a sender-side modificatio of TCP Reno for
+ TCP-Illinois is a sender-side modification of TCP Reno for
high speed long delay links. It uses round-trip-time to
adjust the alpha and beta parameters to achieve a higher average
throughput and maintain fairness.
@@ -584,9 +572,15 @@ choice
config DEFAULT_HTCP
bool "Htcp" if TCP_CONG_HTCP=y
+ config DEFAULT_HYBLA
+ bool "Hybla" if TCP_CONG_HYBLA=y
+
config DEFAULT_VEGAS
bool "Vegas" if TCP_CONG_VEGAS=y
+ config DEFAULT_VENO
+ bool "Veno" if TCP_CONG_VENO=y
+
config DEFAULT_WESTWOOD
bool "Westwood" if TCP_CONG_WESTWOOD=y
@@ -607,14 +601,15 @@ config DEFAULT_TCP_CONG
default "bic" if DEFAULT_BIC
default "cubic" if DEFAULT_CUBIC
default "htcp" if DEFAULT_HTCP
+ default "hybla" if DEFAULT_HYBLA
default "vegas" if DEFAULT_VEGAS
default "westwood" if DEFAULT_WESTWOOD
+ default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
default "cubic"
config TCP_MD5SIG
- bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "TCP: MD5 Signature Option support (RFC2385)"
select CRYPTO
select CRYPTO_MD5
---help---
@@ -623,6 +618,3 @@ config TCP_MD5SIG
on the Internet.
If unsure, say N.
-
-source "net/ipv4/ipvs/Kconfig"
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 93fe3966805..f032688d20d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,19 +7,22 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o \
- datagram.o raw.o udp.o udplite.o \
- arp.o icmp.o devinet.o af_inet.o igmp.o \
- sysctl_net_ipv4.o fib_frontend.o fib_semantics.o \
- inet_fragment.o
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_offload.o datagram.o raw.o udp.o udplite.o \
+ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
+gre-y := gre_demux.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
obj-$(CONFIG_INET_ESP) += esp4.o
@@ -32,9 +35,9 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_IP_VS) += ipvs/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
@@ -48,7 +51,8 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
- xfrm4_output.o
+ xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 621b128897d..d156b3c5f36 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -5,8 +5,6 @@
*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
@@ -67,6 +65,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "IPv4: " fmt
+
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -88,14 +88,15 @@
#include <linux/poll.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/inet.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
+#include <net/checksum.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
@@ -105,20 +106,19 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
+#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
-#include <net/ipip.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
-
-extern void ip_mc_drop_socket(struct sock *sk);
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
@@ -135,25 +135,29 @@ void inet_sock_destruct(struct sock *sk)
__skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue);
+ sk_mem_reclaim(sk);
+
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
- printk("Attempt to release TCP socket in state %d %p\n",
+ pr_err("Attempt to release TCP socket in state %d %p\n",
sk->sk_state, sk);
return;
}
if (!sock_flag(sk, SOCK_DEAD)) {
- printk("Attempt to release alive inet socket %p\n", sk);
+ pr_err("Attempt to release alive inet socket %p\n", sk);
return;
}
- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
- BUG_TRAP(!sk->sk_wmem_queued);
- BUG_TRAP(!sk->sk_forward_alloc);
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
- kfree(inet->opt);
- dst_release(sk->sk_dst_cache);
+ kfree(rcu_dereference_protected(inet->inet_opt, 1));
+ dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ dst_release(sk->sk_rx_dst);
sk_refcnt_debug_dec(sk);
}
+EXPORT_SYMBOL(inet_sock_destruct);
/*
* The routines beyond this point handle the behaviour of an AF_INET
@@ -171,12 +175,12 @@ static int inet_autobind(struct sock *sk)
/* We may need to bind the socket. */
lock_sock(sk);
inet = inet_sk(sk);
- if (!inet->num) {
+ if (!inet->inet_num) {
if (sk->sk_prot->get_port(sk, 0)) {
release_sock(sk);
return -EAGAIN;
}
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
}
release_sock(sk);
return 0;
@@ -205,6 +209,26 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
+ /* Check special setups for testing purpose to enable TFO w/o
+ * requiring TCP_FASTOPEN sockopt.
+ * Note that only TCP sockets (SOCK_STREAM) will reach here.
+ * Also fastopenq may already been allocated because this
+ * socket was in TCP_LISTEN state previously but was
+ * shutdown() (rather than close()).
+ */
+ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+ inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+ if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+ err = fastopen_init_queue(sk, backlog);
+ else if ((sysctl_tcp_fastopen &
+ TFO_SERVER_WO_SOCKOPT2) != 0)
+ err = fastopen_init_queue(sk,
+ ((uint)sysctl_tcp_fastopen) >> 16);
+ else
+ err = 0;
+ if (err)
+ goto out;
+ }
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
@@ -216,61 +240,32 @@ out:
release_sock(sk);
return err;
}
-
-u32 inet_ehash_secret __read_mostly;
-EXPORT_SYMBOL(inet_ehash_secret);
-
-/*
- * inet_ehash_secret must be set exactly once
- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
- */
-void build_ehash_secret(void)
-{
- u32 rnd;
- do {
- get_random_bytes(&rnd, sizeof(rnd));
- } while (rnd == 0);
- spin_lock_bh(&inetsw_lock);
- if (!inet_ehash_secret)
- inet_ehash_secret = rnd;
- spin_unlock_bh(&inetsw_lock);
-}
-EXPORT_SYMBOL(build_ehash_secret);
+EXPORT_SYMBOL(inet_listen);
/*
* Create an inet socket.
*/
-static int inet_create(struct net *net, struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
- struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
- char answer_no_check;
int try_loading_module = 0;
int err;
- if (net != &init_net)
- return -EAFNOSUPPORT;
-
- if (sock->type != SOCK_RAW &&
- sock->type != SOCK_DGRAM &&
- !inet_ehash_secret)
- build_ehash_secret();
-
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
- answer = NULL;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
- list_for_each_rcu(p, &inetsw[sock->type]) {
- answer = list_entry(p, struct inet_protosw, list);
+ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+ err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
@@ -285,10 +280,9 @@ lookup_protocol:
break;
}
err = -EPROTONOSUPPORT;
- answer = NULL;
}
- if (unlikely(answer == NULL)) {
+ if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
@@ -311,65 +305,67 @@ lookup_protocol:
}
err = -EPERM;
- if (answer->capability > 0 && !capable(answer->capability))
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
answer_prot = answer->prot;
- answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
- BUG_TRAP(answer_prot->slab != NULL);
+ WARN_ON(answer_prot->slab == NULL);
err = -ENOBUFS;
- sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, 1);
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
if (sk == NULL)
goto out;
err = 0;
- sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
- sk->sk_reuse = 1;
+ sk->sk_reuse = SK_CAN_REUSE;
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+ inet->nodefrag = 0;
+
if (SOCK_RAW == sock->type) {
- inet->num = protocol;
+ inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
- if (ipv4_config.no_pmtu_disc)
+ if (net->ipv4.sysctl_ip_no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->id = 0;
+ inet->inet_id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
- sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
+ inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
+ inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
- if (inet->num) {
+ if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
@@ -399,6 +395,8 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+ sock_rps_reset_flow(sk);
+
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -418,15 +416,18 @@ int inet_release(struct socket *sock)
}
return 0;
}
+EXPORT_SYMBOL(inet_release);
/* It is off by default, see below. */
int sysctl_ip_nonlocal_bind __read_mostly;
+EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
int err;
@@ -440,7 +441,17 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ if (addr->sin_family != AF_INET) {
+ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+ * only if s_addr is INADDR_ANY.
+ */
+ err = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_UNSPEC ||
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
+ }
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
@@ -451,8 +462,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
err = -EADDRNOTAVAIL;
if (!sysctl_ip_nonlocal_bind &&
- !inet->freebind &&
- addr->sin_addr.s_addr != INADDR_ANY &&
+ !(inet->freebind || inet->transparent) &&
+ addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
chk_addr_ret != RTN_BROADCAST)
@@ -460,7 +471,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ if (snum && snum < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
@@ -474,27 +486,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* Check these errors (active socket, double bind). */
err = -EINVAL;
- if (sk->sk_state != TCP_CLOSE || inet->num)
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
- inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->saddr = 0; /* Use device */
+ inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) {
- inet->saddr = inet->rcv_saddr = 0;
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
- if (inet->rcv_saddr)
+ if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
- inet->sport = htons(inet->num);
- inet->daddr = 0;
- inet->dport = 0;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
@@ -502,25 +514,30 @@ out_release_sock:
out:
return err;
}
+EXPORT_SYMBOL(inet_bind);
-int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC)
return sk->sk_prot->disconnect(sk, flags);
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
}
+EXPORT_SYMBOL(inet_dgram_connect);
-static long inet_wait_for_connect(struct sock *sk, long timeo)
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
DEFINE_WAIT(wait);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending += writebias;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -533,9 +550,10 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
lock_sock(sk);
if (signal_pending(current) || !timeo)
break;
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
+ sk->sk_write_pending -= writebias;
return timeo;
}
@@ -543,14 +561,15 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
- int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
long timeo;
- lock_sock(sk);
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
@@ -591,8 +610,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
/* Error code is set above */
- if (!timeo || !inet_wait_for_connect(sk, timeo))
+ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
goto out;
err = sock_intr_errno(timeo);
@@ -614,7 +637,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTED;
err = 0;
out:
- release_sock(sk);
return err;
sock_error:
@@ -624,6 +646,19 @@ sock_error:
sock->state = SS_DISCONNECTING;
goto out;
}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ release_sock(sock->sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_stream_connect);
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
@@ -640,8 +675,10 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
lock_sock(sk2);
- BUG_TRAP((1 << sk2->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
+ sock_rps_record_flow(sk2);
+ WARN_ON(!((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_CLOSE_WAIT | TCPF_CLOSE)));
sock_graft(sk2, newsock);
@@ -651,6 +688,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
do_err:
return err;
}
+EXPORT_SYMBOL(inet_accept);
/*
@@ -661,54 +699,79 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
sin->sin_family = AF_INET;
if (peer) {
- if (!inet->dport ||
+ if (!inet->inet_dport ||
(((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
peer == 1))
return -ENOTCONN;
- sin->sin_port = inet->dport;
- sin->sin_addr.s_addr = inet->daddr;
+ sin->sin_port = inet->inet_dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
} else {
- __be32 addr = inet->rcv_saddr;
+ __be32 addr = inet->inet_rcv_saddr;
if (!addr)
- addr = inet->saddr;
- sin->sin_port = inet->sport;
+ addr = inet->inet_saddr;
+ sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
}
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*uaddr_len = sizeof(*sin);
return 0;
}
+EXPORT_SYMBOL(inet_getname);
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
+EXPORT_SYMBOL(inet_sendmsg);
-
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
{
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
return -EAGAIN;
if (sk->sk_prot->sendpage)
return sk->sk_prot->sendpage(sk, page, offset, size, flags);
return sock_no_sendpage(sock, page, offset, size, flags);
}
+EXPORT_SYMBOL(inet_sendpage);
+
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+ sock_rps_record_flow(sk);
+
+ err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
int inet_shutdown(struct socket *sock, int how)
{
@@ -763,6 +826,7 @@ int inet_shutdown(struct socket *sock, int how)
release_sock(sk);
return err;
}
+EXPORT_SYMBOL(inet_shutdown);
/*
* ioctl() calls you can issue on an INET socket. Most of these are
@@ -778,46 +842,61 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
int err = 0;
+ struct net *net = sock_net(sk);
switch (cmd) {
- case SIOCGSTAMP:
- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
- break;
- case SIOCGSTAMPNS:
- err = sock_get_timestampns(sk, (struct timespec __user *)arg);
- break;
- case SIOCADDRT:
- case SIOCDELRT:
- case SIOCRTMSG:
- err = ip_rt_ioctl(cmd, (void __user *)arg);
- break;
- case SIOCDARP:
- case SIOCGARP:
- case SIOCSARP:
- err = arp_ioctl(cmd, (void __user *)arg);
- break;
- case SIOCGIFADDR:
- case SIOCSIFADDR:
- case SIOCGIFBRDADDR:
- case SIOCSIFBRDADDR:
- case SIOCGIFNETMASK:
- case SIOCSIFNETMASK:
- case SIOCGIFDSTADDR:
- case SIOCSIFDSTADDR:
- case SIOCSIFPFLAGS:
- case SIOCGIFPFLAGS:
- case SIOCSIFFLAGS:
- err = devinet_ioctl(cmd, (void __user *)arg);
- break;
- default:
- if (sk->sk_prot->ioctl)
- err = sk->sk_prot->ioctl(sk, cmd, arg);
- else
- err = -ENOIOCTLCMD;
- break;
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+ case SIOCGSTAMPNS:
+ err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ err = ip_rt_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ err = devinet_ioctl(net, cmd, (void __user *)arg);
+ break;
+ default:
+ if (sk->sk_prot->ioctl)
+ err = sk->sk_prot->ioctl(sk, cmd, arg);
+ else
+ err = -ENOIOCTLCMD;
+ break;
}
return err;
}
+EXPORT_SYMBOL(inet_ioctl);
+
+#ifdef CONFIG_COMPAT
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = -ENOIOCTLCMD;
+
+ if (sk->sk_prot->compat_ioctl)
+ err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+ return err;
+}
+#endif
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
@@ -834,15 +913,18 @@ const struct proto_ops inet_stream_ops = {
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
- .sendmsg = tcp_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = tcp_sendpage,
+ .sendpage = inet_sendpage,
+ .splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
+EXPORT_SYMBOL(inet_stream_ops);
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
@@ -860,14 +942,16 @@ const struct proto_ops inet_dgram_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
+EXPORT_SYMBOL(inet_dgram_ops);
/*
* For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
@@ -889,16 +973,17 @@ static const struct proto_ops inet_sockraw_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
#endif
};
-static struct net_proto_family inet_family_ops = {
+static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
@@ -914,8 +999,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .capability = -1,
- .no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
@@ -925,19 +1008,22 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .capability = -1,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMP,
+ .prot = &ping_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_REUSE,
+ },
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .capability = CAP_NET_RAW,
- .no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
@@ -984,27 +1070,23 @@ void inet_register_protosw(struct inet_protosw *p)
out:
spin_unlock_bh(&inetsw_lock);
- synchronize_net();
-
return;
out_permanent:
- printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
- protocol);
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
- printk(KERN_ERR
- "Ignoring attempt to register invalid socket type %d.\n",
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
+EXPORT_SYMBOL(inet_register_protosw);
void inet_unregister_protosw(struct inet_protosw *p)
{
if (INET_PROTOSW_PERMANENT & p->flags) {
- printk(KERN_ERR
- "Attempt to unregister permanent protocol %d.\n",
+ pr_err("Attempt to unregister permanent protocol %d\n",
p->protocol);
} else {
spin_lock_bh(&inetsw_lock);
@@ -1014,6 +1096,7 @@ void inet_unregister_protosw(struct inet_protosw *p)
synchronize_net();
}
}
+EXPORT_SYMBOL(inet_unregister_protosw);
/*
* Shall we try to damage output packets if routing dev changes?
@@ -1024,40 +1107,39 @@ int sysctl_ip_dynaddr __read_mostly;
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
- int err;
+ __be32 old_saddr = inet->inet_saddr;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
struct rtable *rt;
- __be32 old_saddr = inet->saddr;
__be32 new_saddr;
- __be32 daddr = inet->daddr;
+ struct ip_options_rcu *inet_opt;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
/* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- sk->sk_protocol,
- inet->sport, inet->dport, sk, 0);
- if (err)
- return err;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if, sk->sk_protocol,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
- new_saddr = rt->rt_src;
+ new_saddr = fl4->saddr;
if (new_saddr == old_saddr)
return 0;
if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "%s(): shifting inet->"
- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- __FUNCTION__,
- NIPQUAD(old_saddr),
- NIPQUAD(new_saddr));
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
}
- inet->saddr = inet->rcv_saddr = new_saddr;
+ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
/*
* XXX The only one ugly spot where we need to
@@ -1076,6 +1158,8 @@ int inet_sk_rebuild_header(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
__be32 daddr;
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
int err;
/* Route is OK, nothing to do. */
@@ -1083,34 +1167,23 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- daddr = inet->daddr;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-{
- struct flowi fl = {
- .oif = sk->sk_bound_dev_if,
- .nl_u = {
- .ip4_u = {
- .daddr = daddr,
- .saddr = inet->saddr,
- .tos = RT_CONN_FLAGS(sk),
- },
- },
- .proto = sk->sk_protocol,
- .uli_u = {
- .ports = {
- .sport = inet->sport,
- .dport = inet->dport,
- },
- },
- };
-
- security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, 0);
-}
- if (!err)
- sk_setup_caps(sk, &rt->u.dst);
- else {
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ daddr = inet->inet_daddr;
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rcu_read_unlock();
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (!IS_ERR(rt)) {
+ err = 0;
+ sk_setup_caps(sk, &rt->dst);
+ } else {
+ err = PTR_ERR(rt);
+
/* Routing failed... */
sk->sk_route_caps = 0;
/*
@@ -1126,13 +1199,12 @@ int inet_sk_rebuild_header(struct sock *sk)
return err;
}
-
EXPORT_SYMBOL(inet_sk_rebuild_header);
static int inet_gso_send_check(struct sk_buff *skb)
{
- struct iphdr *iph;
- struct net_protocol *ops;
+ const struct net_offload *ops;
+ const struct iphdr *iph;
int proto;
int ihl;
int err = -EINVAL;
@@ -1145,45 +1217,55 @@ static int inet_gso_send_check(struct sk_buff *skb)
if (ihl < sizeof(*iph))
goto out;
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
-
__skb_pull(skb, ihl);
+
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
err = -EPROTONOSUPPORT;
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_send_check))
- err = ops->gso_send_check(skb);
- rcu_read_unlock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_send_check))
+ err = ops->callbacks.gso_send_check(skb);
out:
return err;
}
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ unsigned int offset = 0;
+ bool udpfrag, encap;
struct iphdr *iph;
- struct net_protocol *ops;
int proto;
+ int nhoff;
int ihl;
int id;
- if (!(features & NETIF_F_V4_CSUM))
- features &= ~NETIF_F_SG;
-
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_MPLS |
0)))
goto out;
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
goto out;
@@ -1192,174 +1274,434 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
if (ihl < sizeof(*iph))
goto out;
+ id = ntohs(iph->id);
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
-
__skb_pull(skb, ihl);
+
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ SKB_GSO_CB(skb)->encap_level += ihl;
+
skb_reset_transport_header(skb);
- iph = ip_hdr(skb);
- id = ntohs(iph->id);
- proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
segs = ERR_PTR(-EPROTONOSUPPORT);
- rcu_read_lock();
- ops = rcu_dereference(inet_protos[proto]);
- if (likely(ops && ops->gso_segment))
- segs = ops->gso_segment(skb, features);
- rcu_read_unlock();
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ udpfrag = proto == IPPROTO_UDP && encap;
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
- if (!segs || unlikely(IS_ERR(segs)))
+ if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
do {
- iph = ip_hdr(skb);
- iph->id = htons(id++);
- iph->tot_len = htons(skb->len - skb->mac_len);
- iph->check = 0;
- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+ if (udpfrag) {
+ iph->id = htons(id);
+ iph->frag_off = htons(offset >> 3);
+ if (skb->next != NULL)
+ iph->frag_off |= htons(IP_MF);
+ offset += skb->len - nhoff - ihl;
+ } else {
+ iph->id = htons(id++);
+ }
+ iph->tot_len = htons(skb->len - nhoff);
+ ip_send_check(iph);
+ if (encap)
+ skb_reset_inner_headers(skb);
+ skb->network_header = (u8 *)iph - skb->head;
} while ((skb = skb->next));
out:
return segs;
}
-unsigned long snmp_fold_field(void *mib[], int offt)
+static struct sk_buff **inet_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct iphdr *iph;
+ unsigned int hlen;
+ unsigned int off;
+ unsigned int id;
+ int flush = 1;
+ int proto;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ iph = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+ }
+
+ proto = iph->protocol;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out_unlock;
+
+ if (*(u8 *)iph != 0x45)
+ goto out_unlock;
+
+ if (unlikely(ip_fast_csum((u8 *)iph, 5)))
+ goto out_unlock;
+
+ id = ntohl(*(__be32 *)&iph->id);
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
+ id >>= 16;
+
+ for (p = *head; p; p = p->next) {
+ struct iphdr *iph2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ iph2 = (struct iphdr *)(p->data + off);
+ /* The above works because, with the exception of the top
+ * (inner most) layer, we only aggregate pkts with the same
+ * hdr length so all the hdrs we'll need to verify will start
+ * at the same offset.
+ */
+ if ((iph->protocol ^ iph2->protocol) |
+ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* All fields must match except length and checksum. */
+ NAPI_GRO_CB(p)->flush |=
+ (iph->ttl ^ iph2->ttl) |
+ (iph->tos ^ iph2->tos) |
+ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+
+ /* Save the IP ID check to be included later when we get to
+ * the transport layer so only the inner most IP ID is checked.
+ * This is because some GSO/TSO implementations do not
+ * correctly increment the IP ID for the outer hdrs.
+ */
+ NAPI_GRO_CB(p)->flush_id =
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+ NAPI_GRO_CB(p)->flush |= flush;
+ }
+
+ NAPI_GRO_CB(skb)->flush |= flush;
+ skb_set_network_header(skb, off);
+ /* The above will be needed by the transport layer if there is one
+ * immediately following this IP hdr.
+ */
+
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ __be16 newlen = htons(skb->len - nhoff);
+ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ int proto = iph->protocol;
+ int err = -ENOSYS;
+
+ if (skb->encapsulation)
+ skb_set_inner_network_header(skb, nhoff);
+
+ csum_replace2(&iph->check, iph->tot_len, newlen);
+ iph->tot_len = newlen;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ /* Only need to add sizeof(*iph) to get to the next hdr below
+ * because any hdr with option will have been flushed in
+ * inet_gro_receive().
+ */
+ err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+ unsigned short type, unsigned char protocol,
+ struct net *net)
+{
+ struct socket *sock;
+ int rc = sock_create_kern(family, type, protocol, &sock);
+
+ if (rc == 0) {
+ *sk = sock->sk;
+ (*sk)->sk_allocation = GFP_ATOMIC;
+ /*
+ * Unhash it so that IP input processing does not even see it,
+ * we do not wish this socket to see incoming packets.
+ */
+ (*sk)->sk_prot->unhash(*sk);
+
+ sk_change_net(*sk, net);
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
int i;
- for_each_possible_cpu(i) {
- res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
- res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
- }
+ for_each_possible_cpu(i)
+ res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
return res;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
-int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
-{
- BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize);
- if (!ptr[0])
- goto err0;
- ptr[1] = __alloc_percpu(mibsize);
- if (!ptr[1])
- goto err1;
- return 0;
-err1:
- free_percpu(ptr[0]);
- ptr[0] = NULL;
-err0:
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(snmp_mib_init);
+#if BITS_PER_LONG==32
-void snmp_mib_free(void *ptr[2])
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
- BUG_ON(ptr == NULL);
- free_percpu(ptr[0]);
- free_percpu(ptr[1]);
- ptr[0] = ptr[1] = NULL;
+ u64 res = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *bhptr;
+ struct u64_stats_sync *syncp;
+ u64 v;
+ unsigned int start;
+
+ bhptr = per_cpu_ptr(mib, cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin_irq(syncp);
+ v = *(((u64 *) bhptr) + offt);
+ } while (u64_stats_fetch_retry_irq(syncp, start));
+
+ res += v;
+ }
+ return res;
}
-EXPORT_SYMBOL_GPL(snmp_mib_free);
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
#ifdef CONFIG_IP_MULTICAST
-static struct net_protocol igmp_protocol = {
+static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
+ .netns_ok = 1,
};
#endif
-static struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .no_policy = 1,
+static const struct net_protocol tcp_protocol = {
+ .early_demux = tcp_v4_early_demux,
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
};
-static struct net_protocol udp_protocol = {
+static const struct net_protocol udp_protocol = {
+ .early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
.no_policy = 1,
+ .netns_ok = 1,
};
-static struct net_protocol icmp_protocol = {
+static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
+ .err_handler = icmp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
};
-static int __init init_ipv4_mibs(void)
+static __net_init int ipv4_mib_init_net(struct net *net)
{
- if (snmp_mib_init((void **)net_statistics,
- sizeof(struct linux_mib),
- __alignof__(struct linux_mib)) < 0)
- goto err_net_mib;
- if (snmp_mib_init((void **)ip_statistics,
- sizeof(struct ipstats_mib),
- __alignof__(struct ipstats_mib)) < 0)
- goto err_ip_mib;
- if (snmp_mib_init((void **)icmp_statistics,
- sizeof(struct icmp_mib),
- __alignof__(struct icmp_mib)) < 0)
- goto err_icmp_mib;
- if (snmp_mib_init((void **)icmpmsg_statistics,
- sizeof(struct icmpmsg_mib),
- __alignof__(struct icmpmsg_mib)) < 0)
- goto err_icmpmsg_mib;
- if (snmp_mib_init((void **)tcp_statistics,
- sizeof(struct tcp_mib),
- __alignof__(struct tcp_mib)) < 0)
+ int i;
+
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
goto err_tcp_mib;
- if (snmp_mib_init((void **)udp_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
+ goto err_ip_mib;
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet_stats;
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+ u64_stats_init(&af_inet_stats->syncp);
+ }
+
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
+ goto err_net_mib;
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
goto err_udp_mib;
- if (snmp_mib_init((void **)udplite_statistics,
- sizeof(struct udp_mib),
- __alignof__(struct udp_mib)) < 0)
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
goto err_udplite_mib;
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
+ goto err_icmp_mib;
+ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpmsg_statistics)
+ goto err_icmpmsg_mib;
- tcp_mib_init();
-
+ tcp_mib_init(net);
return 0;
-err_udplite_mib:
- snmp_mib_free((void **)udp_statistics);
-err_udp_mib:
- snmp_mib_free((void **)tcp_statistics);
-err_tcp_mib:
- snmp_mib_free((void **)icmpmsg_statistics);
err_icmpmsg_mib:
- snmp_mib_free((void **)icmp_statistics);
+ free_percpu(net->mib.icmp_statistics);
err_icmp_mib:
- snmp_mib_free((void **)ip_statistics);
-err_ip_mib:
- snmp_mib_free((void **)net_statistics);
+ free_percpu(net->mib.udplite_statistics);
+err_udplite_mib:
+ free_percpu(net->mib.udp_statistics);
+err_udp_mib:
+ free_percpu(net->mib.net_statistics);
err_net_mib:
+ free_percpu(net->mib.ip_statistics);
+err_ip_mib:
+ free_percpu(net->mib.tcp_statistics);
+err_tcp_mib:
return -ENOMEM;
}
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+ kfree(net->mib.icmpmsg_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
+}
+
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+ .init = ipv4_mib_init_net,
+ .exit = ipv4_mib_exit_net,
+};
+
+static int __init init_ipv4_mibs(void)
+{
+ return register_pernet_subsys(&ipv4_mib_ops);
+}
+
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ seqlock_init(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = 32768;
+ net->ipv4.ip_local_ports.range[1] = 61000;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+ return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+ .exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
static int ipv4_proc_init(void);
/*
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
- .type = __constant_htons(ETH_P_IP),
+static struct packet_offload ip_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static const struct net_offload ipip_offload = {
+ .callbacks = {
+ .gso_send_check = inet_gso_send_check,
+ .gso_segment = inet_gso_segment,
+ },
+};
+
+static int __init ipv4_offload_init(void)
+{
+ /*
+ * Add offloads
+ */
+ if (udpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (tcpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+
+ dev_add_offload(&ip_packet_offload);
+ inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+ return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
+static struct packet_type ip_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
- .gso_send_check = inet_gso_send_check,
- .gso_segment = inet_gso_segment,
};
static int __init inet_init(void)
{
- struct sk_buff *dummy_skb;
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+ BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
rc = proto_register(&tcp_prot, 1);
if (rc)
@@ -1373,25 +1715,33 @@ static int __init inet_init(void)
if (rc)
goto out_unregister_udp_proto;
+ rc = proto_register(&ping_prot, 1);
+ if (rc)
+ goto out_unregister_raw_proto;
+
/*
* Tell SOCKET that we are alive...
*/
(void)sock_register(&inet_family_ops);
+#ifdef CONFIG_SYSCTL
+ ip_static_sysctl_init();
+#endif
+
/*
* Add all the base protocols.
*/
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+ pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+ pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+ pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
- printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+ pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
@@ -1413,32 +1763,42 @@ static int __init inet_init(void)
ip_init();
- tcp_v4_init(&inet_family_ops);
+ tcp_v4_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ ping_init();
+
/*
* Set the ICMP layer up
*/
- icmp_init(&inet_family_ops);
+ if (icmp_init() < 0)
+ panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
- ip_mr_init();
+ if (ip_mr_init())
+ pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
+
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
+ pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
@@ -1449,6 +1809,8 @@ static int __init inet_init(void)
rc = 0;
out:
return rc;
+out_unregister_raw_proto:
+ proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
@@ -1471,15 +1833,15 @@ static int __init ipv4_proc_init(void)
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
- if (fib_proc_init())
- goto out_fib;
+ if (ping_proc_init())
+ goto out_ping;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
- fib_proc_exit();
-out_fib:
+ ping_proc_exit();
+out_ping:
udp4_proc_exit();
out_udp:
tcp4_proc_exit();
@@ -1499,20 +1861,3 @@ static int __init ipv4_proc_init(void)
MODULE_ALIAS_NETPROTO(PF_INET);
-EXPORT_SYMBOL(inet_accept);
-EXPORT_SYMBOL(inet_bind);
-EXPORT_SYMBOL(inet_dgram_connect);
-EXPORT_SYMBOL(inet_dgram_ops);
-EXPORT_SYMBOL(inet_getname);
-EXPORT_SYMBOL(inet_ioctl);
-EXPORT_SYMBOL(inet_listen);
-EXPORT_SYMBOL(inet_register_protosw);
-EXPORT_SYMBOL(inet_release);
-EXPORT_SYMBOL(inet_sendmsg);
-EXPORT_SYMBOL(inet_shutdown);
-EXPORT_SYMBOL(inet_sock_destruct);
-EXPORT_SYMBOL(inet_stream_connect);
-EXPORT_SYMBOL(inet_stream_ops);
-EXPORT_SYMBOL(inet_unregister_protosw);
-EXPORT_SYMBOL(net_statistics);
-EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 4e8e3b079f5..a2afa89513a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,23 +1,83 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ah.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
-#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
#include <net/icmp.h>
#include <net/protocol.h>
-#include <asm/scatterlist.h>
+struct ah_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+ unsigned int size)
+{
+ unsigned int len;
+
+ len = size + crypto_ahash_digestsize(ahash) +
+ (crypto_ahash_alignmask(ahash) &
+ ~(crypto_tfm_ctx_alignment() - 1));
+
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+ len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+ unsigned int offset)
+{
+ return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+ u8 *icv)
+{
+ struct ahash_request *req;
+
+ req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+ crypto_tfm_ctx_alignment());
+
+ ahash_request_set_tfm(req, ahash);
+
+ return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+ struct ahash_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_ahash_reqsize(ahash),
+ __alignof__(struct scatterlist));
+}
/* Clear mutable options and find final destination to substitute
* into IP header for icv calculation. Options are already checked
* for validity, so paranoia is not required. */
-static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
{
- unsigned char * optptr = (unsigned char*)(iph+1);
+ unsigned char *optptr = (unsigned char *)(iph+1);
int l = iph->ihl*4 - sizeof(struct iphdr);
int optlen;
@@ -55,20 +115,79 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
return 0;
}
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+ u8 *icv;
+ struct iphdr *iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ struct ah_data *ahp = x->data;
+ struct iphdr *top_iph = ip_hdr(skb);
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+
+ iph = AH_SKB_CB(skb)->tmp;
+ icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
+ int nfrags;
+ int ihl;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
struct iphdr *iph, *top_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
- union {
- struct iphdr iph;
- char buf[60];
- } tmp_iph;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
skb_push(skb, -skb_network_offset(skb));
+ ah = ip_auth_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+ err = -ENOMEM;
+ iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
+ if (!iph)
+ goto out;
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
top_iph = ip_hdr(skb);
- iph = &tmp_iph.iph;
iph->tos = top_iph->tos;
iph->ttl = top_iph->ttl;
@@ -79,10 +198,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
if (err)
- goto error;
+ goto out_free;
}
- ah = ip_auth_hdr(skb);
ah->nexthdr = *skb_mac_header(skb);
*skb_mac_header(skb) = IPPROTO_AH;
@@ -92,20 +210,39 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->ttl = 0;
top_iph->check = 0;
- ahp = x->data;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
- ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq);
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
- spin_lock_bh(&x->lock);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
- memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
- spin_unlock_bh(&x->lock);
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
- if (err)
- goto error;
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+ AH_SKB_CB(skb)->tmp = iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+ goto out_free;
+ }
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
top_iph->tos = iph->tos;
top_iph->ttl = iph->ttl;
@@ -115,51 +252,127 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
}
- err = 0;
-
-error:
+out_free:
+ kfree(iph);
+out:
return err;
}
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+ u8 *auth_data;
+ u8 *icv;
+ struct iphdr *work_iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ah_data *ahp = x->data;
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+ int ah_hlen = (ah->hdrlen + 2) << 2;
+
+ work_iph = AH_SKB_CB(skb)->tmp;
+ auth_data = ah_tmp_auth(work_iph, ihl);
+ icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ if (err)
+ goto out;
+
+ err = ah->nexthdr;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
+ __skb_pull(skb, ah_hlen + ihl);
+
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+out:
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_input_resume(skb, err);
+}
+
static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ah_hlen;
int ihl;
int nexthdr;
- int err = -EINVAL;
- struct iphdr *iph;
+ int nfrags;
+ u8 *auth_data;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct iphdr *iph, *work_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
- char work_buf[60];
+ int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
if (!pskb_may_pull(skb, sizeof(*ah)))
goto out;
ah = (struct ip_auth_hdr *)skb->data;
ahp = x->data;
+ ahash = ahp->ahash;
+
nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
- if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
- goto out;
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
if (!pskb_may_pull(skb, ah_hlen))
goto out;
/* We are going to _remove_ AH header to keep sockets happy,
* so... Later this can change. */
- if (skb_cloned(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_unclone(skb, GFP_ATOMIC))
goto out;
skb->ip_summed = CHECKSUM_NONE;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
ah = (struct ip_auth_hdr *)skb->data;
iph = ip_hdr(skb);
+ ihl = ip_hdrlen(skb);
- ihl = skb->data - skb_network_header(skb);
- memcpy(work_buf, iph, ihl);
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+ ahp->icv_trunc_len + seqhi_len);
+ if (!work_iph)
+ goto out;
+
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
+ icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memcpy(work_iph, iph, ihl);
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
iph->ttl = 0;
iph->tos = 0;
@@ -167,57 +380,90 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
iph->check = 0;
if (ihl > sizeof(*iph)) {
__be32 dummy;
- if (ip_clear_mutable_options(iph, &dummy))
- goto out;
+ err = ip_clear_mutable_options(iph, &dummy);
+ if (err)
+ goto out_free;
}
- {
- u8 auth_data[MAX_AH_AUTH_LEN];
- memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
- skb_push(skb, ihl);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
- if (err)
- goto out;
- err = -EINVAL;
- if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) {
- x->stats.integrity_failed++;
+ skb_push(skb, ihl);
+
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+ AH_SKB_CB(skb)->tmp = work_iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
goto out;
- }
+
+ goto out_free;
}
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ if (err)
+ goto out_free;
+
skb->network_header += ah_hlen;
- memcpy(skb_network_header(skb), work_buf, ihl);
- skb->transport_header = skb->network_header;
+ memcpy(skb_network_header(skb), work_iph, ihl);
__skb_pull(skb, ah_hlen + ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
- return nexthdr;
+ err = nexthdr;
+out_free:
+ kfree (work_iph);
out:
return err;
}
-static void ah4_err(struct sk_buff *skb, u32 info)
+static int ah4_err(struct sk_buff *skb, u32 info)
{
- struct iphdr *iph = (struct iphdr*)skb->data;
- struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ ah->spi, IPPROTO_AH, AF_INET);
if (!x)
- return;
- printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
- ntohl(ah->spi), ntohl(iph->daddr));
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
xfrm_state_put(x);
+
+ return 0;
}
static int ah_init_state(struct xfrm_state *x)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
- struct crypto_hash *tfm;
+ struct crypto_ahash *ahash;
if (!x->aalg)
goto error;
@@ -226,46 +472,47 @@ static int ah_init_state(struct xfrm_state *x)
goto error;
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
- if (ahp == NULL)
+ if (!ahp)
return -ENOMEM;
- tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
+ ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+ if (IS_ERR(ahash))
goto error;
- ahp->tfm = tfm;
- if (crypto_hash_setkey(tfm, x->aalg->alg_key,
- (x->aalg->alg_key_len + 7) / 8))
+ ahp->ahash = ahash;
+ if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8))
goto error;
/*
* Lookup the algorithm description maintained by xfrm_algo,
* verify crypto transform properties, and store information
* we need for AH processing. This lookup cannot fail here
- * after a successful crypto_alloc_hash().
+ * after a successful crypto_alloc_ahash().
*/
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
BUG_ON(!aalg_desc);
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
- crypto_hash_digestsize(tfm)) {
- printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_hash_digestsize(tfm),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ crypto_ahash_digestsize(ahash)) {
+ pr_info("%s: %s digestsize %u != %hu\n",
+ __func__, x->aalg->alg_name,
+ crypto_ahash_digestsize(ahash),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
goto error;
}
ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
- ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
- if (!ahp->work_icv)
- goto error;
-
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
- ahp->icv_trunc_len);
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
x->data = ahp;
@@ -274,8 +521,7 @@ static int ah_init_state(struct xfrm_state *x)
error:
if (ahp) {
- kfree(ahp->work_icv);
- crypto_free_hash(ahp->tfm);
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
return -EINVAL;
@@ -288,15 +534,16 @@ static void ah_destroy(struct xfrm_state *x)
if (!ahp)
return;
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- crypto_free_hash(ahp->tfm);
- ahp->tfm = NULL;
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
-static struct xfrm_type ah_type =
+static const struct xfrm_type ah_type =
{
.description = "AH4",
.owner = THIS_MODULE,
@@ -308,20 +555,22 @@ static struct xfrm_type ah_type =
.output = ah_output
};
-static struct net_protocol ah4_protocol = {
+static struct xfrm4_protocol ah4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ah4_rcv_cb,
.err_handler = ah4_err,
- .no_policy = 1,
+ .priority = 0,
};
static int __init ah4_init(void)
{
if (xfrm_register_type(&ah_type, AF_INET) < 0) {
- printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
- printk(KERN_INFO "ip ah init: can't add protocol\n");
+ if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&ah_type, AF_INET);
return -EAGAIN;
}
@@ -330,10 +579,10 @@ static int __init ah4_init(void)
static void __exit ah4_fini(void)
{
- if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
- printk(KERN_INFO "ip ah close: can't remove protocol\n");
+ if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
- printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(ah4_init);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 36d6798947b..1a9b99e0446 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,7 +1,5 @@
/* linux/net/ipv4/arp.c
*
- * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $
- *
* Copyright (C) 1994 by Florian La Roche
*
* This module implements the Address Resolution Protocol ARP (RFC 826),
@@ -57,7 +55,7 @@
* Stuart Cheshire : Metricom and grat arp fixes
* *** FOR 2.1 clean this up ***
* Lawrence V. Stefani: (08/12/96) Added FDDI support.
- * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
* folded into the mainstream FDDI code.
* Ack spit, Linus how did you allow that
* one in...
@@ -72,8 +70,11 @@
* bonding can change the skb before
* sending (e.g. insert 8021q tag).
* Harald Welte : convert to make use of jenkins hash
+ * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -90,7 +91,6 @@
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
#include <linux/if_arp.h>
-#include <linux/trdevice.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -98,7 +98,7 @@
#include <linux/init.h>
#include <linux/net.h>
#include <linux/rcupdate.h>
-#include <linux/jhash.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -111,97 +111,82 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
#include <net/ax25.h>
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
#include <net/netrom.h>
-#endif
-#endif
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-#include <net/atmclip.h>
-struct neigh_table *clip_tbl_hook;
-#endif
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netfilter_arp.h>
/*
* Interface to generic neighbour cache.
*/
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
static void parp_redo(struct sk_buff *skb);
-static struct neigh_ops arp_generic_ops = {
+static const struct neigh_ops arp_generic_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_connected_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
-static struct neigh_ops arp_hh_ops = {
+static const struct neigh_ops arp_hh_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_resolve_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
-static struct neigh_ops arp_direct_ops = {
+static const struct neigh_ops arp_direct_ops = {
.family = AF_INET,
- .output = dev_queue_xmit,
- .connected_output = dev_queue_xmit,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
};
-struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
.output = neigh_compat_output,
.connected_output = neigh_compat_output,
- .hh_output = dev_queue_xmit,
- .queue_xmit = dev_queue_xmit,
};
struct neigh_table arp_tbl = {
- .family = AF_INET,
- .entry_size = sizeof(struct neighbour) + 4,
- .key_len = 4,
- .hash = arp_hash,
- .constructor = arp_constructor,
- .proxy_redo = parp_redo,
- .id = "arp_cache",
- .parms = {
- .tbl = &arp_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
+ .family = AF_INET,
+ .key_len = 4,
+ .hash = arp_hash,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .reachable_time = 30 * HZ,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ [NEIGH_VAR_LOCKTIME] = 1 * HZ,
+ },
},
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
};
+EXPORT_SYMBOL(arp_tbl);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
{
@@ -211,11 +196,11 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
case ARPHRD_IEEE802:
ip_eth_mc_map(addr, haddr);
return 0;
- case ARPHRD_IEEE802_TR:
- ip_tr_mc_map(addr, haddr);
- return 0;
case ARPHRD_INFINIBAND:
- ip_ib_mc_map(addr, haddr);
+ ip_ib_mc_map(addr, dev->broadcast, haddr);
+ return 0;
+ case ARPHRD_IPGRE:
+ ip_ipgre_mc_map(addr, dev->broadcast, haddr);
return 0;
default:
if (dir) {
@@ -227,20 +212,20 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
}
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd)
{
- return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+ return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);
}
static int arp_constructor(struct neighbour *neigh)
{
- __be32 addr = *(__be32*)neigh->primary_key;
+ __be32 addr = *(__be32 *)neigh->primary_key;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
- neigh->type = inet_addr_type(addr);
-
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev == NULL) {
@@ -248,6 +233,8 @@ static int arp_constructor(struct neighbour *neigh)
return -EINVAL;
}
+ neigh->type = inet_addr_type(dev_net(dev), addr);
+
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
@@ -256,7 +243,7 @@ static int arp_constructor(struct neighbour *neigh)
if (!dev->header_ops) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
- neigh->output = neigh->ops->queue_xmit;
+ neigh->output = neigh_direct_output;
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)
@@ -289,24 +276,27 @@ static int arp_constructor(struct neighbour *neigh)
default:
break;
case ARPHRD_ROSE:
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
case ARPHRD_AX25:
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#if IS_ENABLED(CONFIG_NETROM)
case ARPHRD_NETROM:
#endif
neigh->ops = &arp_broken_ops;
neigh->output = neigh->ops->output;
return 0;
+#else
+ break;
#endif
- ;}
+ }
#endif
if (neigh->type == RTN_MULTICAST) {
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
- } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
- } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+ } else if (neigh->type == RTN_BROADCAST ||
+ (dev->flags & IFF_POINTOPOINT)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
@@ -316,7 +306,7 @@ static int arp_constructor(struct neighbour *neigh)
else
neigh->ops = &arp_generic_ops;
- if (neigh->nud_state&NUD_VALID)
+ if (neigh->nud_state & NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output;
@@ -333,26 +323,30 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
- u8 *dst_ha = NULL;
+ u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
struct net_device *dev = neigh->dev;
- __be32 target = *(__be32*)neigh->primary_key;
+ __be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev;
- if (!in_dev)
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
return;
-
+ }
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
- if (skb && inet_addr_type(ip_hdr(skb)->saddr) == RTN_LOCAL)
+ if (skb && inet_addr_type(dev_net(dev),
+ ip_hdr(skb)->saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)->saddr;
break;
case 1: /* Restrict announcements of saddr in same subnet */
if (!skb)
break;
saddr = ip_hdr(skb)->saddr;
- if (inet_addr_type(saddr) == RTN_LOCAL) {
+ if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
/* saddr should be known to target */
if (inet_addr_onlink(in_dev, target, saddr))
break;
@@ -362,33 +356,32 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
case 2: /* Avoid secondary IPs, get a primary/preferred one */
break;
}
+ rcu_read_unlock();
- if (in_dev)
- in_dev_put(in_dev);
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
- if ((probes -= neigh->parms->ucast_probes) < 0) {
- if (!(neigh->nud_state&NUD_VALID))
- printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
- dst_ha = neigh->ha;
- read_lock_bh(&neigh->lock);
- } else if ((probes -= neigh->parms->app_probes) < 0) {
-#ifdef CONFIG_ARPD
- neigh_app_ns(neigh);
-#endif
- return;
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
+ if (probes < 0) {
+ if (!(neigh->nud_state & NUD_VALID))
+ pr_debug("trying to ucast probe in NUD_INVALID\n");
+ neigh_ha_snapshot(dst_ha, neigh, dev);
+ dst_hw = dst_ha;
+ } else {
+ probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
+ if (probes < 0) {
+ neigh_app_ns(neigh);
+ return;
+ }
}
arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
- dst_ha, dev->dev_addr, NULL);
- if (dst_ha)
- read_unlock_bh(&neigh->lock);
+ dst_hw, dev->dev_addr, NULL);
}
-static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
- __be32 sip, __be32 tip)
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
{
+ struct net *net = dev_net(in_dev->dev);
int scope;
switch (IN_DEV_ARP_IGNORE(in_dev)) {
@@ -407,7 +400,7 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
case 3: /* Do not reply for scope host addresses */
sip = 0;
scope = RT_SCOPE_LINK;
- dev = NULL;
+ in_dev = NULL;
break;
case 4: /* Reserved */
case 5:
@@ -419,21 +412,21 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
default:
return 0;
}
- return !inet_confirm_addr(dev, sip, tip, scope);
+ return !inet_confirm_addr(net, in_dev, sip, tip, scope);
}
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
- .saddr = tip } } };
struct rtable *rt;
int flag = 0;
/*unsigned long now; */
+ struct net *net = dev_net(dev);
- if (ip_route_output_key(&rt, &fl) < 0)
+ rt = ip_route_output(net, sip, tip, 0, 0);
+ if (IS_ERR(rt))
return 1;
- if (rt->u.dst.dev != dev) {
- NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+ if (rt->dst.dev != dev) {
+ NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
flag = 1;
}
ip_rt_put(rt);
@@ -451,11 +444,12 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
* is allowed to use this function, it is scheduled to be removed. --ANK
*/
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
+ __be32 paddr, struct net_device *dev)
{
switch (addr_hint) {
case RTN_LOCAL:
- printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+ pr_debug("arp called for own IP address\n");
memcpy(haddr, dev->dev_addr, dev->addr_len);
return 1;
case RTN_MULTICAST:
@@ -475,25 +469,23 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
__be32 paddr;
struct neighbour *n;
- if (!skb->dst) {
- printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+ if (!skb_dst(skb)) {
+ pr_debug("arp_find is called with dst==NULL\n");
kfree_skb(skb);
return 1;
}
- paddr = ((struct rtable*)skb->dst)->rt_gateway;
-
- if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
+ paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
+ if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+ paddr, dev))
return 0;
n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
if (n) {
n->used = jiffies;
- if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
- read_lock_bh(&n->lock);
- memcpy(haddr, n->ha, dev->addr_len);
- read_unlock_bh(&n->lock);
+ if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+ neigh_ha_snapshot(haddr, n, dev);
neigh_release(n);
return 0;
}
@@ -502,56 +494,74 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
kfree_skb(skb);
return 1;
}
+EXPORT_SYMBOL(arp_find);
/* END OF OBSOLETE FUNCTIONS */
-int arp_bind_neighbour(struct dst_entry *dst)
-{
- struct net_device *dev = dst->dev;
- struct neighbour *n = dst->neighbour;
-
- if (dev == NULL)
- return -EINVAL;
- if (n == NULL) {
- __be32 nexthop = ((struct rtable*)dst)->rt_gateway;
- if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
- nexthop = 0;
- n = __neigh_lookup_errno(
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
- dev->type == ARPHRD_ATM ? clip_tbl_hook :
-#endif
- &arp_tbl, &nexthop, dev);
- if (IS_ERR(n))
- return PTR_ERR(n);
- dst->neighbour = n;
- }
- return 0;
-}
-
/*
* Check if we can use proxy ARP for this path
*/
-
-static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt)
{
struct in_device *out_dev;
int imi, omi = -1;
- if (!IN_DEV_PROXY_ARP(in_dev))
+ if (rt->dst.dev == dev)
return 0;
- if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+ if (!IN_DEV_PROXY_ARP(in_dev))
+ return 0;
+ imi = IN_DEV_MEDIUM_ID(in_dev);
+ if (imi == 0)
return 1;
if (imi == -1)
return 0;
/* place to check for proxy_arp for routes */
- if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+ out_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (out_dev)
omi = IN_DEV_MEDIUM_ID(out_dev);
- in_dev_put(out_dev);
- }
- return (omi != imi && omi != -1);
+
+ return omi != imi && omi != -1;
+}
+
+/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface. This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router. As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ * This technology is known by different names:
+ * In RFC 3069 it is called VLAN Aggregation.
+ * Cisco and Allied Telesyn call it Private VLAN.
+ * Hewlett-Packard call it Source-Port filtering or port-isolation.
+ * Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt,
+ __be32 sip, __be32 tip)
+{
+ /* Private VLAN is only concerned about the same ethernet segment */
+ if (rt->dst.dev != dev)
+ return 0;
+
+ /* Don't reply on self probes (often done by windowz boxes)*/
+ if (sip == tip)
+ return 0;
+
+ if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
+ return 1;
+ else
+ return 0;
}
/*
@@ -564,25 +574,27 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
*/
struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
struct net_device *dev, __be32 src_ip,
- unsigned char *dest_hw, unsigned char *src_hw,
- unsigned char *target_hw)
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw)
{
struct sk_buff *skb;
struct arphdr *arp;
unsigned char *arp_ptr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
/*
* Allocate a buffer
*/
- skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
- + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
if (skb == NULL)
return NULL;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
- arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
+ arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
skb->dev = dev;
skb->protocol = htons(ETH_P_ARP);
if (src_hw == NULL)
@@ -612,13 +624,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp->ar_pro = htons(ETH_P_IP);
break;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
case ARPHRD_AX25:
arp->ar_hrd = htons(ARPHRD_AX25);
arp->ar_pro = htons(AX25_P_IP);
break;
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#if IS_ENABLED(CONFIG_NETROM)
case ARPHRD_NETROM:
arp->ar_hrd = htons(ARPHRD_NETROM);
arp->ar_pro = htons(AX25_P_IP);
@@ -626,35 +638,37 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
#endif
#endif
-#ifdef CONFIG_FDDI
+#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
arp->ar_hrd = htons(ARPHRD_ETHER);
arp->ar_pro = htons(ETH_P_IP);
break;
#endif
-#ifdef CONFIG_TR
- case ARPHRD_IEEE802_TR:
- arp->ar_hrd = htons(ARPHRD_IEEE802);
- arp->ar_pro = htons(ETH_P_IP);
- break;
-#endif
}
arp->ar_hln = dev->addr_len;
arp->ar_pln = 4;
arp->ar_op = htons(type);
- arp_ptr=(unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
memcpy(arp_ptr, src_hw, dev->addr_len);
- arp_ptr+=dev->addr_len;
- memcpy(arp_ptr, &src_ip,4);
- arp_ptr+=4;
- if (target_hw != NULL)
- memcpy(arp_ptr, target_hw, dev->addr_len);
- else
- memset(arp_ptr, 0, dev->addr_len);
- arp_ptr+=dev->addr_len;
+ arp_ptr += dev->addr_len;
+ memcpy(arp_ptr, &src_ip, 4);
+ arp_ptr += 4;
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ }
memcpy(arp_ptr, &dest_ip, 4);
return skb;
@@ -663,6 +677,7 @@ out:
kfree_skb(skb);
return NULL;
}
+EXPORT_SYMBOL(arp_create);
/*
* Send an arp packet.
@@ -670,16 +685,17 @@ out:
void arp_xmit(struct sk_buff *skb)
{
/* Send it off, maybe filter it using firewalling first. */
- NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
}
+EXPORT_SYMBOL(arp_xmit);
/*
* Create and send an arp packet.
*/
void arp_send(int type, int ptype, __be32 dest_ip,
struct net_device *dev, __be32 src_ip,
- unsigned char *dest_hw, unsigned char *src_hw,
- unsigned char *target_hw)
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
{
struct sk_buff *skb;
@@ -692,12 +708,12 @@ void arp_send(int type, int ptype, __be32 dest_ip,
skb = arp_create(type, ptype, dest_ip, dev, src_ip,
dest_hw, src_hw, target_hw);
- if (skb == NULL) {
+ if (skb == NULL)
return;
- }
arp_xmit(skb);
}
+EXPORT_SYMBOL(arp_send);
/*
* Process an arp request.
@@ -706,15 +722,17 @@ void arp_send(int type, int ptype, __be32 dest_ip,
static int arp_process(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct arphdr *arp;
unsigned char *arp_ptr;
struct rtable *rt;
- unsigned char *sha, *tha;
+ unsigned char *sha;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
+ struct net *net = dev_net(dev);
+ bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
* is ARP'able.
@@ -731,22 +749,11 @@ static int arp_process(struct sk_buff *skb)
htons(dev_type) != arp->ar_hrd)
goto out;
break;
-#ifdef CONFIG_NET_ETHERNET
case ARPHRD_ETHER:
-#endif
-#ifdef CONFIG_TR
- case ARPHRD_IEEE802_TR:
-#endif
-#ifdef CONFIG_FDDI
case ARPHRD_FDDI:
-#endif
-#ifdef CONFIG_NET_FC
case ARPHRD_IEEE802:
-#endif
-#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
- defined(CONFIG_FDDI) || defined(CONFIG_NET_FC)
/*
- * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
@@ -759,21 +766,16 @@ static int arp_process(struct sk_buff *skb)
arp->ar_pro != htons(ETH_P_IP))
goto out;
break;
-#endif
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
goto out;
break;
-#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
goto out;
break;
-#endif
-#endif
}
/* Understand only these message types */
@@ -785,19 +787,26 @@ static int arp_process(struct sk_buff *skb)
/*
* Extract fields
*/
- arp_ptr= (unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
sha = arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
- tha = arp_ptr;
- arp_ptr += dev->addr_len;
+ switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ arp_ptr += dev->addr_len;
+ }
memcpy(&tip, arp_ptr, 4);
/*
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
- if (LOOPBACK(tip) || MULTICAST(tip))
+ if (ipv4_is_multicast(tip) ||
+ (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
goto out;
/*
@@ -826,48 +835,54 @@ static int arp_process(struct sk_buff *skb)
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- inet_addr_type(tip) == RTN_LOCAL &&
- !arp_ignore(in_dev,dev,sip,tip))
- arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
+ inet_addr_type(net, tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev, sip, tip))
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
+ dev->dev_addr, sha);
goto out;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- ip_route_input(skb, tip, sip, 0, dev) == 0) {
+ ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
- rt = (struct rtable*)skb->dst;
+ rt = skb_rtable(skb);
addr_type = rt->rt_type;
if (addr_type == RTN_LOCAL) {
- n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
- if (n) {
- int dont_send = 0;
-
- if (!dont_send)
- dont_send |= arp_ignore(in_dev,dev,sip,tip);
- if (!dont_send && IN_DEV_ARPFILTER(in_dev))
- dont_send |= arp_filter(sip,tip,dev);
- if (!dont_send)
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
-
- neigh_release(n);
+ int dont_send;
+
+ dont_send = arp_ignore(in_dev, sip, tip);
+ if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+ dont_send = arp_filter(sip, tip, dev);
+ if (!dont_send) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
+ neigh_release(n);
+ }
}
goto out;
} else if (IN_DEV_FORWARD(in_dev)) {
- if ((rt->rt_flags&RTCF_DNAT) ||
- (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
- (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
+ if (addr_type == RTN_UNICAST &&
+ (arp_fwd_proxy(in_dev, dev, rt) ||
+ arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+ (rt->dst.dev != dev &&
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
- in_dev->arp_parms->proxy_delay == 0) {
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
} else {
- pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
- in_dev_put(in_dev);
+ pneigh_enqueue(&arp_tbl,
+ in_dev->arp_parms, skb);
return 0;
}
goto out;
@@ -879,14 +894,17 @@ static int arp_process(struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
- if (IPV4_DEVCONF_ALL(ARP_ACCEPT)) {
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
+ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
+ inet_addr_type(net, sip) == RTN_UNICAST;
+
if (n == NULL &&
- arp->ar_op == htons(ARPOP_REPLY) &&
- inet_addr_type(sip) == RTN_UNICAST)
+ ((arp->ar_op == htons(ARPOP_REPLY) &&
+ inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
@@ -899,7 +917,10 @@ static int arp_process(struct sk_buff *skb)
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
- override = time_after(jiffies, n->updated + n->parms->locktime);
+ override = time_after(jiffies,
+ n->updated +
+ NEIGH_VAR(n->parms, LOCKTIME)) ||
+ is_garp;
/* Broadcast replies and request packets
do not assert neighbour reachability.
@@ -907,14 +928,13 @@ static int arp_process(struct sk_buff *skb)
if (arp->ar_op != htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
- neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_update(n, sha, state,
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0);
neigh_release(n);
}
out:
- if (in_dev)
- in_dev_put(in_dev);
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
@@ -931,31 +951,28 @@ static void parp_redo(struct sk_buff *skb)
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct arphdr *arp;
+ const struct arphdr *arp;
- if (dev->nd_net != &init_net)
+ if (dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK)
goto freeskb;
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out_of_mem;
+
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
- if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
- (2 * dev->addr_len) +
- (2 * sizeof(u32)))))
+ if (!pskb_may_pull(skb, arp_hdr_len(dev)))
goto freeskb;
arp = arp_hdr(skb);
- if (arp->ar_hln != dev->addr_len ||
- dev->flags & IFF_NOARP ||
- skb->pkt_type == PACKET_OTHERHOST ||
- skb->pkt_type == PACKET_LOOPBACK ||
- arp->ar_pln != 4)
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
goto freeskb;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
- goto out_of_mem;
-
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
- return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
freeskb:
kfree_skb(skb);
@@ -971,52 +988,67 @@ out_of_mem:
* Set (create) an ARP cache entry.
*/
-static int arp_req_set(struct arpreq *r, struct net_device * dev)
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
{
- __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ if (dev == NULL) {
+ IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+ return 0;
+ }
+ if (__in_dev_get_rtnl(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
+ return 0;
+ }
+ return -ENXIO;
+}
+
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask && mask != htonl(0xFFFFFFFF))
+ return -EINVAL;
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+ r->arp_ha.sa_data);
+ if (!dev)
+ return -ENODEV;
+ }
+ if (mask) {
+ if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL)
+ return -ENOBUFS;
+ return 0;
+ }
+
+ return arp_req_set_proxy(net, dev, 1);
+}
+
+static int arp_req_set(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip;
struct neighbour *neigh;
int err;
- if (r->arp_flags&ATF_PUBL) {
- __be32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
- if (mask && mask != htonl(0xFFFFFFFF))
- return -EINVAL;
- if (!dev && (r->arp_flags & ATF_COM)) {
- dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data);
- if (!dev)
- return -ENODEV;
- }
- if (mask) {
- if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
- return -ENOBUFS;
- return 0;
- }
- if (dev == NULL) {
- IPV4_DEVCONF_ALL(PROXY_ARP) = 1;
- return 0;
- }
- if (__in_dev_get_rtnl(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, 1);
- return 0;
- }
- return -ENXIO;
- }
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_set_public(net, r, dev);
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (r->arp_flags & ATF_PERM)
r->arp_flags |= ATF_COM;
if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(&rt, &fl)) != 0)
- return err;
- dev = rt->u.dst.dev;
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
return -EINVAL;
}
switch (dev->type) {
-#ifdef CONFIG_FDDI
+#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
/*
* According to RFC 1390, FDDI devices should accept ARP
@@ -1039,26 +1071,26 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev)
neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
err = PTR_ERR(neigh);
if (!IS_ERR(neigh)) {
- unsigned state = NUD_STALE;
+ unsigned int state = NUD_STALE;
if (r->arp_flags & ATF_PERM)
state = NUD_PERMANENT;
- err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+ err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
r->arp_ha.sa_data : NULL, state,
- NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN);
neigh_release(neigh);
}
return err;
}
-static unsigned arp_state_to_flags(struct neighbour *neigh)
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
{
- unsigned flags = 0;
if (neigh->nud_state&NUD_PERMANENT)
- flags = ATF_PERM|ATF_COM;
+ return ATF_PERM | ATF_COM;
else if (neigh->nud_state&NUD_VALID)
- flags = ATF_COM;
- return flags;
+ return ATF_COM;
+ else
+ return 0;
}
/*
@@ -1085,84 +1117,87 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
return err;
}
-static int arp_req_delete(struct arpreq *r, struct net_device * dev)
+static int arp_invalidate(struct net_device *dev, __be32 ip)
{
- int err;
- __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- struct neighbour *neigh;
-
- if (r->arp_flags & ATF_PUBL) {
- __be32 mask =
- ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (mask == htonl(0xFFFFFFFF))
- return pneigh_delete(&arp_tbl, &ip, dev);
- if (mask == 0) {
- if (dev == NULL) {
- IPV4_DEVCONF_ALL(PROXY_ARP) = 0;
- return 0;
- }
- if (__in_dev_get_rtnl(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rtnl(dev),
- PROXY_ARP, 0);
- return 0;
- }
- return -ENXIO;
- }
- return -EINVAL;
- }
+ struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ int err = -ENXIO;
- if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(&rt, &fl)) != 0)
- return err;
- dev = rt->u.dst.dev;
- ip_rt_put(rt);
- if (!dev)
- return -EINVAL;
- }
- err = -ENXIO;
- neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
- if (neigh->nud_state&~NUD_NOARP)
+ if (neigh->nud_state & ~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN);
neigh_release(neigh);
}
+
return err;
}
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask == htonl(0xFFFFFFFF))
+ return pneigh_delete(&arp_tbl, net, &ip, dev);
+
+ if (mask)
+ return -EINVAL;
+
+ return arp_req_set_proxy(net, dev, 0);
+}
+
+static int arp_req_delete(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip;
+
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_delete_public(net, r, dev);
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ if (dev == NULL) {
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ return arp_invalidate(dev, ip);
+}
+
/*
* Handle an ARP layer I/O control request.
*/
-int arp_ioctl(unsigned int cmd, void __user *arg)
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
int err;
struct arpreq r;
struct net_device *dev = NULL;
switch (cmd) {
- case SIOCDARP:
- case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- case SIOCGARP:
- err = copy_from_user(&r, arg, sizeof(struct arpreq));
- if (err)
- return -EFAULT;
- break;
- default:
- return -EINVAL;
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
}
if (r.arp_pa.sa_family != AF_INET)
return -EPFNOSUPPORT;
if (!(r.arp_flags & ATF_PUBL) &&
- (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+ (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
return -EINVAL;
if (!(r.arp_flags & ATF_NETMASK))
((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1170,7 +1205,8 @@ int arp_ioctl(unsigned int cmd, void __user *arg)
rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL)
+ dev = __dev_get_by_name(net, r.arp_dev);
+ if (dev == NULL)
goto out;
/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1186,33 +1222,37 @@ int arp_ioctl(unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCDARP:
- err = arp_req_delete(&r, dev);
+ err = arp_req_delete(net, &r, dev);
break;
case SIOCSARP:
- err = arp_req_set(&r, dev);
+ err = arp_req_set(net, &r, dev);
break;
case SIOCGARP:
err = arp_req_get(&r, dev);
- if (!err && copy_to_user(arg, &r, sizeof(r)))
- err = -EFAULT;
break;
}
out:
rtnl_unlock();
+ if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
return err;
}
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
{
- struct net_device *dev = ptr;
-
- if (dev->nd_net != &init_net)
- return NOTIFY_DONE;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&arp_tbl, dev);
- rt_cache_flush(0);
+ rt_cache_flush(dev_net(dev));
+ break;
+ case NETDEV_CHANGE:
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&arp_tbl, dev);
break;
default:
break;
@@ -1239,8 +1279,8 @@ void arp_ifdown(struct net_device *dev)
* Called once on startup.
*/
-static struct packet_type arp_packet_type = {
- .type = __constant_htons(ETH_P_ARP),
+static struct packet_type arp_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
@@ -1253,14 +1293,13 @@ void __init arp_init(void)
dev_add_pack(&arp_packet_type);
arp_proc_init();
#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
#endif
register_netdevice_notifier(&arp_netdev_notifier);
}
#ifdef CONFIG_PROC_FS
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
/* ------------------------------------------------------------------------ */
/*
@@ -1274,12 +1313,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
for (n = 0, s = buf; n < 6; n++) {
c = (a->ax25_call[n] >> 1) & 0x7F;
- if (c != ' ') *s++ = c;
+ if (c != ' ')
+ *s++ = c;
}
*s++ = '-';
-
- if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ n = (a->ax25_call[6] >> 1) & 0x0F;
+ if (n > 9) {
*s++ = '1';
n -= 10;
}
@@ -1288,10 +1328,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
*s++ = '\0';
if (*buf == '\0' || *buf == '-')
- return "*";
+ return "*";
return buf;
-
}
#endif /* CONFIG_AX25 */
@@ -1301,7 +1340,6 @@ static void arp_format_neigh_entry(struct seq_file *seq,
struct neighbour *n)
{
char hbuffer[HBUFFERLEN];
- const char hexbuf[] = "0123456789ABCDEF";
int k, j;
char tbuf[16];
struct net_device *dev = n->dev;
@@ -1309,21 +1347,23 @@ static void arp_format_neigh_entry(struct seq_file *seq,
read_lock(&n->lock);
/* Convert hardware address to XX:XX:XX:XX ... form. */
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#if IS_ENABLED(CONFIG_AX25)
if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
ax2asc2((ax25_address *)n->ha, hbuffer);
else {
#endif
for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
- hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
- hbuffer[k++] = hexbuf[n->ha[j] & 15];
+ hbuffer[k++] = hex_asc_hi(n->ha[j]);
+ hbuffer[k++] = hex_asc_lo(n->ha[j]);
hbuffer[k++] = ':';
}
- hbuffer[--k] = 0;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+ if (k != 0)
+ --k;
+ hbuffer[k] = 0;
+#if IS_ENABLED(CONFIG_AX25)
}
#endif
- sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+ sprintf(tbuf, "%pI4", n->primary_key);
seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
read_unlock(&n->lock);
@@ -1336,7 +1376,7 @@ static void arp_format_pneigh_entry(struct seq_file *seq,
int hatype = dev ? dev->type : 0;
char tbuf[16];
- sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key));
+ sprintf(tbuf, "%pI4", n->key);
seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
dev ? dev->name : "*");
@@ -1370,16 +1410,16 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
/* ------------------------------------------------------------------------ */
static const struct seq_operations arp_seq_ops = {
- .start = arp_seq_start,
- .next = neigh_seq_next,
- .stop = neigh_seq_stop,
- .show = arp_seq_show,
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
};
static int arp_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &arp_seq_ops,
- sizeof(struct neigh_seq_state));
+ return seq_open_net(inode, file, &arp_seq_ops,
+ sizeof(struct neigh_seq_state));
}
static const struct file_operations arp_seq_fops = {
@@ -1387,16 +1427,32 @@ static const struct file_operations arp_seq_fops = {
.open = arp_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
-static int __init arp_proc_init(void)
+
+static int __net_init arp_net_init(struct net *net)
{
- if (!proc_net_fops_create(&init_net, "arp", S_IRUGO, &arp_seq_fops))
+ if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
return -ENOMEM;
return 0;
}
+static void __net_exit arp_net_exit(struct net *net)
+{
+ remove_proc_entry("arp", net->proc_net);
+}
+
+static struct pernet_operations arp_net_ops = {
+ .init = arp_net_init,
+ .exit = arp_net_exit,
+};
+
+static int __init arp_proc_init(void)
+{
+ return register_pernet_subsys(&arp_net_ops);
+}
+
#else /* CONFIG_PROC_FS */
static int __init arp_proc_init(void)
@@ -1405,14 +1461,3 @@ static int __init arp_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-
-EXPORT_SYMBOL(arp_broken_ops);
-EXPORT_SYMBOL(arp_find);
-EXPORT_SYMBOL(arp_create);
-EXPORT_SYMBOL(arp_xmit);
-EXPORT_SYMBOL(arp_send);
-EXPORT_SYMBOL(arp_tbl);
-
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 805a78e6ed5..69e77c8ff28 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -3,17 +3,22 @@
*
* This is an implementation of the CIPSO 2.2 protocol as specified in
* draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
- * FIPS-188, copies of both documents can be found in the Documentation
- * directory. While CIPSO never became a full IETF RFC standard many vendors
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
* have chosen to adopt the protocol and over the years it has become a
* de-facto standard for labeled networking.
*
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
* Author: Paul Moore <paul.moore@hp.com>
*
*/
/*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -26,8 +31,7 @@
* the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -38,32 +42,24 @@
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/bug.h>
#include <asm/unaligned.h>
-struct cipso_v4_domhsh_entry {
- char *domain;
- u32 valid;
- struct list_head list;
- struct rcu_head rcu;
-};
-
/* List of available DOI definitions */
-/* XXX - Updates should be minimal so having a single lock for the
- * cipso_v4_doi_list and the cipso_v4_doi_list->dom_list should be
- * okay. */
/* XXX - This currently assumes a minimal number of different DOIs in use,
* if in practice there are a lot of different DOIs this list should
* probably be turned into a hash table or something similar so we
* can do quick lookups. */
static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
-static struct list_head cipso_v4_doi_list = LIST_HEAD_INIT(cipso_v4_doi_list);
+static LIST_HEAD(cipso_v4_doi_list);
/* Label mapping cache */
int cipso_v4_cache_enabled = 1;
@@ -115,10 +111,23 @@ int cipso_v4_rbm_strictvalid = 1;
/* The maximum number of category ranges permitted in the ranged category tag
* (tag #5). You may note that the IETF draft states that the maximum number
* of category ranges is 7, but if the low end of the last category range is
- * zero then it is possibile to fit 8 category ranges because the zero should
+ * zero then it is possible to fit 8 category ranges because the zero should
* be omitted. */
#define CIPSO_V4_TAG_RNG_CAT_MAX 8
+/* Base length of the local tag (non-standard tag).
+ * Tag definition (may change between kernel versions)
+ *
+ * 0 8 16 24 32
+ * +----------+----------+----------+----------+
+ * | 10000000 | 00000110 | 32-bit secid value |
+ * +----------+----------+----------+----------+
+ * | in (host byte order)|
+ * +----------+----------+
+ *
+ */
+#define CIPSO_V4_TAG_LOC_BLEN 6
+
/*
* Helper Functions
*/
@@ -194,25 +203,6 @@ static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
}
/**
- * cipso_v4_doi_domhsh_free - Frees a domain list entry
- * @entry: the entry's RCU field
- *
- * Description:
- * This function is designed to be used as a callback to the call_rcu()
- * function so that the memory allocated to a domain list entry can be released
- * safely.
- *
- */
-static void cipso_v4_doi_domhsh_free(struct rcu_head *entry)
-{
- struct cipso_v4_domhsh_entry *ptr;
-
- ptr = container_of(entry, struct cipso_v4_domhsh_entry, rcu);
- kfree(ptr->domain);
- kfree(ptr);
-}
-
-/**
* cipso_v4_cache_entry_free - Frees a cache entry
* @entry: the entry to free
*
@@ -299,8 +289,6 @@ void cipso_v4_cache_invalidate(void)
cipso_v4_cache[iter].size = 0;
spin_unlock_bh(&cipso_v4_cache[iter].lock);
}
-
- return;
}
/**
@@ -338,7 +326,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
return -ENOENT;
hash = cipso_v4_map_cache_hash(key, key_len);
- bkt = hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
+ bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1);
spin_lock_bh(&cipso_v4_cache[bkt].lock);
list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) {
if (entry->hash == hash &&
@@ -348,6 +336,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
atomic_inc(&entry->lsm_data->refcount);
secattr->cache = entry->lsm_data;
secattr->flags |= NETLBL_SECATTR_CACHE;
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
if (prev_entry == NULL) {
spin_unlock_bh(&cipso_v4_cache[bkt].lock);
return 0;
@@ -416,7 +405,7 @@ int cipso_v4_cache_add(const struct sk_buff *skb,
atomic_inc(&secattr->cache->refcount);
entry->lsm_data = secattr->cache;
- bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
+ bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
spin_lock_bh(&cipso_v4_cache[bkt].lock);
if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
list_add(&entry->list, &cipso_v4_cache[bkt].list);
@@ -448,7 +437,7 @@ cache_add_failure:
*
* Description:
* Search the DOI definition list for a DOI definition with a DOI value that
- * matches @doi. The caller is responsibile for calling rcu_read_[un]lock().
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
* Returns a pointer to the DOI definition on success and NULL on failure.
*/
static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
@@ -456,7 +445,7 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
struct cipso_v4_doi *iter;
list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
- if (iter->doi == doi && iter->valid)
+ if (iter->doi == doi && atomic_read(&iter->refcount))
return iter;
return NULL;
}
@@ -464,6 +453,7 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
/**
* cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
* @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
*
* Description:
* The caller defines a new DOI for use by the CIPSO engine and calls this
@@ -473,116 +463,218 @@ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
* zero on success and non-zero on failure.
*
*/
-int cipso_v4_doi_add(struct cipso_v4_doi *doi_def)
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
+ struct netlbl_audit *audit_info)
{
+ int ret_val = -EINVAL;
u32 iter;
+ u32 doi;
+ u32 doi_type;
+ struct audit_buffer *audit_buf;
+
+ doi = doi_def->doi;
+ doi_type = doi_def->type;
- if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
- return -EINVAL;
+ if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+ goto doi_add_return;
for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
switch (doi_def->tags[iter]) {
case CIPSO_V4_TAG_RBITMAP:
break;
case CIPSO_V4_TAG_RANGE:
+ case CIPSO_V4_TAG_ENUM:
if (doi_def->type != CIPSO_V4_MAP_PASS)
- return -EINVAL;
+ goto doi_add_return;
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ if (doi_def->type != CIPSO_V4_MAP_LOCAL)
+ goto doi_add_return;
break;
case CIPSO_V4_TAG_INVALID:
if (iter == 0)
- return -EINVAL;
- break;
- case CIPSO_V4_TAG_ENUM:
- if (doi_def->type != CIPSO_V4_MAP_PASS)
- return -EINVAL;
+ goto doi_add_return;
break;
default:
- return -EINVAL;
+ goto doi_add_return;
}
}
- doi_def->valid = 1;
- INIT_RCU_HEAD(&doi_def->rcu);
- INIT_LIST_HEAD(&doi_def->dom_list);
+ atomic_set(&doi_def->refcount, 1);
- rcu_read_lock();
- if (cipso_v4_doi_search(doi_def->doi) != NULL)
- goto doi_add_failure_rlock;
spin_lock(&cipso_v4_doi_list_lock);
- if (cipso_v4_doi_search(doi_def->doi) != NULL)
- goto doi_add_failure_slock;
+ if (cipso_v4_doi_search(doi_def->doi) != NULL) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -EEXIST;
+ goto doi_add_return;
+ }
list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
+ ret_val = 0;
+
+doi_add_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
+ if (audit_buf != NULL) {
+ const char *type_str;
+ switch (doi_type) {
+ case CIPSO_V4_MAP_TRANS:
+ type_str = "trans";
+ break;
+ case CIPSO_V4_MAP_PASS:
+ type_str = "pass";
+ break;
+ case CIPSO_V4_MAP_LOCAL:
+ type_str = "local";
+ break;
+ default:
+ type_str = "(unknown)";
+ }
+ audit_log_format(audit_buf,
+ " cipso_doi=%u cipso_type=%s res=%u",
+ doi, type_str, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
- return 0;
+ return ret_val;
+}
-doi_add_failure_slock:
- spin_unlock(&cipso_v4_doi_list_lock);
-doi_add_failure_rlock:
- rcu_read_unlock();
- return -EEXIST;
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+ if (doi_def == NULL)
+ return;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_TRANS:
+ kfree(doi_def->map.std->lvl.cipso);
+ kfree(doi_def->map.std->lvl.local);
+ kfree(doi_def->map.std->cat.cipso);
+ kfree(doi_def->map.std->cat.local);
+ break;
+ }
+ kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+ struct cipso_v4_doi *doi_def;
+
+ doi_def = container_of(entry, struct cipso_v4_doi, rcu);
+ cipso_v4_doi_free(doi_def);
}
/**
* cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
* @doi: the DOI value
* @audit_secid: the LSM secid to use in the audit message
- * @callback: the DOI cleanup/free callback
*
* Description:
- * Removes a DOI definition from the CIPSO engine, @callback is called to
- * free any memory. The NetLabel routines will be called to release their own
- * LSM domain mappings as well as our own domain list. Returns zero on
- * success and negative values on failure.
+ * Removes a DOI definition from the CIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
*
*/
-int cipso_v4_doi_remove(u32 doi,
- struct netlbl_audit *audit_info,
- void (*callback) (struct rcu_head * head))
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
{
+ int ret_val;
struct cipso_v4_doi *doi_def;
- struct cipso_v4_domhsh_entry *dom_iter;
+ struct audit_buffer *audit_buf;
- rcu_read_lock();
- if (cipso_v4_doi_search(doi) != NULL) {
- spin_lock(&cipso_v4_doi_list_lock);
- doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL) {
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
- return -ENOENT;
- }
- doi_def->valid = 0;
- list_del_rcu(&doi_def->list);
+ spin_lock(&cipso_v4_doi_list_lock);
+ doi_def = cipso_v4_doi_search(doi);
+ if (doi_def == NULL) {
spin_unlock(&cipso_v4_doi_list_lock);
- list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list)
- if (dom_iter->valid)
- netlbl_domhsh_remove(dom_iter->domain,
- audit_info);
- cipso_v4_cache_invalidate();
- rcu_read_unlock();
-
- call_rcu(&doi_def->rcu, callback);
- return 0;
+ ret_val = -ENOENT;
+ goto doi_remove_return;
}
- rcu_read_unlock();
+ if (!atomic_dec_and_test(&doi_def->refcount)) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -EBUSY;
+ goto doi_remove_return;
+ }
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&cipso_v4_doi_list_lock);
- return -ENOENT;
+ cipso_v4_cache_invalidate();
+ call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+ ret_val = 0;
+
+doi_remove_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
+ if (audit_buf != NULL) {
+ audit_log_format(audit_buf,
+ " cipso_doi=%u res=%u",
+ doi, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
}
/**
- * cipso_v4_doi_getdef - Returns a pointer to a valid DOI definition
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
* @doi: the DOI value
*
* Description:
* Searches for a valid DOI definition and if one is found it is returned to
* the caller. Otherwise NULL is returned. The caller must ensure that
- * rcu_read_lock() is held while accessing the returned definition.
+ * rcu_read_lock() is held while accessing the returned definition and the DOI
+ * definition reference count is decremented when the caller is done.
*
*/
struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
{
- return cipso_v4_doi_search(doi);
+ struct cipso_v4_doi *doi_def;
+
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(doi);
+ if (doi_def == NULL)
+ goto doi_getdef_return;
+ if (!atomic_inc_not_zero(&doi_def->refcount))
+ doi_def = NULL;
+
+doi_getdef_return:
+ rcu_read_unlock();
+ return doi_def;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef().
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+ if (doi_def == NULL)
+ return;
+
+ if (!atomic_dec_and_test(&doi_def->refcount))
+ return;
+ spin_lock(&cipso_v4_doi_list_lock);
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&cipso_v4_doi_list_lock);
+
+ cipso_v4_cache_invalidate();
+ call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
}
/**
@@ -608,7 +700,7 @@ int cipso_v4_doi_walk(u32 *skip_cnt,
rcu_read_lock();
list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list)
- if (iter_doi->valid) {
+ if (atomic_read(&iter_doi->refcount) > 0) {
if (doi_cnt++ < *skip_cnt)
continue;
ret_val = callback(iter_doi, cb_arg);
@@ -624,92 +716,6 @@ doi_walk_return:
return ret_val;
}
-/**
- * cipso_v4_doi_domhsh_add - Adds a domain entry to a DOI definition
- * @doi_def: the DOI definition
- * @domain: the domain to add
- *
- * Description:
- * Adds the @domain to the DOI specified by @doi_def, this function
- * should only be called by external functions (i.e. NetLabel). This function
- * does allocate memory. Returns zero on success, negative values on failure.
- *
- */
-int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain)
-{
- struct cipso_v4_domhsh_entry *iter;
- struct cipso_v4_domhsh_entry *new_dom;
-
- new_dom = kzalloc(sizeof(*new_dom), GFP_KERNEL);
- if (new_dom == NULL)
- return -ENOMEM;
- if (domain) {
- new_dom->domain = kstrdup(domain, GFP_KERNEL);
- if (new_dom->domain == NULL) {
- kfree(new_dom);
- return -ENOMEM;
- }
- }
- new_dom->valid = 1;
- INIT_RCU_HEAD(&new_dom->rcu);
-
- rcu_read_lock();
- spin_lock(&cipso_v4_doi_list_lock);
- list_for_each_entry_rcu(iter, &doi_def->dom_list, list)
- if (iter->valid &&
- ((domain != NULL && iter->domain != NULL &&
- strcmp(iter->domain, domain) == 0) ||
- (domain == NULL && iter->domain == NULL))) {
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
- kfree(new_dom->domain);
- kfree(new_dom);
- return -EEXIST;
- }
- list_add_tail_rcu(&new_dom->list, &doi_def->dom_list);
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
-
- return 0;
-}
-
-/**
- * cipso_v4_doi_domhsh_remove - Removes a domain entry from a DOI definition
- * @doi_def: the DOI definition
- * @domain: the domain to remove
- *
- * Description:
- * Removes the @domain from the DOI specified by @doi_def, this function
- * should only be called by external functions (i.e. NetLabel). Returns zero
- * on success and negative values on error.
- *
- */
-int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def,
- const char *domain)
-{
- struct cipso_v4_domhsh_entry *iter;
-
- rcu_read_lock();
- spin_lock(&cipso_v4_doi_list_lock);
- list_for_each_entry_rcu(iter, &doi_def->dom_list, list)
- if (iter->valid &&
- ((domain != NULL && iter->domain != NULL &&
- strcmp(iter->domain, domain) == 0) ||
- (domain == NULL && iter->domain == NULL))) {
- iter->valid = 0;
- list_del_rcu(&iter->list);
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
- call_rcu(&iter->rcu, cipso_v4_doi_domhsh_free);
-
- return 0;
- }
- spin_unlock(&cipso_v4_doi_list_lock);
- rcu_read_unlock();
-
- return -ENOENT;
-}
-
/*
* Label Mapping Functions
*/
@@ -730,7 +736,7 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
switch (doi_def->type) {
case CIPSO_V4_MAP_PASS:
return 0;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
return 0;
break;
@@ -759,7 +765,7 @@ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
case CIPSO_V4_MAP_PASS:
*net_lvl = host_lvl;
return 0;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
if (host_lvl < doi_def->map.std->lvl.local_size &&
doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) {
*net_lvl = doi_def->map.std->lvl.local[host_lvl];
@@ -793,7 +799,7 @@ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
case CIPSO_V4_MAP_PASS:
*host_lvl = net_lvl;
return 0;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
map_tbl = doi_def->map.std;
if (net_lvl < map_tbl->lvl.cipso_size &&
map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) {
@@ -830,7 +836,7 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
switch (doi_def->type) {
case CIPSO_V4_MAP_PASS:
return 0;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
cipso_cat_size = doi_def->map.std->cat.cipso_size;
cipso_array = doi_def->map.std->cat.cipso;
for (;;) {
@@ -878,13 +884,13 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
u32 host_cat_size = 0;
u32 *host_cat_array = NULL;
- if (doi_def->type == CIPSO_V4_MAP_STD) {
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
host_cat_size = doi_def->map.std->cat.local_size;
host_cat_array = doi_def->map.std->cat.local;
}
for (;;) {
- host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat,
+ host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
host_spot + 1);
if (host_spot < 0)
break;
@@ -893,7 +899,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
case CIPSO_V4_MAP_PASS:
net_spot = host_spot;
break;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
if (host_spot >= host_cat_size)
return -EPERM;
net_spot = host_cat_array[host_spot];
@@ -939,7 +945,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
u32 net_cat_size = 0;
u32 *net_cat_array = NULL;
- if (doi_def->type == CIPSO_V4_MAP_STD) {
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
net_cat_size = doi_def->map.std->cat.cipso_size;
net_cat_array = doi_def->map.std->cat.cipso;
}
@@ -959,7 +965,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
case CIPSO_V4_MAP_PASS:
host_spot = net_spot;
break;
- case CIPSO_V4_MAP_STD:
+ case CIPSO_V4_MAP_TRANS:
if (net_spot >= net_cat_size)
return -EPERM;
host_spot = net_cat_array[net_spot];
@@ -967,7 +973,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
return -EPERM;
break;
}
- ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat,
+ ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
host_spot,
GFP_ATOMIC);
if (ret_val != 0)
@@ -1001,7 +1007,7 @@ static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def,
return -EFAULT;
for (iter = 0; iter < enumcat_len; iter += 2) {
- cat = ntohs(get_unaligned((__be16 *)&enumcat[iter]));
+ cat = get_unaligned_be16(&enumcat[iter]);
if (cat <= cat_prev)
return -EFAULT;
cat_prev = cat;
@@ -1033,7 +1039,8 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
u32 cat_iter = 0;
for (;;) {
- cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1);
+ cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+ cat + 1);
if (cat < 0)
break;
if ((cat_iter + 2) > net_cat_len)
@@ -1068,8 +1075,8 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
u32 iter;
for (iter = 0; iter < net_cat_len; iter += 2) {
- ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat,
- ntohs(get_unaligned((__be16 *)&net_cat[iter])),
+ ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
+ get_unaligned_be16(&net_cat[iter]),
GFP_ATOMIC);
if (ret_val != 0)
return ret_val;
@@ -1103,10 +1110,9 @@ static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def,
return -EFAULT;
for (iter = 0; iter < rngcat_len; iter += 4) {
- cat_high = ntohs(get_unaligned((__be16 *)&rngcat[iter]));
+ cat_high = get_unaligned_be16(&rngcat[iter]);
if ((iter + 4) <= rngcat_len)
- cat_low = ntohs(
- get_unaligned((__be16 *)&rngcat[iter + 2]));
+ cat_low = get_unaligned_be16(&rngcat[iter + 2]);
else
cat_low = 0;
@@ -1149,7 +1155,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
return -ENOSPC;
for (;;) {
- iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1);
+ iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+ iter + 1);
if (iter < 0)
break;
cat_size += (iter == 0 ? 0 : sizeof(u16));
@@ -1157,7 +1164,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
return -ENOSPC;
array[array_cnt++] = iter;
- iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter);
+ iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat,
+ iter);
if (iter < 0)
return -EFAULT;
cat_size += sizeof(u16);
@@ -1203,14 +1211,13 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
u16 cat_high;
for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) {
- cat_high = ntohs(get_unaligned((__be16 *)&net_cat[net_iter]));
+ cat_high = get_unaligned_be16(&net_cat[net_iter]);
if ((net_iter + 4) <= net_cat_len)
- cat_low = ntohs(
- get_unaligned((__be16 *)&net_cat[net_iter + 2]));
+ cat_low = get_unaligned_be16(&net_cat[net_iter + 2]);
else
cat_low = 0;
- ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat,
+ ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat,
cat_low,
cat_high,
GFP_ATOMIC);
@@ -1270,7 +1277,9 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
return -EPERM;
- ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
if (ret_val != 0)
return ret_val;
@@ -1283,7 +1292,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
return ret_val;
/* This will send packets using the "optimized" format when
- * possibile as specified in section 3.4.2.6 of the
+ * possible as specified in section 3.4.2.6 of the
* CIPSO draft. */
if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
tag_len = 14;
@@ -1292,7 +1301,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
} else
tag_len = 4;
- buffer[0] = 0x01;
+ buffer[0] = CIPSO_V4_TAG_RBITMAP;
buffer[1] = tag_len;
buffer[3] = level;
@@ -1322,12 +1331,12 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
if (ret_val != 0)
return ret_val;
- secattr->mls_lvl = level;
+ secattr->attr.mls.lvl = level;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
- if (secattr->mls_cat == NULL)
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
@@ -1335,7 +1344,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
tag_len - 4,
secattr);
if (ret_val != 0) {
- netlbl_secattr_catmap_free(secattr->mls_cat);
+ netlbl_secattr_catmap_free(secattr->attr.mls.cat);
return ret_val;
}
@@ -1369,7 +1378,9 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
return -EPERM;
- ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
if (ret_val != 0)
return ret_val;
@@ -1385,7 +1396,7 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
} else
tag_len = 4;
- buffer[0] = 0x02;
+ buffer[0] = CIPSO_V4_TAG_ENUM;
buffer[1] = tag_len;
buffer[3] = level;
@@ -1415,12 +1426,12 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
if (ret_val != 0)
return ret_val;
- secattr->mls_lvl = level;
+ secattr->attr.mls.lvl = level;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
- if (secattr->mls_cat == NULL)
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
@@ -1428,7 +1439,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
tag_len - 4,
secattr);
if (ret_val != 0) {
- netlbl_secattr_catmap_free(secattr->mls_cat);
+ netlbl_secattr_catmap_free(secattr->attr.mls.cat);
return ret_val;
}
@@ -1462,7 +1473,9 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
return -EPERM;
- ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
if (ret_val != 0)
return ret_val;
@@ -1478,7 +1491,7 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
} else
tag_len = 4;
- buffer[0] = 0x05;
+ buffer[0] = CIPSO_V4_TAG_RANGE;
buffer[1] = tag_len;
buffer[3] = level;
@@ -1507,12 +1520,12 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
if (ret_val != 0)
return ret_val;
- secattr->mls_lvl = level;
+ secattr->attr.mls.lvl = level;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
if (tag_len > 4) {
- secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
- if (secattr->mls_cat == NULL)
+ secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+ if (secattr->attr.mls.cat == NULL)
return -ENOMEM;
ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
@@ -1520,7 +1533,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
tag_len - 4,
secattr);
if (ret_val != 0) {
- netlbl_secattr_catmap_free(secattr->mls_cat);
+ netlbl_secattr_catmap_free(secattr->attr.mls.cat);
return ret_val;
}
@@ -1531,6 +1544,54 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
}
/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag. Returns the size of the tag
+ * on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ if (!(secattr->flags & NETLBL_SECATTR_SECID))
+ return -EPERM;
+
+ buffer[0] = CIPSO_V4_TAG_LOCAL;
+ buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+ *(u32 *)&buffer[2] = secattr->attr.secid;
+
+ return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and return the security attributes in @secattr.
+ * Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ secattr->attr.secid = *(u32 *)&tag[2];
+ secattr->flags |= NETLBL_SECATTR_SECID;
+
+ return 0;
+}
+
+/**
* cipso_v4_validate - Validate a CIPSO option
* @option: the start of the option, on error it is set to point to the error
*
@@ -1549,7 +1610,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
* that is unrecognized."
*
*/
-int cipso_v4_validate(unsigned char **option)
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
{
unsigned char *opt = *option;
unsigned char *tag;
@@ -1568,13 +1629,13 @@ int cipso_v4_validate(unsigned char **option)
}
rcu_read_lock();
- doi_def = cipso_v4_doi_search(ntohl(get_unaligned((__be32 *)&opt[2])));
+ doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
if (doi_def == NULL) {
err_offset = 2;
goto validate_return_locked;
}
- opt_iter = 6;
+ opt_iter = CIPSO_V4_HDR_LEN;
tag = opt + opt_iter;
while (opt_iter < opt_len) {
for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
@@ -1592,7 +1653,7 @@ int cipso_v4_validate(unsigned char **option)
switch (tag[0]) {
case CIPSO_V4_TAG_RBITMAP:
- if (tag_len < 4) {
+ if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
err_offset = opt_iter + 1;
goto validate_return_locked;
}
@@ -1610,7 +1671,7 @@ int cipso_v4_validate(unsigned char **option)
err_offset = opt_iter + 3;
goto validate_return_locked;
}
- if (tag_len > 4 &&
+ if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
cipso_v4_map_cat_rbm_valid(doi_def,
&tag[4],
tag_len - 4) < 0) {
@@ -1620,7 +1681,7 @@ int cipso_v4_validate(unsigned char **option)
}
break;
case CIPSO_V4_TAG_ENUM:
- if (tag_len < 4) {
+ if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
err_offset = opt_iter + 1;
goto validate_return_locked;
}
@@ -1630,7 +1691,7 @@ int cipso_v4_validate(unsigned char **option)
err_offset = opt_iter + 3;
goto validate_return_locked;
}
- if (tag_len > 4 &&
+ if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
cipso_v4_map_cat_enum_valid(doi_def,
&tag[4],
tag_len - 4) < 0) {
@@ -1639,7 +1700,7 @@ int cipso_v4_validate(unsigned char **option)
}
break;
case CIPSO_V4_TAG_RANGE:
- if (tag_len < 4) {
+ if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
err_offset = opt_iter + 1;
goto validate_return_locked;
}
@@ -1649,7 +1710,7 @@ int cipso_v4_validate(unsigned char **option)
err_offset = opt_iter + 3;
goto validate_return_locked;
}
- if (tag_len > 4 &&
+ if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
cipso_v4_map_cat_rng_valid(doi_def,
&tag[4],
tag_len - 4) < 0) {
@@ -1657,6 +1718,21 @@ int cipso_v4_validate(unsigned char **option)
goto validate_return_locked;
}
break;
+ case CIPSO_V4_TAG_LOCAL:
+ /* This is a non-standard tag that we only allow for
+ * local connections, so if the incoming interface is
+ * not the loopback device drop the packet. Further,
+ * there is no legitimate reason for setting this from
+ * userspace so reject it if skb is NULL. */
+ if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+ if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+ break;
default:
err_offset = opt_iter;
goto validate_return_locked;
@@ -1674,7 +1750,7 @@ validate_return:
}
/**
- * cipso_v4_error - Send the correct reponse for a bad packet
+ * cipso_v4_error - Send the correct response for a bad packet
* @skb: the packet
* @error: the error code
* @gateway: CIPSO gateway flag
@@ -1712,48 +1788,27 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
}
/**
- * cipso_v4_sock_setattr - Add a CIPSO option to a socket
- * @sk: the socket
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of opt_buf
* @doi_def: the CIPSO DOI to use
- * @secattr: the specific security attributes of the socket
+ * @secattr: the security attributes
*
* Description:
- * Set the CIPSO option on the given socket using the DOI definition and
- * security attributes passed to the function. This function requires
- * exclusive access to @sk, which means it either needs to be in the
- * process of being created or locked. Returns zero on success and negative
- * values on failure.
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function. Returns the length of the option on success and
+ * negative values on failure.
*
*/
-int cipso_v4_sock_setattr(struct sock *sk,
- const struct cipso_v4_doi *doi_def,
- const struct netlbl_lsm_secattr *secattr)
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
{
- int ret_val = -EPERM;
+ int ret_val;
u32 iter;
- unsigned char *buf;
- u32 buf_len = 0;
- u32 opt_len;
- struct ip_options *opt = NULL;
- struct inet_sock *sk_inet;
- struct inet_connection_sock *sk_conn;
- /* In the case of sock_create_lite(), the sock->sk field is not
- * defined yet but it is not a problem as the only users of these
- * "lite" PF_INET sockets are functions which do an accept() call
- * afterwards so we will label the socket as part of the accept(). */
- if (sk == NULL)
- return 0;
-
- /* We allocate the maximum CIPSO option size here so we are probably
- * being a little wasteful, but it makes our life _much_ easier later
- * on and after all we are only talking about 40 bytes. */
- buf_len = CIPSO_V4_OPT_LEN_MAX;
- buf = kmalloc(buf_len, GFP_ATOMIC);
- if (buf == NULL) {
- ret_val = -ENOMEM;
- goto socket_setattr_failure;
- }
+ if (buf_len <= CIPSO_V4_HDR_LEN)
+ return -ENOSPC;
/* XXX - This code assumes only one tag per CIPSO option which isn't
* really a good assumption to make but since we only support the MAC
@@ -1780,9 +1835,14 @@ int cipso_v4_sock_setattr(struct sock *sk,
&buf[CIPSO_V4_HDR_LEN],
buf_len - CIPSO_V4_HDR_LEN);
break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_gentag_loc(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
default:
- ret_val = -EPERM;
- goto socket_setattr_failure;
+ return -EPERM;
}
iter++;
@@ -1790,9 +1850,58 @@ int cipso_v4_sock_setattr(struct sock *sk,
iter < CIPSO_V4_TAG_MAXCNT &&
doi_def->tags[iter] != CIPSO_V4_TAG_INVALID);
if (ret_val < 0)
- goto socket_setattr_failure;
+ return ret_val;
cipso_v4_gentag_hdr(doi_def, buf, ret_val);
- buf_len = CIPSO_V4_HDR_LEN + ret_val;
+ return CIPSO_V4_HDR_LEN + ret_val;
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *old, *opt = NULL;
+ struct inet_sock *sk_inet;
+ struct inet_connection_sock *sk_conn;
+
+ /* In the case of sock_create_lite(), the sock->sk field is not
+ * defined yet but it is not a problem as the only users of these
+ * "lite" PF_INET sockets are functions which do an accept() call
+ * afterwards so we will label the socket as part of the accept(). */
+ if (sk == NULL)
+ return 0;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (buf == NULL) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto socket_setattr_failure;
+ buf_len = ret_val;
/* We can't use ip_options_get() directly because it makes a call to
* ip_options_get_alloc() which allocates memory with GFP_KERNEL and
@@ -1804,23 +1913,25 @@ int cipso_v4_sock_setattr(struct sock *sk,
ret_val = -ENOMEM;
goto socket_setattr_failure;
}
- memcpy(opt->__data, buf, buf_len);
- opt->optlen = opt_len;
- opt->is_data = 1;
- opt->cipso = sizeof(struct iphdr);
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
kfree(buf);
buf = NULL;
sk_inet = inet_sk(sk);
+
+ old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
if (sk_inet->is_icsk) {
sk_conn = inet_csk(sk);
- if (sk_inet->opt)
- sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen;
- sk_conn->icsk_ext_hdr_len += opt->optlen;
+ if (old)
+ sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+ sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
}
- opt = xchg(&sk_inet->opt, opt);
- kfree(opt);
+ rcu_assign_pointer(sk_inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
@@ -1831,6 +1942,187 @@ socket_setattr_failure:
}
/**
+ * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int cipso_v4_req_setattr(struct request_sock *req,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *opt = NULL;
+ struct inet_request_sock *req_inet;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (buf == NULL) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto req_setattr_failure;
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (opt == NULL) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ req_inet = inet_rsk(req);
+ opt = xchg(&req_inet->opt, opt);
+ if (opt)
+ kfree_rcu(opt, rcu);
+
+ return 0;
+
+req_setattr_failure:
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_delopt - Delete the CIPSO option from a set of IP options
+ * @opt_ptr: IP option pointer
+ *
+ * Description:
+ * Deletes the CIPSO IP option from a set of IP options and makes the necessary
+ * adjustments to the IP option structure. Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
+{
+ int hdr_delta = 0;
+ struct ip_options_rcu *opt = *opt_ptr;
+
+ if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+ u8 cipso_len;
+ u8 cipso_off;
+ unsigned char *cipso_ptr;
+ int iter;
+ int optlen_new;
+
+ cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+ cipso_ptr = &opt->opt.__data[cipso_off];
+ cipso_len = cipso_ptr[1];
+
+ if (opt->opt.srr > opt->opt.cipso)
+ opt->opt.srr -= cipso_len;
+ if (opt->opt.rr > opt->opt.cipso)
+ opt->opt.rr -= cipso_len;
+ if (opt->opt.ts > opt->opt.cipso)
+ opt->opt.ts -= cipso_len;
+ if (opt->opt.router_alert > opt->opt.cipso)
+ opt->opt.router_alert -= cipso_len;
+ opt->opt.cipso = 0;
+
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ opt->opt.optlen - cipso_off - cipso_len);
+
+ /* determining the new total option length is tricky because of
+ * the padding necessary, the only thing i can think to do at
+ * this point is walk the options one-by-one, skipping the
+ * padding at the end to determine the actual option size and
+ * from there we can determine the new total option length */
+ iter = 0;
+ optlen_new = 0;
+ while (iter < opt->opt.optlen)
+ if (opt->opt.__data[iter] != IPOPT_NOP) {
+ iter += opt->opt.__data[iter + 1];
+ optlen_new = iter;
+ } else
+ iter++;
+ hdr_delta = opt->opt.optlen;
+ opt->opt.optlen = (optlen_new + 3) & ~3;
+ hdr_delta -= opt->opt.optlen;
+ } else {
+ /* only the cipso option was present on the socket so we can
+ * remove the entire option struct */
+ *opt_ptr = NULL;
+ hdr_delta = opt->opt.optlen;
+ kfree_rcu(opt, rcu);
+ }
+
+ return hdr_delta;
+}
+
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+ int hdr_delta;
+ struct ip_options_rcu *opt;
+ struct inet_sock *sk_inet;
+
+ sk_inet = inet_sk(sk);
+ opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+ if (opt == NULL || opt->opt.cipso == 0)
+ return;
+
+ hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+ if (sk_inet->is_icsk && hdr_delta > 0) {
+ struct inet_connection_sock *sk_conn = inet_csk(sk);
+ sk_conn->icsk_ext_hdr_len -= hdr_delta;
+ sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+ }
+}
+
+/**
+ * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CIPSO option from a request socket, if present.
+ *
+ */
+void cipso_v4_req_delattr(struct request_sock *req)
+{
+ struct ip_options_rcu *opt;
+ struct inet_request_sock *req_inet;
+
+ req_inet = inet_rsk(req);
+ opt = req_inet->opt;
+ if (opt == NULL || opt->opt.cipso == 0)
+ return;
+
+ cipso_v4_delopt(&req_inet->opt);
+}
+
+/**
* cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
* @cipso: the CIPSO v4 option
* @secattr: the security attributes
@@ -1850,7 +2142,7 @@ static int cipso_v4_getattr(const unsigned char *cipso,
if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
return 0;
- doi = ntohl(get_unaligned((__be32 *)&cipso[2]));
+ doi = get_unaligned_be32(&cipso[2]);
rcu_read_lock();
doi_def = cipso_v4_doi_search(doi);
if (doi_def == NULL)
@@ -1868,7 +2160,12 @@ static int cipso_v4_getattr(const unsigned char *cipso,
case CIPSO_V4_TAG_RANGE:
ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr);
+ break;
}
+ if (ret_val == 0)
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
getattr_return:
rcu_read_unlock();
@@ -1889,14 +2186,136 @@ getattr_return:
*/
int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
{
- struct ip_options *opt;
+ struct ip_options_rcu *opt;
+ int res = -ENOMSG;
+
+ rcu_read_lock();
+ opt = rcu_dereference(inet_sk(sk)->inet_opt);
+ if (opt && opt->opt.cipso)
+ res = cipso_v4_getattr(opt->opt.__data +
+ opt->opt.cipso -
+ sizeof(struct iphdr),
+ secattr);
+ rcu_read_unlock();
+ return res;
+}
+
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+ u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+ u32 opt_len;
+ int len_delta;
- opt = inet_sk(sk)->opt;
- if (opt == NULL || opt->cipso == 0)
- return -ENOMSG;
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+ buf_len = ret_val;
+ opt_len = (buf_len + 3) & ~3;
- return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr),
- secattr);
+ /* we overwrite any existing options to ensure that we have enough
+ * room for the CIPSO option, the reason is that we _need_ to guarantee
+ * that the security label is applied to the packet - we do the same
+ * thing when using the socket options and it hasn't caused a problem,
+ * if we need to we can always revisit this choice later */
+
+ len_delta = opt_len - opt->optlen;
+ /* if we don't ensure enough headroom we could panic on the skb_push()
+ * call below so make sure we have enough, we are also "mangling" the
+ * packet so we should probably do a copy-on-write call anyway */
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ if (len_delta > 0) {
+ /* we assume that the header + opt->optlen have already been
+ * "pushed" in ip_options_build() or similar */
+ iph = ip_hdr(skb);
+ skb_push(skb, len_delta);
+ memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ } else if (len_delta < 0) {
+ iph = ip_hdr(skb);
+ memset(iph + 1, IPOPT_NOP, opt->optlen);
+ } else
+ iph = ip_hdr(skb);
+
+ if (opt->optlen > 0)
+ memset(opt, 0, sizeof(*opt));
+ opt->optlen = opt_len;
+ opt->cipso = sizeof(struct iphdr);
+ opt->is_changed = 1;
+
+ /* we have to do the following because we are being called from a
+ * netfilter hook which means the packet already has had the header
+ * fields populated and the checksum calculated - yes this means we
+ * are doing more work than needed but we do it to keep the core
+ * stack clean and tidy */
+ memcpy(iph + 1, buf, buf_len);
+ if (opt_len > buf_len)
+ memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+ if (len_delta != 0) {
+ iph->ihl = 5 + (opt_len >> 2);
+ iph->tot_len = htons(skb->len);
+ }
+ ip_send_check(iph);
+
+ return 0;
+}
+
+/**
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char *cipso_ptr;
+
+ if (opt->cipso == 0)
+ return 0;
+
+ /* since we are changing the packet we should make a copy */
+ ret_val = skb_cow(skb, skb_headroom(skb));
+ if (ret_val < 0)
+ return ret_val;
+
+ /* the easiest thing to do is just replace the cipso option with noop
+ * options since we don't change the size of the packet, although we
+ * still need to recalculate the checksum */
+
+ iph = ip_hdr(skb);
+ cipso_ptr = (unsigned char *)iph + opt->cipso;
+ memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+ opt->cipso = 0;
+ opt->is_changed = 1;
+
+ ip_send_check(iph);
+
+ return 0;
}
/**
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 0301dd468cf..a3095fdefbe 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct flowi4 *fl4;
struct rtable *rt;
__be32 saddr;
int oif;
@@ -38,40 +39,84 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk_dst_reset(sk);
+ lock_sock(sk);
+
oif = sk->sk_bound_dev_if;
- saddr = inet->saddr;
- if (MULTICAST(usin->sin_addr.s_addr)) {
+ saddr = inet->inet_saddr;
+ if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
if (!oif)
oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
}
- err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
- RT_CONN_FLAGS(sk), oif,
- sk->sk_protocol,
- inet->sport, usin->sin_port, sk, 1);
- if (err) {
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
+ RT_CONN_FLAGS(sk), oif,
+ sk->sk_protocol,
+ inet->inet_sport, usin->sin_port, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return err;
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ goto out;
}
if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
ip_rt_put(rt);
- return -EACCES;
+ err = -EACCES;
+ goto out;
+ }
+ if (!inet->inet_saddr)
+ inet->inet_saddr = fl4->saddr; /* Update source address */
+ if (!inet->inet_rcv_saddr) {
+ inet->inet_rcv_saddr = fl4->saddr;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
}
- if (!inet->saddr)
- inet->saddr = rt->rt_src; /* Update source address */
- if (!inet->rcv_saddr)
- inet->rcv_saddr = rt->rt_src;
- inet->daddr = rt->rt_dst;
- inet->dport = usin->sin_port;
+ inet->inet_daddr = fl4->daddr;
+ inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
- inet->id = jiffies;
+ inet->inet_id = jiffies;
- sk_dst_set(sk, &rt->u.dst);
- return(0);
+ sk_dst_set(sk, &rt->dst);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
}
-
EXPORT_SYMBOL(ip4_datagram_connect);
+/* Because UDP xmit path can manipulate sk_dst_cache without holding
+ * socket lock, we need to use sk_dst_set() here,
+ * even if we own the socket lock.
+ */
+void ip4_datagram_release_cb(struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+ rcu_read_unlock();
+ return;
+ }
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+
+ dst = !IS_ERR(rt) ? &rt->dst : NULL;
+ sk_dst_set(sk, dst);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 55d199e4ae2..e9449376b58 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,8 +1,6 @@
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $
- *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
@@ -29,7 +27,6 @@
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
@@ -52,68 +49,153 @@
#include <linux/notifier.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/kmod.h>
+#include <linux/netconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/addrconf.h>
+
+#include "fib_lookup.h"
-struct ipv4_devconf ipv4_devconf = {
+static struct ipv4_devconf ipv4_devconf = {
.data = {
- [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
},
};
static struct ipv4_devconf ipv4_devconf_dflt = {
.data = {
- [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
- [NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
},
};
-#define IPV4_DEVCONF_DFLT(attr) IPV4_DEVCONF(ipv4_devconf_dflt, attr)
+#define IPV4_DEVCONF_DFLT(net, attr) \
+ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .type = NLA_U32 },
[IFA_ADDRESS] = { .type = NLA_U32 },
[IFA_BROADCAST] = { .type = NLA_U32 },
- [IFA_ANYCAST] = { .type = NLA_U32 },
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .type = NLA_U32 },
};
+#define IN4_ADDR_HSIZE_SHIFT 8
+#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
+
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+
+static u32 inet_addr_hash(struct net *net, __be32 addr)
+{
+ u32 val = (__force u32) addr ^ net_hash_mix(net);
+
+ return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+ u32 hash = inet_addr_hash(net, ifa->ifa_local);
+
+ ASSERT_RTNL();
+ hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ ASSERT_RTNL();
+ hlist_del_init_rcu(&ifa->hash);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ u32 hash = inet_addr_hash(net, addr);
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) {
+ if (ifa->ifa_local == addr) {
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ if (!net_eq(dev_net(dev), net))
+ continue;
+ result = dev;
+ break;
+ }
+ }
+ if (!result) {
+ struct flowi4 fl4 = { .daddr = addr };
+ struct fib_result res = { 0 };
+ struct fib_table *local;
+
+ /* Fallback to FIB local table so that communication
+ * over loopback subnets work.
+ */
+ local = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local &&
+ !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+ res.type == RTN_LOCAL)
+ result = FIB_RES_DEV(res);
+ }
+ if (result && devref)
+ dev_hold(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
-static void devinet_sysctl_register(struct in_device *in_dev,
- struct ipv4_devconf *p);
-static void devinet_sysctl_unregister(struct ipv4_devconf *p);
+static void devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else
+static void devinet_sysctl_register(struct in_device *idev)
+{
+}
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
#endif
/* Locks all the inet devices. */
static struct in_ifaddr *inet_alloc_ifa(void)
{
- struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL);
-
- if (ifa) {
- INIT_RCU_HEAD(&ifa->rcu_head);
- }
-
- return ifa;
+ return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
}
static void inet_rcu_free_ifa(struct rcu_head *head)
@@ -124,7 +206,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
kfree(ifa);
}
-static inline void inet_free_ifa(struct in_ifaddr *ifa)
+static void inet_free_ifa(struct in_ifaddr *ifa)
{
call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
}
@@ -133,19 +215,19 @@ void in_dev_finish_destroy(struct in_device *idev)
{
struct net_device *dev = idev->dev;
- BUG_TRAP(!idev->ifa_list);
- BUG_TRAP(!idev->mc_list);
+ WARN_ON(idev->ifa_list);
+ WARN_ON(idev->mc_list);
+ kfree(rcu_dereference_protected(idev->mc_hash, 1));
#ifdef NET_REFCNT_DEBUG
- printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
- idev, dev ? dev->name : "NIL");
+ pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
dev_put(dev);
if (!idev->dead)
- printk("Freeing alive in_device %p\n", idev);
- else {
+ pr_err("Freeing alive in_device %p\n", idev);
+ else
kfree(idev);
- }
}
+EXPORT_SYMBOL(in_dev_finish_destroy);
static struct in_device *inetdev_init(struct net_device *dev)
{
@@ -156,25 +238,21 @@ static struct in_device *inetdev_init(struct net_device *dev)
in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);
if (!in_dev)
goto out;
- INIT_RCU_HEAD(&in_dev->rcu_head);
- memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
+ memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
+ sizeof(in_dev->cnf));
in_dev->cnf.sysctl = NULL;
in_dev->dev = dev;
- if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+ in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+ if (!in_dev->arp_parms)
goto out_kfree;
+ if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
+ dev_disable_lro(dev);
/* Reference in_dev->dev */
dev_hold(dev);
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
-#endif
-
/* Account for reference dev->ip_ptr (below) */
in_dev_hold(in_dev);
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_register(in_dev, &in_dev->cnf);
-#endif
+ devinet_sysctl_register(in_dev);
ip_mc_init_dev(in_dev);
if (dev->flags & IFF_UP)
ip_mc_up(in_dev);
@@ -213,15 +291,9 @@ static void inetdev_destroy(struct in_device *in_dev)
inet_free_ifa(ifa);
}
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_unregister(&in_dev->cnf);
-#endif
-
- dev->ip_ptr = NULL;
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_unregister(in_dev->arp_parms);
-#endif
+ devinet_sysctl_unregister(in_dev);
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
arp_ifdown(dev);
@@ -244,7 +316,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
}
static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
- int destroy, struct nlmsghdr *nlh, u32 pid)
+ int destroy, struct nlmsghdr *nlh, u32 portid)
{
struct in_ifaddr *promote = NULL;
struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -275,9 +347,10 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
if (!do_promote) {
+ inet_hash_remove(ifa);
*ifap1 = ifa->ifa_next;
- rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+ rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_DOWN, ifa);
inet_free_ifa(ifa);
@@ -288,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
}
+ /* On promotion all secondaries from subnet are changing
+ * the primary IP, we must remove all their routes silently
+ * and later to add them back with new prefsrc. Do this
+ * while all addresses are on the device list.
+ */
+ for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa))
+ fib_del_ifaddr(ifa, ifa1);
+ }
+
/* 2. Unlink it */
*ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
/* 3. Announce address deletion */
@@ -302,10 +387,11 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
is valid, it will try to restore deleted routes... Grr.
So that, this order is correct.
*/
- rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
if (promote) {
+ struct in_ifaddr *next_sec = promote->ifa_next;
if (prev_prom) {
prev_prom->ifa_next = promote->ifa_next;
@@ -314,10 +400,10 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
promote->ifa_flags &= ~IFA_F_SECONDARY;
- rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+ rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_UP, promote);
- for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
+ for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
if (ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa))
continue;
@@ -335,8 +421,12 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
}
+static void check_lifetime(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
+
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
- u32 pid)
+ u32 portid)
{
struct in_device *in_dev = ifa->ifa_dev;
struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -371,17 +461,22 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
}
if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
- net_srandom(ifa->ifa_local);
+ prandom_seed((__force u32) ifa->ifa_local);
ifap = last_primary;
}
ifa->ifa_next = *ifap;
*ifap = ifa;
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
- rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
return 0;
@@ -403,27 +498,33 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
return -ENOBUFS;
}
ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
if (ifa->ifa_dev != in_dev) {
- BUG_TRAP(!ifa->ifa_dev);
+ WARN_ON(ifa->ifa_dev);
in_dev_hold(in_dev);
ifa->ifa_dev = in_dev;
}
- if (LOOPBACK(ifa->ifa_local))
+ if (ipv4_is_loopback(ifa->ifa_local))
ifa->ifa_scope = RT_SCOPE_HOST;
return inet_insert_ifa(ifa);
}
-struct in_device *inetdev_by_index(int ifindex)
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
struct in_device *in_dev = NULL;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(&init_net, ifindex);
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
- in_dev = in_dev_get(dev);
- read_unlock(&dev_base_lock);
+ in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ rcu_read_unlock();
return in_dev;
}
+EXPORT_SYMBOL(inetdev_by_index);
/* Called only from RTNL semaphored context. No locks. */
@@ -439,8 +540,9 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
return NULL;
}
-static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
+ struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFA_MAX+1];
struct in_device *in_dev;
struct ifaddrmsg *ifm;
@@ -454,14 +556,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
goto errout;
ifm = nlmsg_data(nlh);
- in_dev = inetdev_by_index(ifm->ifa_index);
+ in_dev = inetdev_by_index(net, ifm->ifa_index);
if (in_dev == NULL) {
err = -ENODEV;
goto errout;
}
- __in_dev_put(in_dev);
-
for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
@@ -476,7 +576,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
!inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
continue;
- __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
+ __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
return 0;
}
@@ -485,57 +585,179 @@ errout:
return err;
}
-static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh)
+#define INFINITY_LIFE_TIME 0xFFFFFFFF
+
+static void check_lifetime(struct work_struct *work)
+{
+ unsigned long now, next, next_sec, next_sched;
+ struct in_ifaddr *ifa;
+ struct hlist_node *n;
+ int i;
+
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ bool change_needed = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ change_needed = true;
+ } else if (ifa->ifa_preferred_lft ==
+ INFINITY_LIFE_TIME) {
+ continue;
+ } else if (age >= ifa->ifa_preferred_lft) {
+ if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ, next))
+ next = ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ;
+
+ if (!(ifa->ifa_flags & IFA_F_DEPRECATED))
+ change_needed = true;
+ } else if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ,
+ next)) {
+ next = ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ;
+ }
+ }
+ rcu_read_unlock();
+ if (!change_needed)
+ continue;
+ rtnl_lock();
+ hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ struct in_ifaddr **ifap;
+
+ for (ifap = &ifa->ifa_dev->ifa_list;
+ *ifap != NULL; ifap = &(*ifap)->ifa_next) {
+ if (*ifap == ifa) {
+ inet_del_ifa(ifa->ifa_dev,
+ ifap, 1);
+ break;
+ }
+ }
+ } else if (ifa->ifa_preferred_lft !=
+ INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_preferred_lft &&
+ !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+ }
+ rtnl_unlock();
+ }
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ now = jiffies;
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
+ next_sched - now);
+}
+
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+ __u32 prefered_lft)
+{
+ unsigned long timeout;
+
+ ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout))
+ ifa->ifa_valid_lft = timeout;
+ else
+ ifa->ifa_flags |= IFA_F_PERMANENT;
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ ifa->ifa_preferred_lft = timeout;
+ }
+ ifa->ifa_tstamp = jiffies;
+ if (!ifa->ifa_cstamp)
+ ifa->ifa_cstamp = ifa->ifa_tstamp;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
+ __u32 *pvalid_lft, __u32 *pprefered_lft)
{
struct nlattr *tb[IFA_MAX+1];
struct in_ifaddr *ifa;
struct ifaddrmsg *ifm;
struct net_device *dev;
struct in_device *in_dev;
- int err = -EINVAL;
+ int err;
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
if (err < 0)
goto errout;
ifm = nlmsg_data(nlh);
- if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL) {
- err = -EINVAL;
+ err = -EINVAL;
+ if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL)
goto errout;
- }
- dev = __dev_get_by_index(&init_net, ifm->ifa_index);
- if (dev == NULL) {
- err = -ENODEV;
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ err = -ENODEV;
+ if (dev == NULL)
goto errout;
- }
in_dev = __in_dev_get_rtnl(dev);
- if (in_dev == NULL) {
- err = -ENOBUFS;
+ err = -ENOBUFS;
+ if (in_dev == NULL)
goto errout;
- }
-
- ipv4_devconf_setall(in_dev);
ifa = inet_alloc_ifa();
- if (ifa == NULL) {
+ if (ifa == NULL)
/*
* A potential indev allocation can be left alive, it stays
* assigned to its device and is destroy with it.
*/
- err = -ENOBUFS;
goto errout;
- }
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
in_dev_hold(in_dev);
if (tb[IFA_ADDRESS] == NULL)
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
- ifa->ifa_flags = ifm->ifa_flags;
+ ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
+ ifm->ifa_flags;
ifa->ifa_scope = ifm->ifa_scope;
ifa->ifa_dev = in_dev;
@@ -545,42 +767,96 @@ static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh)
if (tb[IFA_BROADCAST])
ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
- if (tb[IFA_ANYCAST])
- ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]);
-
if (tb[IFA_LABEL])
nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ err = -EINVAL;
+ goto errout_free;
+ }
+ *pvalid_lft = ci->ifa_valid;
+ *pprefered_lft = ci->ifa_prefered;
+ }
+
return ifa;
+errout_free:
+ inet_free_ifa(ifa);
errout:
return ERR_PTR(err);
}
-static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap;
+
+ if (!ifa->ifa_local)
+ return NULL;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa) &&
+ ifa1->ifa_local == ifa->ifa_local)
+ return ifa1;
+ }
+ return NULL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
struct in_ifaddr *ifa;
+ struct in_ifaddr *ifa_existing;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
ASSERT_RTNL();
- ifa = rtm_to_ifaddr(nlh);
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
if (IS_ERR(ifa))
return PTR_ERR(ifa);
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
+ ifa_existing = find_matching_ifa(ifa);
+ if (!ifa_existing) {
+ /* It would be best to check for !NLM_F_CREATE here but
+ * userspace already relies on not having to provide this.
+ */
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ } else {
+ inet_free_ifa(ifa);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ return -EEXIST;
+ ifa = ifa_existing;
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq,
+ &check_lifetime_work, 0);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+ }
+ return 0;
}
/*
* Determine a default network mask, based on the IP address.
*/
-static __inline__ int inet_abc_len(__be32 addr)
+static int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
- if (ZERONET(addr))
+ if (ipv4_is_zeronet(addr))
rc = 0;
else {
__u32 haddr = ntohl(addr);
@@ -597,7 +873,7 @@ static __inline__ int inet_abc_len(__be32 addr)
}
-int devinet_ioctl(unsigned int cmd, void __user *arg)
+int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct ifreq ifr;
struct sockaddr_in sin_orig;
@@ -625,9 +901,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if (colon)
*colon = 0;
-#ifdef CONFIG_KMOD
- dev_load(&init_net, ifr.ifr_name);
-#endif
+ dev_load(net, ifr.ifr_name);
switch (cmd) {
case SIOCGIFADDR: /* Get interface address */
@@ -644,16 +918,16 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
break;
case SIOCSIFFLAGS:
- ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
break;
case SIOCSIFADDR: /* Set interface address (and family) */
case SIOCSIFBRDADDR: /* Set the broadcast address */
case SIOCSIFDSTADDR: /* Set the destination address */
case SIOCSIFNETMASK: /* Set the netmask for the interface */
- ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
goto out;
ret = -EINVAL;
if (sin->sin_family != AF_INET)
@@ -667,13 +941,15 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
rtnl_lock();
ret = -ENODEV;
- if ((dev = __dev_get_by_name(&init_net, ifr.ifr_name)) == NULL)
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev)
goto done;
if (colon)
*colon = ':';
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
if (tryaddrmatch) {
/* Matthias Andree */
/* compare label and address (4.4BSD style) */
@@ -684,7 +960,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
ifap = &ifa->ifa_next) {
if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
sin_orig.sin_addr.s_addr ==
- ifa->ifa_address) {
+ ifa->ifa_local) {
break; /* found */
}
}
@@ -741,8 +1017,10 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
- if ((ifa = inet_alloc_ifa()) == NULL)
+ ifa = inet_alloc_ifa();
+ if (!ifa)
break;
+ INIT_HLIST_NODE(&ifa->hash);
if (colon)
memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
else
@@ -753,7 +1031,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
break;
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_broadcast = 0;
- ifa->ifa_anycast = 0;
+ ifa->ifa_scope = 0;
}
ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
@@ -769,6 +1047,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
ifa->ifa_prefixlen = 32;
ifa->ifa_mask = inet_make_mask(32);
}
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
ret = inet_set_ifa(dev, ifa);
break;
@@ -843,10 +1122,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
struct ifreq ifr;
int done = 0;
- if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+ if (!in_dev)
goto out;
- for (; ifa; ifa = ifa->ifa_next) {
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
if (!buf) {
done += sizeof(ifr);
continue;
@@ -854,10 +1133,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
if (len < (int) sizeof(ifr))
break;
memset(&ifr, 0, sizeof(struct ifreq));
- if (ifa->ifa_label)
- strcpy(ifr.ifr_name, ifa->ifa_label);
- else
- strcpy(ifr.ifr_name, dev->name);
+ strcpy(ifr.ifr_name, ifa->ifa_label);
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
@@ -879,6 +1155,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
{
__be32 addr = 0;
struct in_device *in_dev;
+ struct net *net = dev_net(dev);
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
@@ -895,36 +1172,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
if (!addr)
addr = ifa->ifa_local;
} endfor_ifa(in_dev);
-no_in_dev:
- rcu_read_unlock();
if (addr)
- goto out;
+ goto out_unlock;
+no_in_dev:
/* Not loopback addresses on loopback should be preferred
in this case. It is importnat that lo is the first interface
in dev_base list.
*/
- read_lock(&dev_base_lock);
- rcu_read_lock();
- for_each_netdev(&init_net, dev) {
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
continue;
for_primary_ifa(in_dev) {
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope) {
addr = ifa->ifa_local;
- goto out_unlock_both;
+ goto out_unlock;
}
} endfor_ifa(in_dev);
}
-out_unlock_both:
- read_unlock(&dev_base_lock);
+out_unlock:
rcu_read_unlock();
-out:
return addr;
}
+EXPORT_SYMBOL(inet_select_addr);
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
__be32 local, int scope)
@@ -960,44 +1234,40 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
}
} endfor_ifa(in_dev);
- return same? addr : 0;
+ return same ? addr : 0;
}
/*
* Confirm that local IP address exists using wildcards:
- * - dev: only on this interface, 0=any interface
+ * - net: netns to check, cannot be NULL
+ * - in_dev: only on this interface, NULL=any interface
* - dst: only in the same subnet as dst, 0=any dst
* - local: address, 0=autoselect the local address
* - scope: maximum allowed scope value for the local address
*/
-__be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, int scope)
+__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
+ __be32 dst, __be32 local, int scope)
{
__be32 addr = 0;
- struct in_device *in_dev;
-
- if (dev) {
- rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)))
- addr = confirm_addr_indev(in_dev, dst, local, scope);
- rcu_read_unlock();
+ struct net_device *dev;
- return addr;
- }
+ if (in_dev != NULL)
+ return confirm_addr_indev(in_dev, dst, local, scope);
- read_lock(&dev_base_lock);
rcu_read_lock();
- for_each_netdev(&init_net, dev) {
- if ((in_dev = __in_dev_get_rcu(dev))) {
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
break;
}
}
rcu_read_unlock();
- read_unlock(&dev_base_lock);
return addr;
}
+EXPORT_SYMBOL(inet_confirm_addr);
/*
* Device notifier
@@ -1007,14 +1277,16 @@ int register_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(register_inetaddr_notifier);
int unregister_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
-/* Rename ifa_labels for a device name change. Make some effort to preserve existing
- * alias numbering and to create unique labels if possible.
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
*/
static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
{
@@ -1027,17 +1299,38 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
memcpy(old, ifa->ifa_label, IFNAMSIZ);
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
if (named++ == 0)
- continue;
- dot = strchr(ifa->ifa_label, ':');
+ goto skip;
+ dot = strchr(old, ':');
if (dot == NULL) {
sprintf(old, ":%d", named);
dot = old;
}
- if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
strcat(ifa->ifa_label, dot);
- } else {
+ else
strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
- }
+skip:
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+}
+
+static bool inetdev_valid_mtu(unsigned int mtu)
+{
+ return mtu >= 68;
+}
+
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+ struct in_device *in_dev)
+
+{
+ struct in_ifaddr *ifa;
+
+ for (ifa = in_dev->ifa_list; ifa;
+ ifa = ifa->ifa_next) {
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ifa->ifa_local, dev,
+ ifa->ifa_local, NULL,
+ dev->dev_addr, NULL);
}
}
@@ -1046,12 +1339,9 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
static int inetdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev = __in_dev_get_rtnl(dev);
- if (dev->nd_net != &init_net)
- return NOTIFY_DONE;
-
ASSERT_RTNL();
if (!in_dev) {
@@ -1063,21 +1353,27 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
}
+ } else if (event == NETDEV_CHANGEMTU) {
+ /* Re-enabling IP */
+ if (inetdev_valid_mtu(dev->mtu))
+ in_dev = inetdev_init(dev);
}
goto out;
}
switch (event) {
case NETDEV_REGISTER:
- printk(KERN_DEBUG "inetdev_event: bug\n");
- dev->ip_ptr = NULL;
+ pr_debug("%s: bug\n", __func__);
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
- if (dev->mtu < 68)
+ if (!inetdev_valid_mtu(dev->mtu))
break;
if (dev->flags & IFF_LOOPBACK) {
- struct in_ifaddr *ifa;
- if ((ifa = inet_alloc_ifa()) != NULL) {
+ struct in_ifaddr *ifa = inet_alloc_ifa();
+
+ if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
@@ -1086,18 +1382,36 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+ INFINITY_LIFE_TIME);
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
inet_insert_ifa(ifa);
}
}
ip_mc_up(in_dev);
+ /* fall through */
+ case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ /* fall through */
+ case NETDEV_NOTIFY_PEERS:
+ /* Send gratuitous ARP to notify of link change */
+ inetdev_send_gratuitous_arp(dev, in_dev);
break;
case NETDEV_DOWN:
ip_mc_down(in_dev);
break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ ip_mc_unmap(in_dev);
+ break;
+ case NETDEV_POST_TYPE_CHANGE:
+ ip_mc_remap(in_dev);
+ break;
case NETDEV_CHANGEMTU:
- if (dev->mtu >= 68)
+ if (inetdev_valid_mtu(dev->mtu))
break;
- /* MTU falled under 68, disable IP */
+ /* disable IP when MTU is not enough */
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
@@ -1107,13 +1421,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
*/
inetdev_changename(dev, in_dev);
-#ifdef CONFIG_SYSCTL
- devinet_sysctl_unregister(&in_dev->cnf);
- neigh_sysctl_unregister(in_dev->arp_parms);
- neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
- devinet_sysctl_register(in_dev, &in_dev->cnf);
-#endif
+ devinet_sysctl_unregister(in_dev);
+ devinet_sysctl_register(in_dev);
break;
}
out:
@@ -1121,50 +1430,89 @@ out:
}
static struct notifier_block ip_netdev_notifier = {
- .notifier_call =inetdev_event,
+ .notifier_call = inetdev_event,
};
-static inline size_t inet_nlmsg_size(void)
+static size_t inet_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ nla_total_size(4) /* IFA_ADDRESS */
+ nla_total_size(4) /* IFA_LOCAL */
+ nla_total_size(4) /* IFA_BROADCAST */
- + nla_total_size(4) /* IFA_ANYCAST */
- + nla_total_size(IFNAMSIZ); /* IFA_LABEL */
+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
+}
+
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+ unsigned long tstamp, u32 preferred, u32 valid)
+{
+ struct ifa_cacheinfo ci;
+
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
+ ci.ifa_prefered = preferred;
+ ci.ifa_valid = valid;
+
+ return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
}
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 pid, u32 seq, int event, unsigned int flags)
+ u32 portid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
+ u32 preferred, valid;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ifm = nlmsg_data(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
- ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+ ifm->ifa_flags = ifa->ifa_flags;
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
- if (ifa->ifa_address)
- NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
-
- if (ifa->ifa_local)
- NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
-
- if (ifa->ifa_broadcast)
- NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
-
- if (ifa->ifa_anycast)
- NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast);
+ if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
+ preferred = ifa->ifa_preferred_lft;
+ valid = ifa->ifa_valid_lft;
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - ifa->ifa_tstamp) / HZ;
- if (ifa->ifa_label[0])
- NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
+ if ((ifa->ifa_address &&
+ nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_broadcast &&
+ nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+ preferred, valid))
+ goto nla_put_failure;
return nlmsg_end(skb, nlh);
@@ -1175,261 +1523,583 @@ nla_put_failure:
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
- int idx, ip_idx;
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ int ip_idx, s_ip_idx;
struct net_device *dev;
struct in_device *in_dev;
struct in_ifaddr *ifa;
- int s_ip_idx, s_idx = cb->args[0];
-
- s_ip_idx = ip_idx = cb->args[1];
- idx = 0;
- for_each_netdev(&init_net, dev) {
- if (idx < s_idx)
- goto cont;
- if (idx > s_idx)
- s_ip_idx = 0;
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
- goto cont;
-
- for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
- ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ s_ip_idx = ip_idx = cb->args[2];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ if (h > s_h || idx > s_idx)
+ s_ip_idx = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) <= 0)
- goto done;
- }
+ RTM_NEWADDR, NLM_F_MULTI) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ }
cont:
- idx++;
+ idx++;
+ }
+ rcu_read_unlock();
}
done:
- cb->args[0] = idx;
- cb->args[1] = ip_idx;
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = ip_idx;
return skb->len;
}
-static void rtmsg_ifa(int event, struct in_ifaddr* ifa, struct nlmsghdr *nlh,
- u32 pid)
+static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ u32 portid)
{
struct sk_buff *skb;
u32 seq = nlh ? nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
+ struct net *net;
+ net = dev_net(ifa->ifa_dev->dev);
skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
if (skb == NULL)
goto errout;
- err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
+ err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}
-#ifdef CONFIG_SYSCTL
+static size_t inet_get_link_af_size(const struct net_device *dev)
+{
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+
+ if (!in_dev)
+ return 0;
-static void devinet_copy_dflt_conf(int i)
+ return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
+}
+
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
{
- struct net_device *dev;
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ struct nlattr *nla;
+ int i;
- read_lock(&dev_base_lock);
- for_each_netdev(&init_net, dev) {
- struct in_device *in_dev;
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (in_dev && !test_bit(i, in_dev->cnf.state))
- in_dev->cnf.data[i] = ipv4_devconf_dflt.data[i];
- rcu_read_unlock();
+ if (!in_dev)
+ return -ENODATA;
+
+ nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ for (i = 0; i < IPV4_DEVCONF_MAX; i++)
+ ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
+
+ return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
+ [IFLA_INET_CONF] = { .type = NLA_NESTED },
+};
+
+static int inet_validate_link_af(const struct net_device *dev,
+ const struct nlattr *nla)
+{
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int err, rem;
+
+ if (dev && !__in_dev_get_rtnl(dev))
+ return -EAFNOSUPPORT;
+
+ err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
+ int cfgid = nla_type(a);
+
+ if (nla_len(a) < 4)
+ return -EINVAL;
+
+ if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
+ return -EINVAL;
+ }
}
- read_unlock(&dev_base_lock);
+
+ return 0;
}
-static int devinet_conf_proc(ctl_table *ctl, int write,
- struct file* filp, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
{
- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int rem;
- if (write) {
- struct ipv4_devconf *cnf = ctl->extra1;
- int i = (int *)ctl->data - cnf->data;
+ if (!in_dev)
+ return -EAFNOSUPPORT;
- set_bit(i, cnf->state);
+ if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
+ BUG();
- if (cnf == &ipv4_devconf_dflt)
- devinet_copy_dflt_conf(i);
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
+ ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
}
- return ret;
+ return 0;
}
-static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
+static int inet_netconf_msgsize_devconf(int type)
{
- struct ipv4_devconf *cnf;
- int *valp = table->data;
- int new;
- int i;
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+
+ /* type -1 is used for ALL */
+ if (type == -1 || type == NETCONFA_FORWARDING)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_RP_FILTER)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ size += nla_total_size(4);
+
+ return size;
+}
- if (!newval || !newlen)
- return 0;
+static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+ struct ipv4_devconf *devconf, u32 portid,
+ u32 seq, int event, unsigned int flags,
+ int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
- if (newlen != sizeof(int))
- return -EINVAL;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
- if (get_user(new, (int __user *)newval))
- return -EFAULT;
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_INET;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+ goto nla_put_failure;
+
+ /* type -1 is used for ALL */
+ if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_FORWARDING,
+ IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+ nla_put_s32(skb, NETCONFA_RP_FILTER,
+ IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+ IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+ IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
+ goto nla_put_failure;
- if (new == *valp)
- return 0;
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
- if (oldval && oldlenp) {
- size_t len;
+void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
+ struct ipv4_devconf *devconf)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
- if (get_user(len, oldlenp))
- return -EFAULT;
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
- if (len) {
- if (len > table->maxlen)
- len = table->maxlen;
- if (copy_to_user(oldval, valp, len))
- return -EFAULT;
- if (put_user(len, oldlenp))
- return -EFAULT;
- }
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+ RTM_NEWNETCONF, 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
}
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+}
- *valp = new;
+static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+};
- cnf = table->extra1;
- i = (int *)table->data - cnf->data;
+static int inet_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX+1];
+ struct netconfmsg *ncm;
+ struct sk_buff *skb;
+ struct ipv4_devconf *devconf;
+ struct in_device *in_dev;
+ struct net_device *dev;
+ int ifindex;
+ int err;
- set_bit(i, cnf->state);
+ err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+ devconf_ipv4_policy);
+ if (err < 0)
+ goto errout;
- if (cnf == &ipv4_devconf_dflt)
- devinet_copy_dflt_conf(i);
+ err = EINVAL;
+ if (!tb[NETCONFA_IFINDEX])
+ goto errout;
- return 1;
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ switch (ifindex) {
+ case NETCONFA_IFINDEX_ALL:
+ devconf = net->ipv4.devconf_all;
+ break;
+ case NETCONFA_IFINDEX_DEFAULT:
+ devconf = net->ipv4.devconf_dflt;
+ break;
+ default:
+ dev = __dev_get_by_index(net, ifindex);
+ if (dev == NULL)
+ goto errout;
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev == NULL)
+ goto errout;
+ devconf = &in_dev->cnf;
+ break;
+ }
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ -1);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
}
-void inet_forward_change(void)
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
struct net_device *dev;
- int on = IPV4_DEVCONF_ALL(FORWARDING);
+ struct in_device *in_dev;
+ struct hlist_head *head;
- IPV4_DEVCONF_ALL(ACCEPT_REDIRECTS) = !on;
- IPV4_DEVCONF_DFLT(FORWARDING) = on;
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
- read_lock(&dev_base_lock);
- for_each_netdev(&init_net, dev) {
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ if (inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) <= 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev && !test_bit(i, in_dev->cnf.state))
+ in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+ }
+ rcu_read_unlock();
+}
+
+/* called with RTNL locked */
+static void inet_forward_change(struct net *net)
+{
+ struct net_device *dev;
+ int on = IPV4_DEVCONF_ALL(net, FORWARDING);
+
+ IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
+ IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+
+ for_each_netdev(net, dev) {
+ struct in_device *in_dev;
+ if (on)
+ dev_disable_lro(dev);
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
- if (in_dev)
+ if (in_dev) {
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ }
rcu_read_unlock();
}
- read_unlock(&dev_base_lock);
+}
- rt_cache_flush(0);
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
+{
+ if (cnf == net->ipv4.devconf_dflt)
+ return NETCONFA_IFINDEX_DEFAULT;
+ else if (cnf == net->ipv4.devconf_all)
+ return NETCONFA_IFINDEX_ALL;
+ else {
+ struct in_device *idev
+ = container_of(cnf, struct in_device, cnf);
+ return idev->dev->ifindex;
+ }
}
-static int devinet_sysctl_forward(ctl_table *ctl, int write,
- struct file* filp, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int devinet_conf_proc(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
- int *valp = ctl->data;
- int val = *valp;
- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ int old_value = *(int *)ctl->data;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ int new_value = *(int *)ctl->data;
- if (write && *valp != val) {
- if (valp == &IPV4_DEVCONF_ALL(FORWARDING))
- inet_forward_change();
- else if (valp != &IPV4_DEVCONF_DFLT(FORWARDING))
- rt_cache_flush(0);
+ if (write) {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct net *net = ctl->extra2;
+ int i = (int *)ctl->data - cnf->data;
+ int ifindex;
+
+ set_bit(i, cnf->state);
+
+ if (cnf == net->ipv4.devconf_dflt)
+ devinet_copy_dflt_conf(net, i);
+ if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+ i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
+ if ((new_value == 0) && (old_value != 0))
+ rt_cache_flush(net);
+
+ if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
+ ifindex, cnf);
+ }
+ if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ ifindex, cnf);
+ }
}
return ret;
}
-int ipv4_doint_and_flush(ctl_table *ctl, int write,
- struct file* filp, void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+ loff_t pos = *ppos;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
- if (write && *valp != val)
- rt_cache_flush(0);
+ if (write && *valp != val) {
+ struct net *net = ctl->extra2;
+
+ if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
+ if (!rtnl_trylock()) {
+ /* Restore the original values before restarting */
+ *valp = val;
+ *ppos = pos;
+ return restart_syscall();
+ }
+ if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+ inet_forward_change(net);
+ } else {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct in_device *idev =
+ container_of(cnf, struct in_device, cnf);
+ if (*valp)
+ dev_disable_lro(idev->dev);
+ inet_netconf_notify_devconf(net,
+ NETCONFA_FORWARDING,
+ idev->dev->ifindex,
+ cnf);
+ }
+ rtnl_unlock();
+ rt_cache_flush(net);
+ } else
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+ }
return ret;
}
-int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
+static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
- int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp,
- newval, newlen);
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ struct net *net = ctl->extra2;
- if (ret == 1)
- rt_cache_flush(0);
+ if (write && *valp != val)
+ rt_cache_flush(net);
return ret;
}
-
-#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc, sysctl) \
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
{ \
- .ctl_name = NET_IPV4_CONF_ ## attr, \
.procname = name, \
.data = ipv4_devconf.data + \
- NET_IPV4_CONF_ ## attr - 1, \
+ IPV4_DEVCONF_ ## attr - 1, \
.maxlen = sizeof(int), \
.mode = mval, \
.proc_handler = proc, \
- .strategy = sysctl, \
.extra1 = &ipv4_devconf, \
}
#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc, \
- devinet_conf_sysctl)
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc, \
- devinet_conf_sysctl)
+ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
-#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc, sysctl) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc, sysctl)
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
- DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush, \
- ipv4_doint_and_flush_strategy)
+ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
- ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
- ctl_table devinet_dev[2];
- ctl_table devinet_conf_dir[2];
- ctl_table devinet_proto_dir[2];
- ctl_table devinet_root_dir[2];
+ struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
- devinet_sysctl_forward,
- devinet_conf_sysctl),
+ devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
@@ -1439,6 +2109,8 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
"accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+ DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
@@ -1448,131 +2120,216 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+ DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv2_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv3_unsolicited_report_interval"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
- DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
- "force_igmp_version"),
DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
"promote_secondaries"),
- },
- .devinet_dev = {
- {
- .ctl_name = NET_PROTO_CONF_ALL,
- .procname = "all",
- .mode = 0555,
- .child = devinet_sysctl.devinet_vars,
- },
- },
- .devinet_conf_dir = {
- {
- .ctl_name = NET_IPV4_CONF,
- .procname = "conf",
- .mode = 0555,
- .child = devinet_sysctl.devinet_dev,
- },
- },
- .devinet_proto_dir = {
- {
- .ctl_name = NET_IPV4,
- .procname = "ipv4",
- .mode = 0555,
- .child = devinet_sysctl.devinet_conf_dir,
- },
- },
- .devinet_root_dir = {
- {
- .ctl_name = CTL_NET,
- .procname = "net",
- .mode = 0555,
- .child = devinet_sysctl.devinet_proto_dir,
- },
+ DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+ "route_localnet"),
},
};
-static void devinet_sysctl_register(struct in_device *in_dev,
- struct ipv4_devconf *p)
+static int __devinet_sysctl_register(struct net *net, char *dev_name,
+ struct ipv4_devconf *p)
{
int i;
- struct net_device *dev = in_dev ? in_dev->dev : NULL;
- struct devinet_sysctl_table *t = kmemdup(&devinet_sysctl, sizeof(*t),
- GFP_KERNEL);
- char *dev_name = NULL;
+ struct devinet_sysctl_table *t;
+ char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
if (!t)
- return;
+ goto out;
+
for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
t->devinet_vars[i].extra1 = p;
+ t->devinet_vars[i].extra2 = net;
}
- if (dev) {
- dev_name = dev->name;
- t->devinet_dev[0].ctl_name = dev->ifindex;
- } else {
- dev_name = "default";
- t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
- }
-
- /*
- * Make a copy of dev_name, because '.procname' is regarded as const
- * by sysctl and we wouldn't want anyone to change it under our feet
- * (see SIOCSIFNAME).
- */
- dev_name = kstrdup(dev_name, GFP_KERNEL);
- if (!dev_name)
- goto free;
+ snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
- t->devinet_dev[0].procname = dev_name;
- t->devinet_dev[0].child = t->devinet_vars;
- t->devinet_conf_dir[0].child = t->devinet_dev;
- t->devinet_proto_dir[0].child = t->devinet_conf_dir;
- t->devinet_root_dir[0].child = t->devinet_proto_dir;
-
- t->sysctl_header = register_sysctl_table(t->devinet_root_dir);
+ t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
if (!t->sysctl_header)
- goto free_procname;
+ goto free;
p->sysctl = t;
- return;
+ return 0;
- /* error path */
- free_procname:
- kfree(dev_name);
- free:
+free:
kfree(t);
- return;
+out:
+ return -ENOBUFS;
}
-static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
{
- if (p->sysctl) {
- struct devinet_sysctl_table *t = p->sysctl;
- p->sysctl = NULL;
- unregister_sysctl_table(t->sysctl_header);
- kfree(t->devinet_dev[0].procname);
- kfree(t);
+ struct devinet_sysctl_table *t = cnf->sysctl;
+
+ if (t == NULL)
+ return;
+
+ cnf->sysctl = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+}
+
+static void devinet_sysctl_register(struct in_device *idev)
+{
+ neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
+ __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ &idev->cnf);
+}
+
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+ __devinet_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->arp_parms);
+}
+
+static struct ctl_table ctl_forward_entry[] = {
+ {
+ .procname = "ip_forward",
+ .data = &ipv4_devconf.data[
+ IPV4_DEVCONF_FORWARDING - 1],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = devinet_sysctl_forward,
+ .extra1 = &ipv4_devconf,
+ .extra2 = &init_net,
+ },
+ { },
+};
+#endif
+
+static __net_init int devinet_init_net(struct net *net)
+{
+ int err;
+ struct ipv4_devconf *all, *dflt;
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl = ctl_forward_entry;
+ struct ctl_table_header *forw_hdr;
+#endif
+
+ err = -ENOMEM;
+ all = &ipv4_devconf;
+ dflt = &ipv4_devconf_dflt;
+
+ if (!net_eq(net, &init_net)) {
+ all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (all == NULL)
+ goto err_alloc_all;
+
+ dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (dflt == NULL)
+ goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+ tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_alloc_ctl;
+
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
+#endif
}
+
+#ifdef CONFIG_SYSCTL
+ err = __devinet_sysctl_register(net, "all", all);
+ if (err < 0)
+ goto err_reg_all;
+
+ err = __devinet_sysctl_register(net, "default", dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+
+ err = -ENOMEM;
+ forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
+ if (forw_hdr == NULL)
+ goto err_reg_ctl;
+ net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+ net->ipv4.devconf_all = all;
+ net->ipv4.devconf_dflt = dflt;
+ return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl:
+ __devinet_sysctl_unregister(dflt);
+err_reg_dflt:
+ __devinet_sysctl_unregister(all);
+err_reg_all:
+ if (tbl != ctl_forward_entry)
+ kfree(tbl);
+err_alloc_ctl:
+#endif
+ if (dflt != &ipv4_devconf_dflt)
+ kfree(dflt);
+err_alloc_dflt:
+ if (all != &ipv4_devconf)
+ kfree(all);
+err_alloc_all:
+ return err;
}
+
+static __net_exit void devinet_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.forw_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.forw_hdr);
+ __devinet_sysctl_unregister(net->ipv4.devconf_dflt);
+ __devinet_sysctl_unregister(net->ipv4.devconf_all);
+ kfree(tbl);
#endif
+ kfree(net->ipv4.devconf_dflt);
+ kfree(net->ipv4.devconf_all);
+}
+
+static __net_initdata struct pernet_operations devinet_ops = {
+ .init = devinet_init_net,
+ .exit = devinet_exit_net,
+};
+
+static struct rtnl_af_ops inet_af_ops = {
+ .family = AF_INET,
+ .fill_link_af = inet_fill_link_af,
+ .get_link_af_size = inet_get_link_af_size,
+ .validate_link_af = inet_validate_link_af,
+ .set_link_af = inet_set_link_af,
+};
void __init devinet_init(void)
{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
+ register_pernet_subsys(&devinet_ops);
+
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
- rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
- rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
- rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
-#ifdef CONFIG_SYSCTL
- devinet_sysctl.sysctl_header =
- register_sysctl_table(devinet_sysctl.devinet_root_dir);
- devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
-#endif
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
+ rtnl_af_register(&inet_af_ops);
+
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
+ rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
+ inet_netconf_dump_devconf, NULL);
}
-EXPORT_SYMBOL(in_dev_finish_destroy);
-EXPORT_SYMBOL(inet_select_addr);
-EXPORT_SYMBOL(inetdev_by_index);
-EXPORT_SYMBOL(register_inetaddr_notifier);
-EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 6b1a31a74cf..360b565918c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,82 +1,226 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
-#include <asm/scatterlist.h>
-#include <linux/crypto.h>
+#include <linux/scatterlist.h>
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
-#include <linux/random.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/in6.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/udp.h>
+struct esp_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the IV is placed at the front, followed
+ * by the request and finally the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+{
+ unsigned int len;
+
+ len = seqhilen;
+
+ len += crypto_aead_ivsize(aead);
+
+ if (len) {
+ len += crypto_aead_alignmask(aead) &
+ ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ }
+
+ len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{
+ return crypto_aead_ivsize(aead) ?
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+}
+
+static inline struct aead_givcrypt_request *esp_tmp_givreq(
+ struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_givcrypt_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_givcrypt_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_request_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+ struct aead_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static inline struct scatterlist *esp_givreq_sg(
+ struct crypto_aead *aead, struct aead_givcrypt_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static void esp_output_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ kfree(ESP_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
struct ip_esp_hdr *esph;
- struct crypto_blkcipher *tfm;
- struct blkcipher_desc desc;
- struct esp_data *esp;
+ struct crypto_aead *aead;
+ struct aead_givcrypt_request *req;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
struct sk_buff *trailer;
+ void *tmp;
+ u8 *iv;
u8 *tail;
int blksize;
int clen;
int alen;
+ int plen;
+ int tfclen;
int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
/* skb is pure payload to encrypt */
- err = -ENOMEM;
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
- /* Round to block size */
- clen = skb->len;
+ tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
- esp = x->data;
- alen = esp->auth.icv_trunc_len;
- tfm = esp->conf.tfm;
- desc.tfm = tfm;
- desc.flags = 0;
- blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4);
- clen = ALIGN(clen + 2, blksize);
- if (esp->conf.padlen)
- clen = ALIGN(clen, esp->conf.padlen);
+ padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ tfclen = padto - skb->len;
+ }
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ clen = ALIGN(skb->len + 2 + tfclen, blksize);
+ plen = clen - skb->len - tfclen;
- if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
+ err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+ if (err < 0)
goto error;
+ nfrags = err;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_givreq(aead, iv);
+ asg = esp_givreq_sg(aead, req);
+ sg = asg + sglists;
/* Fill padding... */
tail = skb_tail_pointer(trailer);
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
do {
int i;
- for (i=0; i<clen-skb->len - 2; i++)
+ for (i = 0; i < plen - 2; i++)
tail[i] = i + 1;
} while (0);
- tail[clen - skb->len - 2] = (clen - skb->len) - 2;
- pskb_put(skb, trailer, clen - skb->len);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = *skb_mac_header(skb);
+ pskb_put(skb, trailer, clen - skb->len + alen);
skb_push(skb, -skb_network_offset(skb));
esph = ip_esp_hdr(skb);
- *(skb_tail_pointer(trailer) - 1) = *skb_mac_header(skb);
*skb_mac_header(skb) = IPPROTO_ESP;
- spin_lock_bh(&x->lock);
-
/* this is non-NULL only with UDP Encapsulation */
if (x->encap) {
struct xfrm_encap_tmpl *encap = x->encap;
struct udphdr *uh;
__be32 *udpdata32;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
uh = (struct udphdr *)esph;
- uh->source = encap->encap_sport;
- uh->dest = encap->encap_dport;
- uh->len = htons(skb->len + alen - skb_transport_offset(skb));
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(skb->len - skb_transport_offset(skb));
uh->check = 0;
- switch (encap->encap_type) {
+ switch (encap_type) {
default:
case UDP_ENCAP_ESPINUDP:
esph = (struct ip_esp_hdr *)(uh + 1);
@@ -92,127 +236,65 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
}
esph->spi = x->id.spi;
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq);
-
- if (esp->conf.ivlen) {
- if (unlikely(!esp->conf.ivinitted)) {
- get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
- esp->conf.ivinitted = 1;
- }
- crypto_blkcipher_set_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
- }
-
- do {
- struct scatterlist *sg = &esp->sgbuf[0];
-
- if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
- sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
- if (!sg)
- goto unlock;
- }
- skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
- err = crypto_blkcipher_encrypt(&desc, sg, sg, clen);
- if (unlikely(sg != &esp->sgbuf[0]))
- kfree(sg);
- } while (0);
-
- if (unlikely(err))
- goto unlock;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
+ clen + alen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
+ aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
+ aead_givcrypt_set_assoc(req, asg, assoclen);
+ aead_givcrypt_set_giv(req, esph->enc_data,
+ XFRM_SKB_CB(skb)->seq.output.low);
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ err = crypto_aead_givencrypt(req);
+ if (err == -EINPROGRESS)
+ goto error;
- if (esp->conf.ivlen) {
- memcpy(esph->enc_data, esp->conf.ivec, esp->conf.ivlen);
- crypto_blkcipher_get_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
- }
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
- if (esp->auth.icv_full_len) {
- err = esp_mac_digest(esp, skb, (u8 *)esph - skb->data,
- sizeof(*esph) + esp->conf.ivlen + clen);
- memcpy(pskb_put(skb, trailer, alen), esp->auth.work_icv, alen);
- }
-
-unlock:
- spin_unlock_bh(&x->lock);
+ kfree(tmp);
error:
return err;
}
-/*
- * Note: detecting truncated vs. non-truncated authentication data is very
- * expensive, so we only support truncated data, which is the recommended
- * and common case.
- */
-static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
+static int esp_input_done2(struct sk_buff *skb, int err)
{
- struct iphdr *iph;
- struct ip_esp_hdr *esph;
- struct esp_data *esp = x->data;
- struct crypto_blkcipher *tfm = esp->conf.tfm;
- struct blkcipher_desc desc = { .tfm = tfm };
- struct sk_buff *trailer;
- int blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4);
- int alen = esp->auth.icv_trunc_len;
- int elen = skb->len - sizeof(*esph) - esp->conf.ivlen - alen;
- int nfrags;
+ const struct iphdr *iph;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct crypto_aead *aead = x->data;
+ int alen = crypto_aead_authsize(aead);
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ int elen = skb->len - hlen;
int ihl;
u8 nexthdr[2];
- struct scatterlist *sg;
int padlen;
- int err;
-
- if (!pskb_may_pull(skb, sizeof(*esph)))
- goto out;
-
- if (elen <= 0 || (elen & (blksize-1)))
- goto out;
-
- /* If integrity check is required, do this. */
- if (esp->auth.icv_full_len) {
- u8 sum[alen];
- err = esp_mac_digest(esp, skb, 0, skb->len - alen);
- if (err)
- goto out;
-
- if (skb_copy_bits(skb, skb->len - alen, sum, alen))
- BUG();
-
- if (unlikely(memcmp(esp->auth.work_icv, sum, alen))) {
- x->stats.integrity_failed++;
- goto out;
- }
- }
-
- if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
- goto out;
-
- skb->ip_summed = CHECKSUM_NONE;
-
- esph = (struct ip_esp_hdr *)skb->data;
-
- /* Get ivec. This can be wrong, check against another impls. */
- if (esp->conf.ivlen)
- crypto_blkcipher_set_iv(tfm, esph->enc_data, esp->conf.ivlen);
-
- sg = &esp->sgbuf[0];
+ kfree(ESP_SKB_CB(skb)->tmp);
- if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
- sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
- if (!sg)
- goto out;
- }
- skb_to_sgvec(skb, sg, sizeof(*esph) + esp->conf.ivlen, elen);
- err = crypto_blkcipher_decrypt(&desc, sg, sg, elen);
- if (unlikely(sg != &esp->sgbuf[0]))
- kfree(sg);
if (unlikely(err))
- return err;
+ goto out;
if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
BUG();
+ err = -EINVAL;
padlen = nexthdr[0];
- if (padlen+2 >= elen)
+ if (padlen + 2 + alen >= elen)
goto out;
/* ... check padding bits here. Silly. :-) */
@@ -258,146 +340,317 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
}
pskb_trim(skb, skb->len - alen - padlen - 2);
- __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen);
- skb_set_transport_header(skb, -ihl);
+ __skb_pull(skb, hlen);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+
+ err = nexthdr[1];
- return nexthdr[1];
+ /* RFC4303: Drop dummy packets without any error */
+ if (err == IPPROTO_NONE)
+ err = -EINVAL;
out:
- return -EINVAL;
+ return err;
}
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
+static void esp_input_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ xfrm_input_resume(skb, esp_input_done2(skb, err));
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct esp_data *esp = x->data;
- u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
- u32 align = max_t(u32, blksize, esp->conf.padlen);
- u32 rem;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead = x->data;
+ struct aead_request *req;
+ struct sk_buff *trailer;
+ int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
+ void *tmp;
+ u8 *iv;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+ int err = -EINVAL;
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ goto out;
+
+ if (elen <= 0)
+ goto out;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ err = -ENOMEM;
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp)
+ goto out;
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ asg = esp_req_sg(aead, req);
+ sg = asg + sglists;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ esph = (struct ip_esp_hdr *)skb->data;
+
+ /* Get ivec. This can be wrong, check against another impls. */
+ iv = esph->enc_data;
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ aead_request_set_crypt(req, sg, sg, elen, iv);
+ aead_request_set_assoc(req, asg, assoclen);
+
+ err = crypto_aead_decrypt(req);
+ if (err == -EINPROGRESS)
+ goto out;
+
+ err = esp_input_done2(skb, err);
+
+out:
+ return err;
+}
- mtu -= x->props.header_len + esp->auth.icv_trunc_len;
- rem = mtu & (align - 1);
- mtu &= ~(align - 1);
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
+{
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ unsigned int net_adj;
switch (x->props.mode) {
- case XFRM_MODE_TUNNEL:
- break;
- default:
case XFRM_MODE_TRANSPORT:
- /* The worst case */
- mtu -= blksize - 4;
- mtu += min_t(u32, blksize - 4, rem);
- break;
case XFRM_MODE_BEET:
- /* The worst case. */
- mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem);
+ net_adj = sizeof(struct iphdr);
+ break;
+ case XFRM_MODE_TUNNEL:
+ net_adj = 0;
break;
+ default:
+ BUG();
}
- return mtu - 2;
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
}
-static void esp4_err(struct sk_buff *skb, u32 info)
+static int esp4_err(struct sk_buff *skb, u32 info)
{
- struct iphdr *iph = (struct iphdr*)skb->data;
- struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
struct xfrm_state *x;
- if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
- icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
- return;
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
- x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
- return;
- NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
- ntohl(esph->spi), ntohl(iph->daddr));
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
xfrm_state_put(x);
+
+ return 0;
}
static void esp_destroy(struct xfrm_state *x)
{
- struct esp_data *esp = x->data;
+ struct crypto_aead *aead = x->data;
- if (!esp)
+ if (!aead)
return;
- crypto_free_blkcipher(esp->conf.tfm);
- esp->conf.tfm = NULL;
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- crypto_free_hash(esp->auth.tfm);
- esp->auth.tfm = NULL;
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- kfree(esp);
+ crypto_free_aead(aead);
}
-static int esp_init_state(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x)
{
- struct esp_data *esp = NULL;
- struct crypto_blkcipher *tfm;
- u32 align;
+ struct crypto_aead *aead;
+ int err;
+
+ aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ err = crypto_aead_setkey(aead, x->aead->alg_key,
+ (x->aead->alg_key_len + 7) / 8);
+ if (err)
+ goto error;
+
+ err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+ if (err)
+ goto error;
+
+error:
+ return err;
+}
+static int esp_init_authenc(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+ char *key;
+ char *p;
+ char authenc_name[CRYPTO_MAX_ALG_NAME];
+ unsigned int keylen;
+ int err;
+
+ err = -EINVAL;
if (x->ealg == NULL)
goto error;
- esp = kzalloc(sizeof(*esp), GFP_KERNEL);
- if (esp == NULL)
- return -ENOMEM;
+ err = -ENAMETOOLONG;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
+
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+ (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+ err = -ENOMEM;
+ key = kmalloc(keylen, GFP_KERNEL);
+ if (!key)
+ goto error;
+
+ p = key;
+ rta = (void *)p;
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ param = RTA_DATA(rta);
+ p += RTA_SPACE(sizeof(*param));
if (x->aalg) {
struct xfrm_algo_desc *aalg_desc;
- struct crypto_hash *hash;
- hash = crypto_alloc_hash(x->aalg->alg_name, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(hash))
- goto error;
-
- esp->auth.tfm = hash;
- if (crypto_hash_setkey(hash, x->aalg->alg_key,
- (x->aalg->alg_key_len + 7) / 8))
- goto error;
+ memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+ p += (x->aalg->alg_key_len + 7) / 8;
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
BUG_ON(!aalg_desc);
+ err = -EINVAL;
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
- crypto_hash_digestsize(hash)) {
+ crypto_aead_authsize(aead)) {
NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
x->aalg->alg_name,
- crypto_hash_digestsize(hash),
+ crypto_aead_authsize(aead),
aalg_desc->uinfo.auth.icv_fullbits/8);
- goto error;
+ goto free_key;
}
- esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
-
- esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
- if (!esp->auth.work_icv)
- goto error;
+ err = crypto_aead_setauthsize(
+ aead, x->aalg->alg_trunc_len / 8);
+ if (err)
+ goto free_key;
}
- tfm = crypto_alloc_blkcipher(x->ealg->alg_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- goto error;
- esp->conf.tfm = tfm;
- esp->conf.ivlen = crypto_blkcipher_ivsize(tfm);
- esp->conf.padlen = 0;
- if (esp->conf.ivlen) {
- esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
- if (unlikely(esp->conf.ivec == NULL))
- goto error;
- esp->conf.ivinitted = 0;
- }
- if (crypto_blkcipher_setkey(tfm, x->ealg->alg_key,
- (x->ealg->alg_key_len + 7) / 8))
+ param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+ memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+ err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+ kfree(key);
+
+error:
+ return err;
+}
+
+static int esp_init_state(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ u32 align;
+ int err;
+
+ x->data = NULL;
+
+ if (x->aead)
+ err = esp_init_aead(x);
+ else
+ err = esp_init_authenc(x);
+
+ if (err)
goto error;
- x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+
+ aead = x->data;
+
+ x->props.header_len = sizeof(struct ip_esp_hdr) +
+ crypto_aead_ivsize(aead);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
- else if (x->props.mode == XFRM_MODE_BEET)
+ else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
x->props.header_len += IPV4_BEET_PHMAXLEN;
if (x->encap) {
struct xfrm_encap_tmpl *encap = x->encap;
@@ -413,21 +666,20 @@ static int esp_init_state(struct xfrm_state *x)
break;
}
}
- x->data = esp;
- align = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
- if (esp->conf.padlen)
- align = max_t(u32, align, esp->conf.padlen);
- x->props.trailer_len = align + 1 + esp->auth.icv_trunc_len;
- return 0;
+
+ align = ALIGN(crypto_aead_blocksize(aead), 4);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
error:
- x->data = esp;
- esp_destroy(x);
- x->data = NULL;
- return -EINVAL;
+ return err;
+}
+
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
}
-static struct xfrm_type esp_type =
+static const struct xfrm_type esp_type =
{
.description = "ESP4",
.owner = THIS_MODULE,
@@ -440,20 +692,22 @@ static struct xfrm_type esp_type =
.output = esp_output
};
-static struct net_protocol esp4_protocol = {
+static struct xfrm4_protocol esp4_protocol = {
.handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = esp4_rcv_cb,
.err_handler = esp4_err,
- .no_policy = 1,
+ .priority = 0,
};
static int __init esp4_init(void)
{
if (xfrm_register_type(&esp_type, AF_INET) < 0) {
- printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+ pr_info("%s: can't add xfrm type\n", __func__);
return -EAGAIN;
}
- if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
- printk(KERN_INFO "ip esp init: can't add protocol\n");
+ if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
xfrm_unregister_type(&esp_type, AF_INET);
return -EAGAIN;
}
@@ -462,10 +716,10 @@ static int __init esp4_init(void)
static void __exit esp4_fini(void)
{
- if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
- printk(KERN_INFO "ip esp close: can't remove protocol\n");
+ if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
- printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+ pr_info("%s: can't remove xfrm type\n", __func__);
}
module_init(esp4_init);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 78b514ba141..255aa9946fe 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -5,8 +5,6 @@
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
@@ -17,7 +15,6 @@
#include <linux/module.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
@@ -34,65 +31,97 @@
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
+#include <linux/cache.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
-#include <net/icmp.h>
#include <net/arp.h>
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
+#include <net/xfrm.h>
-#define FFprint(a...) printk(KERN_DEBUG a)
+#ifndef CONFIG_IP_MULTIPLE_TABLES
-static struct sock *fibnl;
+static int __net_init fib4_rules_init(struct net *net)
+{
+ struct fib_table *local_table, *main_table;
-#ifndef CONFIG_IP_MULTIPLE_TABLES
+ local_table = fib_trie_table(RT_TABLE_LOCAL);
+ if (local_table == NULL)
+ return -ENOMEM;
-struct fib_table *ip_fib_local_table;
-struct fib_table *ip_fib_main_table;
+ main_table = fib_trie_table(RT_TABLE_MAIN);
+ if (main_table == NULL)
+ goto fail;
-#define FIB_TABLE_HASHSZ 1
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+ hlist_add_head_rcu(&local_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
+ hlist_add_head_rcu(&main_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
+ return 0;
+fail:
+ kfree(local_table);
+ return -ENOMEM;
+}
#else
-#define FIB_TABLE_HASHSZ 256
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
-
-struct fib_table *fib_new_table(u32 id)
+struct fib_table *fib_new_table(struct net *net, u32 id)
{
struct fib_table *tb;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
- tb = fib_get_table(id);
+ tb = fib_get_table(net, id);
if (tb)
return tb;
- tb = fib_hash_init(id);
+
+ tb = fib_trie_table(id);
if (!tb)
return NULL;
+
+ switch (id) {
+ case RT_TABLE_LOCAL:
+ net->ipv4.fib_local = tb;
+ break;
+
+ case RT_TABLE_MAIN:
+ net->ipv4.fib_main = tb;
+ break;
+
+ case RT_TABLE_DEFAULT:
+ net->ipv4.fib_default = tb;
+ break;
+
+ default:
+ break;
+ }
+
h = id & (FIB_TABLE_HASHSZ - 1);
- hlist_add_head_rcu(&tb->tb_hlist, &fib_table_hash[h]);
+ hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
}
-struct fib_table *fib_get_table(u32 id)
+struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
- struct hlist_node *node;
+ struct hlist_head *head;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
h = id & (FIB_TABLE_HASHSZ - 1);
+
rcu_read_lock();
- hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb_hlist) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (tb->tb_id == id) {
rcu_read_unlock();
return tb;
@@ -103,153 +132,200 @@ struct fib_table *fib_get_table(u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
-static void fib_flush(void)
+static void fib_flush(struct net *net)
{
int flushed = 0;
struct fib_table *tb;
- struct hlist_node *node;
+ struct hlist_head *head;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist)
- flushed += tb->tb_flush(tb);
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, head, tb_hlist)
+ flushed += fib_table_flush(tb);
}
if (flushed)
- rt_cache_flush(-1);
+ rt_cache_flush(net);
}
/*
- * Find the first device with a given source address.
+ * Find address type as if only "dev" was present in the system. If
+ * on_dev is NULL then all interfaces are taken into consideration.
*/
-
-struct net_device * ip_dev_find(__be32 addr)
-{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
- struct fib_result res;
- struct net_device *dev = NULL;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- if (!ip_fib_local_table ||
- ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res))
- return NULL;
- if (res.type != RTN_LOCAL)
- goto out;
- dev = FIB_RES_DEV(res);
-
- if (dev)
- dev_hold(dev);
-out:
- fib_res_put(&res);
- return dev;
-}
-
-unsigned inet_addr_type(__be32 addr)
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct flowi4 fl4 = { .daddr = addr };
struct fib_result res;
- unsigned ret = RTN_BROADCAST;
+ unsigned int ret = RTN_BROADCAST;
+ struct fib_table *local_table;
- if (ZERONET(addr) || BADCLASS(addr))
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;
- if (MULTICAST(addr))
+ if (ipv4_is_multicast(addr))
return RTN_MULTICAST;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- if (ip_fib_local_table) {
+ local_table = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local_table) {
ret = RTN_UNICAST;
- if (!ip_fib_local_table->tb_lookup(ip_fib_local_table,
- &fl, &res)) {
- ret = res.type;
- fib_res_put(&res);
+ rcu_read_lock();
+ if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+ if (!dev || dev == res.fi->fib_dev)
+ ret = res.type;
}
+ rcu_read_unlock();
}
return ret;
}
-/* Given (packet source, input interface) and optional (dst, oif, tos):
- - (main) check, that source is valid i.e. not broadcast or our local
- address.
- - figure out what "logical" interface this packet arrived
- and calculate "specific destination" address.
- - check, that packet arrived from expected physical interface.
- */
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+ return __inet_dev_addr_type(net, NULL, addr);
+}
+EXPORT_SYMBOL(inet_addr_type);
-int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
- struct net_device *dev, __be32 *spec_dst, u32 *itag)
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+ __be32 addr)
{
+ return __inet_dev_addr_type(net, dev, addr);
+}
+EXPORT_SYMBOL(inet_dev_addr_type);
+
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
struct in_device *in_dev;
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = src,
- .saddr = dst,
- .tos = tos } },
- .iif = oif };
struct fib_result res;
- int no_addr, rpf;
- int ret;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ struct net *net;
+ int scope;
+
+ rt = skb_rtable(skb);
+ if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+ RTCF_LOCAL)
+ return ip_hdr(skb)->daddr;
- no_addr = rpf = 0;
- rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
- if (in_dev) {
- no_addr = in_dev->ifa_list == NULL;
- rpf = IN_DEV_RPFILTER(in_dev);
+ BUG_ON(!in_dev);
+
+ net = dev_net(dev);
+
+ scope = RT_SCOPE_UNIVERSE;
+ if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ fl4.daddr = ip_hdr(skb)->saddr;
+ fl4.saddr = 0;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_scope = scope;
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ if (!fib_lookup(net, &fl4, &res))
+ return FIB_RES_PREFSRC(net, res);
+ } else {
+ scope = RT_SCOPE_LINK;
}
- rcu_read_unlock();
- if (in_dev == NULL)
- goto e_inval;
+ return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
+ */
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ int rpf, struct in_device *idev, u32 *itag)
+{
+ int ret, no_addr, accept_local;
+ struct fib_result res;
+ struct flowi4 fl4;
+ struct net *net;
+ bool dev_match;
+
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.daddr = src;
+ fl4.saddr = dst;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+ no_addr = idev->ifa_list == NULL;
- if (fib_lookup(&fl, &res))
+ accept_local = IN_DEV_ACCEPT_LOCAL(idev);
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+
+ net = dev_net(dev);
+ if (fib_lookup(net, &fl4, &res))
goto last_resort;
- if (res.type != RTN_UNICAST)
- goto e_inval_res;
- *spec_dst = FIB_RES_PREFSRC(res);
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !accept_local)
+ goto e_inval;
+ }
fib_combine_itag(itag, &res);
+ dev_match = false;
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
#else
if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
#endif
- {
+ if (dev_match) {
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- fib_res_put(&res);
return ret;
}
- fib_res_put(&res);
if (no_addr)
goto last_resort;
- if (rpf)
- goto e_inval;
- fl.oif = dev->ifindex;
+ if (rpf == 1)
+ goto e_rpf;
+ fl4.flowi4_oif = dev->ifindex;
ret = 0;
- if (fib_lookup(&fl, &res) == 0) {
- if (res.type == RTN_UNICAST) {
- *spec_dst = FIB_RES_PREFSRC(res);
+ if (fib_lookup(net, &fl4, &res) == 0) {
+ if (res.type == RTN_UNICAST)
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- }
- fib_res_put(&res);
}
return ret;
last_resort:
if (rpf)
- goto e_inval;
- *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ goto e_rpf;
*itag = 0;
return 0;
-e_inval_res:
- fib_res_put(&res);
e_inval:
return -EINVAL;
+e_rpf:
+ return -EXDEV;
+}
+
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ struct in_device *idev, u32 *itag)
+{
+ int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+ if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
+ *itag = 0;
+ return 0;
+ }
+ return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
}
static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -269,13 +345,14 @@ static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
return len + nla_total_size(4);
}
-static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
struct fib_config *cfg)
{
__be32 addr;
int plen;
memset(cfg, 0, sizeof(*cfg));
+ cfg->fc_nlinfo.nl_net = net;
if (rt->rt_dst.sa_family != AF_INET)
return -EAFNOSUPPORT;
@@ -336,7 +413,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
colon = strchr(devname, ':');
if (colon)
*colon = 0;
- dev = __dev_get_by_name(&init_net, devname);
+ dev = __dev_get_by_name(net, devname);
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
@@ -359,7 +436,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
if (rt->rt_gateway.sa_family == AF_INET && addr) {
cfg->fc_gw = addr;
if (rt->rt_flags & RTF_GATEWAY &&
- inet_addr_type(addr) == RTN_UNICAST)
+ inet_addr_type(net, addr) == RTN_UNICAST)
cfg->fc_scope = RT_SCOPE_UNIVERSE;
}
@@ -397,10 +474,10 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
}
/*
- * Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
*/
-
-int ip_rt_ioctl(unsigned int cmd, void __user *arg)
+int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct fib_config cfg;
struct rtentry rt;
@@ -409,27 +486,27 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(&rt, arg, sizeof(rt)))
return -EFAULT;
rtnl_lock();
- err = rtentry_to_fib_config(cmd, &rt, &cfg);
+ err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
if (err == 0) {
struct fib_table *tb;
if (cmd == SIOCDELRT) {
- tb = fib_get_table(cfg.fc_table);
+ tb = fib_get_table(net, cfg.fc_table);
if (tb)
- err = tb->tb_delete(tb, &cfg);
+ err = fib_table_delete(tb, &cfg);
else
err = -ESRCH;
} else {
- tb = fib_new_table(cfg.fc_table);
+ tb = fib_new_table(net, cfg.fc_table);
if (tb)
- err = tb->tb_insert(tb, &cfg);
+ err = fib_table_insert(tb, &cfg);
else
err = -ENOBUFS;
}
@@ -443,7 +520,7 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg)
return -EINVAL;
}
-const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -453,12 +530,11 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
[RTA_PREFSRC] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
- [RTA_PROTOINFO] = { .type = NLA_U32 },
[RTA_FLOW] = { .type = NLA_U32 },
};
-static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct fib_config *cfg)
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct fib_config *cfg)
{
struct nlattr *attr;
int err, remaining;
@@ -480,8 +556,9 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags = rtm->rtm_flags;
cfg->fc_nlflags = nlh->nlmsg_flags;
- cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+ cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->fc_nlinfo.nlh = nlh;
+ cfg->fc_nlinfo.nl_net = net;
if (cfg->fc_type > RTN_MAX) {
err = -EINVAL;
@@ -527,72 +604,76 @@ errout:
return err;
}
-static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
+ struct net *net = sock_net(skb->sk);
struct fib_config cfg;
struct fib_table *tb;
int err;
- err = rtm_to_fib_config(skb, nlh, &cfg);
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
if (err < 0)
goto errout;
- tb = fib_get_table(cfg.fc_table);
+ tb = fib_get_table(net, cfg.fc_table);
if (tb == NULL) {
err = -ESRCH;
goto errout;
}
- err = tb->tb_delete(tb, &cfg);
+ err = fib_table_delete(tb, &cfg);
errout:
return err;
}
-static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
+ struct net *net = sock_net(skb->sk);
struct fib_config cfg;
struct fib_table *tb;
int err;
- err = rtm_to_fib_config(skb, nlh, &cfg);
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
if (err < 0)
goto errout;
- tb = fib_new_table(cfg.fc_table);
+ tb = fib_new_table(net, cfg.fc_table);
if (tb == NULL) {
err = -ENOBUFS;
goto errout;
}
- err = tb->tb_insert(tb, &cfg);
+ err = fib_table_insert(tb, &cfg);
errout:
return err;
}
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
- struct hlist_node *node;
+ struct hlist_head *head;
int dumped = 0;
if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
- return ip_rt_dump(skb, cb);
+ return skb->len;
s_h = cb->args[0];
s_e = cb->args[1];
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
- hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
2 * sizeof(cb->args[0]));
- if (tb->tb_dump(tb, skb, cb) < 0)
+ if (fib_table_dump(tb, skb, cb) < 0)
goto out;
dumped = 1;
next:
@@ -607,14 +688,14 @@ out:
}
/* Prepare and feed intra-kernel routing request.
- Really, it should be netlink message, but :-( netlink
- can be not configured, so that we feed it directly
- to fib engine. It is legal, because all events occur
- only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
*/
-
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
+ struct net *net = dev_net(ifa->ifa_dev->dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -624,12 +705,15 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
.fc_prefsrc = ifa->ifa_local,
.fc_oif = ifa->ifa_dev->dev->ifindex,
.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+ .fc_nlinfo = {
+ .nl_net = net,
+ },
};
if (type == RTN_UNICAST)
- tb = fib_new_table(RT_TABLE_MAIN);
+ tb = fib_new_table(net, RT_TABLE_MAIN);
else
- tb = fib_new_table(RT_TABLE_LOCAL);
+ tb = fib_new_table(net, RT_TABLE_LOCAL);
if (tb == NULL)
return;
@@ -642,9 +726,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
cfg.fc_scope = RT_SCOPE_HOST;
if (cmd == RTM_NEWROUTE)
- tb->tb_insert(tb, &cfg);
+ fib_table_insert(tb, &cfg);
else
- tb->tb_delete(tb, &cfg);
+ fib_table_delete(tb, &cfg);
}
void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -654,70 +738,130 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
struct in_ifaddr *prim = ifa;
__be32 mask = ifa->ifa_mask;
__be32 addr = ifa->ifa_local;
- __be32 prefix = ifa->ifa_address&mask;
+ __be32 prefix = ifa->ifa_address & mask;
- if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
if (prim == NULL) {
- printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
}
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+ 32, prim);
}
}
}
-static void fib_del_ifaddr(struct in_ifaddr *ifa)
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
{
struct in_device *in_dev = ifa->ifa_dev;
struct net_device *dev = in_dev->dev;
struct in_ifaddr *ifa1;
- struct in_ifaddr *prim = ifa;
- __be32 brd = ifa->ifa_address|~ifa->ifa_mask;
- __be32 any = ifa->ifa_address&ifa->ifa_mask;
+ struct in_ifaddr *prim = ifa, *prim1 = NULL;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
#define LOCAL_OK 1
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
- unsigned ok = 0;
+ unsigned int ok = 0;
+ int subnet = 0; /* Primary network */
+ int gone = 1; /* Address is missing */
+ int same_prefsrc = 0; /* Another primary with same IP */
- if (!(ifa->ifa_flags&IFA_F_SECONDARY))
- fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
- else {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (prim == NULL) {
- printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
+ if (iprim && iprim != prim) {
+ pr_warn("%s: bug: iprim != prim\n", __func__);
+ return;
+ }
+ } else if (!ipv4_is_zeronet(any) &&
+ (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
+ subnet = 1;
}
/* Deletion is more complicated than add.
- We should take care of not to delete too much :-)
-
- Scan address list to be sure that addresses are really gone.
+ * We should take care of not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
*/
for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa1 == ifa) {
+ /* promotion, keep the IP */
+ gone = 0;
+ continue;
+ }
+ /* Ignore IFAs from our subnet */
+ if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, iprim))
+ continue;
+
+ /* Ignore ifa1 if it uses different primary IP (prefsrc) */
+ if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+ /* Another address from our subnet? */
+ if (ifa1->ifa_mask == prim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, prim))
+ prim1 = prim;
+ else {
+ /* We reached the secondaries, so
+ * same_prefsrc should be determined.
+ */
+ if (!same_prefsrc)
+ continue;
+ /* Search new prim1 if ifa1 is not
+ * using the current prim1
+ */
+ if (!prim1 ||
+ ifa1->ifa_mask != prim1->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, prim1))
+ prim1 = inet_ifa_byprefix(in_dev,
+ ifa1->ifa_address,
+ ifa1->ifa_mask);
+ if (!prim1)
+ continue;
+ if (prim1->ifa_local != prim->ifa_local)
+ continue;
+ }
+ } else {
+ if (prim->ifa_local != ifa1->ifa_local)
+ continue;
+ prim1 = ifa1;
+ if (prim != prim1)
+ same_prefsrc = 1;
+ }
if (ifa->ifa_local == ifa1->ifa_local)
ok |= LOCAL_OK;
if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
@@ -726,27 +870,45 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
ok |= BRD1_OK;
if (any == ifa1->ifa_broadcast)
ok |= BRD0_OK;
+ /* primary has network specific broadcasts */
+ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+ __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+ __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+ if (!ipv4_is_zeronet(any1)) {
+ if (ifa->ifa_broadcast == brd1 ||
+ ifa->ifa_broadcast == any1)
+ ok |= BRD_OK;
+ if (brd == brd1 || brd == any1)
+ ok |= BRD1_OK;
+ if (any == brd1 || any == any1)
+ ok |= BRD0_OK;
+ }
+ }
}
- if (!(ok&BRD_OK))
+ if (!(ok & BRD_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!(ok&BRD1_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
- if (!(ok&BRD0_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
- if (!(ok&LOCAL_OK)) {
+ if (subnet && ifa->ifa_prefixlen < 31) {
+ if (!(ok & BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok & BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ }
+ if (!(ok & LOCAL_OK)) {
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
- if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+ if (gone &&
+ inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
/* And the last, but not the least thing.
- We must flush stray FIB entries.
-
- First of all, we scan fib_info list searching
- for stray nexthop entries, then ignite fib_flush.
- */
- if (fib_sync_down(ifa->ifa_local, NULL, 0))
- fib_flush();
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
+ if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ fib_flush(dev_net(dev));
}
}
#undef LOCAL_OK
@@ -755,32 +917,29 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
{
struct fib_result res;
- struct flowi fl = { .mark = frn->fl_mark,
- .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
- .tos = frn->fl_tos,
- .scope = frn->fl_scope } } };
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
+ struct flowi4 fl4 = {
+ .flowi4_mark = frn->fl_mark,
+ .daddr = frn->fl_addr,
+ .flowi4_tos = frn->fl_tos,
+ .flowi4_scope = frn->fl_scope,
+ };
frn->err = -ENOENT;
if (tb) {
local_bh_disable();
frn->tb_id = tb->tb_id;
- frn->err = tb->tb_lookup(tb, &fl, &res);
+ frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
frn->nh_sel = res.nh_sel;
frn->type = res.type;
frn->scope = res.scope;
- fib_res_put(&res);
}
local_bh_enable();
}
@@ -788,64 +947,87 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
static void nl_fib_input(struct sk_buff *skb)
{
+ struct net *net;
struct fib_result_nl *frn;
struct nlmsghdr *nlh;
struct fib_table *tb;
- u32 pid;
+ u32 portid;
+ net = sock_net(skb->sk);
nlh = nlmsg_hdr(skb);
- if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
- nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) {
- kfree_skb(skb);
+ if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(*frn))
return;
- }
- frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
- tb = fib_get_table(frn->tb_id_in);
+ skb = netlink_skb_clone(skb, GFP_KERNEL);
+ if (skb == NULL)
+ return;
+ nlh = nlmsg_hdr(skb);
+
+ frn = (struct fib_result_nl *) nlmsg_data(nlh);
+ tb = fib_get_table(net, frn->tb_id_in);
nl_fib_lookup(frn, tb);
- pid = NETLINK_CB(skb).pid; /* pid of sending process */
- NETLINK_CB(skb).pid = 0; /* from kernel */
+ portid = NETLINK_CB(skb).portid; /* netlink portid */
+ NETLINK_CB(skb).portid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
- netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT);
+ netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
}
-static void nl_fib_lookup_init(void)
+static int __net_init nl_fib_lookup_init(struct net *net)
{
- fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0,
- nl_fib_input, NULL, THIS_MODULE);
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .input = nl_fib_input,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
+ if (sk == NULL)
+ return -EAFNOSUPPORT;
+ net->ipv4.fibnl = sk;
+ return 0;
+}
+
+static void nl_fib_lookup_exit(struct net *net)
+{
+ netlink_kernel_release(net->ipv4.fibnl);
+ net->ipv4.fibnl = NULL;
}
static void fib_disable_ip(struct net_device *dev, int force)
{
- if (fib_sync_down(0, dev, force))
- fib_flush();
- rt_cache_flush(0);
+ if (fib_sync_down_dev(dev, force))
+ fib_flush(dev_net(dev));
+ rt_cache_flush(dev_net(dev));
arp_ifdown(dev);
}
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
switch (event) {
case NETDEV_UP:
fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- fib_sync_up(ifa->ifa_dev->dev);
+ fib_sync_up(dev);
#endif
- rt_cache_flush(-1);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(dev_net(dev));
break;
case NETDEV_DOWN:
- fib_del_ifaddr(ifa);
+ fib_del_ifaddr(ifa, NULL);
+ atomic_inc(&net->ipv4.dev_addr_genid);
if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
- Disable IP.
+ * Disable IP.
*/
- fib_disable_ip(ifa->ifa_dev->dev, 1);
+ fib_disable_ip(dev, 1);
} else {
- rt_cache_flush(-1);
+ rt_cache_flush(dev_net(dev));
}
break;
}
@@ -854,17 +1036,17 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
-
- if (dev->nd_net != &init_net)
- return NOTIFY_DONE;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
+ rt_flush_dev(dev);
return NOTIFY_DONE;
}
+ in_dev = __in_dev_get_rtnl(dev);
if (!in_dev)
return NOTIFY_DONE;
@@ -876,50 +1058,122 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev);
#endif
- rt_cache_flush(-1);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(net);
break;
case NETDEV_DOWN:
fib_disable_ip(dev, 0);
break;
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
- rt_cache_flush(0);
+ rt_cache_flush(net);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block fib_inetaddr_notifier = {
- .notifier_call =fib_inetaddr_event,
+ .notifier_call = fib_inetaddr_event,
};
static struct notifier_block fib_netdev_notifier = {
- .notifier_call =fib_netdev_event,
+ .notifier_call = fib_netdev_event,
};
-void __init ip_fib_init(void)
+static int __net_init ip_fib_net_init(struct net *net)
+{
+ int err;
+ size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
+ if (net->ipv4.fib_table_hash == NULL)
+ return -ENOMEM;
+
+ err = fib4_rules_init(net);
+ if (err < 0)
+ goto fail;
+ return 0;
+
+fail:
+ kfree(net->ipv4.fib_table_hash);
+ return err;
+}
+
+static void ip_fib_net_exit(struct net *net)
{
unsigned int i;
- for (i = 0; i < FIB_TABLE_HASHSZ; i++)
- INIT_HLIST_HEAD(&fib_table_hash[i]);
-#ifndef CONFIG_IP_MULTIPLE_TABLES
- ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
- hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]);
- ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
- hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]);
-#else
- fib4_rules_init();
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ fib4_rules_exit(net);
#endif
+ rtnl_lock();
+ for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+ struct fib_table *tb;
+ struct hlist_head *head;
+ struct hlist_node *tmp;
+
+ head = &net->ipv4.fib_table_hash[i];
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+ hlist_del(&tb->tb_hlist);
+ fib_table_flush(tb);
+ fib_free_table(tb);
+ }
+ }
+ rtnl_unlock();
+ kfree(net->ipv4.fib_table_hash);
+}
+
+static int __net_init fib_net_init(struct net *net)
+{
+ int error;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ net->ipv4.fib_num_tclassid_users = 0;
+#endif
+ error = ip_fib_net_init(net);
+ if (error < 0)
+ goto out;
+ error = nl_fib_lookup_init(net);
+ if (error < 0)
+ goto out_nlfl;
+ error = fib_proc_init(net);
+ if (error < 0)
+ goto out_proc;
+out:
+ return error;
+
+out_proc:
+ nl_fib_lookup_exit(net);
+out_nlfl:
+ ip_fib_net_exit(net);
+ goto out;
+}
+
+static void __net_exit fib_net_exit(struct net *net)
+{
+ fib_proc_exit(net);
+ nl_fib_lookup_exit(net);
+ ip_fib_net_exit(net);
+}
+
+static struct pernet_operations fib_net_ops = {
+ .init = fib_net_init,
+ .exit = fib_net_exit,
+};
+
+void __init ip_fib_init(void)
+{
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+
+ register_pernet_subsys(&fib_net_ops);
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- nl_fib_lookup_init();
- rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
- rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
- rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
+ fib_trie_init();
}
-
-EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index 527a6e0af5b..00000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1065 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IPv4 FIB: lookup engine and maintenance routines.
- *
- * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static struct kmem_cache *fn_hash_kmem __read_mostly;
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-
-struct fib_node {
- struct hlist_node fn_hash;
- struct list_head fn_alias;
- __be32 fn_key;
-};
-
-struct fn_zone {
- struct fn_zone *fz_next; /* Next not empty zone */
- struct hlist_head *fz_hash; /* Hash table pointer */
- int fz_nent; /* Number of entries */
-
- int fz_divisor; /* Hash divisor */
- u32 fz_hashmask; /* (fz_divisor - 1) */
-#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
-
- int fz_order; /* Zone order */
- __be32 fz_mask;
-#define FZ_MASK(fz) ((fz)->fz_mask)
-};
-
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
-
-struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone *fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
- u32 h = ntohl(key)>>(32 - fz->fz_order);
- h ^= (h>>20);
- h ^= (h>>10);
- h ^= (h>>5);
- h &= FZ_HASHMASK(fz);
- return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
- return dst & FZ_MASK(fz);
-}
-
-static DEFINE_RWLOCK(fib_hash_lock);
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE) {
- return kmalloc(size, GFP_KERNEL);
- } else {
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL, get_order(size));
- }
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
- struct hlist_head *old_ht,
- int old_divisor)
-{
- int i;
-
- for (i = 0; i < old_divisor; i++) {
- struct hlist_node *node, *n;
- struct fib_node *f;
-
- hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
-
- hlist_del(&f->fn_hash);
-
- new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
- hlist_add_head(&f->fn_hash, new_head);
- }
- }
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
- struct hlist_head *ht, *old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- old_divisor = fz->fz_divisor;
-
- switch (old_divisor) {
- case 16:
- new_divisor = 256;
- break;
- case 256:
- new_divisor = 1024;
- break;
- default:
- if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
- printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
- return;
- }
- new_divisor = (old_divisor << 1);
- break;
- }
-
- new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
- printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
-#endif
-
- ht = fz_hash_alloc(new_divisor);
-
- if (ht) {
- memset(ht, 0, new_divisor * sizeof(struct hlist_head));
-
- write_lock_bh(&fib_hash_lock);
- old_ht = fz->fz_hash;
- fz->fz_hash = ht;
- fz->fz_hashmask = new_hashmask;
- fz->fz_divisor = new_divisor;
- fn_rebuild_zone(fz, old_ht, old_divisor);
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- fz_hash_free(old_ht, old_divisor);
- }
-}
-
-static inline void fn_free_node(struct fib_node * f)
-{
- kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa)
-{
- fib_release_info(fa->fa_info);
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
- int i;
- struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
- if (!fz)
- return NULL;
-
- if (z) {
- fz->fz_divisor = 16;
- } else {
- fz->fz_divisor = 1;
- }
- fz->fz_hashmask = (fz->fz_divisor - 1);
- fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
- if (!fz->fz_hash) {
- kfree(fz);
- return NULL;
- }
- memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
- fz->fz_order = z;
- fz->fz_mask = inet_make_mask(z);
-
- /* Find the first not empty zone with more specific mask */
- for (i=z+1; i<=32; i++)
- if (table->fn_zones[i])
- break;
- write_lock_bh(&fib_hash_lock);
- if (i>32) {
- /* No more specific masks, we are the first. */
- fz->fz_next = table->fn_zone_list;
- table->fn_zone_list = fz;
- } else {
- fz->fz_next = table->fn_zones[i]->fz_next;
- table->fn_zones[i]->fz_next = fz;
- }
- table->fn_zones[z] = fz;
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
- return fz;
-}
-
-static int
-fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
-{
- int err;
- struct fn_zone *fz;
- struct fn_hash *t = (struct fn_hash*)tb->tb_data;
-
- read_lock(&fib_hash_lock);
- for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
- struct hlist_head *head;
- struct hlist_node *node;
- struct fib_node *f;
- __be32 k = fz_key(flp->fl4_dst, fz);
-
- head = &fz->fz_hash[fn_hash(k, fz)];
- hlist_for_each_entry(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
-
- err = fib_semantic_match(&f->fn_alias,
- flp, res,
- f->fn_key, fz->fz_mask,
- fz->fz_order);
- if (err <= 0)
- goto out;
- }
- }
- err = 1;
-out:
- read_unlock(&fib_hash_lock);
- return err;
-}
-
-static int fn_hash_last_dflt=-1;
-
-static void
-fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
-{
- int order, last_idx;
- struct hlist_node *node;
- struct fib_node *f;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fn_hash *t = (struct fn_hash*)tb->tb_data;
- struct fn_zone *fz = t->fn_zones[0];
-
- if (fz == NULL)
- return;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- read_lock(&fib_hash_lock);
- hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
- fa->fa_state |= FA_S_ACCESSED;
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, &fn_hash_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- fn_hash_last_dflt = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- }
-
- if (order <= 0 || fi == NULL) {
- fn_hash_last_dflt = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- fn_hash_last_dflt = order;
- goto out;
- }
-
- if (last_idx >= 0) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = last_resort;
- if (last_resort)
- atomic_inc(&last_resort->fib_clntref);
- }
- fn_hash_last_dflt = last_idx;
-out:
- read_unlock(&fib_hash_lock);
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
- struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
-
- hlist_add_head(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
- struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
- struct hlist_node *node;
- struct fib_node *f;
-
- hlist_for_each_entry(f, node, head, fn_hash) {
- if (f->fn_key == key)
- return f;
- }
-
- return NULL;
-}
-
-static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fib_node *new_f, *f;
- struct fib_alias *fa, *new_fa;
- struct fn_zone *fz;
- struct fib_info *fi;
- u8 tos = cfg->fc_tos;
- __be32 key;
- int err;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- fz = table->fn_zones[cfg->fc_dst_len];
- if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
- return -ENOBUFS;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- fi = fib_create_info(cfg);
- if (IS_ERR(fi))
- return PTR_ERR(fi);
-
- if (fz->fz_nent > (fz->fz_divisor<<1) &&
- fz->fz_divisor < FZ_MAX_DIVISOR &&
- (cfg->fc_dst_len == 32 ||
- (1 << cfg->fc_dst_len) > fz->fz_divisor))
- fn_rehash_zone(fz);
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
- /* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
- * exists or to the node before which we will insert new one.
- *
- * If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
- */
-
- if (fa && fa->fa_tos == tos &&
- fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_orig;
-
- err = -EEXIST;
- if (cfg->fc_nlflags & NLM_F_EXCL)
- goto out;
-
- if (cfg->fc_nlflags & NLM_F_REPLACE) {
- struct fib_info *fi_drop;
- u8 state;
-
- write_lock_bh(&fib_hash_lock);
- fi_drop = fa->fa_info;
- fa->fa_info = fi;
- fa->fa_type = cfg->fc_type;
- fa->fa_scope = cfg->fc_scope;
- state = fa->fa_state;
- fa->fa_state &= ~FA_S_ACCESSED;
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- fib_release_info(fi_drop);
- if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
- rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, NLM_F_REPLACE);
- return 0;
- }
-
- /* Error if we find a perfect match which
- * uses the same scope, type, and nexthop
- * information.
- */
- fa_orig = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi)
- goto out;
- }
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_orig;
- }
-
- err = -ENOENT;
- if (!(cfg->fc_nlflags & NLM_F_CREATE))
- goto out;
-
- err = -ENOBUFS;
- new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
- goto out;
-
- new_f = NULL;
- if (!f) {
- new_f = kmem_cache_alloc(fn_hash_kmem, GFP_KERNEL);
- if (new_f == NULL)
- goto out_free_new_fa;
-
- INIT_HLIST_NODE(&new_f->fn_hash);
- INIT_LIST_HEAD(&new_f->fn_alias);
- new_f->fn_key = key;
- f = new_f;
- }
-
- new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- new_fa->fa_state = 0;
-
- /*
- * Insert new entry to the list.
- */
-
- write_lock_bh(&fib_hash_lock);
- if (new_f)
- fib_insert_node(fz, new_f);
- list_add_tail(&new_fa->fa_list,
- (fa ? &fa->fa_list : &f->fn_alias));
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- if (new_f)
- fz->fz_nent++;
- rt_cache_flush(-1);
-
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, 0);
- return 0;
-
-out_free_new_fa:
- kmem_cache_free(fn_alias_kmem, new_fa);
-out:
- fib_release_info(fi);
- return err;
-}
-
-
-static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash*)tb->tb_data;
- struct fib_node *f;
- struct fib_alias *fa, *fa_to_delete;
- struct fn_zone *fz;
- __be32 key;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
- return -ESRCH;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
- if (!fa)
- return -ESRCH;
-
- fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fa->fa_tos != cfg->fc_tos)
- break;
-
- if ((!cfg->fc_type ||
- fa->fa_type == cfg->fc_type) &&
- (cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
- (!cfg->fc_protocol ||
- fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
- fa_to_delete = fa;
- break;
- }
- }
-
- if (fa_to_delete) {
- int kill_fn;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, 0);
-
- kill_fn = 0;
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
- kill_fn = 1;
- }
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
- fn_free_alias(fa);
- if (kill_fn) {
- fn_free_node(f);
- fz->fz_nent--;
- }
-
- return 0;
- }
- return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
- struct hlist_head *head = &fz->fz_hash[idx];
- struct hlist_node *node, *n;
- struct fib_node *f;
- int found = 0;
-
- hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
- struct fib_alias *fa, *fa_node;
- int kill_f;
-
- kill_f = 0;
- list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
- kill_f = 1;
- }
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
-
- fn_free_alias(fa);
- found++;
- }
- }
- if (kill_f) {
- fn_free_node(f);
- fz->fz_nent--;
- }
- }
- return found;
-}
-
-static int fn_hash_flush(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz;
- int found = 0;
-
- for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
- int i;
-
- for (i = fz->fz_divisor - 1; i >= 0; i--)
- found += fn_flush_list(fz, i);
- }
- return found;
-}
-
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz,
- struct hlist_head *head)
-{
- struct hlist_node *node;
- struct fib_node *f;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
- hlist_for_each_entry(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
- if (i < s_i)
- goto next;
-
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- fa->fa_scope,
- f->fn_key,
- fz->fz_order,
- fa->fa_tos,
- fa->fa_info,
- NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
- next:
- i++;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz)
-{
- int h, s_h;
-
- s_h = cb->args[3];
- for (h=0; h < fz->fz_divisor; h++) {
- if (h < s_h) continue;
- if (h > s_h)
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
- if (fz->fz_hash == NULL ||
- hlist_empty(&fz->fz_hash[h]))
- continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
- cb->args[3] = h;
- return -1;
- }
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
-{
- int m, s_m;
- struct fn_zone *fz;
- struct fn_hash *table = (struct fn_hash*)tb->tb_data;
-
- s_m = cb->args[2];
- read_lock(&fib_hash_lock);
- for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
- if (m < s_m) continue;
- if (m > s_m)
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
- if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
- cb->args[2] = m;
- read_unlock(&fib_hash_lock);
- return -1;
- }
- }
- read_unlock(&fib_hash_lock);
- cb->args[2] = m;
- return skb->len;
-}
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(u32 id)
-#else
-struct fib_table * __init fib_hash_init(u32 id)
-#endif
-{
- struct fib_table *tb;
-
- if (fn_hash_kmem == NULL)
- fn_hash_kmem = kmem_cache_create("ip_fib_hash",
- sizeof(struct fib_node),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
-
- if (fn_alias_kmem == NULL)
- fn_alias_kmem = kmem_cache_create("ip_fib_alias",
- sizeof(struct fib_alias),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
-
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
- GFP_KERNEL);
- if (tb == NULL)
- return NULL;
-
- tb->tb_id = id;
- tb->tb_lookup = fn_hash_lookup;
- tb->tb_insert = fn_hash_insert;
- tb->tb_delete = fn_hash_delete;
- tb->tb_flush = fn_hash_flush;
- tb->tb_select_default = fn_hash_select_default;
- tb->tb_dump = fn_hash_dump;
- memset(tb->tb_data, 0, sizeof(struct fn_hash));
- return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
- struct fn_zone *zone;
- int bucket;
- struct hlist_head *hash_head;
- struct fib_node *fn;
- struct fib_alias *fa;
- loff_t pos;
- unsigned int genid;
- int valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
-
- iter->bucket = 0;
- iter->hash_head = NULL;
- iter->fn = NULL;
- iter->fa = NULL;
- iter->pos = 0;
- iter->genid = fib_hash_genid;
- iter->valid = 1;
-
- for (iter->zone = table->fn_zone_list; iter->zone;
- iter->zone = iter->zone->fz_next) {
- int maxslot;
-
- if (!iter->zone->fz_nent)
- continue;
-
- iter->hash_head = iter->zone->fz_hash;
- maxslot = iter->zone->fz_divisor;
-
- for (iter->bucket = 0; iter->bucket < maxslot;
- ++iter->bucket, ++iter->hash_head) {
- struct hlist_node *node;
- struct fib_node *fn;
-
- hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa,&fn->fn_alias,fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
- }
-out:
- return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_node *fn;
- struct fib_alias *fa;
-
- /* Advance FA, if any. */
- fn = iter->fn;
- fa = iter->fa;
- if (fa) {
- BUG_ON(!fn);
- list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
-
- fa = iter->fa = NULL;
-
- /* Advance FN. */
- if (fn) {
- struct hlist_node *node = &fn->fn_hash;
- hlist_for_each_entry_continue(fn, node, fn_hash) {
- iter->fn = fn;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- fn = iter->fn = NULL;
-
- /* Advance hash chain. */
- if (!iter->zone)
- goto out;
-
- for (;;) {
- struct hlist_node *node;
- int maxslot;
-
- maxslot = iter->zone->fz_divisor;
-
- while (++iter->bucket < maxslot) {
- iter->hash_head++;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- iter->zone = iter->zone->fz_next;
-
- if (!iter->zone)
- goto out;
-
- iter->bucket = 0;
- iter->hash_head = iter->zone->fz_hash;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-out:
- iter->pos++;
- return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_alias *fa;
-
- if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
- fa = iter->fa;
- pos -= iter->pos;
- } else
- fa = fib_get_first(seq);
-
- if (fa)
- while (pos && (fa = fib_get_next(seq)))
- --pos;
- return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
-{
- void *v = NULL;
-
- read_lock(&fib_hash_lock);
- if (ip_fib_main_table)
- v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
- return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
-{
- read_unlock(&fib_hash_lock);
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
- static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
-
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
- if (mask == htonl(0xFFFFFFFF))
- flags |= RTF_HOST;
- flags |= RTF_UP;
- return flags;
-}
-
-/*
- * This outputs /proc/net/route.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
- struct fib_iter_state *iter;
- char bf[128];
- __be32 prefix, mask;
- unsigned flags;
- struct fib_node *f;
- struct fib_alias *fa;
- struct fib_info *fi;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
- "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
- "\tWindow\tIRTT");
- goto out;
- }
-
- iter = seq->private;
- f = iter->fn;
- fa = iter->fa;
- fi = fa->fa_info;
- prefix = f->fn_key;
- mask = FZ_MASK(iter->zone);
- flags = fib_flag_trans(fa->fa_type, mask, fi);
- if (fi)
- snprintf(bf, sizeof(bf),
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*", prefix,
- fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
- mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3);
- else
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
- prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0);
- seq_printf(seq, "%-127s\n", bf);
-out:
- return 0;
-}
-
-static const struct seq_operations fib_seq_ops = {
- .start = fib_seq_start,
- .next = fib_seq_next,
- .stop = fib_seq_stop,
- .show = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_private(file, &fib_seq_ops,
- sizeof(struct fib_iter_state));
-}
-
-static const struct file_operations fib_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-
-int __init fib_proc_init(void)
-{
- if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops))
- return -ENOMEM;
- return 0;
-}
-
-void __init fib_proc_exit(void)
-{
- proc_net_remove(&init_net, "route");
-}
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index eef9eec17e0..1e4f6600b31 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,35 +7,45 @@
struct fib_alias {
struct list_head fa_list;
- struct rcu_head rcu;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
- u8 fa_scope;
u8 fa_state;
+ struct rcu_head rcu;
};
#define FA_S_ACCESSED 0x01
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+ if (!(fa->fa_state & FA_S_ACCESSED))
+ fa->fa_state |= FA_S_ACCESSED;
+}
+
/* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
- const struct flowi *flp,
- struct fib_result *res, __be32 zone, __be32 mask,
- int prefixlen);
-extern void fib_release_info(struct fib_info *);
-extern struct fib_info *fib_create_info(struct fib_config *cfg);
-extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
-extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, __be32 dst,
- int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
-extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info,
- unsigned int nlm_flags);
-extern struct fib_alias *fib_find_alias(struct list_head *fah,
- u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort,
- int *last_idx, int *dflt);
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg);
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
+ u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+ u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
+
+static inline void fib_result_assign(struct fib_result *res,
+ struct fib_info *fi)
+{
+ /* we used to play games with refcounts, but we now use RCU */
+ res->fi = fi;
+}
+
+struct fib_prop {
+ int error;
+ u8 scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f16839c6a72..f2e15738534 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
* IPv4 Forwarding Information Base: policy rules.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * Thomas Graf <tgraf@suug.ch>
+ * Thomas Graf <tgraf@suug.ch>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
* 2 of the License, or (at your option) any later version.
*
* Fixes:
- * Rani Assaf : local_rule cannot be deleted
+ * Rani Assaf : local_rule cannot be deleted
* Marc Boucher : routing by fwmark
*/
@@ -26,16 +26,14 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
+#include <linux/export.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/ip_fib.h>
#include <net/fib_rules.h>
-static struct fib_rules_ops fib4_rules_ops;
-
-struct fib4_rule
-{
+struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
@@ -44,57 +42,29 @@ struct fib4_rule
__be32 srcmask;
__be32 dst;
__be32 dstmask;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
u32 tclassid;
#endif
};
-static struct fib4_rule default_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .pref = 0x7FFF,
- .table = RT_TABLE_DEFAULT,
- .action = FR_ACT_TO_TBL,
- },
-};
-
-static struct fib4_rule main_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .pref = 0x7FFE,
- .table = RT_TABLE_MAIN,
- .action = FR_ACT_TO_TBL,
- },
-};
-
-static struct fib4_rule local_rule = {
- .common = {
- .refcnt = ATOMIC_INIT(2),
- .table = RT_TABLE_LOCAL,
- .action = FR_ACT_TO_TBL,
- .flags = FIB_RULE_PERMANENT,
- },
-};
-
-#ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
-{
- return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
-}
-#endif
-
-int fib_lookup(struct flowi *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
+ .flags = FIB_LOOKUP_NOREF,
};
int err;
- err = fib_rules_lookup(&fib4_rules_ops, flp, 0, &arg);
- res->r = arg.rule;
-
+ err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (arg.rule)
+ res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
+ else
+ res->tclassid = 0;
+#endif
return err;
}
+EXPORT_SYMBOL_GPL(__fib_lookup);
static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
@@ -120,50 +90,69 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
goto errout;
}
- if ((tbl = fib_get_table(rule->table)) == NULL)
+ tbl = fib_get_table(rule->fr_net, rule->table);
+ if (!tbl)
goto errout;
- err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result);
+ err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
if (err > 0)
err = -EAGAIN;
errout:
return err;
}
-
-void fib_select_default(const struct flowi *flp, struct fib_result *res)
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
- if (res->r && res->r->action == FR_ACT_TO_TBL &&
- FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
- struct fib_table *tb;
- if ((tb = fib_get_table(res->r->table)) != NULL)
- tb->tb_select_default(tb, flp, res);
- }
+ struct fib_result *result = (struct fib_result *) arg->result;
+ struct net_device *dev = NULL;
+
+ if (result->fi)
+ dev = result->fi->fib_dev;
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (result->prefixlen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ if (!(arg->flags & FIB_LOOKUP_NOREF))
+ fib_info_put(result->fi);
+ return true;
}
static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
struct fib4_rule *r = (struct fib4_rule *) rule;
- __be32 daddr = fl->fl4_dst;
- __be32 saddr = fl->fl4_src;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ __be32 daddr = fl4->daddr;
+ __be32 saddr = fl4->saddr;
if (((saddr ^ r->src) & r->srcmask) ||
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl->fl4_tos))
+ if (r->tos && (r->tos != fl4->flowi4_tos))
return 0;
return 1;
}
-static struct fib_table *fib_empty_table(void)
+static struct fib_table *fib_empty_table(struct net *net)
{
u32 id;
for (id = 1; id <= RT_TABLE_MAX; id++)
- if (fib_get_table(id) == NULL)
- return fib_new_table(id);
+ if (fib_get_table(net, id) == NULL)
+ return fib_new_table(net, id);
return NULL;
}
@@ -173,9 +162,10 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
};
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
+ struct fib_rule_hdr *frh,
struct nlattr **tb)
{
+ struct net *net = sock_net(skb->sk);
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
@@ -186,7 +176,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (rule->action == FR_ACT_TO_TBL) {
struct fib_table *table;
- table = fib_empty_table();
+ table = fib_empty_table(net);
if (table == NULL) {
err = -ENOBUFS;
goto errout;
@@ -202,9 +192,12 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (frh->dst_len)
rule4->dst = nla_get_be32(tb[FRA_DST]);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (tb[FRA_FLOW])
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (tb[FRA_FLOW]) {
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users++;
+ }
#endif
rule4->src_len = frh->src_len;
@@ -213,11 +206,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->dstmask = inet_make_mask(rule4->dst_len);
rule4->tos = frh->tos;
+ net->ipv4.fib_has_custom_rules = true;
err = 0;
errout:
return err;
}
+static void fib4_rule_delete(struct fib_rule *rule)
+{
+ struct net *net = rule->fr_net;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users--;
+#endif
+ net->ipv4.fib_has_custom_rules = true;
+}
+
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
struct nlattr **tb)
{
@@ -232,7 +238,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->tos && (rule4->tos != frh->tos))
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
#endif
@@ -247,24 +253,23 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
}
static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
+ struct fib_rule_hdr *frh)
{
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
- frh->family = AF_INET;
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
frh->tos = rule4->tos;
- if (rule4->dst_len)
- NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
-
- if (rule4->src_len)
- NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
-
-#ifdef CONFIG_NET_CLS_ROUTE
- if (rule4->tclassid)
- NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
+ if ((rule4->dst_len &&
+ nla_put_be32(skb, FRA_DST, rule4->dst)) ||
+ (rule4->src_len &&
+ nla_put_be32(skb, FRA_SRC, rule4->src)))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rule4->tclassid &&
+ nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
+ goto nla_put_failure;
#endif
return 0;
@@ -272,23 +277,6 @@ nla_put_failure:
return -ENOBUFS;
}
-static u32 fib4_rule_default_pref(void)
-{
- struct list_head *pos;
- struct fib_rule *rule;
-
- if (!list_empty(&fib4_rules_ops.rules_list)) {
- pos = fib4_rules_ops.rules_list.next;
- if (pos->next != &fib4_rules_ops.rules_list) {
- rule = list_entry(pos->next, struct fib_rule, list);
- if (rule->pref)
- return rule->pref - 1;
- }
- }
-
- return 0;
-}
-
static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(4) /* dst */
@@ -296,34 +284,69 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+ nla_total_size(4); /* flow */
}
-static void fib4_rule_flush_cache(void)
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
{
- rt_cache_flush(-1);
+ rt_cache_flush(ops->fro_net);
}
-static struct fib_rules_ops fib4_rules_ops = {
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.family = AF_INET,
.rule_size = sizeof(struct fib4_rule),
.addr_size = sizeof(u32),
.action = fib4_rule_action,
+ .suppress = fib4_rule_suppress,
.match = fib4_rule_match,
.configure = fib4_rule_configure,
+ .delete = fib4_rule_delete,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
- .default_pref = fib4_rule_default_pref,
+ .default_pref = fib_default_rule_pref,
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
.policy = fib4_rule_policy,
- .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list),
.owner = THIS_MODULE,
};
-void __init fib4_rules_init(void)
+static int fib_default_rules_init(struct fib_rules_ops *ops)
+{
+ int err;
+
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+int __net_init fib4_rules_init(struct net *net)
{
- list_add_tail(&local_rule.common.list, &fib4_rules_ops.rules_list);
- list_add_tail(&main_rule.common.list, &fib4_rules_ops.rules_list);
- list_add_tail(&default_rule.common.list, &fib4_rules_ops.rules_list);
+ int err;
+ struct fib_rules_ops *ops;
+
+ ops = fib_rules_register(&fib4_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
- fib_rules_register(&fib4_rules_ops);
+ err = fib_default_rules_init(ops);
+ if (err < 0)
+ goto fail;
+ net->ipv4.rules_ops = ops;
+ net->ipv4.fib_has_custom_rules = false;
+ return 0;
+
+fail:
+ /* also cleans all rules already added */
+ fib_rules_unregister(ops);
+ return err;
+}
+
+void __net_exit fib4_rules_exit(struct net *net)
+{
+ fib_rules_unregister(net->ipv4.rules_ops);
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1351a2617dc..b10cd43a472 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -5,8 +5,6 @@
*
* IPv4 Forwarding Information Base: semantics.
*
- * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
@@ -16,7 +14,6 @@
*/
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
@@ -34,6 +31,7 @@
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/arp.h>
#include <net/ip.h>
@@ -47,12 +45,10 @@
#include "fib_lookup.h"
-#define FSprintk(a...)
-
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
+static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;
#define DEVINDEX_HASHBITS 8
@@ -63,98 +59,178 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
static DEFINE_SPINLOCK(fib_multipath_lock);
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh; \
+ for (nhsel = 0, nh = (fi)->fib_nh; \
+ nhsel < (fi)->fib_nhs; \
+ nh++, nhsel++)
-#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
-for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+#define change_nexthops(fi) { \
+ int nhsel; struct fib_nh *nexthop_nh; \
+ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ nhsel < (fi)->fib_nhs; \
+ nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
/* Hope, that gcc will optimize it to get rid of dummy loop */
-#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
+ for (nhsel = 0; nhsel < 1; nhsel++)
-#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) { \
+ int nhsel; \
+ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ for (nhsel = 0; nhsel < 1; nhsel++)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
#define endfor_nexthops(fi) }
-static const struct
-{
- int error;
- u8 scope;
-} fib_props[RTN_MAX + 1] = {
- {
+const struct fib_prop fib_props[RTN_MAX + 1] = {
+ [RTN_UNSPEC] = {
.error = 0,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_UNSPEC */
- {
+ },
+ [RTN_UNICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNICAST */
- {
+ },
+ [RTN_LOCAL] = {
.error = 0,
.scope = RT_SCOPE_HOST,
- }, /* RTN_LOCAL */
- {
+ },
+ [RTN_BROADCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_BROADCAST */
- {
+ },
+ [RTN_ANYCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_ANYCAST */
- {
+ },
+ [RTN_MULTICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_MULTICAST */
- {
+ },
+ [RTN_BLACKHOLE] = {
.error = -EINVAL,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_BLACKHOLE */
- {
+ },
+ [RTN_UNREACHABLE] = {
.error = -EHOSTUNREACH,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNREACHABLE */
- {
+ },
+ [RTN_PROHIBIT] = {
.error = -EACCES,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_PROHIBIT */
- {
+ },
+ [RTN_THROW] = {
.error = -EAGAIN,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_THROW */
- {
+ },
+ [RTN_NAT] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_NAT */
- {
+ },
+ [RTN_XRESOLVE] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_XRESOLVE */
+ },
};
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+ struct rtable *rt = rcu_dereference_protected(*rtp, 1);
+
+ if (!rt)
+ return;
+
+ /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+ * because we waited an RCU grace period before calling
+ * free_fib_info_rcu()
+ */
+
+ dst_free(&rt->dst);
+}
+
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+ struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = rcu_dereference_protected(hash[i].chain, 1);
+ while (fnhe) {
+ struct fib_nh_exception *next;
+
+ next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+ rt_fibinfo_free(&fnhe->fnhe_rth_input);
+ rt_fibinfo_free(&fnhe->fnhe_rth_output);
+
+ kfree(fnhe);
+
+ fnhe = next;
+ }
+ }
+ kfree(hash);
+}
+
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+ int cpu;
+
+ if (!rtp)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rtable *rt;
+
+ rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+ if (rt)
+ dst_free(&rt->dst);
+ }
+ free_percpu(rtp);
+}
/* Release a nexthop info record */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+ struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_dev)
+ dev_put(nexthop_nh->nh_dev);
+ if (nexthop_nh->nh_exceptions)
+ free_nh_exceptions(nexthop_nh);
+ rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
+ rt_fibinfo_free(&nexthop_nh->nh_rth_input);
+ } endfor_nexthops(fi);
+
+ release_net(fi->fib_net);
+ if (fi->fib_metrics != (u32 *) dst_default_metrics)
+ kfree(fi->fib_metrics);
+ kfree(fi);
+}
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- printk("Freeing alive fib_info %p\n", fi);
+ pr_warn("Freeing alive fib_info %p\n", fi);
return;
}
+ fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
change_nexthops(fi) {
- if (nh->nh_dev)
- dev_put(nh->nh_dev);
- nh->nh_dev = NULL;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users--;
} endfor_nexthops(fi);
- fib_info_cnt--;
- kfree(fi);
+#endif
+ call_rcu(&fi->rcu, free_fib_info_rcu);
}
void fib_release_info(struct fib_info *fi)
@@ -165,9 +241,9 @@ void fib_release_info(struct fib_info *fi)
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
change_nexthops(fi) {
- if (!nh->nh_dev)
+ if (!nexthop_nh->nh_dev)
continue;
- hlist_del(&nh->nh_hash);
+ hlist_del(&nexthop_nh->nh_hash);
} endfor_nexthops(fi)
fi->fib_dead = 1;
fib_info_put(fi);
@@ -175,7 +251,7 @@ void fib_release_info(struct fib_info *fi)
spin_unlock_bh(&fib_info_lock);
}
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
const struct fib_nh *onh = ofi->fib_nh;
@@ -186,24 +262,36 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight != onh->nh_weight ||
#endif
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
return -1;
onh++;
} endfor_nexthops(fi);
return 0;
}
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+ unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+ return (val ^
+ (val >> DEVINDEX_HASHBITS) ^
+ (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+}
+
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
unsigned int val = fi->fib_nhs;
- val ^= fi->fib_protocol;
+ val ^= (fi->fib_protocol << 8) | fi->fib_scope;
val ^= (__force u32)fi->fib_prefsrc;
val ^= fi->fib_priority;
+ for_nexthops(fi) {
+ val ^= fib_devindex_hashfn(nh->nh_oif);
+ } endfor_nexthops(fi)
return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
@@ -211,22 +299,25 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_info *fi;
unsigned int hash;
hash = fib_info_hashfn(nfi);
head = &fib_info_hash[hash];
- hlist_for_each_entry(fi, node, head, fib_hash) {
+ hlist_for_each_entry(fi, head, fib_hash) {
+ if (!net_eq(fi->fib_net, nfi->fib_net))
+ continue;
if (fi->fib_nhs != nfi->fib_nhs)
continue;
if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_scope == fi->fib_scope &&
nfi->fib_prefsrc == fi->fib_prefsrc &&
nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_type == fi->fib_type &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
- sizeof(fi->fib_metrics)) == 0 &&
- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ sizeof(u32) * RTAX_MAX) == 0 &&
+ ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
}
@@ -234,23 +325,12 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
return NULL;
}
-static inline unsigned int fib_devindex_hashfn(unsigned int val)
-{
- unsigned int mask = DEVINDEX_HASHSIZE - 1;
-
- return (val ^
- (val >> DEVINDEX_HASHBITS) ^
- (val >> (DEVINDEX_HASHBITS * 2))) & mask;
-}
-
/* Check, that the gateway is already configured.
- Used only by redirect accept routine.
+ * Used only by redirect accept routine.
*/
-
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
unsigned int hash;
@@ -258,10 +338,10 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
hash = fib_devindex_hashfn(dev->ifindex);
head = &fib_info_devhash[hash];
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
if (nh->nh_dev == dev &&
nh->nh_gw == gw &&
- !(nh->nh_flags&RTNH_F_DEAD)) {
+ !(nh->nh_flags & RTNH_F_DEAD)) {
spin_unlock(&fib_info_lock);
return 0;
}
@@ -300,7 +380,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
}
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
- int dst_len, u32 tb_id, struct nl_info *info,
+ int dst_len, u32 tb_id, const struct nl_info *info,
unsigned int nlm_flags)
{
struct sk_buff *skb;
@@ -311,8 +391,8 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
if (skb == NULL)
goto errout;
- err = fib_dump_info(skb, info->pid, seq, event, tb_id,
- fa->fa_type, fa->fa_scope, key, dst_len,
+ err = fib_dump_info(skb, info->portid, seq, event, tb_id,
+ fa->fa_type, key, dst_len,
fa->fa_tos, fa->fa_info, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -320,11 +400,12 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
- info->nlh, GFP_KERNEL);
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
- rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
/* Return the first fib alias matching TOS with
@@ -345,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
return NULL;
}
-int fib_detect_death(struct fib_info *fi, int order,
- struct fib_info **last_resort, int *last_idx, int *dflt)
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx,
+ int dflt)
{
struct neighbour *n;
int state = NUD_NONE;
@@ -356,12 +438,12 @@ int fib_detect_death(struct fib_info *fi, int order,
state = n->nud_state;
neigh_release(n);
}
- if (state==NUD_REACHABLE)
+ if (state == NUD_REACHABLE)
return 0;
- if ((state&NUD_VALID) && order != *dflt)
+ if ((state & NUD_VALID) && order != dflt)
return 0;
- if ((state&NUD_VALID) ||
- (*last_idx<0 && order > *dflt)) {
+ if ((state & NUD_VALID) ||
+ (*last_idx < 0 && order > dflt)) {
*last_resort = fi;
*last_idx = order;
}
@@ -392,19 +474,22 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (!rtnh_ok(rtnh, remaining))
return -EINVAL;
- nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
- nh->nh_oif = rtnh->rtnh_ifindex;
- nh->nh_weight = rtnh->rtnh_hops + 1;
+ nexthop_nh->nh_flags =
+ (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+ nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
+ nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- nh->nh_gw = nla ? nla_get_be32(nla) : 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+ nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
- nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+ nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
}
@@ -456,7 +541,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla && nla_get_be32(nla) != nh->nh_gw)
return 1;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
if (nla && nla_get_u32(nla) != nh->nh_tclassid)
return 1;
@@ -471,147 +556,147 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
/*
- Picture
- -------
-
- Semantics of nexthop is very messy by historical reasons.
- We have to take into account, that:
- a) gateway can be actually local interface address,
- so that gatewayed route is direct.
- b) gateway must be on-link address, possibly
- described not by an ifaddr, but also by a direct route.
- c) If both gateway and interface are specified, they should not
- contradict.
- d) If we use tunnel routes, gateway could be not on-link.
-
- Attempt to reconcile all of these (alas, self-contradictory) conditions
- results in pretty ugly and hairy code with obscure logic.
-
- I chose to generalized it instead, so that the size
- of code does not increase practically, but it becomes
- much more general.
- Every prefix is assigned a "scope" value: "host" is local address,
- "link" is direct route,
- [ ... "site" ... "interior" ... ]
- and "universe" is true gateway route with global meaning.
-
- Every prefix refers to a set of "nexthop"s (gw, oif),
- where gw must have narrower scope. This recursion stops
- when gw has LOCAL scope or if "nexthop" is declared ONLINK,
- which means that gw is forced to be on link.
-
- Code is still hairy, but now it is apparently logically
- consistent and very flexible. F.e. as by-product it allows
- to co-exists in peace independent exterior and interior
- routing processes.
-
- Normally it looks as following.
-
- {universe prefix} -> (gw, oif) [scope link]
- |
- |-> {link prefix} -> (gw, oif) [scope local]
- |
- |-> {local prefix} (terminal node)
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ * so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ * described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ * contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix} -> (gw, oif) [scope link]
+ * |
+ * |-> {link prefix} -> (gw, oif) [scope local]
+ * |
+ * |-> {local prefix} (terminal node)
*/
-
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh)
{
int err;
+ struct net *net;
+ struct net_device *dev;
+ net = cfg->fc_nlinfo.nl_net;
if (nh->nh_gw) {
struct fib_result res;
-#ifdef CONFIG_IP_ROUTE_PERVASIVE
- if (nh->nh_flags&RTNH_F_PERVASIVE)
- return 0;
-#endif
- if (nh->nh_flags&RTNH_F_ONLINK) {
- struct net_device *dev;
+ if (nh->nh_flags & RTNH_F_ONLINK) {
if (cfg->fc_scope >= RT_SCOPE_LINK)
return -EINVAL;
- if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
+ if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
return -EINVAL;
- if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL)
+ dev = __dev_get_by_index(net, nh->nh_oif);
+ if (!dev)
return -ENODEV;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return -ENETDOWN;
nh->nh_dev = dev;
dev_hold(dev);
nh->nh_scope = RT_SCOPE_LINK;
return 0;
}
+ rcu_read_lock();
{
- struct flowi fl = {
- .nl_u = {
- .ip4_u = {
- .daddr = nh->nh_gw,
- .scope = cfg->fc_scope + 1,
- },
- },
- .oif = nh->nh_oif,
+ struct flowi4 fl4 = {
+ .daddr = nh->nh_gw,
+ .flowi4_scope = cfg->fc_scope + 1,
+ .flowi4_oif = nh->nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
};
/* It is not necessary, but requires a bit of thinking */
- if (fl.fl4_scope < RT_SCOPE_LINK)
- fl.fl4_scope = RT_SCOPE_LINK;
- if ((err = fib_lookup(&fl, &res)) != 0)
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+ err = fib_lookup(net, &fl4, &res);
+ if (err) {
+ rcu_read_unlock();
return err;
+ }
}
err = -EINVAL;
if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
goto out;
nh->nh_scope = res.scope;
nh->nh_oif = FIB_RES_OIF(res);
- if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
- goto out;
- dev_hold(nh->nh_dev);
- err = -ENETDOWN;
- if (!(nh->nh_dev->flags & IFF_UP))
+ nh->nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev)
goto out;
- err = 0;
-out:
- fib_res_put(&res);
- return err;
+ dev_hold(dev);
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
} else {
struct in_device *in_dev;
- if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
return -EINVAL;
- in_dev = inetdev_by_index(nh->nh_oif);
+ rcu_read_lock();
+ err = -ENODEV;
+ in_dev = inetdev_by_index(net, nh->nh_oif);
if (in_dev == NULL)
- return -ENODEV;
- if (!(in_dev->dev->flags&IFF_UP)) {
- in_dev_put(in_dev);
- return -ENETDOWN;
- }
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP))
+ goto out;
nh->nh_dev = in_dev->dev;
dev_hold(nh->nh_dev);
nh->nh_scope = RT_SCOPE_HOST;
- in_dev_put(in_dev);
+ err = 0;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return err;
}
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
- return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+ return ((__force u32)val ^
+ ((__force u32)val >> 7) ^
+ ((__force u32)val >> 14)) & mask;
}
-static struct hlist_head *fib_hash_alloc(int bytes)
+static struct hlist_head *fib_info_hash_alloc(int bytes)
{
if (bytes <= PAGE_SIZE)
- return kmalloc(bytes, GFP_KERNEL);
+ return kzalloc(bytes, GFP_KERNEL);
else
return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL, get_order(bytes));
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(bytes));
}
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
if (!hash)
return;
@@ -622,25 +707,25 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
free_pages((unsigned long) hash, get_order(bytes));
}
-static void fib_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_hash_size;
+ unsigned int old_size = fib_info_hash_size;
unsigned int i, bytes;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
old_laddrhash = fib_info_laddrhash;
- fib_hash_size = new_size;
+ fib_info_hash_size = new_size;
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+ hlist_for_each_entry_safe(fi, n, head, fib_hash) {
struct hlist_head *dest;
unsigned int new_hash;
@@ -655,10 +740,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
for (i = 0; i < old_size; i++) {
struct hlist_head *lhead = &fib_info_laddrhash[i];
- struct hlist_node *node, *n;
+ struct hlist_node *n;
struct fib_info *fi;
- hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
struct hlist_head *ldest;
unsigned int new_hash;
@@ -674,8 +759,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
spin_unlock_bh(&fib_info_lock);
bytes = old_size * sizeof(struct hlist_head *);
- fib_hash_free(old_info_hash, bytes);
- fib_hash_free(old_laddrhash, bytes);
+ fib_info_hash_free(old_info_hash, bytes);
+ fib_info_hash_free(old_laddrhash, bytes);
+}
+
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+{
+ nh->nh_saddr = inet_select_addr(nh->nh_dev,
+ nh->nh_gw,
+ nh->nh_parent->fib_scope);
+ nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
+
+ return nh->nh_saddr;
}
struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -684,6 +779,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
struct fib_info *fi = NULL;
struct fib_info *ofi;
int nhs = 1;
+ struct net *net = cfg->fc_nlinfo.nl_net;
+
+ if (cfg->fc_type > RTN_MAX)
+ goto err_inval;
/* Fast check to catch the most weird cases */
if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
@@ -698,28 +797,24 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
#endif
err = -ENOBUFS;
- if (fib_info_cnt >= fib_hash_size) {
- unsigned int new_size = fib_hash_size << 1;
+ if (fib_info_cnt >= fib_info_hash_size) {
+ unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
unsigned int bytes;
if (!new_size)
- new_size = 1;
+ new_size = 16;
bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_hash_alloc(bytes);
- new_laddrhash = fib_hash_alloc(bytes);
+ new_info_hash = fib_info_hash_alloc(bytes);
+ new_laddrhash = fib_info_hash_alloc(bytes);
if (!new_info_hash || !new_laddrhash) {
- fib_hash_free(new_info_hash, bytes);
- fib_hash_free(new_laddrhash, bytes);
- } else {
- memset(new_info_hash, 0, bytes);
- memset(new_laddrhash, 0, bytes);
+ fib_info_hash_free(new_info_hash, bytes);
+ fib_info_hash_free(new_laddrhash, bytes);
+ } else
+ fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
- fib_hash_move(new_info_hash, new_laddrhash, new_size);
- }
-
- if (!fib_hash_size)
+ if (!fib_info_hash_size)
goto failure;
}
@@ -727,15 +822,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (fi == NULL)
goto failure;
fib_info_cnt++;
+ if (cfg->fc_mx) {
+ fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (!fi->fib_metrics)
+ goto failure;
+ } else
+ fi->fib_metrics = (u32 *) dst_default_metrics;
+ fi->fib_net = hold_net(net);
fi->fib_protocol = cfg->fc_protocol;
+ fi->fib_scope = cfg->fc_scope;
fi->fib_flags = cfg->fc_flags;
fi->fib_priority = cfg->fc_priority;
fi->fib_prefsrc = cfg->fc_prefsrc;
+ fi->fib_type = cfg->fc_type;
fi->fib_nhs = nhs;
change_nexthops(fi) {
- nh->nh_parent = fi;
+ nexthop_nh->nh_parent = fi;
+ nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
+ if (!nexthop_nh->nh_pcpu_rth_output)
+ goto failure;
} endfor_nexthops(fi)
if (cfg->fc_mx) {
@@ -746,9 +853,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
int type = nla_type(nla);
if (type) {
+ u32 val;
+
if (type > RTAX_MAX)
goto err_inval;
- fi->fib_metrics[type - 1] = nla_get_u32(nla);
+ val = nla_get_u32(nla);
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ fi->fib_metrics[type - 1] = val;
}
}
}
@@ -762,7 +876,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
goto err_inval;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
goto err_inval;
#endif
@@ -775,8 +889,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid = cfg->fc_flow;
+ if (nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight = 1;
@@ -787,6 +903,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
goto err_inval;
goto link_it;
+ } else {
+ switch (cfg->fc_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ break;
+ default:
+ goto err_inval;
+ }
}
if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -799,13 +926,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (nhs != 1 || nh->nh_gw)
goto err_inval;
nh->nh_scope = RT_SCOPE_NOWHERE;
- nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif);
+ nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
err = -ENODEV;
if (nh->nh_dev == NULL)
goto failure;
} else {
change_nexthops(fi) {
- if ((err = fib_check_nh(cfg, fi, nh)) != 0)
+ err = fib_check_nh(cfg, fi, nexthop_nh);
+ if (err != 0)
goto failure;
} endfor_nexthops(fi)
}
@@ -813,12 +941,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (fi->fib_prefsrc) {
if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
fi->fib_prefsrc != cfg->fc_dst)
- if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+ if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
goto err_inval;
}
+ change_nexthops(fi) {
+ fib_info_update_nh_saddr(net, nexthop_nh);
+ } endfor_nexthops(fi)
+
link_it:
- if ((ofi = fib_find_info(fi)) != NULL) {
+ ofi = fib_find_info(fi);
+ if (ofi) {
fi->fib_dead = 1;
free_fib_info(fi);
ofi->fib_treeref++;
@@ -840,11 +973,11 @@ link_it:
struct hlist_head *head;
unsigned int hash;
- if (!nh->nh_dev)
+ if (!nexthop_nh->nh_dev)
continue;
- hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
+ hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
head = &fib_info_devhash[hash];
- hlist_add_head(&nh->nh_hash, head);
+ hlist_add_head(&nexthop_nh->nh_hash, head);
} endfor_nexthops(fi)
spin_unlock_bh(&fib_info_lock);
return fi;
@@ -861,92 +994,14 @@ failure:
return ERR_PTR(err);
}
-/* Note! fib_semantic_match intentionally uses RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, __be32 zone, __be32 mask,
- int prefixlen)
-{
- struct fib_alias *fa;
- int nh_sel = 0;
-
- list_for_each_entry_rcu(fa, head, fa_list) {
- int err;
-
- if (fa->fa_tos &&
- fa->fa_tos != flp->fl4_tos)
- continue;
-
- if (fa->fa_scope < flp->fl4_scope)
- continue;
-
- fa->fa_state |= FA_S_ACCESSED;
-
- err = fib_props[fa->fa_type].error;
- if (err == 0) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi->fib_flags & RTNH_F_DEAD)
- continue;
-
- switch (fa->fa_type) {
- case RTN_UNICAST:
- case RTN_LOCAL:
- case RTN_BROADCAST:
- case RTN_ANYCAST:
- case RTN_MULTICAST:
- for_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
- continue;
- if (!flp->oif || flp->oif == nh->nh_oif)
- break;
- }
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (nhsel < fi->fib_nhs) {
- nh_sel = nhsel;
- goto out_fill_res;
- }
-#else
- if (nhsel < 1) {
- goto out_fill_res;
- }
-#endif
- endfor_nexthops(fi);
- continue;
-
- default:
- printk(KERN_DEBUG "impossible 102\n");
- return -EINVAL;
- }
- }
- return err;
- }
- return 1;
-
-out_fill_res:
- res->prefixlen = prefixlen;
- res->nh_sel = nh_sel;
- res->type = fa->fa_type;
- res->scope = fa->fa_scope;
- res->fi = fa->fa_info;
- atomic_inc(&res->fi->fib_clntref);
- return 0;
-}
-
-/* Find appropriate source address to this destination */
-
-__be32 __fib_res_prefsrc(struct fib_result *res)
-{
- return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
-}
-
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
+ u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
struct fib_info *fi, unsigned int flags)
{
struct nlmsghdr *nlh;
struct rtmsg *rtm;
- nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -955,34 +1010,40 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtm->rtm_dst_len = dst_len;
rtm->rtm_src_len = 0;
rtm->rtm_tos = tos;
- rtm->rtm_table = tb_id;
- NLA_PUT_U32(skb, RTA_TABLE, tb_id);
+ if (tb_id < 256)
+ rtm->rtm_table = tb_id;
+ else
+ rtm->rtm_table = RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, RTA_TABLE, tb_id))
+ goto nla_put_failure;
rtm->rtm_type = type;
rtm->rtm_flags = fi->fib_flags;
- rtm->rtm_scope = scope;
+ rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
- if (rtm->rtm_dst_len)
- NLA_PUT_BE32(skb, RTA_DST, dst);
-
- if (fi->fib_priority)
- NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
-
+ if (rtm->rtm_dst_len &&
+ nla_put_be32(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+ goto nla_put_failure;
if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
goto nla_put_failure;
- if (fi->fib_prefsrc)
- NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
-
+ if (fi->fib_prefsrc &&
+ nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ goto nla_put_failure;
if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
-
- if (fi->fib_nh->nh_oif)
- NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (fi->fib_nh[0].nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
+ if (fi->fib_nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+ goto nla_put_failure;
+ if (fi->fib_nh->nh_oif &&
+ nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (fi->fib_nh[0].nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
+ goto nla_put_failure;
#endif
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1003,11 +1064,13 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
rtnh->rtnh_hops = nh->nh_weight - 1;
rtnh->rtnh_ifindex = nh->nh_oif;
- if (nh->nh_gw)
- NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
-#ifdef CONFIG_NET_CLS_ROUTE
- if (nh->nh_tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
+ if (nh->nh_gw &&
+ nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
#endif
/* length of rtnetlink header + attributes */
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
@@ -1024,98 +1087,154 @@ nla_put_failure:
}
/*
- Update FIB if:
- - local address disappeared -> we must delete all the entries
- referring to it.
- - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ * referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
*/
+int fib_sync_down_addr(struct net *net, __be32 local)
+{
+ int ret = 0;
+ unsigned int hash = fib_laddr_hashfn(local);
+ struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct fib_info *fi;
-int fib_sync_down(__be32 local, struct net_device *dev, int force)
+ if (fib_info_laddrhash == NULL || local == 0)
+ return 0;
+
+ hlist_for_each_entry(fi, head, fib_lhash) {
+ if (!net_eq(fi->fib_net, net))
+ continue;
+ if (fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ return ret;
+}
+
+int fib_sync_down_dev(struct net_device *dev, int force)
{
int ret = 0;
int scope = RT_SCOPE_NOWHERE;
+ struct fib_info *prev_fi = NULL;
+ unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct fib_nh *nh;
if (force)
scope = -1;
- if (local && fib_info_laddrhash) {
- unsigned int hash = fib_laddr_hashfn(local);
- struct hlist_head *head = &fib_info_laddrhash[hash];
- struct hlist_node *node;
- struct fib_info *fi;
+ hlist_for_each_entry(nh, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int dead;
- hlist_for_each_entry(fi, node, head, fib_lhash) {
- if (fi->fib_prefsrc == local) {
- fi->fib_flags |= RTNH_F_DEAD;
- ret++;
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+ prev_fi = fi;
+ dead = 0;
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+ dead++;
+ else if (nexthop_nh->nh_dev == dev &&
+ nexthop_nh->nh_scope != scope) {
+ nexthop_nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ spin_lock_bh(&fib_multipath_lock);
+ fi->fib_power -= nexthop_nh->nh_power;
+ nexthop_nh->nh_power = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+#endif
+ dead++;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (force > 1 && nexthop_nh->nh_dev == dev) {
+ dead = fi->fib_nhs;
+ break;
+ }
+#endif
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
}
}
- if (dev) {
- struct fib_info *prev_fi = NULL;
- unsigned int hash = fib_devindex_hashfn(dev->ifindex);
- struct hlist_head *head = &fib_info_devhash[hash];
- struct hlist_node *node;
- struct fib_nh *nh;
+ return ret;
+}
- hlist_for_each_entry(nh, node, head, nh_hash) {
- struct fib_info *fi = nh->nh_parent;
- int dead;
+/* Must be invoked inside of an RCU protected region. */
+void fib_select_default(struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL;
+ struct list_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ int order = -1, last_idx = -1;
+ struct fib_alias *fa;
- BUG_ON(!fi->fib_nhs);
- if (nh->nh_dev != dev || fi == prev_fi)
- continue;
- prev_fi = fi;
- dead = 0;
- change_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
- dead++;
- else if (nh->nh_dev == dev &&
- nh->nh_scope != scope) {
- nh->nh_flags |= RTNH_F_DEAD;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nh->nh_power;
- nh->nh_power = 0;
- spin_unlock_bh(&fib_multipath_lock);
-#endif
- dead++;
- }
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (force > 1 && nh->nh_dev == dev) {
- dead = fi->fib_nhs;
- break;
- }
-#endif
- } endfor_nexthops(fi)
- if (dead == fi->fib_nhs) {
- fi->fib_flags |= RTNH_F_DEAD;
- ret++;
- }
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (next_fi->fib_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+
+ fib_alias_accessed(fa);
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
}
+ fi = next_fi;
+ order++;
}
- return ret;
+ if (order <= 0 || fi == NULL) {
+ tb->tb_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
+out:
+ return;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
- Dead device goes up. We wake up dead nexthops.
- It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
*/
-
int fib_sync_up(struct net_device *dev)
{
struct fib_info *prev_fi;
unsigned int hash;
struct hlist_head *head;
- struct hlist_node *node;
struct fib_nh *nh;
int ret;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return 0;
prev_fi = NULL;
@@ -1123,7 +1242,7 @@ int fib_sync_up(struct net_device *dev)
head = &fib_info_devhash[hash];
ret = 0;
- hlist_for_each_entry(nh, node, head, nh_hash) {
+ hlist_for_each_entry(nh, head, nh_hash) {
struct fib_info *fi = nh->nh_parent;
int alive;
@@ -1134,18 +1253,20 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
alive++;
continue;
}
- if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+ if (nexthop_nh->nh_dev == NULL ||
+ !(nexthop_nh->nh_dev->flags & IFF_UP))
continue;
- if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
+ if (nexthop_nh->nh_dev != dev ||
+ !__in_dev_get_rtnl(dev))
continue;
alive++;
spin_lock_bh(&fib_multipath_lock);
- nh->nh_power = 0;
- nh->nh_flags &= ~RTNH_F_DEAD;
+ nexthop_nh->nh_power = 0;
+ nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
spin_unlock_bh(&fib_multipath_lock);
} endfor_nexthops(fi)
@@ -1159,11 +1280,10 @@ int fib_sync_up(struct net_device *dev)
}
/*
- The algorithm is suboptimal, but it provides really
- fair weighted route distribution.
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
*/
-
-void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+void fib_select_multipath(struct fib_result *res)
{
struct fib_info *fi = res->fi;
int w;
@@ -1172,9 +1292,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
if (fi->fib_power <= 0) {
int power = 0;
change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
- power += nh->nh_weight;
- nh->nh_power = nh->nh_weight;
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ power += nexthop_nh->nh_weight;
+ nexthop_nh->nh_power = nexthop_nh->nh_weight;
}
} endfor_nexthops(fi);
fi->fib_power = power;
@@ -1188,15 +1308,17 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
/* w should be random number [0..fi->fib_power-1],
- it is pretty bad approximation.
+ * it is pretty bad approximation.
*/
w = jiffies % fi->fib_power;
change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
- if ((w -= nh->nh_power) <= 0) {
- nh->nh_power--;
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
+ nexthop_nh->nh_power) {
+ w -= nexthop_nh->nh_power;
+ if (w <= 0) {
+ nexthop_nh->nh_power--;
fi->fib_power--;
res->nh_sel = nhsel;
spin_unlock_bh(&fib_multipath_lock);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 81a8285d6d6..5afeb5aa4c7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -12,18 +12,16 @@
*
* Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
*
- * This work is based on the LPC-trie which is originally descibed in:
+ * This work is based on the LPC-trie which is originally described in:
*
* An experimental study of compression methods for dynamic tries
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+ * http://www.csc.kth.se/~snilsson/software/dyntrie2/
*
*
* IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
* IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
*
- * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
- *
*
* Code from fib_hash has been reused which includes the following header:
*
@@ -50,11 +48,10 @@
* Patrick McHardy <kaber@trash.net>
*/
-#define VERSION "0.408"
+#define VERSION "0.409"
#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/bitops.h>
+#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -73,6 +70,8 @@
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -82,7 +81,6 @@
#include <net/ip_fib.h>
#include "fib_lookup.h"
-#undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
@@ -97,34 +95,38 @@ typedef unsigned int t_key;
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
-struct node {
- t_key key;
+struct rt_trie_node {
unsigned long parent;
+ t_key key;
};
struct leaf {
- t_key key;
unsigned long parent;
+ t_key key;
struct hlist_head list;
struct rcu_head rcu;
};
struct leaf_info {
struct hlist_node hlist;
- struct rcu_head rcu;
int plen;
+ u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
struct list_head falh;
+ struct rcu_head rcu;
};
struct tnode {
- t_key key;
unsigned long parent;
- unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
- unsigned short full_children; /* KEYLENGTH bits needed */
- unsigned short empty_children; /* KEYLENGTH bits needed */
- struct rcu_head rcu;
- struct node *child[0];
+ t_key key;
+ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
+ unsigned int full_children; /* KEYLENGTH bits needed */
+ unsigned int empty_children; /* KEYLENGTH bits needed */
+ union {
+ struct rcu_head rcu;
+ struct tnode *tnode_free;
+ };
+ struct rt_trie_node __rcu *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -144,49 +146,88 @@ struct trie_stat {
unsigned int tnodes;
unsigned int leaves;
unsigned int nullpointers;
+ unsigned int prefixes;
unsigned int nodesizes[MAX_STAT_DEPTH];
};
struct trie {
- struct node *trie;
+ struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
- int size;
- unsigned int revision;
};
-static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
-static struct node *resize(struct trie *t, struct tnode *tn);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+ int wasfull);
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
-static void tnode_free(struct tnode *tn);
+/* tnodes to free after resize(); protected by RTNL */
+static struct tnode *tnode_free_head;
+static size_t tnode_free_size;
+
+/*
+ * synchronize_rcu after call_rcu for that many pages; it should be especially
+ * useful before resizing the root node with PREEMPT_NONE configs; the value was
+ * obtained experimentally, aiming to avoid visible slowdown.
+ */
+static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
-static struct trie *trie_local = NULL, *trie_main = NULL;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;
-static inline struct tnode *node_parent(struct node *node)
+/*
+ * caller must hold RTNL
+ */
+static inline struct tnode *node_parent(const struct rt_trie_node *node)
+{
+ unsigned long parent;
+
+ parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
+
+ return (struct tnode *)(parent & ~NODE_TYPE_MASK);
+}
+
+/*
+ * caller must hold RCU read lock or RTNL
+ */
+static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
{
- struct tnode *ret;
+ unsigned long parent;
- ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
- return rcu_dereference(ret);
+ parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
+ lockdep_rtnl_is_held());
+
+ return (struct tnode *)(parent & ~NODE_TYPE_MASK);
}
-static inline void node_set_parent(struct node *node, struct tnode *ptr)
+/* Same as rcu_assign_pointer
+ * but that macro() assumes that value is a pointer.
+ */
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
- rcu_assign_pointer(node->parent,
- (unsigned long)ptr | NODE_TYPE(node));
+ smp_wmb();
+ node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
-/* rcu_read_lock needs to be hold by caller from readside */
+/*
+ * caller must hold RTNL
+ */
+static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
+{
+ BUG_ON(i >= 1U << tn->bits);
-static inline struct node *tnode_get_child(struct tnode *tn, int i)
+ return rtnl_dereference(tn->child[i]);
+}
+
+/*
+ * caller must hold RCU read lock or RTNL
+ */
+static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
- BUG_ON(i >= 1 << tn->bits);
+ BUG_ON(i >= 1U << tn->bits);
- return rcu_dereference(tn->child[i]);
+ return rcu_dereference_rtnl(tn->child[i]);
}
static inline int tnode_child_length(const struct tnode *tn)
@@ -194,12 +235,12 @@ static inline int tnode_child_length(const struct tnode *tn)
return 1 << tn->bits;
}
-static inline t_key mask_pfx(t_key k, unsigned short l)
+static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}
-static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
+static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -300,11 +341,10 @@ static inline void check_tnode(const struct tnode *tn)
WARN_ON(tn && tn->pos+tn->bits > 32);
}
-static int halve_threshold = 25;
-static int inflate_threshold = 50;
-static int halve_threshold_root = 8;
-static int inflate_threshold_root = 15;
-
+static const int halve_threshold = 25;
+static const int inflate_threshold = 50;
+static const int halve_threshold_root = 15;
+static const int inflate_threshold_root = 30;
static void __alias_free_mem(struct rcu_head *head)
{
@@ -319,57 +359,76 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
static void __leaf_free_rcu(struct rcu_head *head)
{
- kfree(container_of(head, struct leaf, rcu));
+ struct leaf *l = container_of(head, struct leaf, rcu);
+ kmem_cache_free(trie_leaf_kmem, l);
}
-static void __leaf_info_free_rcu(struct rcu_head *head)
+static inline void free_leaf(struct leaf *l)
{
- kfree(container_of(head, struct leaf_info, rcu));
+ call_rcu(&l->rcu, __leaf_free_rcu);
}
static inline void free_leaf_info(struct leaf_info *leaf)
{
- call_rcu(&leaf->rcu, __leaf_info_free_rcu);
+ kfree_rcu(leaf, rcu);
}
-static struct tnode *tnode_alloc(unsigned int size)
+static struct tnode *tnode_alloc(size_t size)
{
- struct page *pages;
-
if (size <= PAGE_SIZE)
- return kcalloc(size, 1, GFP_KERNEL);
-
- pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
- if (!pages)
- return NULL;
-
- return page_address(pages);
+ return kzalloc(size, GFP_KERNEL);
+ else
+ return vzalloc(size);
}
static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
- unsigned int size = sizeof(struct tnode) +
- (1 << tn->bits) * sizeof(struct node *);
+ size_t size = sizeof(struct tnode) +
+ (sizeof(struct rt_trie_node *) << tn->bits);
if (size <= PAGE_SIZE)
kfree(tn);
else
- free_pages((unsigned long)tn, get_order(size));
+ vfree(tn);
}
static inline void tnode_free(struct tnode *tn)
{
- if (IS_LEAF(tn)) {
- struct leaf *l = (struct leaf *) tn;
- call_rcu_bh(&l->rcu, __leaf_free_rcu);
- } else
+ if (IS_LEAF(tn))
+ free_leaf((struct leaf *) tn);
+ else
call_rcu(&tn->rcu, __tnode_free_rcu);
}
+static void tnode_free_safe(struct tnode *tn)
+{
+ BUG_ON(IS_LEAF(tn));
+ tn->tnode_free = tnode_free_head;
+ tnode_free_head = tn;
+ tnode_free_size += sizeof(struct tnode) +
+ (sizeof(struct rt_trie_node *) << tn->bits);
+}
+
+static void tnode_free_flush(void)
+{
+ struct tnode *tn;
+
+ while ((tn = tnode_free_head)) {
+ tnode_free_head = tn->tnode_free;
+ tn->tnode_free = NULL;
+ tnode_free(tn);
+ }
+
+ if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+ tnode_free_size = 0;
+ synchronize_rcu();
+ }
+}
+
static struct leaf *leaf_new(void)
{
- struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+ struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
l->parent = T_LEAF;
INIT_HLIST_HEAD(&l->list);
@@ -382,19 +441,18 @@ static struct leaf_info *leaf_info_new(int plen)
struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
if (li) {
li->plen = plen;
+ li->mask_plen = ntohl(inet_make_mask(plen));
INIT_LIST_HEAD(&li->falh);
}
return li;
}
-static struct tnode* tnode_new(t_key key, int pos, int bits)
+static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- int nchildren = 1<<bits;
- int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
+ size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
if (tn) {
- memset(tn, 0, sz);
tn->parent = T_TNODE;
tn->pos = pos;
tn->bits = bits;
@@ -403,8 +461,8 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
tn->empty_children = 1<<bits;
}
- pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
- (unsigned int) (sizeof(struct node) * 1<<bits));
+ pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
+ sizeof(struct rt_trie_node *) << bits);
return tn;
}
@@ -413,7 +471,7 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
@@ -421,7 +479,8 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}
-static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
+static inline void put_child(struct tnode *tn, int i,
+ struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -431,14 +490,14 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+ int wasfull)
{
- struct node *chi = tn->child[i];
+ struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
int isfull;
BUG_ON(i >= 1<<tn->bits);
-
/* update emptyChildren */
if (n == NULL && chi != NULL)
tn->empty_children++;
@@ -461,14 +520,14 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
rcu_assign_pointer(tn->child[i], n);
}
-static struct node *resize(struct trie *t, struct tnode *tn)
+#define MAX_WORK 10
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
int i;
- int err = 0;
struct tnode *old_tn;
int inflate_threshold_use;
int halve_threshold_use;
- int max_resize;
+ int max_work;
if (!tn)
return NULL;
@@ -478,23 +537,12 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* No children */
if (tn->empty_children == tnode_child_length(tn)) {
- tnode_free(tn);
+ tnode_free_safe(tn);
return NULL;
}
/* One child */
if (tn->empty_children == tnode_child_length(tn) - 1)
- for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
-
- n = tn->child[i];
- if (!n)
- continue;
-
- /* compress one level */
- node_set_parent(n, NULL);
- tnode_free(tn);
- return n;
- }
+ goto one_child;
/*
* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
@@ -563,19 +611,23 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Keep root node larger */
- if (!tn->parent)
+ if (!node_parent((struct rt_trie_node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
- else
+ halve_threshold_use = halve_threshold_root;
+ } else {
inflate_threshold_use = inflate_threshold;
+ halve_threshold_use = halve_threshold;
+ }
- err = 0;
- max_resize = 10;
- while ((tn->full_children > 0 && max_resize-- &&
- 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
- inflate_threshold_use * tnode_child_length(tn))) {
+ max_work = MAX_WORK;
+ while ((tn->full_children > 0 && max_work-- &&
+ 50 * (tn->full_children + tnode_child_length(tn)
+ - tn->empty_children)
+ >= inflate_threshold_use * tnode_child_length(tn))) {
old_tn = tn;
tn = inflate(t, tn);
+
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -585,33 +637,19 @@ static struct node *resize(struct trie *t, struct tnode *tn)
}
}
- if (max_resize < 0) {
- if (!tn->parent)
- printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
- inflate_threshold_root, tn->bits);
- else
- printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
- inflate_threshold, tn->bits);
- }
-
check_tnode(tn);
+ /* Return if at least one inflate is run */
+ if (max_work != MAX_WORK)
+ return (struct rt_trie_node *) tn;
+
/*
* Halve as long as the number of empty children in this
* node is above threshold.
*/
-
- /* Keep root node larger */
-
- if (!tn->parent)
- halve_threshold_use = halve_threshold_root;
- else
- halve_threshold_use = halve_threshold;
-
- err = 0;
- max_resize = 10;
- while (tn->bits > 1 && max_resize-- &&
+ max_work = MAX_WORK;
+ while (tn->bits > 1 && max_work-- &&
100 * (tnode_child_length(tn) - tn->empty_children) <
halve_threshold_use * tnode_child_length(tn)) {
@@ -626,37 +664,43 @@ static struct node *resize(struct trie *t, struct tnode *tn)
}
}
- if (max_resize < 0) {
- if (!tn->parent)
- printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
- halve_threshold_root, tn->bits);
- else
- printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
- halve_threshold, tn->bits);
- }
/* Only one child remains */
- if (tn->empty_children == tnode_child_length(tn) - 1)
+ if (tn->empty_children == tnode_child_length(tn) - 1) {
+one_child:
for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
+ struct rt_trie_node *n;
- n = tn->child[i];
+ n = rtnl_dereference(tn->child[i]);
if (!n)
continue;
/* compress one level */
node_set_parent(n, NULL);
- tnode_free(tn);
+ tnode_free_safe(tn);
return n;
}
+ }
+ return (struct rt_trie_node *) tn;
+}
+
+
+static void tnode_clean_free(struct tnode *tn)
+{
+ int i;
+ struct tnode *tofree;
- return (struct node *) tn;
+ for (i = 0; i < tnode_child_length(tn); i++) {
+ tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
+ if (tofree)
+ tnode_free(tofree);
+ }
+ tnode_free(tn);
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
- struct tnode *inode;
struct tnode *oldtnode = tn;
int olen = tnode_child_length(tn);
int i;
@@ -676,8 +720,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
*/
for (i = 0; i < olen; i++) {
- struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+ struct tnode *inode;
+ inode = (struct tnode *) tnode_get_child(oldtnode, i);
if (inode &&
IS_TNODE(inode) &&
inode->pos == oldtnode->pos + oldtnode->bits &&
@@ -698,13 +743,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
- put_child(t, tn, 2*i, (struct node *) left);
- put_child(t, tn, 2*i+1, (struct node *) right);
+ put_child(tn, 2*i, (struct rt_trie_node *) left);
+ put_child(tn, 2*i+1, (struct rt_trie_node *) right);
}
}
for (i = 0; i < olen; i++) {
- struct node *node = tnode_get_child(oldtnode, i);
+ struct tnode *inode;
+ struct rt_trie_node *node = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
@@ -716,11 +762,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if (IS_LEAF(node) || ((struct tnode *) node)->pos >
tn->pos + tn->bits - 1) {
- if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
- 1) == 0)
- put_child(t, tn, 2*i, node);
- else
- put_child(t, tn, 2*i+1, node);
+ put_child(tn,
+ tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
+ node);
continue;
}
@@ -728,10 +772,10 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
inode = (struct tnode *) node;
if (inode->bits == 1) {
- put_child(t, tn, 2*i, inode->child[0]);
- put_child(t, tn, 2*i+1, inode->child[1]);
+ put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
+ put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
- tnode_free(inode);
+ tnode_free_safe(inode);
continue;
}
@@ -759,46 +803,36 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
*/
left = (struct tnode *) tnode_get_child(tn, 2*i);
- put_child(t, tn, 2*i, NULL);
+ put_child(tn, 2*i, NULL);
BUG_ON(!left);
right = (struct tnode *) tnode_get_child(tn, 2*i+1);
- put_child(t, tn, 2*i+1, NULL);
+ put_child(tn, 2*i+1, NULL);
BUG_ON(!right);
size = tnode_child_length(left);
for (j = 0; j < size; j++) {
- put_child(t, left, j, inode->child[j]);
- put_child(t, right, j, inode->child[j + size]);
+ put_child(left, j, rtnl_dereference(inode->child[j]));
+ put_child(right, j, rtnl_dereference(inode->child[j + size]));
}
- put_child(t, tn, 2*i, resize(t, left));
- put_child(t, tn, 2*i+1, resize(t, right));
+ put_child(tn, 2*i, resize(t, left));
+ put_child(tn, 2*i+1, resize(t, right));
- tnode_free(inode);
+ tnode_free_safe(inode);
}
- tnode_free(oldtnode);
+ tnode_free_safe(oldtnode);
return tn;
nomem:
- {
- int size = tnode_child_length(tn);
- int j;
-
- for (j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
-
- tnode_free(tn);
-
- return ERR_PTR(-ENOMEM);
- }
+ tnode_clean_free(tn);
+ return ERR_PTR(-ENOMEM);
}
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
- struct node *left, *right;
+ struct rt_trie_node *left, *right;
int i;
int olen = tnode_child_length(tn);
@@ -829,7 +863,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
- put_child(t, tn, i/2, (struct node *)newn);
+ put_child(tn, i/2, (struct rt_trie_node *)newn);
}
}
@@ -844,50 +878,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (left == NULL) {
if (right == NULL) /* Both are empty */
continue;
- put_child(t, tn, i/2, right);
+ put_child(tn, i/2, right);
continue;
}
if (right == NULL) {
- put_child(t, tn, i/2, left);
+ put_child(tn, i/2, left);
continue;
}
/* Two nonempty children */
newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
- put_child(t, tn, i/2, NULL);
- put_child(t, newBinNode, 0, left);
- put_child(t, newBinNode, 1, right);
- put_child(t, tn, i/2, resize(t, newBinNode));
+ put_child(tn, i/2, NULL);
+ put_child(newBinNode, 0, left);
+ put_child(newBinNode, 1, right);
+ put_child(tn, i/2, resize(t, newBinNode));
}
- tnode_free(oldtnode);
+ tnode_free_safe(oldtnode);
return tn;
nomem:
- {
- int size = tnode_child_length(tn);
- int j;
-
- for (j = 0; j < size; j++)
- if (tn->child[j])
- tnode_free((struct tnode *)tn->child[j]);
-
- tnode_free(tn);
-
- return ERR_PTR(-ENOMEM);
- }
-}
-
-static void trie_init(struct trie *t)
-{
- if (!t)
- return;
-
- t->size = 0;
- rcu_assign_pointer(t->trie, NULL);
- t->revision = 0;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- memset(&t->stats, 0, sizeof(struct trie_use_stats));
-#endif
+ tnode_clean_free(tn);
+ return ERR_PTR(-ENOMEM);
}
/* readside must use rcu_read_lock currently dump routines
@@ -896,17 +907,16 @@ static void trie_init(struct trie *t)
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
struct hlist_head *head = &l->list;
- struct hlist_node *node;
struct leaf_info *li;
- hlist_for_each_entry_rcu(li, node, head, hlist)
+ hlist_for_each_entry_rcu(li, head, hlist)
if (li->plen == plen)
return li;
return NULL;
}
-static inline struct list_head * get_fa_head(struct leaf *l, int plen)
+static inline struct list_head *get_fa_head(struct leaf *l, int plen)
{
struct leaf_info *li = find_leaf_info(l, plen);
@@ -919,12 +929,11 @@ static inline struct list_head * get_fa_head(struct leaf *l, int plen)
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
struct leaf_info *li = NULL, *last = NULL;
- struct hlist_node *node;
if (hlist_empty(head)) {
hlist_add_head_rcu(&new->hlist, head);
} else {
- hlist_for_each_entry(li, node, head, hlist) {
+ hlist_for_each_entry(li, head, hlist) {
if (new->plen > li->plen)
break;
@@ -944,10 +953,10 @@ fib_find_node(struct trie *t, u32 key)
{
int pos;
struct tnode *tn;
- struct node *n;
+ struct rt_trie_node *n;
pos = 0;
- n = rcu_dereference(t->trie);
+ n = rcu_dereference_rtnl(t->trie);
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
@@ -956,7 +965,10 @@ fib_find_node(struct trie *t, u32 key)
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
pos = tn->pos + tn->bits;
- n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+ n = tnode_get_child_rcu(tn,
+ tkey_extract_bits(key,
+ tn->pos,
+ tn->bits));
} else
break;
}
@@ -968,19 +980,27 @@ fib_find_node(struct trie *t, u32 key)
return NULL;
}
-static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
+static void trie_rebalance(struct trie *t, struct tnode *tn)
{
int wasfull;
- t_key cindex, key = tn->key;
+ t_key cindex, key;
struct tnode *tp;
- while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
+ key = tn->key;
+
+ while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *) resize (t, (struct tnode *)tn);
- tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
+ tn = (struct tnode *)resize(t, tn);
+
+ tnode_put_child_reorg(tp, cindex,
+ (struct rt_trie_node *)tn, wasfull);
+
+ tp = node_parent((struct rt_trie_node *) tn);
+ if (!tp)
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
- tp = node_parent((struct node *) tn);
+ tnode_free_flush();
if (!tp)
break;
tn = tp;
@@ -988,19 +1008,19 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = (struct tnode*) resize(t, (struct tnode *)tn);
+ tn = (struct tnode *)resize(t, tn);
- return (struct node*) tn;
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+ tnode_free_flush();
}
/* only used from updater-side */
-static struct list_head *
-fib_insert_node(struct trie *t, int *err, u32 key, int plen)
+static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
- struct node *n;
+ struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
@@ -1008,7 +1028,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
t_key cindex;
pos = 0;
- n = t->trie;
+ n = rtnl_dereference(t->trie);
/* If we point to NULL, stop. Either the tree is empty and we should
* just put a new leaf in if, or we have reached an empty child slot,
@@ -1036,7 +1056,10 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
pos = tn->pos + tn->bits;
- n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+ n = tnode_get_child(tn,
+ tkey_extract_bits(key,
+ tn->pos,
+ tn->bits));
BUG_ON(n && node_parent(n) != tn);
} else
@@ -1054,34 +1077,27 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- struct leaf *l = (struct leaf *) n;
-
+ l = (struct leaf *) n;
li = leaf_info_new(plen);
- if (!li) {
- *err = -ENOMEM;
- goto err;
- }
+ if (!li)
+ return NULL;
fa_head = &li->falh;
insert_leaf_info(&l->list, li);
goto done;
}
- t->size++;
l = leaf_new();
- if (!l) {
- *err = -ENOMEM;
- goto err;
- }
+ if (!l)
+ return NULL;
l->key = key;
li = leaf_info_new(plen);
if (!li) {
- tnode_free((struct tnode *) l);
- *err = -ENOMEM;
- goto err;
+ free_leaf(l);
+ return NULL;
}
fa_head = &li->falh;
@@ -1090,10 +1106,10 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- node_set_parent((struct node *)l, tp);
+ node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ put_child(tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1101,12 +1117,8 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
* first tnode need some special handling
*/
- if (tp)
- pos = tp->pos+tp->bits;
- else
- pos = 0;
-
if (n) {
+ pos = tp ? tp->pos+tp->bits : 0;
newpos = tkey_mismatch(key, pos, n->key);
tn = tnode_new(n->key, newpos, 1);
} else {
@@ -1116,43 +1128,40 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
if (!tn) {
free_leaf_info(li);
- tnode_free((struct tnode *) l);
- *err = -ENOMEM;
- goto err;
+ free_leaf(l);
+ return NULL;
}
- node_set_parent((struct node *)tn, tp);
+ node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(t, tn, missbit, (struct node *)l);
- put_child(t, tn, 1-missbit, n);
+ put_child(tn, missbit, (struct rt_trie_node *)l);
+ put_child(tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
+ put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
- rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
}
}
if (tp && tp->pos + tp->bits > 32)
- printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
+ pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+ tp, tp->pos, tp->bits, key, plen);
/* Rebalance the trie */
- rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+ trie_rebalance(t, tp);
done:
- t->revision++;
-err:
return fa_head;
}
/*
* Caller must hold RTNL.
*/
-static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *new_fa;
@@ -1203,17 +1212,44 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
* and we need to allocate a new one of those as well.
*/
- if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_orig;
+ if (fa && fa->fa_tos == tos &&
+ fa->fa_info->fib_priority == fi->fib_priority) {
+ struct fib_alias *fa_first, *fa_match;
err = -EEXIST;
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
+ /* We have 2 goals:
+ * 1. Find exact match for type, scope, fib_info to avoid
+ * duplicate routes
+ * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+ */
+ fa_match = NULL;
+ fa_first = fa;
+ fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+ list_for_each_entry_continue(fa, fa_head, fa_list) {
+ if (fa->fa_tos != tos)
+ break;
+ if (fa->fa_info->fib_priority != fi->fib_priority)
+ break;
+ if (fa->fa_type == cfg->fc_type &&
+ fa->fa_info == fi) {
+ fa_match = fa;
+ break;
+ }
+ }
+
if (cfg->fc_nlflags & NLM_F_REPLACE) {
struct fib_info *fi_drop;
u8 state;
+ fa = fa_first;
+ if (fa_match) {
+ if (fa == fa_match)
+ err = 0;
+ goto out;
+ }
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (new_fa == NULL)
@@ -1223,16 +1259,15 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_tos = fa->fa_tos;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
state = fa->fa_state;
- new_fa->fa_state &= ~FA_S_ACCESSED;
+ new_fa->fa_state = state & ~FA_S_ACCESSED;
list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
@@ -1242,20 +1277,11 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
* uses the same scope, type, and nexthop
* information.
*/
- fa_orig = fa;
- list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
- goto out;
- }
- }
+ if (fa_match)
+ goto out;
+
if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_orig;
+ fa = fa_first;
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1269,23 +1295,26 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
new_fa->fa_state = 0;
/*
* Insert new entry to the list.
*/
if (!fa_head) {
- err = 0;
- fa_head = fib_insert_node(t, &err, key, plen);
- if (err)
+ fa_head = fib_insert_node(t, key, plen);
+ if (unlikely(!fa_head)) {
+ err = -ENOMEM;
goto out_free_new_fa;
+ }
}
+ if (!plen)
+ tb->tb_num_default++;
+
list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : fa_head));
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
succeeded:
@@ -1299,53 +1328,86 @@ err:
return err;
}
-
/* should be called with rcu_read_lock */
-static inline int check_leaf(struct trie *t, struct leaf *l,
- t_key key, int *plen, const struct flowi *flp,
- struct fib_result *res)
+static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
+ t_key key, const struct flowi4 *flp,
+ struct fib_result *res, int fib_flags)
{
- int err, i;
- __be32 mask;
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
- struct hlist_node *node;
- hlist_for_each_entry_rcu(li, node, hhead, hlist) {
- i = li->plen;
- mask = inet_make_mask(i);
- if (l->key != (key & ntohl(mask)))
+ hlist_for_each_entry_rcu(li, hhead, hlist) {
+ struct fib_alias *fa;
+
+ if (l->key != (key & li->mask_plen))
continue;
- if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
- *plen = i;
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
+
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.semantic_match_passed++;
+#endif
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_passed++;
+ t->stats.semantic_match_passed++;
#endif
- return err;
+ res->prefixlen = li->plen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fa->fa_info->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &li->falh;
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+ return 0;
+ }
}
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats.semantic_match_miss++;
#endif
}
+
return 1;
}
-static int
-fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
+ struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
- int plen, ret = 0;
- struct node *n;
+ int ret;
+ struct rt_trie_node *n;
struct tnode *pn;
- int pos, bits;
- t_key key = ntohl(flp->fl4_dst);
- int chopped_off;
+ unsigned int pos, bits;
+ t_key key = ntohl(flp->daddr);
+ unsigned int chopped_off;
t_key cindex = 0;
- int current_prefix_length = KEYLENGTH;
+ unsigned int current_prefix_length = KEYLENGTH;
struct tnode *cn;
- t_key node_prefix, key_prefix, pref_mismatch;
- int mp;
+ t_key pref_mismatch;
rcu_read_lock();
@@ -1359,10 +1421,10 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
/* Just a leaf? */
if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- goto failed;
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+ goto found;
}
+
pn = (struct tnode *) n;
chopped_off = 0;
@@ -1374,7 +1436,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
pos, bits);
- n = tnode_get_child(pn, cindex);
+ n = tnode_get_child_rcu(pn, cindex);
if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -1384,14 +1446,12 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
}
if (IS_LEAF(n)) {
- if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
- goto found;
- else
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+ if (ret > 0)
goto backtrace;
+ goto found;
}
-#define HL_OPTIMIZE
-#ifdef HL_OPTIMIZE
cn = (struct tnode *)n;
/*
@@ -1420,12 +1480,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
* *are* zero.
*/
- /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+ /* NOTA BENE: Checking only skipped bits
+ for the new node here */
if (current_prefix_length < pos+bits) {
if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length) != 0 ||
- !(cn->child[0]))
+ cn->pos - current_prefix_length)
+ || !(cn->child[0]))
goto backtrace;
}
@@ -1448,38 +1509,37 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
* new tnode's key.
*/
- /* Note: We aren't very concerned about the piece of the key
- * that precede pn->pos+pn->bits, since these have already been
- * checked. The bits after cn->pos aren't checked since these are
- * by definition "unknown" at this point. Thus, what we want to
- * see is if we are about to enter the "prefix matching" state,
- * and in that case verify that the skipped bits that will prevail
- * throughout this subtree are zero, as they have to be if we are
- * to find a matching prefix.
+ /*
+ * Note: We aren't very concerned about the piece of
+ * the key that precede pn->pos+pn->bits, since these
+ * have already been checked. The bits after cn->pos
+ * aren't checked since these are by definition
+ * "unknown" at this point. Thus, what we want to see
+ * is if we are about to enter the "prefix matching"
+ * state, and in that case verify that the skipped
+ * bits that will prevail throughout this subtree are
+ * zero, as they have to be if we are to find a
+ * matching prefix.
*/
- node_prefix = mask_pfx(cn->key, cn->pos);
- key_prefix = mask_pfx(key, cn->pos);
- pref_mismatch = key_prefix^node_prefix;
- mp = 0;
+ pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
- /* In short: If skipped bits in this node do not match the search
- * key, enter the "prefix matching" state.directly.
+ /*
+ * In short: If skipped bits in this node do not match
+ * the search key, enter the "prefix matching"
+ * state.directly.
*/
if (pref_mismatch) {
- while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
- mp++;
- pref_mismatch = pref_mismatch <<1;
- }
- key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+ /* fls(x) = __fls(x) + 1 */
+ int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
- if (key_prefix != 0)
+ if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
goto backtrace;
if (current_prefix_length >= cn->pos)
current_prefix_length = mp;
}
-#endif
+
pn = (struct tnode *)n; /* Descend */
chopped_off = 0;
continue;
@@ -1488,12 +1548,14 @@ backtrace:
chopped_off++;
/* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
+ while ((chopped_off <= pn->bits)
+ && !(cindex & (1<<(chopped_off-1))))
chopped_off++;
/* Decrease current_... with bits chopped off */
if (current_prefix_length > pn->pos + pn->bits - chopped_off)
- current_prefix_length = pn->pos + pn->bits - chopped_off;
+ current_prefix_length = pn->pos + pn->bits
+ - chopped_off;
/*
* Either we do the actual chop off according or if we have
@@ -1503,7 +1565,7 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- struct tnode *parent = node_parent((struct node *) pn);
+ struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
if (!parent)
goto failed;
@@ -1524,59 +1586,31 @@ found:
rcu_read_unlock();
return ret;
}
+EXPORT_SYMBOL_GPL(fib_table_lookup);
-/* only called from updater side */
-static int trie_leaf_remove(struct trie *t, t_key key)
+/*
+ * Remove the leaf and return parent.
+ */
+static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- t_key cindex;
- struct tnode *tp = NULL;
- struct node *n = t->trie;
- struct leaf *l;
+ struct tnode *tp = node_parent((struct rt_trie_node *) l);
- pr_debug("entering trie_leaf_remove(%p)\n", n);
-
- /* Note that in the case skipped bits, those bits are *not* checked!
- * When we finish this, we will have NULL or a T_LEAF, and the
- * T_LEAF may or may not match our key.
- */
-
- while (n != NULL && IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *) n;
- check_tnode(tn);
- n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
-
- BUG_ON(n && node_parent(n) != tn);
- }
- l = (struct leaf *) n;
-
- if (!n || !tkey_equals(l->key, key))
- return 0;
-
- /*
- * Key found.
- * Remove the leaf and rebalance the tree
- */
-
- t->revision++;
- t->size--;
-
- tp = node_parent(n);
- tnode_free((struct tnode *) n);
+ pr_debug("entering trie_leaf_remove(%p)\n", l);
if (tp) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, NULL);
- rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+ t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
+ put_child(tp, cindex, NULL);
+ trie_rebalance(t, tp);
} else
- rcu_assign_pointer(t->trie, NULL);
+ RCU_INIT_POINTER(t->trie, NULL);
- return 1;
+ free_leaf(l);
}
/*
* Caller must hold RTNL.
*/
-static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
u32 key, mask;
@@ -1602,7 +1636,12 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
if (!l)
return -ESRCH;
- fa_head = get_fa_head(l, plen);
+ li = find_leaf_info(l, plen);
+
+ if (!li)
+ return -ESRCH;
+
+ fa_head = &li->falh;
fa = fib_find_alias(fa_head, tos, 0);
if (!fa)
@@ -1611,9 +1650,8 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
fa_to_delete = NULL;
- fa_head = fa->fa_list.prev;
-
- list_for_each_entry(fa, fa_head, fa_list) {
+ fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+ list_for_each_entry_continue(fa, fa_head, fa_list) {
struct fib_info *fi = fa->fa_info;
if (fa->fa_tos != tos)
@@ -1621,7 +1659,9 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
(cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
+ fa->fa_info->fib_scope == cfg->fc_scope) &&
+ (!cfg->fc_prefsrc ||
+ fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
fib_nh_match(cfg, fi) == 0) {
@@ -1637,28 +1677,28 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
- l = fib_find_node(t, key);
- li = find_leaf_info(l, plen);
-
list_del_rcu(&fa->fa_list);
+ if (!plen)
+ tb->tb_num_default--;
+
if (list_empty(fa_head)) {
hlist_del_rcu(&li->hlist);
free_leaf_info(li);
}
if (hlist_empty(&l->list))
- trie_leaf_remove(t, key);
+ trie_leaf_remove(t, l);
if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(-1);
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
return 0;
}
-static int trie_flush_list(struct trie *t, struct list_head *head)
+static int trie_flush_list(struct list_head *head)
{
struct fib_alias *fa, *fa_node;
int found = 0;
@@ -1676,15 +1716,15 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
return found;
}
-static int trie_flush_leaf(struct trie *t, struct leaf *l)
+static int trie_flush_leaf(struct leaf *l)
{
int found = 0;
struct hlist_head *lih = &l->list;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
struct leaf_info *li = NULL;
- hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
- found += trie_flush_list(t, &li->falh);
+ hlist_for_each_entry_safe(li, tmp, lih, hlist) {
+ found += trie_flush_list(&li->falh);
if (list_empty(&li->falh)) {
hlist_del_rcu(&li->hlist);
@@ -1694,185 +1734,113 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
return found;
}
-/* rcu_read_lock needs to be hold by caller from readside */
-
-static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+/*
+ * Scan for the next right leaf starting at node p->child[idx]
+ * Since we have back pointer, no recursion necessary.
+ */
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
- struct node *c = (struct node *) thisleaf;
- struct tnode *p;
- int idx;
- struct node *trie = rcu_dereference(t->trie);
+ do {
+ t_key idx;
- if (c == NULL) {
- if (trie == NULL)
- return NULL;
+ if (c)
+ idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
+ else
+ idx = 0;
- if (IS_LEAF(trie)) /* trie w. just a leaf */
- return (struct leaf *) trie;
+ while (idx < 1u << p->bits) {
+ c = tnode_get_child_rcu(p, idx++);
+ if (!c)
+ continue;
- p = (struct tnode*) trie; /* Start */
- } else
- p = node_parent(c);
+ if (IS_LEAF(c))
+ return (struct leaf *) c;
- while (p) {
- int pos, last;
+ /* Rescan start scanning in new node */
+ p = (struct tnode *) c;
+ idx = 0;
+ }
- /* Find the next child of the parent */
- if (c)
- pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
- else
- pos = 0;
+ /* Node empty, walk back up to parent */
+ c = (struct rt_trie_node *) p;
+ } while ((p = node_parent_rcu(c)) != NULL);
- last = 1 << p->bits;
- for (idx = pos; idx < last ; idx++) {
- c = rcu_dereference(p->child[idx]);
+ return NULL; /* Root of trie */
+}
- if (!c)
- continue;
+static struct leaf *trie_firstleaf(struct trie *t)
+{
+ struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
- /* Decend if tnode */
- while (IS_TNODE(c)) {
- p = (struct tnode *) c;
- idx = 0;
+ if (!n)
+ return NULL;
- /* Rightmost non-NULL branch */
- if (p && IS_TNODE(p))
- while (!(c = rcu_dereference(p->child[idx]))
- && idx < (1<<p->bits)) idx++;
+ if (IS_LEAF(n)) /* trie is just a leaf */
+ return (struct leaf *) n;
- /* Done with this tnode? */
- if (idx >= (1 << p->bits) || !c)
- goto up;
- }
- return (struct leaf *) c;
- }
-up:
- /* No more children go up one step */
- c = (struct node *) p;
- p = node_parent(c);
- }
- return NULL; /* Ready. Root of trie */
+ return leaf_walk_rcu(n, NULL);
+}
+
+static struct leaf *trie_nextleaf(struct leaf *l)
+{
+ struct rt_trie_node *c = (struct rt_trie_node *) l;
+ struct tnode *p = node_parent_rcu(c);
+
+ if (!p)
+ return NULL; /* trie with just one leaf */
+
+ return leaf_walk_rcu(p, c);
}
+static struct leaf *trie_leafindex(struct trie *t, int index)
+{
+ struct leaf *l = trie_firstleaf(t);
+
+ while (l && index-- > 0)
+ l = trie_nextleaf(l);
+
+ return l;
+}
+
+
/*
* Caller must hold RTNL.
*/
-static int fn_trie_flush(struct fib_table *tb)
+int fib_table_flush(struct fib_table *tb)
{
struct trie *t = (struct trie *) tb->tb_data;
- struct leaf *ll = NULL, *l = NULL;
- int found = 0, h;
-
- t->revision++;
+ struct leaf *l, *ll = NULL;
+ int found = 0;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
- found += trie_flush_leaf(t, l);
+ for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
+ found += trie_flush_leaf(l);
if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
+ trie_leaf_remove(t, ll);
ll = l;
}
if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll->key);
+ trie_leaf_remove(t, ll);
pr_debug("trie_flush found=%d\n", found);
return found;
}
-static int trie_last_dflt = -1;
-
-static void
-fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+void fib_free_table(struct fib_table *tb)
{
- struct trie *t = (struct trie *) tb->tb_data;
- int order, last_idx;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fib_alias *fa = NULL;
- struct list_head *fa_head;
- struct leaf *l;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
-
- l = fib_find_node(t, 0);
- if (!l)
- goto out;
-
- fa_head = get_fa_head(l, 0);
- if (!fa_head)
- goto out;
-
- if (list_empty(fa_head))
- goto out;
-
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
- fa->fa_state |= FA_S_ACCESSED;
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, &trie_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- trie_last_dflt = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- if (order <= 0 || fi == NULL) {
- trie_last_dflt = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = fi;
- atomic_inc(&fi->fib_clntref);
- trie_last_dflt = order;
- goto out;
- }
- if (last_idx >= 0) {
- if (res->fi)
- fib_info_put(res->fi);
- res->fi = last_resort;
- if (last_resort)
- atomic_inc(&last_resort->fib_clntref);
- }
- trie_last_dflt = last_idx;
- out:;
- rcu_read_unlock();
+ kfree(tb);
}
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
+static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
+ struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
int i, s_i;
struct fib_alias *fa;
-
__be32 xkey = htonl(key);
- s_i = cb->args[4];
+ s_i = cb->args[5];
i = 0;
/* rcu_read_lock is hold by caller */
@@ -1882,130 +1850,129 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
i++;
continue;
}
- BUG_ON(!fa->fa_info);
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWROUTE,
tb->tb_id,
fa->fa_type,
- fa->fa_scope,
xkey,
plen,
fa->fa_tos,
- fa->fa_info, 0) < 0) {
- cb->args[4] = i;
+ fa->fa_info, NLM_F_MULTI) < 0) {
+ cb->args[5] = i;
return -1;
}
i++;
}
- cb->args[4] = i;
+ cb->args[5] = i;
return skb->len;
}
-static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
{
- int h, s_h;
- struct list_head *fa_head;
- struct leaf *l = NULL;
+ struct leaf_info *li;
+ int i, s_i;
- s_h = cb->args[3];
+ s_i = cb->args[4];
+ i = 0;
- for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
- if (h < s_h)
+ /* rcu_read_lock is hold by caller */
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
+ if (i < s_i) {
+ i++;
continue;
- if (h > s_h)
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
-
- fa_head = get_fa_head(l, plen);
+ }
- if (!fa_head)
- continue;
+ if (i > s_i)
+ cb->args[5] = 0;
- if (list_empty(fa_head))
+ if (list_empty(&li->falh))
continue;
- if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
- cb->args[3] = h;
+ if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
+ cb->args[4] = i;
return -1;
}
+ i++;
}
- cb->args[3] = h;
+
+ cb->args[4] = i;
return skb->len;
}
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
- int m, s_m;
+ struct leaf *l;
struct trie *t = (struct trie *) tb->tb_data;
-
- s_m = cb->args[2];
+ t_key key = cb->args[2];
+ int count = cb->args[3];
rcu_read_lock();
- for (m = 0; m <= 32; m++) {
- if (m < s_m)
- continue;
- if (m > s_m)
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
+ /* Dump starting at last key.
+ * Note: 0.0.0.0/0 (ie default) is first key.
+ */
+ if (count == 0)
+ l = trie_firstleaf(t);
+ else {
+ /* Normally, continue from last key, but if that is missing
+ * fallback to using slow rescan
+ */
+ l = fib_find_node(t, key);
+ if (!l)
+ l = trie_leafindex(t, count);
+ }
- if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
- cb->args[2] = m;
- goto out;
+ while (l) {
+ cb->args[2] = l->key;
+ if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ cb->args[3] = count;
+ rcu_read_unlock();
+ return -1;
}
+
+ ++count;
+ l = trie_nextleaf(l);
+ memset(&cb->args[4], 0,
+ sizeof(cb->args) - 4*sizeof(cb->args[0]));
}
+ cb->args[3] = count;
rcu_read_unlock();
- cb->args[2] = m;
+
return skb->len;
-out:
- rcu_read_unlock();
- return -1;
}
-/* Fix more generic FIB names for init later */
+void __init fib_trie_init(void)
+{
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_PANIC, NULL);
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(u32 id)
-#else
-struct fib_table * __init fib_hash_init(u32 id)
-#endif
+ trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+ max(sizeof(struct leaf),
+ sizeof(struct leaf_info)),
+ 0, SLAB_PANIC, NULL);
+}
+
+
+struct fib_table *fib_trie_table(u32 id)
{
struct fib_table *tb;
struct trie *t;
- if (fn_alias_kmem == NULL)
- fn_alias_kmem = kmem_cache_create("ip_fib_alias",
- sizeof(struct fib_alias),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
-
tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
GFP_KERNEL);
if (tb == NULL)
return NULL;
tb->tb_id = id;
- tb->tb_lookup = fn_trie_lookup;
- tb->tb_insert = fn_trie_insert;
- tb->tb_delete = fn_trie_delete;
- tb->tb_flush = fn_trie_flush;
- tb->tb_select_default = fn_trie_select_default;
- tb->tb_dump = fn_trie_dump;
- memset(tb->tb_data, 0, sizeof(struct trie));
+ tb->tb_default = -1;
+ tb->tb_num_default = 0;
t = (struct trie *) tb->tb_data;
-
- trie_init(t);
-
- if (id == RT_TABLE_LOCAL)
- trie_local = t;
- else if (id == RT_TABLE_MAIN)
- trie_main = t;
-
- if (id == RT_TABLE_LOCAL)
- printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
+ memset(t, 0, sizeof(*t));
return tb;
}
@@ -2013,16 +1980,17 @@ struct fib_table * __init fib_hash_init(u32 id)
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
+ struct seq_net_private p;
+ struct fib_table *tb;
struct tnode *tnode;
- struct trie *trie;
- unsigned index;
- unsigned depth;
+ unsigned int index;
+ unsigned int depth;
};
-static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
- unsigned cindex = iter->index;
+ unsigned int cindex = iter->index;
struct tnode *p;
/* A single entry routing table */
@@ -2033,7 +2001,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct node *n = tnode_get_child(tn, cindex);
+ struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -2052,7 +2020,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent((struct node *)tn);
+ p = node_parent_rcu((struct rt_trie_node *)tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2064,51 +2032,51 @@ rescan:
return NULL;
}
-static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct node *n ;
+ struct rt_trie_node *n;
if (!t)
return NULL;
n = rcu_dereference(t->trie);
-
- if (!iter)
+ if (!n)
return NULL;
- if (n) {
- if (IS_TNODE(n)) {
- iter->tnode = (struct tnode *) n;
- iter->trie = t;
- iter->index = 0;
- iter->depth = 1;
- } else {
- iter->tnode = NULL;
- iter->trie = t;
- iter->index = 0;
- iter->depth = 0;
- }
- return n;
+ if (IS_TNODE(n)) {
+ iter->tnode = (struct tnode *) n;
+ iter->index = 0;
+ iter->depth = 1;
+ } else {
+ iter->tnode = NULL;
+ iter->index = 0;
+ iter->depth = 0;
}
- return NULL;
+
+ return n;
}
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct node *n;
+ struct rt_trie_node *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
rcu_read_lock();
- for (n = fib_trie_get_first(&iter, t); n;
- n = fib_trie_get_next(&iter)) {
+ for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
+ struct leaf *l = (struct leaf *)n;
+ struct leaf_info *li;
+
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
+
+ hlist_for_each_entry_rcu(li, &l->list, hlist)
+ ++s->prefixes;
} else {
const struct tnode *tn = (const struct tnode *) n;
int i;
@@ -2130,20 +2098,24 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
*/
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
- unsigned i, max, pointers, bytes, avdepth;
+ unsigned int i, max, pointers, bytes, avdepth;
if (stat->leaves)
avdepth = stat->totdepth*100 / stat->leaves;
else
avdepth = 0;
- seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+ seq_printf(seq, "\tAver depth: %u.%02d\n",
+ avdepth / 100, avdepth % 100);
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
-
bytes = sizeof(struct leaf) * stat->leaves;
- seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
+
+ seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
+ bytes += sizeof(struct leaf_info) * stat->prefixes;
+
+ seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
bytes += sizeof(struct tnode) * stat->tnodes;
max = MAX_STAT_DEPTH;
@@ -2151,62 +2123,84 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
max--;
pointers = 0;
- for (i = 1; i <= max; i++)
+ for (i = 1; i < max; i++)
if (stat->nodesizes[i] != 0) {
- seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
+ seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
pointers += (1<<i) * stat->nodesizes[i];
}
seq_putc(seq, '\n');
- seq_printf(seq, "\tPointers: %d\n", pointers);
+ seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct node *) * pointers;
- seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
- seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
+ bytes += sizeof(struct rt_trie_node *) * pointers;
+ seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
+}
#ifdef CONFIG_IP_FIB_TRIE_STATS
- seq_printf(seq, "Counters:\n---------\n");
- seq_printf(seq,"gets = %d\n", t->stats.gets);
- seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
- seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
- seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
- seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
- seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
-#ifdef CLEAR_STATS
- memset(&(t->stats), 0, sizeof(t->stats));
-#endif
+static void trie_show_usage(struct seq_file *seq,
+ const struct trie_use_stats *stats)
+{
+ seq_printf(seq, "\nCounters:\n---------\n");
+ seq_printf(seq, "gets = %u\n", stats->gets);
+ seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+ seq_printf(seq, "semantic match passed = %u\n",
+ stats->semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n",
+ stats->semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n",
+ stats->resize_node_skipped);
+}
#endif /* CONFIG_IP_FIB_TRIE_STATS */
+
+static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
+{
+ if (tb->tb_id == RT_TABLE_LOCAL)
+ seq_puts(seq, "Local:\n");
+ else if (tb->tb_id == RT_TABLE_MAIN)
+ seq_puts(seq, "Main:\n");
+ else
+ seq_printf(seq, "Id %d:\n", tb->tb_id);
}
+
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
- struct trie_stat *stat;
-
- stat = kmalloc(sizeof(*stat), GFP_KERNEL);
- if (!stat)
- return -ENOMEM;
+ struct net *net = (struct net *)seq->private;
+ unsigned int h;
- seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+ seq_printf(seq,
+ "Basic info: size of leaf:"
+ " %Zd bytes, size of tnode: %Zd bytes.\n",
sizeof(struct leaf), sizeof(struct tnode));
- if (trie_local) {
- seq_printf(seq, "Local:\n");
- trie_collect_stats(trie_local, stat);
- trie_show_stats(seq, stat);
- }
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct trie_stat stat;
+
+ if (!t)
+ continue;
- if (trie_main) {
- seq_printf(seq, "Main:\n");
- trie_collect_stats(trie_main, stat);
- trie_show_stats(seq, stat);
+ fib_table_print(seq, tb);
+
+ trie_collect_stats(t, &stat);
+ trie_show_stats(seq, &stat);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ trie_show_usage(seq, &t->stats);
+#endif
+ }
}
- kfree(stat);
return 0;
}
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, fib_triestat_seq_show, NULL);
+ return single_open_net(inode, file, fib_triestat_seq_show);
}
static const struct file_operations fib_triestat_fops = {
@@ -2214,72 +2208,97 @@ static const struct file_operations fib_triestat_fops = {
.open = fib_triestat_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = single_release_net,
};
-static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
- loff_t pos)
+static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
+ struct fib_trie_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
loff_t idx = 0;
- struct node *n;
+ unsigned int h;
- for (n = fib_trie_get_first(iter, trie_local);
- n; ++idx, n = fib_trie_get_next(iter)) {
- if (pos == idx)
- return n;
- }
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
- for (n = fib_trie_get_first(iter, trie_main);
- n; ++idx, n = fib_trie_get_next(iter)) {
- if (pos == idx)
- return n;
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct rt_trie_node *n;
+
+ for (n = fib_trie_get_first(iter,
+ (struct trie *) tb->tb_data);
+ n; n = fib_trie_get_next(iter))
+ if (pos == idx++) {
+ iter->tb = tb;
+ return n;
+ }
+ }
}
+
return NULL;
}
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
rcu_read_lock();
- if (*pos == 0)
- return SEQ_START_TOKEN;
- return fib_trie_get_idx(seq->private, *pos - 1);
+ return fib_trie_get_idx(seq, *pos);
}
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_trie_iter *iter = seq->private;
- void *l = v;
+ struct net *net = seq_file_net(seq);
+ struct fib_table *tb = iter->tb;
+ struct hlist_node *tb_node;
+ unsigned int h;
+ struct rt_trie_node *n;
++*pos;
- if (v == SEQ_START_TOKEN)
- return fib_trie_get_idx(iter, 0);
-
- v = fib_trie_get_next(iter);
- BUG_ON(v == l);
- if (v)
- return v;
+ /* next node in same table */
+ n = fib_trie_get_next(iter);
+ if (n)
+ return n;
- /* continue scan in next trie */
- if (iter->trie == trie_local)
- return fib_trie_get_first(iter, trie_main);
+ /* walk rest of this hash chain */
+ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
+ while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
+ tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+ /* new hash chain */
+ while (++h < FIB_TABLE_HASHSZ) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+ }
return NULL;
+
+found:
+ iter->tb = tb;
+ return n;
}
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
rcu_read_unlock();
}
static void seq_indent(struct seq_file *seq, int n)
{
- while (n-- > 0) seq_puts(seq, " ");
+ while (n-- > 0)
+ seq_puts(seq, " ");
}
-static inline const char *rtn_scope(enum rt_scope_t s)
+static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
- static char buf[32];
-
switch (s) {
case RT_SCOPE_UNIVERSE: return "universe";
case RT_SCOPE_SITE: return "site";
@@ -2287,12 +2306,12 @@ static inline const char *rtn_scope(enum rt_scope_t s)
case RT_SCOPE_HOST: return "host";
case RT_SCOPE_NOWHERE: return "nowhere";
default:
- snprintf(buf, sizeof(buf), "scope=%d", s);
+ snprintf(buf, len, "scope=%d", s);
return buf;
}
}
-static const char *rtn_type_names[__RTN_MAX] = {
+static const char *const rtn_type_names[__RTN_MAX] = {
[RTN_UNSPEC] = "UNSPEC",
[RTN_UNICAST] = "UNICAST",
[RTN_LOCAL] = "LOCAL",
@@ -2307,13 +2326,11 @@ static const char *rtn_type_names[__RTN_MAX] = {
[RTN_XRESOLVE] = "XRESOLVE",
};
-static inline const char *rtn_type(unsigned t)
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
- static char buf[32];
-
if (t < __RTN_MAX && rtn_type_names[t])
return rtn_type_names[t];
- snprintf(buf, sizeof(buf), "type %d", t);
+ snprintf(buf, len, "type %u", t);
return buf;
}
@@ -2321,48 +2338,43 @@ static inline const char *rtn_type(unsigned t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct node *n = v;
-
- if (v == SEQ_START_TOKEN)
- return 0;
+ struct rt_trie_node *n = v;
- if (!node_parent(n)) {
- if (iter->trie == trie_local)
- seq_puts(seq, "<local>:\n");
- else
- seq_puts(seq, "<main>:\n");
- }
+ if (!node_parent_rcu(n))
+ fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
struct tnode *tn = (struct tnode *) n;
__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
seq_indent(seq, iter->depth-1);
- seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
- NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
+ seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
+ &prf, tn->pos, tn->bits, tn->full_children,
tn->empty_children);
} else {
struct leaf *l = (struct leaf *) n;
- int i;
+ struct leaf_info *li;
__be32 val = htonl(l->key);
seq_indent(seq, iter->depth);
- seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
- for (i = 32; i >= 0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
- if (li) {
- struct fib_alias *fa;
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- seq_indent(seq, iter->depth+1);
- seq_printf(seq, " /%d %s %s", i,
- rtn_scope(fa->fa_scope),
- rtn_type(fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, "tos =%d\n",
- fa->fa_tos);
- seq_putc(seq, '\n');
- }
+ seq_printf(seq, " |-- %pI4\n", &val);
+
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
+ struct fib_alias *fa;
+
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ char buf1[32], buf2[32];
+
+ seq_indent(seq, iter->depth+1);
+ seq_printf(seq, " /%d %s %s", li->plen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_info->fib_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, " tos=%d", fa->fa_tos);
+ seq_putc(seq, '\n');
}
}
}
@@ -2379,8 +2391,8 @@ static const struct seq_operations fib_trie_seq_ops = {
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &fib_trie_seq_ops,
- sizeof(struct fib_trie_iter));
+ return seq_open_net(inode, file, &fib_trie_seq_ops,
+ sizeof(struct fib_trie_iter));
}
static const struct file_operations fib_trie_fops = {
@@ -2388,16 +2400,93 @@ static const struct file_operations fib_trie_fops = {
.open = fib_trie_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
+};
+
+struct fib_route_iter {
+ struct seq_net_private p;
+ struct trie *main_trie;
+ loff_t pos;
+ t_key key;
};
-static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
- static unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
+ struct leaf *l = NULL;
+ struct trie *t = iter->main_trie;
+
+ /* use cache location of last found key */
+ if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
+ pos -= iter->pos;
+ else {
+ iter->pos = 0;
+ l = trie_firstleaf(t);
+ }
+
+ while (l && pos-- > 0) {
+ iter->pos++;
+ l = trie_nextleaf(l);
+ }
+
+ if (l)
+ iter->key = pos; /* remember it */
+ else
+ iter->pos = 0; /* forget it */
+
+ return l;
+}
+
+static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb;
+
+ rcu_read_lock();
+ tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
+ if (!tb)
+ return NULL;
+
+ iter->main_trie = (struct trie *) tb->tb_data;
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+ else
+ return fib_route_get_idx(iter, *pos - 1);
+}
+static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct leaf *l = v;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN) {
+ iter->pos = 0;
+ l = trie_firstleaf(iter->main_trie);
+ } else {
+ iter->pos++;
+ l = trie_nextleaf(l);
+ }
+
+ if (l)
+ iter->key = l->key;
+ else
+ iter->pos = 0;
+ return l;
+}
+
+static void fib_route_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+{
+ unsigned int flags = 0;
+
+ if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+ flags = RTF_REJECT;
if (fi && fi->fib_nh->nh_gw)
flags |= RTF_GATEWAY;
if (mask == htonl(0xFFFFFFFF))
@@ -2409,15 +2498,13 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
/*
* This outputs /proc/net/route.
* The format of the file is not supposed to be changed
- * and needs to be same as fib_hash output to avoid breaking
+ * and needs to be same as fib_hash output to avoid breaking
* legacy utilities
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
- const struct fib_trie_iter *iter = seq->private;
struct leaf *l = v;
- int i;
- char bf[128];
+ struct leaf_info *li;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2426,48 +2513,44 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- if (iter->trie == trie_local)
- return 0;
- if (IS_TNODE(l))
- return 0;
-
- for (i=32; i>=0; i--) {
- struct leaf_info *li = find_leaf_info(l, i);
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
struct fib_alias *fa;
__be32 mask, prefix;
- if (!li)
- continue;
-
mask = inet_make_mask(li->plen);
prefix = htonl(l->key);
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
- unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
if (fa->fa_type == RTN_BROADCAST
|| fa->fa_type == RTN_MULTICAST)
continue;
+ seq_setwidth(seq, 127);
+
if (fi)
- snprintf(bf, sizeof(bf),
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
fi->fib_dev ? fi->fib_dev->name : "*",
prefix,
fi->fib_nh->nh_gw, flags, 0, 0,
fi->fib_priority,
mask,
- (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
fi->fib_window,
fi->fib_rtt >> 3);
else
- snprintf(bf, sizeof(bf),
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
- seq_printf(seq, "%-127s\n", bf);
+ seq_pad(seq, '\n');
}
}
@@ -2475,16 +2558,16 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
}
static const struct seq_operations fib_route_seq_ops = {
- .start = fib_trie_seq_start,
- .next = fib_trie_seq_next,
- .stop = fib_trie_seq_stop,
+ .start = fib_route_seq_start,
+ .next = fib_route_seq_next,
+ .stop = fib_route_seq_stop,
.show = fib_route_seq_show,
};
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &fib_route_seq_ops,
- sizeof(struct fib_trie_iter));
+ return seq_open_net(inode, file, &fib_route_seq_ops,
+ sizeof(struct fib_route_iter));
}
static const struct file_operations fib_route_fops = {
@@ -2492,35 +2575,36 @@ static const struct file_operations fib_route_fops = {
.open = fib_route_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
-int __init fib_proc_init(void)
+int __net_init fib_proc_init(struct net *net)
{
- if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops))
+ if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
goto out1;
- if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops))
+ if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+ &fib_triestat_fops))
goto out2;
- if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops))
+ if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
goto out3;
return 0;
out3:
- proc_net_remove(&init_net, "fib_triestat");
+ remove_proc_entry("fib_triestat", net->proc_net);
out2:
- proc_net_remove(&init_net, "fib_trie");
+ remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
-void __init fib_proc_exit(void)
+void __net_exit fib_proc_exit(struct net *net)
{
- proc_net_remove(&init_net, "fib_trie");
- proc_net_remove(&init_net, "fib_triestat");
- proc_net_remove(&init_net, "route");
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ remove_proc_entry("route", net->proc_net);
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
new file mode 100644
index 00000000000..0485bf7f8f0
--- /dev/null
+++ b/net/ipv4/gre_demux.c
@@ -0,0 +1,364 @@
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ int ret;
+
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
+ 0 : -EBUSY;
+
+ if (ret)
+ return ret;
+
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+ greh->protocol = tpi->proto;
+
+ if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (tpi->flags&TUNNEL_SEQ) {
+ *ptr = tpi->seq;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_KEY) {
+ *ptr = tpi->key;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(gre_build_header);
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ unsigned int ip_hlen = ip_hdrlen(skb);
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else
+ tpi->key = 0;
+
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else
+ tpi->seq = 0;
+
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static int gre_cisco_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ int i;
+ bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+ int ret;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+ ret = proto->handler(skb, &tpi);
+ if (ret == PACKET_RCVD) {
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void gre_cisco_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int i;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+
+ if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
+ goto out;
+
+ }
+out:
+ rcu_read_unlock();
+}
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+
+ if (ver >= GREPROTO_MAX)
+ return;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (proto && proto->err_handler)
+ proto->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .netns_ok = 1,
+};
+
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_cisco_rcv,
+ .err_handler = gre_cisco_err,
+};
+
+int gre_cisco_register(struct gre_cisco_protocol *newp)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[newp->priority];
+
+ return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_register);
+
+int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[del_proto->priority];
+ int ret;
+
+ ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
+
+ if (ret)
+ return ret;
+
+ synchronize_net();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_unregister);
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexor driver\n");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("can't add protocol\n");
+ goto err;
+ }
+
+ if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
+ pr_info("%s: can't add ipgre handler\n", __func__);
+ goto err_gre;
+ }
+
+ return 0;
+err_gre:
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+err:
+ return -EAGAIN;
+}
+
+static void __exit gre_exit(void)
+{
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
new file mode 100644
index 00000000000..f0bdd47bbbc
--- /dev/null
+++ b/net/ipv4/gre_offload.c
@@ -0,0 +1,298 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * GRE GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+static int gre_gso_send_check(struct sk_buff *skb)
+{
+ if (!skb->encapsulation)
+ return -EINVAL;
+ return 0;
+}
+
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t enc_features;
+ int ghl;
+ struct gre_base_hdr *greh;
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ __be16 protocol = skb->protocol;
+ int tnl_hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ ghl = skb_inner_network_header(skb) - skb_transport_header(skb);
+ if (unlikely(ghl < sizeof(*greh)))
+ goto out;
+
+ csum = !!(greh->flags & GRE_CSUM);
+ if (csum)
+ skb->encap_hdr_csum = 1;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+
+ /* setup inner skb. */
+ skb->protocol = greh->protocol;
+ skb->encapsulation = 0;
+
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (!segs || IS_ERR(segs)) {
+ skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
+ goto out;
+ }
+
+ skb = segs;
+ tnl_hlen = skb_tnl_header_len(skb);
+ do {
+ __skb_push(skb, ghl);
+ if (csum) {
+ __be32 *pcsum;
+
+ if (skb_has_shared_frag(skb)) {
+ int err;
+
+ err = __skb_linearize(skb);
+ if (err) {
+ kfree_skb_list(segs);
+ segs = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ skb_reset_transport_header(skb);
+
+ greh = (struct gre_base_hdr *)
+ skb_transport_header(skb);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ }
+ __skb_push(skb, tnl_hlen - ghl);
+
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+/* Compute the whole skb csum in s/w and store it, then verify GRO csum
+ * starting from gro_offset.
+ */
+static __sum16 gro_skb_checksum(struct sk_buff *skb)
+{
+ __sum16 sum;
+
+ skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
+ csum_partial(skb->data, skb_gro_offset(skb), 0));
+ sum = csum_fold(NAPI_GRO_CB(skb)->csum);
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
+ if (unlikely(!sum) && !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ } else {
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ }
+
+ return sum;
+}
+
+static struct sk_buff **gre_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct gre_base_hdr *greh;
+ unsigned int hlen, grehlen;
+ unsigned int off;
+ int flush = 1;
+ struct packet_offload *ptype;
+ __be16 type;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*greh);
+ greh = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
+ }
+
+ /* Only support version 0 and K (key), C (csum) flags. Note that
+ * although the support for the S (seq#) flag can be added easily
+ * for GRO, this is problematic for GSO hence can not be enabled
+ * here because a GRO pkt may end up in the forwarding path, thus
+ * requiring GSO support to break it up correctly.
+ */
+ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+ goto out;
+
+ type = greh->protocol;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (ptype == NULL)
+ goto out_unlock;
+
+ grehlen = GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ hlen = off + grehlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out_unlock;
+ }
+ if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
+ __sum16 csum = 0;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ csum = csum_fold(NAPI_GRO_CB(skb)->csum);
+ /* Don't trust csum error calculated/reported by h/w */
+ if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
+ csum = gro_skb_checksum(skb);
+
+ /* GRE CSUM is the 1's complement of the 1's complement sum
+ * of the GRE hdr plus payload so it should add up to 0xffff
+ * (and 0 after csum_fold()) just like the IPv4 hdr csum.
+ */
+ if (csum)
+ goto out_unlock;
+ }
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct gre_base_hdr *greh2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ /* The following checks are needed to ensure only pkts
+ * from the same tunnel are considered for aggregation.
+ * The criteria for "the same tunnel" includes:
+ * 1) same version (we only support version 0 here)
+ * 2) same protocol (we only support ETH_P_IP for now)
+ * 3) same set of flags
+ * 4) same key if the key field is present.
+ */
+ greh2 = (struct gre_base_hdr *)(p->data + off);
+
+ if (greh2->flags != greh->flags ||
+ greh2->protocol != greh->protocol) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ if (greh->flags & GRE_KEY) {
+ /* compare keys */
+ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+ }
+
+ skb_gro_pull(skb, grehlen);
+
+ /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+ skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+ struct packet_offload *ptype;
+ unsigned int grehlen = sizeof(*greh);
+ int err = -ENOENT;
+ __be16 type;
+
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
+
+ type = greh->protocol;
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype != NULL)
+ err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
+
+ rcu_read_unlock();
+ return err;
+}
+
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_send_check = gre_gso_send_check,
+ .gso_segment = gre_gso_segment,
+ .gro_receive = gre_gro_receive,
+ .gro_complete = gre_gro_complete,
+ },
+};
+
+static int __init gre_offload_init(void)
+{
+ return inet_add_offload(&gre_offload, IPPROTO_GRE);
+}
+device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 272c69e106e..42b7bcf8045 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1,9 +1,7 @@
/*
* NET3: Implementation of the ICMP protocol layer.
*
- * Alan Cox, <alan@redhat.com>
- *
- * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -64,6 +62,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
#include <linux/jiffies.h>
@@ -76,6 +76,7 @@
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -84,14 +85,17 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/raw.h>
+#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/init.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+#include <net/ip_fib.h>
/*
* Build xmit assembly blocks
@@ -107,20 +111,13 @@ struct icmp_bxm {
__be32 times[3];
} data;
int head_len;
- struct ip_options replyopts;
- unsigned char optbuf[40];
+ struct ip_options_data replyopts;
};
-/*
- * Statistics
- */
-DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
-DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly;
-
/* An array of errno for error messages from dest unreach. */
/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
-struct icmp_err icmp_err_convert[] = {
+const struct icmp_err icmp_err_convert[] = {
{
.errno = ENETUNREACH, /* ICMP_NET_UNREACH */
.fatal = 0,
@@ -186,29 +183,7 @@ struct icmp_err icmp_err_convert[] = {
.fatal = 1,
},
};
-
-/* Control parameters for ECHO replies. */
-int sysctl_icmp_echo_ignore_all __read_mostly;
-int sysctl_icmp_echo_ignore_broadcasts __read_mostly = 1;
-
-/* Control parameter - ignore bogus broadcast responses? */
-int sysctl_icmp_ignore_bogus_error_responses __read_mostly = 1;
-
-/*
- * Configurable global rate limit.
- *
- * ratelimit defines tokens/packet consumed for dst->rate_token bucket
- * ratemask defines which icmp types are ratelimited by setting
- * it's bit position.
- *
- * default:
- * dest unreachable (3), source quench (4),
- * time exceeded (11), parameter problem (12)
- */
-
-int sysctl_icmp_ratelimit __read_mostly = 1 * HZ;
-int sysctl_icmp_ratemask __read_mostly = 0x1818;
-int sysctl_icmp_errors_use_inbound_ifaddr __read_mostly;
+EXPORT_SYMBOL(icmp_err_convert);
/*
* ICMP control array. This specifies what to do with each ICMP.
@@ -228,71 +203,43 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
*
* On SMP we have one ICMP socket per-cpu.
*/
-static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
-#define icmp_socket __get_cpu_var(__icmp_socket)
+static struct sock *icmp_sk(struct net *net)
+{
+ return net->ipv4.icmp_sk[smp_processor_id()];
+}
-static __inline__ int icmp_xmit_lock(void)
+static inline struct sock *icmp_xmit_lock(struct net *net)
{
+ struct sock *sk;
+
local_bh_disable();
- if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) {
+ sk = icmp_sk(net);
+
+ if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
* dst_link_failure() for an outgoing ICMP packet.
*/
local_bh_enable();
- return 1;
+ return NULL;
}
- return 0;
+ return sk;
}
-static void icmp_xmit_unlock(void)
+static inline void icmp_xmit_unlock(struct sock *sk)
{
- spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
}
/*
* Send an ICMP frame.
*/
-/*
- * Check transmit rate limitation for given message.
- * The rate information is held in the destination cache now.
- * This function is generic and could be used for other purposes
- * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
- *
- * Note that the same dst_entry fields are modified by functions in
- * route.c too, but these work for packet destinations while xrlim_allow
- * works for icmp destinations. This means the rate limiting information
- * for one "ip object" is shared - and these ICMPs are twice limited:
- * by source and by destination.
- *
- * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
- * SHOULD allow setting of rate limits
- *
- * Shared between ICMPv4 and ICMPv6.
- */
-#define XRLIM_BURST_FACTOR 6
-int xrlim_allow(struct dst_entry *dst, int timeout)
+static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+ struct flowi4 *fl4, int type, int code)
{
- unsigned long now;
- int rc = 0;
-
- now = jiffies;
- dst->rate_tokens += now - dst->rate_last;
- dst->rate_last = now;
- if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
- dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
- if (dst->rate_tokens >= timeout) {
- dst->rate_tokens -= timeout;
- rc = 1;
- }
- return rc;
-}
-
-static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
-{
- struct dst_entry *dst = &rt->u.dst;
- int rc = 1;
+ struct dst_entry *dst = &rt->dst;
+ bool rc = true;
if (type > NR_ICMP_TYPES)
goto out;
@@ -306,8 +253,13 @@ static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
goto out;
/* Limit if icmp type is enabled in ratemask. */
- if ((1 << type) & sysctl_icmp_ratemask)
- rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
+ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
+ struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ rc = inet_peer_xrlim_allow(peer,
+ net->ipv4.sysctl_icmp_ratelimit);
+ if (peer)
+ inet_putpeer(peer);
+ }
out:
return rc;
}
@@ -315,10 +267,10 @@ out:
/*
* Maintain the counters used in the SNMP statistics for outgoing ICMP
*/
-void icmp_out_count(unsigned char type)
+void icmp_out_count(struct net *net, unsigned char type)
{
- ICMPMSGOUT_INC_STATS(type);
- ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
+ ICMPMSGOUT_INC_STATS(net, type);
+ ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);
}
/*
@@ -342,21 +294,25 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
}
static void icmp_push_reply(struct icmp_bxm *icmp_param,
- struct ipcm_cookie *ipc, struct rtable *rt)
+ struct flowi4 *fl4,
+ struct ipcm_cookie *ipc, struct rtable **rt)
{
+ struct sock *sk;
struct sk_buff *skb;
- if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
+ sk = icmp_sk(dev_net((*rt)->dst.dev));
+ if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
- ipc, rt, MSG_DONTWAIT) < 0)
- ip_flush_pending_frames(icmp_socket->sk);
- else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
+ ipc, rt, MSG_DONTWAIT) < 0) {
+ ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
+ ip_flush_pending_frames(sk);
+ } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = icmp_hdr(skb);
__wsum csum = 0;
struct sk_buff *skb1;
- skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
+ skb_queue_walk(&sk->sk_write_queue, skb1) {
csum = csum_add(csum, skb1->csum);
}
csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
@@ -364,7 +320,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
icmp_param->head_len, csum);
icmph->checksum = csum_fold(csum);
skb->ip_summed = CHECKSUM_NONE;
- ip_push_pending_frames(icmp_socket->sk);
+ ip_push_pending_frames(sk, fl4);
}
}
@@ -374,46 +330,149 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
- struct sock *sk = icmp_socket->sk;
- struct inet_sock *inet = inet_sk(sk);
struct ipcm_cookie ipc;
- struct rtable *rt = (struct rtable *)skb->dst;
- __be32 daddr;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net(rt->dst.dev);
+ struct flowi4 fl4;
+ struct sock *sk;
+ struct inet_sock *inet;
+ __be32 daddr, saddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
- if (ip_options_echo(&icmp_param->replyopts, skb))
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
return;
- if (icmp_xmit_lock())
+ sk = icmp_xmit_lock(net);
+ if (sk == NULL)
return;
+ inet = inet_sk(sk);
icmp_param->data.icmph.checksum = 0;
inet->tos = ip_hdr(skb)->tos;
- daddr = ipc.addr = rt->rt_src;
+ sk->sk_mark = mark;
+ daddr = ipc.addr = ip_hdr(skb)->saddr;
+ saddr = fib_compute_spec_dst(skb);
ipc.opt = NULL;
- if (icmp_param->replyopts.optlen) {
- ipc.opt = &icmp_param->replyopts;
- if (ipc.opt->srr)
- daddr = icmp_param->replyopts.faddr;
- }
- {
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = rt->rt_spec_dst,
- .tos = RT_TOS(ip_hdr(skb)->tos) } },
- .proto = IPPROTO_ICMP };
- security_skb_classify_flow(skb, &fl);
- if (ip_route_output_key(&rt, &fl))
- goto out_unlock;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ if (icmp_param->replyopts.opt.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts.opt;
+ if (ipc.opt->opt.srr)
+ daddr = icmp_param->replyopts.opt.opt.faddr;
}
- if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_mark = mark;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_proto = IPPROTO_ICMP;
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ goto out_unlock;
+ if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
icmp_param->data.icmph.code))
- icmp_push_reply(icmp_param, &ipc, rt);
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
- icmp_xmit_unlock();
+ icmp_xmit_unlock(sk);
}
+static struct rtable *icmp_route_lookup(struct net *net,
+ struct flowi4 *fl4,
+ struct sk_buff *skb_in,
+ const struct iphdr *iph,
+ __be32 saddr, u8 tos, u32 mark,
+ int type, int code,
+ struct icmp_bxm *param)
+{
+ struct rtable *rt, *rt2;
+ struct flowi4 fl4_dec;
+ int err;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = (param->replyopts.opt.opt.srr ?
+ param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
+ fl4->flowi4_tos = RT_TOS(tos);
+ fl4->flowi4_proto = IPPROTO_ICMP;
+ fl4->fl4_icmp_type = type;
+ fl4->fl4_icmp_code = code;
+ security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+ rt = __ip_route_output_key(net, fl4);
+ if (IS_ERR(rt))
+ return rt;
+
+ /* No need to clone since we're just using its address. */
+ rt2 = rt;
+
+ rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(fl4), NULL, 0);
+ if (!IS_ERR(rt)) {
+ if (rt != rt2)
+ return rt;
+ } else if (PTR_ERR(rt) == -EPERM) {
+ rt = NULL;
+ } else
+ return rt;
+
+ err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+ if (err)
+ goto relookup_failed;
+
+ if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+ rt2 = __ip_route_output_key(net, &fl4_dec);
+ if (IS_ERR(rt2))
+ err = PTR_ERR(rt2);
+ } else {
+ struct flowi4 fl4_2 = {};
+ unsigned long orefdst;
+
+ fl4_2.daddr = fl4_dec.saddr;
+ rt2 = ip_route_output_key(net, &fl4_2);
+ if (IS_ERR(rt2)) {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ /* Ugh! */
+ orefdst = skb_in->_skb_refdst; /* save old refdst */
+ err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
+ RT_TOS(tos), rt2->dst.dev);
+
+ dst_release(&rt2->dst);
+ rt2 = skb_rtable(skb_in);
+ skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ }
+
+ if (err)
+ goto relookup_failed;
+
+ rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
+ flowi4_to_flowi(&fl4_dec), NULL,
+ XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(rt2)) {
+ dst_release(&rt->dst);
+ memcpy(fl4, &fl4_dec, sizeof(*fl4));
+ rt = rt2;
+ } else if (PTR_ERR(rt2) == -EPERM) {
+ if (rt)
+ dst_release(&rt->dst);
+ return rt2;
+ } else {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ return rt;
+
+relookup_failed:
+ if (rt)
+ return rt;
+ return ERR_PTR(err);
+}
/*
* Send an ICMP message in response to a situation
@@ -430,14 +489,19 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct iphdr *iph;
int room;
- struct icmp_bxm icmp_param;
- struct rtable *rt = (struct rtable *)skb_in->dst;
+ struct icmp_bxm *icmp_param;
+ struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc;
+ struct flowi4 fl4;
__be32 saddr;
u8 tos;
+ u32 mark;
+ struct net *net;
+ struct sock *sk;
if (!rt)
goto out;
+ net = dev_net(rt->dst.dev);
/*
* Find the original header. It is expected to be valid, of course.
@@ -447,7 +511,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
iph = ip_hdr(skb_in);
if ((u8 *)iph < skb_in->head ||
- (skb_in->network_header + sizeof(*iph)) > skb_in->tail)
+ (skb_network_header(skb_in) + sizeof(*iph)) >
+ skb_tail_pointer(skb_in))
goto out;
/*
@@ -501,9 +566,14 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
- if (icmp_xmit_lock())
+ icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
+ if (!icmp_param)
return;
+ sk = icmp_xmit_lock(net);
+ if (sk == NULL)
+ goto out_free;
+
/*
* Construct source address and options.
*/
@@ -512,21 +582,24 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
if (!(rt->rt_flags & RTCF_LOCAL)) {
struct net_device *dev = NULL;
- if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index(&init_net, rt->fl.iif);
+ rcu_read_lock();
+ if (rt_is_input_route(rt) &&
+ net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
+ dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
- if (dev) {
+ if (dev)
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- dev_put(dev);
- } else
+ else
saddr = 0;
+ rcu_read_unlock();
}
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
IPTOS_PREC_INTERNETCONTROL) :
iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(&icmp_param.replyopts, skb_in))
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
goto out_unlock;
@@ -534,79 +607,98 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* Prepare data for ICMP header.
*/
- icmp_param.data.icmph.type = type;
- icmp_param.data.icmph.code = code;
- icmp_param.data.icmph.un.gateway = info;
- icmp_param.data.icmph.checksum = 0;
- icmp_param.skb = skb_in;
- icmp_param.offset = skb_network_offset(skb_in);
- icmp_out_count(icmp_param.data.icmph.type);
- inet_sk(icmp_socket->sk)->tos = tos;
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
+ inet_sk(sk)->tos = tos;
+ sk->sk_mark = mark;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param.replyopts;
-
- {
- struct flowi fl = {
- .nl_u = {
- .ip4_u = {
- .daddr = icmp_param.replyopts.srr ?
- icmp_param.replyopts.faddr :
- iph->saddr,
- .saddr = saddr,
- .tos = RT_TOS(tos)
- }
- },
- .proto = IPPROTO_ICMP,
- .uli_u = {
- .icmpt = {
- .type = type,
- .code = code
- }
- }
- };
- security_skb_classify_flow(skb_in, &fl);
- if (ip_route_output_key(&rt, &fl))
- goto out_unlock;
- }
+ ipc.opt = &icmp_param->replyopts.opt;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
+ type, code, icmp_param);
+ if (IS_ERR(rt))
+ goto out_unlock;
- if (!icmpv4_xrlim_allow(rt, type, code))
+ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
goto ende;
/* RFC says return as much as we can without exceeding 576 bytes. */
- room = dst_mtu(&rt->u.dst);
+ room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
- icmp_param.data_len = skb_in->len - icmp_param.offset;
- if (icmp_param.data_len > room)
- icmp_param.data_len = room;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data_len = skb_in->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
- icmp_push_reply(&icmp_param, &ipc, rt);
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
- icmp_xmit_unlock();
+ icmp_xmit_unlock(sk);
+out_free:
+ kfree(icmp_param);
out:;
}
+EXPORT_SYMBOL(icmp_send);
+
+
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ /* Checkin full IP header plus 8 bytes of protocol to
+ * avoid additional coding at protocol handlers.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ return;
+
+ raw_icmp_error(skb, protocol, info);
+ rcu_read_lock();
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
+static bool icmp_tag_validation(int proto)
+{
+ bool ok;
+
+ rcu_read_lock();
+ ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation;
+ rcu_read_unlock();
+ return ok;
+}
/*
- * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
+ * ICMP_PARAMETERPROB.
*/
static void icmp_unreach(struct sk_buff *skb)
{
- struct iphdr *iph;
+ const struct iphdr *iph;
struct icmphdr *icmph;
- int hash, protocol;
- struct net_protocol *ipprot;
- struct sock *raw_sk;
+ struct net *net;
u32 info = 0;
+ net = dev_net(skb_dst(skb)->dev);
+
/*
* Incomplete header ?
* Only checks for the IP header, there should be an
@@ -617,7 +709,7 @@ static void icmp_unreach(struct sk_buff *skb)
goto out_err;
icmph = icmp_hdr(skb);
- iph = (struct iphdr *)skb->data;
+ iph = (const struct iphdr *)skb->data;
if (iph->ihl < 5) /* Mangled header, drop. */
goto out_err;
@@ -630,22 +722,28 @@ static void icmp_unreach(struct sk_buff *skb)
case ICMP_PORT_UNREACH:
break;
case ICMP_FRAG_NEEDED:
- if (ipv4_config.no_pmtu_disc) {
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
- "fragmentation needed "
- "and DF set.\n",
- NIPQUAD(iph->daddr));
- } else {
- info = ip_rt_frag_needed(iph,
- ntohs(icmph->un.frag.mtu));
- if (!info)
+ /* for documentation of the ip_no_pmtu_disc
+ * values please see
+ * Documentation/networking/ip-sysctl.txt
+ */
+ switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
+ default:
+ LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
+ &iph->daddr);
+ break;
+ case 2:
+ goto out;
+ case 3:
+ if (!icmp_tag_validation(iph->protocol))
goto out;
+ /* fall through */
+ case 0:
+ info = ntohs(icmph->un.frag.mtu);
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
- "Route Failed.\n",
- NIPQUAD(iph->daddr));
+ LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"),
+ &iph->daddr);
break;
default:
break;
@@ -667,62 +765,27 @@ static void icmp_unreach(struct sk_buff *skb)
*/
/*
- * Check the other end isnt violating RFC 1122. Some routers send
+ * Check the other end isn't violating RFC 1122. Some routers send
* bogus responses to broadcast frames. If you see this message
* first check your netmask matches at both ends, if it does then
* get the other vendor to fix their kit.
*/
- if (!sysctl_icmp_ignore_bogus_error_responses &&
- inet_addr_type(iph->daddr) == RTN_BROADCAST) {
- if (net_ratelimit())
- printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
- "type %u, code %u "
- "error to a broadcast: %u.%u.%u.%u on %s\n",
- NIPQUAD(ip_hdr(skb)->saddr),
- icmph->type, icmph->code,
- NIPQUAD(iph->daddr),
- skb->dev->name);
+ if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
+ inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
+ net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+ &ip_hdr(skb)->saddr,
+ icmph->type, icmph->code,
+ &iph->daddr, skb->dev->name);
goto out;
}
- /* Checkin full IP header plus 8 bytes of protocol to
- * avoid additional coding at protocol handlers.
- */
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
- goto out;
-
- iph = (struct iphdr *)skb->data;
- protocol = iph->protocol;
-
- /*
- * Deliver ICMP message to raw sockets. Pretty useless feature?
- */
-
- /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
- hash = protocol & (MAX_INET_PROTOS - 1);
- read_lock(&raw_v4_lock);
- if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
- while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
- iph->saddr,
- skb->dev->ifindex)) != NULL) {
- raw_err(raw_sk, skb, info);
- raw_sk = sk_next(raw_sk);
- iph = (struct iphdr *)skb->data;
- }
- }
- read_unlock(&raw_v4_lock);
-
- rcu_read_lock();
- ipprot = rcu_dereference(inet_protos[hash]);
- if (ipprot && ipprot->err_handler)
- ipprot->err_handler(skb, info);
- rcu_read_unlock();
+ icmp_socket_deliver(skb, info);
out:
return;
out_err:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
goto out;
}
@@ -733,37 +796,15 @@ out_err:
static void icmp_redirect(struct sk_buff *skb)
{
- struct iphdr *iph;
-
- if (skb->len < sizeof(struct iphdr))
- goto out_err;
+ if (skb->len < sizeof(struct iphdr)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ return;
+ }
- /*
- * Get the copied header of the packet that caused the redirect
- */
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
-
- iph = (struct iphdr *)skb->data;
+ return;
- switch (icmp_hdr(skb)->code & 7) {
- case ICMP_REDIR_NET:
- case ICMP_REDIR_NETTOS:
- /*
- * As per RFC recommendations now handle it as a host redirect.
- */
- case ICMP_REDIR_HOST:
- case ICMP_REDIR_HOSTTOS:
- ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
- icmp_hdr(skb)->un.gateway,
- iph->saddr, skb->dev);
- break;
- }
-out:
- return;
-out_err:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- goto out;
+ icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
}
/*
@@ -780,7 +821,10 @@ out_err:
static void icmp_echo(struct sk_buff *skb)
{
- if (!sysctl_icmp_echo_ignore_all) {
+ struct net *net;
+
+ net = dev_net(skb_dst(skb)->dev);
+ if (!net->ipv4.sysctl_icmp_echo_ignore_all) {
struct icmp_bxm icmp_param;
icmp_param.data.icmph = *icmp_hdr(skb);
@@ -802,7 +846,7 @@ static void icmp_echo(struct sk_buff *skb)
*/
static void icmp_timestamp(struct sk_buff *skb)
{
- struct timeval tv;
+ struct timespec tv;
struct icmp_bxm icmp_param;
/*
* Too short.
@@ -813,9 +857,9 @@ static void icmp_timestamp(struct sk_buff *skb)
/*
* Fill in the current time as ms since midnight UT:
*/
- do_gettimeofday(&tv);
- icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 +
- tv.tv_usec / 1000);
+ getnstimeofday(&tv);
+ icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
+ tv.tv_nsec / NSEC_PER_MSEC);
icmp_param.data.times[2] = icmp_param.data.times[1];
if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
BUG();
@@ -830,94 +874,10 @@ static void icmp_timestamp(struct sk_buff *skb)
out:
return;
out_err:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
goto out;
}
-
-/*
- * Handle ICMP_ADDRESS_MASK requests. (RFC950)
- *
- * RFC1122 (3.2.2.9). A host MUST only send replies to
- * ADDRESS_MASK requests if it's been configured as an address mask
- * agent. Receiving a request doesn't constitute implicit permission to
- * act as one. Of course, implementing this correctly requires (SHOULD)
- * a way to turn the functionality on and off. Another one for sysctl(),
- * I guess. -- MS
- *
- * RFC1812 (4.3.3.9). A router MUST implement it.
- * A router SHOULD have switch turning it on/off.
- * This switch MUST be ON by default.
- *
- * Gratuitous replies, zero-source replies are not implemented,
- * that complies with RFC. DO NOT implement them!!! All the idea
- * of broadcast addrmask replies as specified in RFC950 is broken.
- * The problem is that it is not uncommon to have several prefixes
- * on one physical interface. Moreover, addrmask agent can even be
- * not aware of existing another prefixes.
- * If source is zero, addrmask agent cannot choose correct prefix.
- * Gratuitous mask announcements suffer from the same problem.
- * RFC1812 explains it, but still allows to use ADDRMASK,
- * that is pretty silly. --ANK
- *
- * All these rules are so bizarre, that I removed kernel addrmask
- * support at all. It is wrong, it is obsolete, nobody uses it in
- * any case. --ANK
- *
- * Furthermore you can do it with a usermode address agent program
- * anyway...
- */
-
-static void icmp_address(struct sk_buff *skb)
-{
-#if 0
- if (net_ratelimit())
- printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
-#endif
-}
-
-/*
- * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
- * loudly if an inconsistency is found.
- */
-
-static void icmp_address_reply(struct sk_buff *skb)
-{
- struct rtable *rt = (struct rtable *)skb->dst;
- struct net_device *dev = skb->dev;
- struct in_device *in_dev;
- struct in_ifaddr *ifa;
-
- if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
- goto out;
-
- in_dev = in_dev_get(dev);
- if (!in_dev)
- goto out;
- rcu_read_lock();
- if (in_dev->ifa_list &&
- IN_DEV_LOG_MARTIANS(in_dev) &&
- IN_DEV_FORWARD(in_dev)) {
- __be32 _mask, *mp;
-
- mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
- BUG_ON(mp == NULL);
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
- if (*mp == ifa->ifa_mask &&
- inet_ifa_match(rt->rt_src, ifa))
- break;
- }
- if (!ifa && net_ratelimit()) {
- printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from "
- "%s/%u.%u.%u.%u\n",
- NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
- }
- }
- rcu_read_unlock();
- in_dev_put(in_dev);
-out:;
-}
-
static void icmp_discard(struct sk_buff *skb)
{
}
@@ -928,27 +888,40 @@ static void icmp_discard(struct sk_buff *skb)
int icmp_rcv(struct sk_buff *skb)
{
struct icmphdr *icmph;
- struct rtable *rt = (struct rtable *)skb->dst;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net(rt->dst.dev);
- ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ struct sec_path *sp = skb_sec_path(skb);
+ int nh;
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto error;
+ if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+ XFRM_STATE_ICMP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
+ goto drop;
+
+ nh = skb_network_offset(skb);
+ skb_set_network_header(skb, sizeof(*icmph));
+
+ if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ skb_set_network_header(skb, nh);
}
- if (!pskb_pull(skb, sizeof(struct icmphdr)))
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
+
+ if (skb_checksum_simple_validate(skb))
+ goto csum_error;
+
+ if (!pskb_pull(skb, sizeof(*icmph)))
goto error;
icmph = icmp_hdr(skb);
- ICMPMSGIN_INC_STATS_BH(icmph->type);
+ ICMPMSGIN_INC_STATS_BH(net, icmph->type);
/*
* 18 is the highest 'known' ICMP type. Anything else is a mystery
*
@@ -972,7 +945,7 @@ int icmp_rcv(struct sk_buff *skb)
*/
if ((icmph->type == ICMP_ECHO ||
icmph->type == ICMP_TIMESTAMP) &&
- sysctl_icmp_echo_ignore_broadcasts) {
+ net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
goto error;
}
if (icmph->type != ICMP_ECHO &&
@@ -988,17 +961,43 @@ int icmp_rcv(struct sk_buff *skb)
drop:
kfree_skb(skb);
return 0;
+csum_error:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);
error:
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
goto drop;
}
+void icmp_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ int offset = iph->ihl<<2;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
+ struct net *net = dev_net(skb->dev);
+
+ /*
+ * Use ping_err to handle all icmp errors except those
+ * triggered by ICMP_ECHOREPLY which sent from kernel.
+ */
+ if (icmph->type != ICMP_ECHOREPLY) {
+ ping_err(skb, offset, info);
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ else if (type == ICMP_REDIRECT)
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+}
+
/*
* This table is the definition of how we handle ICMP.
*/
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
[ICMP_ECHOREPLY] = {
- .handler = icmp_discard,
+ .handler = ping_rcv,
},
[1] = {
.handler = icmp_discard,
@@ -1060,49 +1059,92 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
.handler = icmp_discard,
},
[ICMP_ADDRESS] = {
- .handler = icmp_address,
+ .handler = icmp_discard,
},
[ICMP_ADDRESSREPLY] = {
- .handler = icmp_address_reply,
+ .handler = icmp_discard,
},
};
-void __init icmp_init(struct net_proto_family *ops)
+static void __net_exit icmp_sk_exit(struct net *net)
{
- struct inet_sock *inet;
int i;
- for_each_possible_cpu(i) {
- int err;
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]);
+ kfree(net->ipv4.icmp_sk);
+ net->ipv4.icmp_sk = NULL;
+}
- err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
- &per_cpu(__icmp_socket, i));
+static int __net_init icmp_sk_init(struct net *net)
+{
+ int i, err;
+
+ net->ipv4.icmp_sk =
+ kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL);
+ if (net->ipv4.icmp_sk == NULL)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, net);
if (err < 0)
- panic("Failed to create the ICMP control socket.\n");
+ goto fail;
- per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;
+ net->ipv4.icmp_sk[i] = sk;
/* Enough space for 2 64K ICMP packets, including
- * sk_buff struct overhead.
+ * sk_buff/skb_shared_info struct overhead.
*/
- per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
- (2 * ((64 * 1024) + sizeof(struct sk_buff)));
-
- inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
- inet->uc_ttl = -1;
- inet->pmtudisc = IP_PMTUDISC_DONT;
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
- /* Unhash it so that IP input processing does not even
- * see it, we do not wish this socket to see incoming
- * packets.
+ /*
+ * Speedup sock_wfree()
*/
- per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
}
+
+ /* Control parameters for ECHO replies. */
+ net->ipv4.sysctl_icmp_echo_ignore_all = 0;
+ net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
+
+ /* Control parameter - ignore bogus broadcast responses? */
+ net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
+
+ /*
+ * Configurable global rate limit.
+ *
+ * ratelimit defines tokens/packet consumed for dst->rate_token
+ * bucket ratemask defines which icmp types are ratelimited by
+ * setting it's bit position.
+ *
+ * default:
+ * dest unreachable (3), source quench (4),
+ * time exceeded (11), parameter problem (12)
+ */
+
+ net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
+ net->ipv4.sysctl_icmp_ratemask = 0x1818;
+ net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]);
+ kfree(net->ipv4.icmp_sk);
+ return err;
}
-EXPORT_SYMBOL(icmp_err_convert);
-EXPORT_SYMBOL(icmp_send);
-EXPORT_SYMBOL(icmp_statistics);
-EXPORT_SYMBOL(icmpmsg_statistics);
-EXPORT_SYMBOL(xrlim_allow);
+static struct pernet_operations __net_initdata icmp_sk_ops = {
+ .init = icmp_sk_init,
+ .exit = icmp_sk_exit,
+};
+
+int __init icmp_init(void)
+{
+ return register_pernet_subsys(&icmp_sk_ops);
+}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 7dbc282d4f9..db710b059ba 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -8,10 +8,8 @@
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $
- *
* Authors:
- * Alan Cox <Alan.Cox@linux.org>
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -73,8 +71,8 @@
*/
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -90,6 +88,7 @@
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
+#include <linux/pkt_sched.h>
#include <net/net_namespace.h>
#include <net/arp.h>
@@ -115,7 +114,8 @@
#define IGMP_V1_Router_Present_Timeout (400*HZ)
#define IGMP_V2_Router_Present_Timeout (400*HZ)
-#define IGMP_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_V2_Unsolicited_Report_Interval (10*HZ)
+#define IGMP_V3_Unsolicited_Report_Interval (1*HZ)
#define IGMP_Query_Response_Interval (10*HZ)
#define IGMP_Unsolicited_Report_Count 2
@@ -130,16 +130,39 @@
*/
#define IGMP_V1_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 1 || \
+ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
((in_dev)->mr_v1_seen && \
time_before(jiffies, (in_dev)->mr_v1_seen)))
#define IGMP_V2_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 2 || \
+ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
((in_dev)->mr_v2_seen && \
time_before(jiffies, (in_dev)->mr_v2_seen)))
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+ int interval_ms, interval_jiffies;
+
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+ else /* v3 */
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+ interval_jiffies = msecs_to_jiffies(interval_ms);
+
+ /* _timer functions can't handle a delay of 0 jiffies so ensure
+ * we always return a positive value.
+ */
+ if (interval_jiffies <= 0)
+ interval_jiffies = 1;
+ return interval_jiffies;
+}
+
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
static void igmpv3_clear_delrec(struct in_device *in_dev);
@@ -154,22 +177,32 @@ static void ip_ma_put(struct ip_mc_list *im)
{
if (atomic_dec_and_test(&im->refcnt)) {
in_dev_put(im->interface);
- kfree(im);
+ kfree_rcu(im, rcu);
}
}
+#define for_each_pmc_rcu(in_dev, pmc) \
+ for (pmc = rcu_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rcu_dereference(pmc->next_rcu))
+
+#define for_each_pmc_rtnl(in_dev, pmc) \
+ for (pmc = rtnl_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rtnl_dereference(pmc->next_rcu))
+
#ifdef CONFIG_IP_MULTICAST
/*
* Timer management
*/
-static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
+static void igmp_stop_timer(struct ip_mc_list *im)
{
spin_lock_bh(&im->lock);
if (del_timer(&im->timer))
atomic_dec(&im->refcnt);
- im->tm_running=0;
+ im->tm_running = 0;
im->reporter = 0;
im->unsolicit_count = 0;
spin_unlock_bh(&im->lock);
@@ -178,16 +211,16 @@ static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
/* It must be called with locked im->lock */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
- int tv=net_random() % max_delay;
+ int tv = prandom_u32() % max_delay;
- im->tm_running=1;
+ im->tm_running = 1;
if (!mod_timer(&im->timer, jiffies+tv+2))
atomic_inc(&im->refcnt);
}
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = net_random() % in_dev->mr_maxdelay;
+ int tv = prandom_u32() % in_dev->mr_maxdelay;
in_dev->mr_gq_running = 1;
if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
@@ -196,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev)
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
- int tv = net_random() % delay;
+ int tv = prandom_u32() % delay;
if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
in_dev_hold(in_dev);
@@ -209,7 +242,7 @@ static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
if (del_timer(&im->timer)) {
if ((long)(im->timer.expires-jiffies) < max_delay) {
add_timer(&im->timer);
- im->tm_running=1;
+ im->tm_running = 1;
spin_unlock_bh(&im->lock);
return;
}
@@ -277,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
struct ip_sf_list *psf;
int scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
@@ -285,37 +318,43 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
return scount;
}
+#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
+
static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
{
struct sk_buff *skb;
struct rtable *rt;
struct iphdr *pip;
struct igmpv3_report *pig;
-
- skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
- if (skb == NULL)
- return NULL;
-
- {
- struct flowi fl = { .oif = dev->ifindex,
- .nl_u = { .ip4_u = {
- .daddr = IGMPV3_ALL_MCR } },
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(&rt, &fl)) {
- kfree_skb(skb);
+ struct net *net = dev_net(dev);
+ struct flowi4 fl4;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+
+ while (1) {
+ skb = alloc_skb(size + hlen + tlen,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (skb)
+ break;
+ size >>= 1;
+ if (size < 256)
return NULL;
- }
}
- if (rt->rt_src == 0) {
+ skb->priority = TC_PRIO_CONTROL;
+ igmp_skb_size(skb) = size;
+
+ rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt)) {
kfree_skb(skb);
- ip_rt_put(rt);
return NULL;
}
- skb->dst = &rt->u.dst;
+ skb_dst_set(skb, &rt->dst);
skb->dev = dev;
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
pip = ip_hdr(skb);
@@ -326,15 +365,15 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->tos = 0xc0;
pip->frag_off = htons(IP_DF);
pip->ttl = 1;
- pip->daddr = rt->rt_dst;
- pip->saddr = rt->rt_src;
+ pip->daddr = fl4.daddr;
+ pip->saddr = fl4.saddr;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(pip, &rt->u.dst, NULL);
- ((u8*)&pip[1])[0] = IPOPT_RA;
- ((u8*)&pip[1])[1] = 4;
- ((u8*)&pip[1])[2] = 0;
- ((u8*)&pip[1])[3] = 0;
+ ip_select_ident(skb, NULL);
+ ((u8 *)&pip[1])[0] = IPOPT_RA;
+ ((u8 *)&pip[1])[1] = 4;
+ ((u8 *)&pip[1])[2] = 0;
+ ((u8 *)&pip[1])[3] = 0;
skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
skb_put(skb, sizeof(*pig));
@@ -349,22 +388,17 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
static int igmpv3_sendpack(struct sk_buff *skb)
{
- struct iphdr *pip = ip_hdr(skb);
struct igmphdr *pig = igmp_hdr(skb);
- const int iplen = skb->tail - skb->network_header;
- const int igmplen = skb->tail - skb->transport_header;
+ const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
- pip->tot_len = htons(iplen);
- ip_send_check(pip);
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
- dst_output);
+ return ip_local_out(skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
{
- return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel);
+ return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel);
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -389,7 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
return skb;
}
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
+#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
skb_tailroom(skb)) : 0)
static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -429,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
}
first = 1;
psf_prev = NULL;
- for (psf=*psf_list; psf; psf=psf_next) {
+ for (psf = *psf_list; psf; psf = psf_next) {
__be32 *psrc;
psf_next = psf->sf_next;
@@ -486,7 +520,7 @@ empty_source:
return skb;
if (pmc->crcount || isquery) {
/* make sure we have room for group header */
- if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) {
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
@@ -507,8 +541,8 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
int type;
if (!pmc) {
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
spin_lock_bh(&pmc->lock);
@@ -519,7 +553,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
skb = add_grec(skb, pmc, type, 0, 0);
spin_unlock_bh(&pmc->lock);
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
} else {
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
@@ -542,7 +576,7 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
struct ip_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
- for (psf=*ppsf; psf; psf = psf_next) {
+ for (psf = *ppsf; psf; psf = psf_next) {
psf_next = psf->sf_next;
if (psf->sf_crcount == 0) {
if (psf_prev)
@@ -561,12 +595,12 @@ static void igmpv3_send_cr(struct in_device *in_dev)
struct sk_buff *skb = NULL;
int type, dtype;
- read_lock(&in_dev->mc_list_lock);
+ rcu_read_lock();
spin_lock_bh(&in_dev->mc_tomb_lock);
/* deleted MCA's */
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
pmc_next = pmc->next;
if (pmc->sfmode == MCAST_INCLUDE) {
type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -598,7 +632,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
spin_unlock_bh(&in_dev->mc_tomb_lock);
/* change recs */
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rcu(in_dev, pmc) {
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE]) {
type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -621,7 +655,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
}
spin_unlock_bh(&pmc->lock);
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
if (!skb)
return;
@@ -636,8 +670,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
struct igmphdr *ih;
struct rtable *rt;
struct net_device *dev = in_dev->dev;
+ struct net *net = dev_net(dev);
__be32 group = pmc ? pmc->multiaddr : 0;
+ struct flowi4 fl4;
__be32 dst;
+ int hlen, tlen;
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
@@ -646,27 +683,24 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
else
dst = group;
- {
- struct flowi fl = { .oif = dev->ifindex,
- .nl_u = { .ip4_u = { .daddr = dst } },
- .proto = IPPROTO_IGMP };
- if (ip_route_output_key(&rt, &fl))
- return -1;
- }
- if (rt->rt_src == 0) {
- ip_rt_put(rt);
+ rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt))
return -1;
- }
- skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
if (skb == NULL) {
ip_rt_put(rt);
return -1;
}
+ skb->priority = TC_PRIO_CONTROL;
- skb->dst = &rt->u.dst;
+ skb_dst_set(skb, &rt->dst);
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
iph = ip_hdr(skb);
@@ -678,25 +712,22 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->daddr = dst;
- iph->saddr = rt->rt_src;
+ iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
- iph->tot_len = htons(IGMP_SIZE);
- ip_select_ident(iph, &rt->u.dst, NULL);
- ((u8*)&iph[1])[0] = IPOPT_RA;
- ((u8*)&iph[1])[1] = 4;
- ((u8*)&iph[1])[2] = 0;
- ((u8*)&iph[1])[3] = 0;
- ip_send_check(iph);
+ ip_select_ident(skb, NULL);
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
- ih->type=type;
- ih->code=0;
- ih->csum=0;
- ih->group=group;
- ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+ ih->type = type;
+ ih->code = 0;
+ ih->csum = 0;
+ ih->group = group;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ return ip_local_out(skb);
}
static void igmp_gq_timer_expire(unsigned long data)
@@ -705,7 +736,7 @@ static void igmp_gq_timer_expire(unsigned long data)
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_timer_expire(unsigned long data)
@@ -715,9 +746,10 @@ static void igmp_ifc_timer_expire(unsigned long data)
igmpv3_send_cr(in_dev);
if (in_dev->mr_ifc_count) {
in_dev->mr_ifc_count--;
- igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+ igmp_ifc_start_timer(in_dev,
+ unsolicited_report_interval(in_dev));
}
- __in_dev_put(in_dev);
+ in_dev_put(in_dev);
}
static void igmp_ifc_event(struct in_device *in_dev)
@@ -732,15 +764,15 @@ static void igmp_ifc_event(struct in_device *in_dev)
static void igmp_timer_expire(unsigned long data)
{
- struct ip_mc_list *im=(struct ip_mc_list *)data;
+ struct ip_mc_list *im = (struct ip_mc_list *)data;
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
- im->tm_running=0;
+ im->tm_running = 0;
if (im->unsolicit_count) {
im->unsolicit_count--;
- igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ igmp_start_timer(im, unsolicited_report_interval(in_dev));
}
im->reporter = 1;
spin_unlock(&im->lock);
@@ -762,15 +794,15 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
int i, scount;
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++) {
+ for (i = 0; i < nsrcs; i++) {
/* skip inactive filters */
- if (pmc->sfcount[MCAST_INCLUDE] ||
+ if (psf->sf_count[MCAST_INCLUDE] ||
pmc->sfcount[MCAST_EXCLUDE] !=
psf->sf_count[MCAST_EXCLUDE])
- continue;
+ break;
if (srcs[i] == psf->sf_inaddr) {
scount++;
break;
@@ -793,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
/* mark INCLUDE-mode sources */
scount = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
- for (i=0; i<nsrcs; i++)
+ for (i = 0; i < nsrcs; i++)
if (srcs[i] == psf->sf_inaddr) {
psf->sf_gsresp = 1;
scount++;
@@ -811,26 +843,29 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
return 1;
}
-static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
struct ip_mc_list *im;
/* Timers are only set for non-local groups */
if (group == IGMP_ALL_HOSTS)
- return;
+ return false;
- read_lock(&in_dev->mc_list_lock);
- for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
if (im->multiaddr == group) {
igmp_stop_timer(im);
break;
}
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
+ return false;
}
-static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
int len)
{
struct igmphdr *ih = igmp_hdr(skb);
@@ -862,16 +897,30 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
/* clear deleted report items */
igmpv3_clear_delrec(in_dev);
} else if (len < 12) {
- return; /* ignore bogus packet; freed by caller */
+ return true; /* ignore bogus packet; freed by caller */
+ } else if (IGMP_V1_SEEN(in_dev)) {
+ /* This is a v3 query with v1 queriers present */
+ max_delay = IGMP_Query_Response_Interval;
+ group = 0;
+ } else if (IGMP_V2_SEEN(in_dev)) {
+ /* this is a v3 query with v2 queriers present;
+ * Interpretation of the max_delay code is problematic here.
+ * A real v2 host would use ih_code directly, while v3 has a
+ * different encoding. We use the v3 encoding as more likely
+ * to be intended in a v3 query.
+ */
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
} else { /* v3 */
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
- return;
+ return true;
ih3 = igmpv3_query_hdr(skb);
if (ih3->nsrcs) {
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ ntohs(ih3->nsrcs)*sizeof(__be32)))
- return;
+ return true;
ih3 = igmpv3_query_hdr(skb);
}
@@ -883,9 +932,9 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
in_dev->mr_qrv = ih3->qrv;
if (!group) { /* general query */
if (ih3->nsrcs)
- return; /* no sources allowed */
+ return false; /* no sources allowed */
igmp_gq_start_timer(in_dev);
- return;
+ return false;
}
/* mark sources to include, if group & source-specific */
mark = ih3->nsrcs != 0;
@@ -901,8 +950,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
* - Use the igmp->igmp_code field as the maximum
* delay possible
*/
- read_lock(&in_dev->mc_list_lock);
- for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
int changed;
if (group && group != im->multiaddr)
@@ -920,56 +969,48 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (changed)
igmp_mod_timer(im, max_delay);
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
+ return false;
}
+/* called in rcu_read_lock() section */
int igmp_rcv(struct sk_buff *skb)
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih;
- struct in_device *in_dev = in_dev_get(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
int len = skb->len;
+ bool dropped = true;
- if (in_dev==NULL) {
- kfree_skb(skb);
- return 0;
- }
+ if (in_dev == NULL)
+ goto drop;
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
goto drop;
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (!csum_fold(skb->csum))
- break;
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = 0;
- if (__skb_checksum_complete(skb))
- goto drop;
- }
+ if (skb_checksum_simple_validate(skb))
+ goto drop;
ih = igmp_hdr(skb);
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_QUERY:
- igmp_heard_query(in_dev, skb, len);
+ dropped = igmp_heard_query(in_dev, skb, len);
break;
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
- case IGMPV3_HOST_MEMBERSHIP_REPORT:
/* Is it our report looped back? */
- if (((struct rtable*)skb->dst)->fl.iif == 0)
+ if (rt_is_output_route(skb_rtable(skb)))
break;
/* don't rely on MC router hearing unicast reports */
if (skb->pkt_type == PACKET_MULTICAST ||
skb->pkt_type == PACKET_BROADCAST)
- igmp_heard_report(in_dev, ih->group);
+ dropped = igmp_heard_report(in_dev, ih->group);
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
- in_dev_put(in_dev);
return pim_rcv_v1(skb);
#endif
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
case IGMP_DVMRP:
case IGMP_TRACE:
case IGMP_HOST_LEAVE_MESSAGE:
@@ -981,8 +1022,10 @@ int igmp_rcv(struct sk_buff *skb)
}
drop:
- in_dev_put(in_dev);
- kfree_skb(skb);
+ if (dropped)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
return 0;
}
@@ -1000,13 +1043,13 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
We will get multicast token leakage, when IFF_MULTICAST
- is changed. This check should be done in dev->set_multicast_list
+ is changed. This check should be done in ndo_set_rx_mode
routine. Something sort of:
if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
--ANK
*/
if (arp_mc_map(addr, buf, dev, 0) == 0)
- dev_mc_add(dev,buf,dev->addr_len,0);
+ dev_mc_add(dev, buf);
}
/*
@@ -1019,7 +1062,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
struct net_device *dev = in_dev->dev;
if (arp_mc_map(addr, buf, dev, 0) == 0)
- dev_mc_delete(dev,buf,dev->addr_len,0);
+ dev_mc_del(dev, buf);
}
#ifdef CONFIG_IP_MULTICAST
@@ -1052,7 +1095,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->tomb = im->tomb;
pmc->sources = im->sources;
im->tomb = im->sources = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = pmc->crcount;
}
spin_unlock_bh(&im->lock);
@@ -1070,7 +1113,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc_prev = NULL;
- for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
if (pmc->multiaddr == multiaddr)
break;
pmc_prev = pmc;
@@ -1083,7 +1126,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
if (pmc) {
- for (psf=pmc->tomb; psf; psf=psf_next) {
+ for (psf = pmc->tomb; psf; psf = psf_next) {
psf_next = psf->sf_next;
kfree(psf);
}
@@ -1108,20 +1151,20 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
kfree(pmc);
}
/* clear dead sources, too */
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
struct ip_sf_list *psf, *psf_next;
spin_lock_bh(&pmc->lock);
psf = pmc->tomb;
pmc->tomb = NULL;
spin_unlock_bh(&pmc->lock);
- for (; psf; psf=psf_next) {
+ for (; psf; psf = psf_next) {
psf_next = psf->sf_next;
kfree(psf);
}
}
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
}
#endif
@@ -1146,20 +1189,18 @@ static void igmp_group_dropped(struct ip_mc_list *im)
if (!in_dev->dead) {
if (IGMP_V1_SEEN(in_dev))
- goto done;
+ return;
if (IGMP_V2_SEEN(in_dev)) {
if (reporter)
igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
- goto done;
+ return;
}
/* IGMPv3 */
igmpv3_add_delrec(in_dev, im);
igmp_ifc_event(in_dev);
}
-done:
#endif
- ip_mc_clear_src(im);
}
static void igmp_group_added(struct ip_mc_list *im)
@@ -1196,6 +1237,57 @@ static void igmp_group_added(struct ip_mc_list *im)
* Multicast list managers
*/
+static u32 ip_mc_hash(const struct ip_mc_list *im)
+{
+ return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
+}
+
+static void ip_mc_hash_add(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash;
+ u32 hash;
+
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ rcu_assign_pointer(mc_hash[hash], im);
+ return;
+ }
+
+ /* do not use a hash table for small number of items */
+ if (in_dev->mc_count < 4)
+ return;
+
+ mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
+ GFP_KERNEL);
+ if (!mc_hash)
+ return;
+
+ for_each_pmc_rtnl(in_dev, im) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ RCU_INIT_POINTER(mc_hash[hash], im);
+ }
+
+ rcu_assign_pointer(in_dev->mc_hash, mc_hash);
+}
+
+static void ip_mc_hash_remove(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
+ struct ip_mc_list *aux;
+
+ if (!mc_hash)
+ return;
+ mc_hash += ip_mc_hash(im);
+ while ((aux = rtnl_dereference(*mc_hash)) != im)
+ mc_hash = &aux->next_hash;
+ *mc_hash = im->next_hash;
+}
+
/*
* A socket has joined a multicast group on device dev.
@@ -1207,7 +1299,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
ASSERT_RTNL();
- for (im=in_dev->mc_list; im; im=im->next) {
+ for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == addr) {
im->users++;
ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
@@ -1215,37 +1307,30 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
}
}
- im = kmalloc(sizeof(*im), GFP_KERNEL);
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
if (!im)
goto out;
- im->users=1;
- im->interface=in_dev;
+ im->users = 1;
+ im->interface = in_dev;
in_dev_hold(in_dev);
- im->multiaddr=addr;
+ im->multiaddr = addr;
/* initial mode is (EX, empty) */
im->sfmode = MCAST_EXCLUDE;
- im->sfcount[MCAST_INCLUDE] = 0;
im->sfcount[MCAST_EXCLUDE] = 1;
- im->sources = NULL;
- im->tomb = NULL;
- im->crcount = 0;
atomic_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
- im->tm_running=0;
- init_timer(&im->timer);
- im->timer.data=(unsigned long)im;
- im->timer.function=&igmp_timer_expire;
+ setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
im->unsolicit_count = IGMP_Unsolicited_Report_Count;
- im->reporter = 0;
- im->gsquery = 0;
#endif
- im->loaded = 0;
- write_lock_bh(&in_dev->mc_list_lock);
- im->next=in_dev->mc_list;
- in_dev->mc_list=im;
- write_unlock_bh(&in_dev->mc_list_lock);
+
+ im->next_rcu = in_dev->mc_list;
+ in_dev->mc_count++;
+ rcu_assign_pointer(in_dev->mc_list, im);
+
+ ip_mc_hash_add(in_dev, im);
+
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, im->multiaddr);
#endif
@@ -1255,26 +1340,34 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
out:
return;
}
+EXPORT_SYMBOL(ip_mc_inc_group);
/*
- * Resend IGMP JOIN report; used for bonding.
+ * Resend IGMP JOIN report; used by netdev notifier.
*/
-void ip_mc_rejoin_group(struct ip_mc_list *im)
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
{
#ifdef CONFIG_IP_MULTICAST
- struct in_device *in_dev = im->interface;
+ struct ip_mc_list *im;
+ int type;
- if (im->multiaddr == IGMP_ALL_HOSTS)
- return;
+ ASSERT_RTNL();
- if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
- igmp_mod_timer(im, IGMP_Initial_Report_Delay);
- return;
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+
+ /* a failover is happening and switches
+ * must be notified immediately
+ */
+ if (IGMP_V1_SEEN(in_dev))
+ type = IGMP_HOST_MEMBERSHIP_REPORT;
+ else if (IGMP_V2_SEEN(in_dev))
+ type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+ else
+ type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ igmp_send_report(in_dev, im, type);
}
- /* else, v3 */
- im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
- igmp_ifc_event(in_dev);
#endif
}
@@ -1284,17 +1377,21 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
{
- struct ip_mc_list *i, **ip;
+ struct ip_mc_list *i;
+ struct ip_mc_list __rcu **ip;
ASSERT_RTNL();
- for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
- if (i->multiaddr==addr) {
+ for (ip = &in_dev->mc_list;
+ (i = rtnl_dereference(*ip)) != NULL;
+ ip = &i->next_rcu) {
+ if (i->multiaddr == addr) {
if (--i->users == 0) {
- write_lock_bh(&in_dev->mc_list_lock);
- *ip = i->next;
- write_unlock_bh(&in_dev->mc_list_lock);
+ ip_mc_hash_remove(in_dev, i);
+ *ip = i->next_rcu;
+ in_dev->mc_count--;
igmp_group_dropped(i);
+ ip_mc_clear_src(i);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
@@ -1306,17 +1403,40 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
}
}
}
+EXPORT_SYMBOL(ip_mc_dec_group);
+
+/* Device changing type */
+
+void ip_mc_unmap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
+}
+
+void ip_mc_remap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_added(pmc);
+}
/* Device going down */
void ip_mc_down(struct in_device *in_dev)
{
- struct ip_mc_list *i;
+ struct ip_mc_list *pmc;
ASSERT_RTNL();
- for (i=in_dev->mc_list; i; i=i->next)
- igmp_group_dropped(i);
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
#ifdef CONFIG_IP_MULTICAST
in_dev->mr_ifc_count = 0;
@@ -1335,20 +1455,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
{
ASSERT_RTNL();
- in_dev->mc_tomb = NULL;
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_gq_running = 0;
- init_timer(&in_dev->mr_gq_timer);
- in_dev->mr_gq_timer.data=(unsigned long) in_dev;
- in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
- in_dev->mr_ifc_count = 0;
- init_timer(&in_dev->mr_ifc_timer);
- in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
- in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
+ setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
+ (unsigned long)in_dev);
+ setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
+ (unsigned long)in_dev);
in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
#endif
- rwlock_init(&in_dev->mc_list_lock);
spin_lock_init(&in_dev->mc_tomb_lock);
}
@@ -1356,14 +1470,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
void ip_mc_up(struct in_device *in_dev)
{
- struct ip_mc_list *i;
+ struct ip_mc_list *pmc;
ASSERT_RTNL();
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
- for (i=in_dev->mc_list; i; i=i->next)
- igmp_group_added(i);
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_added(pmc);
}
/*
@@ -1379,43 +1493,40 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
/* Deactivate timers */
ip_mc_down(in_dev);
- write_lock_bh(&in_dev->mc_list_lock);
- while ((i = in_dev->mc_list) != NULL) {
- in_dev->mc_list = i->next;
- write_unlock_bh(&in_dev->mc_list_lock);
+ while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
+ in_dev->mc_list = i->next_rcu;
+ in_dev->mc_count--;
- igmp_group_dropped(i);
+ /* We've dropped the groups in ip_mc_down already */
+ ip_mc_clear_src(i);
ip_ma_put(i);
-
- write_lock_bh(&in_dev->mc_list_lock);
}
- write_unlock_bh(&in_dev->mc_list_lock);
}
-static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
+/* RTNL is locked */
+static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = imr->imr_multiaddr.s_addr } } };
- struct rtable *rt;
struct net_device *dev = NULL;
struct in_device *idev = NULL;
if (imr->imr_ifindex) {
- idev = inetdev_by_index(imr->imr_ifindex);
- if (idev)
- __in_dev_put(idev);
+ idev = inetdev_by_index(net, imr->imr_ifindex);
return idev;
}
if (imr->imr_address.s_addr) {
- dev = ip_dev_find(imr->imr_address.s_addr);
+ dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
if (!dev)
return NULL;
- dev_put(dev);
}
- if (!dev && !ip_route_output_key(&rt, &fl)) {
- dev = rt->u.dst.dev;
- ip_rt_put(rt);
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net,
+ imr->imr_multiaddr.s_addr,
+ 0, 0, 0);
+ if (!IS_ERR(rt)) {
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
@@ -1438,7 +1549,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
int rv = 0;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1489,18 +1600,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!in_dev)
return -ENODEV;
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
return -ESRCH;
}
spin_lock_bh(&pmc->lock);
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
sf_markstate(pmc);
#endif
@@ -1511,7 +1622,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->sfcount[sfmode]--;
}
err = 0;
- for (i=0; i<sfcount; i++) {
+ for (i = 0; i < sfcount; i++) {
int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
changerec |= rv > 0;
@@ -1531,7 +1642,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
IGMP_Unsolicited_Report_Count;
in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(pmc->interface);
} else if (sf_setstate(pmc) || changerec) {
@@ -1547,12 +1658,12 @@ out_unlock:
* Add multicast single-source filter to the interface list
*/
static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
- __be32 *psfsrc, int delta)
+ __be32 *psfsrc)
{
struct ip_sf_list *psf, *psf_prev;
psf_prev = NULL;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
@@ -1580,7 +1691,7 @@ static void sf_markstate(struct ip_mc_list *pmc)
struct ip_sf_list *psf;
int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
- for (psf=pmc->sources; psf; psf=psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
if (pmc->sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
@@ -1597,7 +1708,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
int new_in, rv;
rv = 0;
- for (psf=pmc->sources; psf; psf=psf->sf_next) {
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (pmc->sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
@@ -1607,7 +1718,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
if (!psf->sf_oldin) {
struct ip_sf_list *prev = NULL;
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) {
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
prev = dpsf;
@@ -1629,12 +1740,11 @@ static int sf_setstate(struct ip_mc_list *pmc)
* add or update "delete" records if an active filter
* is now inactive
*/
- for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next)
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
if (!dpsf) {
- dpsf = (struct ip_sf_list *)
- kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
if (!dpsf)
continue;
*dpsf = *psf;
@@ -1662,18 +1772,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!in_dev)
return -ENODEV;
- read_lock(&in_dev->mc_list_lock);
- for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
return -ESRCH;
}
spin_lock_bh(&pmc->lock);
- read_unlock(&in_dev->mc_list_lock);
+ rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
sf_markstate(pmc);
@@ -1682,17 +1792,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (!delta)
pmc->sfcount[sfmode]++;
err = 0;
- for (i=0; i<sfcount; i++) {
- err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+ for (i = 0; i < sfcount; i++) {
+ err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
if (err)
break;
}
if (err) {
int j;
- pmc->sfcount[sfmode]--;
- for (j=0; j<i; j++)
- (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+ if (!delta)
+ pmc->sfcount[sfmode]--;
+ for (j = 0; j < i; j++)
+ (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
@@ -1710,7 +1821,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
IGMP_Unsolicited_Report_Count;
in_dev->mr_ifc_count = pmc->crcount;
- for (psf=pmc->sources; psf; psf = psf->sf_next)
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(in_dev);
} else if (sf_setstate(pmc)) {
@@ -1725,12 +1836,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
struct ip_sf_list *psf, *nextpsf;
- for (psf=pmc->tomb; psf; psf=nextpsf) {
+ for (psf = pmc->tomb; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
pmc->tomb = NULL;
- for (psf=pmc->sources; psf; psf=nextpsf) {
+ for (psf = pmc->sources; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
@@ -1748,18 +1859,19 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
{
int err;
__be32 addr = imr->imr_multiaddr.s_addr;
- struct ip_mc_socklist *iml=NULL, *i;
+ struct ip_mc_socklist *iml = NULL, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
int ifindex;
int count = 0;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
- in_dev = ip_mc_find_dev(imr);
+ in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
iml = NULL;
@@ -1769,7 +1881,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
err = -EADDRINUSE;
ifindex = imr->imr_ifindex;
- for (i = inet->mc_list; i; i = i->next) {
+ for_each_pmc_rtnl(inet, i) {
if (i->multi.imr_multiaddr.s_addr == addr &&
i->multi.imr_ifindex == ifindex)
goto done;
@@ -1778,37 +1890,40 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
err = -ENOBUFS;
if (count >= sysctl_igmp_max_memberships)
goto done;
- iml = sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
+ iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
if (iml == NULL)
goto done;
memcpy(&iml->multi, imr, sizeof(*imr));
- iml->next = inet->mc_list;
+ iml->next_rcu = inet->mc_list;
iml->sflist = NULL;
iml->sfmode = MCAST_EXCLUDE;
- inet->mc_list = iml;
+ rcu_assign_pointer(inet->mc_list, iml);
ip_mc_inc_group(in_dev, addr);
err = 0;
done:
rtnl_unlock();
return err;
}
+EXPORT_SYMBOL(ip_mc_join_group);
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
+ struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
int err;
- if (iml->sflist == NULL) {
+ if (psf == NULL) {
/* any-source empty exclude case */
return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, 0, NULL, 0);
}
err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
- iml->sfmode, iml->sflist->sl_count,
- iml->sflist->sl_addr, 0);
- sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
- iml->sflist = NULL;
+ iml->sfmode, psf->sl_count, psf->sl_addr, 0);
+ RCU_INIT_POINTER(iml->sflist, NULL);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psf, rcu);
return err;
}
@@ -1819,16 +1934,24 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
- struct ip_mc_socklist *iml, **imlp;
+ struct ip_mc_socklist *iml;
+ struct ip_mc_socklist __rcu **imlp;
struct in_device *in_dev;
+ struct net *net = sock_net(sk);
__be32 group = imr->imr_multiaddr.s_addr;
u32 ifindex;
int ret = -EADDRNOTAVAIL;
rtnl_lock();
- in_dev = ip_mc_find_dev(imr);
+ in_dev = ip_mc_find_dev(net, imr);
+ if (!in_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
ifindex = imr->imr_ifindex;
- for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
+ for (imlp = &inet->mc_list;
+ (iml = rtnl_dereference(*imlp)) != NULL;
+ imlp = &iml->next_rcu) {
if (iml->multi.imr_multiaddr.s_addr != group)
continue;
if (ifindex) {
@@ -1840,19 +1963,20 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
(void) ip_mc_leave_src(sk, iml, in_dev);
- *imlp = iml->next;
+ *imlp = iml->next_rcu;
- if (in_dev)
- ip_mc_dec_group(in_dev, group);
+ ip_mc_dec_group(in_dev, group);
rtnl_unlock();
- sock_kfree_s(sk, iml, sizeof(*iml));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
return 0;
}
- if (!in_dev)
- ret = -ENODEV;
+out:
rtnl_unlock();
return ret;
}
+EXPORT_SYMBOL(ip_mc_leave_group);
int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mreq_source *mreqs, int ifindex)
@@ -1864,10 +1988,11 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
struct in_device *in_dev = NULL;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
int leavegroup = 0;
int i, j, rv;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
@@ -1875,7 +2000,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
imr.imr_address.s_addr = mreqs->imr_interface;
imr.imr_ifindex = ifindex;
- in_dev = ip_mc_find_dev(&imr);
+ in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
@@ -1883,9 +2008,10 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
err = -EADDRNOTAVAIL;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
- if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
- && pmc->multi.imr_ifindex == imr.imr_ifindex)
+ for_each_pmc_rtnl(inet, pmc) {
+ if ((pmc->multi.imr_multiaddr.s_addr ==
+ imr.imr_multiaddr.s_addr) &&
+ (pmc->multi.imr_ifindex == imr.imr_ifindex))
break;
}
if (!pmc) { /* must have a prior join */
@@ -1906,12 +2032,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
pmc->sfmode = omode;
}
- psl = pmc->sflist;
+ psl = rtnl_dereference(pmc->sflist);
if (!add) {
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -1930,7 +2056,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
- for (j=i+1; j<psl->sl_count; j++)
+ for (j = i+1; j < psl->sl_count; j++)
psl->sl_addr[j-1] = psl->sl_addr[j];
psl->sl_count--;
err = 0;
@@ -1956,14 +2082,17 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
newpsl->sl_max = count;
newpsl->sl_count = count - IP_SFBLOCK;
if (psl) {
- for (i=0; i<psl->sl_count; i++)
+ for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
- sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
}
- pmc->sflist = psl = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
@@ -1971,7 +2100,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
if (rv == 0) /* address already there is an error */
goto done;
- for (j=psl->sl_count-1; j>=i; j--)
+ for (j = psl->sl_count-1; j >= i; j--)
psl->sl_addr[j+1] = psl->sl_addr[j];
psl->sl_addr[i] = mreqs->imr_sourceaddr;
psl->sl_count++;
@@ -1995,9 +2124,10 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *newpsl, *psl;
+ struct net *net = sock_net(sk);
int leavegroup = 0;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
if (msf->imsf_fmode != MCAST_INCLUDE &&
msf->imsf_fmode != MCAST_EXCLUDE)
@@ -2008,7 +2138,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = ifindex;
- in_dev = ip_mc_find_dev(&imr);
+ in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
@@ -2021,7 +2151,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
goto done;
}
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
@@ -2051,15 +2181,17 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
(void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, 0, NULL, 0);
}
- psl = pmc->sflist;
+ psl = rtnl_dereference(pmc->sflist);
if (psl) {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
psl->sl_count, psl->sl_addr, 0);
- sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
} else
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
- pmc->sflist = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
@@ -2079,8 +2211,9 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
@@ -2088,7 +2221,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
- in_dev = ip_mc_find_dev(&imr);
+ in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
@@ -2096,7 +2229,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
}
err = -EADDRNOTAVAIL;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
@@ -2104,7 +2237,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
if (!pmc) /* must have a prior join */
goto done;
msf->imsf_fmode = pmc->sfmode;
- psl = pmc->sflist;
+ psl = rtnl_dereference(pmc->sflist);
rtnl_unlock();
if (!psl) {
len = 0;
@@ -2142,14 +2275,14 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (psin->sin_family != AF_INET)
return -EINVAL;
addr = psin->sin_addr.s_addr;
- if (!MULTICAST(addr))
+ if (!ipv4_is_multicast(addr))
return -EINVAL;
rtnl_lock();
err = -EADDRNOTAVAIL;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == addr &&
pmc->multi.imr_ifindex == gsf->gf_interface)
break;
@@ -2157,7 +2290,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (!pmc) /* must have a prior join */
goto done;
gsf->gf_fmode = pmc->sfmode;
- psl = pmc->sflist;
+ psl = rtnl_dereference(pmc->sflist);
rtnl_unlock();
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
@@ -2166,7 +2299,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
return -EFAULT;
}
- for (i=0; i<copycount; i++) {
+ for (i = 0; i < copycount; i++) {
struct sockaddr_storage ss;
psin = (struct sockaddr_in *)&ss;
@@ -2191,30 +2324,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
struct ip_mc_socklist *pmc;
struct ip_sf_socklist *psl;
int i;
+ int ret;
- if (!MULTICAST(loc_addr))
- return 1;
+ ret = 1;
+ if (!ipv4_is_multicast(loc_addr))
+ goto out;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for_each_pmc_rcu(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
pmc->multi.imr_ifindex == dif)
break;
}
+ ret = inet->mc_all;
if (!pmc)
- return 1;
- psl = pmc->sflist;
+ goto unlock;
+ psl = rcu_dereference(pmc->sflist);
+ ret = (pmc->sfmode == MCAST_EXCLUDE);
if (!psl)
- return pmc->sfmode == MCAST_EXCLUDE;
+ goto unlock;
- for (i=0; i<psl->sl_count; i++) {
+ for (i = 0; i < psl->sl_count; i++) {
if (psl->sl_addr[i] == rmt_addr)
break;
}
+ ret = 0;
if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
- return 0;
+ goto unlock;
if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
- return 0;
- return 1;
+ goto unlock;
+ ret = 1;
+unlock:
+ rcu_read_unlock();
+out:
+ return ret;
}
/*
@@ -2225,42 +2368,56 @@ void ip_mc_drop_socket(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml;
+ struct net *net = sock_net(sk);
if (inet->mc_list == NULL)
return;
rtnl_lock();
- while ((iml = inet->mc_list) != NULL) {
+ while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
struct in_device *in_dev;
- inet->mc_list = iml->next;
- in_dev = inetdev_by_index(iml->multi.imr_ifindex);
+ inet->mc_list = iml->next_rcu;
+ in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
(void) ip_mc_leave_src(sk, iml, in_dev);
- if (in_dev != NULL) {
+ if (in_dev != NULL)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
- in_dev_put(in_dev);
- }
- sock_kfree_s(sk, iml, sizeof(*iml));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
}
rtnl_unlock();
}
-int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+/* called with rcu_read_lock() */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
{
struct ip_mc_list *im;
+ struct ip_mc_list __rcu **mc_hash;
struct ip_sf_list *psf;
int rv = 0;
- read_lock(&in_dev->mc_list_lock);
- for (im=in_dev->mc_list; im; im=im->next) {
- if (im->multiaddr == mc_addr)
- break;
+ mc_hash = rcu_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
+
+ for (im = rcu_dereference(mc_hash[hash]);
+ im != NULL;
+ im = rcu_dereference(im->next_hash)) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ } else {
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
}
if (im && proto == IPPROTO_IGMP) {
rv = 1;
} else if (im) {
if (src_addr) {
- for (psf=im->sources; psf; psf=psf->sf_next) {
+ for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
}
@@ -2273,12 +2430,12 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
} else
rv = 1; /* unspecified source; tentatively allow */
}
- read_unlock(&in_dev->mc_list_lock);
return rv;
}
#if defined(CONFIG_PROC_FS)
struct igmp_mc_iter_state {
+ struct seq_net_private p;
struct net_device *dev;
struct in_device *in_dev;
};
@@ -2287,23 +2444,22 @@ struct igmp_mc_iter_state {
static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
{
+ struct net *net = seq_file_net(seq);
struct ip_mc_list *im = NULL;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
state->in_dev = NULL;
- for_each_netdev(&init_net, state->dev) {
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *in_dev;
- in_dev = in_dev_get(state->dev);
+
+ in_dev = __in_dev_get_rcu(state->dev);
if (!in_dev)
continue;
- read_lock(&in_dev->mc_list_lock);
- im = in_dev->mc_list;
+ im = rcu_dereference(in_dev->mc_list);
if (im) {
state->in_dev = in_dev;
break;
}
- read_unlock(&in_dev->mc_list_lock);
- in_dev_put(in_dev);
}
return im;
}
@@ -2311,22 +2467,18 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
- im = im->next;
+
+ im = rcu_dereference(im->next_rcu);
while (!im) {
- if (likely(state->in_dev != NULL)) {
- read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
- }
- state->dev = next_net_device(state->dev);
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->in_dev = NULL;
break;
}
- state->in_dev = in_dev_get(state->dev);
+ state->in_dev = __in_dev_get_rcu(state->dev);
if (!state->in_dev)
continue;
- read_lock(&state->in_dev->mc_list_lock);
- im = state->in_dev->mc_list;
+ im = rcu_dereference(state->in_dev->mc_list);
}
return im;
}
@@ -2341,8 +2493,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2358,15 +2511,13 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
- if (likely(state->in_dev != NULL)) {
- read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
- state->in_dev = NULL;
- }
+
+ state->in_dev = NULL;
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mc_seq_show(struct seq_file *seq, void *v)
@@ -2378,6 +2529,8 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
struct ip_mc_list *im = (struct ip_mc_list *)v;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
char *querier;
+ long delta;
+
#ifdef CONFIG_IP_MULTICAST
querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2386,16 +2539,17 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
querier = "NONE";
#endif
- if (state->in_dev->mc_list == im) {
+ if (rcu_dereference(state->in_dev->mc_list) == im) {
seq_printf(seq, "%d\t%-10s: %5d %7s\n",
- state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+ state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
}
+ delta = im->timer.expires - jiffies;
seq_printf(seq,
"\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
im->multiaddr, im->users,
- im->tm_running, im->tm_running ?
- jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+ im->tm_running,
+ im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
im->reporter);
}
return 0;
@@ -2410,7 +2564,7 @@ static const struct seq_operations igmp_mc_seq_ops = {
static int igmp_mc_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &igmp_mc_seq_ops,
+ return seq_open_net(inode, file, &igmp_mc_seq_ops,
sizeof(struct igmp_mc_iter_state));
}
@@ -2419,10 +2573,11 @@ static const struct file_operations igmp_mc_seq_fops = {
.open = igmp_mc_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
struct igmp_mcf_iter_state {
+ struct seq_net_private p;
struct net_device *dev;
struct in_device *idev;
struct ip_mc_list *im;
@@ -2432,19 +2587,19 @@ struct igmp_mcf_iter_state {
static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
{
+ struct net *net = seq_file_net(seq);
struct ip_sf_list *psf = NULL;
struct ip_mc_list *im = NULL;
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
state->idev = NULL;
state->im = NULL;
- for_each_netdev(&init_net, state->dev) {
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *idev;
- idev = in_dev_get(state->dev);
+ idev = __in_dev_get_rcu(state->dev);
if (unlikely(idev == NULL))
continue;
- read_lock(&idev->mc_list_lock);
- im = idev->mc_list;
+ im = rcu_dereference(idev->mc_list);
if (likely(im != NULL)) {
spin_lock_bh(&im->lock);
psf = im->sources;
@@ -2455,8 +2610,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
}
spin_unlock_bh(&im->lock);
}
- read_unlock(&idev->mc_list_lock);
- in_dev_put(idev);
}
return psf;
}
@@ -2470,20 +2623,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
spin_unlock_bh(&state->im->lock);
state->im = state->im->next;
while (!state->im) {
- if (likely(state->idev != NULL)) {
- read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
- }
- state->dev = next_net_device(state->dev);
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
goto out;
}
- state->idev = in_dev_get(state->dev);
+ state->idev = __in_dev_get_rcu(state->dev);
if (!state->idev)
continue;
- read_lock(&state->idev->mc_list_lock);
- state->im = state->idev->mc_list;
+ state->im = rcu_dereference(state->idev->mc_list);
}
if (!state->im)
break;
@@ -2504,8 +2652,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2521,19 +2670,16 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (likely(state->im != NULL)) {
spin_unlock_bh(&state->im->lock);
state->im = NULL;
}
- if (likely(state->idev != NULL)) {
- read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
- state->idev = NULL;
- }
+ state->idev = NULL;
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
@@ -2569,7 +2715,7 @@ static const struct seq_operations igmp_mcf_seq_ops = {
static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &igmp_mcf_seq_ops,
+ return seq_open_net(inode, file, &igmp_mcf_seq_ops,
sizeof(struct igmp_mcf_iter_state));
}
@@ -2578,18 +2724,79 @@ static const struct file_operations igmp_mcf_seq_fops = {
.open = igmp_mcf_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
-int __init igmp_mc_proc_init(void)
+static int __net_init igmp_net_init(struct net *net)
{
- proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops);
- proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
+ if (!pde)
+ goto out_igmp;
+ pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+ &igmp_mcf_seq_fops);
+ if (!pde)
+ goto out_mcfilter;
return 0;
+
+out_mcfilter:
+ remove_proc_entry("igmp", net->proc_net);
+out_igmp:
+ return -ENOMEM;
+}
+
+static void __net_exit igmp_net_exit(struct net *net)
+{
+ remove_proc_entry("mcfilter", net->proc_net);
+ remove_proc_entry("igmp", net->proc_net);
}
+
+static struct pernet_operations igmp_net_ops = {
+ .init = igmp_net_init,
+ .exit = igmp_net_exit,
+};
#endif
-EXPORT_SYMBOL(ip_mc_dec_group);
-EXPORT_SYMBOL(ip_mc_inc_group);
-EXPORT_SYMBOL(ip_mc_join_group);
-EXPORT_SYMBOL(ip_mc_rejoin_group);
+static int igmp_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+
+ switch (event) {
+ case NETDEV_RESEND_IGMP:
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev)
+ ip_mc_rejoin_groups(in_dev);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+ .notifier_call = igmp_netdev_event,
+};
+
+int __init igmp_mc_init(void)
+{
+#if defined(CONFIG_PROC_FS)
+ int err;
+
+ err = register_pernet_subsys(&igmp_net_ops);
+ if (err)
+ return err;
+ err = register_netdevice_notifier(&igmp_notifier);
+ if (err)
+ goto reg_notif_fail;
+ return 0;
+
+reg_notif_fail:
+ unregister_pernet_subsys(&igmp_net_ops);
+ return err;
+#else
+ return register_netdevice_notifier(&igmp_notifier);
+#endif
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3cef12835c4..14d02ea905b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -29,82 +29,119 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif
-/*
- * This array holds the first and last local port number.
- */
-int sysctl_local_port_range[2] = { 32768, 61000 };
-DEFINE_SEQLOCK(sysctl_port_range_lock);
-
-void inet_get_local_port_range(int *low, int *high)
+void inet_get_local_port_range(struct net *net, int *low, int *high)
{
- unsigned seq;
+ unsigned int seq;
+
do {
- seq = read_seqbegin(&sysctl_port_range_lock);
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
- *low = sysctl_local_port_range[0];
- *high = sysctl_local_port_range[1];
- } while (read_seqretry(&sysctl_port_range_lock, seq));
+ *low = net->ipv4.ip_local_ports.range[0];
+ *high = net->ipv4.ip_local_ports.range[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);
int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb)
+ const struct inet_bind_bucket *tb, bool relax)
{
- const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
struct sock *sk2;
- struct hlist_node *node;
int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
+
+ /*
+ * Unlike other sk lookup places we do not check
+ * for sk_net here, since _all_ the socks listed
+ * in tb->owners list belong to the same net - the
+ * one this bucket belongs to.
+ */
- sk_for_each_bound(sk2, node, &tb->owners) {
+ sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
!inet_v6_ipv6only(sk2) &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
- const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr ||
- sk2_rcv_saddr == sk_rcv_saddr)
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2))))) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ break;
+ }
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
break;
}
}
}
- return node != NULL;
+ return sk2 != NULL;
}
-
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
*/
-int inet_csk_get_port(struct inet_hashinfo *hashinfo,
- struct sock *sk, unsigned short snum,
- int (*bind_conflict)(const struct sock *sk,
- const struct inet_bind_bucket *tb))
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_bind_hashbucket *head;
- struct hlist_node *node;
struct inet_bind_bucket *tb;
- int ret;
+ int ret, attempts = 5;
+ struct net *net = sock_net(sk);
+ int smallest_size = -1, smallest_rover;
+ kuid_t uid = sock_i_uid(sk);
local_bh_disable();
if (!snum) {
int remaining, rover, low, high;
- inet_get_local_port_range(&low, &high);
- remaining = high - low;
- rover = net_random() % remaining + low;
+again:
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+ smallest_rover = rover = prandom_u32() % remaining + low;
+ smallest_size = -1;
do {
- head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
+ if (inet_is_local_reserved_port(net, rover))
+ goto next_nolock;
+ head = &hashinfo->bhash[inet_bhashfn(net, rover,
+ hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == rover)
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == rover) {
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ uid_eq(tb->fastuid, uid))) &&
+ (tb->num_owners < smallest_size || smallest_size == -1)) {
+ smallest_size = tb->num_owners;
+ smallest_rover = rover;
+ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+ !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+ snum = smallest_rover;
+ goto tb_found;
+ }
+ }
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+ snum = rover;
+ goto tb_found;
+ }
goto next;
+ }
break;
next:
spin_unlock(&head->lock);
+ next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0);
@@ -116,51 +153,81 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
* the top level, not from the 'break;' statement.
*/
ret = 1;
- if (remaining <= 0)
+ if (remaining <= 0) {
+ if (smallest_size != -1) {
+ snum = smallest_rover;
+ goto have_snum;
+ }
goto fail;
-
+ }
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover;
} else {
- head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
+have_snum:
+ head = &hashinfo->bhash[inet_bhashfn(net, snum,
+ hashinfo->bhash_size)];
spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->port == snum)
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
}
tb = NULL;
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) {
- if (sk->sk_reuse > 1)
+ if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+ smallest_size == -1) {
goto success;
} else {
ret = 1;
- if (bind_conflict(sk, tb))
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+ smallest_size != -1 && --attempts >= 0) {
+ spin_unlock(&head->lock);
+ goto again;
+ }
+
goto fail_unlock;
+ }
}
}
tb_not_found:
ret = 1;
- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
+ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
+ net, head, snum)) == NULL)
goto fail_unlock;
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1;
else
tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else
+ tb->fastreuseport = 0;
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ tb->fastreuseport = 0;
+ }
success:
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, snum);
- BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
+ WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
@@ -169,7 +236,6 @@ fail:
local_bh_enable();
return ret;
}
-
EXPORT_SYMBOL_GPL(inet_csk_get_port);
/*
@@ -197,7 +263,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
* having to remove and re-insert us on the wait queue.
*/
for (;;) {
- prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
@@ -216,7 +282,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
if (!timeo)
break;
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return err;
}
@@ -226,7 +292,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct sock *newsk;
+ struct request_sock *req;
int error;
lock_sock(sk);
@@ -239,7 +307,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
goto out_err;
/* Find already established connection */
- if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+ if (reqsk_queue_empty(queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
@@ -251,18 +319,35 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
}
-
- newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
- BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
+ req = reqsk_queue_remove(queue);
+ newsk = req->sk;
+
+ sk_acceptq_removed(sk);
+ if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+ spin_lock_bh(&queue->fastopenq->lock);
+ if (tcp_rsk(req)->listener) {
+ /* We are still waiting for the final ACK from 3WHS
+ * so can't free req now. Instead, we set req->sk to
+ * NULL to signify that the child socket is taken
+ * so reqsk_fastopen_remove() will free the req
+ * when 3WHS finishes (or is aborted).
+ */
+ req->sk = NULL;
+ req = NULL;
+ }
+ spin_unlock_bh(&queue->fastopenq->lock);
+ }
out:
release_sock(sk);
+ if (req)
+ __reqsk_free(req);
return newsk;
out_err:
newsk = NULL;
+ req = NULL;
*err = error;
goto out;
}
-
EXPORT_SYMBOL(inet_csk_accept);
/*
@@ -277,21 +362,13 @@ void inet_csk_init_xmit_timers(struct sock *sk,
{
struct inet_connection_sock *icsk = inet_csk(sk);
- init_timer(&icsk->icsk_retransmit_timer);
- init_timer(&icsk->icsk_delack_timer);
- init_timer(&sk->sk_timer);
-
- icsk->icsk_retransmit_timer.function = retransmit_handler;
- icsk->icsk_delack_timer.function = delack_handler;
- sk->sk_timer.function = keepalive_handler;
-
- icsk->icsk_retransmit_timer.data =
- icsk->icsk_delack_timer.data =
- sk->sk_timer.data = (unsigned long)sk;
-
+ setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
+ (unsigned long)sk);
+ setup_timer(&icsk->icsk_delack_timer, delack_handler,
+ (unsigned long)sk);
+ setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
-
EXPORT_SYMBOL(inet_csk_init_xmit_timers);
void inet_csk_clear_xmit_timers(struct sock *sk)
@@ -304,63 +381,97 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
sk_stop_timer(sk, &icsk->icsk_delack_timer);
sk_stop_timer(sk, &sk->sk_timer);
}
-
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
sk_stop_timer(sk, &sk->sk_timer);
}
-
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
-
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
-struct dst_entry* inet_csk_route_req(struct sock *sk,
+struct dst_entry *inet_csk_route_req(struct sock *sk,
+ struct flowi4 *fl4,
const struct request_sock *req)
{
struct rtable *rt;
const struct inet_request_sock *ireq = inet_rsk(req);
- struct ip_options *opt = inet_rsk(req)->opt;
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = ((opt && opt->srr) ?
- opt->faddr :
- ireq->rmt_addr),
- .saddr = ireq->loc_addr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = inet_sk(sk)->sport,
- .dport = ireq->rmt_port } } };
-
- security_req_classify_flow(req, &fl);
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
- ip_rt_put(rt);
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- return &rt->u.dst;
+ struct ip_options_rcu *opt = inet_rsk(req)->opt;
+ struct net *net = sock_net(sk);
+ int flags = inet_sk_flowi_flags(sk);
+
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol,
+ flags,
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
}
-
EXPORT_SYMBOL_GPL(inet_csk_route_req);
+struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+ struct sock *newsk,
+ const struct request_sock *req)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_sock *newinet = inet_sk(newsk);
+ struct ip_options_rcu *opt;
+ struct net *net = sock_net(sk);
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ fl4 = &newinet->cork.fl.u.ip4;
+
+ rcu_read_lock();
+ opt = rcu_dereference(newinet->inet_opt);
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ rcu_read_unlock();
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ rcu_read_unlock();
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
const u32 rnd, const u32 synq_hsize)
{
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) 1
@@ -381,11 +492,11 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
prev = &req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
- if (ireq->rmt_port == rport &&
- ireq->rmt_addr == raddr &&
- ireq->loc_addr == laddr &&
+ if (ireq->ir_rmt_port == rport &&
+ ireq->ir_rmt_addr == raddr &&
+ ireq->ir_loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
- BUG_TRAP(!req->sk);
+ WARN_ON(req->sk);
*prevp = prev;
break;
}
@@ -393,7 +504,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
return req;
}
-
EXPORT_SYMBOL_GPL(inet_csk_search_req);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
@@ -401,17 +511,50 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+ const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
+ inet_rsk(req)->ir_rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
inet_csk_reqsk_queue_added(sk, timeout);
}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+/* Decide when to expire the request and when to resend SYN-ACK */
+static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
+ const int max_retries,
+ const u8 rskq_defer_accept,
+ int *expire, int *resend)
+{
+ if (!rskq_defer_accept) {
+ *expire = req->num_timeout >= thresh;
+ *resend = 1;
+ return;
+ }
+ *expire = req->num_timeout >= thresh &&
+ (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
+ /*
+ * Do not resend while waiting for data after ACK,
+ * start to resend on end of deferring period to give
+ * last chance for data or ACK to create established socket.
+ */
+ *resend = !inet_rsk(req)->acked ||
+ req->num_timeout >= rskq_defer_accept - 1;
+}
+
+int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+{
+ int err = req->rsk_ops->rtx_syn_ack(parent, req);
+
+ if (!err)
+ req->num_retrans++;
+ return err;
+}
+EXPORT_SYMBOL(inet_rtx_syn_ack);
void inet_csk_reqsk_queue_prune(struct sock *parent,
const unsigned long interval,
@@ -432,7 +575,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
+ * If synack was not acknowledged for 1 second, it means
* one of the following things: synack was lost, ack was lost,
* rtt is high or nobody planned to ack (i.e. synflood).
* When server is a bit loaded, queue is populated with old
@@ -468,14 +611,22 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
reqp=&lopt->syn_table[i];
while ((req = *reqp) != NULL) {
if (time_after_eq(now, req->expires)) {
- if ((req->retrans < thresh ||
- (inet_rsk(req)->acked && req->retrans < max_retries))
- && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+ int expire = 0, resend = 0;
+
+ syn_ack_recalc(req, thresh, max_retries,
+ queue->rskq_defer_accept,
+ &expire, &resend);
+ req->rsk_ops->syn_ack_timeout(parent, req);
+ if (!expire &&
+ (!resend ||
+ !inet_rtx_syn_ack(parent, req) ||
+ inet_rsk(req)->acked)) {
unsigned long timeo;
- if (req->retrans++ == 0)
+ if (req->num_timeout++ == 0)
lopt->qlen_young--;
- timeo = min((timeout << req->retrans), max_rto);
+ timeo = min(timeout << req->num_timeout,
+ max_rto);
req->expires = now + timeo;
reqp = &req->dl_next;
continue;
@@ -499,13 +650,21 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
if (lopt->qlen)
inet_csk_reset_keepalive_timer(parent, interval);
}
-
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
-struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
- const gfp_t priority)
+/**
+ * inet_csk_clone_lock - clone an inet socket, and lock its clone
+ * @sk: the socket to clone
+ * @req: request_sock
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+ const struct request_sock *req,
+ const gfp_t priority)
{
- struct sock *newsk = sk_clone(sk, priority);
+ struct sock *newsk = sk_clone_lock(sk, priority);
if (newsk != NULL) {
struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -513,9 +672,13 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
newsk->sk_state = TCP_SYN_RECV;
newicsk->icsk_bind_hash = NULL;
- inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+ inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
+ inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
+ inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0;
@@ -527,8 +690,7 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
}
return newsk;
}
-
-EXPORT_SYMBOL_GPL(inet_csk_clone);
+EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
/*
* At this point, there should be no process reference to this
@@ -538,14 +700,14 @@ EXPORT_SYMBOL_GPL(inet_csk_clone);
*/
void inet_csk_destroy_sock(struct sock *sk)
{
- BUG_TRAP(sk->sk_state == TCP_CLOSE);
- BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+ WARN_ON(sk->sk_state != TCP_CLOSE);
+ WARN_ON(!sock_flag(sk, SOCK_DEAD));
/* It cannot be in hash table! */
- BUG_TRAP(sk_unhashed(sk));
+ WARN_ON(!sk_unhashed(sk));
- /* If it has not 0 inet_sk(sk)->num, it must be bound */
- BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+ /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
sk->sk_prot->destroy(sk);
@@ -555,12 +717,28 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
- atomic_dec(sk->sk_prot->orphan_count);
+ percpu_counter_dec(sk->sk_prot->orphan_count);
sock_put(sk);
}
-
EXPORT_SYMBOL(inet_csk_destroy_sock);
+/* This function allows to force a closure of a socket after the call to
+ * tcp/dccp_create_openreq_child().
+ */
+void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ /* sk_clone_lock locked the socket and set refcnt to 2 */
+ bh_unlock_sock(sk);
+ sock_put(sk);
+
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ inet_sk(sk)->inet_num = 0;
+}
+EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
struct inet_sock *inet = inet_sk(sk);
@@ -580,8 +758,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
* after validation is complete.
*/
sk->sk_state = TCP_LISTEN;
- if (!sk->sk_prot->get_port(sk, inet->num)) {
- inet->sport = htons(inet->num);
+ if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+ inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
sk->sk_prot->hash(sk);
@@ -593,7 +771,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
__reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
-
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
/*
@@ -603,13 +780,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct request_sock *acc_req;
struct request_sock *req;
inet_csk_delete_keepalive_timer(sk);
/* make all the listen_opt local to us */
- acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+ acc_req = reqsk_queue_yank_acceptq(queue);
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
@@ -619,7 +797,7 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
- reqsk_queue_destroy(&icsk->icsk_accept_queue);
+ reqsk_queue_destroy(queue);
while ((req = acc_req) != NULL) {
struct sock *child = req->sk;
@@ -628,15 +806,28 @@ void inet_csk_listen_stop(struct sock *sk)
local_bh_disable();
bh_lock_sock(child);
- BUG_TRAP(!sock_owned_by_user(child));
+ WARN_ON(sock_owned_by_user(child));
sock_hold(child);
sk->sk_prot->disconnect(child, O_NONBLOCK);
sock_orphan(child);
- atomic_inc(sk->sk_prot->orphan_count);
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != tcp_rsk(req)->listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ sock_put(sk);
+ }
inet_csk_destroy_sock(child);
bh_unlock_sock(child);
@@ -646,9 +837,19 @@ void inet_csk_listen_stop(struct sock *sk)
sk_acceptq_removed(sk);
__reqsk_free(req);
}
- BUG_TRAP(!sk->sk_ack_backlog);
+ if (queue->fastopenq != NULL) {
+ /* Free all the reqs queued in rskq_rst_head. */
+ spin_lock_bh(&queue->fastopenq->lock);
+ acc_req = queue->fastopenq->rskq_rst_head;
+ queue->fastopenq->rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq->lock);
+ while ((req = acc_req) != NULL) {
+ acc_req = req->dl_next;
+ __reqsk_free(req);
+ }
+ }
+ WARN_ON(sk->sk_ack_backlog);
}
-
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
@@ -657,31 +858,11 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
const struct inet_sock *inet = inet_sk(sk);
sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = inet->daddr;
- sin->sin_port = inet->dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ sin->sin_port = inet->inet_dport;
}
-
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
-int inet_csk_ctl_sock_create(struct socket **sock, unsigned short family,
- unsigned short type, unsigned char protocol)
-{
- int rc = sock_create_kern(family, type, protocol, sock);
-
- if (rc == 0) {
- (*sock)->sk->sk_allocation = GFP_ATOMIC;
- inet_sk((*sock)->sk)->uc_ttl = -1;
- /*
- * Unhash it so that IP input processing does not even see it,
- * we do not wish this socket to see incoming packets.
- */
- (*sock)->sk->sk_prot->unhash((*sock)->sk);
- }
- return rc;
-}
-
-EXPORT_SYMBOL_GPL(inet_csk_ctl_sock_create);
-
#ifdef CONFIG_COMPAT
int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
@@ -694,11 +875,10 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
return icsk->icsk_af_ops->getsockopt(sk, level, optname,
optval, optlen);
}
-
EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -708,6 +888,51 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
return icsk->icsk_af_ops->setsockopt(sk, level, optname,
optval, optlen);
}
-
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif
+
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ fl4 = &fl->u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ rt = NULL;
+ if (rt)
+ sk_setup_caps(sk, &rt->dst);
+ rcu_read_unlock();
+
+ return &rt->dst;
+}
+
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!dst) {
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+ if (!dst)
+ goto out;
+ }
+ dst->ops->update_pmtu(dst, sk, NULL, mtu);
+
+ dst = __sk_dst_check(sk, 0);
+ if (!dst)
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+out:
+ return dst;
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7eb83ebed2e..e34dccbc4d7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1,8 +1,6 @@
/*
* inet_diag.c Module for monitoring INET transport protocols sockets.
*
- * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
@@ -16,6 +14,7 @@
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/time.h>
@@ -34,6 +33,7 @@
#include <linux/stddef.h>
#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
static const struct inet_diag_handler **inet_diag_table;
@@ -44,78 +44,127 @@ struct inet_diag_entry {
u16 dport;
u16 family;
u16 userlocks;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
+ struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
+#endif
};
-static struct sock *idiagnl;
+static DEFINE_MUTEX(inet_diag_table_mutex);
-#define INET_DIAG_PUT(skb, attrtype, attrlen) \
- RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
+{
+ if (!inet_diag_table[proto])
+ request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET, proto);
-static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+ mutex_lock(&inet_diag_table_mutex);
+ if (!inet_diag_table[proto])
+ return ERR_PTR(-ENOENT);
+
+ return inet_diag_table[proto];
+}
+
+static inline void inet_diag_unlock_handler(
+ const struct inet_diag_handler *handler)
+{
+ mutex_unlock(&inet_diag_table_mutex);
+}
+
+int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
+ struct sk_buff *skb, struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
+ struct nlattr *attr;
void *info = NULL;
- struct inet_diag_meminfo *minfo = NULL;
- unsigned char *b = skb_tail_pointer(skb);
const struct inet_diag_handler *handler;
+ int ext = req->idiag_ext;
- handler = inet_diag_table[unlh->nlmsg_type];
+ handler = inet_diag_table[req->sdiag_protocol];
BUG_ON(handler == NULL);
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = nlmsg_flags;
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
- r = NLMSG_DATA(nlh);
+ r = nlmsg_data(nlh);
BUG_ON(sk->sk_state == TCP_TIME_WAIT);
- if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
- minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
-
- if (ext & (1 << (INET_DIAG_INFO - 1)))
- info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
- handler->idiag_info_size);
-
- if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
- const size_t len = strlen(icsk->icsk_ca_ops->name);
-
- strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
- icsk->icsk_ca_ops->name);
- }
-
r->idiag_family = sk->sk_family;
r->idiag_state = sk->sk_state;
r->idiag_timer = 0;
r->idiag_retrans = 0;
r->id.idiag_if = sk->sk_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+ r->id.idiag_sport = inet->inet_sport;
+ r->id.idiag_dport = inet->inet_dport;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = inet->inet_rcv_saddr;
+ r->id.idiag_dst[0] = inet->inet_daddr;
- r->id.idiag_sport = inet->sport;
- r->id.idiag_dport = inet->dport;
- r->id.idiag_src[0] = inet->rcv_saddr;
- r->id.idiag_dst[0] = inet->daddr;
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+ * hence this needs to be included regardless of socket family.
+ */
+ if (ext & (1 << (INET_DIAG_TOS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &np->rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &np->daddr);
+ *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+
+ if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
+ goto errout;
}
#endif
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->idiag_inode = sock_i_ino(sk);
+
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+ struct inet_diag_meminfo minfo = {
+ .idiag_rmem = sk_rmem_alloc_get(sk),
+ .idiag_wmem = sk->sk_wmem_queued,
+ .idiag_fmem = sk->sk_forward_alloc,
+ .idiag_tmem = sk_wmem_alloc_get(sk),
+ };
+
+ if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+ goto errout;
+ }
+
+ if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+ if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+ goto errout;
+
+ if (icsk == NULL) {
+ handler->idiag_get_info(sk, r, NULL);
+ goto out;
+ }
+
#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
- if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
@@ -133,120 +182,129 @@ static int inet_csk_diag_fill(struct sock *sk,
}
#undef EXPIRES_IN_MS
- r->idiag_uid = sock_i_uid(sk);
- r->idiag_inode = sock_i_ino(sk);
+ if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ attr = nla_reserve(skb, INET_DIAG_INFO,
+ sizeof(struct tcp_info));
+ if (!attr)
+ goto errout;
- if (minfo) {
- minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
- minfo->idiag_wmem = sk->sk_wmem_queued;
- minfo->idiag_fmem = sk->sk_forward_alloc;
- minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+ info = nla_data(attr);
}
+ if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
+ if (nla_put_string(skb, INET_DIAG_CONG,
+ icsk->icsk_ca_ops->name) < 0)
+ goto errout;
+
handler->idiag_get_info(sk, r, info);
if (sk->sk_state < TCP_TIME_WAIT &&
icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
icsk->icsk_ca_ops->get_info(sk, ext, skb);
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
+out:
+ return nlmsg_end(skb, nlh);
-rtattr_failure:
-nlmsg_failure:
- nlmsg_trim(skb, b);
+errout:
+ nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
+EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
+
+static int inet_csk_diag_fill(struct sock *sk,
+ struct sk_buff *skb, struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ return inet_sk_diag_fill(sk, inet_csk(sk),
+ skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
+}
static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
- struct sk_buff *skb, int ext, u32 pid,
- u32 seq, u16 nlmsg_flags,
+ struct sk_buff *skb, struct inet_diag_req_v2 *req,
+ u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- long tmo;
+ s32 tmo;
struct inet_diag_msg *r;
- const unsigned char *previous_tail = skb_tail_pointer(skb);
- struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
- unlh->nlmsg_type, sizeof(*r));
+ struct nlmsghdr *nlh;
- r = NLMSG_DATA(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
- nlh->nlmsg_flags = nlmsg_flags;
+ r = nlmsg_data(nlh);
+ BUG_ON(tw->tw_state != TCP_TIME_WAIT);
- tmo = tw->tw_ttd - jiffies;
+ tmo = tw->tw_ttd - inet_tw_time_stamp();
if (tmo < 0)
tmo = 0;
r->idiag_family = tw->tw_family;
- r->idiag_state = tw->tw_state;
- r->idiag_timer = 0;
r->idiag_retrans = 0;
+
r->id.idiag_if = tw->tw_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)tw;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1);
+ sock_diag_save_cookie(tw, r->id.idiag_cookie);
+
r->id.idiag_sport = tw->tw_sport;
r->id.idiag_dport = tw->tw_dport;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
r->id.idiag_src[0] = tw->tw_rcv_saddr;
r->id.idiag_dst[0] = tw->tw_daddr;
+
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ);
+ r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
r->idiag_inode = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- const struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
-
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &tw6->tw_v6_rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &tw6->tw_v6_daddr);
+ *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;
}
#endif
- nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
- return skb->len;
-nlmsg_failure:
- nlmsg_trim(skb, previous_tail);
- return -EMSGSIZE;
+
+ return nlmsg_end(skb, nlh);
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+ struct inet_diag_req_v2 *r,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
- skb, ext, pid, seq, nlmsg_flags,
- unlh);
- return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh);
+ return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+ nlmsg_flags, unlh);
+
+ return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
+ nlmsg_flags, unlh);
}
-static int inet_diag_get_exact(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh)
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
{
int err;
struct sock *sk;
- struct inet_diag_req *req = NLMSG_DATA(nlh);
struct sk_buff *rep;
- struct inet_hashinfo *hashinfo;
- const struct inet_diag_handler *handler;
+ struct net *net = sock_net(in_skb->sk);
- handler = inet_diag_table[nlh->nlmsg_type];
- BUG_ON(handler == NULL);
- hashinfo = handler->idiag_hashinfo;
-
- if (req->idiag_family == AF_INET) {
- sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
+ err = -EINVAL;
+ if (req->sdiag_family == AF_INET) {
+ sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
}
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- else if (req->idiag_family == AF_INET6) {
- sk = inet6_lookup(hashinfo,
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6) {
+ sk = inet6_lookup(net, hashinfo,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
@@ -255,47 +313,62 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,
}
#endif
else {
- return -EINVAL;
+ goto out_nosk;
}
+ err = -ENOENT;
if (sk == NULL)
- return -ENOENT;
+ goto out_nosk;
- err = -ESTALE;
- if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
- req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
- ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
- (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
goto out;
- err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) +
- handler->idiag_info_size + 64)),
- GFP_KERNEL);
- if (!rep)
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) +
+ sizeof(struct tcp_info) + 64, GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
goto out;
+ }
- err = sk_diag_fill(sk, rep, req->idiag_ext,
- NETLINK_CB(in_skb).pid,
+ err = sk_diag_fill(sk, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0, nlh);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
- kfree_skb(rep);
+ nlmsg_free(rep);
goto out;
}
- err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
MSG_DONTWAIT);
if (err > 0)
err = 0;
out:
- if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT)
- inet_twsk_put((struct inet_timewait_sock *)sk);
- else
- sock_put(sk);
- }
+ if (sk)
+ sock_gen_put(sk);
+
+out_nosk:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
+
+static int inet_diag_get_exact(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ const struct inet_diag_handler *handler;
+ int err;
+
+ handler = inet_diag_lock_handler(req->sdiag_protocol);
+ if (IS_ERR(handler))
+ err = PTR_ERR(handler);
+ else
+ err = handler->dump_one(in_skb, nlh, req);
+ inet_diag_unlock_handler(handler);
+
return err;
}
@@ -326,9 +399,12 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
}
-static int inet_diag_bc_run(const void *bc, int len,
- const struct inet_diag_entry *entry)
+static int inet_diag_bc_run(const struct nlattr *_bc,
+ const struct inet_diag_entry *entry)
{
+ const void *bc = nla_data(_bc);
+ int len = nla_len(_bc);
+
while (len > 0) {
int yes = 1;
const struct inet_diag_bc_op *op = bc;
@@ -343,7 +419,7 @@ static int inet_diag_bc_run(const void *bc, int len,
yes = entry->sport >= op[1].no;
break;
case INET_DIAG_BC_S_LE:
- yes = entry->dport <= op[1].no;
+ yes = entry->sport <= op[1].no;
break;
case INET_DIAG_BC_D_GE:
yes = entry->dport >= op[1].no;
@@ -367,25 +443,31 @@ static int inet_diag_bc_run(const void *bc, int len,
break;
}
- if (cond->prefix_len == 0)
- break;
-
if (op->code == INET_DIAG_BC_S_COND)
addr = entry->saddr;
else
addr = entry->daddr;
+ if (cond->family != AF_UNSPEC &&
+ cond->family != entry->family) {
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3,
+ cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
if (bitstring_match(addr, cond->addr,
cond->prefix_len))
break;
- if (entry->family == AF_INET6 &&
- cond->family == AF_INET) {
- if (addr[0] == 0 && addr[1] == 0 &&
- addr[2] == htonl(0xffff) &&
- bitstring_match(addr + 3, cond->addr,
- cond->prefix_len))
- break;
- }
yes = 0;
break;
}
@@ -399,9 +481,37 @@ static int inet_diag_bc_run(const void *bc, int len,
bc += op->no;
}
}
- return (len == 0);
+ return len == 0;
}
+int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+{
+ struct inet_diag_entry entry;
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (bc == NULL)
+ return 1;
+
+ entry.family = sk->sk_family;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (entry.family == AF_INET6) {
+
+ entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+ entry.daddr = sk->sk_v6_daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry.saddr = &inet->inet_rcv_saddr;
+ entry.daddr = &inet->inet_daddr;
+ }
+ entry.sport = inet->inet_num;
+ entry.dport = ntohs(inet->inet_dport);
+ entry.userlocks = sk->sk_userlocks;
+
+ return inet_diag_bc_run(bc, &entry);
+}
+EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
+
static int valid_cc(const void *bc, int len, int cc)
{
while (len >= 0) {
@@ -411,7 +521,7 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
if (cc == len)
return 1;
- if (op->yes < 4)
+ if (op->yes < 4 || op->yes & 3)
return 0;
len -= op->yes;
bc += op->yes;
@@ -419,39 +529,96 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
}
+/* Validate an inet_diag_hostcond. */
+static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ int addr_len;
+ struct inet_diag_hostcond *cond;
+
+ /* Check hostcond space. */
+ *min_len += sizeof(struct inet_diag_hostcond);
+ if (len < *min_len)
+ return false;
+ cond = (struct inet_diag_hostcond *)(op + 1);
+
+ /* Check address family and address length. */
+ switch (cond->family) {
+ case AF_UNSPEC:
+ addr_len = 0;
+ break;
+ case AF_INET:
+ addr_len = sizeof(struct in_addr);
+ break;
+ case AF_INET6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+ default:
+ return false;
+ }
+ *min_len += addr_len;
+ if (len < *min_len)
+ return false;
+
+ /* Check prefix length (in bits) vs address length (in bytes). */
+ if (cond->prefix_len > 8 * addr_len)
+ return false;
+
+ return true;
+}
+
+/* Validate a port comparison operator. */
+static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
+ int len, int *min_len)
+{
+ /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
+ *min_len += sizeof(struct inet_diag_bc_op);
+ if (len < *min_len)
+ return false;
+ return true;
+}
+
static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
{
- const unsigned char *bc = bytecode;
+ const void *bc = bytecode;
int len = bytecode_len;
while (len > 0) {
- struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+ const struct inet_diag_bc_op *op = bc;
+ int min_len = sizeof(struct inet_diag_bc_op);
//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
switch (op->code) {
- case INET_DIAG_BC_AUTO:
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND:
+ if (!valid_hostcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
case INET_DIAG_BC_D_LE:
- if (op->yes < 4 || op->yes > len + 4)
- return -EINVAL;
- case INET_DIAG_BC_JMP:
- if (op->no < 4 || op->no > len + 4)
- return -EINVAL;
- if (op->no < len &&
- !valid_cc(bytecode, bytecode_len, len - op->no))
+ if (!valid_port_comparison(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
- if (op->yes < 4 || op->yes > len + 4)
- return -EINVAL;
break;
default:
return -EINVAL;
}
+
+ if (op->code != INET_DIAG_BC_NOP) {
+ if (op->no < min_len || op->no > len + 4 || op->no & 3)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ }
+
+ if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
+ return -EINVAL;
bc += op->yes;
len -= op->yes;
}
@@ -460,58 +627,35 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
{
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- struct inet_diag_entry entry;
- struct rtattr *bc = (struct rtattr *)(r + 1);
- struct inet_sock *inet = inet_sk(sk);
-
- entry.family = sk->sk_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- if (entry.family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- entry.saddr = np->rcv_saddr.s6_addr32;
- entry.daddr = np->daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->rcv_saddr;
- entry.daddr = &inet->daddr;
- }
- entry.sport = inet->num;
- entry.dport = ntohs(inet->dport);
- entry.userlocks = sk->sk_userlocks;
-
- if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
- return 0;
- }
-
- return inet_csk_diag_fill(sk, skb, r->idiag_ext,
- NETLINK_CB(cb->skb).pid,
+ return inet_csk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
-static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
+static int inet_twsk_diag_dump(struct sock *sk,
struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
{
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ struct inet_timewait_sock *tw = inet_twsk(sk);
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ if (bc != NULL) {
struct inet_diag_entry entry;
- struct rtattr *bc = (struct rtattr *)(r + 1);
entry.family = tw->tw_family;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
- struct inet6_timewait_sock *tw6 =
- inet6_twsk((struct sock *)tw);
- entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32;
- entry.daddr = tw6->tw_v6_daddr.s6_addr32;
+ entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
+ entry.daddr = tw->tw_v6_daddr.s6_addr32;
} else
#endif
{
@@ -522,77 +666,109 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
entry.dport = ntohs(tw->tw_dport);
entry.userlocks = 0;
- if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+ if (!inet_diag_bc_run(bc, &entry))
return 0;
}
- return inet_twsk_diag_fill(tw, skb, r->idiag_ext,
- NETLINK_CB(cb->skb).pid,
+ return inet_twsk_diag_fill(tw, skb, r,
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
+/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses
+ * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6.
+ */
+static inline void inet_diag_req_addrs(const struct sock *sk,
+ const struct request_sock *req,
+ struct inet_diag_entry *entry)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ if (req->rsk_ops->family == AF_INET6) {
+ entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
+ entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
+ } else if (req->rsk_ops->family == AF_INET) {
+ ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
+ &entry->saddr_storage);
+ ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
+ &entry->daddr_storage);
+ entry->saddr = entry->saddr_storage.s6_addr32;
+ entry->daddr = entry->daddr_storage.s6_addr32;
+ }
+ } else
+#endif
+ {
+ entry->saddr = &ireq->ir_loc_addr;
+ entry->daddr = &ireq->ir_rmt_addr;
+ }
+}
+
static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct request_sock *req, u32 pid, u32 seq,
+ struct request_sock *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq,
const struct nlmsghdr *unlh)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct inet_sock *inet = inet_sk(sk);
- unsigned char *b = skb_tail_pointer(skb);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
- nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
- nlh->nlmsg_flags = NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+ r = nlmsg_data(nlh);
r->idiag_family = sk->sk_family;
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
- r->idiag_retrans = req->retrans;
+ r->idiag_retrans = req->num_retrans;
r->id.idiag_if = sk->sk_bound_dev_if;
- r->id.idiag_cookie[0] = (u32)(unsigned long)req;
- r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+ sock_diag_save_cookie(req, r->id.idiag_cookie);
tmo = req->expires - jiffies;
if (tmo < 0)
tmo = 0;
- r->id.idiag_sport = inet->sport;
- r->id.idiag_dport = ireq->rmt_port;
- r->id.idiag_src[0] = ireq->loc_addr;
- r->id.idiag_dst[0] = ireq->rmt_addr;
+ r->id.idiag_sport = inet->inet_sport;
+ r->id.idiag_dport = ireq->ir_rmt_port;
+
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = ireq->ir_loc_addr;
+ r->id.idiag_dst[0] = ireq->ir_rmt_addr;
+
r->idiag_expires = jiffies_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
- r->idiag_uid = sock_i_uid(sk);
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = 0;
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
- &inet6_rsk(req)->loc_addr);
- ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
- &inet6_rsk(req)->rmt_addr);
+ struct inet_diag_entry entry;
+ inet_diag_req_addrs(sk, req, &entry);
+ memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr));
+ memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));
}
#endif
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
-
-nlmsg_failure:
- nlmsg_trim(skb, b);
- return -1;
+ return nlmsg_end(skb, nlh);
}
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
{
struct inet_diag_entry entry;
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt;
- struct rtattr *bc = NULL;
struct inet_sock *inet = inet_sk(sk);
int j, s_j;
int reqnum, s_reqnum;
@@ -612,9 +788,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (!lopt || !lopt->qlen)
goto out;
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- bc = (struct rtattr *)(r + 1);
- entry.sport = inet->num;
+ if (bc != NULL) {
+ entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
}
@@ -627,32 +802,21 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (reqnum < s_reqnum)
continue;
- if (r->id.idiag_dport != ireq->rmt_port &&
+ if (r->id.idiag_dport != ireq->ir_rmt_port &&
r->id.idiag_dport)
continue;
if (bc) {
- entry.saddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->loc_addr.s6_addr32 :
-#endif
- &ireq->loc_addr;
- entry.daddr =
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
- (entry.family == AF_INET6) ?
- inet6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
- &ireq->rmt_addr;
- entry.dport = ntohs(ireq->rmt_port);
+ inet_diag_req_addrs(sk, req, &entry);
+ entry.dport = ntohs(ireq->ir_rmt_port);
- if (!inet_diag_bc_run(RTA_DATA(bc),
- RTA_PAYLOAD(bc), &entry))
+ if (!inet_diag_bc_run(bc, &entry))
continue;
}
err = inet_diag_fill_req(skb, sk, req,
- NETLINK_CB(cb->skb).pid,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, cb->nlh);
if (err < 0) {
cb->args[3] = j + 1;
@@ -670,17 +834,12 @@ out:
return err;
}
-static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
+ struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
{
int i, num;
int s_i, s_num;
- struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
- const struct inet_diag_handler *handler;
- struct inet_hashinfo *hashinfo;
-
- handler = inet_diag_table[cb->nlh->nlmsg_type];
- BUG_ON(handler == NULL);
- hashinfo = handler->idiag_hashinfo;
+ struct net *net = sock_net(skb->sk);
s_i = cb->args[1];
s_num = num = cb->args[2];
@@ -689,21 +848,30 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
goto skip_listen_ht;
- inet_listen_lock(hashinfo);
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct sock *sk;
- struct hlist_node *node;
+ struct hlist_nulls_node *node;
+ struct inet_listen_hashbucket *ilb;
num = 0;
- sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+ ilb = &hashinfo->listening_hash[i];
+ spin_lock_bh(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
if (num < s_num) {
num++;
continue;
}
- if (r->id.idiag_sport != inet->sport &&
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
r->id.idiag_sport)
goto next_listen;
@@ -712,8 +880,8 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[3] > 0)
goto syn_recv;
- if (inet_csk_diag_dump(sk, skb, cb) < 0) {
- inet_listen_unlock(hashinfo);
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -721,8 +889,8 @@ syn_recv:
if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
- if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
- inet_listen_unlock(hashinfo);
+ if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
+ spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -731,93 +899,161 @@ next_listen:
cb->args[4] = 0;
++num;
}
+ spin_unlock_bh(&ilb->lock);
s_num = 0;
cb->args[3] = 0;
cb->args[4] = 0;
}
- inet_listen_unlock(hashinfo);
skip_listen_ht:
cb->args[0] = 1;
s_i = num = s_num = 0;
}
if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
- return skb->len;
+ goto out;
- for (i = s_i; i < hashinfo->ehash_size; i++) {
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct sock *sk;
- struct hlist_node *node;
+ struct hlist_nulls_node *node;
+
+ num = 0;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
if (i > s_i)
s_num = 0;
- read_lock_bh(&head->lock);
- num = 0;
- sk_for_each(sk, node, &head->chain) {
- struct inet_sock *inet = inet_sk(sk);
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int res;
+ int state;
+ if (!net_eq(sock_net(sk), net))
+ continue;
if (num < s_num)
goto next_normal;
- if (!(r->idiag_states & (1 << sk->sk_state)))
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_substate : sk->sk_state;
+ if (!(r->idiag_states & (1 << state)))
goto next_normal;
- if (r->id.idiag_sport != inet->sport &&
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
r->id.idiag_sport)
goto next_normal;
- if (r->id.idiag_dport != inet->dport &&
+ if (r->id.idiag_dport != sk->sk_dport &&
r->id.idiag_dport)
goto next_normal;
- if (inet_csk_diag_dump(sk, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
+ if (sk->sk_state == TCP_TIME_WAIT)
+ res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
+ else
+ res = inet_csk_diag_dump(sk, skb, cb, r, bc);
+ if (res < 0) {
+ spin_unlock_bh(lock);
goto done;
}
next_normal:
++num;
}
- if (r->idiag_states & TCPF_TIME_WAIT) {
- struct inet_timewait_sock *tw;
-
- inet_twsk_for_each(tw, node,
- &head->twchain) {
-
- if (num < s_num)
- goto next_dying;
- if (r->id.idiag_sport != tw->tw_sport &&
- r->id.idiag_sport)
- goto next_dying;
- if (r->id.idiag_dport != tw->tw_dport &&
- r->id.idiag_dport)
- goto next_dying;
- if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_dying:
- ++num;
- }
- }
- read_unlock_bh(&head->lock);
+ spin_unlock_bh(lock);
}
done:
cb->args[1] = i;
cb->args[2] = num;
- return skb->len;
+out:
+ ;
}
+EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
-static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
{
+ const struct inet_diag_handler *handler;
+ int err = 0;
+
+ handler = inet_diag_lock_handler(r->sdiag_protocol);
+ if (!IS_ERR(handler))
+ handler->dump(skb, cb, r, bc);
+ else
+ err = PTR_ERR(handler);
+ inet_diag_unlock_handler(handler);
+
+ return err ? : skb->len;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *bc = NULL;
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
+}
+
+static inline int inet_diag_type2proto(int type)
+{
+ switch (type) {
+ case TCPDIAG_GETSOCK:
+ return IPPROTO_TCP;
+ case DCCPDIAG_GETSOCK:
+ return IPPROTO_DCCP;
+ default:
+ return 0;
+ }
+}
+
+static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+ struct inet_diag_req_v2 req;
+ struct nlattr *bc = NULL;
int hdrlen = sizeof(struct inet_diag_req);
+ req.sdiag_family = AF_UNSPEC; /* compatibility */
+ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, &req, bc);
+}
+
+static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh)
+{
+ struct inet_diag_req *rc = nlmsg_data(nlh);
+ struct inet_diag_req_v2 req;
+
+ req.sdiag_family = rc->idiag_family;
+ req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ return inet_diag_get_exact(in_skb, nlh, &req);
+}
+
+static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int hdrlen = sizeof(struct inet_diag_req);
+ struct net *net = sock_net(skb->sk);
+
if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
nlmsg_len(nlh) < hdrlen)
return -EINVAL;
- if (inet_diag_table[nlh->nlmsg_type] == NULL)
- return -ENOENT;
-
if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(nlh, hdrlen)) {
struct nlattr *attr;
@@ -829,40 +1065,71 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
}
-
- return netlink_dump_start(idiagnl, skb, nlh,
- inet_diag_dump, NULL);
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
+ }
}
- return inet_diag_get_exact(skb, nlh);
+ return inet_diag_get_exact_compat(skb, nlh);
}
-static DEFINE_MUTEX(inet_diag_mutex);
-
-static void inet_diag_rcv(struct sk_buff *skb)
+static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
- mutex_lock(&inet_diag_mutex);
- netlink_rcv_skb(skb, &inet_diag_rcv_msg);
- mutex_unlock(&inet_diag_mutex);
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ if (nlmsg_attrlen(h, hdrlen)) {
+ struct nlattr *attr;
+ attr = nlmsg_find_attr(h, hdrlen,
+ INET_DIAG_REQ_BYTECODE);
+ if (attr == NULL ||
+ nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
+ inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
+ return -EINVAL;
+ }
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ }
+ }
+
+ return inet_diag_get_exact(skb, h, nlmsg_data(h));
}
-static DEFINE_SPINLOCK(inet_diag_register_lock);
+static const struct sock_diag_handler inet_diag_handler = {
+ .family = AF_INET,
+ .dump = inet_diag_handler_dump,
+};
+
+static const struct sock_diag_handler inet6_diag_handler = {
+ .family = AF_INET6,
+ .dump = inet_diag_handler_dump,
+};
int inet_diag_register(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
int err = -EINVAL;
- if (type >= INET_DIAG_GETSOCK_MAX)
+ if (type >= IPPROTO_MAX)
goto out;
- spin_lock(&inet_diag_register_lock);
+ mutex_lock(&inet_diag_table_mutex);
err = -EEXIST;
if (inet_diag_table[type] == NULL) {
inet_diag_table[type] = h;
err = 0;
}
- spin_unlock(&inet_diag_register_lock);
+ mutex_unlock(&inet_diag_table_mutex);
out:
return err;
}
@@ -872,20 +1139,18 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
- if (type >= INET_DIAG_GETSOCK_MAX)
+ if (type >= IPPROTO_MAX)
return;
- spin_lock(&inet_diag_register_lock);
+ mutex_lock(&inet_diag_table_mutex);
inet_diag_table[type] = NULL;
- spin_unlock(&inet_diag_register_lock);
-
- synchronize_rcu();
+ mutex_unlock(&inet_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
static int __init inet_diag_init(void)
{
- const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
+ const int inet_diag_table_size = (IPPROTO_MAX *
sizeof(struct inet_diag_handler *));
int err = -ENOMEM;
@@ -893,24 +1158,35 @@ static int __init inet_diag_init(void)
if (!inet_diag_table)
goto out;
- idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0,
- inet_diag_rcv, NULL, THIS_MODULE);
- if (idiagnl == NULL)
- goto out_free_table;
- err = 0;
+ err = sock_diag_register(&inet_diag_handler);
+ if (err)
+ goto out_free_nl;
+
+ err = sock_diag_register(&inet6_diag_handler);
+ if (err)
+ goto out_free_inet;
+
+ sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
out:
return err;
-out_free_table:
+
+out_free_inet:
+ sock_diag_unregister(&inet_diag_handler);
+out_free_nl:
kfree(inet_diag_table);
goto out;
}
static void __exit inet_diag_exit(void)
{
- sock_release(idiagnl->sk_socket);
+ sock_diag_unregister(&inet6_diag_handler);
+ sock_diag_unregister(&inet_diag_handler);
+ sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
kfree(inet_diag_table);
}
module_init(inet_diag_init);
module_exit(inet_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 484cf512858..3b01959bf4b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -19,8 +19,32 @@
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/sock.h>
#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+ /* at least one fragment had CE, and others ECT_0 or ECT_1 */
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+ /* invalid combinations : drop frame */
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
static void inet_frag_secret_rebuild(unsigned long dummy)
{
@@ -28,65 +52,96 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
unsigned long now = jiffies;
int i;
+ /* Per bucket lock NOT needed here, due to write lock protection */
write_lock(&f->lock);
+
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
- struct hlist_node *p, *n;
+ struct hlist_node *n;
- hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) {
+ hb = &f->hash[i];
+ hlist_for_each_entry_safe(q, n, &hb->chain, list) {
unsigned int hval = f->hashfn(q);
if (hval != i) {
+ struct inet_frag_bucket *hb_dest;
+
hlist_del(&q->list);
/* Relink to new hash chain. */
- hlist_add_head(&q->list, &f->hash[hval]);
+ hb_dest = &f->hash[hval];
+ hlist_add_head(&q->list, &hb_dest->chain);
}
}
}
write_unlock(&f->lock);
- mod_timer(&f->secret_timer, now + f->ctl->secret_interval);
+ mod_timer(&f->secret_timer, now + f->secret_interval);
}
void inet_frags_init(struct inet_frags *f)
{
int i;
- for (i = 0; i < INETFRAGS_HASHSZ; i++)
- INIT_HLIST_HEAD(&f->hash[i]);
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb = &f->hash[i];
- INIT_LIST_HEAD(&f->lru_list);
+ spin_lock_init(&hb->chain_lock);
+ INIT_HLIST_HEAD(&hb->chain);
+ }
rwlock_init(&f->lock);
- f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
- (jiffies ^ (jiffies >> 6)));
-
- f->nqueues = 0;
- atomic_set(&f->mem, 0);
-
- init_timer(&f->secret_timer);
- f->secret_timer.function = inet_frag_secret_rebuild;
- f->secret_timer.data = (unsigned long)f;
- f->secret_timer.expires = jiffies + f->ctl->secret_interval;
+ setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
+ (unsigned long)f);
+ f->secret_timer.expires = jiffies + f->secret_interval;
add_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_init);
+void inet_frags_init_net(struct netns_frags *nf)
+{
+ nf->nqueues = 0;
+ init_frag_mem_limit(nf);
+ INIT_LIST_HEAD(&nf->lru_list);
+ spin_lock_init(&nf->lru_lock);
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
void inet_frags_fini(struct inet_frags *f)
{
del_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_fini);
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+{
+ nf->low_thresh = 0;
+
+ local_bh_disable();
+ inet_frag_evictor(nf, f, true);
+ local_bh_enable();
+
+ percpu_counter_destroy(&nf->mem);
+}
+EXPORT_SYMBOL(inet_frags_exit_net);
+
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
- write_lock(&f->lock);
+ struct inet_frag_bucket *hb;
+ unsigned int hash;
+
+ read_lock(&f->lock);
+ hash = f->hashfn(fq);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
hlist_del(&fq->list);
- list_del(&fq->lru_list);
- f->nqueues--;
- write_unlock(&f->lock);
+ spin_unlock(&hb->chain_lock);
+
+ read_unlock(&f->lock);
+ inet_frag_lru_del(fq);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -94,22 +149,17 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
if (del_timer(&fq->timer))
atomic_dec(&fq->refcnt);
- if (!(fq->last_in & COMPLETE)) {
+ if (!(fq->last_in & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
atomic_dec(&fq->refcnt);
- fq->last_in |= COMPLETE;
+ fq->last_in |= INET_FRAG_COMPLETE;
}
}
-
EXPORT_SYMBOL(inet_frag_kill);
-static inline void frag_kfree_skb(struct inet_frags *f, struct sk_buff *skb,
- int *work)
+static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
+ struct sk_buff *skb)
{
- if (work)
- *work -= skb->truesize;
-
- atomic_sub(skb->truesize, &f->mem);
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
@@ -119,48 +169,63 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
int *work)
{
struct sk_buff *fp;
+ struct netns_frags *nf;
+ unsigned int sum, sum_truesize = 0;
- BUG_TRAP(q->last_in & COMPLETE);
- BUG_TRAP(del_timer(&q->timer) == 0);
+ WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
+ WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
fp = q->fragments;
+ nf = q->net;
while (fp) {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(f, fp, work);
+ sum_truesize += fp->truesize;
+ frag_kfree_skb(nf, f, fp);
fp = xp;
}
-
+ sum = sum_truesize + f->qsize;
if (work)
- *work -= f->qsize;
- atomic_sub(f->qsize, &f->mem);
+ *work -= sum;
+ sub_frag_mem_limit(q, sum);
- f->destructor(q);
+ if (f->destructor)
+ f->destructor(q);
+ kfree(q);
}
EXPORT_SYMBOL(inet_frag_destroy);
-int inet_frag_evictor(struct inet_frags *f)
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
{
struct inet_frag_queue *q;
int work, evicted = 0;
- work = atomic_read(&f->mem) - f->ctl->low_thresh;
- while (work > 0) {
- read_lock(&f->lock);
- if (list_empty(&f->lru_list)) {
- read_unlock(&f->lock);
+ if (!force) {
+ if (frag_mem_limit(nf) <= nf->high_thresh)
+ return 0;
+ }
+
+ work = frag_mem_limit(nf) - nf->low_thresh;
+ while (work > 0 || force) {
+ spin_lock(&nf->lru_lock);
+
+ if (list_empty(&nf->lru_list)) {
+ spin_unlock(&nf->lru_lock);
break;
}
- q = list_first_entry(&f->lru_list,
+ q = list_first_entry(&nf->lru_list,
struct inet_frag_queue, lru_list);
atomic_inc(&q->refcnt);
- read_unlock(&f->lock);
+ /* Remove q from list to avoid several CPUs grabbing it */
+ list_del_init(&q->lru_list);
+
+ spin_unlock(&nf->lru_lock);
spin_lock(&q->lock);
- if (!(q->last_in & COMPLETE))
+ if (!(q->last_in & INET_FRAG_COMPLETE))
inet_frag_kill(q, f);
spin_unlock(&q->lock);
@@ -172,3 +237,125 @@ int inet_frag_evictor(struct inet_frags *f)
return evicted;
}
EXPORT_SYMBOL(inet_frag_evictor);
+
+static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
+ struct inet_frag_queue *qp_in, struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_bucket *hb;
+ struct inet_frag_queue *qp;
+ unsigned int hash;
+
+ read_lock(&f->lock); /* Protects against hash rebuild */
+ /*
+ * While we stayed w/o the lock other CPU could update
+ * the rnd seed, so we need to re-calculate the hash
+ * chain. Fortunatelly the qp_in can be used to get one.
+ */
+ hash = f->hashfn(qp_in);
+ hb = &f->hash[hash];
+ spin_lock(&hb->chain_lock);
+
+#ifdef CONFIG_SMP
+ /* With SMP race we have to recheck hash table, because
+ * such entry could be created on other cpu, while we
+ * released the hash bucket lock.
+ */
+ hlist_for_each_entry(qp, &hb->chain, list) {
+ if (qp->net == nf && f->match(qp, arg)) {
+ atomic_inc(&qp->refcnt);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
+ qp_in->last_in |= INET_FRAG_COMPLETE;
+ inet_frag_put(qp_in, f);
+ return qp;
+ }
+ }
+#endif
+ qp = qp_in;
+ if (!mod_timer(&qp->timer, jiffies + nf->timeout))
+ atomic_inc(&qp->refcnt);
+
+ atomic_inc(&qp->refcnt);
+ hlist_add_head(&qp->list, &hb->chain);
+ inet_frag_lru_add(nf, qp);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
+
+ return qp;
+}
+
+static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+ struct inet_frags *f, void *arg)
+{
+ struct inet_frag_queue *q;
+
+ q = kzalloc(f->qsize, GFP_ATOMIC);
+ if (q == NULL)
+ return NULL;
+
+ q->net = nf;
+ f->constructor(q, arg);
+ add_frag_mem_limit(q, f->qsize);
+
+ setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
+ spin_lock_init(&q->lock);
+ atomic_set(&q->refcnt, 1);
+ INIT_LIST_HEAD(&q->lru_list);
+
+ return q;
+}
+
+static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+ struct inet_frags *f, void *arg)
+{
+ struct inet_frag_queue *q;
+
+ q = inet_frag_alloc(nf, f, arg);
+ if (q == NULL)
+ return NULL;
+
+ return inet_frag_intern(nf, q, f, arg);
+}
+
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+ struct inet_frags *f, void *key, unsigned int hash)
+ __releases(&f->lock)
+{
+ struct inet_frag_bucket *hb;
+ struct inet_frag_queue *q;
+ int depth = 0;
+
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
+ hlist_for_each_entry(q, &hb->chain, list) {
+ if (q->net == nf && f->match(q, key)) {
+ atomic_inc(&q->refcnt);
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
+ return q;
+ }
+ depth++;
+ }
+ spin_unlock(&hb->chain_lock);
+ read_unlock(&f->lock);
+
+ if (depth <= INETFRAGS_MAXDEPTH)
+ return inet_frag_create(nf, f, key);
+ else
+ return ERR_PTR(-ENOBUFS);
+}
+EXPORT_SYMBOL(inet_frag_find);
+
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+ const char *prefix)
+{
+ static const char msg[] = "inet_frag_find: Fragment hash bucket"
+ " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+ ". Dropping fragment.\n";
+
+ if (PTR_ERR(q) == -ENOBUFS)
+ LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
+}
+EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fac6398e436..43116e8c8e1 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -21,21 +21,51 @@
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
+#include <net/secure_seq.h>
#include <net/ip.h>
+static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 inet_ehash_secret __read_mostly;
+
+ net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
+}
+
+
+static unsigned int inet_sk_ehashfn(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const __be32 laddr = inet->inet_rcv_saddr;
+ const __u16 lport = inet->inet_num;
+ const __be32 faddr = inet->inet_daddr;
+ const __be16 fport = inet->inet_dport;
+ struct net *net = sock_net(sk);
+
+ return inet_ehashfn(net, laddr, lport, faddr, fport);
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
*/
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+ struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb != NULL) {
+ write_pnet(&tb->ib_net, hold_net(net));
tb->port = snum;
tb->fastreuse = 0;
+ tb->fastreuseport = 0;
+ tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
@@ -49,6 +79,7 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
{
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
+ release_net(ib_net(tb));
kmem_cache_free(cachep, tb);
}
}
@@ -56,67 +87,109 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum)
{
- inet_sk(sk)->num = snum;
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+ atomic_inc(&hashinfo->bsockets);
+
+ inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
+ tb->num_owners++;
inet_csk(sk)->icsk_bind_hash = tb;
}
/*
* Get rid of any references to a local port held by the given sock.
*/
-static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+static void __inet_put_port(struct sock *sk)
{
- const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
+ hashinfo->bhash_size);
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind_bucket *tb;
+ atomic_dec(&hashinfo->bsockets);
+
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
+ tb->num_owners--;
inet_csk(sk)->icsk_bind_hash = NULL;
- inet_sk(sk)->num = 0;
+ inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
spin_unlock(&head->lock);
}
-void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+void inet_put_port(struct sock *sk)
{
local_bh_disable();
- __inet_put_port(hashinfo, sk);
+ __inet_put_port(sk);
local_bh_enable();
}
-
EXPORT_SYMBOL(inet_put_port);
-/*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+int __inet_inherit_port(struct sock *sk, struct sock *child)
{
- write_lock(&hashinfo->lhash_lock);
-
- if (atomic_read(&hashinfo->lhash_users)) {
- DEFINE_WAIT(wait);
+ struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
+ unsigned short port = inet_sk(child)->inet_num;
+ const int bhash = inet_bhashfn(sock_net(sk), port,
+ table->bhash_size);
+ struct inet_bind_hashbucket *head = &table->bhash[bhash];
+ struct inet_bind_bucket *tb;
- for (;;) {
- prepare_to_wait_exclusive(&hashinfo->lhash_wait,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&hashinfo->lhash_users))
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ if (tb->port != port) {
+ /* NOTE: using tproxy and redirecting skbs to a proxy
+ * on a different listener port breaks the assumption
+ * that the listener socket's icsk_bind_hash is the same
+ * as that of the child socket. We have to look up or
+ * create a new bind bucket for the child here. */
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), sock_net(sk)) &&
+ tb->port == port)
break;
- write_unlock_bh(&hashinfo->lhash_lock);
- schedule();
- write_lock_bh(&hashinfo->lhash_lock);
}
-
- finish_wait(&hashinfo->lhash_wait, &wait);
+ if (!tb) {
+ tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+ sock_net(sk), head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ return -ENOMEM;
+ }
+ }
}
+ inet_bind_hash(child, tb, port);
+ spin_unlock(&head->lock);
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(__inet_inherit_port);
-EXPORT_SYMBOL(inet_listen_wlock);
+static inline int compute_score(struct sock *sk, struct net *net,
+ const unsigned short hnum, const __be32 daddr,
+ const int dif)
+{
+ int score = -1;
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
+ !ipv6_only_sock(sk)) {
+ __be32 rcv_saddr = inet->inet_rcv_saddr;
+ score = sk->sk_family == PF_INET ? 2 : 1;
+ if (rcv_saddr) {
+ if (rcv_saddr != daddr)
+ return -1;
+ score += 4;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
+ }
+ }
+ return score;
+}
/*
* Don't inline this cruft. Here are some nice properties to exploit here. The
@@ -124,71 +197,125 @@ EXPORT_SYMBOL(inet_listen_wlock);
* remote address for the connection. So always assume those are both
* wildcarded during the search since they can never be otherwise.
*/
-static struct sock *inet_lookup_listener_slow(const struct hlist_head *head,
- const __be32 daddr,
- const unsigned short hnum,
- const int dif)
-{
- struct sock *result = NULL, *sk;
- const struct hlist_node *node;
- int hiscore = -1;
- sk_for_each(sk, node, head) {
- const struct inet_sock *inet = inet_sk(sk);
- if (inet->num == hnum && !ipv6_only_sock(sk)) {
- const __be32 rcv_saddr = inet->rcv_saddr;
- int score = sk->sk_family == PF_INET ? 1 : 0;
-
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- continue;
- score += 2;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score += 2;
- }
- if (score == 5)
- return sk;
- if (score > hiscore) {
- hiscore = score;
- result = sk;
+struct sock *__inet_lookup_listener(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned int hash = inet_lhashfn(net, hnum);
+ struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
+
+ rcu_read_lock();
+begin:
+ result = NULL;
+ hiscore = 0;
+ sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+ score = compute_score(sk, net, hnum, daddr, dif);
+ if (score > hiscore) {
+ result = sk;
+ hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
}
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (((u64)phash * matches) >> 32 == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
}
}
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+ goto begin;
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, hnum, daddr,
+ dif) < hiscore)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
return result;
}
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
-/* Optimize the common listener case. */
-struct sock *__inet_lookup_listener(struct inet_hashinfo *hashinfo,
- const __be32 daddr, const unsigned short hnum,
- const int dif)
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
{
- struct sock *sk = NULL;
- const struct hlist_head *head;
-
- read_lock(&hashinfo->lhash_lock);
- head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
- if (!hlist_empty(head)) {
- const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
- if (inet->num == hnum && !sk->sk_node.next &&
- (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
- (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
- !sk->sk_bound_dev_if)
- goto sherry_cache;
- sk = inet_lookup_listener_slow(head, daddr, hnum, dif);
- }
- if (sk) {
-sherry_cache:
- sock_hold(sk);
+ if (!atomic_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_free(inet_twsk(sk));
+ else
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
+struct sock *__inet_lookup_established(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif)
+{
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ struct sock *sk;
+ const struct hlist_nulls_node *node;
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ unsigned int slot = hash & hashinfo->ehash_mask;
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+ rcu_read_lock();
+begin:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
+ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+ goto out;
+ if (unlikely(!INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
+ sock_gen_put(sk);
+ goto begin;
+ }
+ goto found;
+ }
}
- read_unlock(&hashinfo->lhash_lock);
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+out:
+ sk = NULL;
+found:
+ rcu_read_unlock();
return sk;
}
-EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+EXPORT_SYMBOL_GPL(__inet_lookup_established);
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
@@ -197,126 +324,214 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
- __be32 daddr = inet->rcv_saddr;
- __be32 saddr = inet->daddr;
+ __be32 daddr = inet->inet_rcv_saddr;
+ __be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
- INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
+ struct net *net = sock_net(sk);
+ unsigned int hash = inet_ehashfn(net, daddr, lport,
+ saddr, inet->inet_dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
- const struct hlist_node *node;
- struct inet_timewait_sock *tw;
-
- prefetch(head->chain.first);
- write_lock(&head->lock);
-
- /* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &head->twchain) {
- tw = inet_twsk(sk2);
-
- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
- if (twsk_unique(sk, sk2, twp))
- goto unique;
- else
- goto not_unique;
- }
- }
- tw = NULL;
-
- /* And established part... */
- sk_for_each(sk2, node, &head->chain) {
- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+ const struct hlist_nulls_node *node;
+ struct inet_timewait_sock *tw = NULL;
+ int twrefcnt = 0;
+
+ spin_lock(lock);
+
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash)
+ continue;
+
+ if (likely(INET_MATCH(sk2, net, acookie,
+ saddr, daddr, ports, dif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (twsk_unique(sk, sk2, twp))
+ break;
+ }
goto not_unique;
+ }
}
-unique:
/* Must record num and sport now. Otherwise we will see
- * in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
+ * in hash table socket with a funny identity.
+ */
+ inet->inet_num = lport;
+ inet->inet_sport = htons(lport);
sk->sk_hash = hash;
- BUG_TRAP(sk_unhashed(sk));
- __sk_add_node(sk, &head->chain);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ WARN_ON(!sk_unhashed(sk));
+ __sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ twrefcnt = inet_twsk_unhash(tw);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
+ spin_unlock(lock);
+ if (twrefcnt)
+ inet_twsk_put(tw);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
inet_twsk_put(tw);
}
-
return 0;
not_unique:
- write_unlock(&head->lock);
+ spin_unlock(lock);
return -EADDRNOTAVAIL;
}
static inline u32 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
- inet->dport);
+ return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
+ inet->inet_daddr,
+ inet->inet_dport);
}
-/*
- * Bind a port for a connect operation and hash it.
- */
-int inet_hash_connect(struct inet_timewait_death_row *death_row,
- struct sock *sk)
+int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct hlist_nulls_head *list;
+ spinlock_t *lock;
+ struct inet_ehash_bucket *head;
+ int twrefcnt = 0;
+
+ WARN_ON(!sk_unhashed(sk));
+
+ sk->sk_hash = inet_sk_ehashfn(sk);
+ head = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ list = &head->chain;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock(lock);
+ __sk_nulls_add_node_rcu(sk, list);
+ if (tw) {
+ WARN_ON(sk->sk_hash != tw->tw_hash);
+ twrefcnt = inet_twsk_unhash(tw);
+ }
+ spin_unlock(lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ return twrefcnt;
+}
+EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+
+static void __inet_hash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_listen_hashbucket *ilb;
+
+ if (sk->sk_state != TCP_LISTEN) {
+ __inet_hash_nolisten(sk, NULL);
+ return;
+ }
+
+ WARN_ON(!sk_unhashed(sk));
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+
+ spin_lock(&ilb->lock);
+ __sk_nulls_add_node_rcu(sk, &ilb->head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ spin_unlock(&ilb->lock);
+}
+
+void inet_hash(struct sock *sk)
+{
+ if (sk->sk_state != TCP_CLOSE) {
+ local_bh_disable();
+ __inet_hash(sk);
+ local_bh_enable();
+ }
+}
+EXPORT_SYMBOL_GPL(inet_hash);
+
+void inet_unhash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ spinlock_t *lock;
+ int done;
+
+ if (sk_unhashed(sk))
+ return;
+
+ if (sk->sk_state == TCP_LISTEN)
+ lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+ else
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock_bh(lock);
+ done = __sk_nulls_del_node_init_rcu(sk);
+ if (done)
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock_bh(lock);
+}
+EXPORT_SYMBOL_GPL(inet_unhash);
+
+int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk, u32 port_offset,
+ int (*check_established)(struct inet_timewait_death_row *,
+ struct sock *, __u16, struct inet_timewait_sock **),
+ int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
- const unsigned short snum = inet_sk(sk)->num;
+ const unsigned short snum = inet_sk(sk)->inet_num;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int ret;
+ struct net *net = sock_net(sk);
+ int twrefcnt = 1;
if (!snum) {
int i, remaining, low, high, port;
static u32 hint;
- u32 offset = hint + inet_sk_port_offset(sk);
- struct hlist_node *node;
+ u32 offset = hint + port_offset;
struct inet_timewait_sock *tw = NULL;
- inet_get_local_port_range(&low, &high);
- remaining = high - low;
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
- inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (tb->port == port) {
- BUG_TRAP(!hlist_empty(&tb->owners));
- if (tb->fastreuse >= 0)
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), net) &&
+ tb->port == port) {
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
- if (!__inet_check_established(death_row,
- sk, port,
- &tw))
+ WARN_ON(hlist_empty(&tb->owners));
+ if (!check_established(death_row, sk,
+ port, &tw))
goto ok;
goto next_port;
}
}
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
}
tb->fastreuse = -1;
+ tb->fastreuseport = -1;
goto ok;
next_port:
@@ -332,35 +547,62 @@ ok:
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- __inet_hash(hinfo, sk, 0);
+ inet_sk(sk)->inet_sport = htons(port);
+ twrefcnt += hash(sk, tw);
}
+ if (tw)
+ twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock);
if (tw) {
inet_twsk_deschedule(tw, death_row);
- inet_twsk_put(tw);
+ while (twrefcnt) {
+ twrefcnt--;
+ inet_twsk_put(tw);
+ }
}
ret = 0;
goto out;
}
- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash(hinfo, sk, 0);
+ hash(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
spin_unlock(&head->lock);
/* No definite answer... Walk to established hash table */
- ret = __inet_check_established(death_row, sk, snum, NULL);
+ ret = check_established(death_row, sk, snum, NULL);
out:
local_bh_enable();
return ret;
}
}
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
+ __inet_check_established, __inet_hash_nolisten);
+}
EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+ int i;
+
+ atomic_set(&h->bsockets, 0);
+ for (i = 0; i < INET_LHTABLE_SIZE; i++) {
+ spin_lock_init(&h->listening_hash[i].lock);
+ INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+ i + LISTENING_NULLS_BASE);
+ }
+}
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index ac3b1d3dba2..f17ea49b28f 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/if_vlan.h>
#include <linux/inet_lro.h>
+#include <net/checksum.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
@@ -51,8 +52,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
* Basic tcp checks whether packet is suitable for LRO
*/
-static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
- int len, struct net_lro_desc *lro_desc)
+static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
+ int len, const struct net_lro_desc *lro_desc)
{
/* check ip header: don't aggregate padded frames */
if (ntohs(iph->tot_len) != len)
@@ -64,15 +65,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
if (iph->ihl != IPH_LEN_WO_OPTIONS)
return -1;
- if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack
- || tcph->rst || tcph->syn || tcph->fin)
+ if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
+ tcph->rst || tcph->syn || tcph->fin)
return -1;
if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
return -1;
- if (tcph->doff != TCPH_LEN_WO_OPTIONS
- && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+ if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
+ tcph->doff != TCPH_LEN_W_TIMESTAMP)
return -1;
/* check tcp options (only timestamp allowed) */
@@ -114,13 +115,11 @@ static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
*(p+2) = lro_desc->tcp_rcv_tsecr;
}
+ csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
iph->tot_len = htons(lro_desc->ip_tot_len);
- iph->check = 0;
- iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
-
tcph->check = 0;
- tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0);
+ tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
lro_desc->ip_tot_len -
@@ -135,7 +134,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
__wsum tcp_ps_hdr_csum;
tcp_csum = ~csum_unfold(tcph->check);
- tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum);
+ tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
len + TCP_HDR_LEN(tcph),
@@ -146,8 +145,7 @@ static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
}
static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
- struct iphdr *iph, struct tcphdr *tcph,
- u16 vlan_tag, struct vlan_group *vgrp)
+ struct iphdr *iph, struct tcphdr *tcph)
{
int nr_frags;
__be32 *ptr;
@@ -173,8 +171,6 @@ static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
}
lro_desc->mss = tcp_data_len;
- lro_desc->vgrp = vgrp;
- lro_desc->vlan_tag = vlan_tag;
lro_desc->active = 1;
lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
@@ -234,38 +230,15 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
lro_desc->last_skb = skb;
}
-static void lro_add_frags(struct net_lro_desc *lro_desc,
- int len, int hlen, int truesize,
- struct skb_frag_struct *skb_frags,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct sk_buff *skb = lro_desc->parent;
- int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
- lro_add_common(lro_desc, iph, tcph, tcp_data_len);
-
- skb->truesize += truesize;
-
- skb_frags[0].page_offset += hlen;
- skb_frags[0].size -= hlen;
-
- while (tcp_data_len > 0) {
- *(lro_desc->next_frag) = *skb_frags;
- tcp_data_len -= skb_frags->size;
- lro_desc->next_frag++;
- skb_frags++;
- skb_shinfo(skb)->nr_frags++;
- }
-}
static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
struct iphdr *iph,
struct tcphdr *tcph)
{
- if ((lro_desc->iph->saddr != iph->saddr)
- || (lro_desc->iph->daddr != iph->daddr)
- || (lro_desc->tcph->source != tcph->source)
- || (lro_desc->tcph->dest != tcph->dest))
+ if ((lro_desc->iph->saddr != iph->saddr) ||
+ (lro_desc->iph->daddr != iph->daddr) ||
+ (lro_desc->tcph->source != tcph->source) ||
+ (lro_desc->tcph->dest != tcph->dest))
return -1;
return 0;
}
@@ -309,29 +282,17 @@ static void lro_flush(struct net_lro_mgr *lro_mgr,
skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
- if (lro_desc->vgrp) {
- if (test_bit(LRO_F_NAPI, &lro_mgr->features))
- vlan_hwaccel_receive_skb(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
- else
- vlan_hwaccel_rx(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
-
- } else {
- if (test_bit(LRO_F_NAPI, &lro_mgr->features))
- netif_receive_skb(lro_desc->parent);
- else
- netif_rx(lro_desc->parent);
- }
+ if (lro_mgr->features & LRO_F_NAPI)
+ netif_receive_skb(lro_desc->parent);
+ else
+ netif_rx(lro_desc->parent);
LRO_INC_STATS(lro_mgr, flushed);
lro_clear_desc(lro_desc);
}
static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
- struct vlan_group *vgrp, u16 vlan_tag, void *priv)
+ void *priv)
{
struct net_lro_desc *lro_desc;
struct iphdr *iph;
@@ -339,9 +300,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
u64 flags;
int vlan_hdr_len = 0;
- if (!lro_mgr->get_skb_header
- || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
- &flags, priv))
+ if (!lro_mgr->get_skb_header ||
+ lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
+ &flags, priv))
goto out;
if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
@@ -351,8 +312,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
if (!lro_desc)
goto out;
- if ((skb->protocol == htons(ETH_P_8021Q))
- && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features))
+ if ((skb->protocol == htons(ETH_P_8021Q)) &&
+ !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
vlan_hdr_len = VLAN_HLEN;
if (!lro_desc->active) { /* start new lro session */
@@ -360,7 +321,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
goto out;
skb->ip_summed = lro_mgr->ip_summed_aggr;
- lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
+ lro_init_desc(lro_desc, skb, iph, tcph);
LRO_INC_STATS(lro_mgr, aggregated);
return 0;
}
@@ -383,139 +344,16 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
out2: /* send aggregated SKBs to stack */
lro_flush(lro_mgr, lro_desc);
-out: /* Original SKB has to be posted to stack */
- skb->ip_summed = lro_mgr->ip_summed;
- return 1;
-}
-
-
-static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- void *mac_hdr,
- int hlen, __wsum sum,
- u32 ip_summed)
-{
- struct sk_buff *skb;
- struct skb_frag_struct *skb_frags;
- int data_len = len;
- int hdr_len = min(len, hlen);
-
- skb = netdev_alloc_skb(lro_mgr->dev, hlen);
- if (!skb)
- return NULL;
-
- skb->len = len;
- skb->data_len = len - hdr_len;
- skb->truesize += true_size;
- skb->tail += hdr_len;
-
- memcpy(skb->data, mac_hdr, hdr_len);
-
- skb_frags = skb_shinfo(skb)->frags;
- while (data_len > 0) {
- *skb_frags = *frags;
- data_len -= frags->size;
- skb_frags++;
- frags++;
- skb_shinfo(skb)->nr_frags++;
- }
-
- skb_shinfo(skb)->frags[0].page_offset += hdr_len;
- skb_shinfo(skb)->frags[0].size -= hdr_len;
-
- skb->ip_summed = ip_summed;
- skb->csum = sum;
- skb->protocol = eth_type_trans(skb, lro_mgr->dev);
- return skb;
-}
-
-static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- struct vlan_group *vgrp,
- u16 vlan_tag, void *priv, __wsum sum)
-{
- struct net_lro_desc *lro_desc;
- struct iphdr *iph;
- struct tcphdr *tcph;
- struct sk_buff *skb;
- u64 flags;
- void *mac_hdr;
- int mac_hdr_len;
- int hdr_len = LRO_MAX_PG_HLEN;
- int vlan_hdr_len = 0;
-
- if (!lro_mgr->get_frag_header
- || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
- (void *)&tcph, &flags, priv)) {
- mac_hdr = page_address(frags->page) + frags->page_offset;
- goto out1;
- }
-
- if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
- goto out1;
-
- hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
- mac_hdr_len = (int)((void *)(iph) - mac_hdr);
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (!lro_desc)
- goto out1;
-
- if (!lro_desc->active) { /* start new lro session */
- if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
- goto out1;
-
- skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
- hdr_len, 0, lro_mgr->ip_summed_aggr);
- if (!skb)
- goto out;
-
- if ((skb->protocol == htons(ETH_P_8021Q))
- && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features))
- vlan_hdr_len = VLAN_HLEN;
-
- iph = (void *)(skb->data + vlan_hdr_len);
- tcph = (void *)((u8 *)skb->data + vlan_hdr_len
- + IP_HDR_LEN(iph));
-
- lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
- LRO_INC_STATS(lro_mgr, aggregated);
- return NULL;
- }
-
- if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
- goto out2;
-
- if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
- goto out2;
-
- lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
-
- if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
- lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
- lro_flush(lro_mgr, lro_desc);
-
- return NULL;
-
-out2: /* send aggregated packets to the stack */
- lro_flush(lro_mgr, lro_desc);
-
-out1: /* Original packet has to be posted to the stack */
- skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
- hdr_len, sum, lro_mgr->ip_summed);
out:
- return skb;
+ return 1;
}
void lro_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
void *priv)
{
- if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
- if (test_bit(LRO_F_NAPI, &lro_mgr->features))
+ if (__lro_proc_skb(lro_mgr, skb, priv)) {
+ if (lro_mgr->features & LRO_F_NAPI)
netif_receive_skb(skb);
else
netif_rx(skb);
@@ -523,59 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
}
EXPORT_SYMBOL(lro_receive_skb);
-void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
- struct sk_buff *skb,
- struct vlan_group *vgrp,
- u16 vlan_tag,
- void *priv)
-{
- if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
- if (test_bit(LRO_F_NAPI, &lro_mgr->features))
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
- else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
- }
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
-
-void lro_receive_frags(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size, void *priv, __wsum sum)
-{
- struct sk_buff *skb;
-
- skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
- priv, sum);
- if (!skb)
- return;
-
- if (test_bit(LRO_F_NAPI, &lro_mgr->features))
- netif_receive_skb(skb);
- else
- netif_rx(skb);
-}
-EXPORT_SYMBOL(lro_receive_frags);
-
-void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
- struct skb_frag_struct *frags,
- int len, int true_size,
- struct vlan_group *vgrp,
- u16 vlan_tag, void *priv, __wsum sum)
-{
- struct sk_buff *skb;
-
- skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
- vlan_tag, priv, sum);
- if (!skb)
- return;
-
- if (test_bit(LRO_F_NAPI, &lro_mgr->features))
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
- else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
-}
-EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
-
void lro_flush_all(struct net_lro_mgr *lro_mgr)
{
int i;
@@ -587,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr)
}
}
EXPORT_SYMBOL(lro_flush_all);
-
-void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct net_lro_desc *lro_desc;
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (lro_desc->active)
- lro_flush(lro_mgr, lro_desc);
-}
-EXPORT_SYMBOL(lro_flush_pkt);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e28f30..6d592f8555f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,43 +9,117 @@
*/
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/slab.h>
+#include <linux/module.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
+
+/**
+ * inet_twsk_unhash - unhash a timewait socket from established hash
+ * @tw: timewait socket
+ *
+ * unhash a timewait socket from established hash, if hashed.
+ * ehash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_unhash(struct inet_timewait_sock *tw)
+{
+ if (hlist_nulls_unhashed(&tw->tw_node))
+ return 0;
+
+ hlist_nulls_del_rcu(&tw->tw_node);
+ sk_nulls_node_init(&tw->tw_node);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
+/**
+ * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ * @tw: timewait socket
+ * @hashinfo: hashinfo pointer
+ *
+ * unhash a timewait socket from bind hash, if hashed.
+ * bind hash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+ struct inet_hashinfo *hashinfo)
+{
+ struct inet_bind_bucket *tb = tw->tw_tb;
+
+ if (!tb)
+ return 0;
+
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
/* Must be called with locally disabled BHs. */
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
struct inet_bind_hashbucket *bhead;
- struct inet_bind_bucket *tb;
+ int refcnt;
/* Unlink from established hashes. */
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
- write_lock(&ehead->lock);
- if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
- return;
- }
- __hlist_del(&tw->tw_node);
- sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+ spin_lock(lock);
+ refcnt = inet_twsk_unhash(tw);
+ spin_unlock(lock);
/* Disassociate with bind bucket. */
- bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
+ hashinfo->bhash_size)];
+
spin_lock(&bhead->lock);
- tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ refcnt += inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
+
+ BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
+ atomic_sub(refcnt, &tw->tw_refcnt);
+}
+
+void inet_twsk_free(struct inet_timewait_sock *tw)
+{
+ struct module *owner = tw->tw_prot->owner;
+ twsk_destructor((struct sock *)tw);
#ifdef SOCK_REFCNT_DEBUG
- if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
- tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
- }
+ pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
#endif
- inet_twsk_put(tw);
+ release_net(twsk_net(tw));
+ kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+ module_put(owner);
+}
+
+void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+ if (atomic_dec_and_test(&tw->tw_refcnt))
+ inet_twsk_free(tw);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_put);
+
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+ struct hlist_nulls_head *list)
+{
+ hlist_nulls_add_head_rcu(&tw->tw_node, list);
+}
+
+static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+ struct hlist_head *list)
+{
+ hlist_add_head(&tw->tw_bind_node, list);
}
/*
@@ -59,31 +133,40 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
struct inet_bind_hashbucket *bhead;
/* Step 1: Put TW into bind hash. Original socket stays there too.
Note, that any socket with inet->num != 0 MUST be bound in
binding cache, even if it is closed.
*/
- bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
+ hashinfo->bhash_size)];
spin_lock(&bhead->lock);
tw->tw_tb = icsk->icsk_bind_hash;
- BUG_TRAP(icsk->icsk_bind_hash);
+ WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
spin_unlock(&bhead->lock);
- write_lock(&ehead->lock);
+ spin_lock(lock);
- /* Step 2: Remove SK from established hash. */
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
+ /*
+ * Step 2: Hash TW into tcp ehash chain.
+ * Notes :
+ * - tw_refcnt is set to 3 because :
+ * - We have one reference from bhash chain.
+ * - We have one reference from ehash chain.
+ * We can use atomic_set() because prior spin_lock()/spin_unlock()
+ * committed into memory all tw fields.
+ */
+ atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+ inet_twsk_add_node_rcu(tw, &ehead->chain);
- /* Step 3: Hash TW into TIMEWAIT chain. */
- inet_twsk_add_node(tw, &ehead->twchain);
- atomic_inc(&tw->tw_refcnt);
+ /* Step 3: Remove SK from hash chain */
+ if (__sk_nulls_del_node_init_rcu(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- write_unlock(&ehead->lock);
+ spin_unlock(lock);
}
-
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
@@ -94,28 +177,37 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
if (tw != NULL) {
const struct inet_sock *inet = inet_sk(sk);
+ kmemcheck_annotate_bitfield(tw, flags);
+
/* Give us an identity. */
- tw->tw_daddr = inet->daddr;
- tw->tw_rcv_saddr = inet->rcv_saddr;
+ tw->tw_daddr = inet->inet_daddr;
+ tw->tw_rcv_saddr = inet->inet_rcv_saddr;
tw->tw_bound_dev_if = sk->sk_bound_dev_if;
- tw->tw_num = inet->num;
+ tw->tw_tos = inet->tos;
+ tw->tw_num = inet->inet_num;
tw->tw_state = TCP_TIME_WAIT;
tw->tw_substate = state;
- tw->tw_sport = inet->sport;
- tw->tw_dport = inet->dport;
+ tw->tw_sport = inet->inet_sport;
+ tw->tw_dport = inet->inet_dport;
tw->tw_family = sk->sk_family;
tw->tw_reuse = sk->sk_reuse;
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
+ tw->tw_transparent = inet->transparent;
tw->tw_prot = sk->sk_prot_creator;
- atomic_set(&tw->tw_refcnt, 1);
+ twsk_net_set(tw, hold_net(sock_net(sk)));
+ /*
+ * Because we use RCU lookups, we should not set tw_refcnt
+ * to a non null value before everything is setup for this
+ * timewait socket.
+ */
+ atomic_set(&tw->tw_refcnt, 0);
inet_twsk_dead_node_init(tw);
__module_get(tw->tw_prot->owner);
}
return tw;
}
-
EXPORT_SYMBOL_GPL(inet_twsk_alloc);
/* Returns non-zero if quota exceeded. */
@@ -123,7 +215,6 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
const int slot)
{
struct inet_timewait_sock *tw;
- struct hlist_node *node;
unsigned int killed;
int ret;
@@ -136,10 +227,13 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
killed = 0;
ret = 0;
rescan:
- inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
__inet_twsk_del_dead_node(tw);
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
+#ifdef CONFIG_NET_NS
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+#endif
inet_twsk_put(tw);
killed++;
spin_lock(&twdr->death_lock);
@@ -158,15 +252,16 @@ rescan:
}
twdr->tw_count -= killed;
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
-
+#ifndef CONFIG_NET_NS
+ NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
+#endif
return ret;
}
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
- int unsigned need_timer;
+ unsigned int need_timer;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
@@ -183,26 +278,23 @@ void inet_twdr_hangman(unsigned long data)
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
+ twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
}
- twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
spin_unlock(&twdr->death_lock);
}
-
EXPORT_SYMBOL_GPL(inet_twdr_hangman);
-extern void twkill_slots_invalid(void);
-
void inet_twdr_twkill_work(struct work_struct *work)
{
struct inet_timewait_death_row *twdr =
container_of(work, struct inet_timewait_death_row, twkill_work);
int i;
- if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
- twkill_slots_invalid();
+ BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
+ (sizeof(twdr->thread_slots) * 8));
while (twdr->thread_slots) {
spin_lock_bh(&twdr->death_lock);
@@ -223,7 +315,6 @@ void inet_twdr_twkill_work(struct work_struct *work)
spin_unlock_bh(&twdr->death_lock);
}
}
-
EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
/* These are always called from BH context. See callers in
@@ -243,7 +334,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
}
-
EXPORT_SYMBOL(inet_twsk_deschedule);
void inet_twsk_schedule(struct inet_timewait_sock *tw,
@@ -296,11 +386,11 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
if (slot >= INET_TWDR_TWKILL_SLOTS)
slot = INET_TWDR_TWKILL_SLOTS - 1;
}
- tw->tw_ttd = jiffies + timeo;
+ tw->tw_ttd = inet_tw_time_stamp() + timeo;
slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
list = &twdr->cells[slot];
} else {
- tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+ tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
if (twdr->twcal_hand < 0) {
twdr->twcal_hand = 0;
@@ -324,7 +414,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
spin_unlock(&twdr->death_lock);
}
-
EXPORT_SYMBOL_GPL(inet_twsk_schedule);
void inet_twdr_twcal_tick(unsigned long data)
@@ -347,13 +436,16 @@ void inet_twdr_twcal_tick(unsigned long data)
for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
+ struct hlist_node *safe;
struct inet_timewait_sock *tw;
- inet_twsk_for_each_inmate_safe(tw, node, safe,
+ inet_twsk_for_each_inmate_safe(tw, safe,
&twdr->twcal_row[slot]) {
__inet_twsk_del_dead_node(tw);
__inet_twsk_kill(tw, twdr->hashinfo);
+#ifdef CONFIG_NET_NS
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+#endif
inet_twsk_put(tw);
killed++;
}
@@ -377,8 +469,57 @@ void inet_twdr_twcal_tick(unsigned long data)
out:
if ((twdr->tw_count -= killed) == 0)
del_timer(&twdr->tw_timer);
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+#ifndef CONFIG_NET_NS
+ NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
+#endif
spin_unlock(&twdr->death_lock);
}
-
EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
+
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
+ struct inet_timewait_death_row *twdr, int family)
+{
+ struct inet_timewait_sock *tw;
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ unsigned int slot;
+
+ for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+ rcu_read_lock();
+restart:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_state != TCP_TIME_WAIT)
+ continue;
+ tw = inet_twsk(sk);
+ if ((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))
+ continue;
+
+ if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
+ continue;
+
+ if (unlikely((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))) {
+ inet_twsk_put(tw);
+ goto restart;
+ }
+
+ rcu_read_unlock();
+ local_bh_disable();
+ inet_twsk_deschedule(tw, twdr);
+ local_bh_enable();
+ inet_twsk_put(tw);
+ goto restart_rcu;
+ }
+ /* If the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto restart;
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 771031dfbd0..bd5f5928167 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -3,8 +3,6 @@
*
* This source is covered by the GNU GPL, the same as all kernel sources.
*
- * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $
- *
* Authors: Andrey V. Savochkin <saw@msu.ru>
*/
@@ -19,27 +17,16 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
+#include <net/secure_seq.h>
/*
* Theory of operations.
* We keep one entry for each peer IP address. The nodes contains long-living
* information about the peer which doesn't depend on routes.
- * At this moment this information consists only of ID field for the next
- * outgoing IP packet. This field is incremented with each packet as encoded
- * in inet_getid() function (include/net/inetpeer.h).
- * At the moment of writing this notes identifier of IP packets is generated
- * to be unpredictable using this code only for packets subjected
- * (actually or potentially) to defragmentation. I.e. DF packets less than
- * PMTU in size uses a constant ID and do not use this code (see
- * ip_select_ident() in include/net/ip.h).
*
- * Route cache entries hold references to our nodes.
- * New cache entries get references via lookup by destination IP address in
- * the avl tree. The reference is grabbed only when it's needed i.e. only
- * when we try to output IP packet which needs an unpredictable ID (see
- * __ip_select_ident() in net/ipv4/route.c).
* Nodes are removed only when reference counter goes to 0.
* When it's happened the node may be removed when a sufficient amount of
* time has been passed since its last use. The less-recently-used entry can
@@ -53,54 +40,115 @@
* lookups performed with disabled BHs.
*
* Serialisation issues.
- * 1. Nodes may appear in the tree only with the pool write lock held.
- * 2. Nodes may disappear from the tree only with the pool write lock held
+ * 1. Nodes may appear in the tree only with the pool lock held.
+ * 2. Nodes may disappear from the tree only with the pool lock held
* AND reference count being 0.
- * 3. Nodes appears and disappears from unused node list only under
- * "inet_peer_unused_lock".
- * 4. Global variable peer_total is modified under the pool lock.
- * 5. struct inet_peer fields modification:
+ * 3. Global variable peer_total is modified under the pool lock.
+ * 4. struct inet_peer fields modification:
* avl_left, avl_right, avl_parent, avl_height: pool lock
- * unused_next, unused_prevp: unused node list lock
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
- * dtime: unused node list lock
- * v4daddr: unchangeable
- * ip_id_count: idlock
+ * daddr: unchangeable
*/
-/* Exported for inet_getid inline function. */
-DEFINE_SPINLOCK(inet_peer_idlock);
-
static struct kmem_cache *peer_cachep __read_mostly;
+static LIST_HEAD(gc_list);
+static const int gc_delay = 60 * HZ;
+static struct delayed_work gc_work;
+static DEFINE_SPINLOCK(gc_lock);
+
#define node_height(x) x->avl_height
-static struct inet_peer peer_fake_node = {
- .avl_left = &peer_fake_node,
- .avl_right = &peer_fake_node,
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
+#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+ .avl_left = peer_avl_empty_rcu,
+ .avl_right = peer_avl_empty_rcu,
.avl_height = 0
};
-#define peer_avl_empty (&peer_fake_node)
-static struct inet_peer *peer_root = peer_avl_empty;
-static DEFINE_RWLOCK(peer_pool_lock);
+
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+ bp->root = peer_avl_empty_rcu;
+ seqlock_init(&bp->lock);
+ bp->flush_seq = ~0U;
+ bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
+
+static atomic_t v4_seq = ATOMIC_INIT(0);
+static atomic_t v6_seq = ATOMIC_INIT(0);
+
+static atomic_t *inetpeer_seq_ptr(int family)
+{
+ return (family == AF_INET ? &v4_seq : &v6_seq);
+}
+
+static inline void flush_check(struct inet_peer_base *base, int family)
+{
+ atomic_t *fp = inetpeer_seq_ptr(family);
+
+ if (unlikely(base->flush_seq != atomic_read(fp))) {
+ inetpeer_invalidate_tree(base);
+ base->flush_seq = atomic_read(fp);
+ }
+}
+
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
-static int peer_total;
/* Exported for sysctl_net_ipv4. */
int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
* aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
-int inet_peer_gc_mintime __read_mostly = 10 * HZ;
-int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-static struct inet_peer *inet_peer_unused_head;
-static struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
-static DEFINE_SPINLOCK(inet_peer_unused_lock);
+static void inetpeer_gc_worker(struct work_struct *work)
+{
+ struct inet_peer *p, *n, *c;
+ struct list_head list;
-static void peer_check_expire(unsigned long dummy);
-static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
+ spin_lock_bh(&gc_lock);
+ list_replace_init(&gc_list, &list);
+ spin_unlock_bh(&gc_lock);
+ if (list_empty(&list))
+ return;
+
+ list_for_each_entry_safe(p, n, &list, gc_list) {
+
+ if (need_resched())
+ cond_resched();
+
+ c = rcu_dereference_protected(p->avl_left, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_left = peer_avl_empty_rcu;
+ }
+
+ c = rcu_dereference_protected(p->avl_right, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_right = peer_avl_empty_rcu;
+ }
+
+ n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
+
+ if (!atomic_read(&p->refcnt)) {
+ list_del(&p->gc_list);
+ kmem_cache_free(peer_cachep, p);
+ }
+ }
+
+ if (list_empty(&list))
+ return;
+
+ spin_lock_bh(&gc_lock);
+ list_splice(&list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
@@ -122,145 +170,179 @@ void __init inet_initpeers(void)
peer_cachep = kmem_cache_create("inet_peer_cache",
sizeof(struct inet_peer),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- peer_periodic_timer.expires = jiffies
- + net_random() % inet_peer_gc_maxtime
- + inet_peer_gc_maxtime;
- add_timer(&peer_periodic_timer);
+ INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
-/* Called with or without local BH being disabled. */
-static void unlink_from_unused(struct inet_peer *p)
+static int addr_compare(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
{
- spin_lock_bh(&inet_peer_unused_lock);
- if (p->unused_prevp != NULL) {
- /* On unused list. */
- *p->unused_prevp = p->unused_next;
- if (p->unused_next != NULL)
- p->unused_next->unused_prevp = p->unused_prevp;
- else
- inet_peer_unused_tailp = p->unused_prevp;
- p->unused_prevp = NULL; /* mark it as removed */
+ int i, n = (a->family == AF_INET ? 1 : 4);
+
+ for (i = 0; i < n; i++) {
+ if (a->addr.a6[i] == b->addr.a6[i])
+ continue;
+ if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
+ return -1;
+ return 1;
}
- spin_unlock_bh(&inet_peer_unused_lock);
+
+ return 0;
}
+#define rcu_deref_locked(X, BASE) \
+ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
/*
* Called with local BH disabled and the pool lock held.
- * _stack is known to be NULL or not at compile time,
- * so compiler will optimize the if (_stack) tests.
*/
-#define lookup(_daddr,_stack) \
+#define lookup(_daddr, _stack, _base) \
({ \
- struct inet_peer *u, **v; \
- if (_stack != NULL) { \
- stackptr = _stack; \
- *stackptr++ = &peer_root; \
- } \
- for (u = peer_root; u != peer_avl_empty; ) { \
- if (_daddr == u->v4daddr) \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
+ \
+ stackptr = _stack; \
+ *stackptr++ = &_base->root; \
+ for (u = rcu_deref_locked(_base->root, _base); \
+ u != peer_avl_empty;) { \
+ int cmp = addr_compare(_daddr, &u->daddr); \
+ if (cmp == 0) \
break; \
- if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
+ if (cmp == -1) \
v = &u->avl_left; \
else \
v = &u->avl_right; \
- if (_stack != NULL) \
- *stackptr++ = v; \
- u = *v; \
+ *stackptr++ = v; \
+ u = rcu_deref_locked(*v, _base); \
} \
u; \
})
-/* Called with local BH disabled and the pool write lock held. */
-#define lookup_rightempty(start) \
+/*
+ * Called with rcu_read_lock()
+ * Because we hold no lock against a writer, its quite possible we fall
+ * in an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if number of links exceeds PEER_MAXDEPTH
+ */
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base)
+{
+ struct inet_peer *u = rcu_dereference(base->root);
+ int count = 0;
+
+ while (u != peer_avl_empty) {
+ int cmp = addr_compare(daddr, &u->daddr);
+ if (cmp == 0) {
+ /* Before taking a reference, check if this entry was
+ * deleted (refcnt=-1)
+ */
+ if (!atomic_add_unless(&u->refcnt, 1, -1))
+ u = NULL;
+ return u;
+ }
+ if (cmp == -1)
+ u = rcu_dereference(u->avl_left);
+ else
+ u = rcu_dereference(u->avl_right);
+ if (unlikely(++count == PEER_MAXDEPTH))
+ break;
+ }
+ return NULL;
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define lookup_rightempty(start, base) \
({ \
- struct inet_peer *u, **v; \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
*stackptr++ = &start->avl_left; \
v = &start->avl_left; \
- for (u = *v; u->avl_right != peer_avl_empty; ) { \
+ for (u = rcu_deref_locked(*v, base); \
+ u->avl_right != peer_avl_empty_rcu;) { \
v = &u->avl_right; \
*stackptr++ = v; \
- u = *v; \
+ u = rcu_deref_locked(*v, base); \
} \
u; \
})
-/* Called with local BH disabled and the pool write lock held.
+/* Called with local BH disabled and the pool lock held.
* Variable names are the proof of operation correctness.
- * Look into mm/map_avl.c for more detail description of the ideas. */
-static void peer_avl_rebalance(struct inet_peer **stack[],
- struct inet_peer ***stackend)
+ * Look into mm/map_avl.c for more detail description of the ideas.
+ */
+static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
+ struct inet_peer __rcu ***stackend,
+ struct inet_peer_base *base)
{
- struct inet_peer **nodep, *node, *l, *r;
+ struct inet_peer __rcu **nodep;
+ struct inet_peer *node, *l, *r;
int lh, rh;
while (stackend > stack) {
nodep = *--stackend;
- node = *nodep;
- l = node->avl_left;
- r = node->avl_right;
+ node = rcu_deref_locked(*nodep, base);
+ l = rcu_deref_locked(node->avl_left, base);
+ r = rcu_deref_locked(node->avl_right, base);
lh = node_height(l);
rh = node_height(r);
if (lh > rh + 1) { /* l: RH+2 */
struct inet_peer *ll, *lr, *lrl, *lrr;
int lrh;
- ll = l->avl_left;
- lr = l->avl_right;
+ ll = rcu_deref_locked(l->avl_left, base);
+ lr = rcu_deref_locked(l->avl_right, base);
lrh = node_height(lr);
if (lrh <= node_height(ll)) { /* ll: RH+1 */
- node->avl_left = lr; /* lr: RH or RH+1 */
- node->avl_right = r; /* r: RH */
+ RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
node->avl_height = lrh + 1; /* RH+1 or RH+2 */
- l->avl_left = ll; /* ll: RH+1 */
- l->avl_right = node; /* node: RH+1 or RH+2 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
+ RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
l->avl_height = node->avl_height + 1;
- *nodep = l;
+ RCU_INIT_POINTER(*nodep, l);
} else { /* ll: RH, lr: RH+1 */
- lrl = lr->avl_left; /* lrl: RH or RH-1 */
- lrr = lr->avl_right; /* lrr: RH or RH-1 */
- node->avl_left = lrr; /* lrr: RH or RH-1 */
- node->avl_right = r; /* r: RH */
+ lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+ lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
node->avl_height = rh + 1; /* node: RH+1 */
- l->avl_left = ll; /* ll: RH */
- l->avl_right = lrl; /* lrl: RH or RH-1 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
+ RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
l->avl_height = rh + 1; /* l: RH+1 */
- lr->avl_left = l; /* l: RH+1 */
- lr->avl_right = node; /* node: RH+1 */
+ RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
+ RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
lr->avl_height = rh + 2;
- *nodep = lr;
+ RCU_INIT_POINTER(*nodep, lr);
}
} else if (rh > lh + 1) { /* r: LH+2 */
struct inet_peer *rr, *rl, *rlr, *rll;
int rlh;
- rr = r->avl_right;
- rl = r->avl_left;
+ rr = rcu_deref_locked(r->avl_right, base);
+ rl = rcu_deref_locked(r->avl_left, base);
rlh = node_height(rl);
if (rlh <= node_height(rr)) { /* rr: LH+1 */
- node->avl_right = rl; /* rl: LH or LH+1 */
- node->avl_left = l; /* l: LH */
+ RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
node->avl_height = rlh + 1; /* LH+1 or LH+2 */
- r->avl_right = rr; /* rr: LH+1 */
- r->avl_left = node; /* node: LH+1 or LH+2 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
+ RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
r->avl_height = node->avl_height + 1;
- *nodep = r;
+ RCU_INIT_POINTER(*nodep, r);
} else { /* rr: RH, rl: RH+1 */
- rlr = rl->avl_right; /* rlr: LH or LH-1 */
- rll = rl->avl_left; /* rll: LH or LH-1 */
- node->avl_right = rll; /* rll: LH or LH-1 */
- node->avl_left = l; /* l: LH */
+ rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+ rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
node->avl_height = lh + 1; /* node: LH+1 */
- r->avl_right = rr; /* rr: LH */
- r->avl_left = rlr; /* rlr: LH or LH-1 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
+ RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
r->avl_height = lh + 1; /* r: LH+1 */
- rl->avl_right = r; /* r: LH+1 */
- rl->avl_left = node; /* node: LH+1 */
+ RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
+ RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
rl->avl_height = lh + 2;
- *nodep = rl;
+ RCU_INIT_POINTER(*nodep, rl);
}
} else {
node->avl_height = (lh > rh ? lh : rh) + 1;
@@ -268,210 +350,230 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
}
}
-/* Called with local BH disabled and the pool write lock held. */
-#define link_to_pool(n) \
+/* Called with local BH disabled and the pool lock held. */
+#define link_to_pool(n, base) \
do { \
n->avl_height = 1; \
- n->avl_left = peer_avl_empty; \
- n->avl_right = peer_avl_empty; \
- **--stackptr = n; \
- peer_avl_rebalance(stack, stackptr); \
-} while(0)
-
-/* May be called with local BH enabled. */
-static void unlink_from_pool(struct inet_peer *p)
+ n->avl_left = peer_avl_empty_rcu; \
+ n->avl_right = peer_avl_empty_rcu; \
+ /* lockless readers can catch us now */ \
+ rcu_assign_pointer(**--stackptr, n); \
+ peer_avl_rebalance(stack, stackptr, base); \
+} while (0)
+
+static void inetpeer_free_rcu(struct rcu_head *head)
{
- int do_free;
-
- do_free = 0;
-
- write_lock_bh(&peer_pool_lock);
- /* Check the reference counter. It was artificially incremented by 1
- * in cleanup() function to prevent sudden disappearing. If the
- * reference count is still 1 then the node is referenced only as `p'
- * here and from the pool. So under the exclusive pool lock it's safe
- * to remove the node and free it later. */
- if (atomic_read(&p->refcnt) == 1) {
- struct inet_peer **stack[PEER_MAXDEPTH];
- struct inet_peer ***stackptr, ***delp;
- if (lookup(p->v4daddr, stack) != p)
- BUG();
- delp = stackptr - 1; /* *delp[0] == p */
- if (p->avl_left == peer_avl_empty) {
- *delp[0] = p->avl_right;
- --stackptr;
- } else {
- /* look for a node to insert instead of p */
- struct inet_peer *t;
- t = lookup_rightempty(p);
- BUG_ON(*stackptr[-1] != t);
- **--stackptr = t->avl_left;
- /* t is removed, t->v4daddr > x->v4daddr for any
- * x in p->avl_left subtree.
- * Put t in the old place of p. */
- *delp[0] = t;
- t->avl_left = p->avl_left;
- t->avl_right = p->avl_right;
- t->avl_height = p->avl_height;
- BUG_ON(delp[1] != &p->avl_left);
- delp[1] = &t->avl_left; /* was &p->avl_left */
- }
- peer_avl_rebalance(stack, stackptr);
- peer_total--;
- do_free = 1;
- }
- write_unlock_bh(&peer_pool_lock);
+ kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
- if (do_free)
- kmem_cache_free(peer_cachep, p);
- else
- /* The node is used again. Decrease the reference counter
- * back. The loop "cleanup -> unlink_from_unused
- * -> unlink_from_pool -> putpeer -> link_to_unused
- * -> cleanup (for the same node)"
- * doesn't really exist because the entry will have a
- * recent deletion time and will not be cleaned again soon. */
- inet_putpeer(p);
+static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH])
+{
+ struct inet_peer __rcu ***stackptr, ***delp;
+
+ if (lookup(&p->daddr, stack, base) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty_rcu) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p, base);
+ BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+ **--stackptr = t->avl_left;
+ /* t is removed, t->daddr > x->daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ RCU_INIT_POINTER(*delp[0], t);
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ BUG_ON(delp[1] != &p->avl_left);
+ delp[1] = &t->avl_left; /* was &p->avl_left */
+ }
+ peer_avl_rebalance(stack, stackptr, base);
+ base->total--;
+ call_rcu(&p->rcu, inetpeer_free_rcu);
}
-/* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl)
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+ struct inet_peer __rcu ***stackptr)
{
- struct inet_peer *p;
+ struct inet_peer *p, *gchead = NULL;
+ __u32 delta, ttl;
+ int cnt = 0;
- /* Remove the first entry from the list of unused nodes. */
- spin_lock_bh(&inet_peer_unused_lock);
- p = inet_peer_unused_head;
- if (p != NULL) {
- __u32 delta = (__u32)jiffies - p->dtime;
- if (delta < ttl) {
- /* Do not prune fresh entries. */
- spin_unlock_bh(&inet_peer_unused_lock);
- return -1;
+ if (base->total >= inet_peer_threshold)
+ ttl = 0; /* be aggressive */
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ base->total / inet_peer_threshold * HZ;
+ stackptr--; /* last stack slot is peer_avl_empty */
+ while (stackptr > stack) {
+ stackptr--;
+ p = rcu_deref_locked(**stackptr, base);
+ if (atomic_read(&p->refcnt) == 0) {
+ smp_rmb();
+ delta = (__u32)jiffies - p->dtime;
+ if (delta >= ttl &&
+ atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+ p->gc_next = gchead;
+ gchead = p;
+ }
}
- inet_peer_unused_head = p->unused_next;
- if (p->unused_next != NULL)
- p->unused_next->unused_prevp = p->unused_prevp;
- else
- inet_peer_unused_tailp = p->unused_prevp;
- p->unused_prevp = NULL; /* mark as not on the list */
- /* Grab an extra reference to prevent node disappearing
- * before unlink_from_pool() call. */
- atomic_inc(&p->refcnt);
}
- spin_unlock_bh(&inet_peer_unused_lock);
-
- if (p == NULL)
- /* It means that the total number of USED entries has
- * grown over inet_peer_threshold. It shouldn't really
- * happen because of entry limits in route cache. */
- return -1;
-
- unlink_from_pool(p);
- return 0;
+ while ((p = gchead) != NULL) {
+ gchead = p->gc_next;
+ cnt++;
+ unlink_from_pool(p, base, stack);
+ }
+ return cnt;
}
-/* Called with or without local BH being disabled. */
-struct inet_peer *inet_getpeer(__be32 daddr, int create)
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+ const struct inetpeer_addr *daddr,
+ int create)
{
- struct inet_peer *p, *n;
- struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
+ struct inet_peer *p;
+ unsigned int sequence;
+ int invalidated, gccnt = 0;
- /* Look up for the address quickly. */
- read_lock_bh(&peer_pool_lock);
- p = lookup(daddr, NULL);
- if (p != peer_avl_empty)
- atomic_inc(&p->refcnt);
- read_unlock_bh(&peer_pool_lock);
+ flush_check(base, daddr->family);
- if (p != peer_avl_empty) {
- /* The existing node has been found. */
- /* Remove the entry from unused list if it was there. */
- unlink_from_unused(p);
+ /* Attempt a lockless lookup first.
+ * Because of a concurrent writer, we might not find an existing entry.
+ */
+ rcu_read_lock();
+ sequence = read_seqbegin(&base->lock);
+ p = lookup_rcu(daddr, base);
+ invalidated = read_seqretry(&base->lock, sequence);
+ rcu_read_unlock();
+
+ if (p)
return p;
- }
- if (!create)
+ /* If no writer did a change during our lookup, we can return early. */
+ if (!create && !invalidated)
return NULL;
- /* Allocate the space outside the locked region. */
- n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
- if (n == NULL)
- return NULL;
- n->v4daddr = daddr;
- atomic_set(&n->refcnt, 1);
- atomic_set(&n->rid, 0);
- n->ip_id_count = secure_ip_id(daddr);
- n->tcp_ts_stamp = 0;
-
- write_lock_bh(&peer_pool_lock);
- /* Check if an entry has suddenly appeared. */
- p = lookup(daddr, stack);
- if (p != peer_avl_empty)
- goto out_free;
-
- /* Link the node. */
- link_to_pool(n);
- n->unused_prevp = NULL; /* not on the list */
- peer_total++;
- write_unlock_bh(&peer_pool_lock);
-
- if (peer_total >= inet_peer_threshold)
- /* Remove one less-recently-used entry. */
- cleanup_once(0);
-
- return n;
-
-out_free:
- /* The appropriate node is already in the pool. */
- atomic_inc(&p->refcnt);
- write_unlock_bh(&peer_pool_lock);
- /* Remove the entry from unused list if it was there. */
- unlink_from_unused(p);
- /* Free preallocated the preallocated node. */
- kmem_cache_free(peer_cachep, n);
+ /* retry an exact lookup, taking the lock before.
+ * At least, nodes should be hot in our cache.
+ */
+ write_seqlock_bh(&base->lock);
+relookup:
+ p = lookup(daddr, stack, base);
+ if (p != peer_avl_empty) {
+ atomic_inc(&p->refcnt);
+ write_sequnlock_bh(&base->lock);
+ return p;
+ }
+ if (!gccnt) {
+ gccnt = inet_peer_gc(base, stack, stackptr);
+ if (gccnt && create)
+ goto relookup;
+ }
+ p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
+ if (p) {
+ p->daddr = *daddr;
+ atomic_set(&p->refcnt, 1);
+ atomic_set(&p->rid, 0);
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ /* 60*HZ is arbitrary, but chosen enough high so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+ INIT_LIST_HEAD(&p->gc_list);
+
+ /* Link the node. */
+ link_to_pool(p, base);
+ base->total++;
+ }
+ write_sequnlock_bh(&base->lock);
+
return p;
}
+EXPORT_SYMBOL_GPL(inet_getpeer);
-/* Called with local BH disabled. */
-static void peer_check_expire(unsigned long dummy)
+void inet_putpeer(struct inet_peer *p)
{
- unsigned long now = jiffies;
- int ttl;
+ p->dtime = (__u32)jiffies;
+ smp_mb__before_atomic();
+ atomic_dec(&p->refcnt);
+}
+EXPORT_SYMBOL_GPL(inet_putpeer);
- if (peer_total >= inet_peer_threshold)
- ttl = inet_peer_minttl;
- else
- ttl = inet_peer_maxttl
- - (inet_peer_maxttl - inet_peer_minttl) / HZ *
- peer_total / inet_peer_threshold * HZ;
- while (!cleanup_once(ttl)) {
- if (jiffies != now)
- break;
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+ unsigned long now, token;
+ bool rc = false;
+
+ if (!peer)
+ return true;
+
+ token = peer->rate_tokens;
+ now = jiffies;
+ token += now - peer->rate_last;
+ peer->rate_last = now;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
}
+ peer->rate_tokens = token;
+ return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
- /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
- * interval depending on the total number of entries (more entries,
- * less interval). */
- if (peer_total >= inet_peer_threshold)
- peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
- else
- peer_periodic_timer.expires = jiffies
- + inet_peer_gc_maxtime
- - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- peer_total / inet_peer_threshold * HZ;
- add_timer(&peer_periodic_timer);
+static void inetpeer_inval_rcu(struct rcu_head *head)
+{
+ struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
+
+ spin_lock_bh(&gc_lock);
+ list_add_tail(&p->gc_list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
}
-void inet_putpeer(struct inet_peer *p)
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
- spin_lock_bh(&inet_peer_unused_lock);
- if (atomic_dec_and_test(&p->refcnt)) {
- p->unused_prevp = inet_peer_unused_tailp;
- p->unused_next = NULL;
- *inet_peer_unused_tailp = p;
- inet_peer_unused_tailp = &p->unused_next;
- p->dtime = (__u32)jiffies;
+ struct inet_peer *root;
+
+ write_seqlock_bh(&base->lock);
+
+ root = rcu_deref_locked(base->root, base);
+ if (root != peer_avl_empty) {
+ base->root = peer_avl_empty_rcu;
+ base->total = 0;
+ call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
}
- spin_unlock_bh(&inet_peer_unused_lock);
+
+ write_sequnlock_bh(&base->lock);
}
+EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 877da3ed52e..3a83ce5efa8 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -5,8 +5,6 @@
*
* The IP forwarding functionality.
*
- * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $
- *
* Authors: see ip.c
*
* Fixes:
@@ -27,6 +25,7 @@
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp.h>
@@ -40,11 +39,30 @@
#include <net/route.h>
#include <net/xfrm.h>
+static bool ip_may_fragment(const struct sk_buff *skb)
+{
+ return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
+ skb->ignore_df;
+}
+
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ return false;
+
+ return true;
+}
+
+
static int ip_forward_finish(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -54,9 +72,17 @@ static int ip_forward_finish(struct sk_buff *skb)
int ip_forward(struct sk_buff *skb)
{
+ u32 mtu;
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ /* that should never happen */
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
goto drop;
@@ -64,9 +90,6 @@ int ip_forward(struct sk_buff *skb)
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
- if (skb->pkt_type != PACKET_HOST)
- goto drop;
-
skb_forward_csum(skb);
/*
@@ -80,21 +103,22 @@ int ip_forward(struct sk_buff *skb)
if (!xfrm4_route_forward(skb))
goto drop;
- rt = (struct rtable*)skb->dst;
+ rt = skb_rtable(skb);
- if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
- if (unlikely(skb->len > dst_mtu(&rt->u.dst) &&
- (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
- IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
+ IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dst_mtu(&rt->u.dst)));
+ htonl(mtu));
goto drop;
}
/* We are about to mangle packet. Copy it! */
- if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
iph = ip_hdr(skb);
@@ -105,13 +129,13 @@ int ip_forward(struct sk_buff *skb)
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
- if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp)
+ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
ip_rt_send_redirect(skb);
skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
- ip_forward_finish);
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
+ rt->dst.dev, ip_forward_finish);
sr_failed:
/*
@@ -122,7 +146,7 @@ sr_failed:
too_many_hops:
/* Tell the sender its packet died... */
- IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c