diff options
Diffstat (limited to 'net/ipv6')
95 files changed, 27700 insertions, 13957 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 47263e45bac..438a73aa777 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -3,7 +3,7 @@ # # IPv6 as module will cause a CRASH if you try to unload it -config IPV6 +menuconfig IPV6 tristate "The IPv6 protocol" default m ---help--- @@ -11,7 +11,7 @@ config IPV6 You will still be able to do traditional IPv4 networking as well. For general information about IPv6, see - <http://playground.sun.com/pub/ipng/html/ipng-main.html>. + <https://en.wikipedia.org/wiki/IPv6>. For Linux IPv6 development information, see <http://www.linux-ipv6.org>. For specific information about IPv6 under Linux, read the HOWTO at <http://www.bieringer.de/linux/IPv6/>. @@ -19,47 +19,28 @@ config IPV6 To compile this protocol support as a module, choose M here: the module will be called ipv6. -config IPV6_PRIVACY - bool "IPv6: Privacy Extensions support" - depends on IPV6 - ---help--- - Privacy Extensions for Stateless Address Autoconfiguration in IPv6 - support. With this option, additional periodically-alter - pseudo-random global-scope unicast address(es) will assigned to - your interface(s). - - We use our standard pseudo random algorithm to generate randomized - interface identifier, instead of one described in RFC 3041. - - By default, kernel do not generate temporary addresses. - To use temporary addresses, do - - echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr - - See <file:Documentation/networking/ip-sysctl.txt> for details. +if IPV6 config IPV6_ROUTER_PREF bool "IPv6: Router Preference (RFC 4191) support" - depends on IPV6 ---help--- Router Preference is an optional extension to the Router - Advertisement message to improve the ability of hosts - to pick more appropriate router, especially when the hosts - is placed in a multi-homed network. + Advertisement message which improves the ability of hosts + to pick an appropriate router, especially when the hosts + are placed in a multi-homed network. If unsure, say N. config IPV6_ROUTE_INFO - bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)" - depends on IPV6_ROUTER_PREF && EXPERIMENTAL + bool "IPv6: Route Information (RFC 4191) support" + depends on IPV6_ROUTER_PREF ---help--- This is experimental support of Route Information. If unsure, say N. config IPV6_OPTIMISTIC_DAD - bool "IPv6: Enable RFC 4429 Optimistic DAD (EXPERIMENTAL)" - depends on IPV6 && EXPERIMENTAL + bool "IPv6: Enable RFC 4429 Optimistic DAD" ---help--- This is experimental support for optimistic Duplicate Address Detection. It allows for autoconfigured addresses @@ -69,8 +50,7 @@ config IPV6_OPTIMISTIC_DAD config INET6_AH tristate "IPv6: AH transformation" - depends on IPV6 - select XFRM + select XFRM_ALGO select CRYPTO select CRYPTO_HMAC select CRYPTO_MD5 @@ -82,8 +62,7 @@ config INET6_AH config INET6_ESP tristate "IPv6: ESP transformation" - depends on IPV6 - select XFRM + select XFRM_ALGO select CRYPTO select CRYPTO_AUTHENC select CRYPTO_HMAC @@ -98,11 +77,8 @@ config INET6_ESP config INET6_IPCOMP tristate "IPv6: IPComp transformation" - depends on IPV6 - select XFRM select INET6_XFRM_TUNNEL - select CRYPTO - select CRYPTO_DEFLATE + select XFRM_IPCOMP ---help--- Support for IP Payload Compression Protocol (IPComp) (RFC3173), typically needed for IPsec. @@ -110,8 +86,7 @@ config INET6_IPCOMP If unsure, say Y. config IPV6_MIP6 - tristate "IPv6: Mobility (EXPERIMENTAL)" - depends on IPV6 && EXPERIMENTAL + tristate "IPv6: Mobility" select XFRM ---help--- Support for IPv6 Mobility described in RFC 3775. @@ -129,7 +104,6 @@ config INET6_TUNNEL config INET6_XFRM_MODE_TRANSPORT tristate "IPv6: IPsec transport mode" - depends on IPV6 default IPV6 select XFRM ---help--- @@ -139,7 +113,6 @@ config INET6_XFRM_MODE_TRANSPORT config INET6_XFRM_MODE_TUNNEL tristate "IPv6: IPsec tunnel mode" - depends on IPV6 default IPV6 select XFRM ---help--- @@ -149,7 +122,6 @@ config INET6_XFRM_MODE_TUNNEL config INET6_XFRM_MODE_BEET tristate "IPv6: IPsec BEET mode" - depends on IPV6 default IPV6 select XFRM ---help--- @@ -158,16 +130,28 @@ config INET6_XFRM_MODE_BEET If unsure, say Y. config INET6_XFRM_MODE_ROUTEOPTIMIZATION - tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)" - depends on IPV6 && EXPERIMENTAL + tristate "IPv6: MIPv6 route optimization mode" select XFRM ---help--- Support for MIPv6 route optimization mode. +config IPV6_VTI +tristate "Virtual (secure) IPv6: tunneling" + select IPV6_TUNNEL + select NET_IP_TUNNEL + depends on INET6_XFRM_MODE_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This can be used with xfrm mode tunnel to give + the notion of a secure tunnel for IPSEC and then use routing protocol + on top. + config IPV6_SIT tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)" - depends on IPV6 select INET_TUNNEL + select NET_IP_TUNNEL + select IPV6_NDISC_NODETYPE default y ---help--- Tunneling means encapsulating data of one protocol type within @@ -176,21 +160,58 @@ config IPV6_SIT into IPv4 packets. This is useful if you want to connect two IPv6 networks over an IPv4-only path. - Saying M here will produce a module called sit.ko. If unsure, say Y. + Saying M here will produce a module called sit. If unsure, say Y. + +config IPV6_SIT_6RD + bool "IPv6: IPv6 Rapid Deployment (6RD)" + depends on IPV6_SIT + default n + ---help--- + IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon + mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly + deploy IPv6 unicast service to IPv4 sites to which it provides + customer premise equipment. Like 6to4, it utilizes stateless IPv6 in + IPv4 encapsulation in order to transit IPv4-only network + infrastructure. Unlike 6to4, a 6rd service provider uses an IPv6 + prefix of its own in place of the fixed 6to4 prefix. + + With this option enabled, the SIT driver offers 6rd functionality by + providing additional ioctl API to configure the IPv6 Prefix for in + stead of static 2002::/16 for 6to4. + + If unsure, say N. + +config IPV6_NDISC_NODETYPE + bool config IPV6_TUNNEL tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL - depends on IPV6 ---help--- Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. If unsure, say N. +config IPV6_GRE + tristate "IPv6: GRE tunnel" + select IPV6_TUNNEL + select NET_IP_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + GRE (Generic Routing Encapsulation) and at this time allows + encapsulating of IPv4 or IPv6 over existing IPv6 infrastructure. + This driver is useful if the other endpoint is a Cisco router: Cisco + likes GRE much better than the other Linux tunneling driver ("IP + tunneling" above). In addition, GRE allows multicast redistribution + through the tunnel. + + Saying M here will produce a module called ip6_gre. If unsure, say N. + config IPV6_MULTIPLE_TABLES bool "IPv6: Multiple Routing Tables" - depends on IPV6 && EXPERIMENTAL select FIB_RULES ---help--- Support multiple routing tables. @@ -209,3 +230,32 @@ config IPV6_SUBTREES If unsure, say N. +config IPV6_MROUTE + bool "IPv6: multicast routing" + depends on IPV6 + ---help--- + Experimental support for IPv6 multicast forwarding. + If unsure, say N. + +config IPV6_MROUTE_MULTIPLE_TABLES + bool "IPv6: multicast policy routing" + depends on IPV6_MROUTE + select FIB_RULES + help + Normally, a multicast router runs a userspace daemon and decides + what to do with a multicast packet based on the source and + destination addresses. If you say Y here, the multicast router + will also be able to take interfaces and packet marks into + account and run multiple instances of userspace daemons + simultaneously, each one handling a single table. + + If unsure, say N. + +config IPV6_PIMSM_V2 + bool "IPv6: PIM-SM version 2 support" + depends on IPV6_MROUTE + ---help--- + Support for IPv6 PIM multicast routing protocol PIM-SMv2. + If unsure, say N. + +endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 24f3aa0f2a3..2fe68364bb2 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -7,15 +7,20 @@ obj-$(CONFIG_IPV6) += ipv6.o ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ addrlabel.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ - raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ + raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o +ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o + ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o +ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o + ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ - xfrm6_output.o + xfrm6_output.o xfrm6_protocol.o ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o +ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o ipv6-objs += $(ipv6-y) @@ -31,9 +36,12 @@ obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_IPV6_VTI) += ip6_vti.o obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o +obj-$(CONFIG_IPV6_GRE) += ip6_gre.o -obj-y += addrconf_core.o exthdrs_core.o +obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o +obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e7a1882db04..5667b3003af 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6,8 +6,6 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: addrconf.c,v 1.69 2001/10/31 21:55:54 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -40,8 +38,11 @@ * status etc. */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/errno.h> #include <linux/types.h> +#include <linux/kernel.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> @@ -54,6 +55,7 @@ #include <linux/route.h> #include <linux/inetdevice.h> #include <linux/init.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -61,11 +63,14 @@ #include <linux/delay.h> #include <linux/notifier.h> #include <linux/string.h> +#include <linux/hash.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> +#include <net/af_ieee802154.h> +#include <net/firewire.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> @@ -77,28 +82,30 @@ #include <net/pkt_sched.h> #include <linux/if_tunnel.h> #include <linux/rtnetlink.h> - -#ifdef CONFIG_IPV6_PRIVACY +#include <linux/netconf.h> #include <linux/random.h> -#endif - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/export.h> /* Set to 3 to get tracing... */ #define ACONF_DEBUG 2 #if ACONF_DEBUG >= 3 -#define ADBG(x) printk x +#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define ADBG(x) +#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0) #endif #define INFINITY_LIFE_TIME 0xFFFFFFFF -#define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) + +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} #ifdef CONFIG_SYSCTL static void addrconf_sysctl_register(struct inet6_dev *idev); @@ -113,34 +120,40 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) } #endif -#ifdef CONFIG_IPV6_PRIVACY -static int __ipv6_regen_rndid(struct inet6_dev *idev); -static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void __ipv6_regen_rndid(struct inet6_dev *idev); +static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); static void ipv6_regen_rndid(unsigned long data); -static int desync_factor = MAX_DESYNC_FACTOR * HZ; -#endif - +static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(struct inet6_dev *idev); /* * Configured unicast address hash table */ -static struct inet6_ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE]; -static DEFINE_RWLOCK(addrconf_hash_lock); +static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(addrconf_hash_lock); -static void addrconf_verify(unsigned long); +static void addrconf_verify(void); +static void addrconf_verify_rtnl(void); +static void addrconf_verify_work(struct work_struct *); -static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0); -static DEFINE_SPINLOCK(addrconf_verify_lock); +static struct workqueue_struct *addrconf_wq; +static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work); static void addrconf_join_anycast(struct inet6_ifaddr *ifp); static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); +static void addrconf_type_change(struct net_device *dev, + unsigned long event); static int addrconf_ifdown(struct net_device *dev, int how); -static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); -static void addrconf_dad_timer(unsigned long data); +static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, + int plen, + const struct net_device *dev, + u32 flags, u32 noflags); + +static void addrconf_dad_start(struct inet6_ifaddr *ifp); +static void addrconf_dad_work(struct work_struct *w); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); @@ -149,12 +162,10 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo); -static int ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev); - -static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev); -struct ipv6_devconf ipv6_devconf __read_mostly = { +static struct ipv6_devconf ipv6_devconf __read_mostly = { .forwarding = 0, .hop_limit = IPV6_DEFAULT_HOPLIMIT, .mtu6 = IPV6_MIN_MTU, @@ -162,17 +173,17 @@ struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_redirects = 1, .autoconf = 1, .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, -#ifdef CONFIG_IPV6_PRIVACY .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, -#endif .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_pinfo = 1, @@ -185,6 +196,9 @@ struct ipv6_devconf ipv6_devconf __read_mostly = { #endif .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ + .disable_ipv6 = 0, + .accept_dad = 1, + .suppress_frag_ndisc = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -194,17 +208,18 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra = 1, .accept_redirects = 1, .autoconf = 1, + .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, -#ifdef CONFIG_IPV6_PRIVACY .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, -#endif .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_pinfo = 1, @@ -217,111 +232,82 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { #endif .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ + .disable_ipv6 = 0, + .accept_dad = 1, + .suppress_frag_ndisc = 1, }; -/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ -const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; -const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; - /* Check if a valid qdisc is available */ -static inline int addrconf_qdisc_ok(struct net_device *dev) +static inline bool addrconf_qdisc_ok(const struct net_device *dev) { - return (dev->qdisc != &noop_qdisc); + return !qdisc_tx_is_noop(dev); } -static void addrconf_del_timer(struct inet6_ifaddr *ifp) +static void addrconf_del_rs_timer(struct inet6_dev *idev) { - if (del_timer(&ifp->timer)) + if (del_timer(&idev->rs_timer)) + __in6_dev_put(idev); +} + +static void addrconf_del_dad_work(struct inet6_ifaddr *ifp) +{ + if (cancel_delayed_work(&ifp->dad_work)) __in6_ifa_put(ifp); } -enum addrconf_timer_t +static void addrconf_mod_rs_timer(struct inet6_dev *idev, + unsigned long when) { - AC_NONE, - AC_DAD, - AC_RS, -}; + if (!timer_pending(&idev->rs_timer)) + in6_dev_hold(idev); + mod_timer(&idev->rs_timer, jiffies + when); +} -static void addrconf_mod_timer(struct inet6_ifaddr *ifp, - enum addrconf_timer_t what, - unsigned long when) +static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp, + unsigned long delay) { - if (!del_timer(&ifp->timer)) + if (!delayed_work_pending(&ifp->dad_work)) in6_ifa_hold(ifp); - - switch (what) { - case AC_DAD: - ifp->timer.function = addrconf_dad_timer; - break; - case AC_RS: - ifp->timer.function = addrconf_rs_timer; - break; - default:; - } - ifp->timer.expires = jiffies + when; - add_timer(&ifp->timer); + mod_delayed_work(addrconf_wq, &ifp->dad_work, delay); } static int snmp6_alloc_dev(struct inet6_dev *idev) { - if (snmp_mib_init((void **)idev->stats.ipv6, - sizeof(struct ipstats_mib)) < 0) + int i; + + idev->stats.ipv6 = alloc_percpu(struct ipstats_mib); + if (!idev->stats.ipv6) goto err_ip; - if (snmp_mib_init((void **)idev->stats.icmpv6, - sizeof(struct icmpv6_mib)) < 0) + + for_each_possible_cpu(i) { + struct ipstats_mib *addrconf_stats; + addrconf_stats = per_cpu_ptr(idev->stats.ipv6, i); + u64_stats_init(&addrconf_stats->syncp); + } + + + idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6dev) goto err_icmp; - if (snmp_mib_init((void **)idev->stats.icmpv6msg, - sizeof(struct icmpv6msg_mib)) < 0) + idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6msgdev) goto err_icmpmsg; return 0; err_icmpmsg: - snmp_mib_free((void **)idev->stats.icmpv6); + kfree(idev->stats.icmpv6dev); err_icmp: - snmp_mib_free((void **)idev->stats.ipv6); + free_percpu(idev->stats.ipv6); err_ip: return -ENOMEM; } -static void snmp6_free_dev(struct inet6_dev *idev) -{ - snmp_mib_free((void **)idev->stats.icmpv6msg); - snmp_mib_free((void **)idev->stats.icmpv6); - snmp_mib_free((void **)idev->stats.ipv6); -} - -/* Nobody refers to this device, we may destroy it. */ - -static void in6_dev_finish_destroy_rcu(struct rcu_head *head) -{ - struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu); - kfree(idev); -} - -void in6_dev_finish_destroy(struct inet6_dev *idev) -{ - struct net_device *dev = idev->dev; - BUG_TRAP(idev->addr_list==NULL); - BUG_TRAP(idev->mc_list==NULL); -#ifdef NET_REFCNT_DEBUG - printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? dev->name : "NIL"); -#endif - dev_put(dev); - if (!idev->dead) { - printk("Freeing alive inet6 device %p\n", idev); - return; - } - snmp6_free_dev(idev); - call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); -} - -EXPORT_SYMBOL(in6_dev_finish_destroy); - -static struct inet6_dev * ipv6_add_dev(struct net_device *dev) +static struct inet6_dev *ipv6_add_dev(struct net_device *dev) { struct inet6_dev *ndev; - struct in6_addr maddr; ASSERT_RTNL(); @@ -335,7 +321,10 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) rwlock_init(&ndev->lock); ndev->dev = dev; - memcpy(&ndev->cnf, dev->nd_net->ipv6.devconf_dflt, sizeof(ndev->cnf)); + INIT_LIST_HEAD(&ndev->addr_list); + setup_timer(&ndev->rs_timer, addrconf_rs_timer, + (unsigned long)ndev); + memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); ndev->cnf.mtu6 = dev->mtu; ndev->cnf.sysctl = NULL; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -343,23 +332,25 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) kfree(ndev); return NULL; } + if (ndev->cnf.forwarding) + dev_disable_lro(dev); /* We refer to the device */ dev_hold(dev); if (snmp6_alloc_dev(ndev) < 0) { - ADBG((KERN_WARNING - "%s(): cannot allocate memory for statistics; dev=%s.\n", - __FUNCTION__, dev->name)); + ADBG(KERN_WARNING + "%s: cannot allocate memory for statistics; dev=%s.\n", + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); - ndev->dead = 1; - in6_dev_finish_destroy(ndev); + dev_put(dev); + kfree(ndev); return NULL; } if (snmp6_register_dev(ndev) < 0) { - ADBG((KERN_WARNING - "%s(): cannot create /proc/net/dev_snmp6/%s\n", - __FUNCTION__, dev->name)); + ADBG(KERN_WARNING + "%s: cannot create /proc/net/dev_snmp6/%s\n", + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); ndev->dead = 1; in6_dev_finish_destroy(ndev); @@ -371,30 +362,30 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) */ in6_dev_hold(ndev); -#ifdef CONFIG_IPV6_PRIVACY + if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) + ndev->cnf.accept_dad = -1; + +#if IS_ENABLED(CONFIG_IPV6_SIT) + if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) { + pr_info("%s: Disabled Multicast RS\n", dev->name); + ndev->cnf.rtr_solicits = 0; + } +#endif + + INIT_LIST_HEAD(&ndev->tempaddr_list); setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev); if ((dev->flags&IFF_LOOPBACK) || dev->type == ARPHRD_TUNNEL || -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + dev->type == ARPHRD_TUNNEL6 || dev->type == ARPHRD_SIT || -#endif dev->type == ARPHRD_NONE) { - printk(KERN_INFO - "%s: Disabled Privacy Extensions\n", - dev->name); ndev->cnf.use_tempaddr = -1; - - if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) { - printk(KERN_INFO - "%s: Disabled Multicast RS\n", - dev->name); - ndev->cnf.rtr_solicits = 0; - } } else { in6_dev_hold(ndev); ipv6_regen_rndid((unsigned long) ndev); } -#endif + + ndev->token = in6addr_any; if (netif_running(dev) && addrconf_qdisc_ok(dev)) ndev->if_flags |= IF_READY; @@ -405,21 +396,29 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) /* protected by rtnl_lock */ rcu_assign_pointer(dev->ip6_ptr, ndev); + /* Join interface-local all-node multicast group */ + ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes); + /* Join all-node multicast group */ - ipv6_addr_all_nodes(&maddr); - ipv6_dev_mc_inc(dev, &maddr); + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); + + /* Join all-router multicast group if forwarding is set */ + if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST)) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); return ndev; } -static struct inet6_dev * ipv6_find_idev(struct net_device *dev) +static struct inet6_dev *ipv6_find_idev(struct net_device *dev) { struct inet6_dev *idev; ASSERT_RTNL(); - if ((idev = __in6_dev_get(dev)) == NULL) { - if ((idev = ipv6_add_dev(dev)) == NULL) + idev = __in6_dev_get(dev); + if (!idev) { + idev = ipv6_add_dev(dev); + if (!idev) return NULL; } @@ -428,25 +427,250 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev) return idev; } +static int inet6_netconf_msgsize_devconf(int type) +{ + int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + + nla_total_size(4); /* NETCONFA_IFINDEX */ + + /* type -1 is used for ALL */ + if (type == -1 || type == NETCONFA_FORWARDING) + size += nla_total_size(4); +#ifdef CONFIG_IPV6_MROUTE + if (type == -1 || type == NETCONFA_MC_FORWARDING) + size += nla_total_size(4); +#endif + if (type == -1 || type == NETCONFA_PROXY_NEIGH) + size += nla_total_size(4); + + return size; +} + +static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, + struct ipv6_devconf *devconf, u32 portid, + u32 seq, int event, unsigned int flags, + int type) +{ + struct nlmsghdr *nlh; + struct netconfmsg *ncm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), + flags); + if (nlh == NULL) + return -EMSGSIZE; + + ncm = nlmsg_data(nlh); + ncm->ncm_family = AF_INET6; + + if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) + goto nla_put_failure; + + /* type -1 is used for ALL */ + if ((type == -1 || type == NETCONFA_FORWARDING) && + nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0) + goto nla_put_failure; +#ifdef CONFIG_IPV6_MROUTE + if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + nla_put_s32(skb, NETCONFA_MC_FORWARDING, + devconf->mc_forwarding) < 0) + goto nla_put_failure; +#endif + if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, + struct ipv6_devconf *devconf) +{ + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, + RTM_NEWNETCONF, 0, type); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC); + return; +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); +} + +static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { + [NETCONFA_IFINDEX] = { .len = sizeof(int) }, + [NETCONFA_FORWARDING] = { .len = sizeof(int) }, + [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, +}; + +static int inet6_netconf_get_devconf(struct sk_buff *in_skb, + struct nlmsghdr *nlh) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NETCONFA_MAX+1]; + struct netconfmsg *ncm; + struct sk_buff *skb; + struct ipv6_devconf *devconf; + struct inet6_dev *in6_dev; + struct net_device *dev; + int ifindex; + int err; + + err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, + devconf_ipv6_policy); + if (err < 0) + goto errout; + + err = EINVAL; + if (!tb[NETCONFA_IFINDEX]) + goto errout; + + ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); + switch (ifindex) { + case NETCONFA_IFINDEX_ALL: + devconf = net->ipv6.devconf_all; + break; + case NETCONFA_IFINDEX_DEFAULT: + devconf = net->ipv6.devconf_dflt; + break; + default: + dev = __dev_get_by_index(net, ifindex); + if (dev == NULL) + goto errout; + in6_dev = __in6_dev_get(dev); + if (in6_dev == NULL) + goto errout; + devconf = &in6_dev->cnf; + break; + } + + err = -ENOBUFS; + skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_netconf_fill_devconf(skb, ifindex, devconf, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWNETCONF, 0, + -1); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +errout: + return err; +} + +static int inet6_netconf_dump_devconf(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, s_idx; + struct net_device *dev; + struct inet6_dev *idev; + struct hlist_head *head; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ + net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + + if (inet6_netconf_fill_devconf(skb, dev->ifindex, + &idev->cnf, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + -1) <= 0) { + rcu_read_unlock(); + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + rcu_read_unlock(); + } + if (h == NETDEV_HASHENTRIES) { + if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } + if (h == NETDEV_HASHENTRIES + 1) { + if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } +done: + cb->args[0] = h; + cb->args[1] = idx; + + return skb->len; +} + #ifdef CONFIG_SYSCTL static void dev_forward_change(struct inet6_dev *idev) { struct net_device *dev; struct inet6_ifaddr *ifa; - struct in6_addr addr; if (!idev) return; dev = idev->dev; - if (dev && (dev->flags & IFF_MULTICAST)) { - ipv6_addr_all_routers(&addr); - - if (idev->cnf.forwarding) - ipv6_dev_mc_inc(dev, &addr); - else - ipv6_dev_mc_dec(dev, &addr); + if (idev->cnf.forwarding) + dev_disable_lro(dev); + if (dev->flags & IFF_MULTICAST) { + if (idev->cnf.forwarding) { + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters); + ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters); + } else { + ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters); + ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters); + ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters); + } } - for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) { + + list_for_each_entry(ifa, &idev->addr_list, if_list) { if (ifa->flags&IFA_F_TENTATIVE) continue; if (idev->cnf.forwarding) @@ -454,6 +678,8 @@ static void dev_forward_change(struct inet6_dev *idev) else addrconf_leave_anycast(ifa); } + inet6_netconf_notify_devconf(dev_net(dev), NETCONFA_FORWARDING, + dev->ifindex, &idev->cnf); } @@ -462,9 +688,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - read_lock(&dev_base_lock); for_each_netdev(net, dev) { - rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -472,85 +696,114 @@ static void addrconf_forward_change(struct net *net, __s32 newf) if (changed) dev_forward_change(idev); } - rcu_read_unlock(); } - read_unlock(&dev_base_lock); } -static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) { struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); net = (struct net *)table->extra2; - if (p == &net->ipv6.devconf_dflt->forwarding) - return; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->forwarding) { + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + rtnl_unlock(); + return 0; + } if (p == &net->ipv6.devconf_all->forwarding) { - __s32 newf = net->ipv6.devconf_all->forwarding; net->ipv6.devconf_dflt->forwarding = newf; addrconf_forward_change(net, newf); - } else if ((!*p) ^ (!old)) + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } else if ((!newf) ^ (!old)) dev_forward_change((struct inet6_dev *)table->extra1); + rtnl_unlock(); - if (*p) - rt6_purge_dflt_routers(); + if (newf) + rt6_purge_dflt_routers(net); + return 1; } #endif /* Nobody refers to this ifaddr, destroy it */ - void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) { - BUG_TRAP(ifp->if_next==NULL); - BUG_TRAP(ifp->lst_next==NULL); + WARN_ON(!hlist_unhashed(&ifp->addr_lst)); + #ifdef NET_REFCNT_DEBUG - printk(KERN_DEBUG "inet6_ifa_finish_destroy\n"); + pr_debug("%s\n", __func__); #endif in6_dev_put(ifp->idev); - if (del_timer(&ifp->timer)) - printk("Timer is still running, when freeing ifa=%p\n", ifp); + if (cancel_delayed_work(&ifp->dad_work)) + pr_notice("delayed DAD work was pending while freeing ifa=%p\n", + ifp); - if (!ifp->dead) { - printk("Freeing alive inet6 address %p\n", ifp); + if (ifp->state != INET6_IFADDR_STATE_DEAD) { + pr_warn("Freeing alive inet6 address %p\n", ifp); return; } - dst_release(&ifp->rt->u.dst); + ip6_rt_put(ifp->rt); - kfree(ifp); + kfree_rcu(ifp, rcu); } static void ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - struct inet6_ifaddr *ifa, **ifap; + struct list_head *p; int ifp_scope = ipv6_addr_src_scope(&ifp->addr); /* * Each device address list is sorted in order of scope - * global before linklocal. */ - for (ifap = &idev->addr_list; (ifa = *ifap) != NULL; - ifap = &ifa->if_next) { + list_for_each(p, &idev->addr_list) { + struct inet6_ifaddr *ifa + = list_entry(p, struct inet6_ifaddr, if_list); if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) break; } - ifp->if_next = *ifap; - *ifap = ifp; + list_add_tail(&ifp->if_list, p); +} + +static u32 inet6_addr_hash(const struct in6_addr *addr) +{ + return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } /* On success it returns ifp with increased reference count */ static struct inet6_ifaddr * -ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, - int scope, u32 flags) +ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, + const struct in6_addr *peer_addr, int pfxlen, + int scope, u32 flags, u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; - int hash; + unsigned int hash; int err = 0; + int addr_type = ipv6_addr_type(addr); + + if (addr_type == IPV6_ADDR_ANY || + addr_type & IPV6_ADDR_MULTICAST || + (!(idev->dev->flags & IFF_LOOPBACK) && + addr_type & IPV6_ADDR_LOOPBACK)) + return ERR_PTR(-EADDRNOTAVAIL); rcu_read_lock_bh(); if (idev->dead) { @@ -558,11 +811,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, goto out2; } - write_lock(&addrconf_hash_lock); + if (idev->cnf.disable_ipv6) { + err = -EACCES; + goto out2; + } + + spin_lock(&addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(&init_net, addr, idev->dev)) { - ADBG(("ipv6_add_addr: already assigned\n")); + if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { + ADBG("ipv6_add_addr: already assigned\n"); err = -EEXIST; goto out; } @@ -570,63 +828,56 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); if (ifa == NULL) { - ADBG(("ipv6_add_addr: malloc failed\n")); + ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; goto out; } - rt = addrconf_dst_alloc(idev, addr, 0); + rt = addrconf_dst_alloc(idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; } - ipv6_addr_copy(&ifa->addr, addr); + neigh_parms_data_state_setall(idev->nd_parms); + + ifa->addr = *addr; + if (peer_addr) + ifa->peer_addr = *peer_addr; spin_lock_init(&ifa->lock); - init_timer(&ifa->timer); - ifa->timer.data = (unsigned long) ifa; + spin_lock_init(&ifa->state_lock); + INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work); + INIT_HLIST_NODE(&ifa->addr_lst); ifa->scope = scope; ifa->prefix_len = pfxlen; ifa->flags = flags | IFA_F_TENTATIVE; + ifa->valid_lft = valid_lft; + ifa->prefered_lft = prefered_lft; ifa->cstamp = ifa->tstamp = jiffies; + ifa->tokenized = false; ifa->rt = rt; - /* - * part one of RFC 4429, section 3.3 - * We should not configure an address as - * optimistic if we do not yet know the link - * layer address of our nexhop router - */ - - if (rt->rt6i_nexthop == NULL) - ifa->flags &= ~IFA_F_OPTIMISTIC; - ifa->idev = idev; in6_dev_hold(idev); /* For caller */ in6_ifa_hold(ifa); /* Add to big hash table */ - hash = ipv6_addr_hash(addr); + hash = inet6_addr_hash(addr); - ifa->lst_next = inet6_addr_lst[hash]; - inet6_addr_lst[hash] = ifa; - in6_ifa_hold(ifa); - write_unlock(&addrconf_hash_lock); + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + spin_unlock(&addrconf_hash_lock); write_lock(&idev->lock); /* Add to inet6_dev unicast addr list. */ ipv6_link_dev_addr(idev, ifa); -#ifdef CONFIG_IPV6_PRIVACY if (ifa->flags&IFA_F_TEMPORARY) { - ifa->tmp_next = idev->tempaddr_list; - idev->tempaddr_list = ifa; + list_add(&ifa->tmp_list, &idev->tempaddr_list); in6_ifa_hold(ifa); } -#endif in6_ifa_hold(ifa); write_unlock(&idev->lock); @@ -634,7 +885,7 @@ out2: rcu_read_unlock_bh(); if (likely(err == 0)) - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + inet6addr_notifier_call_chain(NETDEV_UP, ifa); else { kfree(ifa); ifa = ERR_PTR(err); @@ -642,147 +893,162 @@ out2: return ifa; out: - write_unlock(&addrconf_hash_lock); + spin_unlock(&addrconf_hash_lock); goto out2; } -/* This function wants to get referenced ifp and releases it before return */ +enum cleanup_prefix_rt_t { + CLEANUP_PREFIX_RT_NOP, /* no cleanup action for prefix route */ + CLEANUP_PREFIX_RT_DEL, /* delete the prefix route */ + CLEANUP_PREFIX_RT_EXPIRE, /* update the lifetime of the prefix route */ +}; -static void ipv6_del_addr(struct inet6_ifaddr *ifp) +/* + * Check, whether the prefix for ifp would still need a prefix route + * after deleting ifp. The function returns one of the CLEANUP_PREFIX_RT_* + * constants. + * + * 1) we don't purge prefix if address was not permanent. + * prefix is managed by its own lifetime. + * 2) we also don't purge, if the address was IFA_F_NOPREFIXROUTE. + * 3) if there are no addresses, delete prefix. + * 4) if there are still other permanent address(es), + * corresponding prefix is still permanent. + * 5) if there are still other addresses with IFA_F_NOPREFIXROUTE, + * don't purge the prefix, assume user space is managing it. + * 6) otherwise, update prefix lifetime to the + * longest valid lifetime among the corresponding + * addresses on the device. + * Note: subsequent RA will update lifetime. + **/ +static enum cleanup_prefix_rt_t +check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) { - struct inet6_ifaddr *ifa, **ifap; + struct inet6_ifaddr *ifa; struct inet6_dev *idev = ifp->idev; - int hash; - int deleted = 0, onlink = 0; - unsigned long expires = jiffies; + unsigned long lifetime; + enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_DEL; + + *expires = jiffies; + + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (ifa == ifp) + continue; + if (!ipv6_prefix_equal(&ifa->addr, &ifp->addr, + ifp->prefix_len)) + continue; + if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE)) + return CLEANUP_PREFIX_RT_NOP; - hash = ipv6_addr_hash(&ifp->addr); + action = CLEANUP_PREFIX_RT_EXPIRE; - ifp->dead = 1; + spin_lock(&ifa->lock); - write_lock_bh(&addrconf_hash_lock); - for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL; - ifap = &ifa->lst_next) { - if (ifa == ifp) { - *ifap = ifa->lst_next; - __in6_ifa_put(ifp); - ifa->lst_next = NULL; - break; - } + lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); + /* + * Note: Because this address is + * not permanent, lifetime < + * LONG_MAX / HZ here. + */ + if (time_before(*expires, ifa->tstamp + lifetime * HZ)) + *expires = ifa->tstamp + lifetime * HZ; + spin_unlock(&ifa->lock); } - write_unlock_bh(&addrconf_hash_lock); - write_lock_bh(&idev->lock); -#ifdef CONFIG_IPV6_PRIVACY - if (ifp->flags&IFA_F_TEMPORARY) { - for (ifap = &idev->tempaddr_list; (ifa=*ifap) != NULL; - ifap = &ifa->tmp_next) { - if (ifa == ifp) { - *ifap = ifa->tmp_next; - if (ifp->ifpub) { - in6_ifa_put(ifp->ifpub); - ifp->ifpub = NULL; - } - __in6_ifa_put(ifp); - ifa->tmp_next = NULL; - break; - } + return action; +} + +static void +cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) +{ + struct rt6_info *rt; + + rt = addrconf_get_prefix_route(&ifp->addr, + ifp->prefix_len, + ifp->idev->dev, + 0, RTF_GATEWAY | RTF_DEFAULT); + if (rt) { + if (del_rt) + ip6_del_rt(rt); + else { + if (!(rt->rt6i_flags & RTF_EXPIRES)) + rt6_set_expires(rt, expires); + ip6_rt_put(rt); } } -#endif +} - for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;) { - if (ifa == ifp) { - *ifap = ifa->if_next; - __in6_ifa_put(ifp); - ifa->if_next = NULL; - if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) - break; - deleted = 1; - continue; - } else if (ifp->flags & IFA_F_PERMANENT) { - if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, - ifp->prefix_len)) { - if (ifa->flags & IFA_F_PERMANENT) { - onlink = 1; - if (deleted) - break; - } else { - unsigned long lifetime; - - if (!onlink) - onlink = -1; - - spin_lock(&ifa->lock); - lifetime = min_t(unsigned long, - ifa->valid_lft, 0x7fffffffUL/HZ); - if (time_before(expires, - ifa->tstamp + lifetime * HZ)) - expires = ifa->tstamp + lifetime * HZ; - spin_unlock(&ifa->lock); - } - } + +/* This function wants to get referenced ifp and releases it before return */ + +static void ipv6_del_addr(struct inet6_ifaddr *ifp) +{ + int state; + enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP; + unsigned long expires; + + ASSERT_RTNL(); + + spin_lock_bh(&ifp->state_lock); + state = ifp->state; + ifp->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifp->state_lock); + + if (state == INET6_IFADDR_STATE_DEAD) + goto out; + + spin_lock_bh(&addrconf_hash_lock); + hlist_del_init_rcu(&ifp->addr_lst); + spin_unlock_bh(&addrconf_hash_lock); + + write_lock_bh(&ifp->idev->lock); + + if (ifp->flags&IFA_F_TEMPORARY) { + list_del(&ifp->tmp_list); + if (ifp->ifpub) { + in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; } - ifap = &ifa->if_next; + __in6_ifa_put(ifp); } - write_unlock_bh(&idev->lock); - ipv6_ifa_notify(RTM_DELADDR, ifp); + if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) + action = check_cleanup_prefix_route(ifp, &expires); - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp); + list_del_init(&ifp->if_list); + __in6_ifa_put(ifp); - addrconf_del_timer(ifp); + write_unlock_bh(&ifp->idev->lock); - /* - * Purge or update corresponding prefix - * - * 1) we don't purge prefix here if address was not permanent. - * prefix is managed by its own lifetime. - * 2) if there're no addresses, delete prefix. - * 3) if there're still other permanent address(es), - * corresponding prefix is still permanent. - * 4) otherwise, update prefix lifetime to the - * longest valid lifetime among the corresponding - * addresses on the device. - * Note: subsequent RA will update lifetime. - * - * --yoshfuji - */ - if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { - struct in6_addr prefix; - struct rt6_info *rt; + addrconf_del_dad_work(ifp); - ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); - rt = rt6_lookup(&prefix, NULL, ifp->idev->dev->ifindex, 1); + ipv6_ifa_notify(RTM_DELADDR, ifp); - if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { - if (onlink == 0) { - ip6_del_rt(rt); - rt = NULL; - } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { - rt->rt6i_expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; - } - } - dst_release(&rt->u.dst); + inet6addr_notifier_call_chain(NETDEV_DOWN, ifp); + + if (action != CLEANUP_PREFIX_RT_NOP) { + cleanup_prefix_route(ifp, expires, + action == CLEANUP_PREFIX_RT_DEL); } + /* clean up prefsrc entries */ + rt6_remove_prefsrc(ifp); +out: in6_ifa_put(ifp); } -#ifdef CONFIG_IPV6_PRIVACY static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; - unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age; unsigned long regen_advance; int tmp_plen; int ret = 0; - int max_addresses; u32 addr_flags; + unsigned long now = jiffies; - write_lock(&idev->lock); + write_lock_bh(&idev->lock); if (ift) { spin_lock_bh(&ift->lock); memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); @@ -794,9 +1060,8 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i retry: in6_dev_hold(idev); if (idev->cnf.use_tempaddr <= 0) { - write_unlock(&idev->lock); - printk(KERN_INFO - "ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); + write_unlock_bh(&idev->lock); + pr_info("%s: use_tempaddr is disabled\n", __func__); in6_dev_put(idev); ret = -1; goto out; @@ -805,49 +1070,43 @@ retry: if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { idev->cnf.use_tempaddr = -1; /*XXX*/ spin_unlock_bh(&ifp->lock); - write_unlock(&idev->lock); - printk(KERN_WARNING - "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n"); + write_unlock_bh(&idev->lock); + pr_warn("%s: regeneration time exceeded - disabled temporary address support\n", + __func__); in6_dev_put(idev); ret = -1; goto out; } in6_ifa_hold(ifp); memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); - if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { - spin_unlock_bh(&ifp->lock); - write_unlock(&idev->lock); - printk(KERN_WARNING - "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); - in6_ifa_put(ifp); - in6_dev_put(idev); - ret = -1; - goto out; - } + __ipv6_try_regen_rndid(idev, tmpaddr); memcpy(&addr.s6_addr[8], idev->rndid, 8); + age = (now - ifp->tstamp) / HZ; tmp_valid_lft = min_t(__u32, ifp->valid_lft, - idev->cnf.temp_valid_lft); + idev->cnf.temp_valid_lft + age); tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, - idev->cnf.temp_prefered_lft - desync_factor / HZ); + idev->cnf.temp_prefered_lft + age - + idev->cnf.max_desync_factor); tmp_plen = ifp->prefix_len; - max_addresses = idev->cnf.max_addresses; - tmp_cstamp = ifp->cstamp; tmp_tstamp = ifp->tstamp; spin_unlock_bh(&ifp->lock); regen_advance = idev->cnf.regen_max_retry * idev->cnf.dad_transmits * - idev->nd_parms->retrans_time / HZ; - write_unlock(&idev->lock); + NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + write_unlock_bh(&idev->lock); /* A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. In particular, * an implementation must not create a temporary address with a zero * Preferred Lifetime. + * Use age calculation as in addrconf_verify to avoid unnecessary + * temporary addresses being generated. */ - if (tmp_prefered_lft <= regen_advance) { + age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + if (tmp_prefered_lft <= regen_advance + age) { in6_ifa_put(ifp); in6_dev_put(idev); ret = -1; @@ -859,84 +1118,220 @@ retry: if (ifp->flags & IFA_F_OPTIMISTIC) addr_flags |= IFA_F_OPTIMISTIC; - ift = !max_addresses || - ipv6_count_addresses(idev) < max_addresses ? - ipv6_add_addr(idev, &addr, tmp_plen, - ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, - addr_flags) : NULL; - if (!ift || IS_ERR(ift)) { + ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, + ipv6_addr_scope(&addr), addr_flags, + tmp_valid_lft, tmp_prefered_lft); + if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); - printk(KERN_INFO - "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); + pr_info("%s: retry temporary address regeneration\n", __func__); tmpaddr = &addr; - write_lock(&idev->lock); + write_lock_bh(&idev->lock); goto retry; } spin_lock_bh(&ift->lock); ift->ifpub = ifp; - ift->valid_lft = tmp_valid_lft; - ift->prefered_lft = tmp_prefered_lft; - ift->cstamp = tmp_cstamp; + ift->cstamp = now; ift->tstamp = tmp_tstamp; spin_unlock_bh(&ift->lock); - addrconf_dad_start(ift, 0); + addrconf_dad_start(ift); in6_ifa_put(ift); in6_dev_put(idev); out: return ret; } -#endif /* * Choose an appropriate source address (RFC3484) */ +enum { + IPV6_SADDR_RULE_INIT = 0, + IPV6_SADDR_RULE_LOCAL, + IPV6_SADDR_RULE_SCOPE, + IPV6_SADDR_RULE_PREFERRED, +#ifdef CONFIG_IPV6_MIP6 + IPV6_SADDR_RULE_HOA, +#endif + IPV6_SADDR_RULE_OIF, + IPV6_SADDR_RULE_LABEL, + IPV6_SADDR_RULE_PRIVACY, + IPV6_SADDR_RULE_ORCHID, + IPV6_SADDR_RULE_PREFIX, + IPV6_SADDR_RULE_MAX +}; + struct ipv6_saddr_score { - int addr_type; - unsigned int attrs; - int matchlen; - int scope; - unsigned int rule; + int rule; + int addr_type; + struct inet6_ifaddr *ifa; + DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX); + int scopedist; + int matchlen; }; -#define IPV6_SADDR_SCORE_LOCAL 0x0001 -#define IPV6_SADDR_SCORE_PREFERRED 0x0004 -#define IPV6_SADDR_SCORE_HOA 0x0008 -#define IPV6_SADDR_SCORE_OIF 0x0010 -#define IPV6_SADDR_SCORE_LABEL 0x0020 -#define IPV6_SADDR_SCORE_PRIVACY 0x0040 +struct ipv6_saddr_dst { + const struct in6_addr *addr; + int ifindex; + int scope; + int label; + unsigned int prefs; +}; static inline int ipv6_saddr_preferred(int type) { - if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| - IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK)) return 1; return 0; } -int ipv6_dev_get_saddr(struct net_device *daddr_dev, - struct in6_addr *daddr, struct in6_addr *saddr) +static int ipv6_get_saddr_eval(struct net *net, + struct ipv6_saddr_score *score, + struct ipv6_saddr_dst *dst, + int i) +{ + int ret; + + if (i <= score->rule) { + switch (i) { + case IPV6_SADDR_RULE_SCOPE: + ret = score->scopedist; + break; + case IPV6_SADDR_RULE_PREFIX: + ret = score->matchlen; + break; + default: + ret = !!test_bit(i, score->scorebits); + } + goto out; + } + + switch (i) { + case IPV6_SADDR_RULE_INIT: + /* Rule 0: remember if hiscore is not ready yet */ + ret = !!score->ifa; + break; + case IPV6_SADDR_RULE_LOCAL: + /* Rule 1: Prefer same address */ + ret = ipv6_addr_equal(&score->ifa->addr, dst->addr); + break; + case IPV6_SADDR_RULE_SCOPE: + /* Rule 2: Prefer appropriate scope + * + * ret + * ^ + * -1 | d 15 + * ---+--+-+---> scope + * | + * | d is scope of the destination. + * B-d | \ + * | \ <- smaller scope is better if + * B-15 | \ if scope is enough for destination. + * | ret = B - scope (-1 <= scope >= d <= 15). + * d-C-1 | / + * |/ <- greater is better + * -C / if scope is not enough for destination. + * /| ret = scope - C (-1 <= d < scope <= 15). + * + * d - C - 1 < B -15 (for all -1 <= d <= 15). + * C > d + 14 - B >= 15 + 14 - B = 29 - B. + * Assume B = 0 and we get C > 29. + */ + ret = __ipv6_addr_src_scope(score->addr_type); + if (ret >= dst->scope) + ret = -ret; + else + ret -= 128; /* 30 is enough */ + score->scopedist = ret; + break; + case IPV6_SADDR_RULE_PREFERRED: + /* Rule 3: Avoid deprecated and optimistic addresses */ + ret = ipv6_saddr_preferred(score->addr_type) || + !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)); + break; +#ifdef CONFIG_IPV6_MIP6 + case IPV6_SADDR_RULE_HOA: + { + /* Rule 4: Prefer home address */ + int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA); + ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome; + break; + } +#endif + case IPV6_SADDR_RULE_OIF: + /* Rule 5: Prefer outgoing interface */ + ret = (!dst->ifindex || + dst->ifindex == score->ifa->idev->dev->ifindex); + break; + case IPV6_SADDR_RULE_LABEL: + /* Rule 6: Prefer matching label */ + ret = ipv6_addr_label(net, + &score->ifa->addr, score->addr_type, + score->ifa->idev->dev->ifindex) == dst->label; + break; + case IPV6_SADDR_RULE_PRIVACY: + { + /* Rule 7: Prefer public address + * Note: prefer temporary address if use_tempaddr >= 2 + */ + int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ? + !!(dst->prefs & IPV6_PREFER_SRC_TMP) : + score->ifa->idev->cnf.use_tempaddr >= 2; + ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp; + break; + } + case IPV6_SADDR_RULE_ORCHID: + /* Rule 8-: Prefer ORCHID vs ORCHID or + * non-ORCHID vs non-ORCHID + */ + ret = !(ipv6_addr_orchid(&score->ifa->addr) ^ + ipv6_addr_orchid(dst->addr)); + break; + case IPV6_SADDR_RULE_PREFIX: + /* Rule 8: Use longest matching prefix */ + ret = ipv6_addr_diff(&score->ifa->addr, dst->addr); + if (ret > score->ifa->prefix_len) + ret = score->ifa->prefix_len; + score->matchlen = ret; + break; + default: + ret = 0; + } + + if (ret) + __set_bit(i, score->scorebits); + score->rule = i; +out: + return ret; +} + +int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, + const struct in6_addr *daddr, unsigned int prefs, + struct in6_addr *saddr) { - struct ipv6_saddr_score hiscore; - struct inet6_ifaddr *ifa_result = NULL; - int daddr_type = __ipv6_addr_type(daddr); - int daddr_scope = __ipv6_addr_src_scope(daddr_type); - int daddr_ifindex = daddr_dev ? daddr_dev->ifindex : 0; - u32 daddr_label = ipv6_addr_label(daddr, daddr_type, daddr_ifindex); + struct ipv6_saddr_score scores[2], + *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_dst dst; struct net_device *dev; + int dst_type; - memset(&hiscore, 0, sizeof(hiscore)); + dst_type = __ipv6_addr_type(daddr); + dst.addr = daddr; + dst.ifindex = dst_dev ? dst_dev->ifindex : 0; + dst.scope = __ipv6_addr_src_scope(dst_type); + dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex); + dst.prefs = prefs; + + hiscore->rule = -1; + hiscore->ifa = NULL; - read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev_rcu(net, dev) { struct inet6_dev *idev; - struct inet6_ifaddr *ifa; - /* Rule 0: Candidate Source Address (section 4) + /* Candidate Source Address (section 4) * - multicast and link-local destination address, * the set of candidate source address MUST only * include addresses assigned to interfaces @@ -948,9 +1343,9 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, * belonging to the same site as the outgoing * interface.) */ - if ((daddr_type & IPV6_ADDR_MULTICAST || - daddr_scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && - daddr_dev && dev != daddr_dev) + if (((dst_type & IPV6_ADDR_MULTICAST) || + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && + dst.ifindex && dev->ifindex != dst.ifindex) continue; idev = __in6_dev_get(dev); @@ -958,12 +1353,10 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, continue; read_lock_bh(&idev->lock); - for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { - struct ipv6_saddr_score score; + list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + int i; - score.addr_type = __ipv6_addr_type(&ifa->addr); - - /* Rule 0: + /* * - Tentative Address (RFC2462 section 5.4) * - A tentative address is not considered * "assigned to an interface" in the traditional @@ -973,11 +1366,14 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, * addresses, and the unspecified address MUST * NOT be included in a candidate set. */ - if ((ifa->flags & IFA_F_TENTATIVE) && - (!(ifa->flags & IFA_F_OPTIMISTIC))) + if ((score->ifa->flags & IFA_F_TENTATIVE) && + (!(score->ifa->flags & IFA_F_OPTIMISTIC))) continue; - if (unlikely(score.addr_type == IPV6_ADDR_ANY || - score.addr_type & IPV6_ADDR_MULTICAST)) { + + score->addr_type = __ipv6_addr_type(&score->ifa->addr); + + if (unlikely(score->addr_type == IPV6_ADDR_ANY || + score->addr_type & IPV6_ADDR_MULTICAST)) { LIMIT_NETDEBUG(KERN_DEBUG "ADDRCONF: unspecified / multicast address " "assigned as unicast address on %s", @@ -985,226 +1381,88 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, continue; } - score.attrs = 0; - score.matchlen = 0; - score.scope = 0; - score.rule = 0; - - if (ifa_result == NULL) { - /* record it if the first available entry */ - goto record_it; - } - - /* Rule 1: Prefer same address */ - if (hiscore.rule < 1) { - if (ipv6_addr_equal(&ifa_result->addr, daddr)) - hiscore.attrs |= IPV6_SADDR_SCORE_LOCAL; - hiscore.rule++; - } - if (ipv6_addr_equal(&ifa->addr, daddr)) { - score.attrs |= IPV6_SADDR_SCORE_LOCAL; - if (!(hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)) { - score.rule = 1; - goto record_it; - } - } else { - if (hiscore.attrs & IPV6_SADDR_SCORE_LOCAL) - continue; - } + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); + + for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { + int minihiscore, miniscore; + + minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i); + miniscore = ipv6_get_saddr_eval(net, score, &dst, i); + + if (minihiscore > miniscore) { + if (i == IPV6_SADDR_RULE_SCOPE && + score->scopedist > 0) { + /* + * special case: + * each remaining entry + * has too small (not enough) + * scope, because ifa entries + * are sorted by their scope + * values. + */ + goto try_nextdev; + } + break; + } else if (minihiscore < miniscore) { + if (hiscore->ifa) + in6_ifa_put(hiscore->ifa); - /* Rule 2: Prefer appropriate scope */ - if (hiscore.rule < 2) { - hiscore.scope = __ipv6_addr_src_scope(hiscore.addr_type); - hiscore.rule++; - } - score.scope = __ipv6_addr_src_scope(score.addr_type); - if (hiscore.scope < score.scope) { - if (hiscore.scope < daddr_scope) { - score.rule = 2; - goto record_it; - } else - continue; - } else if (score.scope < hiscore.scope) { - if (score.scope < daddr_scope) - break; /* addresses sorted by scope */ - else { - score.rule = 2; - goto record_it; - } - } + in6_ifa_hold(score->ifa); - /* Rule 3: Avoid deprecated and optimistic addresses */ - if (hiscore.rule < 3) { - if (ipv6_saddr_preferred(hiscore.addr_type) || - (((ifa_result->flags & - (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)) == 0))) - hiscore.attrs |= IPV6_SADDR_SCORE_PREFERRED; - hiscore.rule++; - } - if (ipv6_saddr_preferred(score.addr_type) || - (((ifa->flags & - (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)) == 0))) { - score.attrs |= IPV6_SADDR_SCORE_PREFERRED; - if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) { - score.rule = 3; - goto record_it; - } - } else { - if (hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED) - continue; - } + swap(hiscore, score); - /* Rule 4: Prefer home address */ -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) - if (hiscore.rule < 4) { - if (ifa_result->flags & IFA_F_HOMEADDRESS) - hiscore.attrs |= IPV6_SADDR_SCORE_HOA; - hiscore.rule++; - } - if (ifa->flags & IFA_F_HOMEADDRESS) { - score.attrs |= IPV6_SADDR_SCORE_HOA; - if (!(ifa_result->flags & IFA_F_HOMEADDRESS)) { - score.rule = 4; - goto record_it; - } - } else { - if (hiscore.attrs & IPV6_SADDR_SCORE_HOA) - continue; - } -#else - if (hiscore.rule < 4) - hiscore.rule++; -#endif - - /* Rule 5: Prefer outgoing interface */ - if (hiscore.rule < 5) { - if (daddr_dev == NULL || - daddr_dev == ifa_result->idev->dev) - hiscore.attrs |= IPV6_SADDR_SCORE_OIF; - hiscore.rule++; - } - if (daddr_dev == NULL || - daddr_dev == ifa->idev->dev) { - score.attrs |= IPV6_SADDR_SCORE_OIF; - if (!(hiscore.attrs & IPV6_SADDR_SCORE_OIF)) { - score.rule = 5; - goto record_it; - } - } else { - if (hiscore.attrs & IPV6_SADDR_SCORE_OIF) - continue; - } + /* restore our iterator */ + score->ifa = hiscore->ifa; - /* Rule 6: Prefer matching label */ - if (hiscore.rule < 6) { - if (ipv6_addr_label(&ifa_result->addr, - hiscore.addr_type, - ifa_result->idev->dev->ifindex) == daddr_label) - hiscore.attrs |= IPV6_SADDR_SCORE_LABEL; - hiscore.rule++; - } - if (ipv6_addr_label(&ifa->addr, - score.addr_type, - ifa->idev->dev->ifindex) == daddr_label) { - score.attrs |= IPV6_SADDR_SCORE_LABEL; - if (!(hiscore.attrs & IPV6_SADDR_SCORE_LABEL)) { - score.rule = 6; - goto record_it; - } - } else { - if (hiscore.attrs & IPV6_SADDR_SCORE_LABEL) - continue; - } - -#ifdef CONFIG_IPV6_PRIVACY - /* Rule 7: Prefer public address - * Note: prefer temprary address if use_tempaddr >= 2 - */ - if (hiscore.rule < 7) { - if ((!(ifa_result->flags & IFA_F_TEMPORARY)) ^ - (ifa_result->idev->cnf.use_tempaddr >= 2)) - hiscore.attrs |= IPV6_SADDR_SCORE_PRIVACY; - hiscore.rule++; - } - if ((!(ifa->flags & IFA_F_TEMPORARY)) ^ - (ifa->idev->cnf.use_tempaddr >= 2)) { - score.attrs |= IPV6_SADDR_SCORE_PRIVACY; - if (!(hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)) { - score.rule = 7; - goto record_it; + break; } - } else { - if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) - continue; - } -#else - if (hiscore.rule < 7) - hiscore.rule++; -#endif - /* Rule 8: Use longest matching prefix */ - if (hiscore.rule < 8) { - hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); - hiscore.rule++; - } - score.matchlen = ipv6_addr_diff(&ifa->addr, daddr); - if (score.matchlen > hiscore.matchlen) { - score.rule = 8; - goto record_it; } -#if 0 - else if (score.matchlen < hiscore.matchlen) - continue; -#endif - - /* Final Rule: choose first available one */ - continue; -record_it: - if (ifa_result) - in6_ifa_put(ifa_result); - in6_ifa_hold(ifa); - ifa_result = ifa; - hiscore = score; } +try_nextdev: read_unlock_bh(&idev->lock); } rcu_read_unlock(); - read_unlock(&dev_base_lock); - if (!ifa_result) + if (!hiscore->ifa) return -EADDRNOTAVAIL; - ipv6_addr_copy(saddr, &ifa_result->addr); - in6_ifa_put(ifa_result); + *saddr = hiscore->ifa->addr; + in6_ifa_put(hiscore->ifa); return 0; } +EXPORT_SYMBOL(ipv6_dev_get_saddr); - -int ipv6_get_saddr(struct dst_entry *dst, - struct in6_addr *daddr, struct in6_addr *saddr) +int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) { - return ipv6_dev_get_saddr(dst ? ip6_dst_idev(dst)->dev : NULL, daddr, saddr); -} + struct inet6_ifaddr *ifp; + int err = -EADDRNOTAVAIL; -EXPORT_SYMBOL(ipv6_get_saddr); + list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) { + if (ifp->scope > IFA_LINK) + break; + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + return err; +} int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, - unsigned char banned_flags) + u32 banned_flags) { struct inet6_dev *idev; int err = -EADDRNOTAVAIL; rcu_read_lock(); - if ((idev = __in6_dev_get(dev)) != NULL) { - struct inet6_ifaddr *ifp; - + idev = __in6_dev_get(dev); + if (idev) { read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == IFA_LINK && !(ifp->flags & banned_flags)) { - ipv6_addr_copy(addr, &ifp->addr); - err = 0; - break; - } - } + err = __ipv6_get_lladdr(idev, addr, banned_flags); read_unlock_bh(&idev->lock); } rcu_read_unlock(); @@ -1217,120 +1475,142 @@ static int ipv6_count_addresses(struct inet6_dev *idev) struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) + list_for_each_entry(ifp, &idev->addr_list, if_list) cnt++; read_unlock_bh(&idev->lock); return cnt; } -int ipv6_chk_addr(struct net *net, struct in6_addr *addr, - struct net_device *dev, int strict) +int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict) { - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); + struct inet6_ifaddr *ifp; + unsigned int hash = inet6_addr_hash(addr); - read_lock_bh(&addrconf_hash_lock); - for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { - if (ifp->idev->dev->nd_net != net) + rcu_read_lock_bh(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr) && - !(ifp->flags&IFA_F_TENTATIVE)) { - if (dev == NULL || ifp->idev->dev == dev || - !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) - break; + !(ifp->flags&IFA_F_TENTATIVE) && + (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { + rcu_read_unlock_bh(); + return 1; } } - read_unlock_bh(&addrconf_hash_lock); - return ifp != NULL; + + rcu_read_unlock_bh(); + return 0; } EXPORT_SYMBOL(ipv6_chk_addr); -static -int ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev) +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev) { - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); + unsigned int hash = inet6_addr_hash(addr); + struct inet6_ifaddr *ifp; - for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { - if (ifp->idev->dev->nd_net != net) + hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { if (dev == NULL || ifp->idev->dev == dev) + return true; + } + } + return false; +} + +/* Compares an address/prefix_len with addresses on device @dev. + * If one is found it returns true. + */ +bool ipv6_chk_custom_prefix(const struct in6_addr *addr, + const unsigned int prefix_len, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + bool ret = false; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len); + if (ret) break; } + read_unlock_bh(&idev->lock); } - return ifp != NULL; + rcu_read_unlock(); + + return ret; } +EXPORT_SYMBOL(ipv6_chk_custom_prefix); + +int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int onlink; -struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, struct in6_addr *addr, + onlink = 0; + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + onlink = ipv6_prefix_equal(addr, &ifa->addr, + ifa->prefix_len); + if (onlink) + break; + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return onlink; +} +EXPORT_SYMBOL(ipv6_chk_prefix); + +struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); + struct inet6_ifaddr *ifp, *result = NULL; + unsigned int hash = inet6_addr_hash(addr); - read_lock_bh(&addrconf_hash_lock); - for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { - if (ifp->idev->dev->nd_net != net) + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { if (dev == NULL || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + result = ifp; in6_ifa_hold(ifp); break; } } } - read_unlock_bh(&addrconf_hash_lock); - - return ifp; -} - -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) -{ - const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - __be32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; - __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); - int sk_ipv6only = ipv6_only_sock(sk); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(sk_rcv_saddr6); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - if (!sk2_rcv_saddr && !sk_ipv6only) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && - !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) - return 1; - - if (addr_type == IPV6_ADDR_MAPPED && - !sk2_ipv6only && - (!sk2_rcv_saddr || !sk_rcv_saddr || sk_rcv_saddr == sk2_rcv_saddr)) - return 1; + rcu_read_unlock_bh(); - return 0; + return result; } /* Gets referenced address, destroys ifaddr */ -static void addrconf_dad_stop(struct inet6_ifaddr *ifp) +static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) { if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); - addrconf_del_timer(ifp); + addrconf_del_dad_work(ifp); ifp->flags |= IFA_F_TENTATIVE; + if (dad_failed) + ifp->flags |= IFA_F_DADFAILED; spin_unlock_bh(&ifp->lock); + if (dad_failed) + ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); -#ifdef CONFIG_IPV6_PRIVACY } else if (ifp->flags&IFA_F_TEMPORARY) { struct inet6_ifaddr *ifpub; spin_lock_bh(&ifp->lock); @@ -1344,24 +1624,69 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp) spin_unlock_bh(&ifp->lock); } ipv6_del_addr(ifp); -#endif - } else + } else { ipv6_del_addr(ifp); + } +} + +static int addrconf_dad_end(struct inet6_ifaddr *ifp) +{ + int err = -ENOENT; + + spin_lock_bh(&ifp->state_lock); + if (ifp->state == INET6_IFADDR_STATE_DAD) { + ifp->state = INET6_IFADDR_STATE_POSTDAD; + err = 0; + } + spin_unlock_bh(&ifp->state_lock); + + return err; } void addrconf_dad_failure(struct inet6_ifaddr *ifp) { - if (net_ratelimit()) - printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); - addrconf_dad_stop(ifp); + struct inet6_dev *idev = ifp->idev; + + if (addrconf_dad_end(ifp)) { + in6_ifa_put(ifp); + return; + } + + net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n", + ifp->idev->dev->name, &ifp->addr); + + if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { + struct in6_addr addr; + + addr.s6_addr32[0] = htonl(0xfe800000); + addr.s6_addr32[1] = 0; + + if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && + ipv6_addr_equal(&ifp->addr, &addr)) { + /* DAD failed for link-local based on MAC address */ + idev->cnf.disable_ipv6 = 1; + + pr_info("%s: IPv6 being disabled!\n", + ifp->idev->dev->name); + } + } + + spin_lock_bh(&ifp->state_lock); + /* transition from _POSTDAD to _ERRDAD */ + ifp->state = INET6_IFADDR_STATE_ERRDAD; + spin_unlock_bh(&ifp->state_lock); + + addrconf_mod_dad_work(ifp, 0); } /* Join to solicited addr multicast group. */ -void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr) +void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; + ASSERT_RTNL(); + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1369,10 +1694,12 @@ void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr) ipv6_dev_mc_inc(dev, &maddr); } -void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) +void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; + ASSERT_RTNL(); + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1383,6 +1710,11 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; + + ASSERT_RTNL(); + + if (ifp->prefix_len >= 127) /* RFC 6164 */ + return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) return; @@ -1392,6 +1724,11 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; + + ASSERT_RTNL(); + + if (ifp->prefix_len >= 127) /* RFC 6164 */ + return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) return; @@ -1429,13 +1766,36 @@ static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) return 0; } +static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) +{ + if (dev->addr_len != IEEE802154_ADDR_LEN) + return -1; + memcpy(eui, dev->dev_addr, 8); + eui[0] ^= 2; + return 0; +} + +static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev) +{ + union fwnet_hwaddr *ha; + + if (dev->addr_len != FWNET_ALEN) + return -1; + + ha = (union fwnet_hwaddr *)dev->dev_addr; + + memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id)); + eui[0] ^= 2; + return 0; +} + static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev) { /* XXX: inherit EUI-64 from other interface -- yoshfuji */ if (dev->addr_len != ARCNET_ALEN) return -1; memset(eui, 0, 7); - eui[7] = *(u8*)dev->dev_addr; + eui[7] = *(u8 *)dev->dev_addr; return 0; } @@ -1448,20 +1808,66 @@ static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev) return 0; } +static int __ipv6_isatap_ifid(u8 *eui, __be32 addr) +{ + if (addr == 0) + return -1; + eui[0] = (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) || + ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) || + ipv4_is_private_172(addr) || ipv4_is_test_192(addr) || + ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) || + ipv4_is_test_198(addr) || ipv4_is_multicast(addr) || + ipv4_is_lbcast(addr)) ? 0x00 : 0x02; + eui[1] = 0; + eui[2] = 0x5E; + eui[3] = 0xFE; + memcpy(eui + 4, &addr, 4); + return 0; +} + +static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) +{ + if (dev->priv_flags & IFF_ISATAP) + return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); + return -1; +} + +static int addrconf_ifid_gre(u8 *eui, struct net_device *dev) +{ + return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); +} + +static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev) +{ + memcpy(eui, dev->perm_addr, 3); + memcpy(eui + 5, dev->perm_addr + 3, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; +} + static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: - case ARPHRD_IEEE802_TR: return addrconf_ifid_eui48(eui, dev); case ARPHRD_ARCNET: return addrconf_ifid_arcnet(eui, dev); case ARPHRD_INFINIBAND: return addrconf_ifid_infiniband(eui, dev); case ARPHRD_SIT: - if (dev->priv_flags & IFF_ISATAP) - return ipv6_isatap_eui64(eui, *(__be32 *)dev->dev_addr); + return addrconf_ifid_sit(eui, dev); + case ARPHRD_IPGRE: + return addrconf_ifid_gre(eui, dev); + case ARPHRD_6LOWPAN: + case ARPHRD_IEEE802154: + return addrconf_ifid_eui64(eui, dev); + case ARPHRD_IEEE1394: + return addrconf_ifid_ieee1394(eui, dev); + case ARPHRD_TUNNEL6: + return addrconf_ifid_ip6tnl(eui, dev); } return -1; } @@ -1472,7 +1878,9 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) { + if (ifp->scope > IFA_LINK) + break; if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { memcpy(eui, ifp->addr.s6_addr+8, 8); err = 0; @@ -1483,9 +1891,8 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) return err; } -#ifdef CONFIG_IPV6_PRIVACY /* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ -static int __ipv6_regen_rndid(struct inet6_dev *idev) +static void __ipv6_regen_rndid(struct inet6_dev *idev) { regen: get_random_bytes(idev->rndid, sizeof(idev->rndid)); @@ -1512,8 +1919,6 @@ regen: if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) goto regen; } - - return 0; } static void ipv6_regen_rndid(unsigned long data) @@ -1527,16 +1932,16 @@ static void ipv6_regen_rndid(unsigned long data) if (idev->dead) goto out; - if (__ipv6_regen_rndid(idev) < 0) - goto out; + __ipv6_regen_rndid(idev); expires = jiffies + idev->cnf.temp_prefered_lft * HZ - - idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor; + idev->cnf.regen_max_retry * idev->cnf.dad_transmits * + NEIGH_VAR(idev->nd_parms, RETRANS_TIME) - + idev->cnf.max_desync_factor * HZ; if (time_before(expires, jiffies)) { - printk(KERN_WARNING - "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", - idev->dev->name); + pr_warn("%s: too short regeneration interval; timer disabled for %s\n", + __func__, idev->dev->name); goto out; } @@ -1549,14 +1954,11 @@ out: in6_dev_put(idev); } -static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { - int ret = 0; - +static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) +{ if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) - ret = __ipv6_regen_rndid(idev); - return ret; + __ipv6_regen_rndid(idev); } -#endif /* * Add prefix route. @@ -1573,16 +1975,17 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_expires = expires, .fc_dst_len = plen, .fc_flags = RTF_UP | flags, - .fc_nlinfo.nl_net = &init_net, + .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, }; - ipv6_addr_copy(&cfg.fc_dst, pfx); + cfg.fc_dst = *pfx; /* Prevent useless cloning on PtP SIT. This thing is done here expecting that the whole class of non-broadcast devices need not cloning. */ -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT)) cfg.fc_flags |= RTF_NONEXTHOP; #endif @@ -1590,6 +1993,40 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, ip6_route_add(&cfg); } + +static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, + int plen, + const struct net_device *dev, + u32 flags, u32 noflags) +{ + struct fib6_node *fn; + struct rt6_info *rt = NULL; + struct fib6_table *table; + + table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX); + if (table == NULL) + return NULL; + + read_lock_bh(&table->tb6_lock); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + if (!fn) + goto out; + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt->dst.dev->ifindex != dev->ifindex) + continue; + if ((rt->rt6i_flags & flags) != flags) + continue; + if ((rt->rt6i_flags & noflags) != 0) + continue; + dst_hold(&rt->dst); + break; + } +out: + read_unlock_bh(&table->tb6_lock); + return rt; +} + + /* Create "default" multicast route to the interface */ static void addrconf_add_mroute(struct net_device *dev) @@ -1600,7 +2037,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, - .fc_nlinfo.nl_net = &init_net, + .fc_nlinfo.nl_net = dev_net(dev), }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); @@ -1608,61 +2045,106 @@ static void addrconf_add_mroute(struct net_device *dev) ip6_route_add(&cfg); } -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) -static void sit_route_add(struct net_device *dev) +static struct inet6_dev *addrconf_add_dev(struct net_device *dev) { - struct fib6_config cfg = { - .fc_table = RT6_TABLE_MAIN, - .fc_metric = IP6_RT_PRIO_ADDRCONF, - .fc_ifindex = dev->ifindex, - .fc_dst_len = 96, - .fc_flags = RTF_UP | RTF_NONEXTHOP, - .fc_nlinfo.nl_net = &init_net, - }; + struct inet6_dev *idev; - /* prefix length - 96 bits "::d.d.d.d" */ - ip6_route_add(&cfg); -} -#endif + ASSERT_RTNL(); -static void addrconf_add_lroute(struct net_device *dev) -{ - struct in6_addr addr; + idev = ipv6_find_idev(dev); + if (!idev) + return ERR_PTR(-ENOBUFS); - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - addrconf_prefix_route(&addr, 64, dev, 0, 0); + if (idev->cnf.disable_ipv6) + return ERR_PTR(-EACCES); + + /* Add default multicast route */ + if (!(dev->flags & IFF_LOOPBACK)) + addrconf_add_mroute(dev); + + return idev; } -static struct inet6_dev *addrconf_add_dev(struct net_device *dev) +static void manage_tempaddrs(struct inet6_dev *idev, + struct inet6_ifaddr *ifp, + __u32 valid_lft, __u32 prefered_lft, + bool create, unsigned long now) { - struct inet6_dev *idev; + u32 flags; + struct inet6_ifaddr *ift; - ASSERT_RTNL(); + read_lock_bh(&idev->lock); + /* update all temporary addresses in the list */ + list_for_each_entry(ift, &idev->tempaddr_list, tmp_list) { + int age, max_valid, max_prefered; - if ((idev = ipv6_find_idev(dev)) == NULL) - return NULL; + if (ifp != ift->ifpub) + continue; - /* Add default multicast route */ - addrconf_add_mroute(dev); + /* RFC 4941 section 3.3: + * If a received option will extend the lifetime of a public + * address, the lifetimes of temporary addresses should + * be extended, subject to the overall constraint that no + * temporary addresses should ever remain "valid" or "preferred" + * for a time longer than (TEMP_VALID_LIFETIME) or + * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively. + */ + age = (now - ift->cstamp) / HZ; + max_valid = idev->cnf.temp_valid_lft - age; + if (max_valid < 0) + max_valid = 0; + + max_prefered = idev->cnf.temp_prefered_lft - + idev->cnf.max_desync_factor - age; + if (max_prefered < 0) + max_prefered = 0; + + if (valid_lft > max_valid) + valid_lft = max_valid; + + if (prefered_lft > max_prefered) + prefered_lft = max_prefered; + + spin_lock(&ift->lock); + flags = ift->flags; + ift->valid_lft = valid_lft; + ift->prefered_lft = prefered_lft; + ift->tstamp = now; + if (prefered_lft > 0) + ift->flags &= ~IFA_F_DEPRECATED; + + spin_unlock(&ift->lock); + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ift); + } - /* Add link local route */ - addrconf_add_lroute(dev); - return idev; + if ((create || list_empty(&idev->tempaddr_list)) && + idev->cnf.use_tempaddr > 0) { + /* When a new public address is created as described + * in [ADDRCONF], also create a new temporary address. + * Also create a temporary address if it's enabled but + * no temporary address currently exists. + */ + read_unlock_bh(&idev->lock); + ipv6_create_tempaddr(ifp, NULL); + } else { + read_unlock_bh(&idev->lock); + } } -void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) +void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; __u32 valid_lft; __u32 prefered_lft; int addr_type; - unsigned long rt_expires; struct inet6_dev *in6_dev; + struct net *net = dev_net(dev); pinfo = (struct prefix_info *) opt; if (len < sizeof(struct prefix_info)) { - ADBG(("addrconf: prefix option too short\n")); + ADBG("addrconf: prefix option too short\n"); return; } @@ -1679,16 +2161,15 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) prefered_lft = ntohl(pinfo->prefered); if (prefered_lft > valid_lft) { - if (net_ratelimit()) - printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n"); + net_warn_ratelimited("addrconf: prefix option has invalid lifetime\n"); return; } in6_dev = in6_dev_get(dev); if (in6_dev == NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + net_dbg_ratelimited("addrconf: device %s not configured\n", + dev->name); return; } @@ -1698,70 +2179,86 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) * 2) Configure prefixes with the auto flag set */ - /* Avoid arithmetic overflow. Really, we could - save rt_expires in seconds, likely valid_lft, - but it would require division in fib gc, that it - not good. - */ - if (valid_lft >= 0x7FFFFFFF/HZ) - rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ); - else - rt_expires = valid_lft * HZ; - - /* - * We convert this (in jiffies) to clock_t later. - * Avoid arithmetic overflow there as well. - * Overflow can happen only if HZ < USER_HZ. - */ - if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ) - rt_expires = 0x7FFFFFFF / USER_HZ; - if (pinfo->onlink) { struct rt6_info *rt; - rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1); + unsigned long rt_expires; - if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { - if (rt->rt6i_flags&RTF_EXPIRES) { - if (valid_lft == 0) { - ip6_del_rt(rt); - rt = NULL; - } else { - rt->rt6i_expires = jiffies + rt_expires; - } + /* Avoid arithmetic overflow. Really, we could + * save rt_expires in seconds, likely valid_lft, + * but it would require division in fib gc, that it + * not good. + */ + if (HZ > USER_HZ) + rt_expires = addrconf_timeout_fixup(valid_lft, HZ); + else + rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ); + + if (addrconf_finite_timeout(rt_expires)) + rt_expires *= HZ; + + rt = addrconf_get_prefix_route(&pinfo->prefix, + pinfo->prefix_len, + dev, + RTF_ADDRCONF | RTF_PREFIX_RT, + RTF_GATEWAY | RTF_DEFAULT); + + if (rt) { + /* Autoconf prefix route */ + if (valid_lft == 0) { + ip6_del_rt(rt); + rt = NULL; + } else if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + rt6_set_expires(rt, jiffies + rt_expires); + } else { + rt6_clean_expires(rt); } } else if (valid_lft) { + clock_t expires = 0; + int flags = RTF_ADDRCONF | RTF_PREFIX_RT; + if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + flags |= RTF_EXPIRES; + expires = jiffies_to_clock_t(rt_expires); + } addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + dev, expires, flags); } - if (rt) - dst_release(&rt->u.dst); + ip6_rt_put(rt); } /* Try to figure out our local address for this prefix */ if (pinfo->autoconf && in6_dev->cnf.autoconf) { - struct inet6_ifaddr * ifp; + struct inet6_ifaddr *ifp; struct in6_addr addr; int create = 0, update_lft = 0; + bool tokenized = false; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); - if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && - ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { + + if (!ipv6_addr_any(&in6_dev->token)) { + read_lock_bh(&in6_dev->lock); + memcpy(addr.s6_addr + 8, + in6_dev->token.s6_addr + 8, 8); + read_unlock_bh(&in6_dev->lock); + tokenized = true; + } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { in6_dev_put(in6_dev); return; } goto ok; } - if (net_ratelimit()) - printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", - pinfo->prefix_len); + net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n", + pinfo->prefix_len); in6_dev_put(in6_dev); return; ok: - ifp = ipv6_get_ifaddr(&init_net, &addr, dev, 1); + ifp = ipv6_get_ifaddr(net, &addr, dev, 1); if (ifp == NULL && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -1769,7 +2266,7 @@ ok: #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && - !ipv6_devconf.forwarding) + !net->ipv6.devconf_all->forwarding && sllao) addr_flags = IFA_F_OPTIMISTIC; #endif @@ -1778,26 +2275,30 @@ ok: */ if (!max_addresses || ipv6_count_addresses(in6_dev) < max_addresses) - ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, + ifp = ipv6_add_addr(in6_dev, &addr, NULL, + pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, - addr_flags); + addr_flags, valid_lft, + prefered_lft); - if (!ifp || IS_ERR(ifp)) { + if (IS_ERR_OR_NULL(ifp)) { in6_dev_put(in6_dev); return; } - update_lft = create = 1; + update_lft = 0; + create = 1; + spin_lock_bh(&ifp->lock); + ifp->flags |= IFA_F_MANAGETEMPADDR; ifp->cstamp = jiffies; - addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); + ifp->tokenized = tokenized; + spin_unlock_bh(&ifp->lock); + addrconf_dad_start(ifp); } if (ifp) { - int flags; + u32 flags; unsigned long now; -#ifdef CONFIG_IPV6_PRIVACY - struct inet6_ifaddr *ift; -#endif u32 stored_lft; /* update lifetime (RFC2462 5.5.3 e) */ @@ -1807,20 +2308,22 @@ ok: stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!update_lft && stored_lft) { - if (valid_lft > MIN_VALID_LIFETIME || - valid_lft > stored_lft) - update_lft = 1; - else if (stored_lft <= MIN_VALID_LIFETIME) { - /* valid_lft <= stored_lft is always true */ - /* XXX: IPsec */ - update_lft = 0; - } else { - valid_lft = MIN_VALID_LIFETIME; - if (valid_lft < prefered_lft) - prefered_lft = valid_lft; - update_lft = 1; - } + if (!update_lft && !create && stored_lft) { + const u32 minimum_lft = min( + stored_lft, (u32)MIN_VALID_LIFETIME); + valid_lft = max(valid_lft, minimum_lft); + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = 1; } if (update_lft) { @@ -1836,46 +2339,11 @@ ok: } else spin_unlock(&ifp->lock); -#ifdef CONFIG_IPV6_PRIVACY - read_lock_bh(&in6_dev->lock); - /* update all temporary addresses in the list */ - for (ift=in6_dev->tempaddr_list; ift; ift=ift->tmp_next) { - /* - * When adjusting the lifetimes of an existing - * temporary address, only lower the lifetimes. - * Implementations must not increase the - * lifetimes of an existing temporary address - * when processing a Prefix Information Option. - */ - if (ifp != ift->ifpub) - continue; - - spin_lock(&ift->lock); - flags = ift->flags; - if (ift->valid_lft > valid_lft && - ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ) - ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ; - if (ift->prefered_lft > prefered_lft && - ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ) - ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ; - spin_unlock(&ift->lock); - if (!(flags&IFA_F_TENTATIVE)) - ipv6_ifa_notify(0, ift); - } + manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, + create, now); - if (create && in6_dev->cnf.use_tempaddr > 0) { - /* - * When a new public address is created as described in [ADDRCONF], - * also create a new temporary address. - */ - read_unlock_bh(&in6_dev->lock); - ipv6_create_tempaddr(ifp, NULL); - } else { - read_unlock_bh(&in6_dev->lock); - } -#endif in6_ifa_put(ifp); - addrconf_verify(0); + addrconf_verify(); } } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); @@ -1887,7 +2355,7 @@ ok: * Special case for SIT interfaces where we create a new "virtual" * device. */ -int addrconf_set_dstaddr(void __user *arg) +int addrconf_set_dstaddr(struct net *net, void __user *arg) { struct in6_ifreq ireq; struct net_device *dev; @@ -1899,16 +2367,16 @@ int addrconf_set_dstaddr(void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) goto err_exit; - dev = __dev_get_by_index(&init_net, ireq.ifr6_ifindex); + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); err = -ENODEV; if (dev == NULL) goto err_exit; -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) if (dev->type == ARPHRD_SIT) { + const struct net_device_ops *ops = dev->netdev_ops; struct ifreq ifr; - mm_segment_t oldfs; struct ip_tunnel_parm p; err = -EADDRNOTAVAIL; @@ -1924,13 +2392,19 @@ int addrconf_set_dstaddr(void __user *arg) p.iph.ttl = 64; ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - oldfs = get_fs(); set_fs(KERNEL_DS); - err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); + if (ops->ndo_do_ioctl) { + mm_segment_t oldfs = get_fs(); + + set_fs(KERNEL_DS); + err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + } else + err = -EOPNOTSUPP; if (err == 0) { err = -ENOBUFS; - if ((dev = __dev_get_by_name(&init_net, p.name)) == NULL) + dev = __dev_get_by_name(net, p.name); + if (!dev) goto err_exit; err = dev_open(dev); } @@ -1945,92 +2419,116 @@ err_exit: /* * Manual configuration of address on an interface */ -static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen, - __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft) +static int inet6_addr_add(struct net *net, int ifindex, + const struct in6_addr *pfx, + const struct in6_addr *peer_pfx, + unsigned int plen, __u32 ifa_flags, + __u32 prefered_lft, __u32 valid_lft) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; int scope; - u32 flags = RTF_EXPIRES; + u32 flags; + clock_t expires; + unsigned long timeout; ASSERT_RTNL(); + if (plen > 128) + return -EINVAL; + /* check the lifetime */ if (!valid_lft || prefered_lft > valid_lft) return -EINVAL; - if ((dev = __dev_get_by_index(&init_net, ifindex)) == NULL) + if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64) + return -EINVAL; + + dev = __dev_get_by_index(net, ifindex); + if (!dev) return -ENODEV; - if ((idev = addrconf_add_dev(dev)) == NULL) - return -ENOBUFS; + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) + return PTR_ERR(idev); scope = ipv6_addr_scope(pfx); - if (valid_lft == INFINITY_LIFE_TIME) { - ifa_flags |= IFA_F_PERMANENT; + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft = timeout; + flags = RTF_EXPIRES; + } else { + expires = 0; flags = 0; - } else if (valid_lft >= 0x7FFFFFFF/HZ) - valid_lft = 0x7FFFFFFF/HZ; + ifa_flags |= IFA_F_PERMANENT; + } - if (prefered_lft == 0) - ifa_flags |= IFA_F_DEPRECATED; - else if ((prefered_lft >= 0x7FFFFFFF/HZ) && - (prefered_lft != INFINITY_LIFE_TIME)) - prefered_lft = 0x7FFFFFFF/HZ; + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } - ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); + ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, + valid_lft, prefered_lft); if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->valid_lft = valid_lft; - ifp->prefered_lft = prefered_lft; - ifp->tstamp = jiffies; - spin_unlock_bh(&ifp->lock); + if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, + expires, flags); + } - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - jiffies_to_clock_t(valid_lft * HZ), flags); /* * Note that section 3.1 of RFC 4429 indicates * that the Optimistic flag should not be set for * manually configured addresses */ - addrconf_dad_start(ifp, 0); + addrconf_dad_start(ifp); + if (ifa_flags & IFA_F_MANAGETEMPADDR) + manage_tempaddrs(idev, ifp, valid_lft, prefered_lft, + true, jiffies); in6_ifa_put(ifp); - addrconf_verify(0); + addrconf_verify_rtnl(); return 0; } return PTR_ERR(ifp); } -static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) +static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, + const struct in6_addr *pfx, unsigned int plen) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; - if ((dev = __dev_get_by_index(&init_net, ifindex)) == NULL) + if (plen > 128) + return -EINVAL; + + dev = __dev_get_by_index(net, ifindex); + if (!dev) return -ENODEV; if ((idev = __in6_dev_get(dev)) == NULL) return -ENXIO; read_lock_bh(&idev->lock); - for (ifp = idev->addr_list; ifp; ifp=ifp->if_next) { + list_for_each_entry(ifp, &idev->addr_list, if_list) { if (ifp->prefix_len == plen && ipv6_addr_equal(pfx, &ifp->addr)) { in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); + if (!(ifp->flags & IFA_F_TEMPORARY) && + (ifa_flags & IFA_F_MANAGETEMPADDR)) + manage_tempaddrs(idev, ifp, 0, 0, false, + jiffies); ipv6_del_addr(ifp); - - /* If the last address is deleted administratively, - disable IPv6 on this interface. - */ - if (idev->addr_list == NULL) - addrconf_ifdown(idev->dev, 1); + addrconf_verify_rtnl(); return 0; } } @@ -2039,48 +2537,68 @@ static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) } -int addrconf_add_ifaddr(void __user *arg) +int addrconf_add_ifaddr(struct net *net, void __user *arg) { struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; rtnl_lock(); - err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen, - IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, + ireq.ifr6_prefixlen, IFA_F_PERMANENT, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); rtnl_unlock(); return err; } -int addrconf_del_ifaddr(void __user *arg) +int addrconf_del_ifaddr(struct net *net, void __user *arg) { struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; rtnl_lock(); - err = inet6_addr_del(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, + ireq.ifr6_prefixlen); rtnl_unlock(); return err; } -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, + int plen, int scope) +{ + struct inet6_ifaddr *ifp; + + ifp = ipv6_add_addr(idev, addr, NULL, plen, + scope, IFA_F_PERMANENT, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } +} + +#if IS_ENABLED(CONFIG_IPV6_SIT) static void sit_add_v4_addrs(struct inet6_dev *idev) { - struct inet6_ifaddr * ifp; struct in6_addr addr; struct net_device *dev; - int scope; + struct net *net = dev_net(idev->dev); + int scope, plen; + u32 pflags = 0; ASSERT_RTNL(); @@ -2090,31 +2608,27 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (idev->dev->flags&IFF_POINTOPOINT) { addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; + plen = 64; } else { scope = IPV6_ADDR_COMPATv4; + plen = 96; + pflags |= RTF_NONEXTHOP; } if (addr.s6_addr32[3]) { - ifp = ipv6_add_addr(idev, &addr, 128, scope, IFA_F_PERMANENT); - if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->flags &= ~IFA_F_TENTATIVE; - spin_unlock_bh(&ifp->lock); - ipv6_ifa_notify(RTM_NEWADDR, ifp); - in6_ifa_put(ifp); - } + add_addr(idev, &addr, plen, scope); + addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags); return; } - for_each_netdev(&init_net, dev) { - struct in_device * in_dev = __in_dev_get_rtnl(dev); + for_each_netdev(net, dev) { + struct in_device *in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { - struct in_ifaddr * ifa; + struct in_ifaddr *ifa; int flag = scope; for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - int plen; addr.s6_addr32[3] = ifa->ifa_local; @@ -2125,20 +2639,10 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) continue; flag |= IFA_HOST; } - if (idev->dev->flags&IFF_POINTOPOINT) - plen = 64; - else - plen = 96; - - ifp = ipv6_add_addr(idev, &addr, plen, flag, - IFA_F_PERMANENT); - if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->flags &= ~IFA_F_TENTATIVE; - spin_unlock_bh(&ifp->lock); - ipv6_ifa_notify(RTM_NEWADDR, ifp); - in6_ifa_put(ifp); - } + + add_addr(idev, &addr, plen, flag); + addrconf_prefix_route(&addr, plen, idev->dev, 0, + pflags); } } } @@ -2148,43 +2652,78 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) static void init_loopback(struct net_device *dev) { struct inet6_dev *idev; - struct inet6_ifaddr * ifp; + struct net_device *sp_dev; + struct inet6_ifaddr *sp_ifa; + struct rt6_info *sp_rt; /* ::1 */ ASSERT_RTNL(); if ((idev = ipv6_find_idev(dev)) == NULL) { - printk(KERN_DEBUG "init loopback: add_dev failed\n"); + pr_debug("%s: add_dev failed\n", __func__); return; } - ifp = ipv6_add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFA_F_PERMANENT); - if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->flags &= ~IFA_F_TENTATIVE; - spin_unlock_bh(&ifp->lock); - ipv6_ifa_notify(RTM_NEWADDR, ifp); - in6_ifa_put(ifp); + add_addr(idev, &in6addr_loopback, 128, IFA_HOST); + + /* Add routes to other interface's IPv6 addresses */ + for_each_netdev(dev_net(dev), sp_dev) { + if (!strcmp(sp_dev->name, dev->name)) + continue; + + idev = __in6_dev_get(sp_dev); + if (!idev) + continue; + + read_lock_bh(&idev->lock); + list_for_each_entry(sp_ifa, &idev->addr_list, if_list) { + + if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE)) + continue; + + if (sp_ifa->rt) { + /* This dst has been added to garbage list when + * lo device down, release this obsolete dst and + * reallocate a new router for ifa. + */ + if (sp_ifa->rt->dst.obsolete > 0) { + ip6_rt_put(sp_ifa->rt); + sp_ifa->rt = NULL; + } else { + continue; + } + } + + sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false); + + /* Failure cases are ignored */ + if (!IS_ERR(sp_rt)) { + sp_ifa->rt = sp_rt; + ip6_ins_rt(sp_rt); + } + } + read_unlock_bh(&idev->lock); } } -static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) +static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) { - struct inet6_ifaddr * ifp; + struct inet6_ifaddr *ifp; u32 addr_flags = IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (idev->cnf.optimistic_dad && - !ipv6_devconf.forwarding) + !dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif - ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, addr_flags); + ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); - addrconf_dad_start(ifp, 0); + addrconf_dad_start(ifp); in6_ifa_put(ifp); } } @@ -2192,21 +2731,24 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr static void addrconf_dev_config(struct net_device *dev) { struct in6_addr addr; - struct inet6_dev * idev; + struct inet6_dev *idev; ASSERT_RTNL(); if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_FDDI) && - (dev->type != ARPHRD_IEEE802_TR) && (dev->type != ARPHRD_ARCNET) && - (dev->type != ARPHRD_INFINIBAND)) { + (dev->type != ARPHRD_INFINIBAND) && + (dev->type != ARPHRD_IEEE802154) && + (dev->type != ARPHRD_IEEE1394) && + (dev->type != ARPHRD_TUNNEL6) && + (dev->type != ARPHRD_6LOWPAN)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } idev = addrconf_add_dev(dev); - if (idev == NULL) + if (IS_ERR(idev)) return; memset(&addr, 0, sizeof(struct in6_addr)); @@ -2216,7 +2758,7 @@ static void addrconf_dev_config(struct net_device *dev) addrconf_add_linklocal(idev, &addr); } -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) static void addrconf_sit_config(struct net_device *dev) { struct inet6_dev *idev; @@ -2230,7 +2772,7 @@ static void addrconf_sit_config(struct net_device *dev) */ if ((idev = ipv6_find_idev(dev)) == NULL) { - printk(KERN_DEBUG "init sit: add_dev failed\n"); + pr_debug("%s: add_dev failed\n", __func__); return; } @@ -2238,7 +2780,6 @@ static void addrconf_sit_config(struct net_device *dev) struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - addrconf_prefix_route(&addr, 64, dev, 0, 0); if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) addrconf_add_linklocal(idev, &addr); return; @@ -2246,74 +2787,41 @@ static void addrconf_sit_config(struct net_device *dev) sit_add_v4_addrs(idev); - if (dev->flags&IFF_POINTOPOINT) { + if (dev->flags&IFF_POINTOPOINT) addrconf_add_mroute(dev); - addrconf_add_lroute(dev); - } else - sit_route_add(dev); } #endif -static inline int -ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) -{ - struct in6_addr lladdr; - - if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) { - addrconf_add_linklocal(idev, &lladdr); - return 0; - } - return -1; -} - -static void ip6_tnl_add_linklocal(struct inet6_dev *idev) -{ - struct net_device *link_dev; - - /* first try to inherit the link-local address from the link device */ - if (idev->dev->iflink && - (link_dev = __dev_get_by_index(&init_net, idev->dev->iflink))) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - /* then try to inherit it from any device */ - for_each_netdev(&init_net, link_dev) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n"); -} - -/* - * Autoconfigure tunnel with a link-local address so routing protocols, - * DHCPv6, MLD etc. can be run over the virtual link - */ - -static void addrconf_ip6_tnl_config(struct net_device *dev) +#if IS_ENABLED(CONFIG_NET_IPGRE) +static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; + struct in6_addr addr; ASSERT_RTNL(); - if ((idev = addrconf_add_dev(dev)) == NULL) { - printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n"); + if ((idev = ipv6_find_idev(dev)) == NULL) { + pr_debug("%s: add_dev failed\n", __func__); return; } - ip6_tnl_add_linklocal(idev); + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) + addrconf_add_linklocal(idev, &addr); + else + addrconf_prefix_route(&addr, 64, dev, 0, 0); } +#endif static int addrconf_notify(struct notifier_block *this, unsigned long event, - void * data) + void *ptr) { - struct net_device *dev = (struct net_device *) data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); int run_pending = 0; int err; - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - - switch(event) { + switch (event) { case NETDEV_REGISTER: if (!idev && dev->mtu >= IPV6_MIN_MTU) { idev = ipv6_add_dev(dev); @@ -2321,6 +2829,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, return notifier_from_errno(-ENOMEM); } break; + case NETDEV_UP: case NETDEV_CHANGE: if (dev->flags & IFF_SLAVE) @@ -2329,9 +2838,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (event == NETDEV_UP) { if (!addrconf_qdisc_ok(dev)) { /* device is not ready yet. */ - printk(KERN_INFO - "ADDRCONF(NETDEV_UP): %s: " - "link is not ready\n", + pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); break; } @@ -2339,8 +2846,10 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (!idev && dev->mtu >= IPV6_MIN_MTU) idev = ipv6_add_dev(dev); - if (idev) + if (idev) { idev->if_flags |= IF_READY; + run_pending = 1; + } } else { if (!addrconf_qdisc_ok(dev)) { /* device is still not ready. */ @@ -2348,30 +2857,29 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } if (idev) { - if (idev->if_flags & IF_READY) { + if (idev->if_flags & IF_READY) /* device is already configured. */ break; - } idev->if_flags |= IF_READY; } - printk(KERN_INFO - "ADDRCONF(NETDEV_CHANGE): %s: " - "link becomes ready\n", - dev->name); + pr_info("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n", + dev->name); run_pending = 1; } - switch(dev->type) { -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + switch (dev->type) { +#if IS_ENABLED(CONFIG_IPV6_SIT) case ARPHRD_SIT: addrconf_sit_config(dev); break; #endif - case ARPHRD_TUNNEL6: - addrconf_ip6_tnl_config(dev); +#if IS_ENABLED(CONFIG_NET_IPGRE) + case ARPHRD_IPGRE: + addrconf_gre_config(dev); break; +#endif case ARPHRD_LOOPBACK: init_loopback(dev); break; @@ -2380,25 +2888,30 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, addrconf_dev_config(dev); break; } + if (idev) { if (run_pending) addrconf_dad_run(idev); - /* If the MTU changed during the interface down, when the - interface up, the changed MTU must be reflected in the - idev as well as routers. + /* + * If the MTU changed during the interface down, + * when the interface up, the changed MTU must be + * reflected in the idev as well as routers. */ - if (idev->cnf.mtu6 != dev->mtu && dev->mtu >= IPV6_MIN_MTU) { + if (idev->cnf.mtu6 != dev->mtu && + dev->mtu >= IPV6_MIN_MTU) { rt6_mtu_change(dev, dev->mtu); idev->cnf.mtu6 = dev->mtu; } idev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, idev); - /* If the changed mtu during down is lower than IPV6_MIN_MTU - stop IPv6 on this interface. + + /* + * If the changed mtu during down is lower than + * IPV6_MIN_MTU stop IPv6 on this interface. */ if (dev->mtu < IPV6_MIN_MTU) - addrconf_ifdown(dev, event != NETDEV_DOWN); + addrconf_ifdown(dev, 1); } break; @@ -2415,7 +2928,10 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; } - /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */ + /* + * if MTU under IPV6_MIN_MTU. + * Stop IPv6 on this interface. + */ case NETDEV_DOWN: case NETDEV_UNREGISTER: @@ -2435,6 +2951,11 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, return notifier_from_errno(err); } break; + + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + addrconf_type_change(dev, event); + break; } return NOTIFY_OK; @@ -2445,35 +2966,46 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, */ static struct notifier_block ipv6_dev_notf = { .notifier_call = addrconf_notify, - .priority = 0 }; +static void addrconf_type_change(struct net_device *dev, unsigned long event) +{ + struct inet6_dev *idev; + ASSERT_RTNL(); + + idev = __in6_dev_get(dev); + + if (event == NETDEV_POST_TYPE_CHANGE) + ipv6_mc_remap(idev); + else if (event == NETDEV_PRE_TYPE_CHANGE) + ipv6_mc_unmap(idev); +} + static int addrconf_ifdown(struct net_device *dev, int how) { + struct net *net = dev_net(dev); struct inet6_dev *idev; - struct inet6_ifaddr *ifa, **bifa; - int i; + struct inet6_ifaddr *ifa; + int state, i; ASSERT_RTNL(); - if (dev == init_net.loopback_dev && how == 1) - how = 0; - - rt6_ifdown(dev); + rt6_ifdown(net, dev); neigh_ifdown(&nd_tbl, dev); idev = __in6_dev_get(dev); if (idev == NULL) return -ENODEV; - /* Step 1: remove reference to ipv6 device from parent device. - Do not dev_put! + /* + * Step 1: remove reference to ipv6 device from parent device. + * Do not dev_put! */ - if (how == 1) { + if (how) { idev->dead = 1; /* protected by rtnl_lock */ - rcu_assign_pointer(dev->ip6_ptr, NULL); + RCU_INIT_POINTER(dev->ip6_ptr, NULL); /* Step 1.5: remove snmp6 entry */ snmp6_unregister_dev(idev); @@ -2481,39 +3013,37 @@ static int addrconf_ifdown(struct net_device *dev, int how) } /* Step 2: clear hash table */ - for (i=0; i<IN6_ADDR_HSIZE; i++) { - bifa = &inet6_addr_lst[i]; + for (i = 0; i < IN6_ADDR_HSIZE; i++) { + struct hlist_head *h = &inet6_addr_lst[i]; - write_lock_bh(&addrconf_hash_lock); - while ((ifa = *bifa) != NULL) { + spin_lock_bh(&addrconf_hash_lock); + restart: + hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { - *bifa = ifa->lst_next; - ifa->lst_next = NULL; - addrconf_del_timer(ifa); - in6_ifa_put(ifa); - continue; + hlist_del_init_rcu(&ifa->addr_lst); + addrconf_del_dad_work(ifa); + goto restart; } - bifa = &ifa->lst_next; } - write_unlock_bh(&addrconf_hash_lock); + spin_unlock_bh(&addrconf_hash_lock); } write_lock_bh(&idev->lock); - /* Step 3: clear flags for stateless addrconf */ - if (how != 1) + addrconf_del_rs_timer(idev); + + /* Step 2: clear flags for stateless addrconf */ + if (!how) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); - /* Step 4: clear address list */ -#ifdef CONFIG_IPV6_PRIVACY - if (how == 1 && del_timer(&idev->regen_timer)) + if (how && del_timer(&idev->regen_timer)) in6_dev_put(idev); - /* clear tempaddr list */ - while ((ifa = idev->tempaddr_list) != NULL) { - idev->tempaddr_list = ifa->tmp_next; - ifa->tmp_next = NULL; - ifa->dead = 1; + /* Step 3: clear tempaddr list */ + while (!list_empty(&idev->tempaddr_list)) { + ifa = list_first_entry(&idev->tempaddr_list, + struct inet6_ifaddr, tmp_list); + list_del(&ifa->tmp_list); write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); @@ -2525,34 +3055,42 @@ static int addrconf_ifdown(struct net_device *dev, int how) in6_ifa_put(ifa); write_lock_bh(&idev->lock); } -#endif - while ((ifa = idev->addr_list) != NULL) { - idev->addr_list = ifa->if_next; - ifa->if_next = NULL; - ifa->dead = 1; - addrconf_del_timer(ifa); + + while (!list_empty(&idev->addr_list)) { + ifa = list_first_entry(&idev->addr_list, + struct inet6_ifaddr, if_list); + addrconf_del_dad_work(ifa); + + list_del(&ifa->if_list); + write_unlock_bh(&idev->lock); - __ipv6_ifa_notify(RTM_DELADDR, ifa); - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa); + spin_lock_bh(&ifa->state_lock); + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifa->state_lock); + + if (state != INET6_IFADDR_STATE_DEAD) { + __ipv6_ifa_notify(RTM_DELADDR, ifa); + inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); + } in6_ifa_put(ifa); write_lock_bh(&idev->lock); } + write_unlock_bh(&idev->lock); /* Step 5: Discard multicast list */ - - if (how == 1) + if (how) ipv6_mc_destroy_dev(idev); else ipv6_mc_down(idev); idev->tstamp = jiffies; - /* Shot the device (if unregistered) */ - - if (how == 1) { + /* Last: Shot the device (if unregistered) */ + if (how) { addrconf_sysctl_unregister(idev); neigh_parms_release(&nd_tbl, idev->nd_parms); neigh_ifdown(&nd_tbl, dev); @@ -2563,45 +3101,47 @@ static int addrconf_ifdown(struct net_device *dev, int how) static void addrconf_rs_timer(unsigned long data) { - struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = (struct inet6_dev *)data; + struct net_device *dev = idev->dev; + struct in6_addr lladdr; - if (ifp->idev->cnf.forwarding) + write_lock(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) goto out; - if (ifp->idev->if_flags & IF_RA_RCVD) { - /* - * Announcement received after solicitation - * was sent - */ + if (!ipv6_accept_ra(idev)) goto out; - } - - spin_lock(&ifp->lock); - if (ifp->probes++ < ifp->idev->cnf.rtr_solicits) { - struct in6_addr all_routers; - /* The wait after the last probe can be shorter */ - addrconf_mod_timer(ifp, AC_RS, - (ifp->probes == ifp->idev->cnf.rtr_solicits) ? - ifp->idev->cnf.rtr_solicit_delay : - ifp->idev->cnf.rtr_solicit_interval); - spin_unlock(&ifp->lock); + /* Announcement received after solicitation was sent */ + if (idev->if_flags & IF_RA_RCVD) + goto out; - ipv6_addr_all_routers(&all_routers); + if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + write_unlock(&idev->lock); + if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) + ndisc_send_rs(dev, &lladdr, + &in6addr_linklocal_allrouters); + else + goto put; - ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); + write_lock(&idev->lock); + /* The wait after the last probe can be shorter */ + addrconf_mod_rs_timer(idev, (idev->rs_probes == + idev->cnf.rtr_solicits) ? + idev->cnf.rtr_solicit_delay : + idev->cnf.rtr_solicit_interval); } else { - spin_unlock(&ifp->lock); /* * Note: we do not support deprecated "all on-link" * assumption any longer. */ - printk(KERN_DEBUG "%s: no IPv6 routers present\n", - ifp->idev->dev->name); + pr_debug("%s: no IPv6 routers present\n", idev->dev->name); } out: - in6_ifa_put(ifp); + write_unlock(&idev->lock); +put: + in6_dev_put(idev); } /* @@ -2615,31 +3155,32 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); - ifp->probes = idev->cnf.dad_transmits; - addrconf_mod_timer(ifp, AC_DAD, rand_num); + ifp->dad_probes = idev->cnf.dad_transmits; + addrconf_mod_dad_work(ifp, rand_num); } -static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) +static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; addrconf_join_solict(dev, &ifp->addr); - net_srandom(ifp->addr.s6_addr32[3]); + prandom_seed((__force u32) ifp->addr.s6_addr32[3]); read_lock_bh(&idev->lock); - if (ifp->dead) + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) goto out; - spin_lock_bh(&ifp->lock); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { - ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC); - spin_unlock_bh(&ifp->lock); + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); addrconf_dad_completed(ifp); @@ -2647,15 +3188,15 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) } if (!(idev->if_flags & IF_READY)) { - spin_unlock_bh(&ifp->lock); + spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); /* - * If the defice is not ready: + * If the device is not ready: * - keep it tentative if it is a permanent address. * - otherwise, kill it. */ in6_ifa_hold(ifp); - addrconf_dad_stop(ifp); + addrconf_dad_stop(ifp, 0); return; } @@ -2663,58 +3204,133 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) * Optimistic nodes can start receiving * Frames right away */ - if(ifp->flags & IFA_F_OPTIMISTIC) + if (ifp->flags & IFA_F_OPTIMISTIC) ip6_ins_rt(ifp->rt); addrconf_dad_kick(ifp); - spin_unlock_bh(&ifp->lock); out: + spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); } -static void addrconf_dad_timer(unsigned long data) +static void addrconf_dad_start(struct inet6_ifaddr *ifp) { - struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + bool begin_dad = false; + + spin_lock_bh(&ifp->state_lock); + if (ifp->state != INET6_IFADDR_STATE_DEAD) { + ifp->state = INET6_IFADDR_STATE_PREDAD; + begin_dad = true; + } + spin_unlock_bh(&ifp->state_lock); + + if (begin_dad) + addrconf_mod_dad_work(ifp, 0); +} + +static void addrconf_dad_work(struct work_struct *w) +{ + struct inet6_ifaddr *ifp = container_of(to_delayed_work(w), + struct inet6_ifaddr, + dad_work); struct inet6_dev *idev = ifp->idev; - struct in6_addr unspec; struct in6_addr mcaddr; - read_lock_bh(&idev->lock); - if (idev->dead) { - read_unlock_bh(&idev->lock); + enum { + DAD_PROCESS, + DAD_BEGIN, + DAD_ABORT, + } action = DAD_PROCESS; + + rtnl_lock(); + + spin_lock_bh(&ifp->state_lock); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) { + action = DAD_BEGIN; + ifp->state = INET6_IFADDR_STATE_DAD; + } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) { + action = DAD_ABORT; + ifp->state = INET6_IFADDR_STATE_POSTDAD; + } + spin_unlock_bh(&ifp->state_lock); + + if (action == DAD_BEGIN) { + addrconf_dad_begin(ifp); + goto out; + } else if (action == DAD_ABORT) { + addrconf_dad_stop(ifp, 1); goto out; } - spin_lock_bh(&ifp->lock); - if (ifp->probes == 0) { + + if (!ifp->dad_probes && addrconf_dad_end(ifp)) + goto out; + + write_lock_bh(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) { + write_unlock_bh(&idev->lock); + goto out; + } + + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) { + spin_unlock(&ifp->lock); + write_unlock_bh(&idev->lock); + goto out; + } + + if (ifp->dad_probes == 0) { /* * DAD was successful */ - ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC); - spin_unlock_bh(&ifp->lock); - read_unlock_bh(&idev->lock); + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + spin_unlock(&ifp->lock); + write_unlock_bh(&idev->lock); addrconf_dad_completed(ifp); goto out; } - ifp->probes--; - addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time); - spin_unlock_bh(&ifp->lock); - read_unlock_bh(&idev->lock); + ifp->dad_probes--; + addrconf_mod_dad_work(ifp, + NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); + spin_unlock(&ifp->lock); + write_unlock_bh(&idev->lock); /* send a neighbour solicitation for our addr */ - memset(&unspec, 0, sizeof(unspec)); addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); + rtnl_unlock(); +} + +/* ifp->idev must be at least read locked */ +static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ifpiter; + struct inet6_dev *idev = ifp->idev; + + list_for_each_entry_reverse(ifpiter, &idev->addr_list, if_list) { + if (ifpiter->scope > IFA_LINK) + break; + if (ifp != ifpiter && ifpiter->scope == IFA_LINK && + (ifpiter->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE| + IFA_F_OPTIMISTIC|IFA_F_DADFAILED)) == + IFA_F_PERMANENT) + return false; + } + return true; } static void addrconf_dad_completed(struct inet6_ifaddr *ifp) { - struct net_device * dev = ifp->idev->dev; + struct net_device *dev = ifp->idev->dev; + struct in6_addr lladdr; + bool send_rs, send_mld; + + addrconf_del_dad_work(ifp); /* * Configure the address for reception. Now it is valid. @@ -2722,45 +3338,56 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) ipv6_ifa_notify(RTM_NEWADDR, ifp); - /* If added prefix is link local and forwarding is off, - start sending router solicitations. + /* If added prefix is link local and we are prepared to process + router advertisements, start sending router solicitations. */ - if (ifp->idev->cnf.forwarding == 0 && - ifp->idev->cnf.rtr_solicits > 0 && - (dev->flags&IFF_LOOPBACK) == 0 && - (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { - struct in6_addr all_routers; + read_lock_bh(&ifp->idev->lock); + send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); + send_rs = send_mld && + ipv6_accept_ra(ifp->idev) && + ifp->idev->cnf.rtr_solicits > 0 && + (dev->flags&IFF_LOOPBACK) == 0; + read_unlock_bh(&ifp->idev->lock); - ipv6_addr_all_routers(&all_routers); + /* While dad is in progress mld report's source address is in6_addrany. + * Resend with proper ll now. + */ + if (send_mld) + ipv6_mc_dad_complete(ifp->idev); + if (send_rs) { /* * If a host as already performed a random delay * [...] as part of DAD [...] there is no need * to delay again before sending the first RS */ - ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); + if (ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) + return; + ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters); - spin_lock_bh(&ifp->lock); - ifp->probes = 1; + write_lock_bh(&ifp->idev->lock); + spin_lock(&ifp->lock); + ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); - spin_unlock_bh(&ifp->lock); + addrconf_mod_rs_timer(ifp->idev, + ifp->idev->cnf.rtr_solicit_interval); + spin_unlock(&ifp->lock); + write_unlock_bh(&ifp->idev->lock); } } -static void addrconf_dad_run(struct inet6_dev *idev) { +static void addrconf_dad_run(struct inet6_dev *idev) +{ struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { - spin_lock_bh(&ifp->lock); - if (!(ifp->flags & IFA_F_TENTATIVE)) { - spin_unlock_bh(&ifp->lock); - continue; - } - spin_unlock_bh(&ifp->lock); - addrconf_dad_kick(ifp); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + spin_lock(&ifp->lock); + if (ifp->flags & IFA_F_TENTATIVE && + ifp->state == INET6_IFADDR_STATE_DAD) + addrconf_dad_kick(ifp); + spin_unlock(&ifp->lock); } read_unlock_bh(&idev->lock); } @@ -2769,62 +3396,75 @@ static void addrconf_dad_run(struct inet6_dev *idev) { struct if6_iter_state { struct seq_net_private p; int bucket; + int offset; }; -static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) +static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) { struct inet6_ifaddr *ifa = NULL; struct if6_iter_state *state = seq->private; - struct net *net = state->p.net; + struct net *net = seq_file_net(seq); + int p = 0; - for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { - ifa = inet6_addr_lst[state->bucket]; + /* initial bucket if pos is 0 */ + if (pos == 0) { + state->bucket = 0; + state->offset = 0; + } - while (ifa && ifa->idev->dev->nd_net != net) - ifa = ifa->lst_next; - if (ifa) - break; + for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { + hlist_for_each_entry_rcu_bh(ifa, &inet6_addr_lst[state->bucket], + addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; + /* sync with offset */ + if (p < state->offset) { + p++; + continue; + } + state->offset++; + return ifa; + } + + /* prepare for next bucket */ + state->offset = 0; + p = 0; } - return ifa; + return NULL; } -static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct inet6_ifaddr *ifa) +static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, + struct inet6_ifaddr *ifa) { struct if6_iter_state *state = seq->private; - struct net *net = state->p.net; - - ifa = ifa->lst_next; -try_again: - if (ifa) { - if (ifa->idev->dev->nd_net != net) { - ifa = ifa->lst_next; - goto try_again; - } - } + struct net *net = seq_file_net(seq); - if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { - ifa = inet6_addr_lst[state->bucket]; - goto try_again; + hlist_for_each_entry_continue_rcu_bh(ifa, addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; + state->offset++; + return ifa; } - return ifa; -} - -static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos) -{ - struct inet6_ifaddr *ifa = if6_get_first(seq); + while (++state->bucket < IN6_ADDR_HSIZE) { + state->offset = 0; + hlist_for_each_entry_rcu_bh(ifa, + &inet6_addr_lst[state->bucket], addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; + state->offset++; + return ifa; + } + } - if (ifa) - while(pos && (ifa = if6_get_next(seq, ifa)) != NULL) - --pos; - return pos ? NULL : ifa; + return NULL; } static void *if6_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(addrconf_hash_lock) + __acquires(rcu_bh) { - read_lock_bh(&addrconf_hash_lock); - return if6_get_idx(seq, *pos); + rcu_read_lock_bh(); + return if6_get_first(seq, *pos); } static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -2837,21 +3477,20 @@ static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void if6_seq_stop(struct seq_file *seq, void *v) - __releases(addrconf_hash_lock) + __releases(rcu_bh) { - read_unlock_bh(&addrconf_hash_lock); + rcu_read_unlock_bh(); } static int if6_seq_show(struct seq_file *seq, void *v) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; - seq_printf(seq, - NIP6_SEQFMT " %02x %02x %02x %02x %8s\n", - NIP6(ifp->addr), + seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", + &ifp->addr, ifp->idev->dev->ifindex, ifp->prefix_len, ifp->scope, - ifp->flags, + (u8) ifp->flags, ifp->idev->dev->name); return 0; } @@ -2877,16 +3516,16 @@ static const struct file_operations if6_fops = { .release = seq_release_net, }; -static int if6_proc_net_init(struct net *net) +static int __net_init if6_proc_net_init(struct net *net) { - if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops)) + if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops)) return -ENOMEM; return 0; } -static void if6_proc_net_exit(struct net *net) +static void __net_exit if6_proc_net_exit(struct net *net) { - proc_net_remove(net, "if_inet6"); + remove_proc_entry("if_inet6", net->proc_net); } static struct pernet_operations if6_proc_net_ops = { @@ -2905,24 +3544,25 @@ void if6_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) /* Check if address is a home address configured on any interface. */ -int ipv6_chk_home_addr(struct net *net, struct in6_addr *addr) +int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) { int ret = 0; - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); - read_lock_bh(&addrconf_hash_lock); - for (ifp = inet6_addr_lst[hash]; ifp; ifp = ifp->lst_next) { - if (ifp->idev->dev->nd_net != net) + struct inet6_ifaddr *ifp = NULL; + unsigned int hash = inet6_addr_hash(addr); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) continue; - if (ipv6_addr_cmp(&ifp->addr, addr) == 0 && + if (ipv6_addr_equal(&ifp->addr, addr) && (ifp->flags & IFA_F_HOMEADDRESS)) { ret = 1; break; } } - read_unlock_bh(&addrconf_hash_lock); + rcu_read_unlock_bh(); return ret; } #endif @@ -2931,52 +3571,48 @@ int ipv6_chk_home_addr(struct net *net, struct in6_addr *addr) * Periodic address status verification */ -static void addrconf_verify(unsigned long foo) +static void addrconf_verify_rtnl(void) { + unsigned long now, next, next_sec, next_sched; struct inet6_ifaddr *ifp; - unsigned long now, next; int i; - spin_lock_bh(&addrconf_verify_lock); - now = jiffies; - next = now + ADDR_CHECK_FREQUENCY; + ASSERT_RTNL(); - del_timer(&addr_chk_timer); + rcu_read_lock_bh(); + now = jiffies; + next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); - for (i=0; i < IN6_ADDR_HSIZE; i++) { + cancel_delayed_work(&addr_chk_work); + for (i = 0; i < IN6_ADDR_HSIZE; i++) { restart: - read_lock(&addrconf_hash_lock); - for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) { unsigned long age; -#ifdef CONFIG_IPV6_PRIVACY - unsigned long regen_advance; -#endif - if (ifp->flags & IFA_F_PERMANENT) + /* When setting preferred_lft to a value not zero or + * infinity, while valid_lft is infinity + * IFA_F_PERMANENT has a non-infinity life time. + */ + if ((ifp->flags & IFA_F_PERMANENT) && + (ifp->prefered_lft == INFINITY_LIFE_TIME)) continue; spin_lock(&ifp->lock); - age = (now - ifp->tstamp) / HZ; - -#ifdef CONFIG_IPV6_PRIVACY - regen_advance = ifp->idev->cnf.regen_max_retry * - ifp->idev->cnf.dad_transmits * - ifp->idev->nd_parms->retrans_time / HZ; -#endif + /* We try to batch several events at once. */ + age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifp->valid_lft != INFINITY_LIFE_TIME && age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); - read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); goto restart; } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { spin_unlock(&ifp->lock); continue; } else if (age >= ifp->prefered_lft) { - /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */ + /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */ int deprecate = 0; if (!(ifp->flags&IFA_F_DEPRECATED)) { @@ -2984,22 +3620,25 @@ restart: ifp->flags |= IFA_F_DEPRECATED; } - if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)) + if ((ifp->valid_lft != INFINITY_LIFE_TIME) && + (time_before(ifp->tstamp + ifp->valid_lft * HZ, next))) next = ifp->tstamp + ifp->valid_lft * HZ; spin_unlock(&ifp->lock); if (deprecate) { in6_ifa_hold(ifp); - read_unlock(&addrconf_hash_lock); ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); goto restart; } -#ifdef CONFIG_IPV6_PRIVACY } else if ((ifp->flags&IFA_F_TEMPORARY) && !(ifp->flags&IFA_F_TENTATIVE)) { + unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * + ifp->idev->cnf.dad_transmits * + NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ; + if (age >= ifp->prefered_lft - regen_advance) { struct inet6_ifaddr *ifpub = ifp->ifpub; if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) @@ -3009,7 +3648,7 @@ restart: in6_ifa_hold(ifp); in6_ifa_hold(ifpub); spin_unlock(&ifp->lock); - read_unlock(&addrconf_hash_lock); + spin_lock(&ifpub->lock); ifpub->regen_count = 0; spin_unlock(&ifpub->lock); @@ -3021,7 +3660,6 @@ restart: } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; spin_unlock(&ifp->lock); -#endif } else { /* ifp->prefered_lft <= ifp->valid_lft */ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) @@ -3029,26 +3667,51 @@ restart: spin_unlock(&ifp->lock); } } - read_unlock(&addrconf_hash_lock); } - addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next; - add_timer(&addr_chk_timer); - spin_unlock_bh(&addrconf_verify_lock); + next_sec = round_jiffies_up(next); + next_sched = next; + + /* If rounded timeout is accurate enough, accept it. */ + if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) + next_sched = next_sec; + + /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ + if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) + next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; + + ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched); + mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now); + rcu_read_unlock_bh(); +} + +static void addrconf_verify_work(struct work_struct *w) +{ + rtnl_lock(); + addrconf_verify_rtnl(); + rtnl_unlock(); +} + +static void addrconf_verify(void) +{ + mod_delayed_work(addrconf_wq, &addr_chk_work, 0); } -static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local) +static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local, + struct in6_addr **peer_pfx) { struct in6_addr *pfx = NULL; + *peer_pfx = NULL; + if (addr) pfx = nla_data(addr); if (local) { if (pfx && nla_memcmp(local, pfx, sizeof(*pfx))) - pfx = NULL; - else - pfx = nla_data(local); + *peer_pfx = pfx; + pfx = nla_data(local); } return pfx; @@ -3058,54 +3721,81 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_ADDRESS] = { .len = sizeof(struct in6_addr) }, [IFA_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, + [IFA_FLAGS] = { .len = sizeof(u32) }, }; static int -inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *pfx; + struct in6_addr *pfx, *peer_pfx; + u32 ifa_flags; int err; - if (net != &init_net) - return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); if (err < 0) return err; ifm = nlmsg_data(nlh); - pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); if (pfx == NULL) return -EINVAL; - return inet6_addr_del(ifm->ifa_index, pfx, ifm->ifa_prefixlen); + ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + + /* We ignore other flags so far. */ + ifa_flags &= IFA_F_MANAGETEMPADDR; + + return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, + ifm->ifa_prefixlen); } -static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, +static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, u32 prefered_lft, u32 valid_lft) { - u32 flags = RTF_EXPIRES; + u32 flags; + clock_t expires; + unsigned long timeout; + bool was_managetempaddr; + bool had_prefixroute; + + ASSERT_RTNL(); if (!valid_lft || (prefered_lft > valid_lft)) return -EINVAL; - if (valid_lft == INFINITY_LIFE_TIME) { - ifa_flags |= IFA_F_PERMANENT; + if (ifa_flags & IFA_F_MANAGETEMPADDR && + (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) + return -EINVAL; + + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft = timeout; + flags = RTF_EXPIRES; + } else { + expires = 0; flags = 0; - } else if (valid_lft >= 0x7FFFFFFF/HZ) - valid_lft = 0x7FFFFFFF/HZ; + ifa_flags |= IFA_F_PERMANENT; + } - if (prefered_lft == 0) - ifa_flags |= IFA_F_DEPRECATED; - else if ((prefered_lft >= 0x7FFFFFFF/HZ) && - (prefered_lft != INFINITY_LIFE_TIME)) - prefered_lft = 0x7FFFFFFF/HZ; + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } spin_lock_bh(&ifp->lock); - ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags; + was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR; + had_prefixroute = ifp->flags & IFA_F_PERMANENT && + !(ifp->flags & IFA_F_NOPREFIXROUTE); + ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | + IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | + IFA_F_NOPREFIXROUTE); + ifp->flags |= ifa_flags; ifp->tstamp = jiffies; ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; @@ -3114,35 +3804,54 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, if (!(ifp->flags&IFA_F_TENTATIVE)) ipv6_ifa_notify(0, ifp); - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - jiffies_to_clock_t(valid_lft * HZ), flags); - addrconf_verify(0); + if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, + expires, flags); + } else if (had_prefixroute) { + enum cleanup_prefix_rt_t action; + unsigned long rt_expires; + + write_lock_bh(&ifp->idev->lock); + action = check_cleanup_prefix_route(ifp, &rt_expires); + write_unlock_bh(&ifp->idev->lock); + + if (action != CLEANUP_PREFIX_RT_NOP) { + cleanup_prefix_route(ifp, rt_expires, + action == CLEANUP_PREFIX_RT_DEL); + } + } + + if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { + if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) + valid_lft = prefered_lft = 0; + manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft, + !was_managetempaddr, jiffies); + } + + addrconf_verify_rtnl(); return 0; } static int -inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *pfx; + struct in6_addr *pfx, *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; - u8 ifa_flags; + u32 ifa_flags; int err; - if (net != &init_net) - return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); if (err < 0) return err; ifm = nlmsg_data(nlh); - pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); if (pfx == NULL) return -EINVAL; @@ -3157,21 +3866,25 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) valid_lft = INFINITY_LIFE_TIME; } - dev = __dev_get_by_index(&init_net, ifm->ifa_index); + dev = __dev_get_by_index(net, ifm->ifa_index); if (dev == NULL) return -ENODEV; + ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + /* We ignore other flags so far. */ - ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS); + ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | + IFA_F_NOPREFIXROUTE; ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (ifa == NULL) { /* * It would be best to check for !NLM_F_CREATE here but - * userspace alreay relies on not having to provide this. + * userspace already relies on not having to provide this. */ - return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen, - ifa_flags, preferred_lft, valid_lft); + return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, + ifm->ifa_prefixlen, ifa_flags, + preferred_lft, valid_lft); } if (nlh->nlmsg_flags & NLM_F_EXCL || @@ -3185,7 +3898,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return err; } -static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags, +static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u32 flags, u8 scope, int ifindex) { struct ifaddrmsg *ifm; @@ -3203,10 +3916,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, { struct ifa_cacheinfo ci; - ci.cstamp = (u32)(TIME_DELTA(cstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(cstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.tstamp = (u32)(TIME_DELTA(tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; @@ -3228,48 +3939,70 @@ static inline int rt_scope(int ifa_scope) static inline int inet6_ifaddr_msgsize(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(16) /* IFA_LOCAL */ + nla_total_size(16) /* IFA_ADDRESS */ - + nla_total_size(sizeof(struct ifa_cacheinfo)); + + nla_total_size(sizeof(struct ifa_cacheinfo)) + + nla_total_size(4) /* IFA_FLAGS */; } static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, - u32 pid, u32 seq, int event, unsigned int flags) + u32 portid, u32 seq, int event, unsigned int flags) { struct nlmsghdr *nlh; u32 preferred, valid; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); if (nlh == NULL) return -EMSGSIZE; put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), ifa->idev->dev->ifindex); - if (!(ifa->flags&IFA_F_PERMANENT)) { + if (!((ifa->flags&IFA_F_PERMANENT) && + (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; valid = ifa->valid_lft; if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - ifa->tstamp)/HZ; - preferred -= tval; - if (valid != INFINITY_LIFE_TIME) - valid -= tval; + if (preferred > tval) + preferred -= tval; + else + preferred = 0; + if (valid != INFINITY_LIFE_TIME) { + if (valid > tval) + valid -= tval; + else + valid = 0; + } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } - if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 || - put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } + if (!ipv6_addr_any(&ifa->peer_addr)) { + if (nla_put(skb, IFA_LOCAL, 16, &ifa->addr) < 0 || + nla_put(skb, IFA_ADDRESS, 16, &ifa->peer_addr) < 0) + goto error; + } else + if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0) + goto error; + + if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) + goto error; + + if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) + goto error; return nlmsg_end(skb, nlh); + +error: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; } static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, - u32 pid, u32 seq, int event, u16 flags) + u32 portid, u32 seq, int event, u16 flags) { struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; @@ -3278,7 +4011,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3294,7 +4027,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, } static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, - u32 pid, u32 seq, int event, unsigned int flags) + u32 portid, u32 seq, int event, unsigned int flags) { struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; @@ -3303,7 +4036,7 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3318,148 +4051,164 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, return nlmsg_end(skb, nlh); } -enum addr_type_t -{ +enum addr_type_t { UNICAST_ADDR, MULTICAST_ADDR, ANYCAST_ADDR, }; +/* called with rcu_read_lock() */ +static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, + struct netlink_callback *cb, enum addr_type_t type, + int s_ip_idx, int *p_ip_idx) +{ + struct ifmcaddr6 *ifmca; + struct ifacaddr6 *ifaca; + int err = 1; + int ip_idx = *p_ip_idx; + + read_lock_bh(&idev->lock); + switch (type) { + case UNICAST_ADDR: { + struct inet6_ifaddr *ifa; + + /* unicast address incl. temp addr */ + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (++ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR, + NLM_F_MULTI); + if (err <= 0) + break; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + } + break; + } + case MULTICAST_ADDR: + /* multicast address */ + for (ifmca = idev->mc_list; ifmca; + ifmca = ifmca->next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_GETMULTICAST, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + case ANYCAST_ADDR: + /* anycast address */ + for (ifaca = idev->ac_list; ifaca; + ifaca = ifaca->aca_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_GETANYCAST, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + default: + break; + } + read_unlock_bh(&idev->lock); + *p_ip_idx = ip_idx; + return err; +} + static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type) { + struct net *net = sock_net(skb->sk); + int h, s_h; int idx, ip_idx; int s_idx, s_ip_idx; - int err = 1; struct net_device *dev; - struct inet6_dev *idev = NULL; - struct inet6_ifaddr *ifa; - struct ifmcaddr6 *ifmca; - struct ifacaddr6 *ifaca; + struct inet6_dev *idev; + struct hlist_head *head; - s_idx = cb->args[0]; - s_ip_idx = ip_idx = cb->args[1]; + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + s_ip_idx = ip_idx = cb->args[2]; - idx = 0; - for_each_netdev(&init_net, dev) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - s_ip_idx = 0; - ip_idx = 0; - if ((idev = in6_dev_get(dev)) == NULL) - goto cont; - read_lock_bh(&idev->lock); - switch (type) { - case UNICAST_ADDR: - /* unicast address incl. temp addr */ - for (ifa = idev->addr_list; ifa; - ifa = ifa->if_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - err = inet6_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_NEWADDR, - NLM_F_MULTI); - } - break; - case MULTICAST_ADDR: - /* multicast address */ - for (ifmca = idev->mc_list; ifmca; - ifmca = ifmca->next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - err = inet6_fill_ifmcaddr(skb, ifmca, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_GETMULTICAST, - NLM_F_MULTI); - } - break; - case ANYCAST_ADDR: - /* anycast address */ - for (ifaca = idev->ac_list; ifaca; - ifaca = ifaca->aca_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - err = inet6_fill_ifacaddr(skb, ifaca, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_GETANYCAST, - NLM_F_MULTI); - } - break; - default: - break; - } - read_unlock_bh(&idev->lock); - in6_dev_put(idev); + rcu_read_lock(); + cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq; + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (h > s_h || idx > s_idx) + s_ip_idx = 0; + ip_idx = 0; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; - if (err <= 0) - break; + if (in6_dump_addrs(idev, skb, cb, type, + s_ip_idx, &ip_idx) <= 0) + goto done; cont: - idx++; + idx++; + } } - cb->args[0] = idx; - cb->args[1] = ip_idx; +done: + rcu_read_unlock(); + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = ip_idx; + return skb->len; } static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; enum addr_type_t type = UNICAST_ADDR; - if (net != &init_net) - return 0; - return inet6_dump_addr(skb, cb, type); } static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; enum addr_type_t type = MULTICAST_ADDR; - if (net != &init_net) - return 0; - return inet6_dump_addr(skb, cb, type); } static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; enum addr_type_t type = ANYCAST_ADDR; - if (net != &init_net) - return 0; - return inet6_dump_addr(skb, cb, type); } -static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, - void *arg) +static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) { - struct net *net = in_skb->sk->sk_net; + struct net *net = sock_net(in_skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *addr = NULL; + struct in6_addr *addr = NULL, *peer; struct net_device *dev = NULL; struct inet6_ifaddr *ifa; struct sk_buff *skb; int err; - if (net != &init_net) - return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); if (err < 0) goto errout; - addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); if (addr == NULL) { err = -EINVAL; goto errout; @@ -3467,19 +4216,21 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = __dev_get_by_index(&init_net, ifm->ifa_index); + dev = __dev_get_by_index(net, ifm->ifa_index); - if ((ifa = ipv6_get_ifaddr(net, addr, dev, 1)) == NULL) { + ifa = ipv6_get_ifaddr(net, addr, dev, 1); + if (!ifa) { err = -EADDRNOTAVAIL; goto errout; } - if ((skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL)) == NULL) { + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL); + if (!skb) { err = -ENOBUFS; goto errout_ifa; } - err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid, + err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWADDR, 0); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ @@ -3487,7 +4238,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, kfree_skb(skb); goto errout_ifa; } - err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_ifa: in6_ifa_put(ifa); errout: @@ -3497,6 +4248,7 @@ errout: static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) { struct sk_buff *skb; + struct net *net = dev_net(ifa->idev->dev); int err = -ENOBUFS; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); @@ -3510,10 +4262,11 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; errout: if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); } static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, @@ -3530,22 +4283,27 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_AUTOCONF] = cnf->autoconf; array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; - array[DEVCONF_RTR_SOLICIT_INTERVAL] = cnf->rtr_solicit_interval; - array[DEVCONF_RTR_SOLICIT_DELAY] = cnf->rtr_solicit_delay; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_DELAY] = + jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; -#ifdef CONFIG_IPV6_PRIVACY + array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval); + array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval); array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; -#endif array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; - array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval; + array[DEVCONF_RTR_PROBE_INTERVAL] = + jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif @@ -3555,6 +4313,24 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #ifdef CONFIG_IPV6_OPTIMISTIC_DAD array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; #endif +#ifdef CONFIG_IPV6_MROUTE + array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; +#endif + array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; + array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; + array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; + array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; + array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; +} + +static inline size_t inet6_ifla6_size(void) +{ + return nla_total_size(4) /* IFLA_INET6_FLAGS */ + + nla_total_size(sizeof(struct ifla_cacheinfo)) + + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ + + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */ } static inline size_t inet6_if_nlmsg_size(void) @@ -3564,17 +4340,11 @@ static inline size_t inet6_if_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ - + nla_total_size( /* IFLA_PROTINFO */ - nla_total_size(4) /* IFLA_INET6_FLAGS */ - + nla_total_size(sizeof(struct ifla_cacheinfo)) - + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ - + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ - + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ - ); + + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */ } -static inline void __snmp6_fill_stats(u64 *stats, void **mib, int items, - int bytes) +static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, + int items, int bytes) { int i; int pad = bytes - sizeof(u64) * items; @@ -3583,7 +4353,22 @@ static inline void __snmp6_fill_stats(u64 *stats, void **mib, int items, /* Use put_unaligned() because stats may not be aligned for u64. */ put_unaligned(items, &stats[0]); for (i = 1; i < items; i++) - put_unaligned(snmp_fold_field(mib, i), &stats[i]); + put_unaligned(atomic_long_read(&mib[i]), &stats[i]); + + memset(&stats[items], 0, pad); +} + +static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, + int items, int bytes, size_t syncpoff) +{ + int i; + int pad = bytes - sizeof(u64) * items; + BUG_ON(pad < 0); + + /* Use put_unaligned() because stats may not be aligned for u64. */ + put_unaligned(items, &stats[0]); + for (i = 1; i < items; i++) + put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); memset(&stats[items], 0, pad); } @@ -3591,27 +4376,169 @@ static inline void __snmp6_fill_stats(u64 *stats, void **mib, int items, static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, int bytes) { - switch(attrtype) { + switch (attrtype) { case IFLA_INET6_STATS: - __snmp6_fill_stats(stats, (void **)idev->stats.ipv6, IPSTATS_MIB_MAX, bytes); + __snmp6_fill_stats64(stats, idev->stats.ipv6, + IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: - __snmp6_fill_stats(stats, (void **)idev->stats.icmpv6, ICMP6_MIB_MAX, bytes); + __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); break; } } +static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) +{ + struct nlattr *nla; + struct ifla_cacheinfo ci; + + if (nla_put_u32(skb, IFLA_INET6_FLAGS, idev->if_flags)) + goto nla_put_failure; + ci.max_reasm_len = IPV6_MAXPLEN; + ci.tstamp = cstamp_delta(idev->tstamp); + ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); + ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME)); + if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci)) + goto nla_put_failure; + nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); + if (nla == NULL) + goto nla_put_failure; + ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); + + /* XXX - MC not implemented */ + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); + if (nla == NULL) + goto nla_put_failure; + read_lock_bh(&idev->lock); + memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); + read_unlock_bh(&idev->lock); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static size_t inet6_get_link_af_size(const struct net_device *dev) +{ + if (!__in6_dev_get(dev)) + return 0; + + return inet6_ifla6_size(); +} + +static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (!idev) + return -ENODATA; + + if (inet6_fill_ifla6_attrs(skb, idev) < 0) + return -EMSGSIZE; + + return 0; +} + +static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) +{ + struct inet6_ifaddr *ifp; + struct net_device *dev = idev->dev; + bool update_rs = false; + struct in6_addr ll_addr; + + ASSERT_RTNL(); + + if (token == NULL) + return -EINVAL; + if (ipv6_addr_any(token)) + return -EINVAL; + if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) + return -EINVAL; + if (!ipv6_accept_ra(idev)) + return -EINVAL; + if (idev->cnf.rtr_solicits <= 0) + return -EINVAL; + + write_lock_bh(&idev->lock); + + BUILD_BUG_ON(sizeof(token->s6_addr) != 16); + memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8); + + write_unlock_bh(&idev->lock); + + if (!idev->dead && (idev->if_flags & IF_READY) && + !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE | + IFA_F_OPTIMISTIC)) { + + /* If we're not ready, then normal ifup will take care + * of this. Otherwise, we need to request our rs here. + */ + ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters); + update_rs = true; + } + + write_lock_bh(&idev->lock); + + if (update_rs) { + idev->if_flags |= IF_RS_SENT; + idev->rs_probes = 1; + addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); + } + + /* Well, that's kinda nasty ... */ + list_for_each_entry(ifp, &idev->addr_list, if_list) { + spin_lock(&ifp->lock); + if (ifp->tokenized) { + ifp->valid_lft = 0; + ifp->prefered_lft = 0; + } + spin_unlock(&ifp->lock); + } + + write_unlock_bh(&idev->lock); + addrconf_verify_rtnl(); + return 0; +} + +static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) +{ + int err = -EINVAL; + struct inet6_dev *idev = __in6_dev_get(dev); + struct nlattr *tb[IFLA_INET6_MAX + 1]; + + if (!idev) + return -EAFNOSUPPORT; + + if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) + BUG(); + + if (tb[IFLA_INET6_TOKEN]) + err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN])); + + return err; +} + static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, - u32 pid, u32 seq, int event, unsigned int flags) + u32 portid, u32 seq, int event, unsigned int flags) { struct net_device *dev = idev->dev; - struct nlattr *nla; struct ifinfomsg *hdr; struct nlmsghdr *nlh; void *protoinfo; - struct ifla_cacheinfo ci; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3623,44 +4550,19 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, hdr->ifi_flags = dev_get_flags(dev); hdr->ifi_change = 0; - NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); - - if (dev->addr_len) - NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); - - NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); - if (dev->ifindex != dev->iflink) - NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); - + if (nla_put_string(skb, IFLA_IFNAME, dev->name) || + (dev->addr_len && + nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || + nla_put_u32(skb, IFLA_MTU, dev->mtu) || + (dev->ifindex != dev->iflink && + nla_put_u32(skb, IFLA_LINK, dev->iflink))) + goto nla_put_failure; protoinfo = nla_nest_start(skb, IFLA_PROTINFO); if (protoinfo == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags); - - ci.max_reasm_len = IPV6_MAXPLEN; - ci.tstamp = (__u32)(TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.reachable_time = idev->nd_parms->reachable_time; - ci.retrans_time = idev->nd_parms->retrans_time; - NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); - - nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); - if (nla == NULL) - goto nla_put_failure; - ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); - - /* XXX - MC not implemented */ - - nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (nla == NULL) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); - - nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (nla == NULL) + if (inet6_fill_ifla6_attrs(skb, idev) < 0) goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); nla_nest_end(skb, protoinfo); return nlmsg_end(skb, nlh); @@ -3672,32 +4574,39 @@ nla_put_failure: static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; - int idx, err; - int s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx = 0, s_idx; struct net_device *dev; struct inet6_dev *idev; + struct hlist_head *head; - if (net != &init_net) - return 0; + s_h = cb->args[0]; + s_idx = cb->args[1]; - read_lock(&dev_base_lock); - idx = 0; - for_each_netdev(&init_net, dev) { - if (idx < s_idx) - goto cont; - if ((idev = in6_dev_get(dev)) == NULL) - goto cont; - err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI); - in6_dev_put(idev); - if (err <= 0) - break; + rcu_read_lock(); + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + if (inet6_fill_ifinfo(skb, idev, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWLINK, NLM_F_MULTI) <= 0) + goto out; cont: - idx++; + idx++; + } } - read_unlock(&dev_base_lock); - cb->args[0] = idx; +out: + rcu_read_unlock(); + cb->args[1] = idx; + cb->args[0] = h; return skb->len; } @@ -3705,6 +4614,7 @@ cont: void inet6_ifinfo_notify(int event, struct inet6_dev *idev) { struct sk_buff *skb; + struct net *net = dev_net(idev->dev); int err = -ENOBUFS; skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC); @@ -3718,10 +4628,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); + return; errout: if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); } static inline size_t inet6_prefix_nlmsg_size(void) @@ -3732,14 +4643,14 @@ static inline size_t inet6_prefix_nlmsg_size(void) } static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, - struct prefix_info *pinfo, u32 pid, u32 seq, + struct prefix_info *pinfo, u32 portid, u32 seq, int event, unsigned int flags) { struct prefixmsg *pmsg; struct nlmsghdr *nlh; struct prefix_cacheinfo ci; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*pmsg), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3757,12 +4668,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, if (pinfo->autoconf) pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; - NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix); - + if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix)) + goto nla_put_failure; ci.preferred_time = ntohl(pinfo->prefered); ci.valid_time = ntohl(pinfo->valid); - NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci); - + if (nla_put(skb, PREFIX_CACHEINFO, sizeof(ci), &ci)) + goto nla_put_failure; return nlmsg_end(skb, nlh); nla_put_failure: @@ -3774,6 +4685,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo) { struct sk_buff *skb; + struct net *net = dev_net(idev->dev); int err = -ENOBUFS; skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC); @@ -3787,14 +4699,20 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + return; errout: if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_PREFIX, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); } static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) { + struct net *net = dev_net(ifp->idev->dev); + + if (event) + ASSERT_RTNL(); + inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); switch (event) { @@ -3809,16 +4727,34 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); + if (!ipv6_addr_any(&ifp->peer_addr)) + addrconf_prefix_route(&ifp->peer_addr, 128, + ifp->idev->dev, 0, 0); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); - dst_hold(&ifp->rt->u.dst); + if (!ipv6_addr_any(&ifp->peer_addr)) { + struct rt6_info *rt; + struct net_device *dev = ifp->idev->dev; + + rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL, + dev->ifindex, 1); + if (rt) { + dst_hold(&rt->dst); + if (ip6_del_rt(rt)) + dst_free(&rt->dst); + } + } + dst_hold(&ifp->rt->dst); + if (ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->u.dst); + dst_free(&ifp->rt->dst); break; } + atomic_inc(&net->ipv6.dev_addr_genid); + rt_genid_bump_ipv6(net); } static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -3832,333 +4768,437 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL static -int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, +int addrconf_sysctl_forward(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; int ret; - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + /* + * ctl->data points to idev->cnf.forwarding, we should + * not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); if (write) - addrconf_fixup_forwarding(ctl, valp, val); + ret = addrconf_fixup_forwarding(ctl, valp, val); + if (ret) + *ppos = pos; return ret; } -static int addrconf_sysctl_forward_strategy(ctl_table *table, - int __user *name, int nlen, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, size_t newlen) +static void dev_disable_change(struct inet6_dev *idev) { - int *valp = table->data; - int val = *valp; - int new; + struct netdev_notifier_info info; - if (!newval || !newlen) - return 0; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - if (new == *valp) + if (!idev || !idev->dev) + return; + + netdev_notifier_info_init(&info, idev->dev); + if (idev->cnf.disable_ipv6) + addrconf_notify(NULL, NETDEV_DOWN, &info); + else + addrconf_notify(NULL, NETDEV_UP, &info); +} + +static void addrconf_disable_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.disable_ipv6) ^ (!newf); + idev->cnf.disable_ipv6 = newf; + if (changed) + dev_disable_change(idev); + } + } + rcu_read_unlock(); +} + +static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->disable_ipv6) { + rtnl_unlock(); return 0; - if (oldval && oldlenp) { - size_t len; - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, valp, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; + } + + if (p == &net->ipv6.devconf_all->disable_ipv6) { + net->ipv6.devconf_dflt->disable_ipv6 = newf; + addrconf_disable_change(net, newf); + } else if ((!newf) ^ (!old)) + dev_disable_change((struct inet6_dev *)table->extra1); + + rtnl_unlock(); + return 0; +} + +static +int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + /* + * ctl->data points to idev->cnf.disable_ipv6, we should + * not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_disable_ipv6(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + +static +int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int ret; + int old, new; + + old = *valp; + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + new = *valp; + + if (write && old != new) { + struct net *net = ctl->extra2; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (valp == &net->ipv6.devconf_dflt->proxy_ndp) + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + else if (valp == &net->ipv6.devconf_all->proxy_ndp) + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + else { + struct inet6_dev *idev = ctl->extra1; + + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + idev->dev->ifindex, + &idev->cnf); } + rtnl_unlock(); } - *valp = new; - addrconf_fixup_forwarding(table, valp, val); - return 1; + return ret; } + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table addrconf_vars[__NET_IPV6_MAX]; - char *dev_name; + struct ctl_table addrconf_vars[DEVCONF_MAX+1]; } addrconf_sysctl __read_mostly = { .sysctl_header = NULL, .addrconf_vars = { { - .ctl_name = NET_IPV6_FORWARDING, - .procname = "forwarding", - .data = &ipv6_devconf.forwarding, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &addrconf_sysctl_forward, - .strategy = &addrconf_sysctl_forward_strategy, + .procname = "forwarding", + .data = &ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_forward, }, { - .ctl_name = NET_IPV6_HOP_LIMIT, - .procname = "hop_limit", - .data = &ipv6_devconf.hop_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "hop_limit", + .data = &ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_MTU, - .procname = "mtu", - .data = &ipv6_devconf.mtu6, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "mtu", + .data = &ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ACCEPT_RA, - .procname = "accept_ra", - .data = &ipv6_devconf.accept_ra, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "accept_ra", + .data = &ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ACCEPT_REDIRECTS, - .procname = "accept_redirects", - .data = &ipv6_devconf.accept_redirects, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "accept_redirects", + .data = &ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_AUTOCONF, - .procname = "autoconf", - .data = &ipv6_devconf.autoconf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "autoconf", + .data = &ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_DAD_TRANSMITS, - .procname = "dad_transmits", - .data = &ipv6_devconf.dad_transmits, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "dad_transmits", + .data = &ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_RTR_SOLICITS, - .procname = "router_solicitations", - .data = &ipv6_devconf.rtr_solicits, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "router_solicitations", + .data = &ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL, - .procname = "router_solicitation_interval", - .data = &ipv6_devconf.rtr_solicit_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .procname = "router_solicitation_interval", + .data = &ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY, - .procname = "router_solicitation_delay", - .data = &ipv6_devconf.rtr_solicit_delay, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .procname = "router_solicitation_delay", + .data = &ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV6_FORCE_MLD_VERSION, - .procname = "force_mld_version", - .data = &ipv6_devconf.force_mld_version, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "force_mld_version", + .data = &ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, -#ifdef CONFIG_IPV6_PRIVACY { - .ctl_name = NET_IPV6_USE_TEMPADDR, - .procname = "use_tempaddr", - .data = &ipv6_devconf.use_tempaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "mldv1_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv1_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, }, { - .ctl_name = NET_IPV6_TEMP_VALID_LFT, - .procname = "temp_valid_lft", - .data = &ipv6_devconf.temp_valid_lft, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "mldv2_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv2_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, }, { - .ctl_name = NET_IPV6_TEMP_PREFERED_LFT, - .procname = "temp_prefered_lft", - .data = &ipv6_devconf.temp_prefered_lft, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "use_tempaddr", + .data = &ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_REGEN_MAX_RETRY, - .procname = "regen_max_retry", - .data = &ipv6_devconf.regen_max_retry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "temp_valid_lft", + .data = &ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR, - .procname = "max_desync_factor", - .data = &ipv6_devconf.max_desync_factor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "temp_prefered_lft", + .data = &ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "regen_max_retry", + .data = &ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, -#endif { - .ctl_name = NET_IPV6_MAX_ADDRESSES, - .procname = "max_addresses", - .data = &ipv6_devconf.max_addresses, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "max_desync_factor", + .data = &ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ACCEPT_RA_DEFRTR, - .procname = "accept_ra_defrtr", - .data = &ipv6_devconf.accept_ra_defrtr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "max_addresses", + .data = &ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ACCEPT_RA_PINFO, - .procname = "accept_ra_pinfo", - .data = &ipv6_devconf.accept_ra_pinfo, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "accept_ra_defrtr", + .data = &ipv6_devconf.accept_ra_defrtr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_pinfo", + .data = &ipv6_devconf.accept_ra_pinfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_IPV6_ROUTER_PREF { - .ctl_name = NET_IPV6_ACCEPT_RA_RTR_PREF, - .procname = "accept_ra_rtr_pref", - .data = &ipv6_devconf.accept_ra_rtr_pref, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "accept_ra_rtr_pref", + .data = &ipv6_devconf.accept_ra_rtr_pref, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_RTR_PROBE_INTERVAL, - .procname = "router_probe_interval", - .data = &ipv6_devconf.rtr_probe_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .procname = "router_probe_interval", + .data = &ipv6_devconf.rtr_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_IPV6_ROUTE_INFO { - .ctl_name = NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, - .procname = "accept_ra_rt_info_max_plen", - .data = &ipv6_devconf.accept_ra_rt_info_max_plen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "accept_ra_rt_info_max_plen", + .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #endif #endif { - .ctl_name = NET_IPV6_PROXY_NDP, - .procname = "proxy_ndp", - .data = &ipv6_devconf.proxy_ndp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "proxy_ndp", + .data = &ipv6_devconf.proxy_ndp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_proxy_ndp, }, { - .ctl_name = NET_IPV6_ACCEPT_SOURCE_ROUTE, - .procname = "accept_source_route", - .data = &ipv6_devconf.accept_source_route, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "accept_source_route", + .data = &ipv6_devconf.accept_source_route, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_IPV6_OPTIMISTIC_DAD { - .ctl_name = CTL_UNNUMBERED, - .procname = "optimistic_dad", - .data = &ipv6_devconf.optimistic_dad, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, + .procname = "optimistic_dad", + .data = &ipv6_devconf.optimistic_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_IPV6_MROUTE + { + .procname = "mc_forwarding", + .data = &ipv6_devconf.mc_forwarding, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif { - .ctl_name = 0, /* sentinel */ + .procname = "disable_ipv6", + .data = &ipv6_devconf.disable_ipv6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable, + }, + { + .procname = "accept_dad", + .data = &ipv6_devconf.accept_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "force_tllao", + .data = &ipv6_devconf.force_tllao, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ndisc_notify", + .data = &ipv6_devconf.ndisc_notify, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "suppress_frag_ndisc", + .data = &ipv6_devconf.suppress_frag_ndisc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + /* sentinel */ } }, }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, - int ctl_name, struct inet6_dev *idev, struct ipv6_devconf *p) + struct inet6_dev *idev, struct ipv6_devconf *p) { int i; struct addrconf_sysctl_table *t; - -#define ADDRCONF_CTL_PATH_DEV 3 - - struct ctl_path addrconf_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv6", .ctl_name = NET_IPV6, }, - { .procname = "conf", .ctl_name = NET_IPV6_CONF, }, - { /* to be set */ }, - { }, - }; - + char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) goto out; - for (i=0; t->addrconf_vars[i].data; i++) { - t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; + for (i = 0; t->addrconf_vars[i].data; i++) { + t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf; t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ t->addrconf_vars[i].extra2 = net; } - /* - * Make a copy of dev_name, because '.procname' is regarded as const - * by sysctl and we wouldn't want anyone to change it under our feet - * (see SIOCSIFNAME). - */ - t->dev_name = kstrdup(dev_name, GFP_KERNEL); - if (!t->dev_name) - goto free; + snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); - addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name; - addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].ctl_name = ctl_name; - - t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path, - t->addrconf_vars); + t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars); if (t->sysctl_header == NULL) - goto free_procname; + goto free; p->sysctl = t; return 0; -free_procname: - kfree(t->dev_name); free: kfree(t); out: @@ -4174,19 +5214,16 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) t = p->sysctl; p->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); - kfree(t->dev_name); + unregister_net_sysctl_table(t->sysctl_header); kfree(t); } static void addrconf_sysctl_register(struct inet6_dev *idev) { - neigh_sysctl_register(idev->dev, idev->nd_parms, NET_IPV6, - NET_IPV6_NEIGH, "ipv6", - &ndisc_ifinfo_sysctl_change, - NULL); - __addrconf_sysctl_register(idev->dev->nd_net, idev->dev->name, - idev->dev->ifindex, idev, &idev->cnf); + neigh_sysctl_register(idev->dev, idev->nd_parms, + &ndisc_ifinfo_sysctl_change); + __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, + idev, &idev->cnf); } static void addrconf_sysctl_unregister(struct inet6_dev *idev) @@ -4198,36 +5235,32 @@ static void addrconf_sysctl_unregister(struct inet6_dev *idev) #endif -static int addrconf_init_net(struct net *net) +static int __net_init addrconf_init_net(struct net *net) { - int err; + int err = -ENOMEM; struct ipv6_devconf *all, *dflt; - err = -ENOMEM; - all = &ipv6_devconf; - dflt = &ipv6_devconf_dflt; + all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; - if (net != &init_net) { - all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); - if (all == NULL) - goto err_alloc_all; + dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; - dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); - if (dflt == NULL) - goto err_alloc_dflt; - } + /* these will be inherited by all namespaces */ + dflt->autoconf = ipv6_defaults.autoconf; + dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; net->ipv6.devconf_all = all; net->ipv6.devconf_dflt = dflt; #ifdef CONFIG_SYSCTL - err = __addrconf_sysctl_register(net, "all", NET_PROTO_CONF_ALL, - NULL, all); + err = __addrconf_sysctl_register(net, "all", NULL, all); if (err < 0) goto err_reg_all; - err = __addrconf_sysctl_register(net, "default", NET_PROTO_CONF_DEFAULT, - NULL, dflt); + err = __addrconf_sysctl_register(net, "default", NULL, dflt); if (err < 0) goto err_reg_dflt; #endif @@ -4245,13 +5278,13 @@ err_alloc_all: return err; } -static void addrconf_exit_net(struct net *net) +static void __net_exit addrconf_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL __addrconf_sysctl_unregister(net->ipv6.devconf_dflt); __addrconf_sysctl_unregister(net->ipv6.devconf_all); #endif - if (net != &init_net) { + if (!net_eq(net, &init_net)) { kfree(net->ipv6.devconf_dflt); kfree(net->ipv6.devconf_all); } @@ -4262,23 +5295,12 @@ static struct pernet_operations addrconf_ops = { .exit = addrconf_exit_net, }; -/* - * Device notifier - */ - -int register_inet6addr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&inet6addr_chain, nb); -} - -EXPORT_SYMBOL(register_inet6addr_notifier); - -int unregister_inet6addr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&inet6addr_chain,nb); -} - -EXPORT_SYMBOL(unregister_inet6addr_notifier); +static struct rtnl_af_ops inet6_ops = { + .family = AF_INET6, + .fill_link_af = inet6_fill_link_af, + .get_link_af_size = inet6_get_link_af_size, + .set_link_af = inet6_set_link_af, +}; /* * Init / cleanup code @@ -4286,15 +5308,24 @@ EXPORT_SYMBOL(unregister_inet6addr_notifier); int __init addrconf_init(void) { - int err; + int i, err; - if ((err = ipv6_addr_label_init()) < 0) { - printk(KERN_CRIT "IPv6 Addrconf: cannot initialize default policy table: %d.\n", - err); - return err; + err = ipv6_addr_label_init(); + if (err < 0) { + pr_crit("%s: cannot initialize default policy table: %d\n", + __func__, err); + goto out; } - register_pernet_subsys(&addrconf_ops); + err = register_pernet_subsys(&addrconf_ops); + if (err < 0) + goto out_addrlabel; + + addrconf_wq = create_workqueue("ipv6_addrconf"); + if (!addrconf_wq) { + err = -ENOMEM; + goto out_nowq; + } /* The addrconf netdev notifier requires that loopback_dev * has it's ipv6 private information allocated and setup @@ -4321,57 +5352,62 @@ int __init addrconf_init(void) if (err) goto errlo; - ip6_null_entry.u.dst.dev = init_net.loopback_dev; - ip6_null_entry.rt6i_idev = in6_dev_get(init_net.loopback_dev); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - ip6_prohibit_entry.u.dst.dev = init_net.loopback_dev; - ip6_prohibit_entry.rt6i_idev = in6_dev_get(init_net.loopback_dev); - ip6_blk_hole_entry.u.dst.dev = init_net.loopback_dev; - ip6_blk_hole_entry.rt6i_idev = in6_dev_get(init_net.loopback_dev); -#endif + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_addr_lst[i]); register_netdevice_notifier(&ipv6_dev_notf); - addrconf_verify(0); + addrconf_verify(); - err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo); + rtnl_af_register(&inet6_ops); + + err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, + NULL); if (err < 0) goto errout; /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL); - __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL); - __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, inet6_dump_ifaddr); - __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr); - __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr); + __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, + inet6_dump_ifaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, + inet6_dump_ifmcaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, + inet6_dump_ifacaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, + inet6_netconf_dump_devconf, NULL); ipv6_addr_label_rtnl_register(); return 0; errout: + rtnl_af_unregister(&inet6_ops); unregister_netdevice_notifier(&ipv6_dev_notf); errlo: + destroy_workqueue(addrconf_wq); +out_nowq: unregister_pernet_subsys(&addrconf_ops); - +out_addrlabel: + ipv6_addr_label_cleanup(); +out: return err; } void addrconf_cleanup(void) { struct net_device *dev; - struct inet6_ifaddr *ifa; int i; unregister_netdevice_notifier(&ipv6_dev_notf); - unregister_pernet_subsys(&addrconf_ops); + ipv6_addr_label_cleanup(); rtnl_lock(); - /* - * clean dev list. - */ + __rtnl_af_unregister(&inet6_ops); + /* clean dev list */ for_each_netdev(&init_net, dev) { if (__in6_dev_get(dev) == NULL) continue; @@ -4382,23 +5418,12 @@ void addrconf_cleanup(void) /* * Check hash table. */ - - write_lock_bh(&addrconf_hash_lock); - for (i=0; i < IN6_ADDR_HSIZE; i++) { - for (ifa=inet6_addr_lst[i]; ifa; ) { - struct inet6_ifaddr *bifa; - - bifa = ifa; - ifa = ifa->lst_next; - printk(KERN_DEBUG "bug: IPv6 address leakage detected: ifa=%p\n", bifa); - /* Do not free it; something is wrong. - Now we can investigate it with debugger. - */ - } - } - write_unlock_bh(&addrconf_hash_lock); - - del_timer(&addr_chk_timer); - + spin_lock_bh(&addrconf_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_addr_lst[i])); + spin_unlock_bh(&addrconf_hash_lock); + cancel_delayed_work(&addr_chk_work); rtnl_unlock(); + + destroy_workqueue(addrconf_wq); } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 3f82e9542ed..e6960457f62 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -3,13 +3,16 @@ * not configured or static. */ +#include <linux/export.h> #include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/ip.h> #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) -static inline unsigned ipv6_addr_scope2type(unsigned scope) +static inline unsigned int ipv6_addr_scope2type(unsigned int scope) { - switch(scope) { + switch (scope) { case IPV6_ADDR_SCOPE_NODELOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | IPV6_ADDR_LOOPBACK); @@ -72,8 +75,76 @@ int __ipv6_addr_type(const struct in6_addr *addr) IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } - return (IPV6_ADDR_RESERVED | + return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } EXPORT_SYMBOL(__ipv6_addr_type); +static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); + +int register_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(register_inet6addr_notifier); + +int unregister_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(unregister_inet6addr_notifier); + +int inet6addr_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&inet6addr_chain, val, v); +} +EXPORT_SYMBOL(inet6addr_notifier_call_chain); + +const struct ipv6_stub *ipv6_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_stub); + +/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ +const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; +EXPORT_SYMBOL(in6addr_loopback); +const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +EXPORT_SYMBOL(in6addr_any); +const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +EXPORT_SYMBOL(in6addr_linklocal_allnodes); +const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_linklocal_allrouters); +const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; +EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); +const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); +const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_sitelocal_allrouters); + +static void snmp6_free_dev(struct inet6_dev *idev) +{ + kfree(idev->stats.icmpv6msgdev); + kfree(idev->stats.icmpv6dev); + free_percpu(idev->stats.ipv6); +} + +/* Nobody refers to this device, we may destroy it. */ + +void in6_dev_finish_destroy(struct inet6_dev *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(!list_empty(&idev->addr_list)); + WARN_ON(idev->mc_list != NULL); + WARN_ON(timer_pending(&idev->rs_timer)); + +#ifdef NET_REFCNT_DEBUG + pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) { + pr_warn("Freeing alive inet6 device %p\n", idev); + return; + } + snmp6_free_dev(idev); + kfree_rcu(idev, rcu); +} +EXPORT_SYMBOL(in6_dev_finish_destroy); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index a3c5a72218f..731e1e1722d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -6,13 +6,14 @@ */ /* * Author: - * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> + * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/in6.h> +#include <linux/slab.h> #include <net/addrconf.h> #include <linux/if_addrlabel.h> #include <linux/netlink.h> @@ -21,14 +22,16 @@ #if 0 #define ADDRLABEL(x...) printk(x) #else -#define ADDRLABEL(x...) do { ; } while(0) +#define ADDRLABEL(x...) do { ; } while (0) #endif /* * Policy Table */ -struct ip6addrlbl_entry -{ +struct ip6addrlbl_entry { +#ifdef CONFIG_NET_NS + struct net *lbl_net; +#endif struct in6_addr prefix; int prefixlen; int ifindex; @@ -46,8 +49,14 @@ static struct ip6addrlbl_table u32 seq; } ip6addrlbl_table; +static inline +struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) +{ + return read_pnet(&lbl->lbl_net); +} + /* - * Default policy table (RFC3484 + extensions) + * Default policy table (RFC6724 + extensions) * * prefix addr_type label * ------------------------------------------------------------------------- @@ -58,13 +67,18 @@ static struct ip6addrlbl_table * ::ffff:0:0/96 V4MAPPED 4 * fc00::/7 N/A 5 ULA (RFC 4193) * 2001::/32 N/A 6 Teredo (RFC 4380) + * 2001:10::/28 N/A 7 ORCHID (RFC 4843) + * fec0::/10 N/A 11 Site-local + * (deprecated by RFC3879) + * 3ffe::/16 N/A 12 6bone * * Note: 0xffffffff is used if we do not have any policies. + * Note: Labels for ULA and 6to4 are different from labels listed in RFC6724. */ #define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL -static const __initdata struct ip6addrlbl_init_table +static const __net_initconst struct ip6addrlbl_init_table { const struct in6_addr *prefix; int prefixlen; @@ -73,27 +87,39 @@ static const __initdata struct ip6addrlbl_init_table { /* ::/0 */ .prefix = &in6addr_any, .label = 1, - },{ /* fc00::/7 */ - .prefix = &(struct in6_addr){{{ 0xfc }}}, + }, { /* fc00::/7 */ + .prefix = &(struct in6_addr){ { { 0xfc } } } , .prefixlen = 7, .label = 5, - },{ /* 2002::/16 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x02 }}}, + }, { /* fec0::/10 */ + .prefix = &(struct in6_addr){ { { 0xfe, 0xc0 } } }, + .prefixlen = 10, + .label = 11, + }, { /* 2002::/16 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x02 } } }, .prefixlen = 16, .label = 2, - },{ /* 2001::/32 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x01 }}}, + }, { /* 3ffe::/16 */ + .prefix = &(struct in6_addr){ { { 0x3f, 0xfe } } }, + .prefixlen = 16, + .label = 12, + }, { /* 2001::/32 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x01 } } }, .prefixlen = 32, .label = 6, - },{ /* ::ffff:0:0 */ - .prefix = &(struct in6_addr){{{ [10] = 0xff, [11] = 0xff }}}, + }, { /* 2001:10::/28 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x01, 0x00, 0x10 } } }, + .prefixlen = 28, + .label = 7, + }, { /* ::ffff:0:0 */ + .prefix = &(struct in6_addr){ { { [10] = 0xff, [11] = 0xff } } }, .prefixlen = 96, .label = 4, - },{ /* ::/96 */ + }, { /* ::/96 */ .prefix = &in6addr_any, .prefixlen = 96, .label = 3, - },{ /* ::1/128 */ + }, { /* ::1/128 */ .prefix = &in6addr_loopback, .prefixlen = 128, .label = 0, @@ -103,6 +129,9 @@ static const __initdata struct ip6addrlbl_init_table /* Object management */ static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) { +#ifdef CONFIG_NET_NS + release_net(p->lbl_net); +#endif kfree(p); } @@ -111,7 +140,7 @@ static void ip6addrlbl_free_rcu(struct rcu_head *h) ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); } -static inline int ip6addrlbl_hold(struct ip6addrlbl_entry *p) +static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p) { return atomic_inc_not_zero(&p->refcnt); } @@ -123,32 +152,36 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) } /* Find label */ -static int __ip6addrlbl_match(struct ip6addrlbl_entry *p, - const struct in6_addr *addr, - int addrtype, int ifindex) +static bool __ip6addrlbl_match(struct net *net, + const struct ip6addrlbl_entry *p, + const struct in6_addr *addr, + int addrtype, int ifindex) { + if (!net_eq(ip6addrlbl_net(p), net)) + return false; if (p->ifindex && p->ifindex != ifindex) - return 0; + return false; if (p->addrtype && p->addrtype != addrtype) - return 0; + return false; if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen)) - return 0; - return 1; + return false; + return true; } -static struct ip6addrlbl_entry *__ipv6_addr_label(const struct in6_addr *addr, +static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, + const struct in6_addr *addr, int type, int ifindex) { - struct hlist_node *pos; struct ip6addrlbl_entry *p; - hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { - if (__ip6addrlbl_match(p, addr, type, ifindex)) + hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(net, p, addr, type, ifindex)) return p; } return NULL; } -u32 ipv6_addr_label(const struct in6_addr *addr, int type, int ifindex) +u32 ipv6_addr_label(struct net *net, + const struct in6_addr *addr, int type, int ifindex) { u32 label; struct ip6addrlbl_entry *p; @@ -156,31 +189,27 @@ u32 ipv6_addr_label(const struct in6_addr *addr, int type, int ifindex) type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK; rcu_read_lock(); - p = __ipv6_addr_label(addr, type, ifindex); + p = __ipv6_addr_label(net, addr, type, ifindex); label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT; rcu_read_unlock(); - ADDRLABEL(KERN_DEBUG "%s(addr=" NIP6_FMT ", type=%d, ifindex=%d) => %08x\n", - __FUNCTION__, - NIP6(*addr), type, ifindex, - label); + ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", + __func__, addr, type, ifindex, label); return label; } /* allocate one entry */ -static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, +static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, + const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label) { struct ip6addrlbl_entry *newp; int addrtype; - ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d, label=%u)\n", - __FUNCTION__, - NIP6(*prefix), prefixlen, - ifindex, - (unsigned int)label); + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label); addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK); @@ -211,6 +240,9 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); +#ifdef CONFIG_NET_NS + newp->lbl_net = hold_net(net); +#endif atomic_set(&newp->refcnt, 1); return newp; } @@ -218,37 +250,36 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, /* add a label */ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) { + struct hlist_node *n; + struct ip6addrlbl_entry *last = NULL, *p = NULL; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", - __FUNCTION__, - newp, replace); + ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, + replace); - if (hlist_empty(&ip6addrlbl_table.head)) { - hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); - } else { - struct hlist_node *pos, *n; - struct ip6addrlbl_entry *p = NULL; - hlist_for_each_entry_safe(p, pos, n, - &ip6addrlbl_table.head, list) { - if (p->prefixlen == newp->prefixlen && - p->ifindex == newp->ifindex && - ipv6_addr_equal(&p->prefix, &newp->prefix)) { - if (!replace) { - ret = -EEXIST; - goto out; - } - hlist_replace_rcu(&p->list, &newp->list); - ip6addrlbl_put(p); - goto out; - } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || - (p->prefixlen < newp->prefixlen)) { - hlist_add_before_rcu(&newp->list, &p->list); + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + if (p->prefixlen == newp->prefixlen && + net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && + p->ifindex == newp->ifindex && + ipv6_addr_equal(&p->prefix, &newp->prefix)) { + if (!replace) { + ret = -EEXIST; goto out; } + hlist_replace_rcu(&p->list, &newp->list); + ip6addrlbl_put(p); + goto out; + } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || + (p->prefixlen < newp->prefixlen)) { + hlist_add_before_rcu(&newp->list, &p->list); + goto out; } - hlist_add_after_rcu(&p->list, &newp->list); + last = p; } + if (last) + hlist_add_after_rcu(&last->list, &newp->list); + else + hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: if (!ret) ip6addrlbl_table.seq++; @@ -256,20 +287,18 @@ out: } /* add a label */ -static int ip6addrlbl_add(const struct in6_addr *prefix, int prefixlen, +static int ip6addrlbl_add(struct net *net, + const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label, int replace) { struct ip6addrlbl_entry *newp; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", - __FUNCTION__, - NIP6(*prefix), prefixlen, - ifindex, - (unsigned int)label, - replace); + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label, + replace); - newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); + newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) return PTR_ERR(newp); spin_lock(&ip6addrlbl_table.lock); @@ -281,20 +310,20 @@ static int ip6addrlbl_add(const struct in6_addr *prefix, int prefixlen, } /* remove a label */ -static int __ip6addrlbl_del(const struct in6_addr *prefix, int prefixlen, +static int __ip6addrlbl_del(struct net *net, + const struct in6_addr *prefix, int prefixlen, int ifindex) { struct ip6addrlbl_entry *p = NULL; - struct hlist_node *pos, *n; + struct hlist_node *n; int ret = -ESRCH; - ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d)\n", - __FUNCTION__, - NIP6(*prefix), prefixlen, - ifindex); + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", + __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && + net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); @@ -306,34 +335,34 @@ static int __ip6addrlbl_del(const struct in6_addr *prefix, int prefixlen, return ret; } -static int ip6addrlbl_del(const struct in6_addr *prefix, int prefixlen, +static int ip6addrlbl_del(struct net *net, + const struct in6_addr *prefix, int prefixlen, int ifindex) { struct in6_addr prefix_buf; int ret; - ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d)\n", - __FUNCTION__, - NIP6(*prefix), prefixlen, - ifindex); + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", + __func__, prefix, prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); spin_lock(&ip6addrlbl_table.lock); - ret = __ip6addrlbl_del(&prefix_buf, prefixlen, ifindex); + ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); spin_unlock(&ip6addrlbl_table.lock); return ret; } /* add default label */ -static __init int ip6addrlbl_init(void) +static int __net_init ip6addrlbl_net_init(struct net *net) { int err = 0; int i; - ADDRLABEL(KERN_DEBUG "%s()\n", __FUNCTION__); + ADDRLABEL(KERN_DEBUG "%s\n", __func__); for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { - int ret = ip6addrlbl_add(ip6addrlbl_init_table[i].prefix, + int ret = ip6addrlbl_add(net, + ip6addrlbl_init_table[i].prefix, ip6addrlbl_init_table[i].prefixlen, 0, ip6addrlbl_init_table[i].label, 0); @@ -344,11 +373,37 @@ static __init int ip6addrlbl_init(void) return err; } +static void __net_exit ip6addrlbl_net_exit(struct net *net) +{ + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *n; + + /* Remove all labels belonging to the exiting net */ + spin_lock(&ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + if (net_eq(ip6addrlbl_net(p), net)) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); + } + } + spin_unlock(&ip6addrlbl_table.lock); +} + +static struct pernet_operations ipv6_addr_label_ops = { + .init = ip6addrlbl_net_init, + .exit = ip6addrlbl_net_exit, +}; + int __init ipv6_addr_label_init(void) { spin_lock_init(&ip6addrlbl_table.lock); - return ip6addrlbl_init(); + return register_pernet_subsys(&ipv6_addr_label_ops); +} + +void ipv6_addr_label_cleanup(void) +{ + unregister_pernet_subsys(&ipv6_addr_label_ops); } static const struct nla_policy ifal_policy[IFAL_MAX+1] = { @@ -356,19 +411,15 @@ static const struct nla_policy ifal_policy[IFAL_MAX+1] = { [IFAL_LABEL] = { .len = sizeof(u32), }, }; -static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, - void *arg) +static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct ifaddrlblmsg *ifal; struct nlattr *tb[IFAL_MAX+1]; struct in6_addr *pfx; u32 label; int err = 0; - if (net != &init_net) - return 0; - err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); if (err < 0) return err; @@ -379,16 +430,9 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, ifal->ifal_prefixlen > 128) return -EINVAL; - if (ifal->ifal_index && - !__dev_get_by_index(&init_net, ifal->ifal_index)) - return -EINVAL; - if (!tb[IFAL_ADDRESS]) return -EINVAL; - pfx = nla_data(tb[IFAL_ADDRESS]); - if (!pfx) - return -EINVAL; if (!tb[IFAL_LABEL]) return -EINVAL; @@ -396,14 +440,18 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, if (label == IPV6_ADDR_LABEL_DEFAULT) return -EINVAL; - switch(nlh->nlmsg_type) { + switch (nlh->nlmsg_type) { case RTM_NEWADDRLABEL: - err = ip6addrlbl_add(pfx, ifal->ifal_prefixlen, + if (ifal->ifal_index && + !__dev_get_by_index(net, ifal->ifal_index)) + return -EINVAL; + + err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen, ifal->ifal_index, label, nlh->nlmsg_flags & NLM_F_REPLACE); break; case RTM_DELADDRLABEL: - err = ip6addrlbl_del(pfx, ifal->ifal_prefixlen, + err = ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen, ifal->ifal_index); break; default: @@ -412,8 +460,8 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh, - int prefixlen, int ifindex, u32 lseq) +static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, + int prefixlen, int ifindex, u32 lseq) { struct ifaddrlblmsg *ifal = nlmsg_data(nlh); ifal->ifal_family = AF_INET6; @@ -426,10 +474,10 @@ static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh, static int ip6addrlbl_fill(struct sk_buff *skb, struct ip6addrlbl_entry *p, u32 lseq, - u32 pid, u32 seq, int event, + u32 portid, u32 seq, int event, unsigned int flags) { - struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq, event, + struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrlblmsg), flags); if (!nlh) return -EMSGSIZE; @@ -447,24 +495,22 @@ static int ip6addrlbl_fill(struct sk_buff *skb, static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; - struct hlist_node *pos; int idx = 0, s_idx = cb->args[0]; int err; - if (net != &init_net) - return 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { - if (idx >= s_idx) { - if ((err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_NEWADDRLABEL, - NLM_F_MULTI)) <= 0) + hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { + if (idx >= s_idx && + net_eq(ip6addrlbl_net(p), net)) { + err = ip6addrlbl_fill(skb, p, + ip6addrlbl_table.seq, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWADDRLABEL, + NLM_F_MULTI); + if (err <= 0) break; } idx++; @@ -476,16 +522,14 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) static inline int ip6addrlbl_msgsize(void) { - return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + nla_total_size(16) /* IFAL_ADDRESS */ - + nla_total_size(4) /* IFAL_LABEL */ - ); + + nla_total_size(4); /* IFAL_LABEL */ } -static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, - void *arg) +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) { - struct net *net = in_skb->sk->sk_net; + struct net *net = sock_net(in_skb->sk); struct ifaddrlblmsg *ifal; struct nlattr *tb[IFAL_MAX+1]; struct in6_addr *addr; @@ -494,9 +538,6 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, struct ip6addrlbl_entry *p; struct sk_buff *skb; - if (net != &init_net) - return 0; - err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); if (err < 0) return err; @@ -508,18 +549,15 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, return -EINVAL; if (ifal->ifal_index && - !__dev_get_by_index(&init_net, ifal->ifal_index)) + !__dev_get_by_index(net, ifal->ifal_index)) return -EINVAL; if (!tb[IFAL_ADDRESS]) return -EINVAL; - addr = nla_data(tb[IFAL_ADDRESS]); - if (!addr) - return -EINVAL; rcu_read_lock(); - p = __ipv6_addr_label(addr, ipv6_addr_type(addr), ifal->ifal_index); + p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); if (p && ip6addrlbl_hold(p)) p = NULL; lseq = ip6addrlbl_table.seq; @@ -530,13 +568,14 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, goto out; } - if (!(skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL))) { + skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); + if (!skb) { ip6addrlbl_put(p); return -ENOBUFS; } err = ip6addrlbl_fill(skb, p, lseq, - NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, 0); ip6addrlbl_put(p); @@ -547,15 +586,18 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, goto out; } - err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; } void __init ipv6_addr_label_rtnl_register(void) { - __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, NULL); - __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, NULL); - __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, ip6addrlbl_dump); + __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, + NULL, NULL); + __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, + NULL, NULL); + __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, + ip6addrlbl_dump, NULL); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f0aa9773874..7cb4392690d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,8 +7,6 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.66 2002/02/01 22:01:04 davem Exp $ - * * Fixes: * piggy, Karl Knutson : Socket protocol table * Hideaki YOSHIFUJI : sin6_scope_id support @@ -20,6 +18,7 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <linux/capability.h> @@ -38,6 +37,7 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/inet.h> #include <linux/netdevice.h> @@ -49,18 +49,20 @@ #include <net/udp.h> #include <net/udplite.h> #include <net/tcp.h> -#include <net/ipip.h> +#include <net/ping.h> #include <net/protocol.h> #include <net/inet_common.h> +#include <net/route.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> +#include <net/ndisc.h> #ifdef CONFIG_IPV6_TUNNEL #include <net/ip6_tunnel.h> #endif #include <asm/uaccess.h> -#include <asm/system.h> +#include <linux/mroute6.h> MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); @@ -72,6 +74,22 @@ MODULE_LICENSE("GPL"); static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); +struct ipv6_params ipv6_defaults = { + .disable_ipv6 = 0, + .autoconf = 1, +}; + +static int disable_ipv6_mod; + +module_param_named(disable, disable_ipv6_mod, int, 0444); +MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); + +module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444); +MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); + +module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); +MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); + static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); @@ -79,35 +97,25 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } -static int inet6_create(struct net *net, struct socket *sock, int protocol) +static int inet6_create(struct net *net, struct socket *sock, int protocol, + int kern) { struct inet_sock *inet; struct ipv6_pinfo *np; struct sock *sk; - struct list_head *p; struct inet_protosw *answer; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; - if (net != &init_net) - return -EAFNOSUPPORT; - - if (sock->type != SOCK_RAW && - sock->type != SOCK_DGRAM && - !inet_ehash_secret) - build_ehash_secret(); - /* Look for the requested type/protocol pair. */ - answer = NULL; lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); - list_for_each_rcu(p, &inetsw6[sock->type]) { - answer = list_entry(p, struct inet_protosw, list); + list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) { + err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) @@ -122,10 +130,9 @@ lookup_protocol: break; } err = -EPROTONOSUPPORT; - answer = NULL; } - if (!answer) { + if (err) { if (try_loading_module < 2) { rcu_read_unlock(); /* @@ -148,16 +155,16 @@ lookup_protocol: } err = -EPERM; - if (answer->capability > 0 && !capable(answer->capability)) + if (sock->type == SOCK_RAW && !kern && + !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); - BUG_TRAP(answer_prot->slab != NULL); + WARN_ON(answer_prot->slab == NULL); err = -ENOBUFS; sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot); @@ -167,15 +174,14 @@ lookup_protocol: sock_init_data(sock, sk); err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) - sk->sk_reuse = 1; + sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; if (SOCK_RAW == sock->type) { - inet->num = protocol; + inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; } @@ -188,10 +194,10 @@ lookup_protocol: inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); np->hop_limit = -1; - np->mcast_hops = -1; + np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->ipv6only = init_net.ipv6.sysctl.bindv6only; + np->ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. @@ -202,8 +208,9 @@ lookup_protocol: inet->mc_ttl = 1; inet->mc_index = 0; inet->mc_list = NULL; + inet->rcv_tos = 0; - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; @@ -218,12 +225,12 @@ lookup_protocol: */ sk_refcnt_debug_inc(sk); - if (inet->num) { + if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically shares. */ - inet->sport = htons(inet->num); + inet->inet_sport = htons(inet->inet_num); sk->sk_prot->hash(sk); } if (sk->sk_prot->init) { @@ -244,10 +251,11 @@ out_rcu_unlock: /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr; + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; int addr_type = 0; @@ -259,26 +267,47 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + addr_type = ipv6_addr_type(&addr->sin6_addr); if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; lock_sock(sk); /* Check these errors (active socket, double bind). */ - if (sk->sk_state != TCP_CLOSE || inet->num) { + if (sk->sk_state != TCP_CLOSE || inet->inet_num) { err = -EINVAL; goto out; } /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { + int chk_addr_ret; + + /* Binding to v4-mapped address on a v6-only socket + * makes no sense + */ + if (np->ipv6only) { + err = -EINVAL; + goto out; + } + + /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; - if (inet_addr_type(&init_net, v4addr) != RTN_LOCAL) { + chk_addr_ret = inet_addr_type(net, v4addr); + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && + v4addr != htonl(INADDR_ANY) && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) { err = -EADDRNOTAVAIL; goto out; } @@ -286,7 +315,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; - if (addr_type & IPV6_ADDR_LINKLOCAL) { + rcu_read_lock(); + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another one @@ -298,12 +328,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; - goto out; + goto out_unlock; } - dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; - goto out; + goto out_unlock; } } @@ -312,26 +342,24 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!ipv6_chk_addr(&init_net, &addr->sin6_addr, + if (!(inet->freebind || inet->transparent) && + !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { - if (dev) - dev_put(dev); err = -EADDRNOTAVAIL; - goto out; + goto out_unlock; } } - if (dev) - dev_put(dev); + rcu_read_unlock(); } } - inet->rcv_saddr = v4addr; - inet->saddr = v4addr; + inet->inet_rcv_saddr = v4addr; + inet->inet_saddr = v4addr; - ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) - ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + np->saddr = addr->sin6_addr; /* Make sure we are allowed to bind here. */ if (sk->sk_prot->get_port(sk, snum)) { @@ -340,18 +368,23 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - if (addr_type != IPV6_ADDR_ANY) + if (addr_type != IPV6_ADDR_ANY) { sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (addr_type != IPV6_ADDR_MAPPED) + np->ipv6only = 1; + } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; - inet->sport = htons(inet->num); - inet->dport = 0; - inet->daddr = 0; + inet->inet_sport = htons(inet->inet_num); + inet->inet_dport = 0; + inet->inet_daddr = 0; out: release_sock(sk); return err; +out_unlock: + rcu_read_unlock(); + goto out; } - EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) @@ -369,10 +402,9 @@ int inet6_release(struct socket *sock) return inet_release(sock); } - EXPORT_SYMBOL(inet6_release); -int inet6_destroy_sock(struct sock *sk) +void inet6_destroy_sock(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; @@ -380,7 +412,12 @@ int inet6_destroy_sock(struct sock *sk) /* Release rx options */ - if ((skb = xchg(&np->pktoptions, NULL)) != NULL) + skb = xchg(&np->pktoptions, NULL); + if (skb != NULL) + kfree_skb(skb); + + skb = xchg(&np->rxpmtu, NULL); + if (skb != NULL) kfree_skb(skb); /* Free flowlabels */ @@ -388,12 +425,10 @@ int inet6_destroy_sock(struct sock *sk) /* Free tx options */ - if ((opt = xchg(&np->opt, NULL)) != NULL) + opt = xchg(&np->opt, NULL); + if (opt != NULL) sock_kfree_s(sk, opt, opt->tot_len); - - return 0; } - EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* @@ -403,7 +438,7 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) { - struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr; + struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -412,37 +447,36 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_flowinfo = 0; sin->sin6_scope_id = 0; if (peer) { - if (!inet->dport) + if (!inet->inet_dport) return -ENOTCONN; if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1) return -ENOTCONN; - sin->sin6_port = inet->dport; - ipv6_addr_copy(&sin->sin6_addr, &np->daddr); + sin->sin6_port = inet->inet_dport; + sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; } else { - if (ipv6_addr_any(&np->rcv_saddr)) - ipv6_addr_copy(&sin->sin6_addr, &np->saddr); + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + sin->sin6_addr = np->saddr; else - ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr); + sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->sport; + sin->sin6_port = inet->inet_sport; } - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = sk->sk_bound_dev_if; + sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, + sk->sk_bound_dev_if); *uaddr_len = sizeof(*sin); - return(0); + return 0; } - EXPORT_SYMBOL(inet6_getname); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; + struct net *net = sock_net(sk); - switch(cmd) - { + switch (cmd) { case SIOCGSTAMP: return sock_get_timestamp(sk, (struct timeval __user *)arg); @@ -452,23 +486,22 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCADDRT: case SIOCDELRT: - return(ipv6_route_ioctl(cmd,(void __user *)arg)); + return ipv6_route_ioctl(net, cmd, (void __user *)arg); case SIOCSIFADDR: - return addrconf_add_ifaddr((void __user *) arg); + return addrconf_add_ifaddr(net, (void __user *) arg); case SIOCDIFADDR: - return addrconf_del_ifaddr((void __user *) arg); + return addrconf_del_ifaddr(net, (void __user *) arg); case SIOCSIFDSTADDR: - return addrconf_set_dstaddr((void __user *) arg); + return addrconf_set_dstaddr(net, (void __user *) arg); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; return sk->sk_prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ - return(0); + return 0; } - EXPORT_SYMBOL(inet6_ioctl); const struct proto_ops inet6_stream_ops = { @@ -486,10 +519,10 @@ const struct proto_ops inet6_stream_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = tcp_sendmsg, /* ok */ - .recvmsg = sock_common_recvmsg, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ .mmap = sock_no_mmap, - .sendpage = tcp_sendpage, + .sendpage = inet_sendpage, .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, @@ -513,7 +546,7 @@ const struct proto_ops inet6_dgram_ops = { .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = sock_common_recvmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT @@ -522,7 +555,7 @@ const struct proto_ops inet6_dgram_ops = { #endif }; -static struct net_proto_family inet6_family_ops = { +static const struct net_proto_family inet6_family_ops = { .family = PF_INET6, .create = inet6_create, .owner = THIS_MODULE, @@ -574,25 +607,21 @@ out: return ret; out_permanent: - printk(KERN_ERR "Attempt to override permanent protocol %d.\n", - protocol); + pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: - printk(KERN_ERR - "Ignoring attempt to register invalid socket type %d.\n", + pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } - EXPORT_SYMBOL(inet6_register_protosw); void inet6_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { - printk(KERN_ERR - "Attempt to unregister permanent protocol %d.\n", + pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw6_lock); @@ -602,50 +631,38 @@ inet6_unregister_protosw(struct inet_protosw *p) synchronize_net(); } } - EXPORT_SYMBOL(inet6_unregister_protosw); int inet6_sk_rebuild_header(struct sock *sk) { - int err; - struct dst_entry *dst; struct ipv6_pinfo *np = inet6_sk(sk); + struct dst_entry *dst; dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = sk->sk_protocol; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; - security_sk_classify_flow(sk, &fl); - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) { + struct in6_addr *final_p, final; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = sk->sk_protocol; + fl6.daddr = sk->sk_v6_daddr; + fl6.saddr = np->saddr; + fl6.flowlabel = np->flow_label; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { sk->sk_route_caps = 0; - return err; - } - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - return err; + sk->sk_err_soft = -PTR_ERR(dst); + return PTR_ERR(dst); } __ip6_dst_store(sk, dst, NULL, NULL); @@ -653,90 +670,138 @@ int inet6_sk_rebuild_header(struct sock *sk) return 0; } - EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & - *(__be32 *)skb_network_header(skb)) && + (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) - return 1; + return true; } + return false; +} +EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + +static struct packet_type ipv6_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IPV6), + .func = ipv6_rcv, +}; + +static int __init ipv6_packet_init(void) +{ + dev_add_pack(&ipv6_packet_type); return 0; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); +static void ipv6_packet_cleanup(void) +{ + dev_remove_pack(&ipv6_packet_type); +} -static int __init init_ipv6_mibs(void) +static int __net_init ipv6_init_mibs(struct net *net) { - if (snmp_mib_init((void **)ipv6_statistics, - sizeof(struct ipstats_mib)) < 0) + int i; + + net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udp_stats_in6) + return -ENOMEM; + net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_stats_in6) + goto err_udplite_mib; + net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ipv6_statistics) goto err_ip_mib; - if (snmp_mib_init((void **)icmpv6_statistics, - sizeof(struct icmpv6_mib)) < 0) + + for_each_possible_cpu(i) { + struct ipstats_mib *af_inet6_stats; + af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i); + u64_stats_init(&af_inet6_stats->syncp); + } + + + net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); + if (!net->mib.icmpv6_statistics) goto err_icmp_mib; - if (snmp_mib_init((void **)icmpv6msg_statistics, - sizeof(struct icmpv6msg_mib)) < 0) + net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), + GFP_KERNEL); + if (!net->mib.icmpv6msg_statistics) goto err_icmpmsg_mib; - if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0) - goto err_udp_mib; - if (snmp_mib_init((void **)udplite_stats_in6, - sizeof (struct udp_mib)) < 0) - goto err_udplite_mib; return 0; -err_udplite_mib: - snmp_mib_free((void **)udp_stats_in6); -err_udp_mib: - snmp_mib_free((void **)icmpv6msg_statistics); err_icmpmsg_mib: - snmp_mib_free((void **)icmpv6_statistics); + free_percpu(net->mib.icmpv6_statistics); err_icmp_mib: - snmp_mib_free((void **)ipv6_statistics); + free_percpu(net->mib.ipv6_statistics); err_ip_mib: + free_percpu(net->mib.udplite_stats_in6); +err_udplite_mib: + free_percpu(net->mib.udp_stats_in6); return -ENOMEM; - } -static void cleanup_ipv6_mibs(void) +static void ipv6_cleanup_mibs(struct net *net) { - snmp_mib_free((void **)ipv6_statistics); - snmp_mib_free((void **)icmpv6_statistics); - snmp_mib_free((void **)icmpv6msg_statistics); - snmp_mib_free((void **)udp_stats_in6); - snmp_mib_free((void **)udplite_stats_in6); + free_percpu(net->mib.udp_stats_in6); + free_percpu(net->mib.udplite_stats_in6); + free_percpu(net->mib.ipv6_statistics); + free_percpu(net->mib.icmpv6_statistics); + kfree(net->mib.icmpv6msg_statistics); } -static int inet6_net_init(struct net *net) +static int __net_init inet6_net_init(struct net *net) { + int err = 0; + net->ipv6.sysctl.bindv6only = 0; - net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; - net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; - net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; - net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; - net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; - net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; - net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.icmpv6_time = 1*HZ; + net->ipv6.sysctl.flowlabel_consistency = 1; + atomic_set(&net->ipv6.rt_genid, 0); - return 0; + err = ipv6_init_mibs(net); + if (err) + return err; +#ifdef CONFIG_PROC_FS + err = udp6_proc_init(net); + if (err) + goto out; + err = tcp6_proc_init(net); + if (err) + goto proc_tcp6_fail; + err = ac6_proc_init(net); + if (err) + goto proc_ac6_fail; +#endif + return err; + +#ifdef CONFIG_PROC_FS +proc_ac6_fail: + tcp6_proc_exit(net); +proc_tcp6_fail: + udp6_proc_exit(net); +out: + ipv6_cleanup_mibs(net); + return err; +#endif } -static void inet6_net_exit(struct net *net) +static void __net_exit inet6_net_exit(struct net *net) { - return; +#ifdef CONFIG_PROC_FS + udp6_proc_exit(net); + tcp6_proc_exit(net); + ac6_proc_exit(net); +#endif + ipv6_cleanup_mibs(net); } static struct pernet_operations inet6_net_ops = { @@ -744,13 +809,30 @@ static struct pernet_operations inet6_net_ops = { .exit = inet6_net_exit, }; +static const struct ipv6_stub ipv6_stub_impl = { + .ipv6_sock_mc_join = ipv6_sock_mc_join, + .ipv6_sock_mc_drop = ipv6_sock_mc_drop, + .ipv6_dst_lookup = ip6_dst_lookup, + .udpv6_encap_enable = udpv6_encap_enable, + .ndisc_send_na = ndisc_send_na, + .nd_tbl = &nd_tbl, +}; + static int __init inet6_init(void) { - struct sk_buff *dummy_skb; struct list_head *r; - int err; + int err = 0; + + BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); - BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); + /* Register the socket-side information for inet6_create. */ + for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + if (disable_ipv6_mod) { + pr_info("Loaded, but administratively disabled, reboot required to enable\n"); + goto out; + } err = proto_register(&tcpv6_prot, 1); if (err) @@ -768,10 +850,9 @@ static int __init inet6_init(void) if (err) goto out_unregister_udplite_proto; - - /* Register the socket-side information for inet6_create. */ - for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) - INIT_LIST_HEAD(r); + err = proto_register(&pingv6_prot, 1); + if (err) + goto out_unregister_ping_proto; /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. @@ -787,36 +868,31 @@ static int __init inet6_init(void) if (err) goto out_sock_register_fail; - /* Initialise ipv6 mibs */ - err = init_ipv6_mibs(); - if (err) - goto out_unregister_sock; - /* * ipngwg API draft makes clear that the correct semantics * for TCP and UDP is to consider one TCP and UDP instance - * in a host availiable by both INET and INET6 APIs and + * in a host available by both INET and INET6 APIs and * able to communicate via both network protocols. */ err = register_pernet_subsys(&inet6_net_ops); if (err) goto register_pernet_fail; - -#ifdef CONFIG_SYSCTL - err = ipv6_sysctl_register(); - if (err) - goto sysctl_fail; -#endif - err = icmpv6_init(&inet6_family_ops); + err = icmpv6_init(); if (err) goto icmp_fail; - err = ndisc_init(&inet6_family_ops); + err = ip6_mr_init(); + if (err) + goto ipmr_fail; + err = ndisc_init(); if (err) goto ndisc_fail; - err = igmp6_init(&inet6_family_ops); + err = igmp6_init(); if (err) goto igmp_fail; + + ipv6_stub = &ipv6_stub_impl; + err = ipv6_netfilter_init(); if (err) goto netfilter_fail; @@ -825,23 +901,19 @@ static int __init inet6_init(void) err = -ENOMEM; if (raw6_proc_init()) goto proc_raw6_fail; - if (tcp6_proc_init()) - goto proc_tcp6_fail; - if (udp6_proc_init()) - goto proc_udp6_fail; if (udplite6_proc_init()) goto proc_udplite6_fail; if (ipv6_misc_proc_init()) goto proc_misc6_fail; - - if (ac6_proc_init()) - goto proc_anycast6_fail; if (if6_proc_init()) goto proc_if6_fail; #endif err = ip6_route_init(); if (err) goto ip6_route_fail; + err = ndisc_late_init(); + if (err) + goto ndisc_late_fail; err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; @@ -874,9 +946,25 @@ static int __init inet6_init(void) err = ipv6_packet_init(); if (err) goto ipv6_packet_fail; + + err = pingv6_init(); + if (err) + goto pingv6_fail; + +#ifdef CONFIG_SYSCTL + err = ipv6_sysctl_register(); + if (err) + goto sysctl_fail; +#endif out: return err; +#ifdef CONFIG_SYSCTL +sysctl_fail: + pingv6_exit(); +#endif +pingv6_fail: + ipv6_packet_cleanup(); ipv6_packet_fail: tcpv6_exit(); tcpv6_fail: @@ -892,21 +980,17 @@ ipv6_exthdrs_fail: addrconf_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: + ndisc_late_cleanup(); +ndisc_late_fail: ip6_route_cleanup(); ip6_route_fail: #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: - ac6_proc_exit(); -proc_anycast6_fail: ipv6_misc_proc_exit(); proc_misc6_fail: udplite6_proc_exit(); proc_udplite6_fail: - udp6_proc_exit(); -proc_udp6_fail: - tcp6_proc_exit(); -proc_tcp6_fail: raw6_proc_exit(); proc_raw6_fail: #endif @@ -916,20 +1000,18 @@ netfilter_fail: igmp_fail: ndisc_cleanup(); ndisc_fail: + ip6_mr_cleanup(); +ipmr_fail: icmpv6_cleanup(); icmp_fail: -#ifdef CONFIG_SYSCTL - ipv6_sysctl_unregister(); -sysctl_fail: -#endif unregister_pernet_subsys(&inet6_net_ops); register_pernet_fail: - cleanup_ipv6_mibs(); -out_unregister_sock: sock_unregister(PF_INET6); rtnl_unregister_all(PF_INET6); out_sock_register_fail: rawv6_exit(); +out_unregister_ping_proto: + proto_unregister(&pingv6_prot); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udplite_proto: @@ -942,50 +1024,4 @@ out_unregister_tcp_proto: } module_init(inet6_init); -static void __exit inet6_exit(void) -{ - /* First of all disallow new sockets creation. */ - sock_unregister(PF_INET6); - /* Disallow any further netlink messages */ - rtnl_unregister_all(PF_INET6); - - udpv6_exit(); - udplitev6_exit(); - tcpv6_exit(); - - /* Cleanup code parts. */ - ipv6_packet_cleanup(); - ipv6_frag_exit(); - ipv6_exthdrs_exit(); - addrconf_cleanup(); - ip6_flowlabel_cleanup(); - ip6_route_cleanup(); -#ifdef CONFIG_PROC_FS - - /* Cleanup code parts. */ - if6_proc_exit(); - ac6_proc_exit(); - ipv6_misc_proc_exit(); - udplite6_proc_exit(); - udp6_proc_exit(); - tcp6_proc_exit(); - raw6_proc_exit(); -#endif - ipv6_netfilter_fini(); - igmp6_cleanup(); - ndisc_cleanup(); - icmpv6_cleanup(); - rawv6_exit(); -#ifdef CONFIG_SYSCTL - ipv6_sysctl_unregister(); -#endif - unregister_pernet_subsys(&inet6_net_ops); - cleanup_ipv6_mibs(); - proto_unregister(&rawv6_prot); - proto_unregister(&udplitev6_prot); - proto_unregister(&udpv6_prot); - proto_unregister(&tcpv6_prot); -} -module_exit(inet6_exit); - MODULE_ALIAS_NETPROTO(PF_INET6); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 2ff0c8233e4..72a4930bdc0 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -12,8 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors * @@ -24,19 +23,97 @@ * This file is derived from net/ipv4/ah.c. */ +#define pr_fmt(fmt) "IPv6: " fmt + +#include <crypto/hash.h> #include <linux/module.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> -#include <linux/spinlock.h> #include <linux/string.h> +#include <linux/scatterlist.h> +#include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> -static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) +#define IPV6HDR_BASELEN 8 + +struct tmp_ext { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + struct in6_addr saddr; +#endif + struct in6_addr daddr; + char hdrs[0]; +}; + +struct ah_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) + +static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, + unsigned int size) +{ + unsigned int len; + + len = size + crypto_ahash_digestsize(ahash) + + (crypto_ahash_alignmask(ahash) & + ~(crypto_tfm_ctx_alignment() - 1)); + + len = ALIGN(len, crypto_tfm_ctx_alignment()); + + len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline struct tmp_ext *ah_tmp_ext(void *base) +{ + return base + IPV6HDR_BASELEN; +} + +static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset) +{ + return tmp + offset; +} + +static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, + unsigned int offset) +{ + return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); +} + +static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, + u8 *icv) +{ + struct ahash_request *req; + + req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), + crypto_tfm_ctx_alignment()); + + ahash_request_set_tfm(req, ahash); + + return req; +} + +static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, + struct ahash_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_ahash_reqsize(ahash), + __alignof__(struct scatterlist)); +} + +static bool zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) { u8 *opt = (u8 *)opthdr; int len = ipv6_optlen(opthdr); @@ -50,7 +127,7 @@ static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) switch (opt[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; break; default: @@ -68,13 +145,13 @@ static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) len -= optlen; } if (len == 0) - return 1; + return true; bad: - return 0; + return false; } -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) /** * ipv6_rearrange_destopt - rearrange IPv6 destination options header * @iph: IPv6 header @@ -94,7 +171,7 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des switch (opt[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; break; default: @@ -114,13 +191,13 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des hao = (struct ipv6_destopt_hao *)&opt[off]; if (hao->length != sizeof(hao->addr)) { - if (net_ratelimit()) - printk(KERN_WARNING "destopt hao: invalid header length: %u\n", hao->length); + net_warn_ratelimited("destopt hao: invalid header length: %u\n", + hao->length); goto bad; } - ipv6_addr_copy(&final_addr, &hao->addr); - ipv6_addr_copy(&hao->addr, &iph->saddr); - ipv6_addr_copy(&iph->saddr, &final_addr); + final_addr = hao->addr; + hao->addr = iph->saddr; + iph->saddr = final_addr; } break; } @@ -166,13 +243,13 @@ static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) segments = rthdr->hdrlen >> 1; addrs = ((struct rt0_hdr *)rthdr)->addr; - ipv6_addr_copy(&final_addr, addrs + segments - 1); + final_addr = addrs[segments - 1]; addrs += segments - segments_left; memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); - ipv6_addr_copy(addrs, &iph->daddr); - ipv6_addr_copy(&iph->daddr, &final_addr); + addrs[0] = iph->daddr; + iph->daddr = final_addr; } static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) @@ -218,24 +295,94 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) return 0; } +static void ah6_output_done(struct crypto_async_request *base, int err) +{ + int extlen; + u8 *iph_base; + u8 *icv; + struct sk_buff *skb = base->data; + struct xfrm_state *x = skb_dst(skb)->xfrm; + struct ah_data *ahp = x->data; + struct ipv6hdr *top_iph = ipv6_hdr(skb); + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + struct tmp_ext *iph_ext; + + extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); + if (extlen) + extlen += sizeof(*iph_ext); + + iph_base = AH_SKB_CB(skb)->tmp; + iph_ext = ah_tmp_ext(iph_base); + icv = ah_tmp_icv(ahp->ahash, iph_ext, extlen); + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + memcpy(top_iph, iph_base, IPV6HDR_BASELEN); + + if (extlen) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + memcpy(&top_iph->saddr, iph_ext, extlen); +#else + memcpy(&top_iph->daddr, iph_ext, extlen); +#endif + } + + kfree(AH_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) { int err; + int nfrags; int extlen; + u8 *iph_base; + u8 *icv; + u8 nexthdr; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; struct ipv6hdr *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; - u8 nexthdr; - char tmp_base[8]; - struct { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) - struct in6_addr saddr; -#endif - struct in6_addr daddr; - char hdrs[0]; - } *tmp_ext; + struct tmp_ext *iph_ext; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; + + ahp = x->data; + ahash = ahp->ahash; + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; skb_push(skb, -skb_network_offset(skb)); + extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); + if (extlen) + extlen += sizeof(*iph_ext); + + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + err = -ENOMEM; + iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN + + extlen + seqhi_len); + if (!iph_base) + goto out; + + iph_ext = ah_tmp_ext(iph_base); + seqhi = (__be32 *)((char *)iph_ext + extlen); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; + + ah = ip_auth_hdr(skb); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + top_iph = ipv6_hdr(skb); top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); @@ -245,31 +392,22 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) /* When there are no extension headers, we only need to save the first * 8 bytes of the base IP header. */ - memcpy(tmp_base, top_iph, sizeof(tmp_base)); + memcpy(iph_base, top_iph, IPV6HDR_BASELEN); - tmp_ext = NULL; - extlen = skb_transport_offset(skb) - sizeof(struct ipv6hdr); if (extlen) { - extlen += sizeof(*tmp_ext); - tmp_ext = kmalloc(extlen, GFP_ATOMIC); - if (!tmp_ext) { - err = -ENOMEM; - goto error; - } -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) - memcpy(tmp_ext, &top_iph->saddr, extlen); +#if IS_ENABLED(CONFIG_IPV6_MIP6) + memcpy(iph_ext, &top_iph->saddr, extlen); #else - memcpy(tmp_ext, &top_iph->daddr, extlen); + memcpy(iph_ext, &top_iph->daddr, extlen); #endif err = ipv6_clear_mutable_options(top_iph, - extlen - sizeof(*tmp_ext) + + extlen - sizeof(*iph_ext) + sizeof(*top_iph), XFRM_POLICY_OUT); if (err) - goto error_free_iph; + goto out_free; } - ah = ip_auth_hdr(skb); ah->nexthdr = nexthdr; top_iph->priority = 0; @@ -278,36 +416,88 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->flow_lbl[2] = 0; top_iph->hop_limit = 0; - ahp = x->data; ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; - ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - spin_lock_bh(&x->lock); - err = ah_mac_digest(ahp, skb, ah->auth_data); - memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len); - spin_unlock_bh(&x->lock); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - if (err) - goto error_free_iph; + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); + ahash_request_set_callback(req, 0, ah6_output_done, skb); + + AH_SKB_CB(skb)->tmp = iph_base; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; - memcpy(top_iph, tmp_base, sizeof(tmp_base)); - if (tmp_ext) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) - memcpy(&top_iph->saddr, tmp_ext, extlen); + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; + } + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + memcpy(top_iph, iph_base, IPV6HDR_BASELEN); + + if (extlen) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + memcpy(&top_iph->saddr, iph_ext, extlen); #else - memcpy(&top_iph->daddr, tmp_ext, extlen); + memcpy(&top_iph->daddr, iph_ext, extlen); #endif -error_free_iph: - kfree(tmp_ext); } -error: +out_free: + kfree(iph_base); +out: return err; } +static void ah6_input_done(struct crypto_async_request *base, int err) +{ + u8 *auth_data; + u8 *icv; + u8 *work_iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = xfrm_input_state(skb); + struct ah_data *ahp = x->data; + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int hdr_len = skb_network_header_len(skb); + int ah_hlen = (ah->hdrlen + 2) << 2; + + work_iph = AH_SKB_CB(skb)->tmp; + auth_data = ah_tmp_auth(work_iph, hdr_len); + icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + if (err) + goto out; + + err = ah->nexthdr; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, hdr_len); + __skb_pull(skb, ah_hlen + hdr_len); + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -hdr_len); +out: + kfree(AH_SKB_CB(skb)->tmp); + xfrm_input_resume(skb, err); +} + + + static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) { /* @@ -325,29 +515,41 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) * There is offset of AH before IPv6 header after the process. */ + u8 *auth_data; + u8 *icv; + u8 *work_iph; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; struct ip_auth_hdr *ah; struct ipv6hdr *ip6h; struct ah_data *ahp; - unsigned char *tmp_hdr = NULL; u16 hdr_len; u16 ah_hlen; int nexthdr; - int err = -EINVAL; + int nfrags; + int err = -ENOMEM; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) goto out; /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; - hdr_len = skb->data - skb_network_header(skb); + hdr_len = skb_network_header_len(skb); ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; + ahash = ahp->ahash; + nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; @@ -358,78 +560,119 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, ah_hlen)) goto out; - tmp_hdr = kmemdup(skb_network_header(skb), hdr_len, GFP_ATOMIC); - if (!tmp_hdr) + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; + nfrags = err; + + ah = (struct ip_auth_hdr *)skb->data; ip6h = ipv6_hdr(skb); + + skb_push(skb, hdr_len); + + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + + work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len + + ahp->icv_trunc_len + seqhi_len); + if (!work_iph) + goto out; + + auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); + seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; + + memcpy(work_iph, ip6h, hdr_len); + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN)) - goto free_out; + goto out_free; + ip6h->priority = 0; ip6h->flow_lbl[0] = 0; ip6h->flow_lbl[1] = 0; ip6h->flow_lbl[2] = 0; ip6h->hop_limit = 0; - spin_lock(&x->lock); - { - u8 auth_data[MAX_AH_AUTH_LEN]; + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - memset(ah->auth_data, 0, ahp->icv_trunc_len); - skb_push(skb, hdr_len); - err = ah_mac_digest(ahp, skb, ah->auth_data); - if (err) - goto unlock; - if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) - err = -EBADMSG; + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(seqhisg, seqhi, seqhi_len); } -unlock: - spin_unlock(&x->lock); + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); + ahash_request_set_callback(req, 0, ah6_input_done, skb); + + AH_SKB_CB(skb)->tmp = work_iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + goto out_free; + } + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; if (err) - goto free_out; + goto out_free; skb->network_header += ah_hlen; - memcpy(skb_network_header(skb), tmp_hdr, hdr_len); - skb->transport_header = skb->network_header; + memcpy(skb_network_header(skb), work_iph, hdr_len); __skb_pull(skb, ah_hlen + hdr_len); - kfree(tmp_hdr); + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -hdr_len); - return nexthdr; + err = nexthdr; -free_out: - kfree(tmp_hdr); +out_free: + kfree(work_iph); out: return err; } -static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) +static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { + struct net *net = dev_net(skb->dev); struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); struct xfrm_state *x; - if (type != ICMPV6_DEST_UNREACH && - type != ICMPV6_PKT_TOOBIG) - return; + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; - x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); if (!x) - return; - - NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" NIP6_FMT "\n", - ntohl(ah->spi), NIP6(iph->daddr)); + return 0; + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static int ah6_init_state(struct xfrm_state *x) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; - struct crypto_hash *tfm; + struct crypto_ahash *ahash; if (!x->aalg) goto error; @@ -441,12 +684,12 @@ static int ah6_init_state(struct xfrm_state *x) if (ahp == NULL) return -ENOMEM; - tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) + ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); + if (IS_ERR(ahash)) goto error; - ahp->tfm = tfm; - if (crypto_hash_setkey(tfm, x->aalg->alg_key, + ahp->ahash = ahash; + if (crypto_ahash_setkey(ahash, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8)) goto error; @@ -460,22 +703,18 @@ static int ah6_init_state(struct xfrm_state *x) BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != - crypto_hash_digestsize(tfm)) { - printk(KERN_INFO "AH: %s digestsize %u != %hu\n", - x->aalg->alg_name, crypto_hash_digestsize(tfm), - aalg_desc->uinfo.auth.icv_fullbits/8); + crypto_ahash_digestsize(ahash)) { + pr_info("AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_ahash_digestsize(ahash), + aalg_desc->uinfo.auth.icv_fullbits/8); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; - ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); - if (!ahp->work_icv) - goto error; - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); switch (x->props.mode) { @@ -494,8 +733,7 @@ static int ah6_init_state(struct xfrm_state *x) error: if (ahp) { - kfree(ahp->work_icv); - crypto_free_hash(ahp->tfm); + crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; @@ -508,13 +746,15 @@ static void ah6_destroy(struct xfrm_state *x) if (!ahp) return; - kfree(ahp->work_icv); - ahp->work_icv = NULL; - crypto_free_hash(ahp->tfm); - ahp->tfm = NULL; + crypto_free_ahash(ahp->ahash); kfree(ahp); } +static int ah6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ah6_type = { .description = "AH6", @@ -528,21 +768,22 @@ static const struct xfrm_type ah6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static struct inet6_protocol ah6_protocol = { +static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, + .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init ah6_init(void) { if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { - printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); + pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { - printk(KERN_INFO "ipv6 ah init: can't add protocol\n"); + if (xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH) < 0) { + pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah6_type, AF_INET6); return -EAGAIN; } @@ -552,11 +793,11 @@ static int __init ah6_init(void) static void __exit ah6_fini(void) { - if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) - printk(KERN_INFO "ipv6 ah close: can't remove protocol\n"); + if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) + pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) - printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n"); + pr_info("%s: can't remove xfrm type\n", __func__); } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 9c7f83fbc3a..21018324468 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -29,6 +29,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -43,96 +44,73 @@ #include <net/checksum.h> -static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr); +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); /* Big ac list lock for all the sockets */ -static DEFINE_RWLOCK(ipv6_sk_ac_lock); +static DEFINE_SPINLOCK(ipv6_sk_ac_lock); -static int -ip6_onlink(struct in6_addr *addr, struct net_device *dev) -{ - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; - int onlink; - - onlink = 0; - rcu_read_lock(); - idev = __in6_dev_get(dev); - if (idev) { - read_lock_bh(&idev->lock); - for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) { - onlink = ipv6_prefix_equal(addr, &ifa->addr, - ifa->prefix_len); - if (onlink) - break; - } - read_unlock_bh(&idev->lock); - } - rcu_read_unlock(); - return onlink; -} /* * socket join an anycast group */ -int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) +int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; struct inet6_dev *idev; struct ipv6_ac_socklist *pac; - int ishost = !ipv6_devconf.forwarding; + struct net *net = sock_net(sk); + int ishost = !net->ipv6.devconf_all->forwarding; int err = 0; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; - if (ipv6_chk_addr(&init_net, addr, NULL, 0)) + if (ipv6_chk_addr(net, addr, NULL, 0)) return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); if (pac == NULL) return -ENOMEM; pac->acl_next = NULL; - ipv6_addr_copy(&pac->acl_addr, addr); + pac->acl_addr = *addr; + rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, 0); if (rt) { - dev = rt->rt6i_dev; - dev_hold(dev); - dst_release(&rt->u.dst); + dev = rt->dst.dev; + ip6_rt_put(rt); } else if (ishost) { err = -EADDRNOTAVAIL; - goto out_free_pac; + goto error; } else { /* router, no matching interface: just pick one */ - - dev = dev_get_by_flags(&init_net, IFF_UP, IFF_UP|IFF_LOOPBACK); + dev = dev_get_by_flags_rcu(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (dev == NULL) { err = -ENODEV; - goto out_free_pac; + goto error; } - idev = in6_dev_get(dev); + idev = __in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; else err = -EADDRNOTAVAIL; - goto out_dev_put; + goto error; } /* reset ishost, now that we have a specific device */ ishost = !idev->cnf.forwarding; - in6_dev_put(idev); pac->acl_ifindex = dev->ifindex; @@ -141,43 +119,40 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) * This obviates the need for propagating anycast routes while * still allowing some non-router anycast participation. */ - if (!ip6_onlink(addr, dev)) { + if (!ipv6_chk_prefix(addr, dev)) { if (ishost) err = -EADDRNOTAVAIL; if (err) - goto out_dev_put; + goto error; } err = ipv6_dev_ac_inc(dev, addr); - if (err) - goto out_dev_put; - - write_lock_bh(&ipv6_sk_ac_lock); - pac->acl_next = np->ipv6_ac_list; - np->ipv6_ac_list = pac; - write_unlock_bh(&ipv6_sk_ac_lock); - - dev_put(dev); - - return 0; + if (!err) { + spin_lock_bh(&ipv6_sk_ac_lock); + pac->acl_next = np->ipv6_ac_list; + np->ipv6_ac_list = pac; + spin_unlock_bh(&ipv6_sk_ac_lock); + pac = NULL; + } -out_dev_put: - dev_put(dev); -out_free_pac: - sock_kfree_s(sk, pac, sizeof(*pac)); +error: + rcu_read_unlock(); + if (pac) + sock_kfree_s(sk, pac, sizeof(*pac)); return err; } /* * socket leave an anycast group */ -int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) +int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev; struct ipv6_ac_socklist *pac, *prev_pac; + struct net *net = sock_net(sk); - write_lock_bh(&ipv6_sk_ac_lock); + spin_lock_bh(&ipv6_sk_ac_lock); prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && @@ -186,7 +161,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) prev_pac = pac; } if (!pac) { - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); return -ENOENT; } if (prev_pac) @@ -194,13 +169,14 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) else np->ipv6_ac_list = pac->acl_next; - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); - dev = dev_get_by_index(&init_net, pac->acl_ifindex); - if (dev) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - dev_put(dev); - } + rcu_read_unlock(); + sock_kfree_s(sk, pac, sizeof(*pac)); return 0; } @@ -210,21 +186,24 @@ void ipv6_sock_ac_close(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; struct ipv6_ac_socklist *pac; + struct net *net = sock_net(sk); int prev_index; - write_lock_bh(&ipv6_sk_ac_lock); + if (!np->ipv6_ac_list) + return; + + spin_lock_bh(&ipv6_sk_ac_lock); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; + rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - if (dev) - dev_put(dev); - dev = dev_get_by_index(&init_net, pac->acl_ifindex); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -232,44 +211,14 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - if (dev) - dev_put(dev); -} - -#if 0 -/* The function is not used, which is funny. Apparently, author - * supposed to use it to filter out datagrams inside udp/raw but forgot. - * - * It is OK, anycasts are not special comparing to delivery to unicasts. - */ - -int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex) -{ - struct ipv6_ac_socklist *pac; - struct ipv6_pinfo *np = inet6_sk(sk); - int found; - - found = 0; - read_lock(&ipv6_sk_ac_lock); - for (pac=np->ipv6_ac_list; pac; pac=pac->acl_next) { - if (ifindex && pac->acl_ifindex != ifindex) - continue; - found = ipv6_addr_equal(&pac->acl_addr, addr); - if (found) - break; - } - read_unlock(&ipv6_sk_ac_lock); - - return found; + rcu_read_unlock(); } -#endif - static void aca_put(struct ifacaddr6 *ac) { if (atomic_dec_and_test(&ac->aca_refcnt)) { in6_dev_put(ac->aca_idev); - dst_release(&ac->aca_rt->u.dst); + dst_release(&ac->aca_rt->dst); kfree(ac); } } @@ -277,7 +226,7 @@ static void aca_put(struct ifacaddr6 *ac) /* * device anycast group inc (add if not found) */ -int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) +int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct inet6_dev *idev; @@ -314,14 +263,14 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) goto out; } - rt = addrconf_dst_alloc(idev, addr, 1); + rt = addrconf_dst_alloc(idev, addr, true); if (IS_ERR(rt)) { kfree(aca); err = PTR_ERR(rt); goto out; } - ipv6_addr_copy(&aca->aca_addr, addr); + aca->aca_addr = *addr; aca->aca_idev = idev; aca->aca_rt = rt; aca->aca_users = 1; @@ -334,9 +283,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) idev->ac_list = aca; write_unlock_bh(&idev->lock); - dst_hold(&rt->u.dst); - if (ip6_ins_rt(rt)) - dst_release(&rt->u.dst); + ip6_ins_rt(rt); addrconf_join_solict(dev, &aca->aca_addr); @@ -351,7 +298,7 @@ out: /* * device anycast group decrement */ -int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) +int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; @@ -377,70 +324,80 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->u.dst); - if (ip6_del_rt(aca->aca_rt)) - dst_free(&aca->aca_rt->u.dst); - else - dst_release(&aca->aca_rt->u.dst); + dst_hold(&aca->aca_rt->dst); + ip6_del_rt(aca->aca_rt); aca_put(aca); return 0; } -static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr) +/* called with rcu_read_lock() */ +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { - int ret; - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get(dev); + if (idev == NULL) return -ENODEV; - ret = __ipv6_dev_ac_dec(idev, addr); - in6_dev_put(idev); - return ret; + return __ipv6_dev_ac_dec(idev, addr); } /* * check if the interface has this anycast address + * called with rcu_read_lock() */ -static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr) +static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; struct ifacaddr6 *aca; - idev = in6_dev_get(dev); + idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); for (aca = idev->ac_list; aca; aca = aca->aca_next) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; read_unlock_bh(&idev->lock); - in6_dev_put(idev); return aca != NULL; } - return 0; + return false; } /* * check if given interface (or any, if dev==0) has this anycast address */ -int ipv6_chk_acast_addr(struct net_device *dev, struct in6_addr *addr) +bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, + const struct in6_addr *addr) { - int found = 0; + bool found = false; + rcu_read_lock(); if (dev) - return ipv6_chk_acast_dev(dev, addr); - read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) - if (ipv6_chk_acast_dev(dev, addr)) { - found = 1; - break; - } - read_unlock(&dev_base_lock); + found = ipv6_chk_acast_dev(dev, addr); + else + for_each_netdev_rcu(net, dev) + if (ipv6_chk_acast_dev(dev, addr)) { + found = true; + break; + } + rcu_read_unlock(); return found; } +/* check if this anycast address is link-local on given interface or + * is global + */ +bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, + const struct in6_addr *addr) +{ + return ipv6_chk_acast_addr(net, + (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ? + dev : NULL), + addr); +} #ifdef CONFIG_PROC_FS struct ac6_iter_state { + struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; }; @@ -451,11 +408,12 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) { struct ifacaddr6 *im = NULL; struct ac6_iter_state *state = ac6_seq_private(seq); + struct net *net = seq_file_net(seq); state->idev = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; - idev = in6_dev_get(state->dev); + idev = __in6_dev_get(state->dev); if (!idev) continue; read_lock_bh(&idev->lock); @@ -465,7 +423,6 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) break; } read_unlock_bh(&idev->lock); - in6_dev_put(idev); } return im; } @@ -476,16 +433,15 @@ static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im im = im->aca_next; while (!im) { - if (likely(state->idev != NULL)) { + if (likely(state->idev != NULL)) read_unlock_bh(&state->idev->lock); - in6_dev_put(state->idev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; break; } - state->idev = in6_dev_get(state->dev); + state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; read_lock_bh(&state->idev->lock); @@ -504,29 +460,30 @@ static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) } static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(RCU) { - read_lock(&dev_base_lock); + rcu_read_lock(); return ac6_get_idx(seq, *pos); } static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct ifacaddr6 *im; - im = ac6_get_next(seq, v); + struct ifacaddr6 *im = ac6_get_next(seq, v); + ++*pos; return im; } static void ac6_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(RCU) { struct ac6_iter_state *state = ac6_seq_private(seq); + if (likely(state->idev != NULL)) { read_unlock_bh(&state->idev->lock); - in6_dev_put(state->idev); + state->idev = NULL; } - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int ac6_seq_show(struct seq_file *seq, void *v) @@ -534,11 +491,9 @@ static int ac6_seq_show(struct seq_file *seq, void *v) struct ifacaddr6 *im = (struct ifacaddr6 *)v; struct ac6_iter_state *state = ac6_seq_private(seq); - seq_printf(seq, - "%-4d %-15s " NIP6_SEQFMT " %5d\n", + seq_printf(seq, "%-4d %-15s %pi6 %5d\n", state->dev->ifindex, state->dev->name, - NIP6(im->aca_addr), - im->aca_users); + &im->aca_addr, im->aca_users); return 0; } @@ -551,8 +506,8 @@ static const struct seq_operations ac6_seq_ops = { static int ac6_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ac6_seq_ops, - sizeof(struct ac6_iter_state)); + return seq_open_net(inode, file, &ac6_seq_ops, + sizeof(struct ac6_iter_state)); } static const struct file_operations ac6_seq_fops = { @@ -560,20 +515,20 @@ static const struct file_operations ac6_seq_fops = { .open = ac6_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init ac6_proc_init(void) +int __net_init ac6_proc_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "anycast6", S_IRUGO, &ac6_seq_fops)) + if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops)) return -ENOMEM; return 0; } -void ac6_proc_exit(void) +void ac6_proc_exit(struct net *net) { - proc_net_remove(&init_net, "anycast6"); + remove_proc_entry("anycast6", net->proc_net); } #endif diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 94fa6ae77cf..c3bf2d2e519 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -5,8 +5,6 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: datagram.c,v 1.24 2002/02/01 22:01:04 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -23,6 +21,8 @@ #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/route.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -30,19 +30,26 @@ #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> +#include <net/dsfield.h> #include <linux/errqueue.h> #include <asm/uaccess.h> +static bool ipv6_mapped_addr_any(const struct in6_addr *a) +{ + return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); +} + int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr, *final_p = NULL, final; + struct in6_addr *daddr, *final_p, final; struct dst_entry *dst; - struct flowi fl; + struct flowi6 fl6; struct ip6_flowlabel *flowlabel = NULL; + struct ipv6_txoptions *opt; int addr_type; int err; @@ -59,14 +66,13 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (np->sndflow) { - fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); } } @@ -93,28 +99,31 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sin.sin_port = usin->sin6_port; err = ip4_datagram_connect(sk, - (struct sockaddr*) &sin, + (struct sockaddr *) &sin, sizeof(sin)); ipv4_connected: if (err) goto out; - ipv6_addr_set(&np->daddr, 0, 0, htonl(0x0000ffff), inet->daddr); + ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr); - if (ipv6_addr_any(&np->saddr)) { - ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000ffff), - inet->saddr); - } + if (ipv6_addr_any(&np->saddr) || + ipv6_mapped_addr_any(&np->saddr)) + ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); - if (ipv6_addr_any(&np->rcv_saddr)) { - ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000ffff), - inet->rcv_saddr); + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || + ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) { + ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, + &sk->sk_v6_rcv_saddr); + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); } + goto out; } - if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (sk->sk_bound_dev_if && @@ -135,70 +144,56 @@ ipv4_connected: } } - ipv6_addr_copy(&np->daddr, daddr); - np->flow_label = fl.fl6_flowlabel; + sk->sk_v6_daddr = *daddr; + np->flow_label = fl6.flowlabel; - inet->dport = usin->sin6_port; + inet->inet_dport = usin->sin6_port; /* * Check for a route to destination an obtain the * destination cache for it. */ - fl.proto = sk->sk_protocol; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; + fl6.flowi6_proto = sk->sk_protocol; + fl6.daddr = sk->sk_v6_daddr; + fl6.saddr = np->saddr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; - if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST)) - fl.oif = np->mcast_oif; + if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) + fl6.flowi6_oif = np->mcast_oif; - security_sk_classify_flow(sk, &fl); + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (flowlabel) { - if (flowlabel->opt && flowlabel->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - } else if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + opt = flowlabel ? flowlabel->opt : np->opt; + final_p = fl6_update_dst(&fl6, opt, &final); - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); goto out; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto out; } /* source address lookup done in ip6_dst_lookup */ if (ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&np->saddr, &fl.fl6_src); + np->saddr = fl6.saddr; - if (ipv6_addr_any(&np->rcv_saddr)) { - ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); - inet->rcv_saddr = LOOPBACK4_IPV6; + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + sk->sk_v6_rcv_saddr = fl6.saddr; + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); } ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? - &np->daddr : NULL, + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? + &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl.fl6_src, &np->saddr) ? + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? &np->saddr : #endif NULL); @@ -208,6 +203,17 @@ out: fl6_sock_release(flowlabel); return err; } +EXPORT_SYMBOL_GPL(ip6_datagram_connect); + +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); + if (sin6->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + return ip6_datagram_connect(sk, uaddr, addr_len); +} +EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) @@ -223,6 +229,8 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, if (!skb) return; + skb->protocol = htons(ETH_P_IPV6); + serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; @@ -242,7 +250,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, kfree_skb(skb); } -void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) +void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; @@ -256,10 +264,12 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) if (!skb) return; + skb->protocol = htons(ETH_P_IPV6); + skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); - ipv6_addr_copy(&iph->daddr, &fl->fl6_dst); + iph->daddr = fl6->daddr; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; @@ -270,7 +280,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); - serr->port = fl->fl_ip_dport; + serr->port = fl6->fl6_dport; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); @@ -279,15 +289,50 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) kfree_skb(skb); } +void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + struct ip6_mtuinfo *mtu_info; + + if (!np->rxopt.bits.rxpmtu) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + iph->daddr = fl6->daddr; + + mtu_info = IP6CBMTU(skb); + + mtu_info->ip6m_mtu = mtu; + mtu_info->ip6m_addr.sin6_family = AF_INET6; + mtu_info->ip6m_addr.sin6_port = 0; + mtu_info->ip6m_addr.sin6_flowinfo = 0; + mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; + mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; + + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); + + skb = xchg(&np->rxpmtu, skb); + kfree_skb(skb); +} + /* * Handle MSG_ERRQUEUE */ -int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; - struct sockaddr_in6 *sin; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; @@ -313,27 +358,26 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) serr = SKB_EXT_ERR(skb); - sin = (struct sockaddr_in6 *)msg->msg_name; if (sin) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; - sin->sin6_scope_id = 0; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { - ipv6_addr_copy(&sin->sin6_addr, - (struct in6_addr *)(nh + serr->addr_offset)); + if (skb->protocol == htons(ETH_P_IPV6)) { + const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), + struct ipv6hdr, daddr); + sin->sin6_addr = ip6h->daddr; if (np->sndflow) - sin->sin6_flowinfo = - (*(__be32 *)(nh + serr->addr_offset - 24) & - IPV6_FLOWINFO_MASK); - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = IP6CB(skb)->iif; + sin->sin6_flowinfo = ip6_flowinfo(ip6h); + sin->sin6_scope_id = + ipv6_iface_scope_id(&sin->sin6_addr, + IP6CB(skb)->iif); } else { - ipv6_addr_set(&sin->sin6_addr, 0, 0, - htonl(0xffff), - *(__be32 *)(nh + serr->addr_offset)); + ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), + &sin->sin6_addr); + sin->sin6_scope_id = 0; } + *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); @@ -342,18 +386,22 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; - sin->sin6_scope_id = 0; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { - ipv6_addr_copy(&sin->sin6_addr, &ipv6_hdr(skb)->saddr); + sin->sin6_port = 0; + if (np->rxopt.all) + ip6_datagram_recv_common_ctl(sk, msg, skb); + if (skb->protocol == htons(ETH_P_IPV6)) { + sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) - datagram_recv_ctl(sk, msg, skb); - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = IP6CB(skb)->iif; + ip6_datagram_recv_specific_ctl(sk, msg, skb); + sin->sin6_scope_id = + ipv6_iface_scope_id(&sin->sin6_addr, + IP6CB(skb)->iif); } else { struct inet_sock *inet = inet_sk(sk); - ipv6_addr_set(&sin->sin6_addr, 0, 0, - htonl(0xffff), ip_hdr(skb)->saddr); + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, + &sin->sin6_addr); + sin->sin6_scope_id = 0; if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } @@ -382,22 +430,87 @@ out_free_skb: out: return err; } +EXPORT_SYMBOL_GPL(ipv6_recv_error); +/* + * Handle IPV6_RECVPATHMTU + */ +int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, + int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ip6_mtuinfo mtu_info; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); + int err; + int copied; + + err = -EAGAIN; + skb = xchg(&np->rxpmtu, NULL); + if (skb == NULL) + goto out; + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); + + if (sin) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = 0; + sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; + sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; + *addr_len = sizeof(*sin); + } -int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) + put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); + + err = copied; + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + +void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - unsigned char *nh = skb_network_header(skb); + bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; - src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + if (is_ipv6) { + src_info.ipi6_ifindex = IP6CB(skb)->iif; + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; + } else { + src_info.ipi6_ifindex = + PKTINFO_SKB_CB(skb)->ipi_ifindex; + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + &src_info.ipi6_addr); + } put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } +} + +void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; @@ -405,13 +518,14 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) } if (np->rxopt.bits.rxtclass) { - int tclass = (ntohl(*(__be32 *)ipv6_hdr(skb)) >> 20) & 0xff; + int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } - if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) { - __be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK; - put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + if (np->rxopt.bits.rxflow) { + __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); + if (flowinfo) + put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ @@ -435,10 +549,10 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { - unsigned len; + unsigned int len; u8 *ptr = nh + off; - switch(nexthdr) { + switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; @@ -470,7 +584,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { @@ -493,12 +607,41 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } - return 0; + if (np->rxopt.bits.rxorigdstaddr) { + struct sockaddr_in6 sin6; + __be16 *ports = (__be16 *) skb_transport_header(skb); + + if (skb_transport_offset(skb) + 4 <= skb->len) { + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. + */ + + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ipv6_hdr(skb)->daddr; + sin6.sin6_port = ports[1]; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = + ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, + opt->iif); + + put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); + } + } +} + +void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + ip6_datagram_recv_common_ctl(sk, msg, skb); + ip6_datagram_recv_specific_ctl(sk, msg, skb); } +EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); -int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, - struct ipv6_txoptions *opt, - int *hlimit, int *tclass) +int ip6_datagram_send_ctl(struct net *net, struct sock *sk, + struct msghdr *msg, struct flowi6 *fl6, + struct ipv6_txoptions *opt, + int *hlimit, int *tclass, int *dontfrag) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -509,7 +652,6 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { int addr_type; - struct net_device *dev = NULL; if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; @@ -522,6 +664,9 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, switch (cmsg->cmsg_type) { case IPV6_PKTINFO: case IPV6_2292PKTINFO: + { + struct net_device *dev = NULL; + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; @@ -530,37 +675,45 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (src_info->ipi6_ifindex) { - if (fl->oif && src_info->ipi6_ifindex != fl->oif) + if (fl6->flowi6_oif && + src_info->ipi6_ifindex != fl6->flowi6_oif) return -EINVAL; - fl->oif = src_info->ipi6_ifindex; + fl6->flowi6_oif = src_info->ipi6_ifindex; } - addr_type = ipv6_addr_type(&src_info->ipi6_addr); - - if (addr_type == IPV6_ADDR_ANY) - break; + addr_type = __ipv6_addr_type(&src_info->ipi6_addr); - if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (!src_info->ipi6_ifindex) - return -EINVAL; - else { - dev = dev_get_by_index(&init_net, src_info->ipi6_ifindex); - if (!dev) - return -ENODEV; + rcu_read_lock(); + if (fl6->flowi6_oif) { + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; } + } else if (addr_type & IPV6_ADDR_LINKLOCAL) { + rcu_read_unlock(); + return -EINVAL; } - if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr, - dev, 0)) { - if (dev) - dev_put(dev); - err = -EINVAL; - goto exit_f; + + if (addr_type != IPV6_ADDR_ANY) { + int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; + if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + !ipv6_chk_addr(net, &src_info->ipi6_addr, + strict ? dev : NULL, 0) && + !ipv6_chk_acast_addr_src(net, dev, + &src_info->ipi6_addr)) + err = -EINVAL; + else + fl6->saddr = src_info->ipi6_addr; } - if (dev) - dev_put(dev); - ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr); + rcu_read_unlock(); + + if (err) + goto exit_f; + break; + } case IPV6_FLOWINFO: if (cmsg->cmsg_len < CMSG_LEN(4)) { @@ -568,13 +721,13 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, goto exit_f; } - if (fl->fl6_flowlabel&IPV6_FLOWINFO_MASK) { - if ((fl->fl6_flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { + if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { + if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { err = -EINVAL; goto exit_f; } } - fl->fl6_flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); + fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); break; case IPV6_2292HOPOPTS: @@ -590,7 +743,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, err = -EINVAL; goto exit_f; } - if (!capable(CAP_NET_RAW)) { + if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } @@ -610,7 +763,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, err = -EINVAL; goto exit_f; } - if (!capable(CAP_NET_RAW)) { + if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } @@ -635,7 +788,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, err = -EINVAL; goto exit_f; } - if (!capable(CAP_NET_RAW)) { + if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } @@ -658,8 +811,13 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); switch (rthdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || + rthdr->segments_left != 1) { + err = -EINVAL; + goto exit_f; + } break; #endif default: @@ -702,6 +860,11 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, } *hlimit = *(int *)CMSG_DATA(cmsg); + if (*hlimit < -1 || *hlimit > 0xff) { + err = -EINVAL; + goto exit_f; + } + break; case IPV6_TCLASS: @@ -709,9 +872,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, int tc; err = -EINVAL; - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; - } tc = *(int *)CMSG_DATA(cmsg); if (tc < -1 || tc > 0xff) @@ -722,14 +884,59 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, break; } + + case IPV6_DONTFRAG: + { + int df; + + err = -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + goto exit_f; + + df = *(int *)CMSG_DATA(cmsg); + if (df < 0 || df > 1) + goto exit_f; + + err = 0; + *dontfrag = df; + + break; + } default: LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type); err = -EINVAL; - break; + goto exit_f; } } exit_f: return err; } +EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); + +void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, + __u16 srcp, __u16 destp, int bucket) +{ + const struct in6_addr *dest, *src; + + dest = &sp->sk_v6_daddr; + src = &sp->sk_v6_rcv_saddr; + seq_printf(seq, + "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", + bucket, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops)); +} diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index c6bb4c6d24b..d15da137714 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -12,8 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors * @@ -24,6 +23,8 @@ * This file is derived from net/ipv4/esp.c */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> @@ -37,6 +38,7 @@ #include <linux/random.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> @@ -49,19 +51,25 @@ struct esp_skb_cb { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) +static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); + /* * Allocate an AEAD request structure with extra space for SG and IV. * - * For alignment considerations the IV is placed at the front, followed - * by the request and finally the SG list. + * For alignment considerations the upper 32 bits of the sequence number are + * placed at the front, if present. Followed by the IV, the request and finally + * the SG list. * * TODO: Use spare space in skb for this where possible. */ -static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) { unsigned int len; - len = crypto_aead_ivsize(aead); + len = seqihlen; + + len += crypto_aead_ivsize(aead); + if (len) { len += crypto_aead_alignmask(aead) & ~(crypto_tfm_ctx_alignment() - 1); @@ -76,10 +84,16 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) return kmalloc(len, GFP_ATOMIC); } -static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} + +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) { return crypto_aead_ivsize(aead) ? - PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; + PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } static inline struct aead_givcrypt_request *esp_tmp_givreq( @@ -140,47 +154,73 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) int blksize; int clen; int alen; + int plen; + int tfclen; int nfrags; + int assoclen; + int sglists; + int seqhilen; u8 *iv; u8 *tail; - struct esp_data *esp = x->data; + __be32 *seqhi; /* skb is pure payload to encrypt */ - err = -ENOMEM; - - /* Round to block size */ - clen = skb->len; - - aead = esp->aead; + aead = x->data; alen = crypto_aead_authsize(aead); + tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + tfclen = padto - skb->len; + } blksize = ALIGN(crypto_aead_blocksize(aead), 4); - clen = ALIGN(clen + 2, blksize); - if (esp->padlen) - clen = ALIGN(clen, esp->padlen); + clen = ALIGN(skb->len + 2 + tfclen, blksize); + plen = clen - skb->len - tfclen; - if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) + err = skb_cow_data(skb, tfclen + plen + alen, &trailer); + if (err < 0) goto error; nfrags = err; - tmp = esp_alloc_tmp(aead, nfrags + 1); - if (!tmp) + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) { + err = -ENOMEM; goto error; + } - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_givreq(aead, iv); asg = esp_givreq_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; /* Fill padding... */ tail = skb_tail_pointer(trailer); + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } do { int i; - for (i=0; i<clen-skb->len - 2; i++) + for (i = 0; i < plen - 2; i++) tail[i] = i + 1; } while (0); - tail[clen-skb->len - 2] = (clen - skb->len) - 2; - tail[clen - skb->len - 1] = *skb_mac_header(skb); + tail[plen - 2] = plen - 2; + tail[plen - 1] = *skb_mac_header(skb); pskb_put(skb, trailer, clen - skb->len + alen); skb_push(skb, -skb_network_offset(skb)); @@ -188,19 +228,27 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, esph->enc_data + crypto_aead_ivsize(aead) - skb->data, clen + alen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_givcrypt_set_callback(req, 0, esp_output_done, skb); aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); + aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output); + XFRM_SKB_CB(skb)->seq.output.low); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); @@ -219,8 +267,7 @@ error: static int esp_input_done2(struct sk_buff *skb, int err) { struct xfrm_state *x = xfrm_input_state(skb); - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int elen = skb->len - hlen; @@ -248,7 +295,10 @@ static int esp_input_done2(struct sk_buff *skb, int err) pskb_trim(skb, skb->len - alen - padlen - 2); __skb_pull(skb, hlen); - skb_set_transport_header(skb, -hdr_len); + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -hdr_len); err = nexthdr[1]; @@ -270,14 +320,17 @@ static void esp_input_done(struct crypto_async_request *base, int err) static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); int nfrags; + int assoclen; + int sglists; + int seqhilen; int ret = 0; void *tmp; + __be32 *seqhi; u8 *iv; struct scatterlist *sg; struct scatterlist *asg; @@ -298,15 +351,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) } ret = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags + 1); + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto out; ESP_SKB_CB(skb)->tmp = tmp; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); asg = esp_req_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; skb->ip_summed = CHECKSUM_NONE; @@ -317,11 +382,19 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_request_set_callback(req, 0, esp_input_done, skb); aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, sizeof(*esph)); + aead_request_set_assoc(req, asg, assoclen); ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) @@ -335,57 +408,57 @@ out: static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) { - struct esp_data *esp = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); - u32 align = max_t(u32, blksize, esp->padlen); - u32 rem; - - mtu -= x->props.header_len + crypto_aead_authsize(esp->aead); - rem = mtu & (align - 1); - mtu &= ~(align - 1); - - if (x->props.mode != XFRM_MODE_TUNNEL) { - u32 padsize = ((blksize - 1) & 7) + 1; - mtu -= blksize - padsize; - mtu += min_t(u32, blksize - padsize, rem); - } + struct crypto_aead *aead = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); + unsigned int net_adj; + + if (x->props.mode != XFRM_MODE_TUNNEL) + net_adj = sizeof(struct ipv6hdr); + else + net_adj = 0; - return mtu - 2; + return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - + net_adj) & ~(blksize - 1)) + net_adj - 2; } -static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) +static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { - struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + offset); struct xfrm_state *x; - if (type != ICMPV6_DEST_UNREACH && - type != ICMPV6_PKT_TOOBIG) - return; + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; - x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + esph->spi, IPPROTO_ESP, AF_INET6); if (!x) - return; - printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" NIP6_FMT "\n", - ntohl(esph->spi), NIP6(iph->daddr)); + return 0; + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static void esp6_destroy(struct xfrm_state *x) { - struct esp_data *esp = x->data; + struct crypto_aead *aead = x->data; - if (!esp) + if (!aead) return; - crypto_free_aead(esp->aead); - kfree(esp); + crypto_free_aead(aead); } static int esp_init_aead(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; int err; @@ -394,7 +467,7 @@ static int esp_init_aead(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; err = crypto_aead_setkey(aead, x->aead->alg_key, (x->aead->alg_key_len + 7) / 8); @@ -411,7 +484,6 @@ error: static int esp_init_authenc(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; struct crypto_authenc_key_param *param; struct rtattr *rta; @@ -426,17 +498,27 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; err = -ENAMETOOLONG; - if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", - x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) - goto error; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); @@ -472,7 +554,7 @@ static int esp_init_authenc(struct xfrm_state *x) } err = crypto_aead_setauthsize( - aead, aalg_desc->uinfo.auth.icv_truncbits / 8); + aead, x->aalg->alg_trunc_len / 8); if (err) goto free_key; } @@ -491,7 +573,6 @@ error: static int esp6_init_state(struct xfrm_state *x) { - struct esp_data *esp; struct crypto_aead *aead; u32 align; int err; @@ -499,11 +580,7 @@ static int esp6_init_state(struct xfrm_state *x) if (x->encap) return -EINVAL; - esp = kzalloc(sizeof(*esp), GFP_KERNEL); - if (esp == NULL) - return -ENOMEM; - - x->data = esp; + x->data = NULL; if (x->aead) err = esp_init_aead(x); @@ -513,14 +590,16 @@ static int esp6_init_state(struct xfrm_state *x) if (err) goto error; - aead = esp->aead; - - esp->padlen = 0; + aead = x->data; x->props.header_len = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); switch (x->props.mode) { case XFRM_MODE_BEET: + if (x->sel.family != AF_INET6) + x->props.header_len += IPV4_BEET_PHMAXLEN + + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); + break; case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: @@ -531,14 +610,17 @@ static int esp6_init_state(struct xfrm_state *x) } align = ALIGN(crypto_aead_blocksize(aead), 4); - if (esp->padlen) - align = max_t(u32, align, esp->padlen); - x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); + x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); error: return err; } +static int esp6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type esp6_type = { .description = "ESP6", @@ -553,20 +635,21 @@ static const struct xfrm_type esp6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static struct inet6_protocol esp6_protocol = { - .handler = xfrm6_rcv, +static struct xfrm6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .cb_handler = esp6_rcv_cb, .err_handler = esp6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init esp6_init(void) { if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { - printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); + pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { - printk(KERN_INFO "ipv6 esp init: can't add protocol\n"); + if (xfrm6_protocol_register(&esp6_protocol, IPPROTO_ESP) < 0) { + pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&esp6_type, AF_INET6); return -EAGAIN; } @@ -576,10 +659,10 @@ static int __init esp6_init(void) static void __exit esp6_fini(void) { - if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) - printk(KERN_INFO "ipv6 esp close: can't remove protocol\n"); + if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0) + pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) - printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n"); + pr_info("%s: can't remove xfrm type\n", __func__); } module_init(esp6_init); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 3cd1c993d52..8d67900aa00 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -7,8 +7,6 @@ * Andi Kleen <ak@muc.de> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: exthdrs.c,v 1.13 2001/06/19 15:58:56 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -31,6 +29,8 @@ #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/dst.h> #include <net/sock.h> @@ -43,67 +43,23 @@ #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif #include <asm/uaccess.h> -int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) -{ - const unsigned char *nh = skb_network_header(skb); - int packet_len = skb->tail - skb->network_header; - struct ipv6_opt_hdr *hdr; - int len; - - if (offset + 2 > packet_len) - goto bad; - hdr = (struct ipv6_opt_hdr *)(nh + offset); - len = ((hdr->hdrlen + 1) << 3); - - if (offset + len > packet_len) - goto bad; - - offset += 2; - len -= 2; - - while (len > 0) { - int opttype = nh[offset]; - int optlen; - - if (opttype == type) - return offset; - - switch (opttype) { - case IPV6_TLV_PAD0: - optlen = 1; - break; - default: - optlen = nh[offset + 1] + 2; - if (optlen > len) - goto bad; - break; - } - offset += optlen; - len -= optlen; - } - /* not_found */ - bad: - return -1; -} -EXPORT_SYMBOL_GPL(ipv6_find_tlv); - /* * Parsing tlv encoded headers. * - * Parsing function "func" returns 1, if parsing succeed - * and 0, if it failed. + * Parsing function "func" returns true, if parsing succeed + * and false, if it failed. * It MUST NOT touch skb->h. */ struct tlvtype_proc { int type; - int (*func)(struct sk_buff *skb, int offset); + bool (*func)(struct sk_buff *skb, int offset); }; /********************* @@ -112,11 +68,11 @@ struct tlvtype_proc { /* An unknown option is detected, decide what to do */ -static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) { switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ - return 1; + return true; case 1: /* drop packet */ break; @@ -129,21 +85,22 @@ static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) break; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); - return 0; + return false; } kfree_skb(skb); - return 0; + return false; } /* Parse tlv encoded option header (hop-by-hop or destination) */ -static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) +static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) { - struct tlvtype_proc *curr; + const struct tlvtype_proc *curr; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); int len = (skb_transport_header(skb)[1] + 1) << 3; + int padlen = 0; if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; @@ -153,13 +110,33 @@ static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) while (len > 0) { int optlen = nh[off + 1] + 2; + int i; switch (nh[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; + padlen++; + if (padlen > 7) + goto bad; break; case IPV6_TLV_PADN: + /* RFC 2460 states that the purpose of PadN is + * to align the containing header to multiples + * of 8. 7 is therefore the highest valid value. + * See also RFC 4942, Section 2.1.9.5. + */ + padlen += optlen; + if (padlen > 7) + goto bad; + /* RFC 4942 recommends receiving hosts to + * actively check PadN payload to contain + * only zeroes. + */ + for (i = 2; i < optlen; i++) { + if (nh[off + i] != 0) + goto bad; + } break; default: /* Other TLV code so scan list */ @@ -170,33 +147,35 @@ static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) /* type specific length/alignment checks will be performed in the func(). */ - if (curr->func(skb, off) == 0) - return 0; + if (curr->func(skb, off) == false) + return false; break; } } if (curr->type < 0) { if (ip6_tlvopt_unknown(skb, off) == 0) - return 0; + return false; } + padlen = 0; break; } off += optlen; len -= optlen; } + if (len == 0) - return 1; + return true; bad: kfree_skb(skb); - return 0; + return false; } /***************************** Destination options header. *****************************/ -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) -static int ipv6_dest_hao(struct sk_buff *skb, int optoff) +#if IS_ENABLED(CONFIG_IPV6_MIP6) +static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) { struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); @@ -221,7 +200,7 @@ static int ipv6_dest_hao(struct sk_buff *skb, int optoff) if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { LIMIT_NETDEBUG( - KERN_DEBUG "hao is not an unicast addr: " NIP6_FMT "\n", NIP6(hao->addr)); + KERN_DEBUG "hao is not an unicast addr: %pI6\n", &hao->addr); goto discard; } @@ -243,23 +222,23 @@ static int ipv6_dest_hao(struct sk_buff *skb, int optoff) if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; - ipv6_addr_copy(&tmp_addr, &ipv6h->saddr); - ipv6_addr_copy(&ipv6h->saddr, &hao->addr); - ipv6_addr_copy(&hao->addr, &tmp_addr); + tmp_addr = ipv6h->saddr; + ipv6h->saddr = hao->addr; + hao->addr = tmp_addr; if (skb->tstamp.tv64 == 0) __net_timestamp(skb); - return 1; + return true; discard: kfree_skb(skb); - return 0; + return false; } #endif -static struct tlvtype_proc tlvprocdestopt_lst[] = { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +static const struct tlvtype_proc tlvprocdestopt_lst[] = { +#if IS_ENABLED(CONFIG_IPV6_MIP6) { .type = IPV6_TLV_HAO, .func = ipv6_dest_hao, @@ -271,31 +250,29 @@ static struct tlvtype_proc tlvprocdestopt_lst[] = { static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; #endif - struct dst_entry *dst; + struct dst_entry *dst = skb_dst(skb); if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } opt->lastopt = opt->dst1 = skb_network_header_len(skb); -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif - dst = dst_clone(skb->dst); if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { - dst_release(dst); skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; opt = IP6CB(skb); -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; #else opt->nhoff = opt->dst1; @@ -303,8 +280,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) return 1; } - IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); - dst_release(dst); + IP6_INC_STATS_BH(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); return -1; } @@ -312,6 +289,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) Routing header. ********************************/ +/* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); @@ -321,19 +299,17 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; - int accept_source_route = ipv6_devconf.accept_source_route; + struct net *net = dev_net(skb->dev); + int accept_source_route = net->ipv6.devconf_all->accept_source_route; - idev = in6_dev_get(skb->dev); - if (idev) { - if (accept_source_route > idev->cnf.accept_source_route) - accept_source_route = idev->cnf.accept_source_route; - in6_dev_put(idev); - } + idev = __in6_dev_get(skb->dev); + if (idev && accept_source_route > idev->cnf.accept_source_route) + accept_source_route = idev->cnf.accept_source_route; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; @@ -343,7 +319,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -352,13 +328,13 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: /* Silently discard type 2 header unless it was * processed by own */ if (!addr) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -378,13 +354,13 @@ looped_back: } switch (hdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (accept_source_route < 0) goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; @@ -403,7 +379,7 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - @@ -417,7 +393,7 @@ looped_back: if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; @@ -435,18 +411,18 @@ looped_back: addr += i - 1; switch (hdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } - if (!ipv6_chk_home_addr(&init_net, addr)) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -458,30 +434,30 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } - ipv6_addr_copy(&daddr, addr); - ipv6_addr_copy(addr, &ipv6_hdr(skb)->daddr); - ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &daddr); + daddr = *addr; + *addr = ipv6_hdr(skb)->daddr; + ipv6_hdr(skb)->daddr = daddr; - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); ip6_route_input(skb); - if (skb->dst->error) { + if (skb_dst(skb)->error) { skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; } - if (skb->dst->dev->flags&IFF_LOOPBACK) { + if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - 0, skb->dev); + 0); kfree_skb(skb); return -1; } @@ -494,23 +470,23 @@ looped_back: return -1; unknown_rh: - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; } -static struct inet6_protocol rthdr_protocol = { +static const struct inet6_protocol rthdr_protocol = { .handler = ipv6_rthdr_rcv, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, + .flags = INET6_PROTO_NOPOLICY, }; -static struct inet6_protocol destopt_protocol = { +static const struct inet6_protocol destopt_protocol = { .handler = ipv6_destopt_rcv, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, + .flags = INET6_PROTO_NOPOLICY, }; -static struct inet6_protocol nodata_protocol = { +static const struct inet6_protocol nodata_protocol = { .handler = dst_discard, .flags = INET6_PROTO_NOPOLICY, }; @@ -533,10 +509,10 @@ int __init ipv6_exthdrs_init(void) out: return ret; -out_rthdr: - inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); out_destopt: inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); +out_rthdr: + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); goto out; }; @@ -552,72 +528,82 @@ void ipv6_exthdrs_exit(void) **********************************/ /* - * Note: we cannot rely on skb->dst before we assign it in ip6_route_input(). + * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). */ static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb) { - return skb->dst ? ip6_dst_idev(skb->dst) : __in6_dev_get(skb->dev); + return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev); +} + +static inline struct net *ipv6_skb_net(struct sk_buff *skb) +{ + return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); } /* Router Alert as of RFC 2711 */ -static int ipv6_hop_ra(struct sk_buff *skb, int optoff) +static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { - IP6CB(skb)->ra = optoff; - return 1; + IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; + memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); + return true; } LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", nh[optoff + 1]); kfree_skb(skb); - return 0; + return false; } /* Jumbo payload */ -static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) +static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); + struct net *net = ipv6_skb_net(skb); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); - IP6_INC_STATS_BH(ipv6_skb_idev(skb), + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { - IP6_INC_STATS_BH(ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); - return 0; + return false; } if (ipv6_hdr(skb)->payload_len) { - IP6_INC_STATS_BH(ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); - return 0; + return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { - IP6_INC_STATS_BH(ipv6_skb_idev(skb), IPSTATS_MIB_INTRUNCATEDPKTS); + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; - return 1; + return true; drop: kfree_skb(skb); - return 0; + return false; } -static struct tlvtype_proc tlvprochopopt_lst[] = { +static const struct tlvtype_proc tlvprochopopt_lst[] = { { .type = IPV6_TLV_ROUTERALERT, .func = ipv6_hop_ra, @@ -684,7 +670,7 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); - ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = *proto; @@ -716,7 +702,6 @@ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, if (opt->hopopt) ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } - EXPORT_SYMBOL(ipv6_push_nfrag_opts); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) @@ -732,20 +717,19 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); if (opt2) { - long dif = (char*)opt2 - (char*)opt; + long dif = (char *)opt2 - (char *)opt; memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) - *((char**)&opt2->hopopt) += dif; + *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) - *((char**)&opt2->dst0opt) += dif; + *((char **)&opt2->dst0opt) += dif; if (opt2->dst1opt) - *((char**)&opt2->dst1opt) += dif; + *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) - *((char**)&opt2->srcrt) += dif; + *((char **)&opt2->srcrt) += dif; } return opt2; } - EXPORT_SYMBOL_GPL(ipv6_dup_options); static int ipv6_renew_option(void *ohdr, @@ -758,14 +742,14 @@ static int ipv6_renew_option(void *ohdr, if (ohdr) { memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); *hdr = (struct ipv6_opt_hdr *)*p; - *p += CMSG_ALIGN(ipv6_optlen(*(struct ipv6_opt_hdr **)hdr)); + *p += CMSG_ALIGN(ipv6_optlen(*hdr)); } } else { if (newopt) { if (copy_from_user(*p, newopt, newoptlen)) return -EFAULT; *hdr = (struct ipv6_opt_hdr *)*p; - if (ipv6_optlen(*(struct ipv6_opt_hdr **)hdr) > newoptlen) + if (ipv6_optlen(*hdr) > newoptlen) return -EINVAL; *p += CMSG_ALIGN(newoptlen); } @@ -863,4 +847,28 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, return opt; } +EXPORT_SYMBOL_GPL(ipv6_fixup_options); +/** + * fl6_update_dst - update flowi destination address with info given + * by srcrt option, if any. + * + * @fl6: flowi6 for which daddr is to be updated + * @opt: struct ipv6_txoptions in which to look for srcrt opt + * @orig: copy of original daddr address if modified + * + * Returns NULL if no txoptions or no srcrt, otherwise returns orig + * and initial value of fl6->daddr set in orig + */ +struct in6_addr *fl6_update_dst(struct flowi6 *fl6, + const struct ipv6_txoptions *opt, + struct in6_addr *orig) +{ + if (!opt || !opt->srcrt) + return NULL; + + *orig = fl6->daddr; + fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; + return orig; +} +EXPORT_SYMBOL_GPL(fl6_update_dst); diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index e1caa5d526c..8af3eb57f43 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -2,24 +2,26 @@ * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ +#include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ -int ipv6_ext_hdr(u8 nexthdr) +bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ - return ( (nexthdr == NEXTHDR_HOP) || + return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || - (nexthdr == NEXTHDR_DEST) ); + (nexthdr == NEXTHDR_DEST); } +EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. @@ -56,6 +58,9 @@ int ipv6_ext_hdr(u8 nexthdr) * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. + * - Reports the offset field of the final fragment header so it is + * possible to tell whether this is a first fragment, later fragment, + * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. @@ -63,10 +68,13 @@ int ipv6_ext_hdr(u8 nexthdr) * --ANK (980726) */ -int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, + __be16 *frag_offp) { u8 nexthdr = *nexthdrp; + *frag_offp = 0; + while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; @@ -86,7 +94,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) if (fp == NULL) return -1; - if (ntohs(*fp) & ~0x7) + *frag_offp = *fp; + if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) @@ -101,6 +110,172 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) *nexthdrp = nexthdr; return start; } - -EXPORT_SYMBOL(ipv6_ext_hdr); EXPORT_SYMBOL(ipv6_skip_exthdr); + +int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) +{ + const unsigned char *nh = skb_network_header(skb); + int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); + struct ipv6_opt_hdr *hdr; + int len; + + if (offset + 2 > packet_len) + goto bad; + hdr = (struct ipv6_opt_hdr *)(nh + offset); + len = ((hdr->hdrlen + 1) << 3); + + if (offset + len > packet_len) + goto bad; + + offset += 2; + len -= 2; + + while (len > 0) { + int opttype = nh[offset]; + int optlen; + + if (opttype == type) + return offset; + + switch (opttype) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + default: + optlen = nh[offset + 1] + 2; + if (optlen > len) + goto bad; + break; + } + offset += optlen; + len -= optlen; + } + /* not_found */ + bad: + return -1; +} +EXPORT_SYMBOL_GPL(ipv6_find_tlv); + +/* + * find the offset to specified header or the protocol number of last header + * if target < 0. "last header" is transport protocol header, ESP, or + * "No next header". + * + * Note that *offset is used as input/output parameter. an if it is not zero, + * then it must be a valid offset to an inner IPv6 header. This can be used + * to explore inner IPv6 header, eg. ICMPv6 error messages. + * + * If target header is found, its offset is set in *offset and return protocol + * number. Otherwise, return -1. + * + * If the first fragment doesn't contain the final protocol header or + * NEXTHDR_NONE it is considered invalid. + * + * Note that non-1st fragment is special case that "the protocol number + * of last header" is "next header" field in Fragment header. In this case, + * *offset is meaningless and fragment offset is stored in *fragoff if fragoff + * isn't NULL. + * + * if flags is not NULL and it's a fragment, then the frag flag + * IP6_FH_F_FRAG will be set. If it's an AH header, the + * IP6_FH_F_AUTH flag is set and target < 0, then this function will + * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this + * function will skip all those routing headers, where segements_left was 0. + */ +int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff, int *flags) +{ + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + unsigned int len; + bool found; + + if (fragoff) + *fragoff = 0; + + if (*offset) { + struct ipv6hdr _ip6, *ip6; + + ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); + if (!ip6 || (ip6->version != 6)) { + printk(KERN_ERR "IPv6 header not found\n"); + return -EBADMSG; + } + start = *offset + sizeof(struct ipv6hdr); + nexthdr = ip6->nexthdr; + } + len = skb->len - start; + + do { + struct ipv6_opt_hdr _hdr, *hp; + unsigned int hdrlen; + found = (nexthdr == target); + + if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { + if (target < 0 || found) + break; + return -ENOENT; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + return -EBADMSG; + + if (nexthdr == NEXTHDR_ROUTING) { + struct ipv6_rt_hdr _rh, *rh; + + rh = skb_header_pointer(skb, start, sizeof(_rh), + &_rh); + if (rh == NULL) + return -EBADMSG; + + if (flags && (*flags & IP6_FH_F_SKIP_RH) && + rh->segments_left == 0) + found = false; + } + + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off; + __be16 *fp; + + if (flags) /* Indicate that this is a fragment */ + *flags |= IP6_FH_F_FRAG; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -EBADMSG; + + _frag_off = ntohs(*fp) & ~0x7; + if (_frag_off) { + if (target < 0 && + ((!ipv6_ext_hdr(hp->nexthdr)) || + hp->nexthdr == NEXTHDR_NONE)) { + if (fragoff) + *fragoff = _frag_off; + return hp->nexthdr; + } + return -ENOENT; + } + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) + break; + hdrlen = (hp->hdrlen + 2) << 2; + } else + hdrlen = ipv6_optlen(hp); + + if (!found) { + nexthdr = hp->nexthdr; + len -= hdrlen; + start += hdrlen; + } + } while (!found); + + *offset = start; + return nexthdr; +} +EXPORT_SYMBOL(ipv6_find_hdr); + diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c new file mode 100644 index 00000000000..447a7fbd1bb --- /dev/null +++ b/net/ipv6/exthdrs_offload.c @@ -0,0 +1,41 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * IPV6 Extension Header GSO/GRO support + */ +#include <net/protocol.h> +#include "ip6_offload.h" + +static const struct net_offload rthdr_offload = { + .flags = INET6_PROTO_GSO_EXTHDR, +}; + +static const struct net_offload dstopt_offload = { + .flags = INET6_PROTO_GSO_EXTHDR, +}; + +int __init ipv6_exthdrs_offload_init(void) +{ + int ret; + + ret = inet6_add_offload(&rthdr_offload, IPPROTO_ROUTING); + if (ret) + goto out; + + ret = inet6_add_offload(&dstopt_offload, IPPROTO_DSTOPTS); + if (ret) + goto out_rt; + +out: + return ret; + +out_rt: + inet_del_offload(&rthdr_offload, IPPROTO_ROUTING); + goto out; +} diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 695c0ca8a41..b4d5e1d97c1 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -14,6 +14,7 @@ */ #include <linux/netdevice.h> +#include <linux/export.h> #include <net/fib_rules.h> #include <net/ipv6.h> @@ -21,61 +22,67 @@ #include <net/ip6_route.h> #include <net/netlink.h> -struct fib6_rule -{ +struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; u8 tclass; }; -static struct fib_rules_ops fib6_rules_ops; - -struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags, - pol_lookup_t lookup) +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + int flags, pol_lookup_t lookup) { struct fib_lookup_arg arg = { .lookup_ptr = lookup, + .flags = FIB_LOOKUP_NOREF, }; - fib_rules_lookup(&fib6_rules_ops, fl, flags, &arg); - if (arg.rule) - fib_rule_put(arg.rule); + fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); if (arg.result) return arg.result; - dst_hold(&ip6_null_entry.u.dst); - return &ip6_null_entry.u.dst; + dst_hold(&net->ipv6.ip6_null_entry->dst); + return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { + struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; + struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; + int err = 0; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: - rt = &ip6_null_entry; + err = -ENETUNREACH; + rt = net->ipv6.ip6_null_entry; goto discard_pkt; default: case FR_ACT_BLACKHOLE: - rt = &ip6_blk_hole_entry; + err = -EINVAL; + rt = net->ipv6.ip6_blk_hole_entry; goto discard_pkt; case FR_ACT_PROHIBIT: - rt = &ip6_prohibit_entry; + err = -EACCES; + rt = net->ipv6.ip6_prohibit_entry; goto discard_pkt; } - table = fib6_get_table(rule->table); - if (table) - rt = lookup(table, flp, flags); + table = fib6_get_table(net, rule->table); + if (!table) { + err = -EAGAIN; + goto out; + } - if (rt != &ip6_null_entry) { + rt = lookup(net, table, flp6, flags); + if (rt != net->ipv6.ip6_null_entry) { struct fib6_rule *r = (struct fib6_rule *)rule; /* @@ -85,35 +92,67 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, if ((rule->flags & FIB_RULE_FIND_SADDR) && r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { struct in6_addr saddr; - if (ipv6_get_saddr(&rt->u.dst, &flp->fl6_dst, - &saddr)) + + if (ipv6_dev_get_saddr(net, + ip6_dst_idev(&rt->dst)->dev, + &flp6->daddr, + rt6_flags2srcprefs(flags), + &saddr)) goto again; if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) goto again; - ipv6_addr_copy(&flp->fl6_src, &saddr); + flp6->saddr = saddr; } goto out; } again: - dst_release(&rt->u.dst); + ip6_rt_put(rt); + err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); out: arg->result = rt; - return rt == NULL ? -EAGAIN : 0; + return err; } +static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + struct rt6_info *rt = (struct rt6_info *) arg->result; + struct net_device *dev = NULL; + + if (rt->rt6i_idev) + dev = rt->rt6i_idev->dev; + + /* do not accept result if the route does + * not meet the required prefix length + */ + if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + + return false; + +suppress_route: + ip6_rt_put(rt); + return true; +} static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; + struct flowi6 *fl6 = &fl->u.ip6; if (r->dst.plen && - !ipv6_prefix_equal(&fl->fl6_dst, &r->dst.addr, r->dst.plen)) + !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) return 0; /* @@ -123,14 +162,14 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) */ if (r->src.plen) { if (flags & RT6_LOOKUP_F_HAS_SADDR) { - if (!ipv6_prefix_equal(&fl->fl6_src, &r->src.addr, + if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, r->src.plen)) return 0; } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) return 0; } - if (r->tclass && r->tclass != ((ntohl(fl->fl6_flowlabel) >> 20) & 0xff)) + if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel)) return 0; return 1; @@ -141,17 +180,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = { }; static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_rule_hdr *frh, + struct fib_rule_hdr *frh, struct nlattr **tb) { int err = -EINVAL; + struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (rule->action == FR_ACT_TO_TBL) { if (rule->table == RT6_TABLE_UNSPEC) goto errout; - if (fib6_new_table(rule->table) == NULL) { + if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; goto errout; } @@ -200,23 +240,21 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, } static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_rule_hdr *frh) + struct fib_rule_hdr *frh) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; - frh->family = AF_INET6; frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; frh->tos = rule6->tclass; - if (rule6->dst.plen) - NLA_PUT(skb, FRA_DST, sizeof(struct in6_addr), - &rule6->dst.addr); - - if (rule6->src.plen) - NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr), - &rule6->src.addr); - + if ((rule6->dst.plen && + nla_put(skb, FRA_DST, sizeof(struct in6_addr), + &rule6->dst.addr)) || + (rule6->src.plen && + nla_put(skb, FRA_SRC, sizeof(struct in6_addr), + &rule6->src.addr))) + goto nla_put_failure; return 0; nla_put_failure: @@ -234,12 +272,13 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(16); /* src */ } -static struct fib_rules_ops fib6_rules_ops = { +static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, + .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, @@ -247,45 +286,56 @@ static struct fib_rules_ops fib6_rules_ops = { .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, - .rules_list = LIST_HEAD_INIT(fib6_rules_ops.rules_list), .owner = THIS_MODULE, .fro_net = &init_net, }; -static int __init fib6_default_rules_init(void) +static int __net_init fib6_rules_net_init(struct net *net) { - int err; - - err = fib_default_rule_add(&fib6_rules_ops, 0, - RT6_TABLE_LOCAL, FIB_RULE_PERMANENT); - if (err < 0) - return err; - err = fib_default_rule_add(&fib6_rules_ops, 0x7FFE, RT6_TABLE_MAIN, 0); - if (err < 0) - return err; - return 0; -} + struct fib_rules_ops *ops; + int err = -ENOMEM; -int __init fib6_rules_init(void) -{ - int ret; + ops = fib_rules_register(&fib6_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv6.fib6_rules_ops = ops; - ret = fib6_default_rules_init(); - if (ret) - goto out; - ret = fib_rules_register(&fib6_rules_ops); - if (ret) - goto out_default_rules_init; + err = fib_default_rule_add(net->ipv6.fib6_rules_ops, 0, + RT6_TABLE_LOCAL, 0); + if (err) + goto out_fib6_rules_ops; + + err = fib_default_rule_add(net->ipv6.fib6_rules_ops, + 0x7FFE, RT6_TABLE_MAIN, 0); + if (err) + goto out_fib6_rules_ops; + out: - return ret; + return err; -out_default_rules_init: - fib_rules_cleanup_ops(&fib6_rules_ops); +out_fib6_rules_ops: + fib_rules_unregister(ops); goto out; } +static void __net_exit fib6_rules_net_exit(struct net *net) +{ + fib_rules_unregister(net->ipv6.fib6_rules_ops); +} + +static struct pernet_operations fib6_rules_net_ops = { + .init = fib6_rules_net_init, + .exit = fib6_rules_net_exit, +}; + +int __init fib6_rules_init(void) +{ + return register_pernet_subsys(&fib6_rules_net_ops); +} + + void fib6_rules_cleanup(void) { - fib_rules_unregister(&fib6_rules_ops); + unregister_pernet_subsys(&fib6_rules_net_ops); } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f204a7275a0..f6c84a6eb23 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,8 +5,6 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.38 2002/02/08 03:57:19 davem Exp $ - * * Based on net/ipv4/icmp.c * * RFC 1885 @@ -31,6 +29,8 @@ * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -42,6 +42,7 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/netfilter.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -56,6 +57,7 @@ #include <net/ipv6.h> #include <net/ip6_checksum.h> +#include <net/ping.h> #include <net/protocol.h> #include <net/raw.h> #include <net/rawv6.h> @@ -64,14 +66,10 @@ #include <net/addrconf.h> #include <net/icmp.h> #include <net/xfrm.h> +#include <net/inet_common.h> +#include <net/dsfield.h> #include <asm/uaccess.h> -#include <asm/system.h> - -DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; -EXPORT_SYMBOL(icmpv6_statistics); -DEFINE_SNMP_STAT(struct icmpv6msg_mib, icmpv6msg_statistics) __read_mostly; -EXPORT_SYMBOL(icmpv6msg_statistics); /* * The ICMP socket(s). This is the most convenient way to flow control @@ -80,43 +78,57 @@ EXPORT_SYMBOL(icmpv6msg_statistics); * * On SMP we have one ICMP socket per-cpu. */ -static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL; -#define icmpv6_socket __get_cpu_var(__icmpv6_socket) +static inline struct sock *icmpv6_sk(struct net *net) +{ + return net->ipv6.icmp_sk[smp_processor_id()]; +} + +static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ + struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); + struct net *net = dev_net(skb->dev); + + if (type == ICMPV6_PKT_TOOBIG) + ip6_update_pmtu(skb, net, info, 0, 0); + else if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + + if (!(type & ICMPV6_INFOMSG_MASK)) + if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) + ping_err(skb, offset, info); +} static int icmpv6_rcv(struct sk_buff *skb); -static struct inet6_protocol icmpv6_protocol = { +static const struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, + .err_handler = icmpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; -static __inline__ int icmpv6_xmit_lock(void) +static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) { + struct sock *sk; + local_bh_disable(); - if (unlikely(!spin_trylock(&icmpv6_socket->sk->sk_lock.slock))) { + sk = icmpv6_sk(net); + if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. */ local_bh_enable(); - return 1; + return NULL; } - return 0; + return sk; } -static __inline__ void icmpv6_xmit_unlock(void) -{ - spin_unlock_bh(&icmpv6_socket->sk->sk_lock.slock); -} - -/* - * Slightly more convenient version of icmpv6_send. - */ -void icmpv6_param_prob(struct sk_buff *skb, int code, int pos) +static __inline__ void icmpv6_xmit_unlock(struct sock *sk) { - icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); - kfree_skb(skb); + spin_unlock_bh(&sk->sk_lock.slock); } /* @@ -130,18 +142,19 @@ void icmpv6_param_prob(struct sk_buff *skb, int code, int pos) * --ANK (980726) */ -static int is_ineligible(struct sk_buff *skb) +static bool is_ineligible(const struct sk_buff *skb) { int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; int len = skb->len - ptr; __u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; if (len < 0) - return 1; + return true; - ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr); + ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off); if (ptr < 0) - return 0; + return false; if (nexthdr == IPPROTO_ICMPV6) { u8 _type, *tp; tp = skb_header_pointer(skb, @@ -149,48 +162,53 @@ static int is_ineligible(struct sk_buff *skb) sizeof(_type), &_type); if (tp == NULL || !(*tp & ICMPV6_INFOMSG_MASK)) - return 1; + return true; } - return 0; + return false; } /* * Check the ICMP output rate limit */ -static inline int icmpv6_xrlim_allow(struct sock *sk, int type, - struct flowi *fl) +static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) { struct dst_entry *dst; - int res = 0; + struct net *net = sock_net(sk); + bool res = false; /* Informational messages are not limited. */ if (type & ICMPV6_INFOMSG_MASK) - return 1; + return true; /* Do not limit pmtu discovery, it would break it. */ if (type == ICMPV6_PKT_TOOBIG) - return 1; + return true; /* * Look up the output route. * XXX: perhaps the expire for routing entries cloned by * this lookup should be more aggressive (not longer than timeout). */ - dst = ip6_route_output(sk, fl); + dst = ip6_route_output(net, sk, fl6); if (dst->error) { - IP6_INC_STATS(ip6_dst_idev(dst), + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { - res = 1; + res = true; } else { struct rt6_info *rt = (struct rt6_info *)dst; - int tmo = init_net.ipv6.sysctl.icmpv6_time; + int tmo = net->ipv6.sysctl.icmpv6_time; + struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - res = xrlim_allow(dst, tmo); + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); } dst_release(dst); return res; @@ -203,18 +221,19 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type, * highest-order two bits set to 10 */ -static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) +static bool opt_unrec(struct sk_buff *skb, __u32 offset) { u8 _optval, *op; offset += skb_network_offset(skb); op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); if (op == NULL) - return 1; + return true; return (*op & 0xC0) == 0x80; } -static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len) +int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; @@ -228,11 +247,11 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6h->icmp6_cksum = 0; if (skb_queue_len(&sk->sk_write_queue) == 1) { - skb->csum = csum_partial((char *)icmp6h, + skb->csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), skb->csum); - icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src, - &fl->fl6_dst, - len, fl->proto, + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, skb->csum); } else { __wsum tmp_csum = 0; @@ -241,11 +260,11 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct tmp_csum = csum_add(tmp_csum, skb->csum); } - tmp_csum = csum_partial((char *)icmp6h, + tmp_csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), tmp_csum); - icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src, - &fl->fl6_dst, - len, fl->proto, + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, tmp_csum); } ip6_push_pending_frames(sk); @@ -273,7 +292,7 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st return 0; } -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) static void mip6_addr_swap(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); @@ -287,9 +306,9 @@ static void mip6_addr_swap(struct sk_buff *skb) if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); - ipv6_addr_copy(&tmp, &iph->saddr); - ipv6_addr_copy(&iph->saddr, &hao->addr); - ipv6_addr_copy(&hao->addr, &tmp); + tmp = iph->saddr; + iph->saddr = hao->addr; + hao->addr = tmp; } } } @@ -297,42 +316,106 @@ static void mip6_addr_swap(struct sk_buff *skb) static inline void mip6_addr_swap(struct sk_buff *skb) {} #endif +static struct dst_entry *icmpv6_route_lookup(struct net *net, + struct sk_buff *skb, + struct sock *sk, + struct flowi6 *fl6) +{ + struct dst_entry *dst, *dst2; + struct flowi6 fl2; + int err; + + err = ip6_dst_lookup(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + + /* + * We won't send icmp if the destination is known + * anycast. + */ + if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: acast source\n"); + dst_release(dst); + return ERR_PTR(-EINVAL); + } + + /* No need to clone since we're just using its address. */ + dst2 = dst; + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0); + if (!IS_ERR(dst)) { + if (dst != dst2) + return dst; + } else { + if (PTR_ERR(dst) == -EPERM) + dst = NULL; + else + return dst; + } + + err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6); + if (err) + goto relookup_failed; + + err = ip6_dst_lookup(sk, &dst2, &fl2); + if (err) + goto relookup_failed; + + dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP); + if (!IS_ERR(dst2)) { + dst_release(dst); + dst = dst2; + } else { + err = PTR_ERR(dst2); + if (err == -EPERM) { + dst_release(dst); + return dst2; + } else + goto relookup_failed; + } + +relookup_failed: + if (dst) + return dst; + return ERR_PTR(err); +} + /* * Send an ICMP message in response to a packet in error */ -void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, - struct net_device *dev) +static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { + struct net *net = dev_net(skb->dev); struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; struct ipv6_pinfo *np; - struct in6_addr *saddr = NULL; + const struct in6_addr *saddr = NULL; struct dst_entry *dst; - struct dst_entry *dst2; struct icmp6hdr tmp_hdr; - struct flowi fl; - struct flowi fl2; + struct flowi6 fl6; struct icmpv6_msg msg; int iif = 0; int addr_type = 0; int len; - int hlimit, tclass; + int hlimit; int err = 0; + u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || - (skb->network_header + sizeof(*hdr)) > skb->tail) + (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) - * Rule (e.1) is enforced by not using icmpv6_send + * Rule (e.1) is enforced by not using icmp6_send * in any code that processes icmp errors. */ addr_type = ipv6_addr_type(&hdr->daddr); - if (ipv6_chk_addr(&init_net, &hdr->daddr, skb->dev, 0)) + if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) || + ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr)) saddr = &hdr->daddr; /* @@ -355,7 +438,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, * Source addr check */ - if (addr_type & IPV6_ADDR_LINKLOCAL) + if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; /* @@ -365,7 +448,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: addr_any/mcast source\n"); return; } @@ -373,29 +456,30 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: no reply to icmp error\n"); return; } mip6_addr_swap(skb); - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_ICMPV6; - ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr); + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.daddr = hdr->saddr; if (saddr) - ipv6_addr_copy(&fl.fl6_src, saddr); - fl.oif = iif; - fl.fl_icmp_type = type; - fl.fl_icmp_code = code; - security_skb_classify_flow(skb, &fl); - - if (icmpv6_xmit_lock()) + fl6.saddr = *saddr; + fl6.flowi6_mark = mark; + fl6.flowi6_oif = iif; + fl6.fl6_icmp_type = type; + fl6.fl6_icmp_code = code; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + sk = icmpv6_xmit_lock(net); + if (sk == NULL) return; - - sk = icmpv6_socket->sk; + sk->sk_mark = mark; np = inet6_sk(sk); - if (!icmpv6_xrlim_allow(sk, type, &fl)) + if (!icmpv6_xrlim_allow(sk, type, &fl6)) goto out; tmp_hdr.icmp6_type = type; @@ -403,70 +487,16 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, tmp_hdr.icmp6_cksum = 0; tmp_hdr.icmp6_pointer = htonl(info); - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) - fl.oif = np->mcast_oif; + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) + dst = icmpv6_route_lookup(net, skb, sk, &fl6); + if (IS_ERR(dst)) goto out; - /* - * We won't send icmp if the destination is known - * anycast. - */ - if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); - goto out_dst_release; - } - - /* No need to clone since we're just using its address. */ - dst2 = dst; - - err = xfrm_lookup(&dst, &fl, sk, 0); - switch (err) { - case 0: - if (dst != dst2) - goto route_done; - break; - case -EPERM: - dst = NULL; - break; - default: - goto out; - } - - if (xfrm_decode_session_reverse(skb, &fl2, AF_INET6)) - goto out_dst_release; - - if (ip6_dst_lookup(sk, &dst2, &fl)) - goto out_dst_release; - - err = xfrm_lookup(&dst2, &fl, sk, XFRM_LOOKUP_ICMP); - if (err == -ENOENT) { - if (!dst) - goto out; - goto route_done; - } - - dst_release(dst); - dst = dst2; - - if (err) - goto out; - -route_done: - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = dst_metric(dst, RTAX_HOPLIMIT); - if (hlimit < 0) - hlimit = ipv6_get_hoplimit(dst->dev); - - tclass = np->tclass; - if (tclass < 0) - tclass = 0; + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -479,119 +509,122 @@ route_done: goto out_dst_release; } - idev = in6_dev_get(skb->dev); + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); err = ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), - hlimit, tclass, NULL, &fl, (struct rt6_info*)dst, - MSG_DONTWAIT); + sizeof(struct icmp6hdr), hlimit, + np->tclass, NULL, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, np->dontfrag); if (err) { + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); - goto out_put; + } else { + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); } - err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr)); - -out_put: - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); out_dst_release: dst_release(dst); out: - icmpv6_xmit_unlock(); + icmpv6_xmit_unlock(sk); } -EXPORT_SYMBOL(icmpv6_send); +/* Slightly more convenient version of icmp6_send. + */ +void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos); + kfree_skb(skb); +} static void icmpv6_echo_reply(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; - struct in6_addr *saddr = NULL; + const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); struct icmp6hdr tmp_hdr; - struct flowi fl; + struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; int err = 0; int hlimit; - int tclass; + u8 tclass; + u32 mark = IP6_REPLY_MARK(net, skb->mark); saddr = &ipv6_hdr(skb)->daddr; - if (!ipv6_unicast_destination(skb)) + if (!ipv6_unicast_destination(skb) && + !(net->ipv6.sysctl.anycast_src_echo_reply && + ipv6_anycast_destination(skb))) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_ICMPV6; - ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) - ipv6_addr_copy(&fl.fl6_src, saddr); - fl.oif = skb->dev->ifindex; - fl.fl_icmp_type = ICMPV6_ECHO_REPLY; - security_skb_classify_flow(skb, &fl); - - if (icmpv6_xmit_lock()) + fl6.saddr = *saddr; + fl6.flowi6_oif = skb->dev->ifindex; + fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + fl6.flowi6_mark = mark; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + sk = icmpv6_xmit_lock(net); + if (sk == NULL) return; - - sk = icmpv6_socket->sk; + sk->sk_mark = mark; np = inet6_sk(sk); - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) - fl.oif = np->mcast_oif; + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(sk, &dst, &fl); + err = ip6_dst_lookup(sk, &dst, &fl6); if (err) goto out; - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); + if (IS_ERR(dst)) goto out; - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = dst_metric(dst, RTAX_HOPLIMIT); - if (hlimit < 0) - hlimit = ipv6_get_hoplimit(dst->dev); - - tclass = np->tclass; - if (tclass < 0) - tclass = 0; + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - idev = in6_dev_get(skb->dev); + idev = __in6_dev_get(skb->dev); msg.skb = skb; msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; + tclass = ipv6_get_dsfield(ipv6_hdr(skb)); err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl, - (struct rt6_info*)dst, MSG_DONTWAIT); + sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6, + (struct rt6_info *)dst, MSG_DONTWAIT, + np->dontfrag); if (err) { + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); - goto out_put; + } else { + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); } - err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); - -out_put: - if (likely(idev != NULL)) - in6_dev_put(idev); dst_release(dst); out: - icmpv6_xmit_unlock(); + icmpv6_xmit_unlock(sk); } -static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) +void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { - struct inet6_protocol *ipprot; + const struct inet6_protocol *ipprot; int inner_offset; - int hash; + __be16 frag_off; u8 nexthdr; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) @@ -600,7 +633,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { /* now skip over extension headers */ - inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); if (inner_offset<0) return; } else { @@ -618,10 +652,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, __be32 info) --ANK (980726) */ - hash = nexthdr & (MAX_INET_PROTOS - 1); - rcu_read_lock(); - ipprot = rcu_dereference(inet6_protos[hash]); + ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, NULL, type, code, inner_offset, info); rcu_read_unlock(); @@ -637,19 +669,19 @@ static int icmpv6_rcv(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); - struct in6_addr *saddr, *daddr; - struct ipv6hdr *orig_hdr; + const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; - int type; + u8 type; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + struct sec_path *sp = skb_sec_path(skb); int nh; - if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags & + if (!(sp && sp->xvec[sp->len - 1]->props.flags & XFRM_STATE_ICMP)) goto drop_no_count; - if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(*orig_hdr))) + if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr))) goto drop_no_count; nh = skb_network_offset(skb); @@ -661,26 +693,16 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INMSGS); + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; - /* Perform checksum. */ - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len, - IPPROTO_ICMPV6, 0)); - if (__skb_checksum_complete(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [" NIP6_FMT " > " NIP6_FMT "]\n", - NIP6(*saddr), NIP6(*daddr)); - goto discard_it; - } + if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ICMPv6 checksum failed [%pI6c > %pI6c]\n", + saddr, daddr); + goto csum_error; } if (!pskb_pull(skb, sizeof(*hdr))) @@ -690,7 +712,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS_BH(idev, type); + ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -698,7 +720,7 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - /* we couldn't care less */ + ping_rcv(skb); break; case ICMPV6_PKT_TOOBIG: @@ -710,9 +732,6 @@ static int icmpv6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto discard_it; hdr = icmp6_hdr(skb); - orig_hdr = (struct ipv6hdr *) (hdr + 1); - rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, - ntohl(hdr->icmp6_mtu)); /* * Drop through to notify @@ -768,40 +787,61 @@ static int icmpv6_rcv(struct sk_buff *skb) kfree_skb(skb); return 0; +csum_error: + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS); + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb(skb); return 0; } +void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, + u8 type, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int oif) +{ + memset(fl6, 0, sizeof(*fl6)); + fl6->saddr = *saddr; + fl6->daddr = *daddr; + fl6->flowi6_proto = IPPROTO_ICMPV6; + fl6->fl6_icmp_type = type; + fl6->fl6_icmp_code = 0; + fl6->flowi6_oif = oif; + security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); +} + /* - * Special lock-class for __icmpv6_socket: + * Special lock-class for __icmpv6_sk: */ static struct lock_class_key icmpv6_socket_sk_dst_lock_key; -int __init icmpv6_init(struct net_proto_family *ops) +static int __net_init icmpv6_sk_init(struct net *net) { struct sock *sk; int err, i, j; + net->ipv6.icmp_sk = + kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); + if (net->ipv6.icmp_sk == NULL) + return -ENOMEM; + for_each_possible_cpu(i) { - err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, - &per_cpu(__icmpv6_socket, i)); + err = inet_ctl_sock_create(&sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - printk(KERN_ERR - "Failed to initialize the ICMP6 control socket " - "(err %d).\n", + pr_err("Failed to initialize the ICMP6 control socket (err %d)\n", err); goto fail; } - sk = per_cpu(__icmpv6_socket, i)->sk; - sk->sk_allocation = GFP_ATOMIC; + net->ipv6.icmp_sk[i] = sk; + /* * Split off their lock-class, because sk->sk_dst_lock * gets used from softirqs, which is safe for - * __icmpv6_socket (because those never get directly used + * __icmpv6_sk (because those never get directly used * via userspace syscalls), but unsafe for normal sockets. */ lockdep_set_class(&sk->sk_dst_lock, @@ -810,41 +850,65 @@ int __init icmpv6_init(struct net_proto_family *ops) /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ - sk->sk_sndbuf = - (2 * ((64 * 1024) + sizeof(struct sk_buff))); + sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); + } + return 0; + + fail: + for (j = 0; j < i; j++) + inet_ctl_sock_destroy(net->ipv6.icmp_sk[j]); + kfree(net->ipv6.icmp_sk); + return err; +} - sk->sk_prot->unhash(sk); +static void __net_exit icmpv6_sk_exit(struct net *net) +{ + int i; + + for_each_possible_cpu(i) { + inet_ctl_sock_destroy(net->ipv6.icmp_sk[i]); } + kfree(net->ipv6.icmp_sk); +} + +static struct pernet_operations icmpv6_sk_ops = { + .init = icmpv6_sk_init, + .exit = icmpv6_sk_exit, +}; +int __init icmpv6_init(void) +{ + int err; - if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) { - printk(KERN_ERR "Failed to register ICMP6 protocol\n"); - err = -EAGAIN; + err = register_pernet_subsys(&icmpv6_sk_ops); + if (err < 0) + return err; + + err = -EAGAIN; + if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) goto fail; - } + err = inet6_register_icmp_sender(icmp6_send); + if (err) + goto sender_reg_err; return 0; - fail: - for (j = 0; j < i; j++) { - if (!cpu_possible(j)) - continue; - sock_release(per_cpu(__icmpv6_socket, j)); - } - +sender_reg_err: + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); +fail: + pr_err("Failed to register ICMP6 protocol\n"); + unregister_pernet_subsys(&icmpv6_sk_ops); return err; } void icmpv6_cleanup(void) { - int i; - - for_each_possible_cpu(i) { - sock_release(per_cpu(__icmpv6_socket, i)); - } + inet6_unregister_icmp_sender(icmp6_send); + unregister_pernet_subsys(&icmpv6_sk_ops); inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } + static const struct icmp6_err { int err; int fatal; @@ -869,9 +933,17 @@ static const struct icmp6_err { .err = ECONNREFUSED, .fatal = 1, }, + { /* POLICY_FAIL */ + .err = EACCES, + .fatal = 1, + }, + { /* REJECT_ROUTE */ + .err = EACCES, + .fatal = 1, + }, }; -int icmpv6_err_convert(int type, int code, int *err) +int icmpv6_err_convert(u8 type, u8 code, int *err) { int fatal = 0; @@ -880,7 +952,7 @@ int icmpv6_err_convert(int type, int code, int *err) switch (type) { case ICMPV6_DEST_UNREACH: fatal = 1; - if (code <= ICMPV6_PORT_UNREACH) { + if (code < ARRAY_SIZE(tab_unreach)) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; } @@ -902,29 +974,31 @@ int icmpv6_err_convert(int type, int code, int *err) return fatal; } - EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL -ctl_table ipv6_icmp_table_template[] = { +static struct ctl_table ipv6_icmp_table_template[] = { { - .ctl_name = NET_IPV6_ICMP_RATELIMIT, .procname = "ratelimit", .data = &init_net.ipv6.sysctl.icmpv6_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec_ms_jiffies, }, - { .ctl_name = 0 }, + { }, }; -struct ctl_table *ipv6_icmp_sysctl_init(struct net *net) +struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_icmp_table_template, sizeof(ipv6_icmp_table_template), GFP_KERNEL); + + if (table) + table[0].data = &net->ipv6.sysctl.icmpv6_time; + return table; } #endif diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 78de42ada84..a245e5ddffb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -17,6 +17,7 @@ #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/jhash.h> +#include <linux/slab.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> @@ -27,46 +28,87 @@ #include <net/inet6_connection_sock.h> int inet6_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) + const struct inet_bind_bucket *tb, bool relax) { const struct sock *sk2; - const struct hlist_node *node; + int reuse = sk->sk_reuse; + int reuseport = sk->sk_reuseport; + kuid_t uid = sock_i_uid((struct sock *)sk); /* We must walk the whole port owner list in this case. -DaveM */ - sk_for_each_bound(sk2, node, &tb->owners) { + /* + * See comment in inet_csk_bind_conflict about sock lookup + * vs net namespaces issues. + */ + sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if ((!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + (!reuseport || !sk2->sk_reuseport || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, + sock_i_uid((struct sock *)sk2))))) { + if (ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + if (!relax && reuse && sk2->sk_reuse && + sk2->sk_state != TCP_LISTEN && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } } - return node != NULL; + return sk2 != NULL; } EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); +struct dst_entry *inet6_csk_route_req(struct sock *sk, + struct flowi6 *fl6, + const struct request_sock *req) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *final_p, final; + struct dst_entry *dst; + + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = IPPROTO_TCP; + fl6->daddr = ireq->ir_v6_rmt_addr; + final_p = fl6_update_dst(fl6, np->opt, &final); + fl6->saddr = ireq->ir_v6_loc_addr; + fl6->flowi6_oif = ireq->ir_iif; + fl6->flowi6_mark = ireq->ir_mark; + fl6->fl6_dport = ireq->ir_rmt_port; + fl6->fl6_sport = htons(ireq->ir_num); + security_req_classify_flow(req, flowi6_to_flowi(fl6)); + + dst = ip6_dst_lookup_flow(sk, fl6, final_p); + if (IS_ERR(dst)) + return NULL; + + return dst; +} + /* * request_sock (formerly open request) hash tables. */ static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u16 synq_hsize) + const u32 rnd, const u32 synq_hsize) { - u32 a = (__force u32)raddr->s6_addr32[0]; - u32 b = (__force u32)raddr->s6_addr32[1]; - u32 c = (__force u32)raddr->s6_addr32[2]; + u32 c; - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); + c = jhash_3words((__force u32)raddr->s6_addr32[0], + (__force u32)raddr->s6_addr32[1], + (__force u32)raddr->s6_addr32[2], + rnd); - a += (__force u32)raddr->s6_addr32[3]; - b += (__force u32)rport; - __jhash_mix(a, b, c); + c = jhash_2words((__force u32)raddr->s6_addr32[3], + (__force u32)rport, + c); return c & (synq_hsize - 1); } @@ -87,14 +129,14 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk, lopt->nr_table_entries)]; (req = *prev) != NULL; prev = &req->dl_next) { - const struct inet6_request_sock *treq = inet6_rsk(req); + const struct inet_request_sock *ireq = inet_rsk(req); - if (inet_rsk(req)->rmt_port == rport && + if (ireq->ir_rmt_port == rport && req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&treq->rmt_addr, raddr) && - ipv6_addr_equal(&treq->loc_addr, laddr) && - (!treq->iif || treq->iif == iif)) { - BUG_TRAP(req->sk == NULL); + ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) && + ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) && + (!ireq->ir_iif || ireq->ir_iif == iif)) { + WARN_ON(req->sk != NULL); *prevp = prev; return req; } @@ -111,8 +153,8 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, - inet_rsk(req)->rmt_port, + const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, + inet_rsk(req)->ir_rmt_port, lopt->hash_rnd, lopt->nr_table_entries); reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); @@ -123,113 +165,102 @@ EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) { - struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); - sin6->sin6_port = inet_sk(sk)->dport; + sin6->sin6_addr = sk->sk_v6_daddr; + sin6->sin6_port = inet_sk(sk)->inet_dport; /* We do not store received flowlabel for TCP */ sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + sk->sk_bound_dev_if); } EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, - struct in6_addr *daddr, struct in6_addr *saddr) + const struct in6_addr *daddr, + const struct in6_addr *saddr) { __ip6_dst_store(sk, dst, daddr, saddr); - -#ifdef CONFIG_XFRM - { - struct rt6_info *rt = (struct rt6_info *)dst; - rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); - } -#endif } static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { - struct dst_entry *dst; - - dst = __sk_dst_check(sk, cookie); - -#ifdef CONFIG_XFRM - if (dst) { - struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { - sk->sk_dst_cache = NULL; - dst_release(dst); - dst = NULL; - } - } -#endif - - return dst; + return __sk_dst_check(sk, cookie); } -int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) +static struct dst_entry *inet6_csk_route_socket(struct sock *sk, + struct flowi6 *fl6) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi fl; + struct in6_addr *final_p, final; struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; - - memset(&fl, 0, sizeof(fl)); - fl.proto = sk->sk_protocol; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_sport = inet->sport; - fl.fl_ip_dport = inet->dport; - security_sk_classify_flow(sk, &fl); - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - dst = __inet6_csk_dst_check(sk, np->dst_cookie); + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = sk->sk_protocol; + fl6->daddr = sk->sk_v6_daddr; + fl6->saddr = np->saddr; + fl6->flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl6->flowlabel); + fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_mark = sk->sk_mark; + fl6->fl6_sport = inet->inet_sport; + fl6->fl6_dport = inet->inet_dport; + security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - if (dst == NULL) { - int err = ip6_dst_lookup(sk, &dst, &fl); + final_p = fl6_update_dst(fl6, np->opt, &final); - if (err) { - sk->sk_err_soft = -err; - kfree_skb(skb); - return err; - } - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); + dst = __inet6_csk_dst_check(sk, np->dst_cookie); + if (!dst) { + dst = ip6_dst_lookup_flow(sk, fl6, final_p); - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_route_caps = 0; - kfree_skb(skb); - return err; - } + if (!IS_ERR(dst)) + __inet6_csk_dst_store(sk, dst, NULL, NULL); + } + return dst; +} - __inet6_csk_dst_store(sk, dst, NULL, NULL); +int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 fl6; + struct dst_entry *dst; + int res; + + dst = inet6_csk_route_socket(sk, &fl6); + if (IS_ERR(dst)) { + sk->sk_err_soft = -PTR_ERR(dst); + sk->sk_route_caps = 0; + kfree_skb(skb); + return PTR_ERR(dst); } - skb->dst = dst_clone(dst); + rcu_read_lock(); + skb_dst_set_noref(skb, dst); /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + fl6.daddr = sk->sk_v6_daddr; - return ip6_xmit(sk, skb, &fl, np->opt, 0); + res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_unlock(); + return res; } - EXPORT_SYMBOL_GPL(inet6_csk_xmit); + +struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct flowi6 fl6; + struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6); + + if (IS_ERR(dst)) + return NULL; + dst->ops->update_pmtu(dst, sk, NULL, mtu); + + dst = inet6_csk_route_socket(sk, &fl6); + return IS_ERR(dst) ? NULL : dst; +} +EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 99fd25f7f00..262e13c02ec 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -20,31 +20,75 @@ #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> +#include <net/secure_seq.h> #include <net/ip.h> -void __inet6_hash(struct sock *sk) +static unsigned int inet6_ehashfn(struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { - struct inet_hashinfo *hashinfo = sk->sk_prot->hashinfo; - struct hlist_head *list; - rwlock_t *lock; + static u32 inet6_ehash_secret __read_mostly; + static u32 ipv6_hash_secret __read_mostly; - BUG_TRAP(sk_unhashed(sk)); + u32 lhash, fhash; + + net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); + net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); + + lhash = (__force u32)laddr->s6_addr32[3]; + fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + + return __inet6_ehashfn(lhash, lport, fhash, fport, + inet6_ehash_secret + net_hash_mix(net)); +} + +static int inet6_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct in6_addr *laddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *faddr = &sk->sk_v6_daddr; + const __u16 lport = inet->inet_num; + const __be16 fport = inet->inet_dport; + struct net *net = sock_net(sk); + + return inet6_ehashfn(net, laddr, lport, faddr, fport); +} + +int __inet6_hash(struct sock *sk, struct inet_timewait_sock *tw) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + int twrefcnt = 0; + + WARN_ON(!sk_unhashed(sk)); if (sk->sk_state == TCP_LISTEN) { - list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &hashinfo->lhash_lock; - inet_listen_wlock(hashinfo); + struct inet_listen_hashbucket *ilb; + + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + spin_lock(&ilb->lock); + __sk_nulls_add_node_rcu(sk, &ilb->head); + spin_unlock(&ilb->lock); } else { unsigned int hash; + struct hlist_nulls_head *list; + spinlock_t *lock; + sk->sk_hash = hash = inet6_sk_ehashfn(sk); list = &inet_ehash_bucket(hashinfo, hash)->chain; lock = inet_ehash_lockp(hashinfo, hash); - write_lock(lock); + spin_lock(lock); + __sk_nulls_add_node_rcu(sk, list); + if (tw) { + WARN_ON(sk->sk_hash != tw->tw_hash); + twrefcnt = inet_twsk_unhash(tw); + } + spin_unlock(lock); } - __sk_add_node(sk, list); - sock_prot_inuse_add(sk->sk_prot, 1); - write_unlock(lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return twrefcnt; } EXPORT_SYMBOL(__inet6_hash); @@ -63,76 +107,119 @@ struct sock *__inet6_lookup_established(struct net *net, const int dif) { struct sock *sk; - const struct hlist_node *node; + const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); - struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); - rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); - - prefetch(head->chain.first); - read_lock(lock); - sk_for_each(sk, node, &head->chain) { - /* For IPV6 do the cheaper port and family tests first. */ - if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) - goto hit; /* You sunk my battleship! */ - } - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &head->twchain) { - if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) - goto hit; - } - read_unlock(lock); - return NULL; + unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + unsigned int slot = hash & hashinfo->ehash_mask; + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + -hit: - sock_hold(sk); - read_unlock(lock); + rcu_read_lock(); +begin: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + if (sk->sk_hash != hash) + continue; + if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif)) + continue; + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) + goto out; + + if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) { + sock_gen_put(sk); + goto begin; + } + goto found; + } + if (get_nulls_value(node) != slot) + goto begin; +out: + sk = NULL; +found: + rcu_read_unlock(); return sk; } EXPORT_SYMBOL(__inet6_lookup_established); +static inline int compute_score(struct sock *sk, struct net *net, + const unsigned short hnum, + const struct in6_addr *daddr, + const int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && + sk->sk_family == PF_INET6) { + + score = 1; + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, + struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { struct sock *sk; - const struct hlist_node *node; - struct sock *result = NULL; - int score, hiscore = 0; - - read_lock(&hashinfo->lhash_lock); - sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { - if (sk->sk_net == net && inet_sk(sk)->num == hnum && - sk->sk_family == PF_INET6) { - const struct ipv6_pinfo *np = inet6_sk(sk); - - score = 1; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) - continue; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score++; - } - if (score == 3) { - result = sk; - break; + const struct hlist_nulls_node *node; + struct sock *result; + int score, hiscore, matches = 0, reuseport = 0; + u32 phash = 0; + unsigned int hash = inet_lhashfn(net, hnum); + struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; + + rcu_read_lock(); +begin: + result = NULL; + hiscore = 0; + sk_nulls_for_each(sk, node, &ilb->head) { + score = compute_score(sk, net, hnum, daddr, dif); + if (score > hiscore) { + hiscore = score; + result = sk; + reuseport = sk->sk_reuseport; + if (reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; } - if (score > hiscore) { - hiscore = score; + } else if (score == hiscore && reuseport) { + matches++; + if (((u64)phash * matches) >> 32 == 0) result = sk; - } + phash = next_pseudo_random32(phash); } } - if (result) - sock_hold(result); - read_unlock(&hashinfo->lhash_lock); + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) + goto begin; + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, daddr, + dif) < hiscore)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); return result; } @@ -160,77 +247,75 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; + const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; - const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); - const unsigned int hash = inet6_ehashfn(daddr, lport, saddr, - inet->dport); + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); + struct net *net = sock_net(sk); + const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, + inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - rwlock_t *lock = inet_ehash_lockp(hinfo, hash); + spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - struct net *net = sk->sk_net; - - prefetch(head->chain.first); - write_lock(lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &head->twchain) { - tw = inet_twsk(sk2); - - if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else - goto not_unique; - } - } - tw = NULL; + const struct hlist_nulls_node *node; + struct inet_timewait_sock *tw = NULL; + int twrefcnt = 0; - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) + spin_lock(lock); + + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash) + continue; + + if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) { + if (sk2->sk_state == TCP_TIME_WAIT) { + tw = inet_twsk(sk2); + if (twsk_unique(sk, sk2, twp)) + break; + } goto not_unique; + } } -unique: /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); + * in hash table socket with a funny identity. + */ + inet->inet_num = lport; + inet->inet_sport = htons(lport); sk->sk_hash = hash; - sock_prot_inuse_add(sk->sk_prot, 1); - write_unlock(lock); + WARN_ON(!sk_unhashed(sk)); + __sk_nulls_add_node_rcu(sk, &head->chain); + if (tw) { + twrefcnt = inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } + spin_unlock(lock); + if (twrefcnt) + inet_twsk_put(tw); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - if (twp != NULL) { + if (twp) { *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw != NULL) { + } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } return 0; not_unique: - write_unlock(lock); + spin_unlock(lock); return -EADDRNOTAVAIL; } static inline u32 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); + + return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_dport); } int inet6_hash_connect(struct inet_timewait_death_row *death_row, diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c new file mode 100644 index 00000000000..9a4d7322fb2 --- /dev/null +++ b/net/ipv6/ip6_checksum.c @@ -0,0 +1,124 @@ +#include <net/ip.h> +#include <net/udp.h> +#include <net/udplite.h> +#include <asm/checksum.h> + +#ifndef _HAVE_ARCH_IPV6_CSUM +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, + __wsum csum) +{ + + int carry; + __u32 ulen; + __u32 uproto; + __u32 sum = (__force u32)csum; + + sum += (__force u32)saddr->s6_addr32[0]; + carry = (sum < (__force u32)saddr->s6_addr32[0]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[1]; + carry = (sum < (__force u32)saddr->s6_addr32[1]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[2]; + carry = (sum < (__force u32)saddr->s6_addr32[2]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[3]; + carry = (sum < (__force u32)saddr->s6_addr32[3]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[0]; + carry = (sum < (__force u32)daddr->s6_addr32[0]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[1]; + carry = (sum < (__force u32)daddr->s6_addr32[1]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[2]; + carry = (sum < (__force u32)daddr->s6_addr32[2]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[3]; + carry = (sum < (__force u32)daddr->s6_addr32[3]); + sum += carry; + + ulen = (__force u32)htonl((__u32) len); + sum += ulen; + carry = (sum < ulen); + sum += carry; + + uproto = (__force u32)htonl(proto); + sum += uproto; + carry = (sum < uproto); + sum += carry; + + return csum_fold((__force __wsum)sum); +} +EXPORT_SYMBOL(csum_ipv6_magic); +#endif + +int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) +{ + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) + * we accept a checksum of zero here. When we find the socket + * for the UDP packet we'll check if that socket allows zero checksum + * for IPv6 (set by socket option). + */ + return skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); +} +EXPORT_SYMBOL(udp6_csum_init); + +/* Function to set UDP checksum for an IPv6 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp6_set_csum(bool nocheck, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v6_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp6_set_csum); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bab72b6f144..cb4459bd1d2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,21 +5,20 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.25 2001/10/31 21:55:55 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + * Ville Nuorvala: Fixed routing subtrees. */ -/* - * Changes: - * Yuji SEKIYA @USAGI: Support default route on router node; - * remove ip6_null_entry from the top of - * routing table. - * Ville Nuorvala: Fixed routing subtrees. - */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> @@ -28,10 +27,7 @@ #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> - -#ifdef CONFIG_PROC_FS -#include <linux/proc_fs.h> -#endif +#include <linux/slab.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -43,17 +39,14 @@ #define RT6_DEBUG 2 #if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#define RT6_TRACE(x...) pr_debug(x) #else #define RT6_TRACE(x...) do { ; } while (0) #endif -struct rt6_statistics rt6_stats; +static struct kmem_cache *fib6_node_kmem __read_mostly; -static struct kmem_cache * fib6_node_kmem __read_mostly; - -enum fib_walk_state_t -{ +enum fib_walk_state_t { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif @@ -63,9 +56,9 @@ enum fib_walk_state_t FWS_U }; -struct fib6_cleaner_t -{ +struct fib6_cleaner_t { struct fib6_walker_t w; + struct net *net; int (*func)(struct rt6_info *, void *arg); void *arg; }; @@ -78,9 +71,9 @@ static DEFINE_RWLOCK(fib6_walker_lock); #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt); -static struct rt6_info * fib6_find_prefix(struct fib6_node *fn); -static struct fib6_node * fib6_repair_tree(struct fib6_node *fn); +static void fib6_prune_clones(struct net *net, struct fib6_node *fn); +static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct fib6_walker_t *w); static int fib6_walk_continue(struct fib6_walker_t *w); @@ -93,31 +86,22 @@ static int fib6_walk_continue(struct fib6_walker_t *w); static __u32 rt_sernum; -static DEFINE_TIMER(ip6_fib_timer, fib6_run_gc, 0, 0); +static void fib6_gc_timer_cb(unsigned long arg); -static struct fib6_walker_t fib6_walker_list = { - .prev = &fib6_walker_list, - .next = &fib6_walker_list, -}; - -#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next) +static LIST_HEAD(fib6_walkers); +#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) static inline void fib6_walker_link(struct fib6_walker_t *w) { write_lock_bh(&fib6_walker_lock); - w->next = fib6_walker_list.next; - w->prev = &fib6_walker_list; - w->next->prev = w; - w->prev->next = w; + list_add(&w->lh, &fib6_walkers); write_unlock_bh(&fib6_walker_lock); } static inline void fib6_walker_unlink(struct fib6_walker_t *w) { write_lock_bh(&fib6_walker_lock); - w->next->prev = w->prev; - w->prev->next = w->next; - w->prev = w->next = w; + list_del(&w->lh); write_unlock_bh(&fib6_walker_lock); } static __inline__ u32 fib6_new_sernum(void) @@ -138,15 +122,27 @@ static __inline__ u32 fib6_new_sernum(void) /* * test bit */ +#if defined(__LITTLE_ENDIAN) +# define BITOP_BE32_SWIZZLE (0x1F & ~7) +#else +# define BITOP_BE32_SWIZZLE 0 +#endif -static __inline__ __be32 addr_bit_set(void *token, int fn_bit) +static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) { - __be32 *addr = token; - - return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; + const __be32 *addr = token; + /* + * Here, + * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) + * is optimized version of + * htonl(1 << ((~fn_bit)&0x1F)) + * See include/asm-generic/bitops/le.h. + */ + return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & + addr[fn_bit >> 5]; } -static __inline__ struct fib6_node * node_alloc(void) +static __inline__ struct fib6_node *node_alloc(void) { struct fib6_node *fn; @@ -155,7 +151,7 @@ static __inline__ struct fib6_node * node_alloc(void) return fn; } -static __inline__ void node_free(struct fib6_node * fn) +static __inline__ void node_free(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } @@ -163,25 +159,10 @@ static __inline__ void node_free(struct fib6_node * fn) static __inline__ void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) - dst_free(&rt->u.dst); + dst_free(&rt->dst); } -static struct fib6_table fib6_main_tbl = { - .tb6_id = RT6_TABLE_MAIN, - .tb6_root = { - .leaf = &ip6_null_entry, - .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, - }, -}; - -#ifdef CONFIG_IPV6_MULTIPLE_TABLES -#define FIB_TABLE_HASHSZ 256 -#else -#define FIB_TABLE_HASHSZ 1 -#endif -static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; - -static void fib6_link_table(struct fib6_table *tb) +static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; @@ -191,66 +172,61 @@ static void fib6_link_table(struct fib6_table *tb) */ rwlock_init(&tb->tb6_lock); - h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1); + h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ - hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]); + hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES -static struct fib6_table fib6_local_tbl = { - .tb6_id = RT6_TABLE_LOCAL, - .tb6_root = { - .leaf = &ip6_null_entry, - .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, - }, -}; -static struct fib6_table *fib6_alloc_table(u32 id) +static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); - if (table != NULL) { + if (table) { table->tb6_id = id; - table->tb6_root.leaf = &ip6_null_entry; + table->tb6_root.leaf = net->ipv6.ip6_null_entry; table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + inet_peer_base_init(&table->tb6_peers); } return table; } -struct fib6_table *fib6_new_table(u32 id) +struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; - tb = fib6_get_table(id); + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(id); - if (tb != NULL) - fib6_link_table(tb); + tb = fib6_alloc_table(net, id); + if (tb) + fib6_link_table(net, tb); return tb; } -struct fib6_table *fib6_get_table(u32 id) +struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; - struct hlist_node *node; + struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; - h = id & (FIB_TABLE_HASHSZ - 1); + h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); - hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb6_hlist) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; @@ -261,33 +237,32 @@ struct fib6_table *fib6_get_table(u32 id) return NULL; } -static void __init fib6_tables_init(void) +static void __net_init fib6_tables_init(struct net *net) { - fib6_link_table(&fib6_main_tbl); - fib6_link_table(&fib6_local_tbl); + fib6_link_table(net, net->ipv6.fib6_main_tbl); + fib6_link_table(net, net->ipv6.fib6_local_tbl); } - #else -struct fib6_table *fib6_new_table(u32 id) +struct fib6_table *fib6_new_table(struct net *net, u32 id) { - return fib6_get_table(id); + return fib6_get_table(net, id); } -struct fib6_table *fib6_get_table(u32 id) +struct fib6_table *fib6_get_table(struct net *net, u32 id) { - return &fib6_main_tbl; + return net->ipv6.fib6_main_tbl; } -struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags, - pol_lookup_t lookup) +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + int flags, pol_lookup_t lookup) { - return (struct dst_entry *) lookup(&fib6_main_tbl, fl, flags); + return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); } -static void __init fib6_tables_init(void) +static void __net_init fib6_tables_init(struct net *net) { - fib6_link_table(&fib6_main_tbl); + fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif @@ -297,14 +272,14 @@ static int fib6_dump_node(struct fib6_walker_t *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ w->leaf = rt; return 1; } - BUG_TRAP(res!=0); + WARN_ON(res == 0); } w->leaf = NULL; return 0; @@ -312,13 +287,17 @@ static int fib6_dump_node(struct fib6_walker_t *w) static void fib6_dump_end(struct netlink_callback *cb) { - struct fib6_walker_t *w = (void*)cb->args[2]; + struct fib6_walker_t *w = (void *)cb->args[2]; if (w) { + if (cb->args[4]) { + cb->args[4] = 0; + fib6_walker_unlink(w); + } cb->args[2] = 0; kfree(w); } - cb->done = (void*)cb->args[3]; + cb->done = (void *)cb->args[3]; cb->args[1] = 3; } @@ -338,46 +317,54 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->root = &table->tb6_root; if (cb->args[4] == 0) { + w->count = 0; + w->skip = 0; + read_lock_bh(&table->tb6_lock); res = fib6_walk(w); read_unlock_bh(&table->tb6_lock); - if (res > 0) + if (res > 0) { cb->args[4] = 1; + cb->args[5] = w->root->fn_sernum; + } } else { + if (cb->args[5] != w->root->fn_sernum) { + /* Begin at the root if the tree changed */ + cb->args[5] = w->root->fn_sernum; + w->state = FWS_INIT; + w->node = w->root; + w->skip = w->count; + } else + w->skip = 0; + read_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); read_unlock_bh(&table->tb6_lock); - if (res != 0) { - if (res < 0) - fib6_walker_unlink(w); - goto end; + if (res <= 0) { + fib6_walker_unlink(w); + cb->args[4] = 0; } - fib6_walker_unlink(w); - cb->args[4] = 0; } -end: + return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb->sk->sk_net; + struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct rt6_rtnl_dump_arg arg; struct fib6_walker_t *w; struct fib6_table *tb; - struct hlist_node *node; + struct hlist_head *head; int res = 0; - if (net != &init_net) - return 0; - s_h = cb->args[0]; s_e = cb->args[1]; w = (void *)cb->args[2]; - if (w == NULL) { + if (!w) { /* New dump: * * 1. hook callback destructor. @@ -389,7 +376,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (w == NULL) + if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; @@ -397,11 +384,14 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) arg.skb = skb; arg.cb = cb; + arg.net = net; w->args = &arg; - for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { + rcu_read_lock(); + for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; - hlist_for_each_entry(tb, node, &fib_table_hash[h], tb6_hlist) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); @@ -412,6 +402,7 @@ next: } } out: + rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; @@ -429,9 +420,10 @@ out: * node. */ -static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, - int addrlen, int plen, - int offset) +static struct fib6_node *fib6_add_1(struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -453,8 +445,16 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, * Prefix match */ if (plen < fn->fn_bit || - !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { + if (!allow_create) { + if (replace_required) { + pr_warn("Can't replace route, no match found\n"); + return ERR_PTR(-ENOENT); + } + pr_warn("NLM_F_CREATE should be set when creating new route\n"); + } goto insert_above; + } /* * Exact match ? @@ -462,7 +462,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, if (plen == fn->fn_bit) { /* clean up an intermediate node */ - if ((fn->fn_flags & RTN_RTINFO) == 0) { + if (!(fn->fn_flags & RTN_RTINFO)) { rt6_release(fn->leaf); fn->leaf = NULL; } @@ -480,9 +480,25 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right: fn->left; + fn = dir ? fn->right : fn->left; } while (fn); + if (!allow_create) { + /* We should not create new node because + * NLM_F_REPLACE was specified without NLM_F_CREATE + * I assume it is safe to require NLM_F_CREATE when + * REPLACE flag is used! Later we may want to remove the + * check for replace_required, because according + * to netlink specification, NLM_F_CREATE + * MUST be specified if new route is created. + * That would keep IPv6 consistent with IPv4 + */ + if (replace_required) { + pr_warn("Can't replace route, no match found\n"); + return ERR_PTR(-ENOENT); + } + pr_warn("NLM_F_CREATE should be set when creating new route\n"); + } /* * We walked to the bottom of tree. * Create new leaf node without children. @@ -490,8 +506,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, ln = node_alloc(); - if (ln == NULL) - return NULL; + if (!ln) + return ERR_PTR(-ENOMEM); ln->fn_bit = plen; ln->parent = pn; @@ -522,7 +538,7 @@ insert_above: but if it is >= plen, the value is ignored in any case. */ - bit = __ipv6_addr_diff(addr, &key->addr, addrlen); + bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] @@ -533,12 +549,12 @@ insert_above: in = node_alloc(); ln = node_alloc(); - if (in == NULL || ln == NULL) { + if (!in || !ln) { if (in) node_free(in); if (ln) node_free(ln); - return NULL; + return ERR_PTR(-ENOMEM); } /* @@ -587,8 +603,8 @@ insert_above: ln = node_alloc(); - if (ln == NULL) - return NULL; + if (!ln) + return ERR_PTR(-ENOMEM); ln->fn_bit = plen; @@ -611,19 +627,61 @@ insert_above: return ln; } +static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + +static int fib6_commit_metrics(struct dst_entry *dst, + struct nlattr *mx, int mx_len) +{ + struct nlattr *nla; + int remaining; + u32 *mp; + + if (dst->flags & DST_HOST) { + mp = dst_metrics_write_ptr(dst); + } else { + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!mp) + return -ENOMEM; + dst_init_metrics(dst, mp, 0); + } + + nla_for_each_attr(nla, mx, mx_len, remaining) { + int type = nla_type(nla); + + if (type) { + if (type > RTAX_MAX) + return -EINVAL; + + mp[type - 1] = nla_get_u32(nla); + } + } + return 0; +} + /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info) + struct nl_info *info, struct nlattr *mx, int mx_len) { struct rt6_info *iter = NULL; struct rt6_info **ins; + int replace = (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_REPLACE)); + int add = (!info->nlh || + (info->nlh->nlmsg_flags & NLM_F_CREATE)); + int found = 0; + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + int err; ins = &fn->leaf; - for (iter = fn->leaf; iter; iter=iter->u.dst.rt6_next) { + for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { /* * Search for duplicates */ @@ -632,64 +690,149 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, /* * Same priority level */ + if (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_EXCL)) + return -EEXIST; + if (replace) { + found++; + break; + } - if (iter->rt6i_dev == rt->rt6i_dev && + if (iter->dst.dev == rt->dst.dev && iter->rt6i_idev == rt->rt6i_idev && ipv6_addr_equal(&iter->rt6i_gateway, &rt->rt6i_gateway)) { - if (!(iter->rt6i_flags&RTF_EXPIRES)) + if (rt->rt6i_nsiblings) + rt->rt6i_nsiblings = 0; + if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; - iter->rt6i_expires = rt->rt6i_expires; - if (!(rt->rt6i_flags&RTF_EXPIRES)) { - iter->rt6i_flags &= ~RTF_EXPIRES; - iter->rt6i_expires = 0; - } + if (!(rt->rt6i_flags & RTF_EXPIRES)) + rt6_clean_expires(iter); + else + rt6_set_expires(iter, rt->dst.expires); return -EEXIST; } + /* If we have the same destination and the same metric, + * but not the same gateway, then the route we try to + * add is sibling to this route, increment our counter + * of siblings, and later we will add our route to the + * list. + * Only static routes (which don't have flag + * RTF_EXPIRES) are used for ECMPv6. + * + * To avoid long list, we only had siblings if the + * route have a gateway. + */ + if (rt_can_ecmp && + rt6_qualify_for_ecmp(iter)) + rt->rt6i_nsiblings++; } if (iter->rt6i_metric > rt->rt6i_metric) break; - ins = &iter->u.dst.rt6_next; + ins = &iter->dst.rt6_next; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; + /* Link this route to others same route. */ + if (rt->rt6i_nsiblings) { + unsigned int rt6i_nsiblings; + struct rt6_info *sibling, *temp_sibling; + + /* Find the first route that have the same metric */ + sibling = fn->leaf; + while (sibling) { + if (sibling->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(sibling)) { + list_add_tail(&rt->rt6i_siblings, + &sibling->rt6i_siblings); + break; + } + sibling = sibling->dst.rt6_next; + } + /* For each sibling in the list, increment the counter of + * siblings. BUG() if counters does not match, list of siblings + * is broken! + */ + rt6i_nsiblings = 0; + list_for_each_entry_safe(sibling, temp_sibling, + &rt->rt6i_siblings, rt6i_siblings) { + sibling->rt6i_nsiblings++; + BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); + rt6i_nsiblings++; + } + BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + } + /* * insert node */ + if (!replace) { + if (!add) + pr_warn("NLM_F_CREATE should be set when creating new route\n"); + +add: + if (mx) { + err = fib6_commit_metrics(&rt->dst, mx, mx_len); + if (err) + return err; + } + rt->dst.rt6_next = iter; + *ins = rt; + rt->rt6i_node = fn; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + info->nl_net->ipv6.rt6_stats->fib_rt_entries++; + + if (!(fn->fn_flags & RTN_RTINFO)) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } - rt->u.dst.rt6_next = iter; - *ins = rt; - rt->rt6i_node = fn; - atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); - rt6_stats.fib_rt_entries++; - - if ((fn->fn_flags & RTN_RTINFO) == 0) { - rt6_stats.fib_route_nodes++; - fn->fn_flags |= RTN_RTINFO; + } else { + if (!found) { + if (add) + goto add; + pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); + return -ENOENT; + } + if (mx) { + err = fib6_commit_metrics(&rt->dst, mx, mx_len); + if (err) + return err; + } + *ins = rt; + rt->rt6i_node = fn; + rt->dst.rt6_next = iter->dst.rt6_next; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + rt6_release(iter); + if (!(fn->fn_flags & RTN_RTINFO)) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } } return 0; } -static __inline__ void fib6_start_gc(struct rt6_info *rt) +static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) { - if (ip6_fib_timer.expires == 0 && - (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) - mod_timer(&ip6_fib_timer, jiffies + - init_net.ipv6.sysctl.ip6_rt_gc_interval); + if (!timer_pending(&net->ipv6.ip6_fib_timer) && + (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) + mod_timer(&net->ipv6.ip6_fib_timer, + jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -void fib6_force_start_gc(void) +void fib6_force_start_gc(struct net *net) { - if (ip6_fib_timer.expires == 0) - mod_timer(&ip6_fib_timer, jiffies + - init_net.ipv6.sysctl.ip6_rt_gc_interval); + if (!timer_pending(&net->ipv6.ip6_fib_timer)) + mod_timer(&net->ipv6.ip6_fib_timer, + jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } /* @@ -698,16 +841,31 @@ void fib6_force_start_gc(void) * with source addr info in sub-trees */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, + struct nlattr *mx, int mx_len) { struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; - - fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); - - if (fn == NULL) + int allow_create = 1; + int replace_required = 0; + + if (info->nlh) { + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) + allow_create = 0; + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) + replace_required = 1; + } + if (!allow_create && !replace_required) + pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); + + fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + offsetof(struct rt6_info, rt6i_dst), allow_create, + replace_required); + if (IS_ERR(fn)) { + err = PTR_ERR(fn); + fn = NULL; goto out; + } pn = fn; @@ -715,7 +873,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (fn->subtree == NULL) { + if (!fn->subtree) { struct fib6_node *sfn; /* @@ -730,26 +888,28 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) /* Create subtree root node */ sfn = node_alloc(); - if (sfn == NULL) + if (!sfn) goto st_failure; - sfn->leaf = &ip6_null_entry; - atomic_inc(&ip6_null_entry.rt6i_ref); + sfn->leaf = info->nl_net->ipv6.ip6_null_entry; + atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); sfn->fn_flags = RTN_ROOT; sfn->fn_sernum = fib6_new_sernum(); /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src)); + rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); - if (sn == NULL) { + if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in st_failure) stale node in main tree. */ node_free(sfn); + err = PTR_ERR(sn); goto st_failure; } @@ -758,14 +918,17 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) fn->subtree = sfn; } else { sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src)); + rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); - if (sn == NULL) + if (IS_ERR(sn)) { + err = PTR_ERR(sn); goto st_failure; + } } - if (fn->leaf == NULL) { + if (!fn->leaf) { fn->leaf = rt; atomic_inc(&rt->rt6i_ref); } @@ -773,12 +936,11 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) } #endif - err = fib6_add_rt2node(fn, rt, info); - - if (err == 0) { - fib6_start_gc(rt); - if (!(rt->rt6i_flags&RTF_CACHE)) - fib6_prune_clones(pn, rt); + err = fib6_add_rt2node(fn, rt, info, mx, mx_len); + if (!err) { + fib6_start_gc(info->nl_net, rt); + if (!(rt->rt6i_flags & RTF_CACHE)) + fib6_prune_clones(info->nl_net, pn); } out: @@ -788,18 +950,22 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ + if (pn != fn && pn->leaf == rt) { + pn->leaf = NULL; + atomic_dec(&rt->rt6i_ref); + } if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(pn); + pn->leaf = fib6_find_prefix(info->nl_net, pn); #if RT6_DEBUG >= 2 if (!pn->leaf) { - BUG_TRAP(pn->leaf != NULL); - pn->leaf = &ip6_null_entry; + WARN_ON(pn->leaf == NULL); + pn->leaf = info->nl_net->ipv6.ip6_null_entry; } #endif atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->u.dst); + dst_free(&rt->dst); } return err; @@ -809,8 +975,8 @@ out: */ st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(fn); - dst_free(&rt->u.dst); + fib6_repair_tree(info->nl_net, fn); + dst_free(&rt->dst); return err; #endif } @@ -821,12 +987,12 @@ st_failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ - struct in6_addr *addr; /* search key */ + int offset; /* key offset on rt6_info */ + const struct in6_addr *addr; /* search key */ }; -static struct fib6_node * fib6_lookup_1(struct fib6_node *root, - struct lookup_args *args) +static struct fib6_node *fib6_lookup_1(struct fib6_node *root, + struct lookup_args *args) { struct fib6_node *fn; __be32 dir; @@ -851,11 +1017,10 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, fn = next; continue; } - break; } - while(fn) { + while (fn) { if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { struct rt6key *key; @@ -864,14 +1029,22 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) - fn = fib6_lookup_1(fn->subtree, args + 1); + if (fn->subtree) { + struct fib6_node *sfn; + sfn = fib6_lookup_1(fn->subtree, + args + 1); + if (!sfn) + goto backtrack; + fn = sfn; + } #endif - if (!fn || fn->fn_flags & RTN_RTINFO) + if (fn->fn_flags & RTN_RTINFO) return fn; } } - +#ifdef CONFIG_IPV6_SUBTREES +backtrack: +#endif if (fn->fn_flags & RTN_ROOT) break; @@ -881,8 +1054,8 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, return NULL; } -struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, - struct in6_addr *saddr) +struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { @@ -902,8 +1075,7 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, }; fn = fib6_lookup_1(root, daddr ? args : args + 1); - - if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) + if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; @@ -915,9 +1087,9 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, */ -static struct fib6_node * fib6_locate_1(struct fib6_node *root, - struct in6_addr *addr, - int plen, int offset) +static struct fib6_node *fib6_locate_1(struct fib6_node *root, + const struct in6_addr *addr, + int plen, int offset) { struct fib6_node *fn; @@ -945,9 +1117,9 @@ static struct fib6_node * fib6_locate_1(struct fib6_node *root, return NULL; } -struct fib6_node * fib6_locate(struct fib6_node *root, - struct in6_addr *daddr, int dst_len, - struct in6_addr *saddr, int src_len) +struct fib6_node *fib6_locate(struct fib6_node *root, + const struct in6_addr *daddr, int dst_len, + const struct in6_addr *saddr, int src_len) { struct fib6_node *fn; @@ -956,14 +1128,14 @@ struct fib6_node * fib6_locate(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (src_len) { - BUG_TRAP(saddr!=NULL); + WARN_ON(saddr == NULL); if (fn && fn->subtree) fn = fib6_locate_1(fn->subtree, saddr, src_len, offsetof(struct rt6_info, rt6i_src)); } #endif - if (fn && fn->fn_flags&RTN_RTINFO) + if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; @@ -975,16 +1147,15 @@ struct fib6_node * fib6_locate(struct fib6_node *root, * */ -static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) +static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) { - if (fn->fn_flags&RTN_ROOT) - return &ip6_null_entry; + if (fn->fn_flags & RTN_ROOT) + return net->ipv6.ip6_null_entry; - while(fn) { - if(fn->left) + while (fn) { + if (fn->left) return fn->left->leaf; - - if(fn->right) + if (fn->right) return fn->right->leaf; fn = FIB6_SUBTREE(fn); @@ -997,7 +1168,8 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) * is the node we want to try and remove. */ -static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_node *fn) { int children; int nstate; @@ -1009,26 +1181,28 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; - BUG_TRAP(!(fn->fn_flags&RTN_RTINFO)); - BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT)); - BUG_TRAP(fn->leaf==NULL); + WARN_ON(fn->fn_flags & RTN_RTINFO); + WARN_ON(fn->fn_flags & RTN_TL_ROOT); + WARN_ON(fn->leaf != NULL); children = 0; child = NULL; - if (fn->right) child = fn->right, children |= 1; - if (fn->left) child = fn->left, children |= 2; + if (fn->right) + child = fn->right, children |= 1; + if (fn->left) + child = fn->left, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ - || (children && fn->fn_flags&RTN_ROOT) + || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(fn); + fn->leaf = fib6_find_prefix(net, fn); #if RT6_DEBUG >= 2 - if (fn->leaf==NULL) { - BUG_TRAP(fn->leaf); - fn->leaf = &ip6_null_entry; + if (!fn->leaf) { + WARN_ON(!fn->leaf); + fn->leaf = net->ipv6.ip6_null_entry; } #endif atomic_inc(&fn->leaf->rt6i_ref); @@ -1038,16 +1212,19 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { - BUG_TRAP(fn->fn_flags&RTN_ROOT); + WARN_ON(!(fn->fn_flags & RTN_ROOT)); FIB6_SUBTREE(pn) = NULL; nstate = FWS_L; } else { - BUG_TRAP(!(fn->fn_flags&RTN_ROOT)); + WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) pn->right = child; - else if (pn->left == fn) pn->left = child; + if (pn->right == fn) + pn->right = child; + else if (pn->left == fn) + pn->left = child; #if RT6_DEBUG >= 2 - else BUG_TRAP(0); + else + WARN_ON(1); #endif if (child) child->parent = pn; @@ -1058,7 +1235,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) read_lock(&fib6_walker_lock); FOR_WALKERS(w) { - if (child == NULL) { + if (!child) { if (w->root == fn) { w->root = w->node = NULL; RT6_TRACE("W %p adjusted by delroot 1\n", w); @@ -1076,10 +1253,10 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } @@ -1087,7 +1264,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) read_unlock(&fib6_walker_lock); node_free(fn); - if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn)) + if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; rt6_release(pn->leaf); @@ -1101,38 +1278,50 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, { struct fib6_walker_t *w; struct rt6_info *rt = *rtp; + struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); /* Unlink it */ - *rtp = rt->u.dst.rt6_next; + *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; - rt6_stats.fib_rt_entries--; - rt6_stats.fib_discarded_routes++; + net->ipv6.rt6_stats->fib_rt_entries--; + net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (fn->rr_ptr == rt) fn->rr_ptr = NULL; + /* Remove this entry from other siblings */ + if (rt->rt6i_nsiblings) { + struct rt6_info *sibling, *next_sibling; + + list_for_each_entry_safe(sibling, next_sibling, + &rt->rt6i_siblings, rt6i_siblings) + sibling->rt6i_nsiblings--; + rt->rt6i_nsiblings = 0; + list_del_init(&rt->rt6i_siblings); + } + /* Adjust walkers */ read_lock(&fib6_walker_lock); FOR_WALKERS(w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->u.dst.rt6_next; - if (w->leaf == NULL) + w->leaf = rt->dst.rt6_next; + if (!w->leaf) w->state = FWS_U; } } read_unlock(&fib6_walker_lock); - rt->u.dst.rt6_next = NULL; + rt->dst.rt6_next = NULL; /* If it was last route, expunge its radix tree node */ - if (fn->leaf == NULL) { + if (!fn->leaf) { fn->fn_flags &= ~RTN_RTINFO; - rt6_stats.fib_route_nodes--; - fn = fib6_repair_tree(fn); + net->ipv6.rt6_stats->fib_route_nodes--; + fn = fib6_repair_tree(net, fn); } if (atomic_read(&rt->rt6i_ref) != 1) { @@ -1143,8 +1332,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, * to still alive ones. */ while (fn) { - if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(fn); + if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { + fn->leaf = fib6_find_prefix(net, fn); atomic_inc(&fn->leaf->rt6i_ref); rt6_release(rt); } @@ -1160,38 +1349,39 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, int fib6_del(struct rt6_info *rt, struct nl_info *info) { + struct net *net = info->nl_net; struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; #if RT6_DEBUG >= 2 - if (rt->u.dst.obsolete>0) { - BUG_TRAP(fn==NULL); + if (rt->dst.obsolete > 0) { + WARN_ON(fn != NULL); return -ENOENT; } #endif - if (fn == NULL || rt == &ip6_null_entry) + if (!fn || rt == net->ipv6.ip6_null_entry) return -ENOENT; - BUG_TRAP(fn->fn_flags&RTN_RTINFO); + WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags&RTF_CACHE)) { + if (!(rt->rt6i_flags & RTF_CACHE)) { struct fib6_node *pn = fn; #ifdef CONFIG_IPV6_SUBTREES /* clones of this route might be in another subtree */ if (rt->rt6i_src.plen) { - while (!(pn->fn_flags&RTN_ROOT)) + while (!(pn->fn_flags & RTN_ROOT)) pn = pn->parent; pn = pn->parent; } #endif - fib6_prune_clones(pn, rt); + fib6_prune_clones(info->nl_net, pn); } /* * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.dst.rt6_next) { + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { if (*rtp == rt) { fib6_del_route(fn, rtp, info); return 0; @@ -1230,11 +1420,11 @@ static int fib6_walk_continue(struct fib6_walker_t *w) for (;;) { fn = w->node; - if (fn == NULL) + if (!fn) return 0; if (w->prune && fn != w->root && - fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { w->state = FWS_C; w->leaf = fn->leaf; } @@ -1263,12 +1453,22 @@ static int fib6_walk_continue(struct fib6_walker_t *w) w->state = FWS_C; w->leaf = fn->leaf; case FWS_C: - if (w->leaf && fn->fn_flags&RTN_RTINFO) { - int err = w->func(w); + if (w->leaf && fn->fn_flags & RTN_RTINFO) { + int err; + + if (w->skip) { + w->skip--; + goto skip; + } + + err = w->func(w); if (err) return err; + + w->count++; continue; } +skip: w->state = FWS_U; case FWS_U: if (fn == w->root) @@ -1277,7 +1477,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w) w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { - BUG_TRAP(fn->fn_flags&RTN_ROOT); + WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } @@ -1292,7 +1492,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w) continue; } #if RT6_DEBUG >= 2 - BUG_TRAP(0); + WARN_ON(1); #endif } } @@ -1314,27 +1514,28 @@ static int fib6_walk(struct fib6_walker_t *w) static int fib6_clean_node(struct fib6_walker_t *w) { - struct nl_info info = { - .nl_net = &init_net, - }; int res; struct rt6_info *rt; struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); + struct nl_info info = { + .nl_net = c->net, + }; - for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 - printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); + pr_debug("%s: del failed: rt=%p@%p err=%d\n", + __func__, rt, rt->rt6i_node, res); #endif continue; } return 0; } - BUG_TRAP(res==0); + WARN_ON(res != 0); } w->leaf = rt; return 0; @@ -1351,7 +1552,7 @@ static int fib6_clean_node(struct fib6_walker_t *w) * ignoring pure split nodes) will be scanned. */ -static void fib6_clean_tree(struct fib6_node *root, +static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), int prune, void *arg) { @@ -1360,25 +1561,29 @@ static void fib6_clean_tree(struct fib6_node *root, c.w.root = root; c.w.func = fib6_clean_node; c.w.prune = prune; + c.w.count = 0; + c.w.skip = 0; c.func = func; c.arg = arg; + c.net = net; fib6_walk(&c.w); } -void fib6_clean_all(int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) +void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), + void *arg) { struct fib6_table *table; - struct hlist_node *node; + struct hlist_head *head; unsigned int h; rcu_read_lock(); - for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry_rcu(table, node, &fib_table_hash[h], - tb6_hlist) { + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); - fib6_clean_tree(&table->tb6_root, func, prune, arg); + fib6_clean_tree(net, &table->tb6_root, + func, 0, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1395,9 +1600,9 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg) return 0; } -static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt) +static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { - fib6_clean_tree(fn, fib6_prune_clone, 1, rt); + fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); } /* @@ -1422,22 +1627,31 @@ static int fib6_age(struct rt6_info *rt, void *arg) * only if they are not in use now. */ - if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { - if (time_after(now, rt->rt6i_expires)) { + if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { + if (time_after(now, rt->dst.expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args.more++; } else if (rt->rt6i_flags & RTF_CACHE) { - if (atomic_read(&rt->u.dst.__refcnt) == 0 && - time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) { + if (atomic_read(&rt->dst.__refcnt) == 0 && + time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; - } else if ((rt->rt6i_flags & RTF_GATEWAY) && - (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + return -1; + } } gc_args.more++; } @@ -1447,54 +1661,138 @@ static int fib6_age(struct rt6_info *rt, void *arg) static DEFINE_SPINLOCK(fib6_gc_lock); -void fib6_run_gc(unsigned long dummy) +void fib6_run_gc(unsigned long expires, struct net *net, bool force) { - if (dummy != ~0UL) { + unsigned long now; + + if (force) { spin_lock_bh(&fib6_gc_lock); - gc_args.timeout = dummy ? (int)dummy : - init_net.ipv6.sysctl.ip6_rt_gc_interval; - } else { - local_bh_disable(); - if (!spin_trylock(&fib6_gc_lock)) { - mod_timer(&ip6_fib_timer, jiffies + HZ); - local_bh_enable(); - return; - } - gc_args.timeout = init_net.ipv6.sysctl.ip6_rt_gc_interval; + } else if (!spin_trylock_bh(&fib6_gc_lock)) { + mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); + return; } - gc_args.more = 0; + gc_args.timeout = expires ? (int)expires : + net->ipv6.sysctl.ip6_rt_gc_interval; - ndisc_dst_gc(&gc_args.more); - fib6_clean_all(fib6_age, 0, NULL); + gc_args.more = icmp6_dst_gc(); + + fib6_clean_all(net, fib6_age, NULL); + now = jiffies; + net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) - mod_timer(&ip6_fib_timer, jiffies + - init_net.ipv6.sysctl.ip6_rt_gc_interval); - else { - del_timer(&ip6_fib_timer); - ip6_fib_timer.expires = 0; - } + mod_timer(&net->ipv6.ip6_fib_timer, + round_jiffies(now + + net->ipv6.sysctl.ip6_rt_gc_interval)); + else + del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&fib6_gc_lock); } +static void fib6_gc_timer_cb(unsigned long arg) +{ + fib6_run_gc(0, (struct net *)arg, true); +} + +static int __net_init fib6_net_init(struct net *net) +{ + size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + + setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); + + net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); + if (!net->ipv6.rt6_stats) + goto out_timer; + + /* Avoid false sharing : Use at least a full cache line */ + size = max_t(size_t, size, L1_CACHE_BYTES); + + net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); + if (!net->ipv6.fib_table_hash) + goto out_rt6_stats; + + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), + GFP_KERNEL); + if (!net->ipv6.fib6_main_tbl) + goto out_fib_table_hash; + + net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; + net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + net->ipv6.fib6_main_tbl->tb6_root.fn_flags = + RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), + GFP_KERNEL); + if (!net->ipv6.fib6_local_tbl) + goto out_fib6_main_tbl; + net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; + net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + net->ipv6.fib6_local_tbl->tb6_root.fn_flags = + RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); +#endif + fib6_tables_init(net); + + return 0; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES +out_fib6_main_tbl: + kfree(net->ipv6.fib6_main_tbl); +#endif +out_fib_table_hash: + kfree(net->ipv6.fib_table_hash); +out_rt6_stats: + kfree(net->ipv6.rt6_stats); +out_timer: + return -ENOMEM; +} + +static void fib6_net_exit(struct net *net) +{ + rt6_ifdown(net, NULL); + del_timer_sync(&net->ipv6.ip6_fib_timer); + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); + kfree(net->ipv6.fib6_local_tbl); +#endif + inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); + kfree(net->ipv6.fib6_main_tbl); + kfree(net->ipv6.fib_table_hash); + kfree(net->ipv6.rt6_stats); +} + +static struct pernet_operations fib6_net_ops = { + .init = fib6_net_init, + .exit = fib6_net_exit, +}; + int __init fib6_init(void) { - int ret; + int ret = -ENOMEM; + fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN, NULL); if (!fib6_node_kmem) - return -ENOMEM; - - fib6_tables_init(); + goto out; - ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); + ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; + + ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, + NULL); + if (ret) + goto out_unregister_subsys; out: return ret; +out_unregister_subsys: + unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; @@ -1502,6 +1800,192 @@ out_kmem_cache_create: void fib6_gc_cleanup(void) { - del_timer(&ip6_fib_timer); + unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } + +#ifdef CONFIG_PROC_FS + +struct ipv6_route_iter { + struct seq_net_private p; + struct fib6_walker_t w; + loff_t skip; + struct fib6_table *tbl; + __u32 sernum; +}; + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct rt6_info *rt = v; + struct ipv6_route_iter *iter = seq->private; + + seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); +#else + seq_puts(seq, "00000000000000000000000000000000 00 "); +#endif + if (rt->rt6i_flags & RTF_GATEWAY) + seq_printf(seq, "%pi6", &rt->rt6i_gateway); + else + seq_puts(seq, "00000000000000000000000000000000"); + + seq_printf(seq, " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), + rt->dst.__use, rt->rt6i_flags, + rt->dst.dev ? rt->dst.dev->name : ""); + iter->w.leaf = NULL; + return 0; +} + +static int ipv6_route_yield(struct fib6_walker_t *w) +{ + struct ipv6_route_iter *iter = w->args; + + if (!iter->skip) + return 1; + + do { + iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->skip--; + if (!iter->skip && iter->w.leaf) + return 1; + } while (iter->w.leaf); + + return 0; +} + +static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) +{ + memset(&iter->w, 0, sizeof(iter->w)); + iter->w.func = ipv6_route_yield; + iter->w.root = &iter->tbl->tb6_root; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + iter->w.args = iter; + iter->sernum = iter->w.root->fn_sernum; + INIT_LIST_HEAD(&iter->w.lh); + fib6_walker_link(&iter->w); +} + +static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, + struct net *net) +{ + unsigned int h; + struct hlist_node *node; + + if (tbl) { + h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; + node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist)); + } else { + h = 0; + node = NULL; + } + + while (!node && h < FIB6_TABLE_HASHSZ) { + node = rcu_dereference_bh( + hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); + } + return hlist_entry_safe(node, struct fib6_table, tb6_hlist); +} + +static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) +{ + if (iter->sernum != iter->w.root->fn_sernum) { + iter->sernum = iter->w.root->fn_sernum; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + WARN_ON(iter->w.skip); + iter->w.skip = iter->w.count; + } +} + +static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int r; + struct rt6_info *n; + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + if (!v) + goto iter_table; + + n = ((struct rt6_info *)v)->dst.rt6_next; + if (n) { + ++*pos; + return n; + } + +iter_table: + ipv6_route_check_sernum(iter); + read_lock(&iter->tbl->tb6_lock); + r = fib6_walk_continue(&iter->w); + read_unlock(&iter->tbl->tb6_lock); + if (r > 0) { + if (v) + ++*pos; + return iter->w.leaf; + } else if (r < 0) { + fib6_walker_unlink(&iter->w); + return NULL; + } + fib6_walker_unlink(&iter->w); + + iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); + if (!iter->tbl) + return NULL; + + ipv6_route_seq_setup_walk(iter); + goto iter_table; +} + +static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU_BH) +{ + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + rcu_read_lock_bh(); + iter->tbl = ipv6_route_seq_next_table(NULL, net); + iter->skip = *pos; + + if (iter->tbl) { + ipv6_route_seq_setup_walk(iter); + return ipv6_route_seq_next(seq, NULL, pos); + } else { + return NULL; + } +} + +static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) +{ + struct fib6_walker_t *w = &iter->w; + return w->node && !(w->state == FWS_U && w->node == w->root); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) + __releases(RCU_BH) +{ + struct ipv6_route_iter *iter = seq->private; + + if (ipv6_route_iter_active(iter)) + fib6_walker_unlink(&iter->w); + + rcu_read_unlock_bh(); +} + +static const struct seq_operations ipv6_route_seq_ops = { + .start = ipv6_route_seq_start, + .next = ipv6_route_seq_next, + .stop = ipv6_route_seq_stop, + .show = ipv6_route_seq_show +}; + +int ipv6_route_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipv6_route_seq_ops, + sizeof(struct ipv6_route_iter)); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 2b7d9ee9883..4052694c6f2 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -15,22 +15,18 @@ #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> -#include <linux/if_arp.h> #include <linux/in6.h> -#include <linux/route.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> -#include <net/ndisc.h> -#include <net/protocol.h> -#include <net/ip6_route.h> -#include <net/addrconf.h> #include <net/rawv6.h> -#include <net/icmp.h> #include <net/transp_v6.h> #include <asm/uaccess.h> @@ -38,7 +34,7 @@ #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ -#define FL_MAX_LINGER 60 /* Maximal linger timeout */ +#define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ @@ -48,54 +44,71 @@ #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); -static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; +static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); /* FL hash table lock: it protects only of GC */ -static DEFINE_RWLOCK(ip6_fl_lock); +static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ -static DEFINE_RWLOCK(ip6_sk_fl_lock); +static DEFINE_SPINLOCK(ip6_sk_fl_lock); +#define for_each_fl_rcu(hash, fl) \ + for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ + fl != NULL; \ + fl = rcu_dereference_bh(fl->next)) +#define for_each_fl_continue_rcu(fl) \ + for (fl = rcu_dereference_bh(fl->next); \ + fl != NULL; \ + fl = rcu_dereference_bh(fl->next)) -static __inline__ struct ip6_flowlabel * __fl_lookup(__be32 label) +#define for_each_sk_fl_rcu(np, sfl) \ + for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \ + sfl != NULL; \ + sfl = rcu_dereference_bh(sfl->next)) + +static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { - if (fl->label == label) + for_each_fl_rcu(FL_HASH(label), fl) { + if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } return NULL; } -static struct ip6_flowlabel * fl_lookup(__be32 label) +static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - read_lock_bh(&ip6_fl_lock); - fl = __fl_lookup(label); - if (fl) - atomic_inc(&fl->users); - read_unlock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); + fl = __fl_lookup(net, label); + if (fl && !atomic_inc_not_zero(&fl->users)) + fl = NULL; + rcu_read_unlock_bh(); return fl; } static void fl_free(struct ip6_flowlabel *fl) { - if (fl) + if (fl) { + if (fl->share == IPV6_FL_S_PROCESS) + put_pid(fl->owner.pid); + release_net(fl->fl_net); kfree(fl->opt); - kfree(fl); + kfree_rcu(fl, rcu); + } } static void fl_release(struct ip6_flowlabel *fl) { - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { @@ -112,8 +125,7 @@ static void fl_release(struct ip6_flowlabel *fl) time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } - - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(unsigned long dummy) @@ -122,12 +134,15 @@ static void ip6_fl_gc(unsigned long dummy) unsigned long now = jiffies; unsigned long sched = 0; - write_lock(&ip6_fl_lock); + spin_lock(&ip6_fl_lock); for (i=0; i<=FL_HASH_MASK; i++) { - struct ip6_flowlabel *fl, **flp; + struct ip6_flowlabel *fl; + struct ip6_flowlabel __rcu **flp; + flp = &fl_ht[i]; - while ((fl=*flp) != NULL) { + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) @@ -148,24 +163,49 @@ static void ip6_fl_gc(unsigned long dummy) if (!sched && atomic_read(&fl_size)) sched = now + FL_MAX_LINGER; if (sched) { - ip6_fl_gc_timer.expires = sched; - add_timer(&ip6_fl_gc_timer); + mod_timer(&ip6_fl_gc_timer, sched); } - write_unlock(&ip6_fl_lock); + spin_unlock(&ip6_fl_lock); } -static struct ip6_flowlabel *fl_intern(struct ip6_flowlabel *fl, __be32 label) +static void __net_exit ip6_fl_purge(struct net *net) +{ + int i; + + spin_lock(&ip6_fl_lock); + for (i = 0; i <= FL_HASH_MASK; i++) { + struct ip6_flowlabel *fl; + struct ip6_flowlabel __rcu **flp; + + flp = &fl_ht[i]; + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { + if (net_eq(fl->fl_net, net) && + atomic_read(&fl->users) == 0) { + *flp = fl->next; + fl_free(fl); + atomic_dec(&fl_size); + continue; + } + flp = &fl->next; + } + } + spin_unlock(&ip6_fl_lock); +} + +static struct ip6_flowlabel *fl_intern(struct net *net, + struct ip6_flowlabel *fl, __be32 label) { struct ip6_flowlabel *lfl; fl->label = label & IPV6_FLOWLABEL_MASK; - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { - fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; + fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { - lfl = __fl_lookup(fl->label); + lfl = __fl_lookup(net, fl->label); if (lfl == NULL) break; } @@ -179,19 +219,19 @@ static struct ip6_flowlabel *fl_intern(struct ip6_flowlabel *fl, __be32 label) * done in ipv6_flowlabel_opt - sock is locked, so new entry * with the same label can only appear on another sock */ - lfl = __fl_lookup(fl->label); + lfl = __fl_lookup(net, fl->label); if (lfl != NULL) { atomic_inc(&lfl->users); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; - fl_ht[FL_HASH(fl->label)] = fl; + rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return NULL; } @@ -206,17 +246,17 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) label &= IPV6_FLOWLABEL_MASK; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label) { fl->lastuse = jiffies; atomic_inc(&fl->users); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return fl; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return NULL; } @@ -227,11 +267,21 @@ void fl6_free_socklist(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; - while ((sfl = np->ipv6_fl_list) != NULL) { + if (!rcu_access_pointer(np->ipv6_fl_list)) + return; + + spin_lock_bh(&ip6_sk_fl_lock); + while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; + spin_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); - kfree(sfl); + kfree_rcu(sfl, rcu); + + spin_lock_bh(&ip6_sk_fl_lock); } + spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ @@ -269,6 +319,7 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, opt_space->opt_flen = fopt->opt_flen; return opt_space; } +EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { @@ -287,6 +338,8 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo expires = check_linger(expires); if (!expires) return -EPERM; + + spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; @@ -294,26 +347,33 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; + spin_unlock_bh(&ip6_fl_lock); + return 0; } static struct ip6_flowlabel * -fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int *err_p) +fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, + char __user *optval, int optlen, int *err_p) { - struct ip6_flowlabel *fl; + struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; + olen = optlen - CMSG_ALIGN(sizeof(*freq)); + err = -EINVAL; + if (olen > 64 * 1024) + goto done; + err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); if (fl == NULL) goto done; - olen = optlen - CMSG_ALIGN(sizeof(*freq)); if (olen > 0) { struct msghdr msg; - struct flowi flowi; + struct flowi6 flowi6; int junk; err = -ENOMEM; @@ -329,9 +389,10 @@ fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int * msg.msg_controllen = olen; msg.msg_control = (void*)(fl->opt+1); - flowi.oif = 0; + memset(&flowi6, 0, sizeof(flowi6)); - err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk, &junk); + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, + &junk, &junk, &junk); if (err) goto done; err = -EINVAL; @@ -343,28 +404,29 @@ fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int * } } + fl->fl_net = hold_net(net); fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) goto done; fl->share = freq->flr_share; addr_type = ipv6_addr_type(&freq->flr_dst); - if ((addr_type&IPV6_ADDR_MAPPED) - || addr_type == IPV6_ADDR_ANY) { + if ((addr_type & IPV6_ADDR_MAPPED) || + addr_type == IPV6_ADDR_ANY) { err = -EINVAL; goto done; } - ipv6_addr_copy(&fl->dst, &freq->flr_dst); + fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: - fl->owner = current->pid; + fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: - fl->owner = current->euid; + fl->owner.uid = current_euid(); break; default: err = -EINVAL; @@ -388,63 +450,76 @@ static int mem_check(struct sock *sk) if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) count++; + rcu_read_unlock_bh(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || - (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) - && !capable(CAP_NET_ADMIN))) + (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + !capable(CAP_NET_ADMIN))) return -ENOBUFS; return 0; } -static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2) +static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) { - if (h1 == h2) - return 0; - if (h1 == NULL || h2 == NULL) - return 1; - if (h1->hdrlen != h2->hdrlen) - return 1; - return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - sizeof(*h1)); + spin_lock_bh(&ip6_sk_fl_lock); + sfl->fl = fl; + sfl->next = np->ipv6_fl_list; + rcu_assign_pointer(np->ipv6_fl_list, sfl); + spin_unlock_bh(&ip6_sk_fl_lock); } -static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) +int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, + int flags) { - if (o1 == o2) + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + + if (flags & IPV6_FL_F_REMOTE) { + freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; - if (o1 == NULL || o2 == NULL) - return 1; - if (o1->opt_nflen != o2->opt_nflen) - return 1; - if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt)) - return 1; - if (ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt)) - return 1; - if (ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt)) - return 1; - return 0; -} + } -static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, - struct ip6_flowlabel *fl) -{ - write_lock_bh(&ip6_sk_fl_lock); - sfl->fl = fl; - sfl->next = np->ipv6_fl_list; - np->ipv6_fl_list = sfl; - write_unlock_bh(&ip6_sk_fl_lock); + if (np->repflow) { + freq->flr_label = np->flow_label; + return 0; + } + + rcu_read_lock_bh(); + + for_each_sk_fl_rcu(np, sfl) { + if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { + spin_lock_bh(&ip6_fl_lock); + freq->flr_label = sfl->fl->label; + freq->flr_dst = sfl->fl->dst; + freq->flr_share = sfl->fl->share; + freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; + freq->flr_linger = sfl->fl->linger / HZ; + + spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock_bh(); + return 0; + } + } + rcu_read_unlock_bh(); + + return -ENOENT; } int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) { - int err; + int uninitialized_var(err); + struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_flowlabel_req freq; struct ipv6_fl_socklist *sfl1=NULL; - struct ipv6_fl_socklist *sfl, **sflp; + struct ipv6_fl_socklist *sfl; + struct ipv6_fl_socklist __rcu **sflp; struct ip6_flowlabel *fl, *fl1 = NULL; @@ -456,34 +531,46 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) switch (freq.flr_action) { case IPV6_FL_A_PUT: - write_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + if (freq.flr_flags & IPV6_FL_F_REFLECT) { + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + if (!np->repflow) + return -ESRCH; + np->flow_label = 0; + np->repflow = 0; + return 0; + } + spin_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; + (sfl = rcu_dereference(*sflp))!=NULL; + sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = sfl->next; - write_unlock_bh(&ip6_sk_fl_lock); + *sflp = rcu_dereference(sfl->next); + spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); - kfree(sfl); + kfree_rcu(sfl, rcu); return 0; } } - write_unlock_bh(&ip6_sk_fl_lock); + spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; case IPV6_FL_A_RENEW: - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return err; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); - if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) { - fl = fl_lookup(freq.flr_label); + if (freq.flr_share == IPV6_FL_S_NONE && + ns_capable(net->user_ns, CAP_NET_ADMIN)) { + fl = fl_lookup(net, freq.flr_label); if (fl) { err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); fl_release(fl); @@ -493,21 +580,35 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) return -ESRCH; case IPV6_FL_A_GET: + if (freq.flr_flags & IPV6_FL_F_REFLECT) { + struct net *net = sock_net(sk); + if (net->ipv6.sysctl.flowlabel_consistency) { + net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); + return -EPERM; + } + + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + + np->repflow = 1; + return 0; + } + if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; - fl = fl_create(&freq, optval, optlen, &err); + fl = fl_create(net, sk, &freq, optval, optlen, &err); if (fl == NULL) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); if (freq.flr_label) { err = -EEXIST; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_flags&IPV6_FL_F_EXCL) { - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); goto done; } fl1 = sfl->fl; @@ -515,10 +616,10 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) break; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); if (fl1 == NULL) - fl1 = fl_lookup(freq.flr_label); + fl1 = fl_lookup(net, freq.flr_label); if (fl1) { recheck: err = -EEXIST; @@ -527,12 +628,10 @@ recheck: err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || - fl1->owner != fl->owner) - goto release; - - err = -EINVAL; - if (!ipv6_addr_equal(&fl1->dst, &fl->dst) || - ipv6_opt_cmp(fl1->opt, fl->opt)) + ((fl1->share == IPV6_FL_S_PROCESS) && + (fl1->owner.pid == fl->owner.pid)) || + ((fl1->share == IPV6_FL_S_USER) && + uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; @@ -559,7 +658,7 @@ release: if (sfl1 == NULL || (err = mem_check(sk)) != 0) goto done; - fl1 = fl_intern(fl, freq.flr_label); + fl1 = fl_intern(net, fl, freq.flr_label); if (fl1 != NULL) goto recheck; @@ -586,6 +685,8 @@ done: #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { + struct seq_net_private p; + struct pid_namespace *pid_ns; int bucket; }; @@ -595,27 +696,40 @@ static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) { struct ip6_flowlabel *fl = NULL; struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { - if (fl_ht[state->bucket]) { - fl = fl_ht[state->bucket]; - break; + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; } } + fl = NULL; +out: return fl; } static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + struct net *net = seq_file_net(seq); - fl = fl->next; - while (!fl) { - if (++state->bucket <= FL_HASH_MASK) - fl = fl_ht[state->bucket]; - else - break; + for_each_fl_continue_rcu(fl) { + if (net_eq(fl->fl_net, net)) + goto out; } + +try_again: + if (++state->bucket <= FL_HASH_MASK) { + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } + goto try_again; + } + fl = NULL; + +out: return fl; } @@ -629,9 +743,9 @@ static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(ip6_fl_lock) + __acquires(RCU) { - read_lock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -648,27 +762,32 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip6fl_seq_stop(struct seq_file *seq, void *v) - __releases(ip6_fl_lock) + __releases(RCU) { - read_unlock_bh(&ip6_fl_lock); + rcu_read_unlock_bh(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n", "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt"); else { struct ip6_flowlabel *fl = v; seq_printf(seq, - "%05X %-1d %-6d %-6d %-6ld %-8ld " NIP6_SEQFMT " %-4d\n", - (unsigned)ntohl(fl->label), + "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", + (unsigned int)ntohl(fl->label), fl->share, - (unsigned)fl->owner, + ((fl->share == IPV6_FL_S_PROCESS) ? + pid_nr_ns(fl->owner.pid, state->pid_ns) : + ((fl->share == IPV6_FL_S_USER) ? + from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : + 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, - NIP6(fl->dst), + &fl->dst, fl->opt ? fl->opt->opt_nflen : 0); } return 0; @@ -683,8 +802,29 @@ static const struct seq_operations ip6fl_seq_ops = { static int ip6fl_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ip6fl_seq_ops, - sizeof(struct ip6fl_iter_state)); + struct seq_file *seq; + struct ip6fl_iter_state *state; + int err; + + err = seq_open_net(inode, file, &ip6fl_seq_ops, + sizeof(struct ip6fl_iter_state)); + + if (!err) { + seq = file->private_data; + state = ip6fl_seq_private(seq); + rcu_read_lock(); + state->pid_ns = get_pid_ns(task_active_pid_ns(current)); + rcu_read_unlock(); + } + return err; +} + +static int ip6fl_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + put_pid_ns(state->pid_ns); + return seq_release_net(inode, file); } static const struct file_operations ip6fl_seq_fops = { @@ -692,19 +832,20 @@ static const struct file_operations ip6fl_seq_fops = { .open = ip6fl_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = ip6fl_seq_release, }; -static int ip6_flowlabel_proc_init(struct net *net) +static int __net_init ip6_flowlabel_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "ip6_flowlabel", S_IRUGO, &ip6fl_seq_fops)) + if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net, + &ip6fl_seq_fops)) return -ENOMEM; return 0; } -static void ip6_flowlabel_proc_fini(struct net *net) +static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { - proc_net_remove(net, "ip6_flowlabel"); + remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) @@ -713,17 +854,27 @@ static inline int ip6_flowlabel_proc_init(struct net *net) } static inline void ip6_flowlabel_proc_fini(struct net *net) { - return ; } #endif +static void __net_exit ip6_flowlabel_net_exit(struct net *net) +{ + ip6_fl_purge(net); + ip6_flowlabel_proc_fini(net); +} + +static struct pernet_operations ip6_flowlabel_net_ops = { + .init = ip6_flowlabel_proc_init, + .exit = ip6_flowlabel_net_exit, +}; + int ip6_flowlabel_init(void) { - return ip6_flowlabel_proc_init(&init_net); + return register_pernet_subsys(&ip6_flowlabel_net_ops); } void ip6_flowlabel_cleanup(void) { del_timer(&ip6_fl_gc_timer); - ip6_flowlabel_proc_fini(&init_net); + unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c new file mode 100644 index 00000000000..3873181ed85 --- /dev/null +++ b/net/ipv6/ip6_gre.c @@ -0,0 +1,1722 @@ +/* + * GRE over IPv6 protocol decoder. + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/hash.h> +#include <linux/if_tunnel.h> +#include <linux/ip6_tunnel.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/addrconf.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> + +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/ip6_tunnel.h> + + +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); + +#define HASH_SIZE_SHIFT 5 +#define HASH_SIZE (1 << HASH_SIZE_SHIFT) + +static int ip6gre_net_id __read_mostly; +struct ip6gre_net { + struct ip6_tnl __rcu *tunnels[4][HASH_SIZE]; + + struct net_device *fb_tunnel_dev; +}; + +static struct rtnl_link_ops ip6gre_link_ops __read_mostly; +static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; +static int ip6gre_tunnel_init(struct net_device *dev); +static void ip6gre_tunnel_setup(struct net_device *dev); +static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); +static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); + +/* Tunnel hash table */ + +/* + 4 hash tables: + + 3: (remote,local) + 2: (remote,*) + 1: (*,local) + 0: (*,*) + + We require exact key match i.e. if a key is present in packet + it will match only tunnel with the same key; if it is not present, + it will match only keyless tunnel. + + All keysless packets, if not matched configured keyless tunnels + will match fallback tunnel. + */ + +#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1)) +static u32 HASH_ADDR(const struct in6_addr *addr) +{ + u32 hash = ipv6_addr_hash(addr); + + return hash_32(hash, HASH_SIZE_SHIFT); +} + +#define tunnels_r_l tunnels[3] +#define tunnels_r tunnels[2] +#define tunnels_l tunnels[1] +#define tunnels_wc tunnels[0] + +/* Given src, dst and key, find appropriate for input tunnel. */ + +static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, + const struct in6_addr *remote, const struct in6_addr *local, + __be32 key, __be16 gre_proto) +{ + struct net *net = dev_net(dev); + int link = dev->ifindex; + unsigned int h0 = HASH_ADDR(remote); + unsigned int h1 = HASH_KEY(key); + struct ip6_tnl *t, *cand = NULL; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + ARPHRD_ETHER : ARPHRD_IP6GRE; + int score, cand_score = 4; + + for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_equal(remote, &t->parms.raddr) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { + if (!ipv6_addr_equal(remote, &t->parms.raddr) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { + if ((!ipv6_addr_equal(local, &t->parms.laddr) && + (!ipv6_addr_equal(local, &t->parms.raddr) || + !ipv6_addr_is_multicast(local))) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { + if (t->parms.i_key != key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + if (cand != NULL) + return cand; + + dev = ign->fb_tunnel_dev; + if (dev->flags & IFF_UP) + return netdev_priv(dev); + + return NULL; +} + +static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign, + const struct __ip6_tnl_parm *p) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned int h = HASH_KEY(p->i_key); + int prio = 0; + + if (!ipv6_addr_any(local)) + prio |= 1; + if (!ipv6_addr_any(remote) && !ipv6_addr_is_multicast(remote)) { + prio |= 2; + h ^= HASH_ADDR(remote); + } + + return &ign->tunnels[prio][h]; +} + +static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign, + const struct ip6_tnl *t) +{ + return __ip6gre_bucket(ign, &t->parms); +} + +static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = ip6gre_bucket(ign, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static struct ip6_tnl *ip6gre_tunnel_find(struct net *net, + const struct __ip6_tnl_parm *parms, + int type) +{ + const struct in6_addr *remote = &parms->raddr; + const struct in6_addr *local = &parms->laddr; + __be32 key = parms->i_key; + int link = parms->link; + struct ip6_tnl *t; + struct ip6_tnl __rcu **tp; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + for (tp = __ip6gre_bucket(ign, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + key == t->parms.i_key && + link == t->parms.link && + type == t->dev->type) + break; + + return t; +} + +static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, + const struct __ip6_tnl_parm *parms, int create) +{ + struct ip6_tnl *t, *nt; + struct net_device *dev; + char name[IFNAMSIZ]; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); + if (t || !create) + return t; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else + strcpy(name, "ip6gre%d"); + + dev = alloc_netdev(sizeof(*t), name, ip6gre_tunnel_setup); + if (!dev) + return NULL; + + dev_net_set(dev, net); + + nt = netdev_priv(dev); + nt->parms = *parms; + dev->rtnl_link_ops = &ip6gre_link_ops; + + nt->dev = dev; + nt->net = dev_net(dev); + ip6gre_tnl_link_config(nt, 1); + + if (register_netdevice(dev) < 0) + goto failed_free; + + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + + dev_hold(dev); + ip6gre_tunnel_link(ign, nt); + return nt; + +failed_free: + free_netdev(dev); + return NULL; +} + +static void ip6gre_tunnel_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); + + ip6gre_tunnel_unlink(ign, t); + dev_put(dev); +} + + +static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; + __be16 *p = (__be16 *)(skb->data + offset); + int grehlen = offset + 4; + struct ip6_tnl *t; + __be16 flags; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_KEY) { + grehlen += 4; + if (flags&GRE_CSUM) + grehlen += 4; + } + } + + /* If only 8 bytes returned, keyed message will be dropped here */ + if (!pskb_may_pull(skb, grehlen)) + return; + ipv6h = (const struct ipv6hdr *)skb->data; + p = (__be16 *)(skb->data + offset); + + t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, + flags & GRE_KEY ? + *(((__be32 *)p) + (grehlen / 4) - 1) : 0, + p[1]); + if (t == NULL) + return; + + switch (type) { + __u32 teli; + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 mtu; + case ICMPV6_DEST_UNREACH: + net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); + break; + case ICMPV6_TIME_EXCEED: + if (code == ICMPV6_EXC_HOPLIMIT) { + net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); + } + break; + case ICMPV6_PARAMPROB: + teli = 0; + if (code == ICMPV6_HDR_FIELD) + teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); + + if (teli && teli == info - 2) { + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; + if (tel->encap_limit == 0) { + net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); + } + } else { + net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); + } + break; + case ICMPV6_PKT_TOOBIG: + mtu = info - offset; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; + break; + } + + if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +} + +static int ip6gre_rcv(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + u8 *h; + __be16 flags; + __sum16 csum = 0; + __be32 key = 0; + u32 seqno = 0; + struct ip6_tnl *tunnel; + int offset = 4; + __be16 gre_proto; + int err; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + goto drop; + + ipv6h = ipv6_hdr(skb); + h = skb->data; + flags = *(__be16 *)h; + + if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { + /* - Version must be 0. + - We do not support routing headers. + */ + if (flags&(GRE_VERSION|GRE_ROUTING)) + goto drop; + + if (flags&GRE_CSUM) { + csum = skb_checksum_simple_validate(skb); + offset += 4; + } + if (flags&GRE_KEY) { + key = *(__be32 *)(h + offset); + offset += 4; + } + if (flags&GRE_SEQ) { + seqno = ntohl(*(__be32 *)(h + offset)); + offset += 4; + } + } + + gre_proto = *(__be16 *)(h + 2); + + tunnel = ip6gre_tunnel_lookup(skb->dev, + &ipv6h->saddr, &ipv6h->daddr, key, + gre_proto); + if (tunnel) { + struct pcpu_sw_netstats *tstats; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + if (!ip6_tnl_rcv_ctl(tunnel, &ipv6h->daddr, &ipv6h->saddr)) { + tunnel->dev->stats.rx_dropped++; + goto drop; + } + + skb->protocol = gre_proto; + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { + skb->protocol = htons(ETH_P_IP); + if ((*(h + offset) & 0xF0) != 0x40) + offset += 4; + } + + skb->mac_header = skb->network_header; + __pskb_pull(skb, offset); + skb_postpull_rcsum(skb, skb_transport_header(skb), offset); + + if (((flags&GRE_CSUM) && csum) || + (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + if (tunnel->parms.i_flags&GRE_SEQ) { + if (!(flags&GRE_SEQ) || + (tunnel->i_seqno && + (s32)(seqno - tunnel->i_seqno) < 0)) { + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + tunnel->i_seqno = seqno + 1; + } + + /* Warning: All skb pointers will be invalidated! */ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + + ipv6h = ipv6_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } + + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + + skb_reset_network_header(skb); + + err = IP6_ECN_decapsulate(ipv6h, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", + &ipv6h->saddr, + ipv6_get_dsfield(ipv6h)); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto drop; + } + } + + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + netif_rx(skb); + + return 0; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + +struct ipv6_tel_txoption { + struct ipv6_txoptions ops; + __u8 dst_opt[8]; +}; + +static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) +{ + memset(opt, 0, sizeof(struct ipv6_tel_txoption)); + + opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; + opt->dst_opt[3] = 1; + opt->dst_opt[4] = encap_limit; + opt->dst_opt[5] = IPV6_TLV_PADN; + opt->dst_opt[6] = 1; + + opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt; + opt->ops.opt_nflen = 8; +} + +static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, + struct net_device *dev, + __u8 dsfield, + struct flowi6 *fl6, + int encap_limit, + __u32 *pmtu) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *ipv6h; /* Our new IP header */ + unsigned int max_headroom = 0; /* The extra header space needed */ + int gre_hlen; + struct ipv6_tel_txoption opt; + int mtu; + struct dst_entry *dst = NULL, *ndst = NULL; + struct net_device_stats *stats = &tunnel->dev->stats; + int err = -1; + u8 proto; + struct sk_buff *new_skb; + + if (dev->type == ARPHRD_ETHER) + IPCB(skb)->flags = 0; + + if (dev->header_ops && dev->type == ARPHRD_IP6GRE) { + gre_hlen = 0; + ipv6h = (struct ipv6hdr *)skb->data; + fl6->daddr = ipv6h->daddr; + } else { + gre_hlen = tunnel->hlen; + fl6->daddr = tunnel->parms.raddr; + } + + if (!fl6->flowi6_mark) + dst = ip6_tnl_dst_check(tunnel); + + if (!dst) { + ndst = ip6_route_output(net, NULL, fl6); + + if (ndst->error) + goto tx_err_link_failure; + ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); + ndst = NULL; + goto tx_err_link_failure; + } + dst = ndst; + } + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + tunnel->parms.name); + goto tx_err_dst_release; + } + + mtu = dst_mtu(dst) - sizeof(*ipv6h); + if (encap_limit >= 0) { + max_headroom += 8; + mtu -= 8; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (skb->len > mtu) { + *pmtu = mtu; + err = -EMSGSIZE; + goto tx_err_dst_release; + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IP6TUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + + max_headroom += LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len; + + if (skb_headroom(skb) < max_headroom || skb_shared(skb) || + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + new_skb = skb_realloc_headroom(skb, max_headroom); + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; + if (!new_skb) + goto tx_err_dst_release; + + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + consume_skb(skb); + skb = new_skb; + } + + if (fl6->flowi6_mark) { + skb_dst_set(skb, dst); + ndst = NULL; + } else { + skb_dst_set_noref(skb, dst); + } + + proto = NEXTHDR_GRE; + if (encap_limit >= 0) { + init_tel_txopt(&opt, encap_limit); + ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); + } + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + skb_push(skb, gre_hlen); + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(*ipv6h)); + + /* + * Push down and install the IP header. + */ + ipv6h = ipv6_hdr(skb); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ipv6h->hop_limit = tunnel->parms.hop_limit; + ipv6h->nexthdr = proto; + ipv6h->saddr = fl6->saddr; + ipv6h->daddr = fl6->daddr; + + ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags; + ((__be16 *)(ipv6h + 1))[1] = (dev->type == ARPHRD_ETHER) ? + htons(ETH_P_TEB) : skb->protocol; + + if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4); + + if (tunnel->parms.o_flags&GRE_SEQ) { + ++tunnel->o_seqno; + *ptr = htonl(tunnel->o_seqno); + ptr--; + } + if (tunnel->parms.o_flags&GRE_KEY) { + *ptr = tunnel->parms.o_key; + ptr--; + } + if (tunnel->parms.o_flags&GRE_CSUM) { + *ptr = 0; + *(__sum16 *)ptr = ip_compute_csum((void *)(ipv6h+1), + skb->len - sizeof(struct ipv6hdr)); + } + } + + ip6tunnel_xmit(skb, dev); + if (ndst) + ip6_tnl_dst_store(tunnel, ndst); + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(ndst); + return err; +} + +static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + const struct iphdr *iph = ip_hdr(skb); + int encap_limit = -1; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + dsfield = ipv4_get_dsfield(iph); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + return -1; + } + + return 0; +} + +static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int encap_limit = -1; + __u16 offset; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + return -1; + + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + + dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + if (err == -EMSGSIZE) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + return -1; + } + + return 0; +} + +/** + * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ + +static inline bool ip6gre_tnl_addr_conflict(const struct ip6_tnl *t, + const struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + int encap_limit = -1; + struct flowi6 fl6; + __u32 mtu; + int err; + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = skb->protocol; + + err = ip6gre_xmit2(skb, dev, 0, &fl6, encap_limit, &mtu); + + return err; +} + +static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + int ret; + + if (!ip6_tnl_xmit_ctl(t)) + goto tx_err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ret = ip6gre_xmit_ipv4(skb, dev); + break; + case htons(ETH_P_IPV6): + ret = ip6gre_xmit_ipv6(skb, dev); + break; + default: + ret = ip6gre_xmit_other(skb, dev); + break; + } + + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) +{ + struct net_device *dev = t->dev; + struct __ip6_tnl_parm *p = &t->parms; + struct flowi6 *fl6 = &t->fl.u.ip6; + int addend = sizeof(struct ipv6hdr) + 4; + + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + } + + /* Set up flowi template */ + fl6->saddr = p->laddr; + fl6->daddr = p->raddr; + fl6->flowi6_oif = p->link; + fl6->flowlabel = 0; + + if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) + fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); + p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); + + if (p->flags&IP6_TNL_F_CAP_XMIT && + p->flags&IP6_TNL_F_CAP_RCV && dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + dev->iflink = p->link; + + /* Precalculate GRE options length */ + if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (t->parms.o_flags&GRE_CSUM) + addend += 4; + if (t->parms.o_flags&GRE_KEY) + addend += 4; + if (t->parms.o_flags&GRE_SEQ) + addend += 4; + } + t->hlen = addend; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + int strict = (ipv6_addr_type(&p->raddr) & + (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); + + struct rt6_info *rt = rt6_lookup(t->net, + &p->raddr, &p->laddr, + p->link, strict); + + if (rt == NULL) + return; + + if (rt->dst.dev) { + dev->hard_header_len = rt->dst.dev->hard_header_len + addend; + + if (set_mtu) { + dev->mtu = rt->dst.dev->mtu - addend; + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + } + ip6_rt_put(rt); + } +} + +static int ip6gre_tnl_change(struct ip6_tnl *t, + const struct __ip6_tnl_parm *p, int set_mtu) +{ + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; + t->parms.flags = p->flags; + t->parms.hop_limit = p->hop_limit; + t->parms.encap_limit = p->encap_limit; + t->parms.flowinfo = p->flowinfo; + t->parms.link = p->link; + t->parms.proto = p->proto; + t->parms.i_key = p->i_key; + t->parms.o_key = p->o_key; + t->parms.i_flags = p->i_flags; + t->parms.o_flags = p->o_flags; + ip6_tnl_dst_reset(t); + ip6gre_tnl_link_config(t, set_mtu); + return 0; +} + +static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, + const struct ip6_tnl_parm2 *u) +{ + p->laddr = u->laddr; + p->raddr = u->raddr; + p->flags = u->flags; + p->hop_limit = u->hop_limit; + p->encap_limit = u->encap_limit; + p->flowinfo = u->flowinfo; + p->link = u->link; + p->i_key = u->i_key; + p->o_key = u->o_key; + p->i_flags = u->i_flags; + p->o_flags = u->o_flags; + memcpy(p->name, u->name, sizeof(u->name)); +} + +static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, + const struct __ip6_tnl_parm *p) +{ + u->proto = IPPROTO_GRE; + u->laddr = p->laddr; + u->raddr = p->raddr; + u->flags = p->flags; + u->hop_limit = p->hop_limit; + u->encap_limit = p->encap_limit; + u->flowinfo = p->flowinfo; + u->link = p->link; + u->i_key = p->i_key; + u->o_key = p->o_key; + u->i_flags = p->i_flags; + u->o_flags = p->o_flags; + memcpy(u->name, p->name, sizeof(u->name)); +} + +static int ip6gre_tunnel_ioctl(struct net_device *dev, + struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip6_tnl_parm2 p; + struct __ip6_tnl_parm p1; + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ign->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + ip6gre_tnl_parm_from_user(&p1, &p); + t = ip6gre_tunnel_locate(net, &p1, 0); + if (t == NULL) + t = netdev_priv(dev); + } + memset(&p, 0, sizeof(p)); + ip6gre_tnl_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)) + goto done; + + if (!(p.i_flags&GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags&GRE_KEY)) + p.o_key = 0; + + ip6gre_tnl_parm_from_user(&p1, &p); + t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL); + + if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + t = netdev_priv(dev); + + ip6gre_tunnel_unlink(ign, t); + synchronize_net(); + ip6gre_tnl_change(t, &p1, 1); + ip6gre_tunnel_link(ign, t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + + memset(&p, 0, sizeof(p)); + ip6gre_tnl_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + + if (dev == ign->fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + ip6gre_tnl_parm_from_user(&p1, &p); + t = ip6gre_tunnel_locate(net, &p1, 0); + if (t == NULL) + goto done; + err = -EPERM; + if (t == netdev_priv(ign->fb_tunnel_dev)) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static int ip6gre_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned int len) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); + __be16 *p = (__be16 *)(ipv6h+1); + + ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); + ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->nexthdr = NEXTHDR_GRE; + ipv6h->saddr = t->parms.laddr; + ipv6h->daddr = t->parms.raddr; + + p[0] = t->parms.o_flags; + p[1] = htons(type); + + /* + * Set the source hardware address. + */ + + if (saddr) + memcpy(&ipv6h->saddr, saddr, sizeof(struct in6_addr)); + if (daddr) + memcpy(&ipv6h->daddr, daddr, sizeof(struct in6_addr)); + if (!ipv6_addr_any(&ipv6h->daddr)) + return t->hlen; + + return -t->hlen; +} + +static const struct header_ops ip6gre_header_ops = { + .create = ip6gre_header, +}; + +static const struct net_device_ops ip6gre_netdev_ops = { + .ndo_init = ip6gre_tunnel_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6gre_tunnel_xmit, + .ndo_do_ioctl = ip6gre_tunnel_ioctl, + .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +static void ip6gre_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static void ip6gre_tunnel_setup(struct net_device *dev) +{ + struct ip6_tnl *t; + + dev->netdev_ops = &ip6gre_netdev_ops; + dev->destructor = ip6gre_dev_free; + + dev->type = ARPHRD_IP6GRE; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr) + 4; + dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr) - 4; + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + dev->flags |= IFF_NOARP; + dev->iflink = 0; + dev->addr_len = sizeof(struct in6_addr); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int i; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); + + if (ipv6_addr_any(&tunnel->parms.raddr)) + dev->header_ops = &ip6gre_header_ops; + + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct pcpu_sw_netstats *ip6gre_tunnel_stats; + ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); + u64_stats_init(&ip6gre_tunnel_stats->syncp); + } + + + return 0; +} + +static void ip6gre_fb_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + tunnel->hlen = sizeof(struct ipv6hdr) + 4; + + dev_hold(dev); +} + + +static struct inet6_protocol ip6gre_protocol __read_mostly = { + .handler = ip6gre_rcv, + .err_handler = ip6gre_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +{ + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct net_device *dev, *aux; + int prio; + + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &ip6gre_link_ops || + dev->rtnl_link_ops == &ip6gre_tap_ops) + unregister_netdevice_queue(dev, head); + + for (prio = 0; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip6_tnl *t; + + t = rtnl_dereference(ign->tunnels[prio][h]); + + while (t != NULL) { + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, + head); + t = rtnl_dereference(t->next); + } + } + } +} + +static int __net_init ip6gre_init_net(struct net *net) +{ + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + int err; + + ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", + ip6gre_tunnel_setup); + if (!ign->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(ign->fb_tunnel_dev, net); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + + + ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); + ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; + + err = register_netdev(ign->fb_tunnel_dev); + if (err) + goto err_reg_dev; + + rcu_assign_pointer(ign->tunnels_wc[0], + netdev_priv(ign->fb_tunnel_dev)); + return 0; + +err_reg_dev: + ip6gre_dev_free(ign->fb_tunnel_dev); +err_alloc_dev: + return err; +} + +static void __net_exit ip6gre_exit_net(struct net *net) +{ + LIST_HEAD(list); + + rtnl_lock(); + ip6gre_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations ip6gre_net_ops = { + .init = ip6gre_init_net, + .exit = ip6gre_exit_net, + .id = &ip6gre_net_id, + .size = sizeof(struct ip6gre_net), +}; + +static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be16 flags; + + if (!data) + return 0; + + flags = 0; + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (flags & (GRE_VERSION|GRE_ROUTING)) + return -EINVAL; + + return 0; +} + +static int ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + struct in6_addr daddr; + + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + goto out; + + if (data[IFLA_GRE_REMOTE]) { + nla_memcpy(&daddr, data[IFLA_GRE_REMOTE], sizeof(struct in6_addr)); + if (ipv6_addr_any(&daddr)) + return -EINVAL; + } + +out: + return ip6gre_tunnel_validate(tb, data); +} + + +static void ip6gre_netlink_parms(struct nlattr *data[], + struct __ip6_tnl_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_GRE_LINK]) + parms->link = nla_get_u32(data[IFLA_GRE_LINK]); + + if (data[IFLA_GRE_IFLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + + if (data[IFLA_GRE_OFLAGS]) + parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + + if (data[IFLA_GRE_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); + + if (data[IFLA_GRE_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); + + if (data[IFLA_GRE_LOCAL]) + nla_memcpy(&parms->laddr, data[IFLA_GRE_LOCAL], sizeof(struct in6_addr)); + + if (data[IFLA_GRE_REMOTE]) + nla_memcpy(&parms->raddr, data[IFLA_GRE_REMOTE], sizeof(struct in6_addr)); + + if (data[IFLA_GRE_TTL]) + parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]); + + if (data[IFLA_GRE_ENCAP_LIMIT]) + parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); + + if (data[IFLA_GRE_FLOWINFO]) + parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]); + + if (data[IFLA_GRE_FLAGS]) + parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); +} + +static int ip6gre_tap_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + ip6gre_tnl_link_config(tunnel, 1); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static const struct net_device_ops ip6gre_tap_netdev_ops = { + .ndo_init = ip6gre_tap_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6gre_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +static void ip6gre_tap_setup(struct net_device *dev) +{ + + ether_setup(dev); + + dev->netdev_ops = &ip6gre_tap_netdev_ops; + dev->destructor = ip6gre_dev_free; + + dev->iflink = 0; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static int ip6gre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ip6_tnl *nt; + struct net *net = dev_net(dev); + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + int err; + + nt = netdev_priv(dev); + ip6gre_netlink_parms(data, &nt->parms); + + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + eth_hw_addr_random(dev); + + nt->dev = dev; + nt->net = dev_net(dev); + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + + err = register_netdevice(dev); + if (err) + goto out; + + dev_hold(dev); + ip6gre_tunnel_link(ign, nt); + +out: + return err; +} + +static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip6_tnl *t, *nt = netdev_priv(dev); + struct net *net = nt->net; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct __ip6_tnl_parm p; + + if (dev == ign->fb_tunnel_dev) + return -EINVAL; + + ip6gre_netlink_parms(data, &p); + + t = ip6gre_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + t = nt; + + ip6gre_tunnel_unlink(ign, t); + ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); + ip6gre_tunnel_link(ign, t); + netdev_state_change(dev); + } + + return 0; +} + +static void ip6gre_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + if (dev != ign->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + +static size_t ip6gre_get_size(const struct net_device *dev) +{ + return + /* IFLA_GRE_LINK */ + nla_total_size(4) + + /* IFLA_GRE_IFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_OFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_IKEY */ + nla_total_size(4) + + /* IFLA_GRE_OKEY */ + nla_total_size(4) + + /* IFLA_GRE_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_GRE_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_GRE_TTL */ + nla_total_size(1) + + /* IFLA_GRE_TOS */ + nla_total_size(1) + + /* IFLA_GRE_ENCAP_LIMIT */ + nla_total_size(1) + + /* IFLA_GRE_FLOWINFO */ + nla_total_size(4) + + /* IFLA_GRE_FLAGS */ + nla_total_size(4) + + 0; +} + +static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct __ip6_tnl_parm *p = &t->parms; + + if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || + nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || + nla_put(skb, IFLA_GRE_LOCAL, sizeof(struct in6_addr), &p->laddr) || + nla_put(skb, IFLA_GRE_REMOTE, sizeof(struct in6_addr), &p->raddr) || + nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || + /*nla_put_u8(skb, IFLA_GRE_TOS, t->priority) ||*/ + nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || + nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || + nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { + [IFLA_GRE_LINK] = { .type = NLA_U32 }, + [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_IKEY] = { .type = NLA_U32 }, + [IFLA_GRE_OKEY] = { .type = NLA_U32 }, + [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct ipv6hdr, daddr) }, + [IFLA_GRE_TTL] = { .type = NLA_U8 }, + [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, + [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 }, + [IFLA_GRE_FLAGS] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { + .kind = "ip6gre", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6gre_tunnel_setup, + .validate = ip6gre_tunnel_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .dellink = ip6gre_dellink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, +}; + +static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { + .kind = "ip6gretap", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6gre_tap_setup, + .validate = ip6gre_tap_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, +}; + +/* + * And now the modules code and kernel interface. + */ + +static int __init ip6gre_init(void) +{ + int err; + + pr_info("GRE over IPv6 tunneling driver\n"); + + err = register_pernet_device(&ip6gre_net_ops); + if (err < 0) + return err; + + err = inet6_add_protocol(&ip6gre_protocol, IPPROTO_GRE); + if (err < 0) { + pr_info("%s: can't add protocol\n", __func__); + goto add_proto_failed; + } + + err = rtnl_link_register(&ip6gre_link_ops); + if (err < 0) + goto rtnl_link_failed; + + err = rtnl_link_register(&ip6gre_tap_ops); + if (err < 0) + goto tap_ops_failed; + +out: + return err; + +tap_ops_failed: + rtnl_link_unregister(&ip6gre_link_ops); +rtnl_link_failed: + inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); +add_proto_failed: + unregister_pernet_device(&ip6gre_net_ops); + goto out; +} + +static void __exit ip6gre_fini(void) +{ + rtnl_link_unregister(&ip6gre_tap_ops); + rtnl_link_unregister(&ip6gre_link_ops); + inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); + unregister_pernet_device(&ip6gre_net_ops); +} + +module_init(ip6gre_init); +module_exit(ip6gre_fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); +MODULE_ALIAS_RTNL_LINK("ip6gre"); +MODULE_ALIAS_NETDEV("ip6gre0"); diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c new file mode 100644 index 00000000000..4578e23834f --- /dev/null +++ b/net/ipv6/ip6_icmp.c @@ -0,0 +1,47 @@ +#include <linux/export.h> +#include <linux/icmpv6.h> +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/spinlock.h> + +#include <net/ipv6.h> + +#if IS_ENABLED(CONFIG_IPV6) + +static ip6_icmp_send_t __rcu *ip6_icmp_send; + +int inet6_register_icmp_sender(ip6_icmp_send_t *fn) +{ + return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? + 0 : -EBUSY; +} +EXPORT_SYMBOL(inet6_register_icmp_sender); + +int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) +{ + int ret; + + ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? + 0 : -EINVAL; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(inet6_unregister_icmp_sender); + +void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +{ + ip6_icmp_send_t *send; + + rcu_read_lock(); + send = rcu_dereference(ip6_icmp_send); + + if (!send) + goto out; + send(skb, type, code, info); +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(icmpv6_send); +#endif diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 98ab4f45990..51d54dc376f 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -6,8 +6,6 @@ * Pedro Roque <roque@di.fc.ul.pt> * Ian P. Morris <I.P.Morris@soton.ac.uk> * - * $Id: ip6_input.c,v 1.19 2000/12/13 18:31:50 davem Exp $ - * * Based in linux/net/ipv4/ip_input.c * * This program is free software; you can redistribute it and/or @@ -29,6 +27,8 @@ #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> +#include <linux/mroute6.h> +#include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -44,12 +44,19 @@ #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> +#include <net/inet_ecn.h> - -inline int ip6_rcv_finish( struct sk_buff *skb) +int ip6_rcv_finish(struct sk_buff *skb) { - if (skb->dst == NULL) + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + const struct inet6_protocol *ipprot; + + ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); + if (ipprot && ipprot->early_demux) + ipprot->early_demux(skb); + } + if (!skb_dst(skb)) ip6_route_input(skb); return dst_input(skb); @@ -57,30 +64,26 @@ inline int ip6_rcv_finish( struct sk_buff *skb) int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct ipv6hdr *hdr; + const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; - - if (dev->nd_net != &init_net) { - kfree_skb(skb); - return 0; - } + struct net *net = dev_net(skb->dev); if (skb->pkt_type == PACKET_OTHERHOST) { kfree_skb(skb); - return 0; + return NET_RX_DROP; } rcu_read_lock(); idev = __in6_dev_get(skb->dev); - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INRECEIVES); + IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len); - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDISCARDS); - rcu_read_unlock(); - goto out; + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || + !idev || unlikely(idev->cnf.disable_ipv6)) { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + goto drop; } memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); @@ -96,7 +99,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ - IP6CB(skb)->iif = skb->dst ? ip6_dst_idev(skb->dst)->dev->ifindex : dev->ifindex; + IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; @@ -106,6 +109,48 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + IP6_ADD_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_NOECTPKTS + + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + /* + * RFC4291 2.5.3 + * A packet received on an interface with a destination address + * of loopback must be dropped. + */ + if (!(dev->flags & IFF_LOOPBACK) && + ipv6_addr_loopback(&hdr->daddr)) + goto err; + + /* RFC4291 Errata ID: 3480 + * Interface-Local scope spans only a single interface on a + * node and is useful only for loopback transmission of + * multicast. Packets with interface-local scope received + * from another node must be discarded. + */ + if (!(skb->pkt_type == PACKET_LOOPBACK || + dev->flags & IFF_LOOPBACK) && + ipv6_addr_is_multicast(&hdr->daddr) && + IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) + goto err; + + /* RFC4291 2.7 + * Nodes must not originate a packet to a multicast address whose scope + * field contains the reserved value 0; if such a packet is received, it + * must be silently dropped. + */ + if (ipv6_addr_is_multicast(&hdr->daddr) && + IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0) + goto err; + + /* + * RFC4291 2.7 + * Multicast addresses must not be used as source addresses in IPv6 + * packets or appear in any Routing header. + */ + if (ipv6_addr_is_multicast(&hdr->saddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); @@ -114,11 +159,12 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* pkt_len may be zero if Jumbo payload option is present */ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + sizeof(struct ipv6hdr) > skb->len) { - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INTRUNCATEDPKTS); + IP6_INC_STATS_BH(net, + idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) { - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } hdr = ipv6_hdr(skb); @@ -126,23 +172,25 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->nexthdr == NEXTHDR_HOP) { if (ipv6_parse_hopopts(skb) < 0) { - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); - return 0; + return NET_RX_DROP; } } rcu_read_unlock(); - return NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, dev, NULL, + /* Must drop socket now because of tproxy. */ + skb_orphan(skb); + + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL, ip6_rcv_finish); err: - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INHDRERRORS); + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); drop: rcu_read_unlock(); kfree_skb(skb); -out: - return 0; + return NET_RX_DROP; } /* @@ -152,11 +200,12 @@ out: static int ip6_input_finish(struct sk_buff *skb) { - struct inet6_protocol *ipprot; - unsigned int nhoff; - int nexthdr, raw; - u8 hash; + struct net *net = dev_net(skb_dst(skb)->dev); + const struct inet6_protocol *ipprot; struct inet6_dev *idev; + unsigned int nhoff; + int nexthdr; + bool raw; /* * Parse extension headers @@ -164,20 +213,18 @@ static int ip6_input_finish(struct sk_buff *skb) rcu_read_lock(); resubmit: - idev = ip6_dst_idev(skb->dst); + idev = ip6_dst_idev(skb_dst(skb)); if (!pskb_pull(skb, skb_transport_offset(skb))) goto discard; nhoff = IP6CB(skb)->nhoff; nexthdr = skb_network_header(skb)[nhoff]; raw = raw6_local_deliver(skb, nexthdr); - - hash = nexthdr & (MAX_INET_PROTOS - 1); - if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { + if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) { int ret; if (ipprot->flags & INET6_PROTO_FINAL) { - struct ipv6hdr *hdr; + const struct ipv6hdr *hdr; /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded @@ -190,7 +237,7 @@ resubmit: if (ipv6_addr_is_multicast(&hdr->daddr) && !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, &hdr->saddr) && - !ipv6_is_mld(skb, nexthdr)) + !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && @@ -201,24 +248,26 @@ resubmit: if (ret > 0) goto resubmit; else if (ret == 0) - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDELIVERS); + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INUNKNOWNPROTOS); + IP6_INC_STATS_BH(net, idev, + IPSTATS_MIB_INUNKNOWNPROTOS); icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_UNK_NEXTHDR, nhoff, - skb->dev); + ICMPV6_UNK_NEXTHDR, nhoff); } - } else - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); + kfree_skb(skb); + } else { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + consume_skb(skb); + } } rcu_read_unlock(); return 0; discard: - IP6_INC_STATS_BH(idev, IPSTATS_MIB_INDISCARDS); + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); rcu_read_unlock(); kfree_skb(skb); return 0; @@ -227,52 +276,86 @@ discard: int ip6_input(struct sk_buff *skb) { - return NF_HOOK(PF_INET6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip6_input_finish); } int ip6_mc_input(struct sk_buff *skb) { - struct ipv6hdr *hdr; - int deliver; + const struct ipv6hdr *hdr; + bool deliver; - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_INMCASTPKTS); + IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev), + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, + skb->len); hdr = ipv6_hdr(skb); deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); +#ifdef CONFIG_IPV6_MROUTE /* - * IPv6 multicast router mode isnt currently supported. + * IPv6 multicast router mode is now supported ;) */ -#if 0 - if (ipv6_config.multicast_route) { - int addr_type; - - addr_type = ipv6_addr_type(&hdr->daddr); - - if (!(addr_type & (IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL))) { - struct sk_buff *skb2; - struct dst_entry *dst; + if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && + !(ipv6_addr_type(&hdr->daddr) & + (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && + likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { + /* + * Okay, we try to forward - split and duplicate + * packets. + */ + struct sk_buff *skb2; + struct inet6_skb_parm *opt = IP6CB(skb); + + /* Check for MLD */ + if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { + /* Check if this is a mld message */ + u8 nexthdr = hdr->nexthdr; + __be16 frag_off; + int offset; + + /* Check if the value of Router Alert + * is for MLD (0x0000). + */ + if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) { + deliver = false; + + if (!ipv6_ext_hdr(nexthdr)) { + /* BUG */ + goto out; + } + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), + &nexthdr, &frag_off); + if (offset < 0) + goto out; + + if (!ipv6_is_mld(skb, nexthdr, offset)) + goto out; + + deliver = true; + } + /* unknown RA - process it normally */ + } - dst = skb->dst; + if (deliver) + skb2 = skb_clone(skb, GFP_ATOMIC); + else { + skb2 = skb; + skb = NULL; + } - if (deliver) { - skb2 = skb_clone(skb, GFP_ATOMIC); - dst_output(skb2); - } else { - dst_output(skb); - return 0; - } + if (skb2) { + ip6_mr_input(skb2); } } +out: #endif - - if (likely(deliver)) { + if (likely(deliver)) ip6_input(skb); - return 0; + else { + /* discard */ + kfree_skb(skb); } - /* discard */ - kfree_skb(skb); return 0; } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c new file mode 100644 index 00000000000..65eda2a8af4 --- /dev/null +++ b/net/ipv6/ip6_offload.c @@ -0,0 +1,337 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/socket.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/printk.h> + +#include <net/protocol.h> +#include <net/ipv6.h> + +#include "ip6_offload.h" + +static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) +{ + const struct net_offload *ops = NULL; + + for (;;) { + struct ipv6_opt_hdr *opth; + int len; + + if (proto != NEXTHDR_HOP) { + ops = rcu_dereference(inet6_offloads[proto]); + + if (unlikely(!ops)) + break; + + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + + if (unlikely(!pskb_may_pull(skb, 8))) + break; + + opth = (void *)skb->data; + len = ipv6_optlen(opth); + + if (unlikely(!pskb_may_pull(skb, len))) + break; + + proto = opth->nexthdr; + __skb_pull(skb, len); + } + + return proto; +} + +static int ipv6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + const struct net_offload *ops; + int err = -EINVAL; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + goto out; + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + err = -EPROTONOSUPPORT; + + ops = rcu_dereference(inet6_offloads[ + ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); + + if (likely(ops && ops->callbacks.gso_send_check)) { + skb_reset_transport_header(skb); + err = ops->callbacks.gso_send_check(skb); + } + +out: + return err; +} + +static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct ipv6hdr *ipv6h; + const struct net_offload *ops; + int proto; + struct frag_hdr *fptr; + unsigned int unfrag_ip6hlen; + u8 *prevhdr; + int offset = 0; + bool encap, udpfrag; + int nhoff; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPIP | + SKB_GSO_SIT | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_MPLS | + SKB_GSO_TCPV6 | + 0))) + goto out; + + skb_reset_network_header(skb); + nhoff = skb_network_header(skb) - skb_mac_header(skb); + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + goto out; + + encap = SKB_GSO_CB(skb)->encap_level > 0; + if (encap) + features = skb->dev->hw_enc_features & netif_skb_features(skb); + SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + segs = ERR_PTR(-EPROTONOSUPPORT); + + proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); + + if (skb->encapsulation && + skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + udpfrag = proto == IPPROTO_UDP && encap; + else + udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; + + ops = rcu_dereference(inet6_offloads[proto]); + if (likely(ops && ops->callbacks.gso_segment)) { + skb_reset_transport_header(skb); + segs = ops->callbacks.gso_segment(skb, features); + } + + if (IS_ERR(segs)) + goto out; + + for (skb = segs; skb; skb = skb->next) { + ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); + ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h)); + skb->network_header = (u8 *)ipv6h - skb->head; + + if (udpfrag) { + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); + fptr->frag_off = htons(offset); + if (skb->next != NULL) + fptr->frag_off |= htons(IP6_MF); + offset += (ntohs(ipv6h->payload_len) - + sizeof(struct frag_hdr)); + } + if (encap) + skb_reset_inner_headers(skb); + } + +out: + return segs; +} + +/* Return the total length of all the extension hdrs, following the same + * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs. + */ +static int ipv6_exthdrs_len(struct ipv6hdr *iph, + const struct net_offload **opps) +{ + struct ipv6_opt_hdr *opth = (void *)iph; + int len = 0, proto, optlen = sizeof(*iph); + + proto = iph->nexthdr; + for (;;) { + if (proto != NEXTHDR_HOP) { + *opps = rcu_dereference(inet6_offloads[proto]); + if (unlikely(!(*opps))) + break; + if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + opth = (void *)opth + optlen; + optlen = ipv6_optlen(opth); + len += optlen; + proto = opth->nexthdr; + } + return len; +} + +static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_offload *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct ipv6hdr *iph; + unsigned int nlen; + unsigned int hlen; + unsigned int off; + u16 flush = 1; + int proto; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } + + skb_set_network_header(skb, off); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); + + flush += ntohs(iph->payload_len) != skb_gro_len(skb); + + rcu_read_lock(); + proto = iph->nexthdr; + ops = rcu_dereference(inet6_offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + ops = rcu_dereference(inet6_offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + NAPI_GRO_CB(skb)->proto = proto; + + flush--; + nlen = skb_network_header_len(skb); + + for (p = *head; p; p = p->next) { + const struct ipv6hdr *iph2; + __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + iph2 = (struct ipv6hdr *)(p->data + off); + first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; + + /* All fields must match except length and Traffic Class. + * XXX skbs on the gro_list have all been parsed and pulled + * already so we don't need to compare nlen + * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) + * memcmp() alone below is suffcient, right? + */ + if ((first_word & htonl(0xF00FFFFF)) || + memcmp(&iph->nexthdr, &iph2->nexthdr, + nlen - offsetof(struct ipv6hdr, nexthdr))) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + /* flush if Traffic Class fields are different */ + NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); + NAPI_GRO_CB(p)->flush |= flush; + } + + NAPI_GRO_CB(skb)->flush |= flush; + + skb_gro_postpull_rcsum(skb, iph, nlen); + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct net_offload *ops; + struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); + int err = -ENOSYS; + + iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); + + rcu_read_lock(); + + nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct packet_offload ipv6_packet_offload __read_mostly = { + .type = cpu_to_be16(ETH_P_IPV6), + .callbacks = { + .gso_send_check = ipv6_gso_send_check, + .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, + }, +}; + +static const struct net_offload sit_offload = { + .callbacks = { + .gso_send_check = ipv6_gso_send_check, + .gso_segment = ipv6_gso_segment, + }, +}; + +static int __init ipv6_offload_init(void) +{ + + if (tcpv6_offload_init() < 0) + pr_crit("%s: Cannot add TCP protocol offload\n", __func__); + if (udp_offload_init() < 0) + pr_crit("%s: Cannot add UDP protocol offload\n", __func__); + if (ipv6_exthdrs_offload_init() < 0) + pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); + + dev_add_offload(&ipv6_packet_offload); + + inet_add_offload(&sit_offload, IPPROTO_IPV6); + + return 0; +} + +fs_initcall(ipv6_offload_init); diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h new file mode 100644 index 00000000000..2e155c651b3 --- /dev/null +++ b/net/ipv6/ip6_offload.h @@ -0,0 +1,18 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef __ip6_offload_h +#define __ip6_offload_h + +int ipv6_exthdrs_offload_init(void); +int udp_offload_init(void); +int tcpv6_offload_init(void); + +#endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8b67ca07467..45702b8cd14 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,8 +5,6 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $ - * * Based on linux/net/ipv4/ip_output.c * * This program is free software; you can redistribute it and/or @@ -39,6 +37,7 @@ #include <linux/tcp.h> #include <linux/route.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -55,145 +54,115 @@ #include <net/icmp.h> #include <net/xfrm.h> #include <net/checksum.h> +#include <linux/mroute6.h> -static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); - -static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr) -{ - static u32 ipv6_fragmentation_id = 1; - static DEFINE_SPINLOCK(ip6_id_lock); - - spin_lock_bh(&ip6_id_lock); - fhdr->identification = htonl(ipv6_fragmentation_id); - if (++ipv6_fragmentation_id == 0) - ipv6_fragmentation_id = 1; - spin_unlock_bh(&ip6_id_lock); -} - -int __ip6_local_out(struct sk_buff *skb) -{ - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; - ipv6_hdr(skb)->payload_len = htons(len); - - return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, - dst_output); -} - -int ip6_local_out(struct sk_buff *skb) -{ - int err; - - err = __ip6_local_out(skb); - if (likely(err == 1)) - err = dst_output(skb); - - return err; -} -EXPORT_SYMBOL_GPL(ip6_local_out); - -static int ip6_output_finish(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - - if (dst->hh) - return neigh_hh_output(dst->hh, skb); - else if (dst->neighbour) - return dst->neighbour->output(skb); - - IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); - kfree_skb(skb); - return -EINVAL; - -} - -/* dev_loopback_xmit for use with netfilter. */ -static int ip6_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - BUG_TRAP(newskb->dst); - - netif_rx(newskb); - return 0; -} - - -static int ip6_output2(struct sk_buff *skb) +static int ip6_finish_output2(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; + struct neighbour *neigh; + struct in6_addr *nexthop; + int ret; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { - struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL; - struct inet6_dev *idev = ip6_dst_idev(skb->dst); + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) && - ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr)) { + if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) && + ((mroute6_socket(dev_net(dev), skb) && + !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || + ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr))) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); /* Do not check for IFF_ALLMULTI; multicast routing is not supported in any case. */ if (newskb) - NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, - ip6_dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, + dev_loopback_xmit); if (ipv6_hdr(skb)->hop_limit == 0) { - IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } } - IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS); - } + IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, + skb->len); - return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev, - ip6_output_finish); -} + if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= + IPV6_ADDR_SCOPE_NODELOCAL && + !(dev->flags & IFF_LOOPBACK)) { + kfree_skb(skb); + return 0; + } + } -static inline int ip6_skb_dst_mtu(struct sk_buff *skb) -{ - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + rcu_read_lock_bh(); + nexthop = rt6_nexthop((struct rt6_info *)dst); + neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); + if (!IS_ERR(neigh)) { + ret = dst_neigh_output(dst, neigh, skb); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); - return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ? - skb->dst->dev->mtu : dst_mtu(skb->dst); + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EINVAL; } -int ip6_output(struct sk_buff *skb) +static int ip6_finish_output(struct sk_buff *skb) { if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || - dst_allfrag(skb->dst)) - return ip6_fragment(skb, ip6_output2); + dst_allfrag(skb_dst(skb)) || + (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) + return ip6_fragment(skb, ip6_finish_output2); else - return ip6_output2(skb); + return ip6_finish_output2(skb); +} + +int ip6_output(struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + if (unlikely(idev->cnf.disable_ipv6)) { + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* - * xmit an sk_buff (used by TCP) + * xmit an sk_buff (used by TCP, SCTP and DCCP) */ -int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct ipv6_txoptions *opt, int ipfragok) +int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, + struct ipv6_txoptions *opt, int tclass) { + struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *first_hop = &fl->fl6_dst; - struct dst_entry *dst = skb->dst; + struct in6_addr *first_hop = &fl6->daddr; + struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr; - u8 proto = fl->proto; + u8 proto = fl6->flowi6_proto; int seg_len = skb->len; - int hlimit, tclass; + int hlimit = -1; u32 mtu; if (opt) { @@ -209,15 +178,14 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, if (skb_headroom(skb) < head_room) { struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); if (skb2 == NULL) { - IP6_INC_STATS(ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -ENOBUFS; } - kfree_skb(skb); + consume_skb(skb); skb = skb2; - if (sk) - skb_set_owner_w(skb, sk); + skb_set_owner_w(skb, sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -232,88 +200,41 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, /* * Fill in the IPv6 header */ - - hlimit = -1; if (np) hlimit = np->hop_limit; if (hlimit < 0) - hlimit = dst_metric(dst, RTAX_HOPLIMIT); - if (hlimit < 0) - hlimit = ipv6_get_hoplimit(dst->dev); - - tclass = -1; - if (np) - tclass = np->tclass; - if (tclass < 0) - tclass = 0; + hlimit = ip6_dst_hoplimit(dst); - *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel; + ip6_flow_hdr(hdr, tclass, fl6->flowlabel); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; - ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); - ipv6_addr_copy(&hdr->daddr, first_hop); + hdr->saddr = fl6->saddr; + hdr->daddr = *first_hop; + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; mtu = dst_mtu(dst); - if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) { - IP6_INC_STATS(ip6_dst_idev(skb->dst), - IPSTATS_MIB_OUTREQUESTS); - return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, - dst_output); + if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { + IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUT, skb->len); + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + dst->dev, dst_output); } - if (net_ratelimit()) - printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS); + ipv6_local_error(sk, EMSGSIZE, fl6, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } EXPORT_SYMBOL(ip6_xmit); -/* - * To avoid extra problems ND packets are send through this - * routine. It's code duplication but I really want to avoid - * extra checks since ipv6_build_header is used by TCP (which - * is for us performance critical) - */ - -int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - int proto, int len) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6hdr *hdr; - int totlen; - - skb->protocol = htons(ETH_P_IPV6); - skb->dev = dev; - - totlen = len + sizeof(struct ipv6hdr); - - skb_reset_network_header(skb); - skb_put(skb, sizeof(struct ipv6hdr)); - hdr = ipv6_hdr(skb); - - *(__be32*)hdr = htonl(0x60000000); - - hdr->payload_len = htons(len); - hdr->nexthdr = proto; - hdr->hop_limit = np->hop_limit; - - ipv6_addr_copy(&hdr->saddr, saddr); - ipv6_addr_copy(&hdr->daddr, daddr); - - return 0; -} - static int ip6_call_ra_chain(struct sk_buff *skb, int sel) { struct ip6_ra_chain *ra; @@ -347,10 +268,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) { struct ipv6hdr *hdr = ipv6_hdr(skb); u8 nexthdr = hdr->nexthdr; + __be16 frag_off; int offset; if (ipv6_ext_hdr(nexthdr)) { - offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr); + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); if (offset < 0) return 0; } else @@ -399,17 +321,65 @@ static inline int ip6_forward_finish(struct sk_buff *skb) return dst_output(skb); } +static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + unsigned int mtu; + struct inet6_dev *idev; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + +static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ + if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) + return true; + + if (skb->ignore_df) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + int ip6_forward(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(dst->dev); + u32 mtu; - if (ipv6_devconf.forwarding == 0) + if (net->ipv6.devconf_all->forwarding == 0) goto error; + if (skb->pkt_type != PACKET_HOST) + goto drop; + + if (skb_warn_if_lro(skb)) + goto drop; + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } @@ -428,9 +398,8 @@ int ip6_forward(struct sk_buff *skb) * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK */ - if (opt->ra) { - u8 *ptr = skb_network_header(skb) + opt->ra; - if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { + if (ip6_call_ra_chain(skb, ntohs(opt->ra))) return 0; } @@ -440,41 +409,42 @@ int ip6_forward(struct sk_buff *skb) if (hdr->hop_limit <= 1) { /* Force OUTPUT device used as source address */ skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - 0, skb->dev); - IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; } /* XXX: idev->cnf.proxy_ndp? */ - if (ipv6_devconf.proxy_ndp && - pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) { + if (net->ipv6.devconf_all->proxy_ndp && + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } - dst = skb->dst; + dst = skb_dst(skb); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ - if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 && - !skb->sp) { + if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; + struct inet_peer *peer; struct rt6_info *rt; - struct neighbour *n = dst->neighbour; /* * incoming and outgoing devices are the same @@ -482,41 +452,53 @@ int ip6_forward(struct sk_buff *skb) */ rt = (struct rt6_info *) dst; - if ((rt->rt6i_flags & RTF_GATEWAY)) - target = (struct in6_addr*)&n->primary_key; + if (rt->rt6i_flags & RTF_GATEWAY) + target = &rt->rt6i_gateway; else target = &hdr->daddr; + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ - if (xrlim_allow(dst, 1*HZ)) - ndisc_send_redirect(skb, n, target); + if (inet_peer_xrlim_allow(peer, 1*HZ)) + ndisc_send_redirect(skb, target); + if (peer) + inet_putpeer(peer); } else { int addrtype = ipv6_addr_type(&hdr->saddr); /* This check is security critical. */ - if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK)) + if (addrtype == IPV6_ADDR_ANY || + addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) goto error; if (addrtype & IPV6_ADDR_LINKLOCAL) { icmpv6_send(skb, ICMPV6_DEST_UNREACH, - ICMPV6_NOT_NEIGHBOUR, 0, skb->dev); + ICMPV6_NOT_NEIGHBOUR, 0); goto error; } } - if (skb->len > dst_mtu(dst)) { + mtu = ip6_dst_mtu_forward(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev); - IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); - IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INTOOBIGERRORS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } if (skb_cow(skb, dst->dev->hard_header_len)) { - IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_OUTDISCARDS); goto drop; } @@ -526,12 +508,13 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; - IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); - return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); error: - IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; @@ -542,8 +525,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - dst_release(to->dst); - to->dst = dst_clone(from->dst); + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; @@ -551,80 +534,55 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) - to->nf_trace = from->nf_trace; -#endif skb_copy_secmark(to, from); } -int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) { - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - unsigned int packet_len = skb->tail - skb->network_header; - int found_rhdr = 0; - *nexthdr = &ipv6_hdr(skb)->nexthdr; - - while (offset + 1 <= packet_len) { - - switch (**nexthdr) { + static u32 ip6_idents_hashrnd __read_mostly; + u32 hash, id; - case NEXTHDR_HOP: - break; - case NEXTHDR_ROUTING: - found_rhdr = 1; - break; - case NEXTHDR_DEST: -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) - if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) - break; -#endif - if (found_rhdr) - return offset; - break; - default : - return offset; - } + net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + - offset); - } + hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); + hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash); - return offset; + id = ip_idents_reserve(hash, 1); + fhdr->identification = htonl(id); } -EXPORT_SYMBOL_GPL(ip6_find_1stfragopt); -static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { - struct net_device *dev; struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info*)skb->dst; + struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; + int hroom, troom; __be32 frag_id = 0; int ptr, offset = 0, err=0; u8 *prevhdr, nexthdr = 0; + struct net *net = dev_net(skb_dst(skb)->dev); - dev = rt->u.dst.dev; hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; mtu = ip6_skb_dst_mtu(skb); /* We must not fragment if the socket is set to force MTU discovery - * or if the skb it not generated by a local socket. (This last - * check should be redundant, but it's free.) + * or if the skb it not generated by a local socket. */ - if (!skb->local_df) { - skb->dev = skb->dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS); + if (unlikely(!skb->ignore_df && skb->len > mtu) || + (IP6CB(skb)->frag_max_size && + IP6CB(skb)->frag_max_size > mtu)) { + if (skb->sk && dst_allfrag(skb_dst(skb))) + sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } @@ -635,45 +593,45 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } mtu -= hlen + sizeof(struct frag_hdr); - if (skb_shinfo(skb)->frag_list) { + if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); - int truesizes = 0; + struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || skb_cloned(skb)) goto slow_path; - for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen) - goto slow_path; + goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) - goto slow_path; + goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { - sock_hold(skb->sk); frag->sk = skb->sk; frag->destructor = sock_wfree; - truesizes += frag->truesize; } + skb->truesize -= frag->truesize; } err = 0; offset = 0; frag = skb_shinfo(skb)->frag_list; - skb_shinfo(skb)->frag_list = NULL; + skb_frag_list_init(skb); /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!tmp_hdr) { - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); return -ENOMEM; } @@ -683,7 +641,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb_reset_network_header(skb); memcpy(skb_network_header(skb), tmp_hdr, hlen); - ipv6_select_ident(skb, fh); + ipv6_select_ident(fh, rt); fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(IP6_MF); @@ -691,12 +649,11 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); - skb->truesize -= truesizes; skb->len = first_len; ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); for (;;) { /* Prepare header of the next frame, @@ -724,7 +681,8 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) err = output(skb); if(!err) - IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES); + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -737,8 +695,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) kfree(tmp_hdr); if (err == 0) { - IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS); - dst_release(&rt->u.dst); + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGOKS); + ip6_rt_put(rt); return 0; } @@ -748,12 +707,26 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) frag = skb; } - IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS); - dst_release(&rt->u.dst); + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGFAILS); + ip6_rt_put(rt); return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } } slow_path: + if ((skb->ip_summed == CHECKSUM_PARTIAL) && + skb_checksum_help(skb)) + goto fail; + left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ @@ -762,6 +735,8 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; + hroom = LL_RESERVED_SPACE(rt->dst.dev); + troom = rt->dst.dev->needed_tailroom; /* * Keep copying data until we run out. @@ -771,7 +746,7 @@ slow_path: /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) len = mtu; - /* IF: we are not sending upto and including the packet end + /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < left) { len &= ~7; @@ -780,9 +755,10 @@ slow_path: * Allocate buffer. */ - if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { + if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + + hroom + troom, GFP_ATOMIC)) == NULL) { NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); - IP6_INC_STATS(ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; @@ -793,7 +769,7 @@ slow_path: */ ip6_copy_metadata(frag, skb); - skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(frag, hroom); skb_put(frag, len + hlen + sizeof(struct frag_hdr)); skb_reset_network_header(frag); fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); @@ -818,7 +794,7 @@ slow_path: fh->nexthdr = nexthdr; fh->reserved = 0; if (!frag_id) { - ipv6_select_ident(skb, fh); + ipv6_select_ident(fh, rt); frag_id = fh->identification; } else fh->identification = frag_id; @@ -846,38 +822,45 @@ slow_path: if (err) goto fail; - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGCREATES); } - IP6_INC_STATS(ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGOKS); - kfree_skb(skb); + consume_skb(skb); return err; fail: - IP6_INC_STATS(ip6_dst_idev(skb->dst), + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return err; } -static inline int ip6_rt_check(struct rt6key *rt_key, - struct in6_addr *fl_addr, - struct in6_addr *addr_cache) +static inline int ip6_rt_check(const struct rt6key *rt_key, + const struct in6_addr *fl_addr, + const struct in6_addr *addr_cache) { - return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && - (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache))); + return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && + (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); } static struct dst_entry *ip6_sk_dst_check(struct sock *sk, struct dst_entry *dst, - struct flowi *fl) + const struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt; if (!dst) goto out; + if (dst->ops->family != AF_INET6) { + dst_release(dst); + return NULL; + } + + rt = (struct rt6_info *)dst; /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, @@ -895,11 +878,11 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (fl->oif && fl->oif != dst->dev->ifindex)) { + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { dst_release(dst); dst = NULL; } @@ -909,63 +892,77 @@ out: } static int ip6_dst_lookup_tail(struct sock *sk, - struct dst_entry **dst, struct flowi *fl) + struct dst_entry **dst, struct flowi6 *fl6) { + struct net *net = sock_net(sk); +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + struct neighbour *n; + struct rt6_info *rt; +#endif int err; if (*dst == NULL) - *dst = ip6_route_output(sk, fl); + *dst = ip6_route_output(net, sk, fl6); if ((err = (*dst)->error)) goto out_err_release; - if (ipv6_addr_any(&fl->fl6_src)) { - err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); + if (ipv6_addr_any(&fl6->saddr)) { + struct rt6_info *rt = (struct rt6_info *) *dst; + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? inet6_sk(sk)->srcprefs : 0, + &fl6->saddr); if (err) goto out_err_release; } #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - /* - * Here if the dst entry we've looked up - * has a neighbour entry that is in the INCOMPLETE - * state and the src address from the flow is - * marked as OPTIMISTIC, we release the found - * dst entry and replace it instead with the - * dst entry of the nexthop router - */ - if (!((*dst)->neighbour->nud_state & NUD_VALID)) { - struct inet6_ifaddr *ifp; - struct flowi fl_gw; - int redirect; + /* + * Here if the dst entry we've looked up + * has a neighbour entry that is in the INCOMPLETE + * state and the src address from the flow is + * marked as OPTIMISTIC, we release the found + * dst entry and replace it instead with the + * dst entry of the nexthop router + */ + rt = (struct rt6_info *) *dst; + rcu_read_lock_bh(); + n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt)); + err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; + rcu_read_unlock_bh(); - ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src, - (*dst)->dev, 1); + if (err) { + struct inet6_ifaddr *ifp; + struct flowi6 fl_gw6; + int redirect; - redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); - if (ifp) - in6_ifa_put(ifp); + ifp = ipv6_get_ifaddr(net, &fl6->saddr, + (*dst)->dev, 1); - if (redirect) { - /* - * We need to get the dst entry for the - * default router instead - */ - dst_release(*dst); - memcpy(&fl_gw, fl, sizeof(struct flowi)); - memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr)); - *dst = ip6_route_output(sk, &fl_gw); - if ((err = (*dst)->error)) - goto out_err_release; - } + redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); + if (ifp) + in6_ifa_put(ifp); + + if (redirect) { + /* + * We need to get the dst entry for the + * default router instead + */ + dst_release(*dst); + memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); + memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); + *dst = ip6_route_output(net, sk, &fl_gw6); + if ((err = (*dst)->error)) + goto out_err_release; } + } #endif return 0; out_err_release: if (err == -ENETUNREACH) - IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); dst_release(*dst); *dst = NULL; return err; @@ -975,52 +972,88 @@ out_err_release: * ip6_dst_lookup - perform route lookup on flow * @sk: socket which provides route info * @dst: pointer to dst_entry * for result - * @fl: flow to lookup + * @fl6: flow to lookup * * This function performs a route lookup on the given flow. * * It returns zero on success, or a standard errno code on error. */ -int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { *dst = NULL; - return ip6_dst_lookup_tail(sk, dst, fl); + return ip6_dst_lookup_tail(sk, dst, fl6); } EXPORT_SYMBOL_GPL(ip6_dst_lookup); /** - * ip6_sk_dst_lookup - perform socket cached route lookup on flow + * ip6_dst_lookup_flow - perform route lookup on flow with ipsec + * @sk: socket which provides route info + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup + * + * This function performs a route lookup on the given flow. + * + * It returns a valid dst pointer on success, or a pointer encoded + * error code. + */ +struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst) +{ + struct dst_entry *dst = NULL; + int err; + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + fl6->daddr = *final_dst; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); +} +EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); + +/** + * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow * @sk: socket which provides the dst cache and route info - * @dst: pointer to dst_entry * for result - * @fl: flow to lookup + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. * It will take the socket dst lock when operating on the dst cache. * As a result, this function can only be used in process context. * - * It returns zero on success, or a standard errno code on error. + * It returns a valid dst pointer on success, or a pointer encoded + * error code. */ -int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst) { - *dst = NULL; - if (sk) { - *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); - *dst = ip6_sk_dst_check(sk, *dst, fl); - } + struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); + int err; - return ip6_dst_lookup_tail(sk, dst, fl); + dst = ip6_sk_dst_check(sk, dst, fl6); + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + fl6->daddr = *final_dst; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } -EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup); +EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags) + int transhdrlen, int mtu,unsigned int flags, + struct rt6_info *rt) { struct sk_buff *skb; + struct frag_hdr fhdr; int err; /* There is support for UDP large send offload by network @@ -1032,7 +1065,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); if (skb == NULL) - return -ENOMEM; + return err; /* reserve space for Hardware header */ skb_reserve(skb, hh_len); @@ -1046,116 +1079,202 @@ static inline int ip6_ufo_append_data(struct sock *sk, /* initialize protocol header pointer */ skb->transport_header = skb->network_header + fragheaderlen; - skb->ip_summed = CHECKSUM_PARTIAL; + skb->protocol = htons(ETH_P_IPV6); skb->csum = 0; - sk->sk_sndmsg_off = 0; - } - err = skb_append_datato_frags(sk,skb, getfrag, from, - (length - transhdrlen)); - if (!err) { - struct frag_hdr fhdr; - - /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->gso_size = mtu - fragheaderlen - - sizeof(struct frag_hdr); - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - ipv6_select_ident(skb, &fhdr); - skb_shinfo(skb)->ip6_frag_id = fhdr.identification; __skb_queue_tail(&sk->sk_write_queue, skb); - - return 0; + } else if (skb_is_gso(skb)) { + goto append; } - /* There is not enough support do UPD LSO, - * so follow normal path + + skb->ip_summed = CHECKSUM_PARTIAL; + /* Specify the length of each IPv6 datagram fragment. + * It has to be a multiple of 8. */ - kfree_skb(skb); + skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - + sizeof(struct frag_hdr)) & ~7; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + ipv6_select_ident(&fhdr, rt); + skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + +append: + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); +} - return err; +static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, + gfp_t gfp) +{ + return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; +} + +static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, + gfp_t gfp) +{ + return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; +} + +static void ip6_append_data_mtu(unsigned int *mtu, + int *maxfraglen, + unsigned int fragheaderlen, + struct sk_buff *skb, + struct rt6_info *rt, + unsigned int orig_mtu) +{ + if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { + if (skb == NULL) { + /* first fragment, reserve header_len */ + *mtu = orig_mtu - rt->dst.header_len; + + } else { + /* + * this fragment is not first, the headers + * space is regarded as data space. + */ + *mtu = orig_mtu; + } + *maxfraglen = ((*mtu - fragheaderlen) & ~7) + + fragheaderlen - sizeof(struct frag_hdr); + } } int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl, - struct rt6_info *rt, unsigned int flags) + int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, + struct rt6_info *rt, unsigned int flags, int dontfrag) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *skb; - unsigned int maxfraglen, fragheaderlen; + struct inet_cork *cork; + struct sk_buff *skb, *skb_prev = NULL; + unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; int exthdrlen; + int dst_exthdrlen; int hh_len; - int mtu; int copy; int err; int offset = 0; - int csummode = CHECKSUM_NONE; + __u8 tx_flags = 0; if (flags&MSG_PROBE) return 0; + cork = &inet->cork.base; if (skb_queue_empty(&sk->sk_write_queue)) { /* * setup for corking */ if (opt) { - if (np->cork.opt == NULL) { - np->cork.opt = kmalloc(opt->tot_len, - sk->sk_allocation); - if (unlikely(np->cork.opt == NULL)) - return -ENOBUFS; - } else if (np->cork.opt->tot_len < opt->tot_len) { - printk(KERN_DEBUG "ip6_append_data: invalid option length\n"); + if (WARN_ON(np->cork.opt)) return -EINVAL; - } - memcpy(np->cork.opt, opt, opt->tot_len); - inet->cork.flags |= IPCORK_OPT; + + np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); + if (unlikely(np->cork.opt == NULL)) + return -ENOBUFS; + + np->cork.opt->tot_len = opt->tot_len; + np->cork.opt->opt_flen = opt->opt_flen; + np->cork.opt->opt_nflen = opt->opt_nflen; + + np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, + sk->sk_allocation); + if (opt->dst0opt && !np->cork.opt->dst0opt) + return -ENOBUFS; + + np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, + sk->sk_allocation); + if (opt->dst1opt && !np->cork.opt->dst1opt) + return -ENOBUFS; + + np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, + sk->sk_allocation); + if (opt->hopopt && !np->cork.opt->hopopt) + return -ENOBUFS; + + np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, + sk->sk_allocation); + if (opt->srcrt && !np->cork.opt->srcrt) + return -ENOBUFS; + /* need source address above miyazawa*/ } - dst_hold(&rt->u.dst); - np->cork.rt = rt; - inet->cork.fl = *fl; + dst_hold(&rt->dst); + cork->dst = &rt->dst; + inet->cork.fl.u.ip6 = *fl6; np->cork.hop_limit = hlimit; np->cork.tclass = tclass; - mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? - rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); + if (rt->dst.flags & DST_XFRM_TUNNEL) + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); + else + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } - inet->cork.fragsize = mtu; - if (dst_allfrag(rt->u.dst.path)) - inet->cork.flags |= IPCORK_ALLFRAG; - inet->cork.length = 0; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; - exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) - - rt->rt6i_nfheader_len; + cork->fragsize = mtu; + if (dst_allfrag(rt->dst.path)) + cork->flags |= IPCORK_ALLFRAG; + cork->length = 0; + exthdrlen = (opt ? opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; + dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } else { - rt = np->cork.rt; - fl = &inet->cork.fl; - if (inet->cork.flags & IPCORK_OPT) - opt = np->cork.opt; + rt = (struct rt6_info *)cork->dst; + fl6 = &inet->cork.fl.u.ip6; + opt = np->cork.opt; transhdrlen = 0; exthdrlen = 0; - mtu = inet->cork.fragsize; + dst_exthdrlen = 0; + mtu = cork->fragsize; } + orig_mtu = mtu; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { - if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { - ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen); + unsigned int maxnonfragsize, headersize; + + headersize = sizeof(struct ipv6hdr) + + (opt ? opt->opt_flen + opt->opt_nflen : 0) + + (dst_allfrag(&rt->dst) ? + sizeof(struct frag_hdr) : 0) + + rt->rt6i_nfheader_len; + + if (ip6_sk_ignore_df(sk)) + maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; + else + maxnonfragsize = mtu; + + /* dontfrag active */ + if ((cork->length + length > mtu - headersize) && dontfrag && + (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_RAW)) { + ipv6_local_rxpmtu(sk, fl6, mtu - headersize + + sizeof(struct ipv6hdr)); + goto emsgsize; + } + + if (cork->length + length > maxnonfragsize - headersize) { +emsgsize: + ipv6_local_error(sk, EMSGSIZE, fl6, + mtu - headersize + + sizeof(struct ipv6hdr)); return -EMSGSIZE; } } + /* For UDP, check if TX timestamp is enabled */ + if (sk->sk_type == SOCK_DGRAM) + sock_tx_timestamp(sk, &tx_flags); + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1172,24 +1291,26 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, * --yoshfuji */ - inet->cork.length += length; - if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { - - err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len, - fragheaderlen, transhdrlen, mtu, - flags); + skb = skb_peek_tail(&sk->sk_write_queue); + cork->length += length; + if (((length > mtu) || + (skb && skb_is_gso(skb))) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->dst.dev->features & NETIF_F_UFO)) { + err = ip6_ufo_append_data(sk, getfrag, from, length, + hh_len, fragheaderlen, + transhdrlen, mtu, flags, rt); if (err) goto error; return 0; } - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ - copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; @@ -1199,38 +1320,46 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - struct sk_buff *skb_prev; alloc_new_skb: - skb_prev = skb; - /* There's no room in the current skb */ - if (skb_prev) - fraggap = skb_prev->len - maxfraglen; + if (skb) + fraggap = skb->len - maxfraglen; else fraggap = 0; + /* update mtu and maxfraglen if necessary */ + if (skb == NULL || skb_prev == NULL) + ip6_append_data_mtu(&mtu, &maxfraglen, + fragheaderlen, skb, rt, + orig_mtu); + + skb_prev = skb; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; - if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) - datalen = maxfraglen - fragheaderlen; - fraglen = datalen + fragheaderlen; + if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; if ((flags & MSG_MORE) && - !(rt->u.dst.dev->features&NETIF_F_SG)) + !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; - /* - * The last fragment gets additional space at tail. - * Note: we overallocate on fragments with MSG_MODE - * because we have no idea if we're the last one. - */ - if (datalen == length + fraggap) - alloclen += rt->u.dst.trailer_len; + alloclen += dst_exthdrlen; + + if (datalen != length + fraggap) { + /* + * this is not the last fragment, the trailer + * space is regarded as data space. + */ + datalen += rt->dst.trailer_len; + } + + alloclen += rt->dst.trailer_len; + fraglen = datalen + fragheaderlen; /* * We just reserve space for fragment header. @@ -1252,16 +1381,27 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; + else { + /* Only the initial fragment + * is time stamped. + */ + tx_flags = 0; + } } if (skb == NULL) goto error; /* * Fill in the control structures */ - skb->ip_summed = csummode; + skb->protocol = htons(ETH_P_IPV6); + skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; - /* reserve for fragmentation */ - skb_reserve(skb, hh_len+sizeof(struct frag_hdr)); + /* reserve for fragmentation and ipsec header */ + skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + + dst_exthdrlen); + + if (sk->sk_type == SOCK_DGRAM) + skb_shinfo(skb)->tx_flags = tx_flags; /* * Find where to start putting bytes @@ -1281,6 +1421,7 @@ alloc_new_skb: pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; + if (copy < 0) { err = -EINVAL; kfree_skb(skb); @@ -1295,7 +1436,7 @@ alloc_new_skb: length -= datalen - fraggap; transhdrlen = 0; exthdrlen = 0; - csummode = CHECKSUM_NONE; + dst_exthdrlen = 0; /* * Put the packet on the pending queue @@ -1307,7 +1448,7 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; @@ -1319,46 +1460,31 @@ alloc_new_skb: } } else { int i = skb_shinfo(skb)->nr_frags; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = sk->sk_sndmsg_page; - int off = sk->sk_sndmsg_off; - unsigned int left; - - if (page && (left = PAGE_SIZE - off) > 0) { - if (copy >= left) - copy = left; - if (page != frag->page) { - if (i == MAX_SKB_FRAGS) { - err = -EMSGSIZE; - goto error; - } - get_page(page); - skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); - frag = &skb_shinfo(skb)->frags[i]; - } - } else if(i < MAX_SKB_FRAGS) { - if (copy > PAGE_SIZE) - copy = PAGE_SIZE; - page = alloc_pages(sk->sk_allocation, 0); - if (page == NULL) { - err = -ENOMEM; - goto error; - } - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; + struct page_frag *pfrag = sk_page_frag(sk); - skb_fill_page_desc(skb, i, page, 0, 0); - frag = &skb_shinfo(skb)->frags[i]; - } else { - err = -EMSGSIZE; - goto error; - } - if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { - err = -EFAULT; + err = -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) goto error; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + err = -EMSGSIZE; + if (i == MAX_SKB_FRAGS) + goto error; + + __skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, 0); + skb_shinfo(skb)->nr_frags = ++i; + get_page(pfrag->page); } - sk->sk_sndmsg_off += copy; - frag->size += copy; + copy = min_t(int, copy, pfrag->size - pfrag->offset); + if (getfrag(from, + page_address(pfrag->page) + pfrag->offset, + offset, copy, skb->len, skb) < 0) + goto error_efault; + + pfrag->offset += copy; + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; @@ -1367,22 +1493,33 @@ alloc_new_skb: offset += copy; length -= copy; } + return 0; + +error_efault: + err = -EFAULT; error: - inet->cork.length -= length; - IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + cork->length -= length; + IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); return err; } +EXPORT_SYMBOL_GPL(ip6_append_data); static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) { - inet->cork.flags &= ~IPCORK_OPT; - kfree(np->cork.opt); - np->cork.opt = NULL; - if (np->cork.rt) { - dst_release(&np->cork.rt->u.dst); - np->cork.rt = NULL; - inet->cork.flags &= ~IPCORK_ALLFRAG; + if (np->cork.opt) { + kfree(np->cork.opt->dst0opt); + kfree(np->cork.opt->dst1opt); + kfree(np->cork.opt->hopopt); + kfree(np->cork.opt->srcrt); + kfree(np->cork.opt); + np->cork.opt = NULL; + } + + if (inet->cork.base.dst) { + dst_release(inet->cork.base.dst); + inet->cork.base.dst = NULL; + inet->cork.base.flags &= ~IPCORK_ALLFRAG; } memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); } @@ -1394,11 +1531,12 @@ int ip6_push_pending_frames(struct sock *sk) struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = np->cork.opt; - struct rt6_info *rt = np->cork.rt; - struct flowi *fl = &inet->cork.fl; - unsigned char proto = fl->proto; + struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst; + struct flowi6 *fl6 = &inet->cork.fl.u.ip6; + unsigned char proto = fl6->flowi6_proto; int err = 0; if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) @@ -1415,16 +1553,14 @@ int ip6_push_pending_frames(struct sock *sk) skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; - __sock_put(tmp_skb->sk); tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } /* Allow local fragmentation. */ - if (np->pmtudisc < IPV6_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip6_sk_ignore_df(sk); - ipv6_addr_copy(final_dst, &fl->fl6_dst); + *final_dst = fl6->daddr; __skb_pull(skb, skb_network_header_len(skb)); if (opt && opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -1435,30 +1571,28 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - *(__be32*)hdr = fl->fl6_flowlabel | - htonl(0x60000000 | ((int)np->cork.tclass << 20)); - + ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; - ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); - ipv6_addr_copy(&hdr->daddr, final_dst); + hdr->saddr = fl6->saddr; + hdr->daddr = *final_dst; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb->dst = dst_clone(&rt->u.dst); - IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); + skb_dst_set(skb, dst_clone(&rt->dst)); + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { - struct inet6_dev *idev = ip6_dst_idev(skb->dst); + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type); - ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS); + ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } err = ip6_local_out(skb); if (err) { if (err > 0) - err = np->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) goto error; } @@ -1467,19 +1601,22 @@ out: ip6_cork_release(inet, np); return err; error: + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); goto out; } +EXPORT_SYMBOL_GPL(ip6_push_pending_frames); void ip6_flush_pending_frames(struct sock *sk) { struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { - if (skb->dst) - IP6_INC_STATS(ip6_dst_idev(skb->dst), + if (skb_dst(skb)) + IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); } ip6_cork_release(inet_sk(sk), inet6_sk(sk)); } +EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 78f43888092..afa08245836 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -6,8 +6,6 @@ * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * - * $Id$ - * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * @@ -20,6 +18,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> @@ -29,7 +29,6 @@ #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> -#include <linux/if_tunnel.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> @@ -39,12 +38,16 @@ #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/etherdevice.h> #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> +#include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> @@ -52,45 +55,84 @@ #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); - -#define IPV6_TLV_TEL_DST_SIZE 8 +MODULE_ALIAS_RTNL_LINK("ip6tnl"); +MODULE_ALIAS_NETDEV("ip6tnl0"); #ifdef IP6_TNL_DEBUG -#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __FUNCTION__) +#define IP6_TNL_TRACE(x...) pr_debug("%s:" x "\n", __func__) #else #define IP6_TNL_TRACE(x...) do {;} while(0) #endif -#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) -#define IPV6_TCLASS_SHIFT 20 +#define HASH_SIZE_SHIFT 5 +#define HASH_SIZE (1 << HASH_SIZE_SHIFT) + +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -#define HASH_SIZE 32 +static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) +{ + u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); -#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ - (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ - (HASH_SIZE - 1)) + return hash_32(hash, HASH_SIZE_SHIFT); +} -static int ip6_fb_tnl_dev_init(struct net_device *dev); static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); +static struct rtnl_link_ops ip6_link_ops __read_mostly; + +static int ip6_tnl_net_id __read_mostly; +struct ip6_tnl_net { + /* the IPv6 tunnel fallback device */ + struct net_device *fb_tnl_dev; + /* lists for storing tunnels in use */ + struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_wc[1]; + struct ip6_tnl __rcu **tnls[2]; +}; -/* the IPv6 tunnel fallback device */ -static struct net_device *ip6_fb_tnl_dev; - - -/* lists for storing tunnels in use */ -static struct ip6_tnl *tnls_r_l[HASH_SIZE]; -static struct ip6_tnl *tnls_wc[1]; -static struct ip6_tnl **tnls[2] = { tnls_wc, tnls_r_l }; +static struct net_device_stats *ip6_get_stats(struct net_device *dev) +{ + struct pcpu_sw_netstats tmp, sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + unsigned int start; + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + tmp.rx_packets = tstats->rx_packets; + tmp.rx_bytes = tstats->rx_bytes; + tmp.tx_packets = tstats->tx_packets; + tmp.tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + sum.rx_packets += tmp.rx_packets; + sum.rx_bytes += tmp.rx_bytes; + sum.tx_packets += tmp.tx_packets; + sum.tx_bytes += tmp.tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} -/* lock for the tunnel lists */ -static DEFINE_RWLOCK(ip6_tnl_lock); +/* + * Locking : hash tables are protected by RCU and RTNL + */ -static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) { struct dst_entry *dst = t->dst_cache; @@ -103,20 +145,23 @@ static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) return dst; } +EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); -static inline void ip6_tnl_dst_reset(struct ip6_tnl *t) +void ip6_tnl_dst_reset(struct ip6_tnl *t) { dst_release(t->dst_cache); t->dst_cache = NULL; } +EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; dst_release(t->dst_cache); t->dst_cache = dst; } +EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -129,20 +174,24 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) * else %NULL **/ +#define for_each_ip6_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + static struct ip6_tnl * -ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) +ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int hash = HASH(remote, local); struct ip6_tnl *t; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - for (t = tnls_r_l[h0 ^ h1]; t; t = t->next) { + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) return t; } - if ((t = tnls_wc[0]) != NULL && (t->dev->flags & IFF_UP)) + t = rcu_dereference(ip6n->tnls_wc[0]); + if (t && (t->dev->flags & IFF_UP)) return t; return NULL; @@ -159,19 +208,19 @@ ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) * Return: head of IPv6 tunnel list **/ -static struct ip6_tnl ** -ip6_tnl_bucket(struct ip6_tnl_parm *p) +static struct ip6_tnl __rcu ** +ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { - struct in6_addr *remote = &p->raddr; - struct in6_addr *local = &p->laddr; - unsigned h = 0; + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; - h = HASH(remote) ^ HASH(local); + h = HASH(remote, local); } - return &tnls[prio][h]; + return &ip6n->tnls[prio][h]; } /** @@ -180,14 +229,12 @@ ip6_tnl_bucket(struct ip6_tnl_parm *p) **/ static void -ip6_tnl_link(struct ip6_tnl *t) +ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { - struct ip6_tnl **tp = ip6_tnl_bucket(&t->parms); + struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); - t->next = *tp; - write_lock_bh(&ip6_tnl_lock); - *tp = t; - write_unlock_bh(&ip6_tnl_lock); + rcu_assign_pointer(t->next , rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); } /** @@ -196,22 +243,56 @@ ip6_tnl_link(struct ip6_tnl *t) **/ static void -ip6_tnl_unlink(struct ip6_tnl *t) +ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { - struct ip6_tnl **tp; - - for (tp = ip6_tnl_bucket(&t->parms); *tp; tp = &(*tp)->next) { - if (t == *tp) { - write_lock_bh(&ip6_tnl_lock); - *tp = t->next; - write_unlock_bh(&ip6_tnl_lock); + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = ip6_tnl_bucket(ip6n, &t->parms); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } } +static void ip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static int ip6_tnl_create2(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err; + + t = netdev_priv(dev); + err = ip6_tnl_dev_init(dev); + if (err < 0) + goto out; + + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(t->parms.name, dev->name); + dev->rtnl_link_ops = &ip6_link_ops; + + dev_hold(dev); + ip6_tnl_link(ip6n, t); + return 0; + +out: + return err; +} + /** - * ip6_tnl_create() - create a new tunnel + * ip6_tnl_create - create a new tunnel * @p: tunnel parameters * @pt: pointer to new tunnel * @@ -222,7 +303,7 @@ ip6_tnl_unlink(struct ip6_tnl *t) * created tunnel or NULL **/ -static struct ip6_tnl *ip6_tnl_create(struct ip6_tnl_parm *p) +static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; @@ -238,24 +319,19 @@ static struct ip6_tnl *ip6_tnl_create(struct ip6_tnl_parm *p) if (dev == NULL) goto failed; - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } + dev_net_set(dev, net); t = netdev_priv(dev); - dev->init = ip6_tnl_dev_init; t->parms = *p; - - if ((err = register_netdevice(dev)) < 0) + t->net = dev_net(dev); + err = ip6_tnl_create2(dev); + if (err < 0) goto failed_free; - dev_hold(dev); - ip6_tnl_link(t); return t; failed_free: - free_netdev(dev); + ip6_dev_free(dev); failed: return NULL; } @@ -274,20 +350,25 @@ failed: * matching tunnel or NULL **/ -static struct ip6_tnl *ip6_tnl_locate(struct ip6_tnl_parm *p, int create) +static struct ip6_tnl *ip6_tnl_locate(struct net *net, + struct __ip6_tnl_parm *p, int create) { - struct in6_addr *remote = &p->raddr; - struct in6_addr *local = &p->laddr; + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + struct ip6_tnl __rcu **tp; struct ip6_tnl *t; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - for (t = *ip6_tnl_bucket(p); t; t = t->next) { + for (tp = ip6_tnl_bucket(ip6n, p); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) return t; } if (!create) return NULL; - return ip6_tnl_create(p); + return ip6_tnl_create(net, p); } /** @@ -302,14 +383,13 @@ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - if (dev == ip6_fb_tnl_dev) { - write_lock_bh(&ip6_tnl_lock); - tnls_wc[0] = NULL; - write_unlock_bh(&ip6_tnl_lock); - } else { - ip6_tnl_unlink(t); - } + if (dev == ip6n->fb_tnl_dev) + RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); + else + ip6_tnl_unlink(ip6n, t); ip6_tnl_dst_reset(t); dev_put(dev); } @@ -323,10 +403,9 @@ ip6_tnl_dev_uninit(struct net_device *dev) * else index to encapsulation limit **/ -static __u16 -parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) +__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { - struct ipv6hdr *ipv6h = (struct ipv6hdr *) raw; + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; __u8 nexthdr = ipv6h->nexthdr; __u16 off = sizeof (*ipv6h); @@ -374,6 +453,7 @@ parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) } return 0; } +EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /** * ip6_tnl_err - tunnel error handler @@ -385,13 +465,13 @@ parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, - int *type, int *code, int *msg, __u32 *info, int offset) + u8 *type, u8 *code, int *msg, __u32 *info, int offset) { - struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data; + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data; struct ip6_tnl *t; int rel_msg = 0; - int rel_type = ICMPV6_DEST_UNREACH; - int rel_code = ICMPV6_ADDR_UNREACH; + u8 rel_type = ICMPV6_DEST_UNREACH; + u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; __u16 len; int err = -ENOENT; @@ -400,8 +480,9 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, in trouble since we might need the source address for further processing of the error. */ - read_lock(&ip6_tnl_lock); - if ((t = ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL) + rcu_read_lock(); + if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, + &ipv6h->saddr)) == NULL) goto out; if (t->parms.proto != ipproto && t->parms.proto != 0) @@ -414,41 +495,32 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Path to destination invalid " - "or inactive!\n", t->parms.name); + net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Too small hop limit or " - "routing loop in tunnel!\n", - t->parms.name); + net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: teli = 0; if ((*code) == ICMPV6_HDR_FIELD) - teli = parse_tlv_tnl_enc_lim(skb, skb->data); + teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Too small encapsulation " - "limit or routing loop in " - "tunnel!\n", t->parms.name); + net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } - } else if (net_ratelimit()) { - printk(KERN_WARNING - "%s: Recipient unable to parse tunneled " - "packet!\n ", t->parms.name); + } else { + net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -472,23 +544,23 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, *msg = rel_msg; out: - read_unlock(&ip6_tnl_lock); + rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { int rel_msg = 0; - int rel_type = type; - int rel_code = code; + u8 rel_type = type; + u8 rel_code = code; __u32 rel_info = ntohl(info); int err; struct sk_buff *skb2; - struct iphdr *eiph; - struct flowi fl; + const struct iphdr *eiph; struct rtable *rt; + struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); @@ -511,6 +583,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; + case NDISC_REDIRECT: + rel_type = ICMP_REDIRECT; + rel_code = ICMP_REDIR_HOST; default: return 0; } @@ -522,50 +597,55 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!skb2) return 0; - dst_release(skb2->dst); - skb2->dst = NULL; + skb_dst_drop(skb2); + skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ - memset(&fl, 0, sizeof(fl)); - fl.fl4_dst = eiph->saddr; - fl.fl4_tos = RT_TOS(eiph->tos); - fl.proto = IPPROTO_IPIP; - if (ip_route_output_key(&init_net, &rt, &fl)) + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->saddr, 0, + 0, 0, + IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + if (IS_ERR(rt)) goto out; - skb2->dev = rt->u.dst.dev; + skb2->dev = rt->dst.dev; /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); rt = NULL; - fl.fl4_dst = eiph->daddr; - fl.fl4_src = eiph->saddr; - fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&init_net, &rt, &fl) || - rt->u.dst.dev->type != ARPHRD_TUNNEL) { - ip_rt_put(rt); + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->daddr, eiph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(eiph->tos), 0); + if (IS_ERR(rt) || + rt->dst.dev->type != ARPHRD_TUNNEL) { + if (!IS_ERR(rt)) + ip_rt_put(rt); goto out; } - skb2->dst = (struct dst_entry *)rt; + skb_dst_set(skb2, &rt->dst); } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || - skb2->dst->dev->type != ARPHRD_TUNNEL) + skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { - if (rel_info > dst_mtu(skb2->dst)) + if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb2->dst->ops->update_pmtu(skb2->dst, rel_info); + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info); } + if (rel_type == ICMP_REDIRECT) + skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -576,11 +656,11 @@ out: static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { int rel_msg = 0; - int rel_type = type; - int rel_code = code; + u8 rel_type = type; + u8 rel_code = code; __u32 rel_info = ntohl(info); int err; @@ -596,21 +676,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!skb2) return 0; - dst_release(skb2->dst); - skb2->dst = NULL; + skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ - rt = rt6_lookup(&ipv6_hdr(skb2)->saddr, NULL, 0, 0); + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, 0); - if (rt && rt->rt6i_dev) - skb2->dev = rt->rt6i_dev; + if (rt && rt->dst.dev) + skb2->dev = rt->dst.dev; - icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); + icmpv6_send(skb2, rel_type, rel_code, rel_info); - if (rt) - dst_release(&rt->u.dst); + ip6_rt_put(rt); kfree_skb(skb2); } @@ -618,51 +697,77 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } -static void ip4ip6_dscp_ecn_decapsulate(struct ip6_tnl *t, - struct ipv6hdr *ipv6h, - struct sk_buff *skb) +static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); - if (INET_ECN_is_ce(dsfield)) - IP_ECN_set_ce(ip_hdr(skb)); + return IP6_ECN_decapsulate(ipv6h, skb); } -static void ip6ip6_dscp_ecn_decapsulate(struct ip6_tnl *t, - struct ipv6hdr *ipv6h, - struct sk_buff *skb) +static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); - if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h))) - IP6_ECN_set_ce(ipv6_hdr(skb)); + return IP6_ECN_decapsulate(ipv6h, skb); +} + +__u32 ip6_tnl_get_cap(struct ip6_tnl *t, + const struct in6_addr *laddr, + const struct in6_addr *raddr) +{ + struct __ip6_tnl_parm *p = &t->parms; + int ltype = ipv6_addr_type(laddr); + int rtype = ipv6_addr_type(raddr); + __u32 flags = 0; + + if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { + flags = IP6_TNL_F_CAP_PER_PACKET; + } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && + (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { + if (ltype&IPV6_ADDR_UNICAST) + flags |= IP6_TNL_F_CAP_XMIT; + if (rtype&IPV6_ADDR_UNICAST) + flags |= IP6_TNL_F_CAP_RCV; + } + return flags; } +EXPORT_SYMBOL(ip6_tnl_get_cap); -static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) +/* called with rcu_read_lock() */ +int ip6_tnl_rcv_ctl(struct ip6_tnl *t, + const struct in6_addr *laddr, + const struct in6_addr *raddr) { - struct ip6_tnl_parm *p = &t->parms; + struct __ip6_tnl_parm *p = &t->parms; int ret = 0; + struct net *net = t->net; - if (p->flags & IP6_TNL_F_CAP_RCV) { + if ((p->flags & IP6_TNL_F_CAP_RCV) || + ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && + (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) - ldev = dev_get_by_index(&init_net, p->link); + ldev = dev_get_by_index_rcu(net, p->link); - if ((ipv6_addr_is_multicast(&p->laddr) || - likely(ipv6_chk_addr(&init_net, &p->laddr, ldev, 0))) && - likely(!ipv6_chk_addr(&init_net, &p->raddr, NULL, 0))) + if ((ipv6_addr_is_multicast(laddr) || + likely(ipv6_chk_addr(net, laddr, ldev, 0))) && + likely(!ipv6_chk_addr(net, raddr, NULL, 0))) ret = 1; - - if (ldev) - dev_put(ldev); } return ret; } +EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); /** * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally @@ -675,51 +780,68 @@ static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, __u8 ipproto, - void (*dscp_ecn_decapsulate)(struct ip6_tnl *t, - struct ipv6hdr *ipv6h, - struct sk_buff *skb)) + int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb)) { struct ip6_tnl *t; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int err; + + rcu_read_lock(); - read_lock(&ip6_tnl_lock); + if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, + &ipv6h->daddr)) != NULL) { + struct pcpu_sw_netstats *tstats; - if ((t = ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { if (t->parms.proto != ipproto && t->parms.proto != 0) { - read_unlock(&ip6_tnl_lock); + rcu_read_unlock(); goto discard; } if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - read_unlock(&ip6_tnl_lock); + rcu_read_unlock(); goto discard; } - if (!ip6_tnl_rcv_ctl(t)) { - t->stat.rx_dropped++; - read_unlock(&ip6_tnl_lock); + if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { + t->dev->stats.rx_dropped++; + rcu_read_unlock(); goto discard; } - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->protocol = htons(protocol); - skb->pkt_type = PACKET_HOST; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - skb->dev = t->dev; - dst_release(skb->dst); - skb->dst = NULL; - nf_reset(skb); - dscp_ecn_decapsulate(t, ipv6h, skb); + __skb_tunnel_rx(skb, t->dev, t->net); + + err = dscp_ecn_decapsulate(t, ipv6h, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", + &ipv6h->saddr, + ipv6_get_dsfield(ipv6h)); + if (err > 1) { + ++t->dev->stats.rx_frame_errors; + ++t->dev->stats.rx_errors; + rcu_read_unlock(); + goto discard; + } + } + + tstats = this_cpu_ptr(t->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); - t->stat.rx_packets++; - t->stat.rx_bytes += skb->len; netif_rx(skb); - read_unlock(&ip6_tnl_lock); + + rcu_read_unlock(); return 0; } - read_unlock(&ip6_tnl_lock); + rcu_read_unlock(); return 1; discard: @@ -772,40 +894,40 @@ static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) * 0 else **/ -static inline int -ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) +static inline bool +ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } -static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) +int ip6_tnl_xmit_ctl(struct ip6_tnl *t) { - struct ip6_tnl_parm *p = &t->parms; + struct __ip6_tnl_parm *p = &t->parms; int ret = 0; + struct net *net = t->net; if (p->flags & IP6_TNL_F_CAP_XMIT) { struct net_device *ldev = NULL; + rcu_read_lock(); if (p->link) - ldev = dev_get_by_index(&init_net, p->link); + ldev = dev_get_by_index_rcu(net, p->link); - if (unlikely(!ipv6_chk_addr(&init_net, &p->laddr, ldev, 0))) - printk(KERN_WARNING - "%s xmit: Local address not yet configured!\n", - p->name); + if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0))) + pr_warn("%s xmit: Local address not yet configured!\n", + p->name); else if (!ipv6_addr_is_multicast(&p->raddr) && - unlikely(ipv6_chk_addr(&init_net, &p->raddr, NULL, 0))) - printk(KERN_WARNING - "%s xmit: Routing loop! " - "Remote address found on this node!\n", - p->name); + unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0))) + pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", + p->name); else ret = 1; - if (ldev) - dev_put(ldev); + rcu_read_unlock(); } return ret; } +EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); + /** * ip6_tnl_xmit2 - encapsulate packet and send * @skb: the outgoing socket buffer @@ -828,39 +950,44 @@ static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) static int ip6_tnl_xmit2(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, - struct flowi *fl, + struct flowi6 *fl6, int encap_limit, __u32 *pmtu) { struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->stat; + struct net *net = t->net; + struct net_device_stats *stats = &t->dev->stats; struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct ipv6_tel_txoption opt; - struct dst_entry *dst; + struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int max_headroom = sizeof(struct ipv6hdr); u8 proto; int err = -1; - int pkt_len; - if ((dst = ip6_tnl_dst_check(t)) != NULL) - dst_hold(dst); - else { - dst = ip6_route_output(NULL, fl); + if (!fl6->flowi6_mark) + dst = ip6_tnl_dst_check(t); + if (!dst) { + ndst = ip6_route_output(net, NULL, fl6); - if (dst->error || xfrm_lookup(&dst, fl, NULL, 0) < 0) + if (ndst->error) goto tx_err_link_failure; + ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); + ndst = NULL; + goto tx_err_link_failure; + } + dst = ndst; } tdev = dst->dev; if (tdev == dev) { stats->collisions++; - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Local routing loop detected!\n", - t->parms.name); + net_warn_ratelimited("%s: Local routing loop detected!\n", + t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - sizeof (*ipv6h); @@ -870,14 +997,16 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + /* * Okay, now see if we can stuff it in the buffer as-is. */ @@ -892,47 +1021,45 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, if (skb->sk) skb_set_owner_w(new_skb, skb->sk); - kfree_skb(skb); + consume_skb(skb); skb = new_skb; } - dst_release(skb->dst); - skb->dst = dst_clone(dst); - + if (fl6->flowi6_mark) { + skb_dst_set(skb, dst); + ndst = NULL; + } else { + skb_dst_set_noref(skb, dst); + } skb->transport_header = skb->network_header; - proto = fl->proto; + proto = fl6->flowi6_proto; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - *(__be32*)ipv6h = fl->fl6_flowlabel | htonl(0x60000000); - dsfield = INET_ECN_encapsulate(0, dsfield); - ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; - ipv6_addr_copy(&ipv6h->saddr, &fl->fl6_src); - ipv6_addr_copy(&ipv6h->daddr, &fl->fl6_dst); - nf_reset(skb); - pkt_len = skb->len; - err = ip6_local_out(skb); - - if (net_xmit_eval(err) == 0) { - stats->tx_bytes += pkt_len; - stats->tx_packets++; - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } - ip6_tnl_dst_store(t, dst); + ipv6h->saddr = fl6->saddr; + ipv6h->daddr = fl6->daddr; + ip6tunnel_xmit(skb, dev); + if (ndst) + ip6_tnl_dst_store(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(dst); + dst_release(ndst); return err; } @@ -940,9 +1067,9 @@ static inline int ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); int encap_limit = -1; - struct flowi fl; + struct flowi6 fl6; __u8 dsfield; __u32 mtu; int err; @@ -954,16 +1081,18 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl, &t->fl, sizeof (fl)); - fl.proto = IPPROTO_IPIP; + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) - fl.fl6_flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu); + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) @@ -982,7 +1111,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; __u16 offset; - struct flowi fl; + struct flowi6 fl6; __u8 dsfield; __u32 mtu; int err; @@ -991,55 +1120,52 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h)) return -1; - offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb)); + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2, skb->dev); + ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl, &t->fl, sizeof (fl)); - fl.proto = IPPROTO_IPV6; + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) - fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) - fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); - - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -1; } return 0; } -static int +static netdev_tx_t ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->stat; + struct net_device_stats *stats = &t->dev->stats; int ret; - if (t->recursion++) { - t->stat.collisions++; - goto tx_err; - } - switch (skb->protocol) { - case __constant_htons(ETH_P_IP): + case htons(ETH_P_IP): ret = ip4ip6_tnl_xmit(skb, dev); break; - case __constant_htons(ETH_P_IPV6): + case htons(ETH_P_IPV6): ret = ip6ip6_tnl_xmit(skb, dev); break; default: @@ -1049,57 +1175,37 @@ ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (ret < 0) goto tx_err; - t->recursion--; - return 0; + return NETDEV_TX_OK; tx_err: stats->tx_errors++; stats->tx_dropped++; kfree_skb(skb); - t->recursion--; - return 0; -} - -static void ip6_tnl_set_cap(struct ip6_tnl *t) -{ - struct ip6_tnl_parm *p = &t->parms; - int ltype = ipv6_addr_type(&p->laddr); - int rtype = ipv6_addr_type(&p->raddr); - - p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV); - - if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && - rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && - !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && - (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { - if (ltype&IPV6_ADDR_UNICAST) - p->flags |= IP6_TNL_F_CAP_XMIT; - if (rtype&IPV6_ADDR_UNICAST) - p->flags |= IP6_TNL_F_CAP_RCV; - } + return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; - struct ip6_tnl_parm *p = &t->parms; - struct flowi *fl = &t->fl; + struct __ip6_tnl_parm *p = &t->parms; + struct flowi6 *fl6 = &t->fl.u.ip6; - memcpy(&dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); - memcpy(&dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ - ipv6_addr_copy(&fl->fl6_src, &p->laddr); - ipv6_addr_copy(&fl->fl6_dst, &p->raddr); - fl->oif = p->link; - fl->fl6_flowlabel = 0; + fl6->saddr = p->laddr; + fl6->daddr = p->raddr; + fl6->flowi6_oif = p->link; + fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) - fl->fl6_flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) - fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; - ip6_tnl_set_cap(t); + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); + p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; @@ -1112,22 +1218,25 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); - struct rt6_info *rt = rt6_lookup(&p->raddr, &p->laddr, + struct rt6_info *rt = rt6_lookup(t->net, + &p->raddr, &p->laddr, p->link, strict); if (rt == NULL) return; - if (rt->rt6i_dev) { - dev->hard_header_len = rt->rt6i_dev->hard_header_len + + if (rt->dst.dev) { + dev->hard_header_len = rt->dst.dev->hard_header_len + sizeof (struct ipv6hdr); - dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr); + dev->mtu = rt->dst.dev->mtu - sizeof (struct ipv6hdr); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } - dst_release(&rt->u.dst); + ip6_rt_put(rt); } } @@ -1135,17 +1244,16 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters - * @active: != 0 if tunnel is ready for use * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static int -ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { - ipv6_addr_copy(&t->parms.laddr, &p->laddr); - ipv6_addr_copy(&t->parms.raddr, &p->raddr); + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; @@ -1157,6 +1265,48 @@ ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) return 0; } +static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +{ + struct net *net = t->net; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err; + + ip6_tnl_unlink(ip6n, t); + synchronize_net(); + err = ip6_tnl_change(t, p); + ip6_tnl_link(ip6n, t); + netdev_state_change(t->dev); + return err; +} + +static void +ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) +{ + p->laddr = u->laddr; + p->raddr = u->raddr; + p->flags = u->flags; + p->hop_limit = u->hop_limit; + p->encap_limit = u->encap_limit; + p->flowinfo = u->flowinfo; + p->link = u->link; + p->proto = u->proto; + memcpy(p->name, u->name, sizeof(u->name)); +} + +static void +ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) +{ + u->laddr = p->laddr; + u->raddr = p->raddr; + u->flags = p->flags; + u->hop_limit = p->hop_limit; + u->encap_limit = p->encap_limit; + u->flowinfo = p->flowinfo; + u->link = p->link; + u->proto = p->proto; + memcpy(u->name, p->name, sizeof(u->name)); +} + /** * ip6_tnl_ioctl - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel @@ -1190,20 +1340,26 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip6_tnl_parm p; - struct ip6_tnl *t = NULL; + struct __ip6_tnl_parm p1; + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); switch (cmd) { case SIOCGETTUNNEL: - if (dev == ip6_fb_tnl_dev) { + if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { err = -EFAULT; break; } - t = ip6_tnl_locate(&p, 0); + ip6_tnl_parm_from_user(&p1, &p); + t = ip6_tnl_locate(net, &p1, 0); + if (t == NULL) + t = netdev_priv(dev); + } else { + memset(&p, 0, sizeof(p)); } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof (p)); + ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { err = -EFAULT; } @@ -1211,7 +1367,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) @@ -1220,8 +1376,9 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; - t = ip6_tnl_locate(&p, cmd == SIOCADDTUNNEL); - if (dev != ip6_fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + ip6_tnl_parm_from_user(&p1, &p); + t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); + if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { err = -EEXIST; @@ -1230,14 +1387,12 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } else t = netdev_priv(dev); - ip6_tnl_unlink(t); - err = ip6_tnl_change(t, &p); - ip6_tnl_link(t); - netdev_state_change(dev); + err = ip6_tnl_update(t, &p1); } if (t) { err = 0; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p))) + ip6_tnl_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; } else @@ -1245,18 +1400,20 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; - if (dev == ip6_fb_tnl_dev) { + if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) break; err = -ENOENT; - if ((t = ip6_tnl_locate(&p, 0)) == NULL) + ip6_tnl_parm_from_user(&p1, &p); + t = ip6_tnl_locate(net, &p1, 0); + if (t == NULL) break; err = -EPERM; - if (t->dev == ip6_fb_tnl_dev) + if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } @@ -1270,19 +1427,6 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } /** - * ip6_tnl_get_stats - return the stats for tunnel device - * @dev: virtual device associated with tunnel - * - * Return: stats for device - **/ - -static struct net_device_stats * -ip6_tnl_get_stats(struct net_device *dev) -{ - return &(((struct ip6_tnl *)netdev_priv(dev))->stat); -} - -/** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu @@ -1295,13 +1439,31 @@ ip6_tnl_get_stats(struct net_device *dev) static int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { - if (new_mtu < IPV6_MIN_MTU) { - return -EINVAL; + struct ip6_tnl *tnl = netdev_priv(dev); + + if (tnl->parms.proto == IPPROTO_IPIP) { + if (new_mtu < 68) + return -EINVAL; + } else { + if (new_mtu < IPV6_MIN_MTU) + return -EINVAL; } + if (new_mtu > 0xFFF8 - dev->hard_header_len) + return -EINVAL; dev->mtu = new_mtu; return 0; } + +static const struct net_device_ops ip6_tnl_netdev_ops = { + .ndo_uninit = ip6_tnl_dev_uninit, + .ndo_start_xmit = ip6_tnl_xmit, + .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats = ip6_get_stats, +}; + + /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel @@ -1312,18 +1474,23 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) static void ip6_tnl_dev_setup(struct net_device *dev) { - dev->uninit = ip6_tnl_dev_uninit; - dev->destructor = free_netdev; - dev->hard_start_xmit = ip6_tnl_xmit; - dev->get_stats = ip6_tnl_get_stats; - dev->do_ioctl = ip6_tnl_ioctl; - dev->change_mtu = ip6_tnl_change_mtu; + struct ip6_tnl *t; + + dev->netdev_ops = &ip6_tnl_netdev_ops; + dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + /* This perm addr will be used as interface identifier by IPv6 */ + dev->addr_assign_type = NET_ADDR_RANDOM; + eth_random_addr(dev->perm_addr); } @@ -1332,12 +1499,17 @@ static void ip6_tnl_dev_setup(struct net_device *dev) * @dev: virtual device associated with tunnel **/ -static inline void +static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + t->dev = dev; - strcpy(t->parms.name, dev->name); + t->net = dev_net(dev); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } /** @@ -1345,11 +1517,13 @@ ip6_tnl_dev_init_gen(struct net_device *dev) * @dev: virtual device associated with tunnel **/ -static int -ip6_tnl_dev_init(struct net_device *dev) +static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dev_init_gen(dev); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; ip6_tnl_link_config(t); return 0; } @@ -1361,29 +1535,287 @@ ip6_tnl_dev_init(struct net_device *dev) * Return: 0 **/ -static int -ip6_fb_tnl_dev_init(struct net_device *dev) +static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dev_init_gen(dev); + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; + t->parms.proto = IPPROTO_IPV6; dev_hold(dev); - tnls_wc[0] = t; + + ip6_tnl_link_config(t); + + rcu_assign_pointer(ip6n->tnls_wc[0], t); + return 0; +} + +static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + u8 proto; + + if (!data || !data[IFLA_IPTUN_PROTO]) + return 0; + + proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (proto != IPPROTO_IPV6 && + proto != IPPROTO_IPIP && + proto != 0) + return -EINVAL; + return 0; } -static struct xfrm6_tunnel ip4ip6_handler = { +static void ip6_tnl_netlink_parms(struct nlattr *data[], + struct __ip6_tnl_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_IPTUN_LINK]) + parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); + + if (data[IFLA_IPTUN_LOCAL]) + nla_memcpy(&parms->laddr, data[IFLA_IPTUN_LOCAL], + sizeof(struct in6_addr)); + + if (data[IFLA_IPTUN_REMOTE]) + nla_memcpy(&parms->raddr, data[IFLA_IPTUN_REMOTE], + sizeof(struct in6_addr)); + + if (data[IFLA_IPTUN_TTL]) + parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); + + if (data[IFLA_IPTUN_ENCAP_LIMIT]) + parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); + + if (data[IFLA_IPTUN_FLOWINFO]) + parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); + + if (data[IFLA_IPTUN_FLAGS]) + parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); + + if (data[IFLA_IPTUN_PROTO]) + parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); +} + +static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net *net = dev_net(dev); + struct ip6_tnl *nt; + + nt = netdev_priv(dev); + ip6_tnl_netlink_parms(data, &nt->parms); + + if (ip6_tnl_locate(net, &nt->parms, 0)) + return -EEXIST; + + return ip6_tnl_create2(dev); +} + +static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct __ip6_tnl_parm p; + struct net *net = t->net; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (dev == ip6n->fb_tnl_dev) + return -EINVAL; + + ip6_tnl_netlink_parms(data, &p); + + t = ip6_tnl_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else + t = netdev_priv(dev); + + return ip6_tnl_update(t, &p); +} + +static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (dev != ip6n->fb_tnl_dev) + unregister_netdevice_queue(dev, head); +} + +static size_t ip6_tnl_get_size(const struct net_device *dev) +{ + return + /* IFLA_IPTUN_LINK */ + nla_total_size(4) + + /* IFLA_IPTUN_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_IPTUN_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_IPTUN_TTL */ + nla_total_size(1) + + /* IFLA_IPTUN_ENCAP_LIMIT */ + nla_total_size(1) + + /* IFLA_IPTUN_FLOWINFO */ + nla_total_size(4) + + /* IFLA_IPTUN_FLAGS */ + nla_total_size(4) + + /* IFLA_IPTUN_PROTO */ + nla_total_size(1) + + 0; +} + +static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + struct __ip6_tnl_parm *parm = &tunnel->parms; + + if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || + nla_put(skb, IFLA_IPTUN_LOCAL, sizeof(struct in6_addr), + &parm->laddr) || + nla_put(skb, IFLA_IPTUN_REMOTE, sizeof(struct in6_addr), + &parm->raddr) || + nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || + nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || + nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || + nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { + [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, + [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, + [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, + [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, + [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, + [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, + [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, + [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ip6_link_ops __read_mostly = { + .kind = "ip6tnl", + .maxtype = IFLA_IPTUN_MAX, + .policy = ip6_tnl_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6_tnl_dev_setup, + .validate = ip6_tnl_validate, + .newlink = ip6_tnl_newlink, + .changelink = ip6_tnl_changelink, + .dellink = ip6_tnl_dellink, + .get_size = ip6_tnl_get_size, + .fill_info = ip6_tnl_fill_info, +}; + +static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; -static struct xfrm6_tunnel ip6ip6_handler = { +static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; +static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) +{ + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct net_device *dev, *aux; + int h; + struct ip6_tnl *t; + LIST_HEAD(list); + + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &ip6_link_ops) + unregister_netdevice_queue(dev, &list); + + for (h = 0; h < HASH_SIZE; h++) { + t = rtnl_dereference(ip6n->tnls_r_l[h]); + while (t != NULL) { + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, &list); + t = rtnl_dereference(t->next); + } + } + + unregister_netdevice_many(&list); +} + +static int __net_init ip6_tnl_init_net(struct net *net) +{ + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl *t = NULL; + int err; + + ip6n->tnls[0] = ip6n->tnls_wc; + ip6n->tnls[1] = ip6n->tnls_r_l; + + err = -ENOMEM; + ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", + ip6_tnl_dev_setup); + + if (!ip6n->fb_tnl_dev) + goto err_alloc_dev; + dev_net_set(ip6n->fb_tnl_dev, net); + ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; + + err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + err = register_netdev(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + t = netdev_priv(ip6n->fb_tnl_dev); + + strcpy(t->parms.name, ip6n->fb_tnl_dev->name); + return 0; + +err_register: + ip6_dev_free(ip6n->fb_tnl_dev); +err_alloc_dev: + return err; +} + +static void __net_exit ip6_tnl_exit_net(struct net *net) +{ + rtnl_lock(); + ip6_tnl_destroy_tunnels(net); + rtnl_unlock(); +} + +static struct pernet_operations ip6_tnl_net_ops = { + .init = ip6_tnl_init_net, + .exit = ip6_tnl_exit_net, + .id = &ip6_tnl_net_id, + .size = sizeof(struct ip6_tnl_net), +}; + /** * ip6_tunnel_init - register protocol and reserve needed resources * @@ -1394,68 +1826,51 @@ static int __init ip6_tunnel_init(void) { int err; - if (xfrm6_tunnel_register(&ip4ip6_handler, AF_INET)) { - printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n"); - err = -EAGAIN; - goto out; - } + err = register_pernet_device(&ip6_tnl_net_ops); + if (err < 0) + goto out_pernet; - if (xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6)) { - printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n"); - err = -EAGAIN; - goto unreg_ip4ip6; + err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); + if (err < 0) { + pr_err("%s: can't register ip4ip6\n", __func__); + goto out_ip4ip6; } - ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", - ip6_tnl_dev_setup); - if (!ip6_fb_tnl_dev) { - err = -ENOMEM; - goto fail; + err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); + if (err < 0) { + pr_err("%s: can't register ip6ip6\n", __func__); + goto out_ip6ip6; } - ip6_fb_tnl_dev->init = ip6_fb_tnl_dev_init; + err = rtnl_link_register(&ip6_link_ops); + if (err < 0) + goto rtnl_link_failed; - if ((err = register_netdev(ip6_fb_tnl_dev))) { - free_netdev(ip6_fb_tnl_dev); - goto fail; - } return 0; -fail: + +rtnl_link_failed: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); -unreg_ip4ip6: +out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); -out: +out_ip4ip6: + unregister_pernet_device(&ip6_tnl_net_ops); +out_pernet: return err; } -static void __exit ip6_tnl_destroy_tunnels(void) -{ - int h; - struct ip6_tnl *t; - - for (h = 0; h < HASH_SIZE; h++) { - while ((t = tnls_r_l[h]) != NULL) - unregister_netdevice(t->dev); - } - - t = tnls_wc[0]; - unregister_netdevice(t->dev); -} - /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { + rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) - printk(KERN_INFO "ip6_tunnel close: can't deregister ip4ip6\n"); + pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) - printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n"); + pr_info("%s: can't deregister ip6ip6\n", __func__); - rtnl_lock(); - ip6_tnl_destroy_tunnels(); - rtnl_unlock(); + unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c new file mode 100644 index 00000000000..9aaa6bb229e --- /dev/null +++ b/net/ipv6/ip6_vti.c @@ -0,0 +1,1160 @@ +/* + * IPv6 virtual tunneling interface + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv6/ip6_tunnel.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/sockios.h> +#include <linux/icmp.h> +#include <linux/if.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmpv6.h> +#include <linux/init.h> +#include <linux/route.h> +#include <linux/rtnetlink.h> +#include <linux/netfilter_ipv6.h> +#include <linux/slab.h> +#include <linux/hash.h> + +#include <linux/uaccess.h> +#include <linux/atomic.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/ip6_tunnel.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#define HASH_SIZE_SHIFT 5 +#define HASH_SIZE (1 << HASH_SIZE_SHIFT) + +static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) +{ + u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); + + return hash_32(hash, HASH_SIZE_SHIFT); +} + +static int vti6_dev_init(struct net_device *dev); +static void vti6_dev_setup(struct net_device *dev); +static struct rtnl_link_ops vti6_link_ops __read_mostly; + +static int vti6_net_id __read_mostly; +struct vti6_net { + /* the vti6 tunnel fallback device */ + struct net_device *fb_tnl_dev; + /* lists for storing tunnels in use */ + struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_wc[1]; + struct ip6_tnl __rcu **tnls[2]; +}; + +#define for_each_vti6_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +/** + * vti6_tnl_lookup - fetch tunnel matching the end-point addresses + * @net: network namespace + * @remote: the address of the tunnel exit-point + * @local: the address of the tunnel entry-point + * + * Return: + * tunnel matching given end-points if found, + * else fallback tunnel if its device is up, + * else %NULL + **/ +static struct ip6_tnl * +vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, + const struct in6_addr *local) +{ + unsigned int hash = HASH(remote, local); + struct ip6_tnl *t; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(ip6n->tnls_wc[0]); + if (t && (t->dev->flags & IFF_UP)) + return t; + + return NULL; +} + +/** + * vti6_tnl_bucket - get head of list matching given tunnel parameters + * @p: parameters containing tunnel end-points + * + * Description: + * vti6_tnl_bucket() returns the head of the list matching the + * &struct in6_addr entries laddr and raddr in @p. + * + * Return: head of IPv6 tunnel list + **/ +static struct ip6_tnl __rcu ** +vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned int h = 0; + int prio = 0; + + if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { + prio = 1; + h = HASH(remote, local); + } + return &ip6n->tnls[prio][h]; +} + +static void +vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms); + + rcu_assign_pointer(t->next , rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static void +vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = vti6_tnl_bucket(ip6n, &t->parms); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void vti6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static int vti6_tnl_create2(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + int err; + + err = vti6_dev_init(dev); + if (err < 0) + goto out; + + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(t->parms.name, dev->name); + dev->rtnl_link_ops = &vti6_link_ops; + + dev_hold(dev); + vti6_tnl_link(ip6n, t); + + return 0; + +out: + return err; +} + +static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) +{ + struct net_device *dev; + struct ip6_tnl *t; + char name[IFNAMSIZ]; + int err; + + if (p->name[0]) + strlcpy(name, p->name, IFNAMSIZ); + else + sprintf(name, "ip6_vti%%d"); + + dev = alloc_netdev(sizeof(*t), name, vti6_dev_setup); + if (dev == NULL) + goto failed; + + dev_net_set(dev, net); + + t = netdev_priv(dev); + t->parms = *p; + t->net = dev_net(dev); + + err = vti6_tnl_create2(dev); + if (err < 0) + goto failed_free; + + return t; + +failed_free: + vti6_dev_free(dev); +failed: + return NULL; +} + +/** + * vti6_locate - find or create tunnel matching given parameters + * @net: network namespace + * @p: tunnel parameters + * @create: != 0 if allowed to create new tunnel if no match found + * + * Description: + * vti6_locate() first tries to locate an existing tunnel + * based on @parms. If this is unsuccessful, but @create is set a new + * tunnel device is created and registered for use. + * + * Return: + * matching tunnel or NULL + **/ +static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, + int create) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + struct ip6_tnl __rcu **tp; + struct ip6_tnl *t; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + for (tp = vti6_tnl_bucket(ip6n, p); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr)) + return t; + } + if (!create) + return NULL; + return vti6_tnl_create(net, p); +} + +/** + * vti6_dev_uninit - tunnel device uninitializer + * @dev: the device to be destroyed + * + * Description: + * vti6_dev_uninit() removes tunnel from its list + **/ +static void vti6_dev_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + if (dev == ip6n->fb_tnl_dev) + RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); + else + vti6_tnl_unlink(ip6n, t); + dev_put(dev); +} + +static int vti6_rcv(struct sk_buff *skb) +{ + struct ip6_tnl *t; + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + + rcu_read_lock(); + if ((t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, + &ipv6h->daddr)) != NULL) { + if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { + rcu_read_unlock(); + goto discard; + } + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + rcu_read_unlock(); + return 0; + } + + if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { + t->dev->stats.rx_dropped++; + rcu_read_unlock(); + goto discard; + } + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; + skb->mark = be32_to_cpu(t->parms.i_key); + + rcu_read_unlock(); + + return xfrm6_rcv(skb); + } + rcu_read_unlock(); + return -EINVAL; +discard: + kfree_skb(skb); + return 0; +} + +static int vti6_rcv_cb(struct sk_buff *skb, int err) +{ + unsigned short family; + struct net_device *dev; + struct pcpu_sw_netstats *tstats; + struct xfrm_state *x; + struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; + + if (!t) + return 1; + + dev = t->dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + x = xfrm_input_state(skb); + family = x->inner_mode->afinfo->family; + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + return -EPERM; + + skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); + skb->dev = dev; + + tstats = this_cpu_ptr(dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; +} + +/** + * vti6_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ +static inline bool +vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +static bool vti6_state_check(const struct xfrm_state *x, + const struct in6_addr *dst, + const struct in6_addr *src) +{ + xfrm_address_t *daddr = (xfrm_address_t *)dst; + xfrm_address_t *saddr = (xfrm_address_t *)src; + + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. + */ + if (!x || x->props.mode != XFRM_MODE_TUNNEL || + x->props.family != AF_INET6) + return false; + + if (ipv6_addr_any(dst)) + return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6); + + if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6)) + return false; + + return true; +} + +/** + * vti6_xmit - send a packet + * @skb: the outgoing socket buffer + * @dev: the outgoing tunnel device + * @fl: the flow informations for the xfrm_lookup + **/ +static int +vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + struct dst_entry *dst = skb_dst(skb); + struct net_device *tdev; + int err = -1; + + if (!dst) + goto tx_err_link_failure; + + dst_hold(dst); + dst = xfrm_lookup(t->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + + if (!vti6_state_check(dst->xfrm, &t->parms.raddr, &t->parms.laddr)) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + t->parms.name); + goto tx_err_dst_release; + } + + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = skb_dst(skb)->dev; + + err = dst_output(skb); + if (net_xmit_eval(err) == 0) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += skb->len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static netdev_tx_t +vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + struct ipv6hdr *ipv6h; + struct flowi fl; + int ret; + + memset(&fl, 0, sizeof(fl)); + skb->mark = be32_to_cpu(t->parms.o_key); + + switch (skb->protocol) { + case htons(ETH_P_IPV6): + ipv6h = ipv6_hdr(skb); + + if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t) || vti6_addr_conflict(t, ipv6h)) + goto tx_err; + + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + default: + goto tx_err; + } + + ret = vti6_xmit(skb, dev, &fl); + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __be32 spi; + __u32 mark; + struct xfrm_state *x; + struct ip6_tnl *t; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct ip_comp_hdr *ipch; + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + int protocol = iph->nexthdr; + + t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr); + if (!t) + return -1; + + mark = be32_to_cpu(t->parms.o_key); + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); + xfrm_state_put(x); + + return 0; +} + +static void vti6_link_config(struct ip6_tnl *t) +{ + struct net_device *dev = t->dev; + struct __ip6_tnl_parm *p = &t->parms; + + memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + + p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | + IP6_TNL_F_CAP_PER_PACKET); + p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); + + if (p->flags & IP6_TNL_F_CAP_XMIT && p->flags & IP6_TNL_F_CAP_RCV) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + dev->iflink = p->link; +} + +/** + * vti6_tnl_change - update the tunnel parameters + * @t: tunnel to be changed + * @p: tunnel configuration parameters + * + * Description: + * vti6_tnl_change() updates the tunnel parameters + **/ +static int +vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) +{ + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; + t->parms.link = p->link; + t->parms.i_key = p->i_key; + t->parms.o_key = p->o_key; + t->parms.proto = p->proto; + ip6_tnl_dst_reset(t); + vti6_link_config(t); + return 0; +} + +static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +{ + struct net *net = dev_net(t->dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + int err; + + vti6_tnl_unlink(ip6n, t); + synchronize_net(); + err = vti6_tnl_change(t, p); + vti6_tnl_link(ip6n, t); + netdev_state_change(t->dev); + return err; +} + +static void +vti6_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u) +{ + p->laddr = u->laddr; + p->raddr = u->raddr; + p->link = u->link; + p->i_key = u->i_key; + p->o_key = u->o_key; + p->proto = u->proto; + + memcpy(p->name, u->name, sizeof(u->name)); +} + +static void +vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) +{ + u->laddr = p->laddr; + u->raddr = p->raddr; + u->link = p->link; + u->i_key = p->i_key; + u->o_key = p->o_key; + u->proto = p->proto; + + memcpy(u->name, p->name, sizeof(u->name)); +} + +/** + * vti6_tnl_ioctl - configure vti6 tunnels from userspace + * @dev: virtual device associated with tunnel + * @ifr: parameters passed from userspace + * @cmd: command to be performed + * + * Description: + * vti6_ioctl() is used for managing vti6 tunnels + * from userspace. + * + * The possible commands are the following: + * %SIOCGETTUNNEL: get tunnel parameters for device + * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters + * %SIOCCHGTUNNEL: change tunnel parameters to those given + * %SIOCDELTUNNEL: delete tunnel + * + * The fallback device "ip6_vti0", created during module + * initialization, can be used for creating other tunnel devices. + * + * Return: + * 0 on success, + * %-EFAULT if unable to copy data to or from userspace, + * %-EPERM if current process hasn't %CAP_NET_ADMIN set + * %-EINVAL if passed tunnel parameters are invalid, + * %-EEXIST if changing a tunnel's parameters would cause a conflict + * %-ENODEV if attempting to change or delete a nonexisting device + **/ +static int +vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip6_tnl_parm2 p; + struct __ip6_tnl_parm p1; + struct ip6_tnl *t = NULL; + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ip6n->fb_tnl_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + vti6_parm_from_user(&p1, &p); + t = vti6_locate(net, &p1, 0); + } else { + memset(&p, 0, sizeof(p)); + } + if (t == NULL) + t = netdev_priv(dev); + vti6_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + break; + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + break; + err = -EINVAL; + if (p.proto != IPPROTO_IPV6 && p.proto != 0) + break; + vti6_parm_from_user(&p1, &p); + t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); + if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else + t = netdev_priv(dev); + + err = vti6_update(t, &p1); + } + if (t) { + err = 0; + vti6_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + case SIOCDELTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + break; + + if (dev == ip6n->fb_tnl_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + break; + err = -ENOENT; + vti6_parm_from_user(&p1, &p); + t = vti6_locate(net, &p1, 0); + if (t == NULL) + break; + err = -EPERM; + if (t->dev == ip6n->fb_tnl_dev) + break; + dev = t->dev; + } + err = 0; + unregister_netdevice(dev); + break; + default: + err = -EINVAL; + } + return err; +} + +/** + * vti6_tnl_change_mtu - change mtu manually for tunnel device + * @dev: virtual device associated with tunnel + * @new_mtu: the new mtu + * + * Return: + * 0 on success, + * %-EINVAL if mtu too small + **/ +static int vti6_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops vti6_netdev_ops = { + .ndo_uninit = vti6_dev_uninit, + .ndo_start_xmit = vti6_tnl_xmit, + .ndo_do_ioctl = vti6_ioctl, + .ndo_change_mtu = vti6_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +/** + * vti6_dev_setup - setup virtual tunnel device + * @dev: virtual device associated with tunnel + * + * Description: + * Initialize function pointers and device parameters + **/ +static void vti6_dev_setup(struct net_device *dev) +{ + dev->netdev_ops = &vti6_netdev_ops; + dev->destructor = vti6_dev_free; + + dev->type = ARPHRD_TUNNEL6; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); + dev->mtu = ETH_DATA_LEN; + dev->flags |= IFF_NOARP; + dev->addr_len = sizeof(struct in6_addr); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +/** + * vti6_dev_init_gen - general initializer for all tunnel devices + * @dev: virtual device associated with tunnel + **/ +static inline int vti6_dev_init_gen(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + + t->dev = dev; + t->net = dev_net(dev); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + return 0; +} + +/** + * vti6_dev_init - initializer for all non fallback tunnel devices + * @dev: virtual device associated with tunnel + **/ +static int vti6_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + int err = vti6_dev_init_gen(dev); + + if (err) + return err; + vti6_link_config(t); + return 0; +} + +/** + * vti6_fb_tnl_dev_init - initializer for fallback tunnel device + * @dev: fallback device + * + * Return: 0 + **/ +static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + int err = vti6_dev_init_gen(dev); + + if (err) + return err; + + t->parms.proto = IPPROTO_IPV6; + dev_hold(dev); + + vti6_link_config(t); + + rcu_assign_pointer(ip6n->tnls_wc[0], t); + return 0; +} + +static int vti6_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + return 0; +} + +static void vti6_netlink_parms(struct nlattr *data[], + struct __ip6_tnl_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_VTI_LINK]) + parms->link = nla_get_u32(data[IFLA_VTI_LINK]); + + if (data[IFLA_VTI_LOCAL]) + nla_memcpy(&parms->laddr, data[IFLA_VTI_LOCAL], + sizeof(struct in6_addr)); + + if (data[IFLA_VTI_REMOTE]) + nla_memcpy(&parms->raddr, data[IFLA_VTI_REMOTE], + sizeof(struct in6_addr)); + + if (data[IFLA_VTI_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); + + if (data[IFLA_VTI_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); +} + +static int vti6_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net *net = dev_net(dev); + struct ip6_tnl *nt; + + nt = netdev_priv(dev); + vti6_netlink_parms(data, &nt->parms); + + nt->parms.proto = IPPROTO_IPV6; + + if (vti6_locate(net, &nt->parms, 0)) + return -EEXIST; + + return vti6_tnl_create2(dev); +} + +static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip6_tnl *t; + struct __ip6_tnl_parm p; + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + if (dev == ip6n->fb_tnl_dev) + return -EINVAL; + + vti6_netlink_parms(data, &p); + + t = vti6_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else + t = netdev_priv(dev); + + return vti6_update(t, &p); +} + +static size_t vti6_get_size(const struct net_device *dev) +{ + return + /* IFLA_VTI_LINK */ + nla_total_size(4) + + /* IFLA_VTI_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_VTI_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_VTI_IKEY */ + nla_total_size(4) + + /* IFLA_VTI_OKEY */ + nla_total_size(4) + + 0; +} + +static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + struct __ip6_tnl_parm *parm = &tunnel->parms; + + if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) || + nla_put(skb, IFLA_VTI_LOCAL, sizeof(struct in6_addr), + &parm->laddr) || + nla_put(skb, IFLA_VTI_REMOTE, sizeof(struct in6_addr), + &parm->raddr) || + nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || + nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = { + [IFLA_VTI_LINK] = { .type = NLA_U32 }, + [IFLA_VTI_LOCAL] = { .len = sizeof(struct in6_addr) }, + [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) }, + [IFLA_VTI_IKEY] = { .type = NLA_U32 }, + [IFLA_VTI_OKEY] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops vti6_link_ops __read_mostly = { + .kind = "vti6", + .maxtype = IFLA_VTI_MAX, + .policy = vti6_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = vti6_dev_setup, + .validate = vti6_validate, + .newlink = vti6_newlink, + .changelink = vti6_changelink, + .get_size = vti6_get_size, + .fill_info = vti6_fill_info, +}; + +static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) +{ + int h; + struct ip6_tnl *t; + LIST_HEAD(list); + + for (h = 0; h < HASH_SIZE; h++) { + t = rtnl_dereference(ip6n->tnls_r_l[h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, &list); + t = rtnl_dereference(t->next); + } + } + + t = rtnl_dereference(ip6n->tnls_wc[0]); + unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_many(&list); +} + +static int __net_init vti6_init_net(struct net *net) +{ + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct ip6_tnl *t = NULL; + int err; + + ip6n->tnls[0] = ip6n->tnls_wc; + ip6n->tnls[1] = ip6n->tnls_r_l; + + err = -ENOMEM; + ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", + vti6_dev_setup); + + if (!ip6n->fb_tnl_dev) + goto err_alloc_dev; + dev_net_set(ip6n->fb_tnl_dev, net); + + err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + err = register_netdev(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + t = netdev_priv(ip6n->fb_tnl_dev); + + strcpy(t->parms.name, ip6n->fb_tnl_dev->name); + return 0; + +err_register: + vti6_dev_free(ip6n->fb_tnl_dev); +err_alloc_dev: + return err; +} + +static void __net_exit vti6_exit_net(struct net *net) +{ + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + rtnl_lock(); + vti6_destroy_tunnels(ip6n); + rtnl_unlock(); +} + +static struct pernet_operations vti6_net_ops = { + .init = vti6_init_net, + .exit = vti6_exit_net, + .id = &vti6_net_id, + .size = sizeof(struct vti6_net), +}; + +static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +/** + * vti6_tunnel_init - register protocol and reserve needed resources + * + * Return: 0 on success + **/ +static int __init vti6_tunnel_init(void) +{ + int err; + + err = register_pernet_device(&vti6_net_ops); + if (err < 0) + goto out_pernet; + + err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); + if (err < 0) { + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + + err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); + if (err < 0) { + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + + err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) { + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + + err = rtnl_link_register(&vti6_link_ops); + if (err < 0) + goto rtnl_link_failed; + + return 0; + +rtnl_link_failed: + xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); +out: + unregister_pernet_device(&vti6_net_ops); +out_pernet: + return err; +} + +/** + * vti6_tunnel_cleanup - free resources and unregister protocol + **/ +static void __exit vti6_tunnel_cleanup(void) +{ + rtnl_link_unregister(&vti6_link_ops); + if (xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP)) + pr_info("%s: can't deregister protocol\n", __func__); + if (xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH)) + pr_info("%s: can't deregister protocol\n", __func__); + if (xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP)) + pr_info("%s: can't deregister protocol\n", __func__); + + unregister_pernet_device(&vti6_net_ops); +} + +module_init(vti6_tunnel_init); +module_exit(vti6_tunnel_cleanup); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("vti6"); +MODULE_ALIAS_NETDEV("ip6_vti0"); +MODULE_AUTHOR("Steffen Klassert"); +MODULE_DESCRIPTION("IPv6 virtual tunnel interface"); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c new file mode 100644 index 00000000000..8250474ab7d --- /dev/null +++ b/net/ipv6/ip6mr.c @@ -0,0 +1,2503 @@ +/* + * Linux IPv6 multicast routing support for BSD pim6sd + * Based on net/ipv4/ipmr.c. + * + * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr> + * LSIIT Laboratory, Strasbourg, France + * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com> + * 6WIND, Paris, France + * Copyright (C)2007,2008 USAGI/WIDE Project + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <net/checksum.h> +#include <net/netlink.h> +#include <net/fib_rules.h> + +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <linux/mroute6.h> +#include <linux/pim.h> +#include <net/addrconf.h> +#include <linux/netfilter_ipv6.h> +#include <linux/export.h> +#include <net/ip6_checksum.h> +#include <linux/netconf.h> + +struct mr6_table { + struct list_head list; +#ifdef CONFIG_NET_NS + struct net *net; +#endif + u32 id; + struct sock *mroute6_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc6_unres_queue; + struct list_head mfc6_cache_array[MFC6_LINES]; + struct mif_device vif6_table[MAXMIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; +#ifdef CONFIG_IPV6_PIMSM_V2 + int mroute_reg_vif_num; +#endif +}; + +struct ip6mr_rule { + struct fib_rule common; +}; + +struct ip6mr_result { + struct mr6_table *mrt; +}; + +/* Big lock, protecting vif table, mrt cache and mroute socket state. + Note that the changes are semaphored via rtnl_lock. + */ + +static DEFINE_RWLOCK(mrt_lock); + +/* + * Multicast router control variables + */ + +#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL) + +/* Special spinlock for queue of unresolved entries */ +static DEFINE_SPINLOCK(mfc_unres_lock); + +/* We return to original Alan's scheme. Hash table of resolved + entries is changed only in process context and protected + with weak lock mrt_lock. Queue of unresolved entries is protected + with strong spinlock mfc_unres_lock. + + In this case data path is free of exclusive locks at all. + */ + +static struct kmem_cache *mrt_cachep __read_mostly; + +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id); +static void ip6mr_free_table(struct mr6_table *mrt); + +static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache); +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, + mifi_t mifi, int assert); +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm); +static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, + int cmd); +static int ip6mr_rtm_dumproute(struct sk_buff *skb, + struct netlink_callback *cb); +static void mroute_clean_tables(struct mr6_table *mrt); +static void ipmr_expire_process(unsigned long arg); + +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES +#define ip6mr_for_each_table(mrt, net) \ + list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + + ip6mr_for_each_table(mrt, net) { + if (mrt->id == id) + return mrt; + } + return NULL; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + int err; + struct ip6mr_result res; + struct fib_lookup_arg arg = { + .result = &res, + .flags = FIB_LOOKUP_NOREF, + }; + + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, + flowi6_to_flowi(flp6), 0, &arg); + if (err < 0) + return err; + *mrt = res.mrt; + return 0; +} + +static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct ip6mr_result *res = arg->result; + struct mr6_table *mrt; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + mrt = ip6mr_get_table(rule->fr_net, rule->table); + if (mrt == NULL) + return -EAGAIN; + res->mrt = mrt; + return 0; +} + +static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) +{ + return 1; +} + +static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { + FRA_GENERIC_POLICY, +}; + +static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, struct nlattr **tb) +{ + return 0; +} + +static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + return 1; +} + +static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + frh->dst_len = 0; + frh->src_len = 0; + frh->tos = 0; + return 0; +} + +static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { + .family = RTNL_FAMILY_IP6MR, + .rule_size = sizeof(struct ip6mr_rule), + .addr_size = sizeof(struct in6_addr), + .action = ip6mr_rule_action, + .match = ip6mr_rule_match, + .configure = ip6mr_rule_configure, + .compare = ip6mr_rule_compare, + .default_pref = fib_default_rule_pref, + .fill = ip6mr_rule_fill, + .nlgroup = RTNLGRP_IPV6_RULE, + .policy = ip6mr_rule_policy, + .owner = THIS_MODULE, +}; + +static int __net_init ip6mr_rules_init(struct net *net) +{ + struct fib_rules_ops *ops; + struct mr6_table *mrt; + int err; + + ops = fib_rules_register(&ip6mr_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + INIT_LIST_HEAD(&net->ipv6.mr6_tables); + + mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) { + err = -ENOMEM; + goto err1; + } + + err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); + if (err < 0) + goto err2; + + net->ipv6.mr6_rules_ops = ops; + return 0; + +err2: + kfree(mrt); +err1: + fib_rules_unregister(ops); + return err; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + struct mr6_table *mrt, *next; + + rtnl_lock(); + list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { + list_del(&mrt->list); + ip6mr_free_table(mrt); + } + rtnl_unlock(); + fib_rules_unregister(net->ipv6.mr6_rules_ops); +} +#else +#define ip6mr_for_each_table(mrt, net) \ + for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + return net->ipv6.mrt6; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + *mrt = net->ipv6.mrt6; + return 0; +} + +static int __net_init ip6mr_rules_init(struct net *net) +{ + net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT); + return net->ipv6.mrt6 ? 0 : -ENOMEM; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + rtnl_lock(); + ip6mr_free_table(net->ipv6.mrt6); + net->ipv6.mrt6 = NULL; + rtnl_unlock(); +} +#endif + +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + unsigned int i; + + mrt = ip6mr_get_table(net, id); + if (mrt != NULL) + return mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (mrt == NULL) + return NULL; + mrt->id = id; + write_pnet(&mrt->net, net); + + /* Forwarding cache */ + for (i = 0; i < MFC6_LINES; i++) + INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]); + + INIT_LIST_HEAD(&mrt->mfc6_unres_queue); + + setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, + (unsigned long)mrt); + +#ifdef CONFIG_IPV6_PIMSM_V2 + mrt->mroute_reg_vif_num = -1; +#endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); +#endif + return mrt; +} + +static void ip6mr_free_table(struct mr6_table *mrt) +{ + del_timer(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} + +#ifdef CONFIG_PROC_FS + +struct ipmr_mfc_iter { + struct seq_net_private p; + struct mr6_table *mrt; + struct list_head *cache; + int ct; +}; + + +static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, + struct ipmr_mfc_iter *it, loff_t pos) +{ + struct mr6_table *mrt = it->mrt; + struct mfc6_cache *mfc; + + read_lock(&mrt_lock); + for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) { + it->cache = &mrt->mfc6_cache_array[it->ct]; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + } + read_unlock(&mrt_lock); + + spin_lock_bh(&mfc_unres_lock); + it->cache = &mrt->mfc6_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + spin_unlock_bh(&mfc_unres_lock); + + it->cache = NULL; + return NULL; +} + +/* + * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif + */ + +struct ipmr_vif_iter { + struct seq_net_private p; + struct mr6_table *mrt; + int ct; +}; + +static struct mif_device *ip6mr_vif_seq_idx(struct net *net, + struct ipmr_vif_iter *iter, + loff_t pos) +{ + struct mr6_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!MIF_EXISTS(mrt, iter->ct)) + continue; + if (pos-- == 0) + return &mrt->vif6_table[iter->ct]; + } + return NULL; +} + +static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(mrt_lock) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + iter->mrt = mrt; + + read_lock(&mrt_lock); + return *pos ? ip6mr_vif_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt = iter->mrt; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip6mr_vif_seq_idx(net, iter, 0); + + while (++iter->ct < mrt->maxvif) { + if (!MIF_EXISTS(mrt, iter->ct)) + continue; + return &mrt->vif6_table[iter->ct]; + } + return NULL; +} + +static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) + __releases(mrt_lock) +{ + read_unlock(&mrt_lock); +} + +static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) +{ + struct ipmr_vif_iter *iter = seq->private; + struct mr6_table *mrt = iter->mrt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); + } else { + const struct mif_device *vif = v; + const char *name = vif->dev ? vif->dev->name : "none"; + + seq_printf(seq, + "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", + vif - mrt->vif6_table, + name, vif->bytes_in, vif->pkt_in, + vif->bytes_out, vif->pkt_out, + vif->flags); + } + return 0; +} + +static const struct seq_operations ip6mr_vif_seq_ops = { + .start = ip6mr_vif_seq_start, + .next = ip6mr_vif_seq_next, + .stop = ip6mr_vif_seq_stop, + .show = ip6mr_vif_seq_show, +}; + +static int ip6mr_vif_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip6mr_vif_seq_ops, + sizeof(struct ipmr_vif_iter)); +} + +static const struct file_operations ip6mr_vif_fops = { + .owner = THIS_MODULE, + .open = ip6mr_vif_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + it->mrt = mrt; + return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mfc6_cache *mfc = v; + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt = it->mrt; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return ipmr_mfc_seq_idx(net, seq->private, 0); + + if (mfc->list.next != it->cache) + return list_entry(mfc->list.next, struct mfc6_cache, list); + + if (it->cache == &mrt->mfc6_unres_queue) + goto end_of_list; + + BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]); + + while (++it->ct < MFC6_LINES) { + it->cache = &mrt->mfc6_cache_array[it->ct]; + if (list_empty(it->cache)) + continue; + return list_first_entry(it->cache, struct mfc6_cache, list); + } + + /* exhausted cache_array, show unresolved */ + read_unlock(&mrt_lock); + it->cache = &mrt->mfc6_unres_queue; + it->ct = 0; + + spin_lock_bh(&mfc_unres_lock); + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mfc6_cache, list); + + end_of_list: + spin_unlock_bh(&mfc_unres_lock); + it->cache = NULL; + + return NULL; +} + +static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct ipmr_mfc_iter *it = seq->private; + struct mr6_table *mrt = it->mrt; + + if (it->cache == &mrt->mfc6_unres_queue) + spin_unlock_bh(&mfc_unres_lock); + else if (it->cache == mrt->mfc6_cache_array) + read_unlock(&mrt_lock); +} + +static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) +{ + int n; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Group " + "Origin " + "Iif Pkts Bytes Wrong Oifs\n"); + } else { + const struct mfc6_cache *mfc = v; + const struct ipmr_mfc_iter *it = seq->private; + struct mr6_table *mrt = it->mrt; + + seq_printf(seq, "%pI6 %pI6 %-3hd", + &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, + mfc->mf6c_parent); + + if (it->cache != &mrt->mfc6_unres_queue) { + seq_printf(seq, " %8lu %8lu %8lu", + mfc->mfc_un.res.pkt, + mfc->mfc_un.res.bytes, + mfc->mfc_un.res.wrong_if); + for (n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++) { + if (MIF_EXISTS(mrt, n) && + mfc->mfc_un.res.ttls[n] < 255) + seq_printf(seq, + " %2d:%-3d", + n, mfc->mfc_un.res.ttls[n]); + } + } else { + /* unresolved mfc_caches don't contain + * pkt, bytes and wrong_if values + */ + seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); + } + seq_putc(seq, '\n'); + } + return 0; +} + +static const struct seq_operations ipmr_mfc_seq_ops = { + .start = ipmr_mfc_seq_start, + .next = ipmr_mfc_seq_next, + .stop = ipmr_mfc_seq_stop, + .show = ipmr_mfc_seq_show, +}; + +static int ipmr_mfc_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipmr_mfc_seq_ops, + sizeof(struct ipmr_mfc_iter)); +} + +static const struct file_operations ip6mr_mfc_fops = { + .owner = THIS_MODULE, + .open = ipmr_mfc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +#ifdef CONFIG_IPV6_PIMSM_V2 + +static int pim6_rcv(struct sk_buff *skb) +{ + struct pimreghdr *pim; + struct ipv6hdr *encap; + struct net_device *reg_dev = NULL; + struct net *net = dev_net(skb->dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int reg_vif_num; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) + goto drop; + + pim = (struct pimreghdr *)skb_transport_header(skb); + if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) || + (pim->flags & PIM_NULL_REGISTER) || + (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + sizeof(*pim), IPPROTO_PIM, + csum_partial((void *)pim, sizeof(*pim), 0)) && + csum_fold(skb_checksum(skb, 0, skb->len, 0)))) + goto drop; + + /* check if the inner packet is destined to mcast group */ + encap = (struct ipv6hdr *)(skb_transport_header(skb) + + sizeof(*pim)); + + if (!ipv6_addr_is_multicast(&encap->daddr) || + encap->payload_len == 0 || + ntohs(encap->payload_len) + sizeof(*pim) > skb->len) + goto drop; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto drop; + reg_vif_num = mrt->mroute_reg_vif_num; + + read_lock(&mrt_lock); + if (reg_vif_num >= 0) + reg_dev = mrt->vif6_table[reg_vif_num].dev; + if (reg_dev) + dev_hold(reg_dev); + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + goto drop; + + skb->mac_header = skb->network_header; + skb_pull(skb, (u8 *)encap - skb->data); + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IPV6); + skb->ip_summed = CHECKSUM_NONE; + + skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); + + netif_rx(skb); + + dev_put(reg_dev); + return 0; + drop: + kfree_skb(skb); + return 0; +} + +static const struct inet6_protocol pim6_protocol = { + .handler = pim6_rcv, +}; + +/* Service routines creating virtual interfaces: PIMREG */ + +static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_oif = dev->ifindex, + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + read_lock(&mrt_lock); + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); + read_unlock(&mrt_lock); + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static const struct net_device_ops reg_vif_netdev_ops = { + .ndo_start_xmit = reg_vif_xmit, +}; + +static void reg_vif_setup(struct net_device *dev) +{ + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; + dev->flags = IFF_NOARP; + dev->netdev_ops = ®_vif_netdev_ops; + dev->destructor = free_netdev; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) +{ + struct net_device *dev; + char name[IFNAMSIZ]; + + if (mrt->id == RT6_TABLE_DFLT) + sprintf(name, "pim6reg"); + else + sprintf(name, "pim6reg%u", mrt->id); + + dev = alloc_netdev(0, name, reg_vif_setup); + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + if (register_netdevice(dev)) { + free_netdev(dev); + return NULL; + } + dev->iflink = 0; + + if (dev_open(dev)) + goto failure; + + dev_hold(dev); + return dev; + +failure: + /* allow the register to be completed before unregistering. */ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} +#endif + +/* + * Delete a VIF entry + */ + +static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) +{ + struct mif_device *v; + struct net_device *dev; + struct inet6_dev *in6_dev; + + if (vifi < 0 || vifi >= mrt->maxvif) + return -EADDRNOTAVAIL; + + v = &mrt->vif6_table[vifi]; + + write_lock_bh(&mrt_lock); + dev = v->dev; + v->dev = NULL; + + if (!dev) { + write_unlock_bh(&mrt_lock); + return -EADDRNOTAVAIL; + } + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (vifi == mrt->mroute_reg_vif_num) + mrt->mroute_reg_vif_num = -1; +#endif + + if (vifi + 1 == mrt->maxvif) { + int tmp; + for (tmp = vifi - 1; tmp >= 0; tmp--) { + if (MIF_EXISTS(mrt, tmp)) + break; + } + mrt->maxvif = tmp + 1; + } + + write_unlock_bh(&mrt_lock); + + dev_set_allmulti(dev, -1); + + in6_dev = __in6_dev_get(dev); + if (in6_dev) { + in6_dev->cnf.mc_forwarding--; + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_MC_FORWARDING, + dev->ifindex, &in6_dev->cnf); + } + + if (v->flags & MIFF_REGISTER) + unregister_netdevice_queue(dev, head); + + dev_put(dev); + return 0; +} + +static inline void ip6mr_cache_free(struct mfc6_cache *c) +{ + kmem_cache_free(mrt_cachep, c); +} + +/* Destroy an unresolved cache entry, killing queued skbs + and reporting error to netlink readers. + */ + +static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) +{ + struct net *net = read_pnet(&mrt->net); + struct sk_buff *skb; + + atomic_dec(&mrt->cache_resolve_queue_len); + + while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { + if (ipv6_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; + rtnl_unicast(skb, net, NETLINK_CB(skb).portid); + } else + kfree_skb(skb); + } + + ip6mr_cache_free(c); +} + + +/* Timer process for all the unresolved queue. */ + +static void ipmr_do_expire_process(struct mr6_table *mrt) +{ + unsigned long now = jiffies; + unsigned long expires = 10 * HZ; + struct mfc6_cache *c, *next; + + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + if (time_after(c->mfc_un.unres.expires, now)) { + /* not yet... */ + unsigned long interval = c->mfc_un.unres.expires - now; + if (interval < expires) + expires = interval; + continue; + } + + list_del(&c->list); + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_destroy_unres(mrt, c); + } + + if (!list_empty(&mrt->mfc6_unres_queue)) + mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); +} + +static void ipmr_expire_process(unsigned long arg) +{ + struct mr6_table *mrt = (struct mr6_table *)arg; + + if (!spin_trylock(&mfc_unres_lock)) { + mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); + return; + } + + if (!list_empty(&mrt->mfc6_unres_queue)) + ipmr_do_expire_process(mrt); + + spin_unlock(&mfc_unres_lock); +} + +/* Fill oifs list. It is called under write locked mrt_lock. */ + +static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache, + unsigned char *ttls) +{ + int vifi; + + cache->mfc_un.res.minvif = MAXMIFS; + cache->mfc_un.res.maxvif = 0; + memset(cache->mfc_un.res.ttls, 255, MAXMIFS); + + for (vifi = 0; vifi < mrt->maxvif; vifi++) { + if (MIF_EXISTS(mrt, vifi) && + ttls[vifi] && ttls[vifi] < 255) { + cache->mfc_un.res.ttls[vifi] = ttls[vifi]; + if (cache->mfc_un.res.minvif > vifi) + cache->mfc_un.res.minvif = vifi; + if (cache->mfc_un.res.maxvif <= vifi) + cache->mfc_un.res.maxvif = vifi + 1; + } + } +} + +static int mif6_add(struct net *net, struct mr6_table *mrt, + struct mif6ctl *vifc, int mrtsock) +{ + int vifi = vifc->mif6c_mifi; + struct mif_device *v = &mrt->vif6_table[vifi]; + struct net_device *dev; + struct inet6_dev *in6_dev; + int err; + + /* Is vif busy ? */ + if (MIF_EXISTS(mrt, vifi)) + return -EADDRINUSE; + + switch (vifc->mif6c_flags) { +#ifdef CONFIG_IPV6_PIMSM_V2 + case MIFF_REGISTER: + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (mrt->mroute_reg_vif_num >= 0) + return -EADDRINUSE; + dev = ip6mr_reg_vif(net, mrt); + if (!dev) + return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + unregister_netdevice(dev); + dev_put(dev); + return err; + } + break; +#endif + case 0: + dev = dev_get_by_index(net, vifc->mif6c_pifi); + if (!dev) + return -EADDRNOTAVAIL; + err = dev_set_allmulti(dev, 1); + if (err) { + dev_put(dev); + return err; + } + break; + default: + return -EINVAL; + } + + in6_dev = __in6_dev_get(dev); + if (in6_dev) { + in6_dev->cnf.mc_forwarding++; + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_MC_FORWARDING, + dev->ifindex, &in6_dev->cnf); + } + + /* + * Fill in the VIF structures + */ + v->rate_limit = vifc->vifc_rate_limit; + v->flags = vifc->mif6c_flags; + if (!mrtsock) + v->flags |= VIFF_STATIC; + v->threshold = vifc->vifc_threshold; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->link = dev->ifindex; + if (v->flags & MIFF_REGISTER) + v->link = dev->iflink; + + /* And finish update writing critical data */ + write_lock_bh(&mrt_lock); + v->dev = dev; +#ifdef CONFIG_IPV6_PIMSM_V2 + if (v->flags & MIFF_REGISTER) + mrt->mroute_reg_vif_num = vifi; +#endif + if (vifi + 1 > mrt->maxvif) + mrt->maxvif = vifi + 1; + write_unlock_bh(&mrt_lock); + return 0; +} + +static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, + const struct in6_addr *origin, + const struct in6_addr *mcastgrp) +{ + int line = MFC6_HASH(mcastgrp, origin); + struct mfc6_cache *c; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, origin) && + ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) + return c; + } + return NULL; +} + +/* Look for a (*,*,oif) entry */ +static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, + mifi_t mifi) +{ + int line = MFC6_HASH(&in6addr_any, &in6addr_any); + struct mfc6_cache *c; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp) && + (c->mfc_un.res.ttls[mifi] < 255)) + return c; + + return NULL; +} + +/* Look for a (*,G) entry */ +static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt, + struct in6_addr *mcastgrp, + mifi_t mifi) +{ + int line = MFC6_HASH(mcastgrp, &in6addr_any); + struct mfc6_cache *c, *proxy; + + if (ipv6_addr_any(mcastgrp)) + goto skip; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) { + if (c->mfc_un.res.ttls[mifi] < 255) + return c; + + /* It's ok if the mifi is part of the static tree */ + proxy = ip6mr_cache_find_any_parent(mrt, + c->mf6c_parent); + if (proxy && proxy->mfc_un.res.ttls[mifi] < 255) + return c; + } + +skip: + return ip6mr_cache_find_any_parent(mrt, mifi); +} + +/* + * Allocate a multicast cache entry + */ +static struct mfc6_cache *ip6mr_cache_alloc(void) +{ + struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); + if (c == NULL) + return NULL; + c->mfc_un.res.minvif = MAXMIFS; + return c; +} + +static struct mfc6_cache *ip6mr_cache_alloc_unres(void) +{ + struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); + if (c == NULL) + return NULL; + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10 * HZ; + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, + struct mfc6_cache *uc, struct mfc6_cache *c) +{ + struct sk_buff *skb; + + /* + * Play the pending entries through our router + */ + + while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if (ipv6_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); + + if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { + nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; + } + rtnl_unicast(skb, net, NETLINK_CB(skb).portid); + } else + ip6_mr_forward(net, mrt, skb, c); + } +} + +/* + * Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd + * expects the following bizarre scheme. + * + * Called under mrt_lock. + */ + +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, + mifi_t mifi, int assert) +{ + struct sk_buff *skb; + struct mrt6msg *msg; + int ret; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (assert == MRT6MSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) + +sizeof(*msg)); + else +#endif + skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); + + if (!skb) + return -ENOBUFS; + + /* I suppose that internal messages + * do not require checksums */ + + skb->ip_summed = CHECKSUM_UNNECESSARY; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (assert == MRT6MSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix length etc. + And all this only to mangle msg->im6_msgtype and + to set msg->im6_mbz to "mbz" :-) + */ + skb_push(skb, -skb_network_offset(pkt)); + + skb_push(skb, sizeof(*msg)); + skb_reset_transport_header(skb); + msg = (struct mrt6msg *)skb_transport_header(skb); + msg->im6_mbz = 0; + msg->im6_msgtype = MRT6MSG_WHOLEPKT; + msg->im6_mif = mrt->mroute_reg_vif_num; + msg->im6_pad = 0; + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else +#endif + { + /* + * Copy the IP header + */ + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); + + /* + * Add our header + */ + skb_put(skb, sizeof(*msg)); + skb_reset_transport_header(skb); + msg = (struct mrt6msg *)skb_transport_header(skb); + + msg->im6_mbz = 0; + msg->im6_msgtype = assert; + msg->im6_mif = mifi; + msg->im6_pad = 0; + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; + + skb_dst_set(skb, dst_clone(skb_dst(pkt))); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + if (mrt->mroute6_sk == NULL) { + kfree_skb(skb); + return -EINVAL; + } + + /* + * Deliver to user space multicast routing algorithms + */ + ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb); + if (ret < 0) { + net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); + kfree_skb(skb); + } + + return ret; +} + +/* + * Queue a packet for resolution. It gets locked cache entry! + */ + +static int +ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) +{ + bool found = false; + int err; + struct mfc6_cache *c; + + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(c, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && + ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { + found = true; + break; + } + } + + if (!found) { + /* + * Create a new entry if allowable + */ + + if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || + (c = ip6mr_cache_alloc_unres()) == NULL) { + spin_unlock_bh(&mfc_unres_lock); + + kfree_skb(skb); + return -ENOBUFS; + } + + /* + * Fill in the new cache entry + */ + c->mf6c_parent = -1; + c->mf6c_origin = ipv6_hdr(skb)->saddr; + c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; + + /* + * Reflect first query at pim6sd + */ + err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); + if (err < 0) { + /* If the report failed throw the cache entry + out - Brad Parker + */ + spin_unlock_bh(&mfc_unres_lock); + + ip6mr_cache_free(c); + kfree_skb(skb); + return err; + } + + atomic_inc(&mrt->cache_resolve_queue_len); + list_add(&c->list, &mrt->mfc6_unres_queue); + mr6_netlink_event(mrt, c, RTM_NEWROUTE); + + ipmr_do_expire_process(mrt); + } + + /* + * See if we can append the packet + */ + if (c->mfc_un.unres.unresolved.qlen > 3) { + kfree_skb(skb); + err = -ENOBUFS; + } else { + skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + err = 0; + } + + spin_unlock_bh(&mfc_unres_lock); + return err; +} + +/* + * MFC6 cache manipulation by user space + */ + +static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, + int parent) +{ + int line; + struct mfc6_cache *c, *next; + + line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && + ipv6_addr_equal(&c->mf6c_mcastgrp, + &mfc->mf6cc_mcastgrp.sin6_addr) && + (parent == -1 || parent == c->mf6c_parent)) { + write_lock_bh(&mrt_lock); + list_del(&c->list); + write_unlock_bh(&mrt_lock); + + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_cache_free(c); + return 0; + } + } + return -ENOENT; +} + +static int ip6mr_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct mr6_table *mrt; + struct mif_device *v; + int ct; + LIST_HEAD(list); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + ip6mr_for_each_table(mrt, net) { + v = &mrt->vif6_table[0]; + for (ct = 0; ct < mrt->maxvif; ct++, v++) { + if (v->dev == dev) + mif6_delete(mrt, ct, &list); + } + } + unregister_netdevice_many(&list); + + return NOTIFY_DONE; +} + +static struct notifier_block ip6_mr_notifier = { + .notifier_call = ip6mr_device_event +}; + +/* + * Setup for IP multicast routing + */ + +static int __net_init ip6mr_net_init(struct net *net) +{ + int err; + + err = ip6mr_rules_init(net); + if (err < 0) + goto fail; + +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (!proc_create("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_fops)) + goto proc_vif_fail; + if (!proc_create("ip6_mr_cache", 0, net->proc_net, &ip6mr_mfc_fops)) + goto proc_cache_fail; +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +proc_cache_fail: + remove_proc_entry("ip6_mr_vif", net->proc_net); +proc_vif_fail: + ip6mr_rules_exit(net); +#endif +fail: + return err; +} + +static void __net_exit ip6mr_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip6_mr_cache", net->proc_net); + remove_proc_entry("ip6_mr_vif", net->proc_net); +#endif + ip6mr_rules_exit(net); +} + +static struct pernet_operations ip6mr_net_ops = { + .init = ip6mr_net_init, + .exit = ip6mr_net_exit, +}; + +int __init ip6_mr_init(void) +{ + int err; + + mrt_cachep = kmem_cache_create("ip6_mrt_cache", + sizeof(struct mfc6_cache), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!mrt_cachep) + return -ENOMEM; + + err = register_pernet_subsys(&ip6mr_net_ops); + if (err) + goto reg_pernet_fail; + + err = register_netdevice_notifier(&ip6_mr_notifier); + if (err) + goto reg_notif_fail; +#ifdef CONFIG_IPV6_PIMSM_V2 + if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { + pr_err("%s: can't add PIM protocol\n", __func__); + err = -EAGAIN; + goto add_proto_fail; + } +#endif + rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, + ip6mr_rtm_dumproute, NULL); + return 0; +#ifdef CONFIG_IPV6_PIMSM_V2 +add_proto_fail: + unregister_netdevice_notifier(&ip6_mr_notifier); +#endif +reg_notif_fail: + unregister_pernet_subsys(&ip6mr_net_ops); +reg_pernet_fail: + kmem_cache_destroy(mrt_cachep); + return err; +} + +void ip6_mr_cleanup(void) +{ + unregister_netdevice_notifier(&ip6_mr_notifier); + unregister_pernet_subsys(&ip6mr_net_ops); + kmem_cache_destroy(mrt_cachep); +} + +static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, + struct mf6cctl *mfc, int mrtsock, int parent) +{ + bool found = false; + int line; + struct mfc6_cache *uc, *c; + unsigned char ttls[MAXMIFS]; + int i; + + if (mfc->mf6cc_parent >= MAXMIFS) + return -ENFILE; + + memset(ttls, 255, MAXMIFS); + for (i = 0; i < MAXMIFS; i++) { + if (IF_ISSET(i, &mfc->mf6cc_ifset)) + ttls[i] = 1; + + } + + line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && + ipv6_addr_equal(&c->mf6c_mcastgrp, + &mfc->mf6cc_mcastgrp.sin6_addr) && + (parent == -1 || parent == mfc->mf6cc_parent)) { + found = true; + break; + } + } + + if (found) { + write_lock_bh(&mrt_lock); + c->mf6c_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, c, ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + write_unlock_bh(&mrt_lock); + mr6_netlink_event(mrt, c, RTM_NEWROUTE); + return 0; + } + + if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && + !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) + return -EINVAL; + + c = ip6mr_cache_alloc(); + if (c == NULL) + return -ENOMEM; + + c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; + c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; + c->mf6c_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, c, ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + + write_lock_bh(&mrt_lock); + list_add(&c->list, &mrt->mfc6_cache_array[line]); + write_unlock_bh(&mrt_lock); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. + */ + found = false; + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && + ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { + list_del(&uc->list); + atomic_dec(&mrt->cache_resolve_queue_len); + found = true; + break; + } + } + if (list_empty(&mrt->mfc6_unres_queue)) + del_timer(&mrt->ipmr_expire_timer); + spin_unlock_bh(&mfc_unres_lock); + + if (found) { + ip6mr_cache_resolve(net, mrt, uc, c); + ip6mr_cache_free(uc); + } + mr6_netlink_event(mrt, c, RTM_NEWROUTE); + return 0; +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +static void mroute_clean_tables(struct mr6_table *mrt) +{ + int i; + LIST_HEAD(list); + struct mfc6_cache *c, *next; + + /* + * Shut down all active vif entries + */ + for (i = 0; i < mrt->maxvif; i++) { + if (!(mrt->vif6_table[i].flags & VIFF_STATIC)) + mif6_delete(mrt, i, &list); + } + unregister_netdevice_many(&list); + + /* + * Wipe the cache + */ + for (i = 0; i < MFC6_LINES; i++) { + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { + if (c->mfc_flags & MFC_STATIC) + continue; + write_lock_bh(&mrt_lock); + list_del(&c->list); + write_unlock_bh(&mrt_lock); + + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_cache_free(c); + } + } + + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_del(&c->list); + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_destroy_unres(mrt, c); + } + spin_unlock_bh(&mfc_unres_lock); + } +} + +static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) +{ + int err = 0; + struct net *net = sock_net(sk); + + rtnl_lock(); + write_lock_bh(&mrt_lock); + if (likely(mrt->mroute6_sk == NULL)) { + mrt->mroute6_sk = sk; + net->ipv6.devconf_all->mc_forwarding++; + inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } + else + err = -EADDRINUSE; + write_unlock_bh(&mrt_lock); + + rtnl_unlock(); + + return err; +} + +int ip6mr_sk_done(struct sock *sk) +{ + int err = -EACCES; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + rtnl_lock(); + ip6mr_for_each_table(mrt, net) { + if (sk == mrt->mroute6_sk) { + write_lock_bh(&mrt_lock); + mrt->mroute6_sk = NULL; + net->ipv6.devconf_all->mc_forwarding--; + inet6_netconf_notify_devconf(net, + NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + write_unlock_bh(&mrt_lock); + + mroute_clean_tables(mrt); + err = 0; + break; + } + } + rtnl_unlock(); + + return err; +} + +struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +{ + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, + .flowi6_oif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + return NULL; + + return mrt->mroute6_sk; +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. + */ + +int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) +{ + int ret, parent = 0; + struct mif6ctl vif; + struct mf6cctl mfc; + mifi_t mifi; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + if (optname != MRT6_INIT) { + if (sk != mrt->mroute6_sk && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EACCES; + } + + switch (optname) { + case MRT6_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + if (optlen < sizeof(int)) + return -EINVAL; + + return ip6mr_sk_init(mrt, sk); + + case MRT6_DONE: + return ip6mr_sk_done(sk); + + case MRT6_ADD_MIF: + if (optlen < sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif, optval, sizeof(vif))) + return -EFAULT; + if (vif.mif6c_mifi >= MAXMIFS) + return -ENFILE; + rtnl_lock(); + ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk); + rtnl_unlock(); + return ret; + + case MRT6_DEL_MIF: + if (optlen < sizeof(mifi_t)) + return -EINVAL; + if (copy_from_user(&mifi, optval, sizeof(mifi_t))) + return -EFAULT; + rtnl_lock(); + ret = mif6_delete(mrt, mifi, NULL); + rtnl_unlock(); + return ret; + + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ + case MRT6_ADD_MFC: + case MRT6_DEL_MFC: + parent = -1; + case MRT6_ADD_MFC_PROXY: + case MRT6_DEL_MFC_PROXY: + if (optlen < sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc, optval, sizeof(mfc))) + return -EFAULT; + if (parent == 0) + parent = mfc.mf6cc_parent; + rtnl_lock(); + if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) + ret = ip6mr_mfc_delete(mrt, &mfc, parent); + else + ret = ip6mr_mfc_add(net, mrt, &mfc, + sk == mrt->mroute6_sk, parent); + rtnl_unlock(); + return ret; + + /* + * Control PIM assert (to activate pim will activate assert) + */ + case MRT6_ASSERT: + { + int v; + + if (optlen != sizeof(v)) + return -EINVAL; + if (get_user(v, (int __user *)optval)) + return -EFAULT; + mrt->mroute_do_assert = v; + return 0; + } + +#ifdef CONFIG_IPV6_PIMSM_V2 + case MRT6_PIM: + { + int v; + + if (optlen != sizeof(v)) + return -EINVAL; + if (get_user(v, (int __user *)optval)) + return -EFAULT; + v = !!v; + rtnl_lock(); + ret = 0; + if (v != mrt->mroute_do_pim) { + mrt->mroute_do_pim = v; + mrt->mroute_do_assert = v; + } + rtnl_unlock(); + return ret; + } + +#endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + case MRT6_TABLE: + { + u32 v; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(v, (u32 __user *)optval)) + return -EFAULT; + /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ + if (v != RT_TABLE_DEFAULT && v >= 100000000) + return -EINVAL; + if (sk == mrt->mroute6_sk) + return -EBUSY; + + rtnl_lock(); + ret = 0; + if (!ip6mr_new_table(net, v)) + ret = -ENOMEM; + raw6_sk(sk)->ip6mr_table = v; + rtnl_unlock(); + return ret; + } +#endif + /* + * Spurious command, or MRT6_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen) +{ + int olr; + int val; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (optname) { + case MRT6_VERSION: + val = 0x0305; + break; +#ifdef CONFIG_IPV6_PIMSM_V2 + case MRT6_PIM: + val = mrt->mroute_do_pim; + break; +#endif + case MRT6_ASSERT: + val = mrt->mroute_do_assert; + break; + default: + return -ENOPROTOOPT; + } + + if (get_user(olr, optlen)) + return -EFAULT; + + olr = min_t(int, olr, sizeof(int)); + if (olr < 0) + return -EINVAL; + + if (put_user(olr, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, olr)) + return -EFAULT; + return 0; +} + +/* + * The IP multicast ioctl support routines. + */ + +int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) +{ + struct sioc_sg_req6 sr; + struct sioc_mif_req6 vr; + struct mif_device *vif; + struct mfc6_cache *c; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETMIFCNT_IN6: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.mifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT_IN6: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +struct compat_sioc_sg_req6 { + struct sockaddr_in6 src; + struct sockaddr_in6 grp; + compat_ulong_t pktcnt; + compat_ulong_t bytecnt; + compat_ulong_t wrong_if; +}; + +struct compat_sioc_mif_req6 { + mifi_t mifi; + compat_ulong_t icount; + compat_ulong_t ocount; + compat_ulong_t ibytes; + compat_ulong_t obytes; +}; + +int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + struct compat_sioc_sg_req6 sr; + struct compat_sioc_mif_req6 vr; + struct mif_device *vif; + struct mfc6_cache *c; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETMIFCNT_IN6: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.mifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT_IN6: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} +#endif + +static inline int ip6mr_forward2_finish(struct sk_buff *skb) +{ + IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTFORWDATAGRAMS); + IP6_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTOCTETS, skb->len); + return dst_output(skb); +} + +/* + * Processing handlers for ip6mr_forward + */ + +static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *c, int vifi) +{ + struct ipv6hdr *ipv6h; + struct mif_device *vif = &mrt->vif6_table[vifi]; + struct net_device *dev; + struct dst_entry *dst; + struct flowi6 fl6; + + if (vif->dev == NULL) + goto out_free; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (vif->flags & MIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out += skb->len; + vif->dev->stats.tx_bytes += skb->len; + vif->dev->stats.tx_packets++; + ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); + goto out_free; + } +#endif + + ipv6h = ipv6_hdr(skb); + + fl6 = (struct flowi6) { + .flowi6_oif = vif->link, + .daddr = ipv6h->daddr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); + goto out_free; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. + */ + dev = vif->dev; + skb->dev = dev; + vif->pkt_out++; + vif->bytes_out += skb->len; + + /* We are about to write */ + /* XXX: extension headers? */ + if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev))) + goto out_free; + + ipv6h = ipv6_hdr(skb); + ipv6h->hop_limit--; + + IP6CB(skb)->flags |= IP6SKB_FORWARDED; + + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dev, + ip6mr_forward2_finish); + +out_free: + kfree_skb(skb); + return 0; +} + +static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev) +{ + int ct; + + for (ct = mrt->maxvif - 1; ct >= 0; ct--) { + if (mrt->vif6_table[ct].dev == dev) + break; + } + return ct; +} + +static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache) +{ + int psend = -1; + int vif, ct; + int true_vifi = ip6mr_find_vif(mrt, skb->dev); + + vif = cache->mf6c_parent; + cache->mfc_un.res.pkt++; + cache->mfc_un.res.bytes += skb->len; + + if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { + struct mfc6_cache *cache_proxy; + + /* For an (*,G) entry, we only check that the incomming + * interface is part of the static tree. + */ + cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); + if (cache_proxy && + cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + goto forward; + } + + /* + * Wrong interface: drop packet and (maybe) send PIM assert. + */ + if (mrt->vif6_table[vif].dev != skb->dev) { + cache->mfc_un.res.wrong_if++; + + if (true_vifi >= 0 && mrt->mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... --ANK + */ + (mrt->mroute_do_pim || + cache->mfc_un.res.ttls[true_vifi] < 255) && + time_after(jiffies, + cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + cache->mfc_un.res.last_assert = jiffies; + ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); + } + goto dont_forward; + } + +forward: + mrt->vif6_table[vif].pkt_in++; + mrt->vif6_table[vif].bytes_in += skb->len; + + /* + * Forward the frame + */ + if (ipv6_addr_any(&cache->mf6c_origin) && + ipv6_addr_any(&cache->mf6c_mcastgrp)) { + if (true_vifi >= 0 && + true_vifi != cache->mf6c_parent && + ipv6_hdr(skb)->hop_limit > + cache->mfc_un.res.ttls[cache->mf6c_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = cache->mf6c_parent; + goto last_forward; + } + goto dont_forward; + } + for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { + /* For (*,G) entry, don't forward to the incoming interface */ + if ((!ipv6_addr_any(&cache->mf6c_origin) || ct != true_vifi) && + ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ip6mr_forward2(net, mrt, skb2, cache, psend); + } + psend = ct; + } + } +last_forward: + if (psend != -1) { + ip6mr_forward2(net, mrt, skb, cache, psend); + return; + } + +dont_forward: + kfree_skb(skb); +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip6_mr_input(struct sk_buff *skb) +{ + struct mfc6_cache *cache; + struct net *net = dev_net(skb->dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + read_lock(&mrt_lock); + cache = ip6mr_cache_find(mrt, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + if (cache == NULL) { + int vif = ip6mr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, + &ipv6_hdr(skb)->daddr, + vif); + } + + /* + * No usable cache entry + */ + if (cache == NULL) { + int vif; + + vif = ip6mr_find_vif(mrt, skb->dev); + if (vif >= 0) { + int err = ip6mr_cache_unresolved(mrt, vif, skb); + read_unlock(&mrt_lock); + + return err; + } + read_unlock(&mrt_lock); + kfree_skb(skb); + return -ENODEV; + } + + ip6_mr_forward(net, mrt, skb, cache); + + read_unlock(&mrt_lock); + + return 0; +} + + +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm) +{ + int ct; + struct rtnexthop *nhp; + struct nlattr *mp_attr; + struct rta_mfc_stats mfcs; + + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mf6c_parent >= MAXMIFS) + return -ENOENT; + + if (MIF_EXISTS(mrt, c->mf6c_parent) && + nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) + return -EMSGSIZE; + mp_attr = nla_nest_start(skb, RTA_MULTIPATH); + if (mp_attr == NULL) + return -EMSGSIZE; + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); + if (nhp == NULL) { + nla_nest_cancel(skb, mp_attr); + return -EMSGSIZE; + } + + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + + nla_nest_end(skb, mp_attr); + + mfcs.mfcs_packets = c->mfc_un.res.pkt; + mfcs.mfcs_bytes = c->mfc_un.res.bytes; + mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0) + return -EMSGSIZE; + + rtm->rtm_type = RTN_MULTICAST; + return 1; +} + +int ip6mr_get_route(struct net *net, + struct sk_buff *skb, struct rtmsg *rtm, int nowait) +{ + int err; + struct mr6_table *mrt; + struct mfc6_cache *cache; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + read_lock(&mrt_lock); + cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); + if (!cache && skb->dev) { + int vif = ip6mr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, + vif); + } + + if (!cache) { + struct sk_buff *skb2; + struct ipv6hdr *iph; + struct net_device *dev; + int vif; + + if (nowait) { + read_unlock(&mrt_lock); + return -EAGAIN; + } + + dev = skb->dev; + if (dev == NULL || (vif = ip6mr_find_vif(mrt, dev)) < 0) { + read_unlock(&mrt_lock); + return -ENODEV; + } + + /* really correct? */ + skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb2) { + read_unlock(&mrt_lock); + return -ENOMEM; + } + + skb_reset_transport_header(skb2); + + skb_put(skb2, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb2); + + iph = ipv6_hdr(skb2); + iph->version = 0; + iph->priority = 0; + iph->flow_lbl[0] = 0; + iph->flow_lbl[1] = 0; + iph->flow_lbl[2] = 0; + iph->payload_len = 0; + iph->nexthdr = IPPROTO_NONE; + iph->hop_limit = 0; + iph->saddr = rt->rt6i_src.addr; + iph->daddr = rt->rt6i_dst.addr; + + err = ip6mr_cache_unresolved(mrt, vif, skb2); + read_unlock(&mrt_lock); + + return err; + } + + if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + cache->mfc_flags |= MFC_NOTIFY; + + err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); + read_unlock(&mrt_lock); + return err; +} + +static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mfc6_cache *c, int cmd, + int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + int err; + + nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = RTNL_FAMILY_IP6MR; + rtm->rtm_dst_len = 128; + rtm->rtm_src_len = 128; + rtm->rtm_tos = 0; + rtm->rtm_table = mrt->id; + if (nla_put_u32(skb, RTA_TABLE, mrt->id)) + goto nla_put_failure; + rtm->rtm_type = RTN_MULTICAST; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + if (c->mfc_flags & MFC_STATIC) + rtm->rtm_protocol = RTPROT_STATIC; + else + rtm->rtm_protocol = RTPROT_MROUTED; + rtm->rtm_flags = 0; + + if (nla_put(skb, RTA_SRC, 16, &c->mf6c_origin) || + nla_put(skb, RTA_DST, 16, &c->mf6c_mcastgrp)) + goto nla_put_failure; + err = __ip6mr_fill_mroute(mrt, skb, c, rtm); + /* do not break the dump if cache is unresolved */ + if (err < 0 && err != -ENOENT) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mr6_msgsize(bool unresolved, int maxvif) +{ + size_t len = + NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(4) /* RTA_TABLE */ + + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ + ; + + if (!unresolved) + len = len + + nla_total_size(4) /* RTA_IIF */ + + nla_total_size(0) /* RTA_MULTIPATH */ + + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) + /* RTA_MFC_STATS */ + + nla_total_size(sizeof(struct rta_mfc_stats)) + ; + + return len; +} + +static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, + int cmd) +{ + struct net *net = read_pnet(&mrt->net); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(mr6_msgsize(mfc->mf6c_parent >= MAXMIFS, mrt->maxvif), + GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); + if (err < 0) + goto errout; + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); + return; + +errout: + kfree_skb(skb); + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); +} + +static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mr6_table *mrt; + struct mfc6_cache *mfc; + unsigned int t = 0, s_t; + unsigned int h = 0, s_h; + unsigned int e = 0, s_e; + + s_t = cb->args[0]; + s_h = cb->args[1]; + s_e = cb->args[2]; + + read_lock(&mrt_lock); + ip6mr_for_each_table(mrt, net) { + if (t < s_t) + goto next_table; + if (t > s_t) + s_h = 0; + for (h = s_h; h < MFC6_LINES; h++) { + list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) { + if (e < s_e) + goto next_entry; + if (ip6mr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) + goto done; +next_entry: + e++; + } + e = s_e = 0; + } + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) { + if (e < s_e) + goto next_entry2; + if (ip6mr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) { + spin_unlock_bh(&mfc_unres_lock); + goto done; + } +next_entry2: + e++; + } + spin_unlock_bh(&mfc_unres_lock); + e = s_e = 0; + s_h = 0; +next_table: + t++; + } +done: + read_unlock(&mrt_lock); + + cb->args[2] = e; + cb->args[1] = h; + cb->args[0] = t; + + return skb->len; +} diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index e3dcfa2f436..d1c793cffcb 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -16,8 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ /* * [Memo] @@ -30,11 +29,13 @@ * The decompression of IP datagram MUST be done after the reassembly, * AH/ESP processing. */ + +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/module.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ipcomp.h> -#include <asm/semaphore.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/pfkeyv2.h> @@ -44,6 +45,7 @@ #include <linux/list.h> #include <linux/vmalloc.h> #include <linux/rtnetlink.h> +#include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> @@ -51,157 +53,46 @@ #include <linux/icmpv6.h> #include <linux/mutex.h> -struct ipcomp6_tfms { - struct list_head list; - struct crypto_comp **tfms; - int users; -}; - -static DEFINE_MUTEX(ipcomp6_resource_mutex); -static void **ipcomp6_scratches; -static int ipcomp6_scratch_users; -static LIST_HEAD(ipcomp6_tfms_list); - -static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int nexthdr; - int err = -ENOMEM; - struct ip_comp_hdr *ipch; - int plen, dlen; - struct ipcomp_data *ipcd = x->data; - u8 *start, *scratch; - struct crypto_comp *tfm; - int cpu; - - if (skb_linearize_cow(skb)) - goto out; - - skb->ip_summed = CHECKSUM_NONE; - - /* Remove ipcomp header and decompress original payload */ - ipch = (void *)skb->data; - nexthdr = ipch->nexthdr; - - skb->transport_header = skb->network_header + sizeof(*ipch); - __skb_pull(skb, sizeof(*ipch)); - - /* decompression */ - plen = skb->len; - dlen = IPCOMP_SCRATCH_SIZE; - start = skb->data; - - cpu = get_cpu(); - scratch = *per_cpu_ptr(ipcomp6_scratches, cpu); - tfm = *per_cpu_ptr(ipcd->tfms, cpu); - - err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); - if (err) - goto out_put_cpu; - - if (dlen < (plen + sizeof(*ipch))) { - err = -EINVAL; - goto out_put_cpu; - } - - err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); - if (err) { - goto out_put_cpu; - } - - skb->truesize += dlen - plen; - __skb_put(skb, dlen - plen); - skb_copy_to_linear_data(skb, scratch, dlen); - err = nexthdr; - -out_put_cpu: - put_cpu(); -out: - return err; -} - -static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - struct ip_comp_hdr *ipch; - struct ipcomp_data *ipcd = x->data; - int plen, dlen; - u8 *start, *scratch; - struct crypto_comp *tfm; - int cpu; - - /* check whether datagram len is larger than threshold */ - if (skb->len < ipcd->threshold) { - goto out_ok; - } - - if (skb_linearize_cow(skb)) - goto out_ok; - - /* compression */ - plen = skb->len; - dlen = IPCOMP_SCRATCH_SIZE; - start = skb->data; - - cpu = get_cpu(); - scratch = *per_cpu_ptr(ipcomp6_scratches, cpu); - tfm = *per_cpu_ptr(ipcd->tfms, cpu); - - local_bh_disable(); - err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); - local_bh_enable(); - if (err || (dlen + sizeof(*ipch)) >= plen) { - put_cpu(); - goto out_ok; - } - memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); - put_cpu(); - pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr)); - - /* insert ipcomp header and replace datagram */ - ipch = ip_comp_hdr(skb); - ipch->nexthdr = *skb_mac_header(skb); - ipch->flags = 0; - ipch->cpi = htons((u16 )ntohl(x->id.spi)); - *skb_mac_header(skb) = IPPROTO_COMP; - -out_ok: - skb_push(skb, -skb_network_offset(skb)); - - return 0; -} - -static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) +static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { + struct net *net = dev_net(skb->dev); __be32 spi; - struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct ip_comp_hdr *ipcomph = (struct ip_comp_hdr *)(skb->data + offset); struct xfrm_state *x; - if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) - return; + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; spi = htonl(ntohs(ipcomph->cpi)); - x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET6); if (!x) - return; + return 0; - printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIP6_FMT "\n", - spi, NIP6(iph->daddr)); + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { + struct net *net = xs_net(x); struct xfrm_state *t = NULL; - t = xfrm_state_alloc(); + t = xfrm_state_alloc(net); if (!t) goto out; t->id.proto = IPPROTO_IPV6; - t->id.spi = xfrm6_tunnel_alloc_spi((xfrm_address_t *)&x->props.saddr); + t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); if (!t->id.spi) goto error; @@ -210,6 +101,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t->props.family = AF_INET6; t->props.mode = x->props.mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); + memcpy(&t->mark, &x->mark, sizeof(t->mark)); if (xfrm_init_state(t)) goto error; @@ -228,13 +120,15 @@ error: static int ipcomp6_tunnel_attach(struct xfrm_state *x) { + struct net *net = xs_net(x); int err = 0; struct xfrm_state *t = NULL; __be32 spi; + u32 mark = x->mark.m & x->mark.v; - spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&x->props.saddr); + spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr); if (spi) - t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr, + t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr, spi, IPPROTO_IPV6, AF_INET6); if (!t) { t = ipcomp6_tunnel_create(x); @@ -252,160 +146,9 @@ out: return err; } -static void ipcomp6_free_scratches(void) -{ - int i; - void **scratches; - - if (--ipcomp6_scratch_users) - return; - - scratches = ipcomp6_scratches; - if (!scratches) - return; - - for_each_possible_cpu(i) { - void *scratch = *per_cpu_ptr(scratches, i); - - vfree(scratch); - } - - free_percpu(scratches); -} - -static void **ipcomp6_alloc_scratches(void) -{ - int i; - void **scratches; - - if (ipcomp6_scratch_users++) - return ipcomp6_scratches; - - scratches = alloc_percpu(void *); - if (!scratches) - return NULL; - - ipcomp6_scratches = scratches; - - for_each_possible_cpu(i) { - void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE); - if (!scratch) - return NULL; - *per_cpu_ptr(scratches, i) = scratch; - } - - return scratches; -} - -static void ipcomp6_free_tfms(struct crypto_comp **tfms) -{ - struct ipcomp6_tfms *pos; - int cpu; - - list_for_each_entry(pos, &ipcomp6_tfms_list, list) { - if (pos->tfms == tfms) - break; - } - - BUG_TRAP(pos); - - if (--pos->users) - return; - - list_del(&pos->list); - kfree(pos); - - if (!tfms) - return; - - for_each_possible_cpu(cpu) { - struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu); - crypto_free_comp(tfm); - } - free_percpu(tfms); -} - -static struct crypto_comp **ipcomp6_alloc_tfms(const char *alg_name) -{ - struct ipcomp6_tfms *pos; - struct crypto_comp **tfms; - int cpu; - - /* This can be any valid CPU ID so we don't need locking. */ - cpu = raw_smp_processor_id(); - - list_for_each_entry(pos, &ipcomp6_tfms_list, list) { - struct crypto_comp *tfm; - - tfms = pos->tfms; - tfm = *per_cpu_ptr(tfms, cpu); - - if (!strcmp(crypto_comp_name(tfm), alg_name)) { - pos->users++; - return tfms; - } - } - - pos = kmalloc(sizeof(*pos), GFP_KERNEL); - if (!pos) - return NULL; - - pos->users = 1; - INIT_LIST_HEAD(&pos->list); - list_add(&pos->list, &ipcomp6_tfms_list); - - pos->tfms = tfms = alloc_percpu(struct crypto_comp *); - if (!tfms) - goto error; - - for_each_possible_cpu(cpu) { - struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - goto error; - *per_cpu_ptr(tfms, cpu) = tfm; - } - - return tfms; - -error: - ipcomp6_free_tfms(tfms); - return NULL; -} - -static void ipcomp6_free_data(struct ipcomp_data *ipcd) -{ - if (ipcd->tfms) - ipcomp6_free_tfms(ipcd->tfms); - ipcomp6_free_scratches(); -} - -static void ipcomp6_destroy(struct xfrm_state *x) -{ - struct ipcomp_data *ipcd = x->data; - if (!ipcd) - return; - xfrm_state_delete_tunnel(x); - mutex_lock(&ipcomp6_resource_mutex); - ipcomp6_free_data(ipcd); - mutex_unlock(&ipcomp6_resource_mutex); - kfree(ipcd); - - xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); -} - static int ipcomp6_init_state(struct xfrm_state *x) { - int err; - struct ipcomp_data *ipcd; - struct xfrm_algo_desc *calg_desc; - - err = -EINVAL; - if (!x->calg) - goto out; - - if (x->encap) - goto out; + int err = -EINVAL; x->props.header_len = 0; switch (x->props.mode) { @@ -418,41 +161,24 @@ static int ipcomp6_init_state(struct xfrm_state *x) goto out; } - err = -ENOMEM; - ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); - if (!ipcd) + err = ipcomp_init_state(x); + if (err) goto out; - mutex_lock(&ipcomp6_resource_mutex); - if (!ipcomp6_alloc_scratches()) - goto error; - - ipcd->tfms = ipcomp6_alloc_tfms(x->calg->alg_name); - if (!ipcd->tfms) - goto error; - mutex_unlock(&ipcomp6_resource_mutex); - if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp6_tunnel_attach(x); if (err) - goto error_tunnel; + goto out; } - calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); - BUG_ON(!calg_desc); - ipcd->threshold = calg_desc->uinfo.comp.threshold; - x->data = ipcd; err = 0; out: return err; -error_tunnel: - mutex_lock(&ipcomp6_resource_mutex); -error: - ipcomp6_free_data(ipcd); - mutex_unlock(&ipcomp6_resource_mutex); - kfree(ipcd); +} - goto out; +static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; } static const struct xfrm_type ipcomp6_type = @@ -461,27 +187,28 @@ static const struct xfrm_type ipcomp6_type = .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp6_init_state, - .destructor = ipcomp6_destroy, - .input = ipcomp6_input, - .output = ipcomp6_output, + .destructor = ipcomp_destroy, + .input = ipcomp_input, + .output = ipcomp_output, .hdr_offset = xfrm6_find_1stfragopt, }; -static struct inet6_protocol ipcomp6_protocol = +static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, + .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init ipcomp6_init(void) { if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { - printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n"); + pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { - printk(KERN_INFO "ipcomp6 init: can't add protocol\n"); + if (xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp6_type, AF_INET6); return -EAGAIN; } @@ -490,10 +217,10 @@ static int __init ipcomp6_init(void) static void __exit ipcomp6_fini(void) { - if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) - printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n"); + if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) + pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) - printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n"); + pr_info("%s: can't remove xfrm type\n", __func__); } module_init(ipcomp6_init); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index bf2a686aa13..edb58aff4ae 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,8 +7,6 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.41 2002/02/01 22:01:04 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -16,7 +14,6 @@ * * FIXME: Make the setsockopt code POSIX compliant: That is * - * o Return -EINVAL for setsockopt of short lengths * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * @@ -33,11 +30,13 @@ #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> +#include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> @@ -52,133 +51,20 @@ #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> +#include <net/compat.h> #include <asm/uaccess.h> -DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly; - -static struct inet6_protocol *ipv6_gso_pull_exthdrs(struct sk_buff *skb, - int proto) -{ - struct inet6_protocol *ops = NULL; - - for (;;) { - struct ipv6_opt_hdr *opth; - int len; - - if (proto != NEXTHDR_HOP) { - ops = rcu_dereference(inet6_protos[proto]); - - if (unlikely(!ops)) - break; - - if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) - break; - } - - if (unlikely(!pskb_may_pull(skb, 8))) - break; - - opth = (void *)skb->data; - len = opth->hdrlen * 8 + 8; - - if (unlikely(!pskb_may_pull(skb, len))) - break; - - proto = opth->nexthdr; - __skb_pull(skb, len); - } - - return ops; -} - -static int ipv6_gso_send_check(struct sk_buff *skb) -{ - struct ipv6hdr *ipv6h; - struct inet6_protocol *ops; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - err = -EPROTONOSUPPORT; - - rcu_read_lock(); - ops = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); - if (likely(ops && ops->gso_send_check)) { - skb_reset_transport_header(skb); - err = ops->gso_send_check(skb); - } - rcu_read_unlock(); - -out: - return err; -} - -static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct ipv6hdr *ipv6h; - struct inet6_protocol *ops; - - if (!(features & NETIF_F_V6_CSUM)) - features &= ~NETIF_F_SG; - - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCPV6 | - 0))) - goto out; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - segs = ERR_PTR(-EPROTONOSUPPORT); - - rcu_read_lock(); - ops = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); - if (likely(ops && ops->gso_segment)) { - skb_reset_transport_header(skb); - segs = ops->gso_segment(skb, features); - } - rcu_read_unlock(); - - if (unlikely(IS_ERR(segs))) - goto out; - - for (skb = segs; skb; skb = skb->next) { - ipv6h = ipv6_hdr(skb); - ipv6h->payload_len = htons(skb->len - skb->mac_len - - sizeof(*ipv6h)); - } - -out: - return segs; -} - -static struct packet_type ipv6_packet_type = { - .type = __constant_htons(ETH_P_IPV6), - .func = ipv6_rcv, - .gso_send_check = ipv6_gso_send_check, - .gso_segment = ipv6_gso_segment, -}; - struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); -int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) +int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ - if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num != IPPROTO_RAW) - return -EINVAL; + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) + return -ENOPROTOOPT; new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; @@ -194,8 +80,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) *rap = ra->next; write_unlock_bh(&ip6_ra_lock); - if (ra->destructor) - ra->destructor(sk); sock_put(sk); kfree(ra); return 0; @@ -207,7 +91,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) } new_ra->sk = sk; new_ra->sel = sel; - new_ra->destructor = destructor; new_ra->next = ra; *rap = new_ra; sock_hold(sk); @@ -215,32 +98,74 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) return 0; } +static +struct ipv6_txoptions *ipv6_update_options(struct sock *sk, + struct ipv6_txoptions *opt) +{ + if (inet_sk(sk)->is_icsk) { + if (opt && + !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && + inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { + struct inet_connection_sock *icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + opt = xchg(&inet6_sk(sk)->opt, opt); + } else { + spin_lock(&sk->sk_dst_lock); + opt = xchg(&inet6_sk(sk)->opt, opt); + spin_unlock(&sk->sk_dst_lock); + } + sk_dst_reset(sk); + + return opt; +} + static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; if (optval == NULL) val=0; - else if (get_user(val, (int __user *) optval)) - return -EFAULT; + else { + if (optlen >= sizeof(int)) { + if (get_user(val, (int __user *) optval)) + return -EFAULT; + } else + val = 0; + } valbool = (val!=0); + if (ip6_mroute_opt(optname)) + return ip6_mroute_setsockopt(sk, optname, optval, optlen); + lock_sock(sk); switch (optname) { case IPV6_ADDRFORM: + if (optlen < sizeof(int)) + goto e_inval; if (val == PF_INET) { struct ipv6_txoptions *opt; struct sk_buff *pktopt; - if (sk->sk_protocol != IPPROTO_UDP && - sk->sk_protocol != IPPROTO_UDPLITE && - sk->sk_protocol != IPPROTO_TCP) + if (sk->sk_type == SOCK_RAW) + break; + + if (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_UDPLITE) { + struct udp_sock *up = udp_sk(sk); + if (up->pending == AF_INET6) { + retv = -EBUSY; + break; + } + } else if (sk->sk_protocol != IPPROTO_TCP) break; if (sk->sk_state != TCP_ESTABLISHED) { @@ -249,7 +174,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, } if (ipv6_only_sock(sk) || - !ipv6_addr_v4mapped(&np->daddr)) { + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } @@ -266,10 +191,9 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); - local_bh_disable(); - sock_prot_inuse_add(sk->sk_prot, -1); - sock_prot_inuse_add(&tcp_prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, -1); + sock_prot_inuse_add(net, &tcp_prot, 1); local_bh_enable(); sk->sk_prot = &tcp_prot; icsk->icsk_af_ops = &ipv4_specific; @@ -282,8 +206,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; local_bh_disable(); - sock_prot_inuse_add(sk->sk_prot, -1); - sock_prot_inuse_add(prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, -1); + sock_prot_inuse_add(net, prot, 1); local_bh_enable(); sk->sk_prot = prot; sk->sk_socket->ops = &inet_dgram_ops; @@ -293,8 +217,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (opt) sock_kfree_s(sk, opt, opt->tot_len); pktopt = xchg(&np->pktoptions, NULL); - if (pktopt) - kfree_skb(pktopt); + kfree_skb(pktopt); sk->sk_destruct = inet_sock_destruct; /* @@ -309,95 +232,157 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, goto e_inval; case IPV6_V6ONLY: - if (inet_sk(sk)->num) + if (optlen < sizeof(int) || + inet_sk(sk)->inet_num) goto e_inval; np->ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: + if (optlen < sizeof(int)) + goto e_inval; if (val < -1 || val > 0xff) goto e_inval; + /* RFC 3542, 6.5: default traffic class of 0x0 */ + if (val == -1) + val = 0; np->tclass = val; retv = 0; break; case IPV6_RECVTCLASS: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: + if (optlen < sizeof(int)) + goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; + case IPV6_RECVPATHMTU: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxpmtu = valbool; + retv = 0; + break; + + case IPV6_TRANSPARENT: + if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) && + !ns_capable(net->user_ns, CAP_NET_RAW)) { + retv = -EPERM; + break; + } + if (optlen < sizeof(int)) + goto e_inval; + /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ + inet_sk(sk)->transparent = valbool; + retv = 0; + break; + + case IPV6_RECVORIGDSTADDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxorigdstaddr = valbool; + retv = 0; + break; + case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; + + /* remove any sticky options header with a zero option + * length, per RFC3542. + */ if (optlen == 0) optval = NULL; + else if (optval == NULL) + goto e_inval; + else if (optlen < sizeof(struct ipv6_opt_hdr) || + optlen & 0x7 || optlen > 8 * 255) + goto e_inval; /* hop-by-hop / destination options are privileged option */ retv = -EPERM; - if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW)) - break; - - retv = -EINVAL; - if (optlen & 0x7 || optlen > 8 * 255) + if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; opt = ipv6_renew_options(sk, np->opt, optname, @@ -409,57 +394,63 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, } /* routing header option needs extra check */ + retv = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || + rthdr->segments_left != 1) + goto sticky_done; + break; #endif default: goto sticky_done; } - - if ((rthdr->hdrlen & 1) || - (rthdr->hdrlen >> 1) != rthdr->segments_left) - goto sticky_done; } retv = 0; - if (inet_sk(sk)->is_icsk) { - if (opt) { - struct inet_connection_sock *icsk = inet_csk(sk); - if (!((1 << sk->sk_state) & - (TCPF_LISTEN | TCPF_CLOSE)) - && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - icsk->icsk_ext_hdr_len = - opt->opt_flen + opt->opt_nflen; - icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); - } - } - opt = xchg(&np->opt, opt); - sk_dst_reset(sk); - } else { - write_lock(&sk->sk_dst_lock); - opt = xchg(&np->opt, opt); - write_unlock(&sk->sk_dst_lock); - sk_dst_reset(sk); - } + opt = ipv6_update_options(sk, opt); sticky_done: if (opt) sock_kfree_s(sk, opt, opt->tot_len); break; } + case IPV6_PKTINFO: + { + struct in6_pktinfo pkt; + + if (optlen == 0) + goto e_inval; + else if (optlen < sizeof(struct in6_pktinfo) || optval == NULL) + goto e_inval; + + if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) { + retv = -EFAULT; + break; + } + if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + goto e_inval; + + np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; + np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; + retv = 0; + break; + } + case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; - struct flowi fl; + struct flowi6 fl6; int junk; - fl.fl6_flowlabel = 0; - fl.oif = sk->sk_bound_dev_if; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; @@ -485,37 +476,21 @@ sticky_done: msg.msg_controllen = optlen; msg.msg_control = (void*)(opt+1); - retv = datagram_send_ctl(&msg, &fl, opt, &junk, &junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, + &junk, &junk); if (retv) goto done; update: retv = 0; - if (inet_sk(sk)->is_icsk) { - if (opt) { - struct inet_connection_sock *icsk = inet_csk(sk); - if (!((1 << sk->sk_state) & - (TCPF_LISTEN | TCPF_CLOSE)) - && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - icsk->icsk_ext_hdr_len = - opt->opt_flen + opt->opt_nflen; - icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); - } - } - opt = xchg(&np->opt, opt); - sk_dst_reset(sk); - } else { - write_lock(&sk->sk_dst_lock); - opt = xchg(&np->opt, opt); - write_unlock(&sk->sk_dst_lock); - sk_dst_reset(sk); - } - + opt = ipv6_update_options(sk, opt); done: if (opt) sock_kfree_s(sk, opt, opt->tot_len); break; } case IPV6_UNICAST_HOPS: + if (optlen < sizeof(int)) + goto e_inval; if (val > 255 || val < -1) goto e_inval; np->hop_limit = val; @@ -524,30 +499,72 @@ done: case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) + break; + if (optlen < sizeof(int)) goto e_inval; if (val > 255 || val < -1) goto e_inval; - np->mcast_hops = val; + np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); retv = 0; break; case IPV6_MULTICAST_LOOP: + if (optlen < sizeof(int)) + goto e_inval; + if (val != valbool) + goto e_inval; np->mc_loop = valbool; retv = 0; break; + case IPV6_UNICAST_IF: + { + struct net_device *dev = NULL; + int ifindex; + + if (optlen != sizeof(int)) + goto e_inval; + + ifindex = (__force int)ntohl((__force __be32)val); + if (ifindex == 0) { + np->ucast_oif = 0; + retv = 0; + break; + } + + dev = dev_get_by_index(net, ifindex); + retv = -EADDRNOTAVAIL; + if (!dev) + break; + dev_put(dev); + + retv = -EINVAL; + if (sk->sk_bound_dev_if) + break; + + np->ucast_oif = ifindex; + retv = 0; + break; + } + case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) + break; + if (optlen < sizeof(int)) goto e_inval; if (val) { + struct net_device *dev; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) goto e_inval; - if (__dev_get_by_index(&init_net, val) == NULL) { + dev = dev_get_by_index(net, val); + if (!dev) { retv = -ENODEV; break; } + dev_put(dev); } np->mcast_oif = val; retv = 0; @@ -557,6 +574,9 @@ done: { struct ipv6_mreq mreq; + if (optlen < sizeof(struct ipv6_mreq)) + goto e_inval; + retv = -EPROTO; if (inet_sk(sk)->is_icsk) break; @@ -576,7 +596,7 @@ done: { struct ipv6_mreq mreq; - if (optlen != sizeof(struct ipv6_mreq)) + if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; @@ -595,6 +615,9 @@ done: struct group_req greq; struct sockaddr_in6 *psin6; + if (optlen < sizeof(struct group_req)) + goto e_inval; + retv = -EFAULT; if (copy_from_user(&greq, optval, sizeof(struct group_req))) break; @@ -619,7 +642,7 @@ done: struct group_source_req greqs; int omode, add; - if (optlen != sizeof(struct group_source_req)) + if (optlen < sizeof(struct group_source_req)) goto e_inval; if (copy_from_user(&greqs, optval, sizeof(greqs))) { retv = -EFAULT; @@ -656,7 +679,6 @@ done: } case MCAST_MSFILTER: { - extern int sysctl_mld_max_msf; struct group_filter *gsf; if (optlen < GROUP_FILTER_SIZE(0)) @@ -693,27 +715,37 @@ done: break; } case IPV6_ROUTER_ALERT: - retv = ip6_ra_control(sk, val, NULL); + if (optlen < sizeof(int)) + goto e_inval; + retv = ip6_ra_control(sk, val); break; case IPV6_MTU_DISCOVER: - if (val<0 || val>3) + if (optlen < sizeof(int)) + goto e_inval; + if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) goto e_inval; np->pmtudisc = val; retv = 0; break; case IPV6_MTU: + if (optlen < sizeof(int)) + goto e_inval; if (val && val < IPV6_MIN_MTU) goto e_inval; np->frag_size = val; retv = 0; break; case IPV6_RECVERR: + if (optlen < sizeof(int)) + goto e_inval; np->recverr = valbool; if (!val) skb_queue_purge(&sk->sk_error_queue); retv = 0; break; case IPV6_FLOWINFO_SEND: + if (optlen < sizeof(int)) + goto e_inval; np->sndflow = valbool; retv = 0; break; @@ -723,12 +755,87 @@ done: case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; + case IPV6_ADDR_PREFERENCES: + { + unsigned int pref = 0; + unsigned int prefmask = ~0; + + if (optlen < sizeof(int)) + goto e_inval; + + retv = -EINVAL; + + /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ + switch (val & (IPV6_PREFER_SRC_PUBLIC| + IPV6_PREFER_SRC_TMP| + IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { + case IPV6_PREFER_SRC_PUBLIC: + pref |= IPV6_PREFER_SRC_PUBLIC; + break; + case IPV6_PREFER_SRC_TMP: + pref |= IPV6_PREFER_SRC_TMP; + break; + case IPV6_PREFER_SRC_PUBTMP_DEFAULT: + break; + case 0: + goto pref_skip_pubtmp; + default: + goto e_inval; + } + + prefmask &= ~(IPV6_PREFER_SRC_PUBLIC| + IPV6_PREFER_SRC_TMP); +pref_skip_pubtmp: + + /* check HOME/COA conflicts */ + switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { + case IPV6_PREFER_SRC_HOME: + break; + case IPV6_PREFER_SRC_COA: + pref |= IPV6_PREFER_SRC_COA; + case 0: + goto pref_skip_coa; + default: + goto e_inval; + } + + prefmask &= ~IPV6_PREFER_SRC_COA; +pref_skip_coa: + + /* check CGA/NONCGA conflicts */ + switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { + case IPV6_PREFER_SRC_CGA: + case IPV6_PREFER_SRC_NONCGA: + case 0: + break; + default: + goto e_inval; + } + + np->srcprefs = (np->srcprefs & prefmask) | pref; + retv = 0; + + break; + } + case IPV6_MINHOPCOUNT: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + np->min_hopcount = val; + retv = 0; + break; + case IPV6_DONTFRAG: + np->dontfrag = valbool; + retv = 0; + break; } + release_sock(sk); return retv; @@ -739,7 +846,7 @@ e_inval: } int ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { int err; @@ -767,7 +874,7 @@ EXPORT_SYMBOL(ipv6_setsockopt); #ifdef CONFIG_COMPAT int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { int err; @@ -781,6 +888,10 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; + if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + return compat_mc_setsockopt(sk, level, optname, optval, optlen, + ipv6_setsockopt); + err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ @@ -829,16 +940,19 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_user(optval, hdr, len)) return -EFAULT; - return ipv6_optlen(hdr); + return len; } static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen, unsigned int flags) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; + if (ip6_mroute_opt(optname)) + return ip6_mroute_getsockopt(sk, optname, optval, optlen); + if (get_user(len, optlen)) return -EFAULT; switch (optname) { @@ -846,7 +960,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) - return -EINVAL; + return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; @@ -860,6 +974,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -EINVAL; if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) return -EFAULT; + if (gsf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, (struct group_filter __user *)optval, optlen); @@ -877,7 +993,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, msg.msg_control = optval; msg.msg_controllen = len; - msg.msg_flags = 0; + msg.msg_flags = flags; lock_sock(sk); skb = np->pktoptions; @@ -886,31 +1002,42 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, release_sock(sk); if (skb) { - int err = datagram_recv_ctl(sk, &msg, skb); + ip6_datagram_recv_ctl(sk, &msg, skb); kfree_skb(skb); - if (err) - return err; } else { if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; - src_info.ipi6_ifindex = np->mcast_oif; - ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr); + src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + np->sticky_pktinfo.ipi6_ifindex; + src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = np->mcast_hops; put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } + if (np->rxopt.bits.rxtclass) { + int tclass = (int)ip6_tclass(np->rcv_flowinfo); + + put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); + } if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; - src_info.ipi6_ifindex = np->mcast_oif; - ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr); + src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + np->sticky_pktinfo.ipi6_ifindex; + src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : + np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = np->mcast_hops; put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } + if (np->rxopt.bits.rxflow) { + __be32 flowinfo = np->rcv_flowinfo; + + put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + } } len -= msg.msg_controllen; return put_user(len, optlen); @@ -918,14 +1045,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_MTU: { struct dst_entry *dst; + val = 0; - lock_sock(sk); - dst = sk_dst_get(sk); - if (dst) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) val = dst_mtu(dst); - dst_release(dst); - } - release_sock(sk); + rcu_read_unlock(); if (!val) return -ENOTCONN; break; @@ -969,6 +1095,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, len = ipv6_getsockopt_sticky(sk, np->opt, optname, optval, len); release_sock(sk); + /* check if ipv6_getsockopt_sticky() returns err code */ + if (len < 0) + return len; return put_user(len, optlen); } @@ -990,8 +1119,6 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_TCLASS: val = np->tclass; - if (val < 0) - val = 0; break; case IPV6_RECVTCLASS: @@ -1002,6 +1129,46 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rxopt.bits.rxflow; break; + case IPV6_RECVPATHMTU: + val = np->rxopt.bits.rxpmtu; + break; + + case IPV6_PATHMTU: + { + struct dst_entry *dst; + struct ip6_mtuinfo mtuinfo; + + if (len < sizeof(mtuinfo)) + return -EINVAL; + + len = sizeof(mtuinfo); + memset(&mtuinfo, 0, sizeof(mtuinfo)); + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + mtuinfo.ip6m_mtu = dst_mtu(dst); + rcu_read_unlock(); + if (!mtuinfo.ip6m_mtu) + return -ENOTCONN; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &mtuinfo, len)) + return -EFAULT; + + return 0; + break; + } + + case IPV6_TRANSPARENT: + val = inet_sk(sk)->transparent; + break; + + case IPV6_RECVORIGDSTADDR: + val = np->rxopt.bits.rxorigdstaddr; + break; + case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { @@ -1012,16 +1179,16 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, else val = np->mcast_hops; - dst = sk_dst_get(sk); - if (dst) { - if (val < 0) - val = dst_metric(dst, RTAX_HOPLIMIT); - if (val < 0) - val = ipv6_get_hoplimit(dst->dev); - dst_release(dst); + if (val < 0) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + val = ip6_dst_hoplimit(dst); + rcu_read_unlock(); } + if (val < 0) - val = ipv6_devconf.hop_limit; + val = sock_net(sk)->ipv6.devconf_all->hop_limit; break; } @@ -1033,6 +1200,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->mcast_oif; break; + case IPV6_UNICAST_IF: + val = (__force int)htonl((__u32) np->ucast_oif); + break; + case IPV6_MTU_DISCOVER: val = np->pmtudisc; break; @@ -1045,6 +1216,63 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->sndflow; break; + case IPV6_FLOWLABEL_MGR: + { + struct in6_flowlabel_req freq; + int flags; + + if (len < sizeof(freq)) + return -EINVAL; + + if (copy_from_user(&freq, optval, sizeof(freq))) + return -EFAULT; + + if (freq.flr_action != IPV6_FL_A_GET) + return -EINVAL; + + len = sizeof(freq); + flags = freq.flr_flags; + + memset(&freq, 0, sizeof(freq)); + + val = ipv6_flowlabel_opt_get(sk, &freq, flags); + if (val < 0) + return val; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &freq, len)) + return -EFAULT; + + return 0; + } + + case IPV6_ADDR_PREFERENCES: + val = 0; + + if (np->srcprefs & IPV6_PREFER_SRC_TMP) + val |= IPV6_PREFER_SRC_TMP; + else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC) + val |= IPV6_PREFER_SRC_PUBLIC; + else { + /* XXX: should we return system default? */ + val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; + } + + if (np->srcprefs & IPV6_PREFER_SRC_COA) + val |= IPV6_PREFER_SRC_COA; + else + val |= IPV6_PREFER_SRC_HOME; + break; + + case IPV6_MINHOPCOUNT: + val = np->min_hopcount; + break; + + case IPV6_DONTFRAG: + val = np->dontfrag; + break; + default: return -ENOPROTOOPT; } @@ -1067,7 +1295,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if(level != SOL_IPV6) return -ENOPROTOOPT; - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen); + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { @@ -1105,7 +1333,12 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen); + if (optname == MCAST_MSFILTER) + return compat_mc_getsockopt(sk, level, optname, optval, optlen, + ipv6_getsockopt); + + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, + MSG_CMSG_COMPAT); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { @@ -1128,13 +1361,3 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_ipv6_getsockopt); #endif -int __init ipv6_packet_init(void) -{ - dev_add_pack(&ipv6_packet_type); - return 0; -} - -void ipv6_packet_cleanup(void) -{ - dev_remove_pack(&ipv6_packet_type); -} diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index ab228d1ea11..617f0958e16 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,8 +5,6 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: mcast.c,v 1.40 2002/02/08 03:57:19 davem Exp $ - * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * * This program is free software; you can redistribute it and/or @@ -45,6 +43,9 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/pkt_sched.h> +#include <net/mld.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -59,6 +60,7 @@ #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip6_route.h> +#include <net/inet_common.h> #include <net/ip6_checksum.h> @@ -71,64 +73,17 @@ #define MDBG(x) #endif -/* - * These header formats should be in a separate include file, but icmpv6.h - * doesn't have in6_addr defined in all cases, there is no __u128, and no - * other files reference these. - * - * +-DLS 4/14/03 - */ - -/* Multicast Listener Discovery version 2 headers */ - -struct mld2_grec { - __u8 grec_type; - __u8 grec_auxwords; - __be16 grec_nsrcs; - struct in6_addr grec_mca; - struct in6_addr grec_src[0]; -}; - -struct mld2_report { - __u8 type; - __u8 resv1; - __sum16 csum; - __be16 resv2; - __be16 ngrec; - struct mld2_grec grec[0]; -}; - -struct mld2_query { - __u8 type; - __u8 code; - __sum16 csum; - __be16 mrc; - __be16 resv1; - struct in6_addr mca; -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 qrv:3, - suppress:1, - resv2:4; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 resv2:4, - suppress:1, - qrv:3; -#else -#error "Please fix <asm/byteorder.h>" -#endif - __u8 qqic; - __be16 nsrcs; - struct in6_addr srcs[0]; +/* Ensure that we have struct in6_addr aligned on 32bit word. */ +static void *__mld2_query_bugs[] __attribute__((__unused__)) = { + BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4) }; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; /* Big mc list lock for all the sockets */ -static DEFINE_RWLOCK(ipv6_sk_mc_lock); - -static struct socket *igmp6_socket; - -int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr); +static DEFINE_SPINLOCK(ipv6_sk_mc_lock); static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); @@ -138,37 +93,30 @@ static void mld_gq_timer_expire(unsigned long data); static void mld_ifc_timer_expire(unsigned long data); static void mld_ifc_event(struct inet6_dev *idev); static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); -static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *addr); +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); static void mld_clear_delrec(struct inet6_dev *idev); +static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); -static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); -static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); - -#define IGMP6_UNSOLICITED_IVAL (10*HZ) #define MLD_QRV_DEFAULT 2 +/* RFC3810, 9.2. Query Interval */ +#define MLD_QI_DEFAULT (125 * HZ) +/* RFC3810, 9.3. Query Response Interval */ +#define MLD_QRI_DEFAULT (10 * HZ) -#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \ - (idev)->cnf.force_mld_version == 1 || \ - ((idev)->mc_v1_seen && \ - time_before(jiffies, (idev)->mc_v1_seen))) - -#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) -#define MLDV2_EXP(thresh, nbmant, nbexp, value) \ - ((value) < (thresh) ? (value) : \ - ((MLDV2_MASK(value, nbmant) | (1<<(nbmant))) << \ - (MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp)))) - -#define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value) -#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) +/* RFC3810, 8.1 Query Version Distinctions */ +#define MLD_V1_QUERY_LEN 24 +#define MLD_V2_QUERY_LEN_MIN 28 #define IPV6_MLD_MAX_MSF 64 @@ -178,25 +126,43 @@ int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; * socket join on multicast group */ -int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) +#define for_each_pmc_rcu(np, pmc) \ + for (pmc = rcu_dereference(np->ipv6_mc_list); \ + pmc != NULL; \ + pmc = rcu_dereference(pmc->next)) + +static int unsolicited_report_interval(struct inet6_dev *idev) +{ + int iv; + + if (mld_in_v1_mode(idev)) + iv = idev->cnf.mldv1_unsolicited_report_interval; + else + iv = idev->cnf.mldv2_unsolicited_report_interval; + + return iv > 0 ? iv : 1; +} + +int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); int err; if (!ipv6_addr_is_multicast(addr)) return -EINVAL; - read_lock_bh(&ipv6_sk_mc_lock); - for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) { + rcu_read_lock(); + for_each_pmc_rcu(np, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { - read_unlock_bh(&ipv6_sk_mc_lock); + rcu_read_unlock(); return -EADDRINUSE; } } - read_unlock_bh(&ipv6_sk_mc_lock); + rcu_read_unlock(); mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); @@ -204,20 +170,21 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) return -ENOMEM; mc_lst->next = NULL; - ipv6_addr_copy(&mc_lst->addr, addr); + mc_lst->addr = *addr; + rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, 0); if (rt) { - dev = rt->rt6i_dev; - dev_hold(dev); - dst_release(&rt->u.dst); + dev = rt->dst.dev; + ip6_rt_put(rt); } } else - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (dev == NULL) { + rcu_read_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -234,17 +201,17 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) err = ipv6_dev_mc_inc(dev, addr); if (err) { + rcu_read_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); - dev_put(dev); return err; } - write_lock_bh(&ipv6_sk_mc_lock); + spin_lock(&ipv6_sk_mc_lock); mc_lst->next = np->ipv6_mc_list; - np->ipv6_mc_list = mc_lst; - write_unlock_bh(&ipv6_sk_mc_lock); + rcu_assign_pointer(np->ipv6_mc_list, mc_lst); + spin_unlock(&ipv6_sk_mc_lock); - dev_put(dev); + rcu_read_unlock(); return 0; } @@ -252,69 +219,75 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) /* * socket leave on multicast group */ -int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_mc_socklist *mc_lst, **lnk; + struct ipv6_mc_socklist *mc_lst; + struct ipv6_mc_socklist __rcu **lnk; + struct net *net = sock_net(sk); - write_lock_bh(&ipv6_sk_mc_lock); - for (lnk = &np->ipv6_mc_list; (mc_lst = *lnk) !=NULL ; lnk = &mc_lst->next) { + if (!ipv6_addr_is_multicast(addr)) + return -EINVAL; + + spin_lock(&ipv6_sk_mc_lock); + for (lnk = &np->ipv6_mc_list; + (mc_lst = rcu_dereference_protected(*lnk, + lockdep_is_held(&ipv6_sk_mc_lock))) !=NULL ; + lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { struct net_device *dev; *lnk = mc_lst->next; - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); - if ((dev = dev_get_by_index(&init_net, mc_lst->ifindex)) != NULL) { - struct inet6_dev *idev = in6_dev_get(dev); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + if (dev != NULL) { + struct inet6_dev *idev = __in6_dev_get(dev); (void) ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) { + if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - in6_dev_put(idev); - } - dev_put(dev); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); - sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + rcu_read_unlock(); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); return 0; } } - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); return -EADDRNOTAVAIL; } -static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) +/* called with rcu_read_lock() */ +static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, + const struct in6_addr *group, + int ifindex) { struct net_device *dev = NULL; struct inet6_dev *idev = NULL; if (ifindex == 0) { - struct rt6_info *rt; + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); - rt = rt6_lookup(group, NULL, 0, 0); if (rt) { - dev = rt->rt6i_dev; - dev_hold(dev); - dst_release(&rt->u.dst); + dev = rt->dst.dev; + ip6_rt_put(rt); } } else - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (!dev) return NULL; - idev = in6_dev_get(dev); - if (!idev) { - dev_put(dev); + idev = __in6_dev_get(dev); + if (!idev) return NULL; - } read_lock_bh(&idev->lock); if (idev->dead) { read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); return NULL; } return idev; @@ -324,32 +297,37 @@ void ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; + struct net *net = sock_net(sk); + + if (!rcu_access_pointer(np->ipv6_mc_list)) + return; - write_lock_bh(&ipv6_sk_mc_lock); - while ((mc_lst = np->ipv6_mc_list) != NULL) { + spin_lock(&ipv6_sk_mc_lock); + while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, + lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); - dev = dev_get_by_index(&init_net, mc_lst->ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); if (dev) { - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get(dev); (void) ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) { + if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - in6_dev_put(idev); - } - dev_put(dev); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); + rcu_read_unlock(); - sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); - write_lock_bh(&ipv6_sk_mc_lock); + spin_lock(&ipv6_sk_mc_lock); } - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); } int ip6_mc_source(int add, int omode, struct sock *sk, @@ -357,34 +335,31 @@ int ip6_mc_source(int add, int omode, struct sock *sk, { struct in6_addr *source, *group; struct ipv6_mc_socklist *pmc; - struct net_device *dev; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; + struct net *net = sock_net(sk); int i, j, rv; int leavegroup = 0; int pmclocked = 0; int err; - if (pgsr->gsr_group.ss_family != AF_INET6 || - pgsr->gsr_source.ss_family != AF_INET6) - return -EINVAL; - source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev(group, pgsr->gsr_interface); - if (!idev) + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface); + if (!idev) { + rcu_read_unlock(); return -ENODEV; - dev = idev->dev; + } err = -EADDRNOTAVAIL; - read_lock_bh(&ipv6_sk_mc_lock); - for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + for_each_pmc_rcu(inet6, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -407,7 +382,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, pmc->sfmode = omode; } - write_lock_bh(&pmc->sflock); + write_lock(&pmc->sflock); pmclocked = 1; psl = pmc->sflist; @@ -416,8 +391,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i=0; i<psl->sl_count; i++) { - rv = memcmp(&psl->sl_addr[i], source, - sizeof(struct in6_addr)); + rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; } @@ -467,12 +441,10 @@ int ip6_mc_source(int add, int omode, struct sock *sk, } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; i<psl->sl_count; i++) { - rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr)); - if (rv == 0) - break; + rv = !ipv6_addr_equal(&psl->sl_addr[i], source); + if (rv == 0) /* There is an error in the address. */ + goto done; } - if (rv == 0) /* address already there is an error */ - goto done; for (j=psl->sl_count-1; j>=i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; @@ -482,11 +454,9 @@ int ip6_mc_source(int add, int omode, struct sock *sk, ip6_mc_add_src(idev, group, omode, 1, source, 1); done: if (pmclocked) - write_unlock_bh(&pmc->sflock); - read_unlock_bh(&ipv6_sk_mc_lock); + write_unlock(&pmc->sflock); read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); if (leavegroup) return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; @@ -494,12 +464,12 @@ done: int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) { - struct in6_addr *group; + const struct in6_addr *group; struct ipv6_mc_socklist *pmc; - struct net_device *dev; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; + struct net *net = sock_net(sk); int leavegroup = 0; int i, err; @@ -511,21 +481,22 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - idev = ip6_mc_find_dev(group, gsf->gf_interface); + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - if (!idev) + if (!idev) { + rcu_read_unlock(); return -ENODEV; - dev = idev->dev; + } err = 0; - read_lock_bh(&ipv6_sk_mc_lock); if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; } - for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + for_each_pmc_rcu(inet6, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -560,7 +531,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); } - write_lock_bh(&pmc->sflock); + write_lock(&pmc->sflock); psl = pmc->sflist; if (psl) { (void) ip6_mc_del_src(idev, group, pmc->sfmode, @@ -570,13 +541,11 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = gsf->gf_fmode; - write_unlock_bh(&pmc->sflock); + write_unlock(&pmc->sflock); err = 0; done: - read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; @@ -586,24 +555,25 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, struct group_filter __user *optval, int __user *optlen) { int err, i, count, copycount; - struct in6_addr *group; + const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; - struct net_device *dev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; + struct net *net = sock_net(sk); group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev(group, gsf->gf_interface); + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - if (!idev) + if (!idev) { + rcu_read_unlock(); return -ENODEV; - - dev = idev->dev; + } err = -EADDRNOTAVAIL; /* @@ -612,7 +582,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, * so reading the list is safe. */ - for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + for_each_pmc_rcu(inet6, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) @@ -624,8 +594,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, psl = pmc->sflist; count = psl ? psl->sl_count : 0; read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; @@ -651,27 +620,26 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, return 0; done: read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); return err; } -int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, - struct in6_addr *src_addr) +bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, + const struct in6_addr *src_addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc; struct ip6_sf_socklist *psl; - int rv = 1; + bool rv = true; - read_lock(&ipv6_sk_mc_lock); - for (mc = np->ipv6_mc_list; mc; mc = mc->next) { + rcu_read_lock(); + for_each_pmc_rcu(np, mc) { if (ipv6_addr_equal(&mc->addr, mc_addr)) break; } if (!mc) { - read_unlock(&ipv6_sk_mc_lock); - return 1; + rcu_read_unlock(); + return true; } read_lock(&mc->sflock); psl = mc->sflist; @@ -685,12 +653,12 @@ int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, break; } if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) - rv = 0; + rv = false; if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) - rv = 0; + rv = false; } read_unlock(&mc->sflock); - read_unlock(&ipv6_sk_mc_lock); + rcu_read_unlock(); return rv; } @@ -708,18 +676,22 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + return; + spin_lock_bh(&mc->mca_lock); if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) - dev_mc_add(dev, buf, dev->addr_len, 0); + dev_mc_add(dev, buf); } spin_unlock_bh(&mc->mca_lock); if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; - if (MLD_V1_SEEN(mc->idev)) { + if (mld_in_v1_mode(mc->idev)) { igmp6_join_group(mc); return; } @@ -734,11 +706,15 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + return; + spin_lock_bh(&mc->mca_lock); if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) - dev_mc_delete(dev, buf, dev->addr_len, 0); + dev_mc_del(dev, buf); } if (mc->mca_flags & MAF_NOREPORT) @@ -791,18 +767,18 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) } spin_unlock_bh(&im->mca_lock); - write_lock_bh(&idev->mc_lock); + spin_lock_bh(&idev->mc_lock); pmc->next = idev->mc_tomb; idev->mc_tomb = pmc; - write_unlock_bh(&idev->mc_lock); + spin_unlock_bh(&idev->mc_lock); } -static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *pmca) +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) { struct ifmcaddr6 *pmc, *pmc_prev; struct ip6_sf_list *psf, *psf_next; - write_lock_bh(&idev->mc_lock); + spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) @@ -815,7 +791,8 @@ static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *pmca) else idev->mc_tomb = pmc->next; } - write_unlock_bh(&idev->mc_lock); + spin_unlock_bh(&idev->mc_lock); + if (pmc) { for (psf=pmc->mca_tomb; psf; psf=psf_next) { psf_next = psf->sf_next; @@ -830,10 +807,10 @@ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; - write_lock_bh(&idev->mc_lock); + spin_lock_bh(&idev->mc_lock); pmc = idev->mc_tomb; idev->mc_tomb = NULL; - write_unlock_bh(&idev->mc_lock); + spin_unlock_bh(&idev->mc_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; @@ -863,11 +840,12 @@ static void mld_clear_delrec(struct inet6_dev *idev) /* * device multicast group inc (add if not found) */ -int ipv6_dev_mc_inc(struct net_device *dev, struct in6_addr *addr) +int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) { struct ifmcaddr6 *mc; struct inet6_dev *idev; + /* we need to take a reference on idev */ idev = in6_dev_get(dev); if (idev == NULL) @@ -905,8 +883,8 @@ int ipv6_dev_mc_inc(struct net_device *dev, struct in6_addr *addr) setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); - ipv6_addr_copy(&mc->mca_addr, addr); - mc->idev = idev; + mc->mca_addr = *addr; + mc->idev = idev; /* (reference taken) */ mc->mca_users = 1; /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; @@ -934,7 +912,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, struct in6_addr *addr) /* * device multicast group del */ -int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr) +int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, **map; @@ -959,59 +937,35 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr) return -ENOENT; } -int ipv6_dev_mc_dec(struct net_device *dev, struct in6_addr *addr) +int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) { - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev; int err; - if (!idev) - return -ENODEV; - - err = __ipv6_dev_mc_dec(idev, addr); + rcu_read_lock(); - in6_dev_put(idev); + idev = __in6_dev_get(dev); + if (!idev) + err = -ENODEV; + else + err = __ipv6_dev_mc_dec(idev, addr); + rcu_read_unlock(); return err; } /* - * identify MLD packets for MLD filter exceptions - */ -int ipv6_is_mld(struct sk_buff *skb, int nexthdr) -{ - struct icmp6hdr *pic; - - if (nexthdr != IPPROTO_ICMPV6) - return 0; - - if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) - return 0; - - pic = icmp6_hdr(skb); - - switch (pic->icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - return 1; - default: - break; - } - return 0; -} - -/* * check if the interface/address pair is valid */ -int ipv6_chk_mcast_addr(struct net_device *dev, struct in6_addr *group, - struct in6_addr *src_addr) +bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, + const struct in6_addr *src_addr) { struct inet6_dev *idev; struct ifmcaddr6 *mc; - int rv = 0; + bool rv = false; - idev = in6_dev_get(dev); + rcu_read_lock(); + idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); for (mc = idev->mc_list; mc; mc=mc->next) { @@ -1035,31 +989,59 @@ int ipv6_chk_mcast_addr(struct net_device *dev, struct in6_addr *group, rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; spin_unlock_bh(&mc->mca_lock); } else - rv = 1; /* don't filter unspecified source */ + rv = true; /* don't filter unspecified source */ } read_unlock_bh(&idev->lock); - in6_dev_put(idev); } + rcu_read_unlock(); return rv; } static void mld_gq_start_timer(struct inet6_dev *idev) { - int tv = net_random() % idev->mc_maxdelay; + unsigned long tv = prandom_u32() % idev->mc_maxdelay; idev->mc_gq_running = 1; if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) in6_dev_hold(idev); } -static void mld_ifc_start_timer(struct inet6_dev *idev, int delay) +static void mld_gq_stop_timer(struct inet6_dev *idev) +{ + idev->mc_gq_running = 0; + if (del_timer(&idev->mc_gq_timer)) + __in6_dev_put(idev); +} + +static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay) { - int tv = net_random() % delay; + unsigned long tv = prandom_u32() % delay; if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) in6_dev_hold(idev); } +static void mld_ifc_stop_timer(struct inet6_dev *idev) +{ + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); +} + +static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay) +{ + unsigned long tv = prandom_u32() % delay; + + if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +static void mld_dad_stop_timer(struct inet6_dev *idev) +{ + if (del_timer(&idev->mc_dad_timer)) + __in6_dev_put(idev); +} + /* * IGMP handling (alias multicast ICMPv6 messages) */ @@ -1078,12 +1060,9 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) delay = ma->mca_timer.expires - jiffies; } - if (delay >= resptime) { - if (resptime) - delay = net_random() % resptime; - else - delay = 1; - } + if (delay >= resptime) + delay = prandom_u32() % resptime; + ma->mca_timer.expires = jiffies + delay; if (!mod_timer(&ma->mca_timer, jiffies + delay)) atomic_inc(&ma->mca_refcnt); @@ -1091,8 +1070,8 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) } /* mark EXCLUDE-mode sources */ -static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, - struct in6_addr *srcs) +static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; @@ -1103,10 +1082,10 @@ static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, break; for (i=0; i<nsrcs; i++) { /* skip inactive filters */ - if (pmc->mca_sfcount[MCAST_INCLUDE] || + if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) - continue; + break; if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { scount++; break; @@ -1115,12 +1094,12 @@ static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, } pmc->mca_flags &= ~MAF_GSQUERY; if (scount == nsrcs) /* all sources excluded */ - return 0; - return 1; + return false; + return true; } -static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, - struct in6_addr *srcs) +static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; @@ -1144,23 +1123,176 @@ static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, } if (!scount) { pmc->mca_flags &= ~MAF_GSQUERY; - return 0; + return false; } pmc->mca_flags |= MAF_GSQUERY; - return 1; + return true; +} + +static int mld_force_mld_version(const struct inet6_dev *idev) +{ + /* Normally, both are 0 here. If enforcement to a particular is + * being used, individual device enforcement will have a lower + * precedence over 'all' device (.../conf/all/force_mld_version). + */ + + if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0) + return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version; + else + return idev->cnf.force_mld_version; +} + +static bool mld_in_v2_mode_only(const struct inet6_dev *idev) +{ + return mld_force_mld_version(idev) == 2; +} + +static bool mld_in_v1_mode_only(const struct inet6_dev *idev) +{ + return mld_force_mld_version(idev) == 1; +} + +static bool mld_in_v1_mode(const struct inet6_dev *idev) +{ + if (mld_in_v2_mode_only(idev)) + return false; + if (mld_in_v1_mode_only(idev)) + return true; + if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen)) + return true; + + return false; +} + +static void mld_set_v1_mode(struct inet6_dev *idev) +{ + /* RFC3810, relevant sections: + * - 9.1. Robustness Variable + * - 9.2. Query Interval + * - 9.3. Query Response Interval + * - 9.12. Older Version Querier Present Timeout + */ + unsigned long switchback; + + switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; + + idev->mc_v1_seen = jiffies + switchback; +} + +static void mld_update_qrv(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.8. QRV (Querier's Robustness Variable) + * - 9.1. Robustness Variable + */ + + /* The value of the Robustness Variable MUST NOT be zero, + * and SHOULD NOT be one. Catch this here if we ever run + * into such a case in future. + */ + WARN_ON(idev->mc_qrv == 0); + + if (mlh2->mld2q_qrv > 0) + idev->mc_qrv = mlh2->mld2q_qrv; + + if (unlikely(idev->mc_qrv < 2)) { + net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", + idev->mc_qrv, MLD_QRV_DEFAULT); + idev->mc_qrv = MLD_QRV_DEFAULT; + } +} + +static void mld_update_qi(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.9. QQIC (Querier's Query Interval Code) + * - 9.2. Query Interval + * - 9.12. Older Version Querier Present Timeout + * (the [Query Interval] in the last Query received) + */ + unsigned long mc_qqi; + + if (mlh2->mld2q_qqic < 128) { + mc_qqi = mlh2->mld2q_qqic; + } else { + unsigned long mc_man, mc_exp; + + mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); + mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); + + mc_qqi = (mc_man | 0x10) << (mc_exp + 3); + } + + idev->mc_qi = mc_qqi * HZ; +} + +static void mld_update_qri(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.3. Maximum Response Code + * - 9.3. Query Response Interval + */ + idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); +} + +static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, + unsigned long *max_delay) +{ + unsigned long mldv1_md; + + /* Ignore v1 queries */ + if (mld_in_v2_mode_only(idev)) + return -EINVAL; + + /* MLDv1 router present */ + mldv1_md = ntohs(mld->mld_maxdelay); + *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); + + mld_set_v1_mode(idev); + + /* cancel MLDv2 report timer */ + mld_gq_stop_timer(idev); + /* cancel the interface change timer */ + mld_ifc_stop_timer(idev); + /* clear deleted report items */ + mld_clear_delrec(idev); + + return 0; +} + +static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, + unsigned long *max_delay) +{ + /* hosts need to stay in MLDv1 mode, discard MLDv2 queries */ + if (mld_in_v1_mode(idev)) + return -EINVAL; + + *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); + + mld_update_qrv(idev, mld); + mld_update_qi(idev, mld); + mld_update_qri(idev, mld); + + idev->mc_maxdelay = *max_delay; + + return 0; } +/* called with rcu_read_lock() */ int igmp6_event_query(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; struct ifmcaddr6 *ma; - struct in6_addr *group; + const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; - struct icmp6hdr *hdr; + struct mld_msg *mld; int group_type; int mark = 0; - int len; + int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) return -EINVAL; @@ -1169,78 +1301,66 @@ int igmp6_event_query(struct sk_buff *skb) len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); - /* Drop queries with not link local source */ - if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + /* RFC3810 6.2 + * Upon reception of an MLD message that contains a Query, the node + * checks if the source address of the message is a valid link-local + * address, if the Hop Limit is set to 1, and if the Router Alert + * option is present in the Hop-By-Hop Options header of the IPv6 + * packet. If any of these checks fails, the packet is dropped. + */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || + ipv6_hdr(skb)->hop_limit != 1 || + !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || + IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) return -EINVAL; - idev = in6_dev_get(skb->dev); - + idev = __in6_dev_get(skb->dev); if (idev == NULL) return 0; - hdr = icmp6_hdr(skb); - group = (struct in6_addr *) (hdr + 1); + mld = (struct mld_msg *)icmp6_hdr(skb); + group = &mld->mld_mca; group_type = ipv6_addr_type(group); if (group_type != IPV6_ADDR_ANY && - !(group_type&IPV6_ADDR_MULTICAST)) { - in6_dev_put(idev); + !(group_type&IPV6_ADDR_MULTICAST)) return -EINVAL; - } - - if (len == 24) { - int switchback; - /* MLDv1 router present */ - - /* Translate milliseconds to jiffies */ - max_delay = (ntohs(hdr->icmp6_maxdelay)*HZ)/1000; - switchback = (idev->mc_qrv + 1) * max_delay; - idev->mc_v1_seen = jiffies + switchback; - - /* cancel the interface change timer */ - idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) - __in6_dev_put(idev); - /* clear deleted report items */ - mld_clear_delrec(idev); - } else if (len >= 28) { + if (len == MLD_V1_QUERY_LEN) { + err = mld_process_v1(idev, mld, &max_delay); + if (err < 0) + return err; + } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); - if (!pskb_may_pull(skb, srcs_offset)) { - in6_dev_put(idev); + + if (!pskb_may_pull(skb, srcs_offset)) return -EINVAL; - } + mlh2 = (struct mld2_query *)skb_transport_header(skb); - max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000; - if (!max_delay) - max_delay = 1; - idev->mc_maxdelay = max_delay; - if (mlh2->qrv) - idev->mc_qrv = mlh2->qrv; + + err = mld_process_v2(idev, mlh2, &max_delay); + if (err < 0) + return err; + if (group_type == IPV6_ADDR_ANY) { /* general query */ - if (mlh2->nsrcs) { - in6_dev_put(idev); + if (mlh2->mld2q_nsrcs) return -EINVAL; /* no sources allowed */ - } + mld_gq_start_timer(idev); - in6_dev_put(idev); return 0; } /* mark sources to include, if group & source-specific */ - if (mlh2->nsrcs != 0) { + if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + - ntohs(mlh2->nsrcs) * sizeof(struct in6_addr))) { - in6_dev_put(idev); + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) return -EINVAL; - } + mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } - } else { - in6_dev_put(idev); + } else return -EINVAL; - } read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { @@ -1266,25 +1386,23 @@ int igmp6_event_query(struct sk_buff *skb) ma->mca_flags &= ~MAF_GSQUERY; } if (!(ma->mca_flags & MAF_GSQUERY) || - mld_marksources(ma, ntohs(mlh2->nsrcs), mlh2->srcs)) + mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); spin_unlock_bh(&ma->mca_lock); break; } } read_unlock_bh(&idev->lock); - in6_dev_put(idev); return 0; } - +/* called with rcu_read_lock() */ int igmp6_event_report(struct sk_buff *skb) { struct ifmcaddr6 *ma; - struct in6_addr *addrp; struct inet6_dev *idev; - struct icmp6hdr *hdr; + struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. */ @@ -1296,10 +1414,10 @@ int igmp6_event_report(struct sk_buff *skb) skb->pkt_type != PACKET_BROADCAST) return 0; - if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) return -EINVAL; - hdr = icmp6_hdr(skb); + mld = (struct mld_msg *)icmp6_hdr(skb); /* Drop reports with not link local source */ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); @@ -1307,9 +1425,7 @@ int igmp6_event_report(struct sk_buff *skb) !(addr_type&IPV6_ADDR_LINKLOCAL)) return -EINVAL; - addrp = (struct in6_addr *) (hdr + 1); - - idev = in6_dev_get(skb->dev); + idev = __in6_dev_get(skb->dev); if (idev == NULL) return -ENODEV; @@ -1319,7 +1435,7 @@ int igmp6_event_report(struct sk_buff *skb) read_lock_bh(&idev->lock); for (ma = idev->mc_list; ma; ma=ma->next) { - if (ipv6_addr_equal(&ma->mca_addr, addrp)) { + if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { spin_lock(&ma->mca_lock); if (del_timer(&ma->mca_timer)) atomic_dec(&ma->mca_refcnt); @@ -1329,21 +1445,20 @@ int igmp6_event_report(struct sk_buff *skb) } } read_unlock_bh(&idev->lock); - in6_dev_put(idev); return 0; } -static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, - int gdeleted, int sdeleted) +static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, + int gdeleted, int sdeleted) { switch (type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) - return 0; + return false; if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { if (pmc->mca_sfmode == MCAST_INCLUDE) - return 1; + return true; /* don't include if this source is excluded * in all filters */ @@ -1352,29 +1467,29 @@ static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } - return 0; + return false; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) - return 0; + return false; return psf->sf_count[MCAST_INCLUDE] != 0; case MLD2_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) - return 0; + return false; if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) - return 0; + return false; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case MLD2_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) - return 0; + return false; return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; case MLD2_BLOCK_OLD_SOURCES: if (pmc->mca_sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } - return 0; + return false; } static int @@ -1391,95 +1506,142 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } -static struct sk_buff *mld_newpack(struct net_device *dev, int size) +static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int proto, int len) { - struct sock *sk = igmp6_socket->sk; + struct ipv6hdr *hdr; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); + + ip6_flow_hdr(hdr, 0, 0); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = inet6_sk(sk)->hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; +} + +static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) +{ + struct net_device *dev = idev->dev; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.igmp_sk; struct sk_buff *skb; struct mld2_report *pmr; struct in6_addr addr_buf; + const struct in6_addr *saddr; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; int err; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - skb = sock_alloc_send_skb(sk, size + LL_RESERVED_SPACE(dev), 1, &err); + size += hlen + tlen; + /* limit our allocations to order-0 page */ + size = min_t(int, size, SKB_MAX_ORDER(0, 0)); + skb = sock_alloc_send_skb(sk, size, 1, &err); if (!skb) return NULL; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->priority = TC_PRIO_CONTROL; + skb_reserve(skb, hlen); - if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { + if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ - memset(&addr_buf, 0, sizeof(addr_buf)); - } + saddr = &in6addr_any; + } else + saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, &addr_buf, &mld2_all_mcr, NEXTHDR_HOP, 0); + ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); skb_put(skb, sizeof(*pmr)); pmr = (struct mld2_report *)skb_transport_header(skb); - pmr->type = ICMPV6_MLD2_REPORT; - pmr->resv1 = 0; - pmr->csum = 0; - pmr->resv2 = 0; - pmr->ngrec = 0; + pmr->mld2r_type = ICMPV6_MLD2_REPORT; + pmr->mld2r_resv1 = 0; + pmr->mld2r_cksum = 0; + pmr->mld2r_resv2 = 0; + pmr->mld2r_ngrec = 0; return skb; } -static inline int mld_dev_queue_xmit2(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned char ha[MAX_ADDR_LEN]; - - ndisc_mc_map(&ipv6_hdr(skb)->daddr, ha, dev, 1); - if (dev_hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len) < 0) { - kfree_skb(skb); - return -EINVAL; - } - return dev_queue_xmit(skb); -} - -static inline int mld_dev_queue_xmit(struct sk_buff *skb) -{ - return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev, - mld_dev_queue_xmit2); -} - static void mld_sendpack(struct sk_buff *skb) { struct ipv6hdr *pip6 = ipv6_hdr(skb); struct mld2_report *pmr = (struct mld2_report *)skb_transport_header(skb); int payload_len, mldlen; - struct inet6_dev *idev = in6_dev_get(skb->dev); + struct inet6_dev *idev; + struct net *net = dev_net(skb->dev); int err; + struct flowi6 fl6; + struct dst_entry *dst; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); - payload_len = (skb->tail - skb->network_header) - sizeof(*pip6); - mldlen = skb->tail - skb->transport_header; + payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - + sizeof(*pip6); + mldlen = skb_tail_pointer(skb) - skb_transport_header(skb); pip6->payload_len = htons(payload_len); - pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, - IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), - mldlen, 0)); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, - mld_dev_queue_xmit); + pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, + IPPROTO_ICMPV6, + csum_partial(skb_transport_header(skb), + mldlen, 0)); + + icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, &fl6); + + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + } + skb_dst_set(skb, dst); + if (err) + goto err_out; + + payload_len = skb->len; + + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +out: if (!err) { - ICMP6MSGOUT_INC_STATS_BH(idev, ICMPV6_MLD2_REPORT); - ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS); - IP6_INC_STATS_BH(idev, IPSTATS_MIB_OUTMCASTPKTS); - } else - IP6_INC_STATS_BH(idev, IPSTATS_MIB_OUTDISCARDS); + ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); + } else { + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + } - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); + return; + +err_out: + kfree_skb(skb); + goto out; } static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) @@ -1495,7 +1657,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_grec *pgr; if (!skb) - skb = mld_newpack(dev, dev->mtu); + skb = mld_newpack(pmc->idev, dev->mtu); if (!skb) return NULL; pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec)); @@ -1504,7 +1666,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->mca_addr; /* structure copy */ pmr = (struct mld2_report *)skb_transport_header(skb); - pmr->ngrec = htons(ntohs(pmr->ngrec)+1); + pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); *ppgr = pgr; return skb; } @@ -1513,9 +1675,10 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, skb_tailroom(skb)) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, int gdeleted, int sdeleted) + int type, int gdeleted, int sdeleted, int crsend) { - struct net_device *dev = pmc->idev->dev; + struct inet6_dev *idev = pmc->idev; + struct net_device *dev = idev->dev; struct mld2_report *pmr; struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; @@ -1540,11 +1703,11 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { - if (pmr && pmr->ngrec && + if (pmr && pmr->mld2r_ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); - skb = mld_newpack(dev, dev->mtu); + skb = mld_newpack(idev, dev->mtu); } } first = 1; @@ -1571,7 +1734,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); - skb = mld_newpack(dev, dev->mtu); + skb = mld_newpack(idev, dev->mtu); first = 1; scount = 0; } @@ -1604,7 +1767,7 @@ empty_source: if (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) return skb; - if (pmc->mca_crcount || isquery) { + if (pmc->mca_crcount || isquery || crsend) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { mld_sendpack(skb); @@ -1626,8 +1789,8 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) struct sk_buff *skb = NULL; int type; + read_lock_bh(&idev->lock); if (!pmc) { - read_lock_bh(&idev->lock); for (pmc=idev->mc_list; pmc; pmc=pmc->next) { if (pmc->mca_flags & MAF_NOREPORT) continue; @@ -1636,19 +1799,19 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); } else { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); spin_unlock_bh(&pmc->mca_lock); } + read_unlock_bh(&idev->lock); if (skb) mld_sendpack(skb); } @@ -1681,7 +1844,7 @@ static void mld_send_cr(struct inet6_dev *idev) int type, dtype; read_lock_bh(&idev->lock); - write_lock_bh(&idev->mc_lock); + spin_lock(&idev->mc_lock); /* deleted MCA's */ pmc_prev = NULL; @@ -1690,13 +1853,13 @@ static void mld_send_cr(struct inet6_dev *idev) if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; - skb = add_grec(skb, pmc, type, 1, 0); - skb = add_grec(skb, pmc, dtype, 1, 1); + skb = add_grec(skb, pmc, type, 1, 0, 0); + skb = add_grec(skb, pmc, dtype, 1, 1, 0); } if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; - skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, type, 1, 0, 0); } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { @@ -1715,7 +1878,7 @@ static void mld_send_cr(struct inet6_dev *idev) } else pmc_prev = pmc; } - write_unlock_bh(&idev->mc_lock); + spin_unlock(&idev->mc_lock); /* change recs */ for (pmc=idev->mc_list; pmc; pmc=pmc->next) { @@ -1727,8 +1890,8 @@ static void mld_send_cr(struct inet6_dev *idev) type = MLD2_ALLOW_NEW_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; } - skb = add_grec(skb, pmc, type, 0, 0); - skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + skb = add_grec(skb, pmc, type, 0, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */ /* filter mode changes */ if (pmc->mca_crcount) { @@ -1736,7 +1899,7 @@ static void mld_send_cr(struct inet6_dev *idev) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } spin_unlock_bh(&pmc->mca_lock); @@ -1749,86 +1912,152 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct sock *sk = igmp6_socket->sk; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.igmp_sk; struct inet6_dev *idev; struct sk_buff *skb; - struct icmp6hdr *hdr; - struct in6_addr *snd_addr; - struct in6_addr *addrp; + struct mld_msg *hdr; + const struct in6_addr *snd_addr, *saddr; struct in6_addr addr_buf; - struct in6_addr all_routers; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; + struct flowi6 fl6; + struct dst_entry *dst; - rcu_read_lock(); - IP6_INC_STATS(__in6_dev_get(dev), - IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); - snd_addr = addr; - if (type == ICMPV6_MGM_REDUCTION) { - snd_addr = &all_routers; - ipv6_addr_all_routers(&all_routers); - } + if (type == ICMPV6_MGM_REDUCTION) + snd_addr = &in6addr_linklocal_allrouters; + else + snd_addr = addr; len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - skb = sock_alloc_send_skb(sk, LL_RESERVED_SPACE(dev) + full_len, 1, &err); + rcu_read_lock(); + IP6_UPD_PO_STATS(net, __in6_dev_get(dev), + IPSTATS_MIB_OUT, full_len); + rcu_read_unlock(); + + skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); if (skb == NULL) { rcu_read_lock(); - IP6_INC_STATS(__in6_dev_get(dev), + IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } - - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->priority = TC_PRIO_CONTROL; + skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. */ - memset(&addr_buf, 0, sizeof(addr_buf)); - } + saddr = &in6addr_any; + } else + saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, &addr_buf, snd_addr, NEXTHDR_HOP, payload_len); + ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); - hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr)); - memset(hdr, 0, sizeof(struct icmp6hdr)); - hdr->icmp6_type = type; + hdr = (struct mld_msg *) skb_put(skb, sizeof(struct mld_msg)); + memset(hdr, 0, sizeof(struct mld_msg)); + hdr->mld_type = type; + hdr->mld_mca = *addr; - addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr)); - ipv6_addr_copy(addrp, addr); + hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, + IPPROTO_ICMPV6, + csum_partial(hdr, len, 0)); - hdr->icmp6_cksum = csum_ipv6_magic(&addr_buf, snd_addr, len, - IPPROTO_ICMPV6, - csum_partial((__u8 *) hdr, len, 0)); + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); - idev = in6_dev_get(skb->dev); + icmpv6_flow_init(sk, &fl6, type, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, &fl6); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err_out; + } - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, - mld_dev_queue_xmit); + skb_dst_set(skb, dst); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +out: if (!err) { - ICMP6MSGOUT_INC_STATS(idev, type); - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); - IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS); + ICMP6MSGOUT_INC_STATS(net, idev, type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len); } else - IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); return; + +err_out: + kfree_skb(skb); + goto out; +} + +static void mld_send_initial_cr(struct inet6_dev *idev) +{ + struct sk_buff *skb; + struct ifmcaddr6 *pmc; + int type; + + if (mld_in_v1_mode(idev)) + return; + + skb = NULL; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_CHANGE_TO_EXCLUDE; + else + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0, 1); + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + if (skb) + mld_sendpack(skb); +} + +void ipv6_mc_dad_complete(struct inet6_dev *idev) +{ + idev->mc_dad_count = idev->mc_qrv; + if (idev->mc_dad_count) { + mld_send_initial_cr(idev); + idev->mc_dad_count--; + if (idev->mc_dad_count) + mld_dad_start_timer(idev, idev->mc_maxdelay); + } +} + +static void mld_dad_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + mld_send_initial_cr(idev); + if (idev->mc_dad_count) { + idev->mc_dad_count--; + if (idev->mc_dad_count) + mld_dad_start_timer(idev, idev->mc_maxdelay); + } + in6_dev_put(idev); } static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, - struct in6_addr *psfsrc) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; @@ -1853,7 +2082,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, else pmc->mca_sources = psf->sf_next; if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && - !MLD_V1_SEEN(idev)) { + !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; psf->sf_next = pmc->mca_tomb; pmc->mca_tomb = psf; @@ -1864,8 +2093,8 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, return rv; } -static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; @@ -1925,7 +2154,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, * Add multicast single-source filter to the interface list */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, - struct in6_addr *psfsrc, int delta) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; @@ -2010,8 +2239,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) &psf->sf_addr)) break; if (!dpsf) { - dpsf = (struct ip6_sf_list *) - kmalloc(sizeof(*dpsf), GFP_ATOMIC); + dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; @@ -2029,8 +2257,8 @@ static int sf_setstate(struct ifmcaddr6 *pmc) /* * Add multicast source filter list to the interface list */ -static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; @@ -2057,7 +2285,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, pmc->mca_sfcount[sfmode]++; err = 0; for (i=0; i<sfcount; i++) { - err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } @@ -2067,9 +2295,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, if (!delta) pmc->mca_sfcount[sfmode]--; for (j=0; j<i; j++) - (void) ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { - struct inet6_dev *idev = pmc->idev; struct ip6_sf_list *psf; /* filter mode change */ @@ -2120,7 +2347,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = net_random() % IGMP6_UNSOLICITED_IVAL; + delay = prandom_u32() % unsolicited_report_interval(ma->idev); spin_lock_bh(&ma->mca_lock); if (del_timer(&ma->mca_timer)) { @@ -2155,7 +2382,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, static void igmp6_leave_group(struct ifmcaddr6 *ma) { - if (MLD_V1_SEEN(ma->idev)) { + if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); @@ -2171,7 +2398,7 @@ static void mld_gq_timer_expire(unsigned long data) idev->mc_gq_running = 0; mld_send_report(idev, NULL); - __in6_dev_put(idev); + in6_dev_put(idev); } static void mld_ifc_timer_expire(unsigned long data) @@ -2184,12 +2411,12 @@ static void mld_ifc_timer_expire(unsigned long data) if (idev->mc_ifc_count) mld_ifc_start_timer(idev, idev->mc_maxdelay); } - __in6_dev_put(idev); + in6_dev_put(idev); } static void mld_ifc_event(struct inet6_dev *idev) { - if (MLD_V1_SEEN(idev)) + if (mld_in_v1_mode(idev)) return; idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_timer(idev, 1); @@ -2200,7 +2427,7 @@ static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; - if (MLD_V1_SEEN(ma->idev)) + if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); @@ -2212,6 +2439,25 @@ static void igmp6_timer_handler(unsigned long data) ma_put(ma); } +/* Device changing type */ + +void ipv6_mc_unmap(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Install multicast list, except for all-nodes (already installed) */ + + read_lock_bh(&idev->lock); + for (i = idev->mc_list; i; i = i->next) + igmp6_group_dropped(i); + read_unlock_bh(&idev->lock); +} + +void ipv6_mc_remap(struct inet6_dev *idev) +{ + ipv6_mc_up(idev); +} + /* Device going down */ void ipv6_mc_down(struct inet6_dev *idev) @@ -2221,12 +2467,9 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Withdraw multicast list */ read_lock_bh(&idev->lock); - idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) - __in6_dev_put(idev); - idev->mc_gq_running = 0; - if (del_timer(&idev->mc_gq_timer)) - __in6_dev_put(idev); + mld_ifc_stop_timer(idev); + mld_gq_stop_timer(idev); + mld_dad_stop_timer(idev); for (i = idev->mc_list; i; i=i->next) igmp6_group_dropped(i); @@ -2255,7 +2498,7 @@ void ipv6_mc_up(struct inet6_dev *idev) void ipv6_mc_init_dev(struct inet6_dev *idev) { write_lock_bh(&idev->lock); - rwlock_init(&idev->mc_lock); + spin_lock_init(&idev->mc_lock); idev->mc_gq_running = 0; setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire, (unsigned long)idev); @@ -2263,8 +2506,14 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) idev->mc_ifc_count = 0; setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire, (unsigned long)idev); + setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, + (unsigned long)idev); + idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_qi = MLD_QI_DEFAULT; + idev->mc_qri = MLD_QRI_DEFAULT; + + idev->mc_maxdelay = unsolicited_report_interval(idev); idev->mc_v1_seen = 0; write_unlock_bh(&idev->lock); } @@ -2276,24 +2525,19 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) void ipv6_mc_destroy_dev(struct inet6_dev *idev) { struct ifmcaddr6 *i; - struct in6_addr maddr; /* Deactivate timers */ ipv6_mc_down(idev); /* Delete all-nodes address. */ - ipv6_addr_all_nodes(&maddr); - /* We cannot call ipv6_dev_mc_dec() directly, our caller in * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will * fail. */ - __ipv6_dev_mc_dec(idev, &maddr); + __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes); - if (idev->cnf.forwarding) { - ipv6_addr_all_routers(&maddr); - __ipv6_dev_mc_dec(idev, &maddr); - } + if (idev->cnf.forwarding) + __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); write_lock_bh(&idev->lock); while ((i = idev->mc_list) != NULL) { @@ -2310,6 +2554,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) #ifdef CONFIG_PROC_FS struct igmp6_mc_iter_state { + struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; }; @@ -2320,11 +2565,12 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) { struct ifmcaddr6 *im = NULL; struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + struct net *net = seq_file_net(seq); state->idev = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; - idev = in6_dev_get(state->dev); + idev = __in6_dev_get(state->dev); if (!idev) continue; read_lock_bh(&idev->lock); @@ -2334,7 +2580,6 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) break; } read_unlock_bh(&idev->lock); - in6_dev_put(idev); } return im; } @@ -2345,16 +2590,15 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr im = im->next; while (!im) { - if (likely(state->idev != NULL)) { + if (likely(state->idev != NULL)) read_unlock_bh(&state->idev->lock); - in6_dev_put(state->idev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; break; } - state->idev = in6_dev_get(state->dev); + state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; read_lock_bh(&state->idev->lock); @@ -2373,31 +2617,31 @@ static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(RCU) { - read_lock(&dev_base_lock); + rcu_read_lock(); return igmp6_mc_get_idx(seq, *pos); } static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct ifmcaddr6 *im; - im = igmp6_mc_get_next(seq, v); + struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v); + ++*pos; return im; } static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(RCU) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + if (likely(state->idev != NULL)) { read_unlock_bh(&state->idev->lock); - in6_dev_put(state->idev); state->idev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp6_mc_seq_show(struct seq_file *seq, void *v) @@ -2406,9 +2650,9 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v) struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); seq_printf(seq, - "%-4d %-15s " NIP6_SEQFMT " %5d %08X %ld\n", + "%-4d %-15s %pi6 %5d %08X %ld\n", state->dev->ifindex, state->dev->name, - NIP6(im->mca_addr), + &im->mca_addr, im->mca_users, im->mca_flags, (im->mca_flags&MAF_TIMER_RUNNING) ? jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0); @@ -2424,8 +2668,8 @@ static const struct seq_operations igmp6_mc_seq_ops = { static int igmp6_mc_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &igmp6_mc_seq_ops, - sizeof(struct igmp6_mc_iter_state)); + return seq_open_net(inode, file, &igmp6_mc_seq_ops, + sizeof(struct igmp6_mc_iter_state)); } static const struct file_operations igmp6_mc_seq_fops = { @@ -2433,10 +2677,11 @@ static const struct file_operations igmp6_mc_seq_fops = { .open = igmp6_mc_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; struct igmp6_mcf_iter_state { + struct seq_net_private p; struct net_device *dev; struct inet6_dev *idev; struct ifmcaddr6 *im; @@ -2449,12 +2694,13 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) struct ip6_sf_list *psf = NULL; struct ifmcaddr6 *im = NULL; struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + struct net *net = seq_file_net(seq); state->idev = NULL; state->im = NULL; - for_each_netdev(&init_net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; - idev = in6_dev_get(state->dev); + idev = __in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; read_lock_bh(&idev->lock); @@ -2470,7 +2716,6 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) spin_unlock_bh(&im->mca_lock); } read_unlock_bh(&idev->lock); - in6_dev_put(idev); } return psf; } @@ -2484,16 +2729,15 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s spin_unlock_bh(&state->im->mca_lock); state->im = state->im->next; while (!state->im) { - if (likely(state->idev != NULL)) { + if (likely(state->idev != NULL)) read_unlock_bh(&state->idev->lock); - in6_dev_put(state->idev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } - state->idev = in6_dev_get(state->dev); + state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; read_lock_bh(&state->idev->lock); @@ -2518,9 +2762,9 @@ static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(RCU) { - read_lock(&dev_base_lock); + rcu_read_lock(); return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -2536,7 +2780,7 @@ static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); if (likely(state->im != NULL)) { @@ -2545,11 +2789,10 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) } if (likely(state->idev != NULL)) { read_unlock_bh(&state->idev->lock); - in6_dev_put(state->idev); state->idev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) @@ -2565,10 +2808,10 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) "Source Address", "INC", "EXC"); } else { seq_printf(seq, - "%3d %6.6s " NIP6_SEQFMT " " NIP6_SEQFMT " %6lu %6lu\n", + "%3d %6.6s %pi6 %pi6 %6lu %6lu\n", state->dev->ifindex, state->dev->name, - NIP6(state->im->mca_addr), - NIP6(psf->sf_addr), + &state->im->mca_addr, + &psf->sf_addr, psf->sf_count[MCAST_INCLUDE], psf->sf_count[MCAST_EXCLUDE]); } @@ -2584,8 +2827,8 @@ static const struct seq_operations igmp6_mcf_seq_ops = { static int igmp6_mcf_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &igmp6_mcf_seq_ops, - sizeof(struct igmp6_mcf_iter_state)); + return seq_open_net(inode, file, &igmp6_mcf_seq_ops, + sizeof(struct igmp6_mcf_iter_state)); } static const struct file_operations igmp6_mcf_seq_fops = { @@ -2593,47 +2836,86 @@ static const struct file_operations igmp6_mcf_seq_fops = { .open = igmp6_mcf_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; + +static int __net_init igmp6_proc_init(struct net *net) +{ + int err; + + err = -ENOMEM; + if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops)) + goto out; + if (!proc_create("mcfilter6", S_IRUGO, net->proc_net, + &igmp6_mcf_seq_fops)) + goto out_proc_net_igmp6; + + err = 0; +out: + return err; + +out_proc_net_igmp6: + remove_proc_entry("igmp6", net->proc_net); + goto out; +} + +static void __net_exit igmp6_proc_exit(struct net *net) +{ + remove_proc_entry("mcfilter6", net->proc_net); + remove_proc_entry("igmp6", net->proc_net); +} +#else +static inline int igmp6_proc_init(struct net *net) +{ + return 0; +} +static inline void igmp6_proc_exit(struct net *net) +{ +} #endif -int __init igmp6_init(struct net_proto_family *ops) +static int __net_init igmp6_net_init(struct net *net) { - struct ipv6_pinfo *np; - struct sock *sk; int err; - err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &igmp6_socket); + err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - printk(KERN_ERR - "Failed to initialize the IGMP6 control socket (err %d).\n", + pr_err("Failed to initialize the IGMP6 control socket (err %d)\n", err); - igmp6_socket = NULL; /* For safety. */ - return err; + goto out; } - sk = igmp6_socket->sk; - sk->sk_allocation = GFP_ATOMIC; - sk->sk_prot->unhash(sk); + inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; - np = inet6_sk(sk); - np->hop_limit = 1; + err = igmp6_proc_init(net); + if (err) + goto out_sock_create; +out: + return err; -#ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "igmp6", S_IRUGO, &igmp6_mc_seq_fops); - proc_net_fops_create(&init_net, "mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops); -#endif +out_sock_create: + inet_ctl_sock_destroy(net->ipv6.igmp_sk); + goto out; +} - return 0; +static void __net_exit igmp6_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.igmp_sk); + igmp6_proc_exit(net); } -void igmp6_cleanup(void) +static struct pernet_operations igmp6_net_ops = { + .init = igmp6_net_init, + .exit = igmp6_net_exit, +}; + +int __init igmp6_init(void) { - sock_release(igmp6_socket); - igmp6_socket = NULL; /* for safety */ + return register_pernet_subsys(&igmp6_net_ops); +} -#ifdef CONFIG_PROC_FS - proc_net_remove(&init_net, "mcfilter6"); - proc_net_remove(&init_net, "igmp6"); -#endif +void igmp6_cleanup(void) +{ + unregister_pernet_subsys(&igmp6_net_ops); } diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index cd8a5bda13c..db9b6cbc9db 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -13,8 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ /* * Authors: @@ -22,6 +21,8 @@ * Masahide NAKAMURA @USAGI */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/skbuff.h> #include <linux/time.h> @@ -44,9 +45,9 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen) if (!data) return NULL; if (padlen == 1) { - data[0] = MIP6_OPT_PAD_1; + data[0] = IPV6_TLV_PAD1; } else if (padlen > 1) { - data[0] = MIP6_OPT_PAD_N; + data[0] = IPV6_TLV_PADN; data[1] = padlen - 2; if (padlen > 2) memset(data+2, 0, data[1]); @@ -54,9 +55,9 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen) return data + padlen; } -static inline void mip6_param_prob(struct sk_buff *skb, int code, int pos) +static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos) { - icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); + icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); } static int mip6_mh_len(int type) @@ -84,28 +85,30 @@ static int mip6_mh_len(int type) static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) { - struct ip6_mh *mh; + struct ip6_mh _hdr; + const struct ip6_mh *mh; - if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) || - !pskb_may_pull(skb, (skb_transport_offset(skb) + - ((skb_transport_header(skb)[1] + 1) << 3)))) + mh = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_hdr), &_hdr); + if (!mh) return -1; - mh = (struct ip6_mh *)skb_transport_header(skb); + if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len) + return -1; if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n", mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); - mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) - - skb_network_header(skb))); + mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + + skb_network_header_len(skb)); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n", mh->ip6mh_proto); - mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) - - skb_network_header(skb))); + mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + + skb_network_header_len(skb)); return -1; } @@ -126,7 +129,7 @@ static struct mip6_report_rate_limiter mip6_report_rl = { static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; int err = destopt->nexthdr; @@ -164,8 +167,8 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) calc_padlen(sizeof(*dstopt), 6)); hao->type = IPV6_TLV_HAO; + BUILD_BUG_ON(sizeof(*hao) != 18); hao->length = sizeof(*hao) - 2; - BUG_TRAP(hao->length == 16); len = ((char *)hao - (char *)dstopt) + sizeof(*hao); @@ -174,15 +177,15 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr)); spin_unlock_bh(&x->lock); - BUG_TRAP(len == x->props.header_len); + WARN_ON(len != x->props.header_len); dstopt->hdrlen = (x->props.header_len >> 3) - 1; return 0; } static inline int mip6_report_rl_allow(struct timeval *stamp, - struct in6_addr *dst, - struct in6_addr *src, int iif) + const struct in6_addr *dst, + const struct in6_addr *src, int iif) { int allow = 0; @@ -195,25 +198,28 @@ static inline int mip6_report_rl_allow(struct timeval *stamp, mip6_report_rl.stamp.tv_sec = stamp->tv_sec; mip6_report_rl.stamp.tv_usec = stamp->tv_usec; mip6_report_rl.iif = iif; - ipv6_addr_copy(&mip6_report_rl.src, src); - ipv6_addr_copy(&mip6_report_rl.dst, dst); + mip6_report_rl.src = *src; + mip6_report_rl.dst = *dst; allow = 1; } spin_unlock_bh(&mip6_report_rl.lock); return allow; } -static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct flowi *fl) +static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, + const struct flowi *fl) { + struct net *net = xs_net(x); struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + const struct flowi6 *fl6 = &fl->u.ip6; struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; struct timeval stamp; int err = 0; - if (unlikely(fl->proto == IPPROTO_MH && - fl->fl_mh_type <= IP6_MH_TYPE_MAX)) + if (unlikely(fl6->flowi6_proto == IPPROTO_MH && + fl6->fl6_mh_type <= IP6_MH_TYPE_MAX)) goto out; if (likely(opt->dsthao)) { @@ -238,16 +244,16 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct sizeof(sel.saddr)); sel.prefixlen_s = 128; sel.family = AF_INET6; - sel.proto = fl->proto; - sel.dport = xfrm_flowi_dport(fl); + sel.proto = fl6->flowi6_proto; + sel.dport = xfrm_flowi_dport(fl, &fl6->uli); if (sel.dport) sel.dport_mask = htons(~0); - sel.sport = xfrm_flowi_sport(fl); + sel.sport = xfrm_flowi_sport(fl, &fl6->uli); if (sel.sport) sel.sport_mask = htons(~0); - sel.ifindex = fl->oif; + sel.ifindex = fl6->flowi6_oif; - err = km_report(IPPROTO_DSTOPTS, &sel, + err = km_report(net, IPPROTO_DSTOPTS, &sel, (hao ? (xfrm_address_t *)&hao->addr : NULL)); out: @@ -261,7 +267,8 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb->tail - skb->network_header; + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; @@ -304,20 +311,19 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, static int mip6_destopt_init_state(struct xfrm_state *x) { if (x->id.spi) { - printk(KERN_INFO "%s: spi is not 0: %u\n", __FUNCTION__, - x->id.spi); + pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { - printk(KERN_INFO "%s: state's mode is not %u: %u\n", - __FUNCTION__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + pr_info("%s: state's mode is not %u: %u\n", + __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); return -EINVAL; } x->props.header_len = sizeof(struct ipv6_destopt_hdr) + calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) + sizeof(struct ipv6_destopt_hao); - BUG_TRAP(x->props.header_len == 24); + WARN_ON(x->props.header_len != 24); return 0; } @@ -346,11 +352,12 @@ static const struct xfrm_type mip6_destopt_type = static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) { + const struct ipv6hdr *iph = ipv6_hdr(skb); struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data; int err = rt2->rt_hdr.nexthdr; spin_lock(&x->lock); - if (!ipv6_addr_equal(&rt2->addr, (struct in6_addr *)x->coaddr) && + if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); @@ -380,7 +387,7 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) rt2->rt_hdr.segments_left = 1; memset(&rt2->reserved, 0, sizeof(rt2->reserved)); - BUG_TRAP(rt2->rt_hdr.hdrlen == 2); + WARN_ON(rt2->rt_hdr.hdrlen != 2); memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr)); spin_lock_bh(&x->lock); @@ -397,7 +404,8 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb->tail - skb->network_header; + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; @@ -439,13 +447,12 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, static int mip6_rthdr_init_state(struct xfrm_state *x) { if (x->id.spi) { - printk(KERN_INFO "%s: spi is not 0: %u\n", __FUNCTION__, - x->id.spi); + pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { - printk(KERN_INFO "%s: state's mode is not %u: %u\n", - __FUNCTION__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + pr_info("%s: state's mode is not %u: %u\n", + __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); return -EINVAL; } @@ -477,18 +484,18 @@ static const struct xfrm_type mip6_rthdr_type = static int __init mip6_init(void) { - printk(KERN_INFO "Mobile IPv6\n"); + pr_info("Mobile IPv6\n"); if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) { - printk(KERN_INFO "%s: can't add xfrm type(destopt)\n", __FUNCTION__); + pr_info("%s: can't add xfrm type(destopt)\n", __func__); goto mip6_destopt_xfrm_fail; } if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) { - printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __FUNCTION__); + pr_info("%s: can't add xfrm type(rthdr)\n", __func__); goto mip6_rthdr_xfrm_fail; } if (rawv6_mh_filter_register(mip6_mh_filter) < 0) { - printk(KERN_INFO "%s: can't add rawv6 mh filter\n", __FUNCTION__); + pr_info("%s: can't add rawv6 mh filter\n", __func__); goto mip6_rawv6_mh_fail; } @@ -506,11 +513,11 @@ static int __init mip6_init(void) static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) - printk(KERN_INFO "%s: can't remove rawv6 mh filter\n", __FUNCTION__); + pr_info("%s: can't remove rawv6 mh filter\n", __func__); if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0) - printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __FUNCTION__); + pr_info("%s: can't remove xfrm type(rthdr)\n", __func__); if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0) - printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __FUNCTION__); + pr_info("%s: can't remove xfrm type(destopt)\n", __func__); } module_init(mip6_init); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 452a2ac4eec..ca8d4ea48a5 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -15,6 +15,7 @@ /* * Changes: * + * Alexey I. Froloff : RFC6106 (DNSSL) support * Pierre Ynard : export userland ND options * through netlink (RDNSS support) * Lars Fenneberg : fixed MTU setting on receipt @@ -26,27 +27,7 @@ * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0) -#define ND_NOPRINTK(x...) do { ; } while(0) -#define ND_PRINTK0 ND_PRINTK -#define ND_PRINTK1 ND_NOPRINTK -#define ND_PRINTK2 ND_NOPRINTK -#define ND_PRINTK3 ND_NOPRINTK -#if ND_DEBUG >= 1 -#undef ND_PRINTK1 -#define ND_PRINTK1 ND_PRINTK -#endif -#if ND_DEBUG >= 2 -#undef ND_PRINTK2 -#define ND_PRINTK2 ND_PRINTK -#endif -#if ND_DEBUG >= 3 -#undef ND_PRINTK3 -#define ND_PRINTK3 ND_PRINTK -#endif +#define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> @@ -59,6 +40,7 @@ #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -84,14 +66,24 @@ #include <net/flow.h> #include <net/ip6_checksum.h> +#include <net/inet_common.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> -static struct socket *ndisc_socket; +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 -static u32 ndisc_hash(const void *pkey, const struct net_device *dev); +#define ND_PRINTK(val, level, fmt, ...) \ +do { \ + if (val <= ND_DEBUG) \ + net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ +} while (0) + +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 *hash_rnd); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -99,38 +91,31 @@ static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); -static struct neigh_ops ndisc_generic_ops = { +static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, }; -static struct neigh_ops ndisc_hh_ops = { +static const struct neigh_ops ndisc_hh_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, }; -static struct neigh_ops ndisc_direct_ops = { +static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, - .output = dev_queue_xmit, - .connected_output = dev_queue_xmit, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, + .output = neigh_direct_output, + .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, - .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), .key_len = sizeof(struct in6_addr), .hash = ndisc_hash, .constructor = ndisc_constructor, @@ -139,18 +124,20 @@ struct neigh_table nd_tbl = { .proxy_redo = pndisc_redo, .id = "ndisc_cache", .parms = { - .tbl = &nd_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, - .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len = 3, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, + .tbl = &nd_tbl, + .reachable_time = ND_REACHABLE_TIME, + .data = { + [NEIGH_VAR_MCAST_PROBES] = 3, + [NEIGH_VAR_UCAST_PROBES] = 3, + [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, + [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, + [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_GC_STALETIME] = 60 * HZ, + [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_PROXY_QLEN] = 64, + [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, + [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, + }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, @@ -158,50 +145,12 @@ struct neigh_table nd_tbl = { .gc_thresh3 = 1024, }; -/* ND options */ -struct ndisc_options { - struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; -#ifdef CONFIG_IPV6_ROUTE_INFO - struct nd_opt_hdr *nd_opts_ri; - struct nd_opt_hdr *nd_opts_ri_end; -#endif - struct nd_opt_hdr *nd_useropts; - struct nd_opt_hdr *nd_useropts_end; -}; - -#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] -#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] -#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] -#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] -#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] -#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] - -#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) - -/* - * Return the padding between the option length and the start of the - * link addr. Currently only IP-over-InfiniBand needs this, although - * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may - * also need a pad of 2. - */ -static int ndisc_addr_option_pad(unsigned short type) -{ - switch (type) { - case ARPHRD_INFINIBAND: return 2; - default: return 0; - } -} - -static inline int ndisc_opt_addr_space(struct net_device *dev) -{ - return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); -} - -static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, - unsigned short addr_type) +static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) { - int space = NDISC_OPT_SPACE(data_len); - int pad = ndisc_addr_option_pad(addr_type); + int pad = ndisc_addr_option_pad(skb->dev->type); + int data_len = skb->dev->addr_len; + int space = ndisc_opt_addr_space(skb->dev); + u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; @@ -215,7 +164,6 @@ static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, opt += data_len; if ((space -= data_len) > 0) memset(opt, 0, space); - return opt + space; } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, @@ -228,12 +176,13 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while(cur < end && cur->nd_opt_type != type); - return (cur <= end && cur->nd_opt_type == type ? cur : NULL); + return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) { - return (opt->nd_opt_type == ND_OPT_RDNSS); + return opt->nd_opt_type == ND_OPT_RDNSS || + opt->nd_opt_type == ND_OPT_DNSSL; } static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, @@ -244,11 +193,11 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while(cur < end && !ndisc_is_useropt(cur)); - return (cur <= end && ndisc_is_useropt(cur) ? cur : NULL); + return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; } -static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, - struct ndisc_options *ndopts) +struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -268,10 +217,9 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, case ND_OPT_MTU: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { - ND_PRINTK2(KERN_WARNING - "%s(): duplicated ND6 option found: type=%d\n", - __FUNCTION__, - nd_opt->nd_opt_type); + ND_PRINTK(2, warn, + "%s: duplicated ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } @@ -299,10 +247,11 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, * to accommodate future extension to the * protocol. */ - ND_PRINTK2(KERN_NOTICE - "%s(): ignored unsupported option; type=%d, len=%d\n", - __FUNCTION__, - nd_opt->nd_opt_type, nd_opt->nd_opt_len); + ND_PRINTK(2, notice, + "%s: ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, + nd_opt->nd_opt_len); } } opt_len -= l; @@ -311,18 +260,7 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, return ndopts; } -static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, - struct net_device *dev) -{ - u8 *lladdr = (u8 *)(p + 1); - int lladdrlen = p->nd_opt_len << 3; - int prepad = ndisc_addr_option_pad(dev->type); - if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) - return NULL; - return (lladdr + prepad); -} - -int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir) +int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: @@ -330,15 +268,14 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; - case ARPHRD_IEEE802_TR: - ipv6_tr_mc_map(addr,buf); - return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; + case ARPHRD_IPGRE: + return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); @@ -350,16 +287,11 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d EXPORT_SYMBOL(ndisc_mc_map); -static u32 ndisc_hash(const void *pkey, const struct net_device *dev) +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 *hash_rnd) { - const u32 *p32 = pkey; - u32 addr_hash, i; - - addr_hash = 0; - for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) - addr_hash ^= *p32++; - - return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd); + return ndisc_hashfn(pkey, dev, hash_rnd); } static int ndisc_constructor(struct neighbour *neigh) @@ -368,25 +300,22 @@ static int ndisc_constructor(struct neighbour *neigh) struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; - int is_multicast = ipv6_addr_is_multicast(addr); + bool is_multicast = ipv6_addr_is_multicast(addr); - rcu_read_lock(); in6_dev = in6_dev_get(dev); if (in6_dev == NULL) { - rcu_read_unlock(); return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; - neigh->output = neigh->ops->queue_xmit; + neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; @@ -438,153 +367,190 @@ static void pndisc_destructor(struct pneigh_entry *n) ipv6_dev_mc_dec(dev, &maddr); } -/* - * Send a Neighbour Advertisement - */ - -static inline void ndisc_flow_init(struct flowi *fl, u8 type, - struct in6_addr *saddr, struct in6_addr *daddr, - int oif) +static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, + int len) { - memset(fl, 0, sizeof(*fl)); - ipv6_addr_copy(&fl->fl6_src, saddr); - ipv6_addr_copy(&fl->fl6_dst, daddr); - fl->proto = IPPROTO_ICMPV6; - fl->fl_icmp_type = type; - fl->fl_icmp_code = 0; - fl->oif = oif; - security_sk_classify_flow(ndisc_socket->sk, fl); -} - -static void __ndisc_send(struct net_device *dev, - struct neighbour *neigh, - struct in6_addr *daddr, struct in6_addr *saddr, - struct icmp6hdr *icmp6h, struct in6_addr *target, - int llinfo) -{ - struct flowi fl; - struct dst_entry *dst; - struct sock *sk = ndisc_socket->sk; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; - struct icmp6hdr *hdr; - struct inet6_dev *idev; - int len; - int err; - u8 *opt, type; - type = icmp6h->icmp6_type; + skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); + if (!skb) { + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", + __func__); + return NULL; + } - ndisc_flow_init(&fl, type, saddr, daddr, - dev->ifindex); + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; - dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output); - if (!dst) - return; + skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); + skb_reset_transport_header(skb); - err = xfrm_lookup(&dst, &fl, NULL, 0); - if (err < 0) - return; + /* Manually assign socket ownership as we avoid calling + * sock_alloc_send_pskb() to bypass wmem buffer limits + */ + skb_set_owner_w(skb, sk); - if (!dev->addr_len) - llinfo = 0; + return skb; +} - len = sizeof(struct icmp6hdr) + (target ? sizeof(*target) : 0); - if (llinfo) - len += ndisc_opt_addr_space(dev); +static void ip6_nd_hdr(struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int hop_limit, int len) +{ + struct ipv6hdr *hdr; - skb = sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_RESERVED_SPACE(dev)), - 1, &err); - if (!skb) { - ND_PRINTK0(KERN_ERR - "ICMPv6 ND: %s() failed to allocate an skb.\n", - __FUNCTION__); - dst_release(dst); - return; - } + skb_push(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + ip6_flow_hdr(hdr, 0, 0); - skb->transport_header = skb->tail; - skb_put(skb, len); + hdr->payload_len = htons(len); + hdr->nexthdr = IPPROTO_ICMPV6; + hdr->hop_limit = hop_limit; - hdr = (struct icmp6hdr *)skb_transport_header(skb); - memcpy(hdr, icmp6h, sizeof(*hdr)); + hdr->saddr = *saddr; + hdr->daddr = *daddr; +} - opt = skb_transport_header(skb) + sizeof(struct icmp6hdr); - if (target) { - ipv6_addr_copy((struct in6_addr *)opt, target); - opt += sizeof(*target); - } +static void ndisc_send_skb(struct sk_buff *skb, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(skb->dev); + struct sock *sk = net->ipv6.ndisc_sk; + struct inet6_dev *idev; + int err; + struct icmp6hdr *icmp6h = icmp6_hdr(skb); + u8 type; + + type = icmp6h->icmp6_type; + + if (!dst) { + struct flowi6 fl6; + + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, &fl6); + if (IS_ERR(dst)) { + kfree_skb(skb); + return; + } - if (llinfo) - ndisc_fill_addr_option(opt, llinfo, dev->dev_addr, - dev->addr_len, dev->type); + skb_dst_set(skb, dst); + } - hdr->icmp6_cksum = csum_ipv6_magic(saddr, daddr, len, - IPPROTO_ICMPV6, - csum_partial((__u8 *) hdr, - len, 0)); + icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, + csum_partial(icmp6h, + skb->len, 0)); - skb->dst = dst; + ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len); - idev = in6_dev_get(dst->dev); - IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, dst_output); if (!err) { - ICMP6MSGOUT_INC_STATS(idev, type); - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + ICMP6MSGOUT_INC_STATS(net, idev, type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); } -static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, - struct in6_addr *daddr, struct in6_addr *solicited_addr, - int router, int solicited, int override, int inc_opt) +void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, bool inc_opt) { + struct sk_buff *skb; struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; - struct in6_addr *src_addr; - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, - }; + const struct in6_addr *src_addr; + struct nd_msg *msg; + int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ - ifp = ipv6_get_ifaddr(&init_net, solicited_addr, dev, 1); + ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) - override = 0; + override = false; + inc_opt |= ifp->idev->cnf.force_tllao; in6_ifa_put(ifp); } else { - if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr)) + if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, + inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, + &tmpaddr)) return; src_addr = &tmpaddr; } - icmp6h.icmp6_router = router; - icmp6h.icmp6_solicited = solicited; - icmp6h.icmp6_override = override; + if (!dev->addr_len) + inc_opt = 0; + if (inc_opt) + optlen += ndisc_opt_addr_space(dev); - __ndisc_send(dev, neigh, daddr, src_addr, - &icmp6h, solicited_addr, - inc_opt ? ND_OPT_TARGET_LL_ADDR : 0); + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct nd_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct nd_msg) { + .icmph = { + .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, + .icmp6_router = router, + .icmp6_solicited = solicited, + .icmp6_override = override, + }, + .target = *solicited_addr, + }; + + if (inc_opt) + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, + dev->dev_addr); + + + ndisc_send_skb(skb, daddr, src_addr); +} + +static void ndisc_send_unsol_na(struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + + idev = in6_dev_get(dev); + if (!idev) + return; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &ifa->addr, + /*router=*/ !!idev->cnf.forwarding, + /*solicited=*/ false, /*override=*/ true, + /*inc_opt=*/ true); + } + read_unlock_bh(&idev->lock); + + in6_dev_put(idev); } void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, - struct in6_addr *solicit, - struct in6_addr *daddr, struct in6_addr *saddr) + const struct in6_addr *solicit, + const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct sk_buff *skb; struct in6_addr addr_buf; - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, - }; + int inc_opt = dev->addr_len; + int optlen = 0; + struct nd_msg *msg; if (saddr == NULL) { if (ipv6_get_lladdr(dev, &addr_buf, @@ -593,18 +559,37 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, saddr = &addr_buf; } - __ndisc_send(dev, neigh, daddr, saddr, - &icmp6h, solicit, - !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0); + if (ipv6_addr_any(saddr)) + inc_opt = false; + if (inc_opt) + optlen += ndisc_opt_addr_space(dev); + + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct nd_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct nd_msg) { + .icmph = { + .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, + }, + .target = *solicit, + }; + + if (inc_opt) + ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, + dev->dev_addr); + + ndisc_send_skb(skb, daddr, saddr); } -void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, - struct in6_addr *daddr) +void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr) { - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_ROUTER_SOLICITATION, - }; + struct sk_buff *skb; + struct rs_msg *msg; int send_sllao = dev->addr_len; + int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* @@ -616,7 +601,7 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, * suppress the inclusion of the sllao. */ if (send_sllao) { - struct inet6_ifaddr *ifp = ipv6_get_ifaddr(&init_net, saddr, + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, dev, 1); if (ifp) { if (ifp->flags & IFA_F_OPTIMISTIC) { @@ -628,9 +613,25 @@ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, } } #endif - __ndisc_send(dev, NULL, daddr, saddr, - &icmp6h, NULL, - send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0); + if (send_sllao) + optlen += ndisc_opt_addr_space(dev); + + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct rs_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct rs_msg) { + .icmph = { + .icmp6_type = NDISC_ROUTER_SOLICITATION, + }, + }; + + if (send_sllao) + ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, + dev->dev_addr); + + ndisc_send_skb(skb, daddr, saddr); } @@ -654,63 +655,63 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); - if (skb && ipv6_chk_addr(&init_net, &ipv6_hdr(skb)->saddr, dev, 1)) + if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1)) saddr = &ipv6_hdr(skb)->saddr; - if ((probes -= neigh->parms->ucast_probes) < 0) { + if ((probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES)) < 0) { if (!(neigh->nud_state & NUD_VALID)) { - ND_PRINTK1(KERN_DEBUG - "%s(): trying to ucast probe in NUD_INVALID: " - NIP6_FMT "\n", - __FUNCTION__, - NIP6(*target)); + ND_PRINTK(1, dbg, + "%s: trying to ucast probe in NUD_INVALID: %pI6\n", + __func__, target); } ndisc_send_ns(dev, neigh, target, target, saddr); - } else if ((probes -= neigh->parms->app_probes) < 0) { -#ifdef CONFIG_ARPD + } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); -#endif } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); } } -static struct pneigh_entry *pndisc_check_router(struct net_device *dev, - struct in6_addr *addr, int *is_router) +static int pndisc_is_router(const void *pkey, + struct net_device *dev) { struct pneigh_entry *n; + int ret = -1; read_lock_bh(&nd_tbl.lock); - n = __pneigh_lookup(&nd_tbl, &init_net, addr, dev); - if (n != NULL) - *is_router = (n->flags & NTF_ROUTER); + n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + if (n) + ret = !!(n->flags & NTF_ROUTER); read_unlock_bh(&nd_tbl.lock); - return n; + return ret; } static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); - struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; - struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - (skb->transport_header + + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; struct inet6_dev *idev = NULL; struct neighbour *neigh; - struct pneigh_entry *pneigh = NULL; int dad = ipv6_addr_any(saddr); - int inc; - int is_router = 0; + bool inc; + int is_router = -1; + + if (skb->len < sizeof(struct nd_msg)) { + ND_PRINTK(2, warn, "NS: packet too short\n"); + return; + } if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: multicast target address"); + ND_PRINTK(2, warn, "NS: multicast target address\n"); return; } @@ -718,27 +719,21 @@ static void ndisc_recv_ns(struct sk_buff *skb) * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ - if (dad && - !(daddr->s6_addr32[0] == htonl(0xff020000) && - daddr->s6_addr32[1] == htonl(0x00000000) && - daddr->s6_addr32[2] == htonl(0x00000001) && - daddr->s6_addr [12] == 0xff )) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: bad DAD packet (wrong destination)\n"); + if (dad && !ipv6_addr_is_solict_mult(daddr)) { + ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return; } if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: invalid ND options\n"); + ND_PRINTK(2, warn, "NS: invalid ND options\n"); return; } if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: invalid link-layer address length\n"); + ND_PRINTK(2, warn, + "NS: invalid link-layer address length\n"); return; } @@ -748,32 +743,19 @@ static void ndisc_recv_ns(struct sk_buff *skb) * in the message. */ if (dad) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: bad DAD packet (link-layer address option)\n"); + ND_PRINTK(2, warn, + "NS: bad DAD packet (link-layer address option)\n"); return; } } inc = ipv6_addr_is_multicast(daddr); - if ((ifp = ipv6_get_ifaddr(&init_net, &msg->target, dev, 1)) != NULL) { + ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + if (ifp) { if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { - if (dev->type == ARPHRD_IEEE802_TR) { - const unsigned char *sadr; - sadr = skb_mac_header(skb); - if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && - sadr[9] == dev->dev_addr[1] && - sadr[10] == dev->dev_addr[2] && - sadr[11] == dev->dev_addr[3] && - sadr[12] == dev->dev_addr[4] && - sadr[13] == dev->dev_addr[5]) { - /* looped-back to us */ - goto out; - } - } - /* * We are colliding with another node * who is doing DAD @@ -795,21 +777,22 @@ static void ndisc_recv_ns(struct sk_buff *skb) idev = ifp->idev; } else { + struct net *net = dev_net(dev); + idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? */ return; } - if (ipv6_chk_acast_addr(dev, &msg->target) || + if (ipv6_chk_acast_addr(net, dev, &msg->target) || (idev->cnf.forwarding && - (ipv6_devconf.proxy_ndp || idev->cnf.proxy_ndp) && - (pneigh = pndisc_check_router(dev, &msg->target, - &is_router)) != NULL)) { + (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && + (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && - inc != 0 && - idev->nd_parms->proxy_delay != 0) { + inc && + NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) { /* * for anycast or proxy, * sender should delay its response @@ -826,14 +809,12 @@ static void ndisc_recv_ns(struct sk_buff *skb) goto out; } - is_router = !!(pneigh ? is_router : idev->cnf.forwarding); + if (is_router < 0) + is_router = idev->cnf.forwarding; if (dad) { - struct in6_addr maddr; - - ipv6_addr_all_nodes(&maddr); - ndisc_send_na(dev, NULL, &maddr, &msg->target, - is_router, 0, (ifp != NULL), 1); + ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, + !!is_router, false, (ifp != NULL), true); goto out; } @@ -854,8 +835,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) NEIGH_UPDATE_F_OVERRIDE); if (neigh || !dev->header_ops) { ndisc_send_na(dev, neigh, saddr, &msg->target, - is_router, - 1, (ifp != NULL && inc), inc); + !!is_router, + true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); } @@ -865,17 +846,15 @@ out: in6_ifa_put(ifp); else in6_dev_put(idev); - - return; } static void ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; - struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - (skb->transport_header + + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; @@ -883,50 +862,53 @@ static void ndisc_recv_na(struct sk_buff *skb) struct neighbour *neigh; if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: packet too short\n"); + ND_PRINTK(2, warn, "NA: packet too short\n"); return; } if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: target address is multicast.\n"); + ND_PRINTK(2, warn, "NA: target address is multicast\n"); return; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: solicited NA is multicasted.\n"); + ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); return; } if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: invalid ND option\n"); + ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: invalid link-layer address length\n"); + ND_PRINTK(2, warn, + "NA: invalid link-layer address length\n"); return; } } - if ((ifp = ipv6_get_ifaddr(&init_net, &msg->target, dev, 1))) { - if (ifp->flags & IFA_F_TENTATIVE) { - addrconf_dad_failure(ifp); - return; + ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + if (ifp) { + if (skb->pkt_type != PACKET_LOOPBACK + && (ifp->flags & IFA_F_TENTATIVE)) { + addrconf_dad_failure(ifp); + return; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing about it. It could be misconfiguration, or an smart proxy agent tries to help us :-) + + We should not print the error if NA has been + received from loopback - it is just our own + unsolicited advertisement. */ - ND_PRINTK1(KERN_WARNING - "ICMPv6 NA: someone advertises our address on %s!\n", - ifp->idev->dev->name); + if (skb->pkt_type != PACKET_LOOPBACK) + ND_PRINTK(1, warn, + "NA: someone advertises our address %pI6 on %s!\n", + &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return; } @@ -934,6 +916,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (neigh) { u8 old_flags = neigh->flags; + struct net *net = dev_net(dev); if (neigh->nud_state & NUD_FAILED) goto out; @@ -944,9 +927,9 @@ static void ndisc_recv_na(struct sk_buff *skb) * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && - ipv6_devconf.forwarding && ipv6_devconf.proxy_ndp && - pneigh_lookup(&nd_tbl, &init_net, &msg->target, dev, 0)) { - /* XXX: idev->cnf.prixy_ndp */ + net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && + pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + /* XXX: idev->cnf.proxy_ndp */ goto out; } @@ -961,10 +944,7 @@ static void ndisc_recv_na(struct sk_buff *skb) /* * Change: router to host */ - struct rt6_info *rt; - rt = rt6_get_dflt_router(saddr, dev); - if (rt) - ip6_del_rt(rt); + rt6_clean_tohost(dev_net(dev), saddr); } out: @@ -978,17 +958,16 @@ static void ndisc_recv_rs(struct sk_buff *skb) unsigned long ndoptlen = skb->len - sizeof(*rs_msg); struct neighbour *neigh; struct inet6_dev *idev; - struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; if (skb->len < sizeof(*rs_msg)) return; - idev = in6_dev_get(skb->dev); + idev = __in6_dev_get(skb->dev); if (!idev) { - if (net_ratelimit()) - ND_PRINTK1("ICMP6 RS: can't find in6 device\n"); + ND_PRINTK(1, err, "RS: can't find in6 device\n"); return; } @@ -1005,8 +984,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) /* Parse ND options */ if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { - if (net_ratelimit()) - ND_PRINTK2("ICMP6 NS: invalid ND option, ignored\n"); + ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); goto out; } @@ -1026,7 +1004,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) neigh_release(neigh); } out: - in6_dev_put(idev); + return; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) @@ -1035,6 +1013,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) struct sk_buff *skb; struct nlmsghdr *nlh; struct nduseroptmsg *ndmsg; + struct net *net = dev_net(ra->dev); int err; int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg) + (opt->nd_opt_len << 3)); @@ -1060,22 +1039,19 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); - NLA_PUT(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr), - &ipv6_hdr(ra)->saddr); + if (nla_put(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr), + &ipv6_hdr(ra)->saddr)) + goto nla_put_failure; nlmsg_end(skb, nlh); - err = rtnl_notify(skb, &init_net, 0, RTNLGRP_ND_USEROPT, NULL, - GFP_ATOMIC); - if (err < 0) - goto errout; - + rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); err = -EMSGSIZE; errout: - rtnl_set_sk_err(&init_net, RTNLGRP_ND_USEROPT, err); + rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } static void ndisc_router_discovery(struct sk_buff *skb) @@ -1091,42 +1067,50 @@ static void ndisc_router_discovery(struct sk_buff *skb) __u8 * opt = (__u8 *)(ra_msg + 1); - optlen = (skb->tail - skb->transport_header) - sizeof(struct ra_msg); + optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - + sizeof(struct ra_msg); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: source address is not link-local.\n"); + ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return; } if (optlen < 0) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: packet too short\n"); + ND_PRINTK(2, warn, "RA: packet too short\n"); return; } +#ifdef CONFIG_IPV6_NDISC_NODETYPE + if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { + ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); + return; + } +#endif + /* * set the RA_RECV flag in the interface */ - in6_dev = in6_dev_get(skb->dev); + in6_dev = __in6_dev_get(skb->dev); if (in6_dev == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 RA: can't find inet6 device for %s.\n", - skb->dev->name); - return; - } - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) { - in6_dev_put(in6_dev); + ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", + skb->dev->name); return; } if (!ndisc_parse_options(opt, optlen, &ndopts)) { - in6_dev_put(in6_dev); - ND_PRINTK2(KERN_WARNING - "ICMP6 RA: invalid ND options\n"); + ND_PRINTK(2, warn, "RA: invalid ND options\n"); return; } + if (!ipv6_accept_ra(in6_dev)) + goto skip_linkparms; + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + /* skip link-specific parameters from interior routers */ + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + goto skip_linkparms; +#endif + if (in6_dev->if_flags & IF_RS_SENT) { /* * flag that an RA was received after an RS was sent @@ -1149,6 +1133,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!in6_dev->cnf.accept_ra_defrtr) goto skip_defrtr; + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + goto skip_defrtr; + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); #ifdef CONFIG_IPV6_ROUTER_PREF @@ -1161,49 +1148,52 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); - if (rt) - neigh = rt->rt6i_nexthop; - + if (rt) { + neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + if (!neigh) { + ND_PRINTK(0, err, + "RA: %s got default router without neighbour\n", + __func__); + ip6_rt_put(rt); + return; + } + } if (rt && lifetime == 0) { - neigh_clone(neigh); ip6_del_rt(rt); rt = NULL; } if (rt == NULL && lifetime) { - ND_PRINTK3(KERN_DEBUG - "ICMPv6 RA: adding default router.\n"); + ND_PRINTK(3, dbg, "RA: adding default router\n"); rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); if (rt == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 RA: %s() failed to add default route.\n", - __FUNCTION__); - in6_dev_put(in6_dev); + ND_PRINTK(0, err, + "RA: %s failed to add default route\n", + __func__); return; } - neigh = rt->rt6i_nexthop; + neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); if (neigh == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 RA: %s() got default router without neighbour.\n", - __FUNCTION__); - dst_release(&rt->u.dst); - in6_dev_put(in6_dev); + ND_PRINTK(0, err, + "RA: %s got default router without neighbour\n", + __func__); + ip6_rt_put(rt); return; } neigh->flags |= NTF_ROUTER; } else if (rt) { - rt->rt6i_flags |= (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); } if (rt) - rt->rt6i_expires = jiffies + (HZ * lifetime); - + rt6_set_expires(rt, jiffies + (HZ * lifetime)); if (ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; if (rt) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit; + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } skip_defrtr: @@ -1219,7 +1209,7 @@ skip_defrtr: rtime = (rtime*HZ)/1000; if (rtime < HZ/10) rtime = HZ/10; - in6_dev->nd_parms->retrans_time = rtime; + NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); } @@ -1231,9 +1221,11 @@ skip_defrtr: if (rtime < HZ/10) rtime = HZ/10; - if (rtime != in6_dev->nd_parms->base_reachable_time) { - in6_dev->nd_parms->base_reachable_time = rtime; - in6_dev->nd_parms->gc_staletime = 3 * rtime; + if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) { + NEIGH_VAR_SET(in6_dev->nd_parms, + BASE_REACHABLE_TIME, rtime); + NEIGH_VAR_SET(in6_dev->nd_parms, + GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); @@ -1241,6 +1233,8 @@ skip_defrtr: } } +skip_linkparms: + /* * Process options. */ @@ -1254,8 +1248,8 @@ skip_defrtr: lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: invalid link-layer address length\n"); + ND_PRINTK(2, warn, + "RA: invalid link-layer address length\n"); goto out; } } @@ -1266,18 +1260,41 @@ skip_defrtr: NEIGH_UPDATE_F_ISROUTER); } + if (!ipv6_accept_ra(in6_dev)) + goto out; + #ifdef CONFIG_IPV6_ROUTE_INFO + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + goto skip_routeinfo; + if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { - if (((struct route_info *)p)->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) + struct route_info *ri = (struct route_info *)p; +#ifdef CONFIG_IPV6_NDISC_NODETYPE + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT && + ri->prefix_len == 0) + continue; +#endif + if (ri->prefix_len == 0 && + !in6_dev->cnf.accept_ra_defrtr) + continue; + if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } + +skip_routeinfo: +#endif + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + /* skip link-specific ndopts from interior routers */ + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + goto out; #endif if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { @@ -1285,7 +1302,9 @@ skip_defrtr: for (p = ndopts.nd_opts_pi; p; p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { - addrconf_prefix_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3); + addrconf_prefix_rcv(skb->dev, (u8 *)p, + (p->nd_opt_len) << 3, + ndopts.nd_opts_src_lladdr != NULL); } } @@ -1297,14 +1316,12 @@ skip_defrtr: mtu = ntohl(n); if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: invalid mtu: %d\n", - mtu); + ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; if (rt) - rt->u.dst.metrics[RTAX_MTU-1] = mtu; + dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } @@ -1320,248 +1337,184 @@ skip_defrtr: } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: invalid RA options"); + ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: - if (rt) - dst_release(&rt->u.dst); - else if (neigh) + ip6_rt_put(rt); + if (neigh) neigh_release(neigh); - in6_dev_put(in6_dev); } static void ndisc_redirect_rcv(struct sk_buff *skb) { - struct inet6_dev *in6_dev; - struct icmp6hdr *icmph; - struct in6_addr *dest; - struct in6_addr *target; /* new first hop to destination */ - struct neighbour *neigh; - int on_link = 0; + u8 *hdr; struct ndisc_options ndopts; - int optlen; - u8 *lladdr = NULL; - - if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: source address is not link-local.\n"); + struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + + offsetof(struct rd_msg, opt)); + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + switch (skb->ndisc_nodetype) { + case NDISC_NODETYPE_HOST: + case NDISC_NODETYPE_NODEFAULT: + ND_PRINTK(2, warn, + "Redirect: from host or unauthorized router\n"); return; } +#endif - optlen = skb->tail - skb->transport_header; - optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); - - if (optlen < 0) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: packet too short\n"); + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK(2, warn, + "Redirect: source address is not link-local\n"); return; } - icmph = icmp6_hdr(skb); - target = (struct in6_addr *) (icmph + 1); - dest = target + 1; - - if (ipv6_addr_is_multicast(dest)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: destination address is multicast.\n"); + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) return; - } - if (ipv6_addr_equal(dest, target)) { - on_link = 1; - } else if (ipv6_addr_type(target) != - (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: target address is not link-local unicast.\n"); + if (!ndopts.nd_opts_rh) { + ip6_redirect_no_header(skb, dev_net(skb->dev), + skb->dev->ifindex, 0); return; } - in6_dev = in6_dev_get(skb->dev); - if (!in6_dev) - return; - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) { - in6_dev_put(in6_dev); + hdr = (u8 *)ndopts.nd_opts_rh; + hdr += 8; + if (!pskb_pull(skb, hdr - skb_transport_header(skb))) return; - } - /* RFC2461 8.1: - * The IP source address of the Redirect MUST be the same as the current - * first-hop router for the specified ICMP Destination Address. - */ + icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); +} - if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: invalid ND options\n"); - in6_dev_put(in6_dev); - return; - } - if (ndopts.nd_opts_tgt_lladdr) { - lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, - skb->dev); - if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: invalid link-layer address length\n"); - in6_dev_put(in6_dev); - return; - } - } +static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, + struct sk_buff *orig_skb, + int rd_len) +{ + u8 *opt = skb_put(skb, rd_len); - neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); - if (neigh) { - rt6_redirect(dest, &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr, neigh, lladdr, - on_link); - neigh_release(neigh); - } - in6_dev_put(in6_dev); + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; + + memcpy(opt, ipv6_hdr(orig_skb), rd_len - 8); } -void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, - struct in6_addr *target) +void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { - struct sock *sk = ndisc_socket->sk; - int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + int optlen = 0; + struct inet_peer *peer; struct sk_buff *buff; - struct icmp6hdr *icmph; + struct rd_msg *msg; struct in6_addr saddr_buf; - struct in6_addr *addrp; - struct net_device *dev; struct rt6_info *rt; struct dst_entry *dst; - struct inet6_dev *idev; - struct flowi fl; - u8 *opt; + struct flowi6 fl6; int rd_len; - int err; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; - - dev = skb->dev; + bool ret; if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: no link-local address on %s\n", - dev->name); + ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", + dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: target address is not link-local unicast.\n"); + ND_PRINTK(2, warn, + "Redirect: target address is not link-local unicast\n"); return; } - ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, - dev->ifindex); + icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, + &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); - dst = ip6_route_output(NULL, &fl); - if (dst == NULL) + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); return; - - err = xfrm_lookup(&dst, &fl, NULL, 0); - if (err) + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) return; rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: destination is not a neighbour.\n"); - dst_release(dst); - return; - } - if (!xrlim_allow(dst, 1*HZ)) { - dst_release(dst); - return; + ND_PRINTK(2, warn, + "Redirect: destination is not a neighbour\n"); + goto release; } + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + ret = inet_peer_xrlim_allow(peer, 1*HZ); + if (peer) + inet_putpeer(peer); + if (!ret) + goto release; if (dev->addr_len) { + struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); + if (!neigh) { + ND_PRINTK(2, warn, + "Redirect: no neigh for target address\n"); + goto release; + } + read_lock_bh(&neigh->lock); if (neigh->nud_state & NUD_VALID) { memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; - len += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev); } else read_unlock_bh(&neigh->lock); + + neigh_release(neigh); } rd_len = min_t(unsigned int, - IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8); + IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, + skb->len + 8); rd_len &= ~0x7; - len += rd_len; - - buff = sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_RESERVED_SPACE(dev)), - 1, &err); - if (buff == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 Redirect: %s() failed to allocate an skb.\n", - __FUNCTION__); - dst_release(dst); - return; - } - - - skb_reserve(buff, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, - IPPROTO_ICMPV6, len); - - skb_set_transport_header(buff, skb_tail_pointer(buff) - buff->data); - skb_put(buff, len); - icmph = icmp6_hdr(buff); - - memset(icmph, 0, sizeof(struct icmp6hdr)); - icmph->icmp6_type = NDISC_REDIRECT; - - /* - * copy target and destination addresses - */ - - addrp = (struct in6_addr *)(icmph + 1); - ipv6_addr_copy(addrp, target); - addrp++; - ipv6_addr_copy(addrp, &ipv6_hdr(skb)->daddr); - - opt = (u8*) (addrp + 1); + optlen += rd_len; + + buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!buff) + goto release; + + msg = (struct rd_msg *)skb_put(buff, sizeof(*msg)); + *msg = (struct rd_msg) { + .icmph = { + .icmp6_type = NDISC_REDIRECT, + }, + .target = *target, + .dest = ipv6_hdr(skb)->daddr, + }; /* * include target_address option */ if (ha) - opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha, - dev->addr_len, dev->type); + ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha); /* * build redirect option and copy skb over to the new packet. */ - memset(opt, 0, 8); - *(opt++) = ND_OPT_REDIRECT_HDR; - *(opt++) = (rd_len >> 3); - opt += 6; - - memcpy(opt, ipv6_hdr(skb), rd_len - 8); + if (rd_len) + ndisc_fill_redirect_hdr_option(buff, skb, rd_len); - icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &ipv6_hdr(skb)->saddr, - len, IPPROTO_ICMPV6, - csum_partial((u8 *) icmph, len, 0)); - - buff->dst = dst; - idev = in6_dev_get(dst->dev); - IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, - dst_output); - if (!err) { - ICMP6MSGOUT_INC_STATS(idev, NDISC_REDIRECT); - ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); - } + skb_dst_set(buff, dst); + ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); + return; - if (likely(idev != NULL)) - in6_dev_put(idev); +release: + dst_release(dst); } static void pndisc_redo(struct sk_buff *skb) @@ -1570,11 +1523,28 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) +{ + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return true; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && + idev->cnf.suppress_frag_ndisc) { + net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); + return true; + } + return false; +} + int ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; - if (!pskb_may_pull(skb, skb->len)) + if (ndisc_suppress_frag_ndisc(skb)) + return 0; + + if (skb_linearize(skb)) return 0; msg = (struct nd_msg *)skb_transport_header(skb); @@ -1582,16 +1552,14 @@ int ndisc_rcv(struct sk_buff *skb) __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NDISC: invalid hop-limit: %d\n", - ipv6_hdr(skb)->hop_limit); + ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", + ipv6_hdr(skb)->hop_limit); return 0; } if (msg->icmph.icmp6_code != 0) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NDISC: invalid ICMPv6 code: %d\n", - msg->icmph.icmp6_code); + ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", + msg->icmph.icmp6_code); return 0; } @@ -1624,19 +1592,27 @@ int ndisc_rcv(struct sk_buff *skb) static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; - - if (dev->nd_net != &init_net) - return NOTIFY_DONE; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct inet6_dev *idev; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); - fib6_run_gc(~0UL); + fib6_run_gc(0, net, false); + idev = in6_dev_get(dev); + if (!idev) + break; + if (idev->cnf.ndisc_notify) + ndisc_send_unsol_na(dev); + in6_dev_put(idev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); - fib6_run_gc(~0UL); + fib6_run_gc(0, net, false); + break; + case NETDEV_NOTIFY_PEERS: + ndisc_send_unsol_na(dev); break; default: break; @@ -1657,11 +1633,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); - printk(KERN_WARNING - "process `%s' is using deprecated sysctl (%s) " - "net.ipv6.neigh.%s.%s; " - "Use net.ipv6.neigh.%s.%s_ms " - "instead.\n", + pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, dev_name, ctl->procname); @@ -1669,7 +1641,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; @@ -1680,22 +1652,23 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) - ret = proc_dointvec_jiffies(ctl, write, - filp, buffer, lenp, ppos); + ret = neigh_proc_dointvec_jiffies(ctl, write, + buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) - ret = proc_dointvec_ms_jiffies(ctl, write, - filp, buffer, lenp, ppos); + ret = neigh_proc_dointvec_ms_jiffies(ctl, write, + buffer, lenp, ppos); else ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { - if (ctl->data == &idev->nd_parms->base_reachable_time) - idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) + idev->nd_parms->reachable_time = + neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); idev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); @@ -1703,95 +1676,87 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f return ret; } -static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, - int nlen, void __user *oldval, - size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct net_device *dev = ctl->extra1; - struct inet6_dev *idev; - int ret; - - if (ctl->ctl_name == NET_NEIGH_RETRANS_TIME || - ctl->ctl_name == NET_NEIGH_REACHABLE_TIME) - ndisc_warn_deprecated_sysctl(ctl, "procfs", dev ? dev->name : "default"); - - switch (ctl->ctl_name) { - case NET_NEIGH_REACHABLE_TIME: - ret = sysctl_jiffies(ctl, name, nlen, - oldval, oldlenp, newval, newlen); - break; - case NET_NEIGH_RETRANS_TIME_MS: - case NET_NEIGH_REACHABLE_TIME_MS: - ret = sysctl_ms_jiffies(ctl, name, nlen, - oldval, oldlenp, newval, newlen); - break; - default: - ret = 0; - } - - if (newval && newlen && ret > 0 && - dev && (idev = in6_dev_get(dev)) != NULL) { - if (ctl->ctl_name == NET_NEIGH_REACHABLE_TIME || - ctl->ctl_name == NET_NEIGH_REACHABLE_TIME_MS) - idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); - idev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, idev); - in6_dev_put(idev); - } - - return ret; -} #endif -int __init ndisc_init(struct net_proto_family *ops) +static int __net_init ndisc_net_init(struct net *net) { struct ipv6_pinfo *np; struct sock *sk; int err; - err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &ndisc_socket); + err = inet_ctl_sock_create(&sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - ND_PRINTK0(KERN_ERR - "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n", - err); - ndisc_socket = NULL; /* For safety. */ + ND_PRINTK(0, err, + "NDISC: Failed to initialize the control socket (err %d)\n", + err); return err; } - sk = ndisc_socket->sk; + net->ipv6.ndisc_sk = sk; + np = inet6_sk(sk); - sk->sk_allocation = GFP_ATOMIC; np->hop_limit = 255; /* Do not loopback ndisc messages */ np->mc_loop = 0; - sk->sk_prot->unhash(sk); + return 0; +} + +static void __net_exit ndisc_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.ndisc_sk); +} + +static struct pernet_operations ndisc_net_ops = { + .init = ndisc_net_init, + .exit = ndisc_net_exit, +}; + +int __init ndisc_init(void) +{ + int err; + + err = register_pernet_subsys(&ndisc_net_ops); + if (err) + return err; /* * Initialize the neighbour table */ - neigh_table_init(&nd_tbl); #ifdef CONFIG_SYSCTL - neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, - "ipv6", - &ndisc_ifinfo_sysctl_change, - &ndisc_ifinfo_sysctl_strategy); + err = neigh_sysctl_register(NULL, &nd_tbl.parms, + &ndisc_ifinfo_sysctl_change); + if (err) + goto out_unregister_pernet; +out: #endif + return err; - register_netdevice_notifier(&ndisc_netdev_notifier); - return 0; +#ifdef CONFIG_SYSCTL +out_unregister_pernet: + unregister_pernet_subsys(&ndisc_net_ops); + goto out; +#endif } -void ndisc_cleanup(void) +int __init ndisc_late_init(void) +{ + return register_netdevice_notifier(&ndisc_netdev_notifier); +} + +void ndisc_late_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); +} + +void ndisc_cleanup(void) +{ #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif neigh_table_clear(&nd_tbl); - sock_release(ndisc_socket); - ndisc_socket = NULL; /* For safety. */ + unregister_pernet_subsys(&ndisc_net_ops); } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 2e06724dc34..d38e6a8d8b9 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -1,8 +1,16 @@ +/* + * IPv6 specific functions of netfilter core + * + * Rusty Russell (C) 2000 -- This code is GPL. + * Patrick McHardy (C) 2006-2012 + */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> +#include <linux/export.h> +#include <net/addrconf.h> #include <net/dst.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -12,37 +20,50 @@ int ip6_route_me_harder(struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = dev_net(skb_dst(skb)->dev); + const struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int hh_len; struct dst_entry *dst; - struct flowi fl = { - .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, - .mark = skb->mark, - .nl_u = - { .ip6_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, } }, + struct flowi6 fl6 = { + .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .flowi6_mark = skb->mark, + .daddr = iph->daddr, + .saddr = iph->saddr, }; + int err; - dst = ip6_route_output(skb->sk, &fl); - -#ifdef CONFIG_XFRM - if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, &fl, AF_INET6) == 0) - if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0)) - return -1; -#endif - - if (dst->error) { - IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + dst = ip6_route_output(net, skb->sk, &fl6); + err = dst->error; + if (err) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); dst_release(dst); - return -EINVAL; + return err; } /* Drop old route. */ - dst_release(skb->dst); + skb_dst_drop(skb); + + skb_dst_set(skb, dst); + +#ifdef CONFIG_XFRM + if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { + skb_dst_set(skb, NULL); + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + skb_dst_set(skb, dst); + } +#endif + + /* Change in oif may mean change in hh_len. */ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), + 0, GFP_ATOMIC)) + return -ENOMEM; - skb->dst = dst; return 0; } EXPORT_SYMBOL(ip6_route_me_harder); @@ -55,6 +76,7 @@ EXPORT_SYMBOL(ip6_route_me_harder); struct ip6_rt_info { struct in6_addr daddr; struct in6_addr saddr; + u_int32_t mark; }; static void nf_ip6_saveroute(const struct sk_buff *skb, @@ -63,10 +85,11 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->hook == NF_INET_LOCAL_OUT) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); rt_info->daddr = iph->daddr; rt_info->saddr = iph->saddr; + rt_info->mark = skb->mark; } } @@ -76,24 +99,41 @@ static int nf_ip6_reroute(struct sk_buff *skb, struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->hook == NF_INET_LOCAL_OUT) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || - !ipv6_addr_equal(&iph->saddr, &rt_info->saddr)) + !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || + skb->mark != rt_info->mark) return ip6_route_me_harder(skb); } return 0; } -static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip6_route(struct net *net, struct dst_entry **dst, + struct flowi *fl, bool strict) { - *dst = ip6_route_output(NULL, fl); - return (*dst)->error; + static const struct ipv6_pinfo fake_pinfo; + static const struct inet_sock fake_sk = { + /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */ + .sk.sk_bound_dev_if = 1, + .pinet6 = (struct ipv6_pinfo *) &fake_pinfo, + }; + const void *sk = strict ? &fake_sk : NULL; + struct dst_entry *result; + int err; + + result = ip6_route_output(net, sk, &fl->u.ip6); + err = result->error; + if (err) + dst_release(result); + else + *dst = result; + return err; } __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) { - struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { @@ -121,20 +161,51 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, } return csum; } - EXPORT_SYMBOL(nf_ip6_checksum); +static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum hsum; + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip6_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + hsum = skb_checksum(skb, 0, dataoff, 0); + skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, + &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, hsum))); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +}; + +static const struct nf_ipv6_ops ipv6ops = { + .chk_addr = ipv6_chk_addr, +}; + static const struct nf_afinfo nf_ip6_afinfo = { - .family = AF_INET6, - .checksum = nf_ip6_checksum, - .route = nf_ip6_route, - .saveroute = nf_ip6_saveroute, - .reroute = nf_ip6_reroute, - .route_key_size = sizeof(struct ip6_rt_info), + .family = AF_INET6, + .checksum = nf_ip6_checksum, + .checksum_partial = nf_ip6_checksum_partial, + .route = nf_ip6_route, + .saveroute = nf_ip6_saveroute, + .reroute = nf_ip6_reroute, + .route_key_size = sizeof(struct ip6_rt_info), }; int __init ipv6_netfilter_init(void) { + RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); return nf_register_afinfo(&nf_ip6_afinfo); } @@ -143,5 +214,6 @@ int __init ipv6_netfilter_init(void) */ void ipv6_netfilter_fini(void) { + RCU_INIT_POINTER(nf_ipv6_ops, NULL); nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6cae5475737..4bff1f297e3 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,15 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER +config NF_DEFRAG_IPV6 + tristate + default n + config NF_CONNTRACK_IPV6 tristate "IPv6 connection tracking support" depends on INET && IPV6 && NF_CONNTRACK default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV6 ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -20,27 +25,35 @@ config NF_CONNTRACK_IPV6 To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_QUEUE - tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" - depends on INET && IPV6 && NETFILTER - depends on NETFILTER_ADVANCED - ---help--- - - This option adds a queue handler to the kernel for IPv6 - packets which enables users to receive the filtered packets - with QUEUE target using libipq. - - This option enables the old IPv6-only "ip6_queue" implementation - which has been obsoleted by the new "nfnetlink_queue" code (see - CONFIG_NETFILTER_NETLINK_QUEUE). +config NF_TABLES_IPV6 + depends on NF_TABLES + tristate "IPv6 nf_tables support" + help + This option enables the IPv6 support for nf_tables. - (C) Fernando Anton 2001 - IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. - Universidad Carlos III de Madrid - Universidad Politecnica de Alcala de Henares - email: <fanton@it.uc3m.es>. +config NFT_CHAIN_ROUTE_IPV6 + depends on NF_TABLES_IPV6 + tristate "IPv6 nf_tables route chain support" + help + This option enables the "route" chain for IPv6 in nf_tables. This + chain type is used to force packet re-routing after mangling header + fields such as the source, destination, flowlabel, hop-limit and + the packet mark. + +config NFT_CHAIN_NAT_IPV6 + depends on NF_TABLES_IPV6 + depends on NF_NAT_IPV6 && NFT_NAT + tristate "IPv6 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv6 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. - To compile it as a module, choose M here. If unsure, say N. +config NFT_REJECT_IPV6 + depends on NF_TABLES_IPV6 + default NFT_REJECT + tristate config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" @@ -55,30 +68,29 @@ config IP6_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. +if IP6_NF_IPTABLES + # The simple matches. -config IP6_NF_MATCH_RT - tristate '"rt" Routing header match support' - depends on IP6_NF_IPTABLES +config IP6_NF_MATCH_AH + tristate '"ah" match support' depends on NETFILTER_ADVANCED help - rt matching allows you to match packets based on the routing - header of the packet. + This module allows one to match AH packets. To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_OPTS - tristate '"hopbyhop" and "dst" opts header match support' - depends on IP6_NF_IPTABLES +config IP6_NF_MATCH_EUI64 + tristate '"eui64" address check' depends on NETFILTER_ADVANCED help - This allows one to match packets based on the hop-by-hop - and destination options headers of a packet. + This module performs checking on the IPv6 source address + Compares the last 64 bits with the EUI64 (delivered + from the MAC address) address To compile it as a module, choose M here. If unsure, say N. config IP6_NF_MATCH_FRAG tristate '"frag" Fragmentation header match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help frag matching allows you to match packets based on the fragmentation @@ -86,19 +98,26 @@ config IP6_NF_MATCH_FRAG To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_HL - tristate '"hl" match support' - depends on IP6_NF_IPTABLES +config IP6_NF_MATCH_OPTS + tristate '"hbh" hop-by-hop and "dst" opts header match support' depends on NETFILTER_ADVANCED help - HL matching allows you to match packets based on the hop - limit of the packet. + This allows one to match packets based on the hop-by-hop + and destination options headers of a packet. To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_HL + tristate '"hl" hoplimit match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MATCH_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. + config IP6_NF_MATCH_IPV6HEADER tristate '"ipv6header" IPv6 Extension Headers Match' - depends on IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This module allows one to match packets based upon @@ -106,39 +125,45 @@ config IP6_NF_MATCH_IPV6HEADER To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_AH - tristate '"ah" match support' - depends on IP6_NF_IPTABLES - depends on NETFILTER_ADVANCED - help - This module allows one to match AH packets. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_MATCH_MH tristate '"mh" match support' - depends on IP6_NF_IPTABLES depends on NETFILTER_ADVANCED help This module allows one to match MH packets. To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_MATCH_EUI64 - tristate '"eui64" address check' - depends on IP6_NF_IPTABLES +config IP6_NF_MATCH_RPFILTER + tristate '"rpfilter" reverse path filter match support' + depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) + ---help--- + This option allows you to match packets whose replies would + go out via the interface the packet came in. + + To compile it as a module, choose M here. If unsure, say N. + The module will be called ip6t_rpfilter. + +config IP6_NF_MATCH_RT + tristate '"rt" Routing header match support' depends on NETFILTER_ADVANCED help - This module performs checking on the IPv6 source address - Compares the last 64 bits with the EUI64 (delivered - from the MAC address) address + rt matching allows you to match packets based on the routing + header of the packet. To compile it as a module, choose M here. If unsure, say N. # The targets +config IP6_NF_TARGET_HL + tristate '"HL" hoplimit target support' + depends on NETFILTER_ADVANCED && IP6_NF_MANGLE + select NETFILTER_XT_TARGET_HL + ---help--- + This is a backwards-compatible option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + config IP6_NF_FILTER tristate "Packet filtering" - depends on IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help Packet filtering defines a table `filter', which has a series of @@ -147,16 +172,6 @@ config IP6_NF_FILTER To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_TARGET_LOG - tristate "LOG target support" - depends on IP6_NF_FILTER - default m if NETFILTER_ADVANCED=n - help - This option adds a `LOG' target, which allows you to create rules in - any iptables table which records the packet header to the syslog. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_TARGET_REJECT tristate "REJECT target support" depends on IP6_NF_FILTER @@ -168,9 +183,21 @@ config IP6_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_MANGLE tristate "Packet mangling" - depends on IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help This option adds a `mangle' table to iptables: see the man page for @@ -179,27 +206,8 @@ config IP6_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_TARGET_HL - tristate 'HL (hoplimit) target support' - depends on IP6_NF_MANGLE - depends on NETFILTER_ADVANCED - help - This option adds a `HL' target, which enables the user to decrement - the hoplimit value of the IPv6 header or set it to a given (lower) - value. - - While it is safe to decrement the hoplimit value, this option also - enables functionality to increment and set the hoplimit value of the - IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since - you can easily create immortal packets that loop forever on the - network. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_RAW tristate 'raw table support (required for TRACE)' - depends on IP6_NF_IPTABLES - depends on NETFILTER_ADVANCED help This option adds a `raw' table to ip6tables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -208,5 +216,53 @@ config IP6_NF_RAW If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +# security table for MAC policy +config IP6_NF_SECURITY + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. + +config NF_NAT_IPV6 + tristate "IPv6 NAT" + depends on NF_CONNTRACK_IPV6 + depends on NETFILTER_ADVANCED + select NF_NAT + help + The IPv6 NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. It is controlled by + the `nat' table in ip6tables, see the man page for ip6tables(8). + + To compile it as a module, choose M here. If unsure, say N. + +if NF_NAT_IPV6 + +config IP6_NF_TARGET_MASQUERADE + tristate "MASQUERADE target support" + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_TARGET_NPT + tristate "NPT (Network Prefix translation) target support" + help + This option adds the `SNPT' and `DNPT' target, which perform + stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296. + + To compile it as a module, choose M here. If unsure, say N. + +endif # NF_NAT_IPV6 + +endif # IP6_NF_IPTABLES + endmenu diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index fbf2c14ed88..70d3dd66f2c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -6,26 +6,41 @@ obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o -obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o +obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o +obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o # objects for l3 independent conntrack -nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o +nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o # l3 independent conntrack obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o +nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o +obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o + +# defrag +nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o +obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o + +# nf_tables +obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o +obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o +obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o +obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o + # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o -obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o +obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o # targets -obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o -obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o +obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o +obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o +obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c deleted file mode 100644 index 8d366f7f2a9..00000000000 --- a/net/ipv6/netfilter/ip6_queue.c +++ /dev/null @@ -1,652 +0,0 @@ -/* - * This is a module which is used for queueing IPv6 packets and - * communicating with userspace via netlink. - * - * (C) 2001 Fernando Anton, this code is GPL. - * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. - * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain - * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain - * email: fanton@it.uc3m.es - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/ipv6.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/netlink.h> -#include <linux/spinlock.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> -#include <net/net_namespace.h> -#include <net/sock.h> -#include <net/ipv6.h> -#include <net/ip6_route.h> -#include <net/netfilter/nf_queue.h> -#include <linux/netfilter_ipv4/ip_queue.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv6/ip6_tables.h> - -#define IPQ_QMAX_DEFAULT 1024 -#define IPQ_PROC_FS_NAME "ip6_queue" -#define NET_IPQ_QMAX 2088 -#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" - -typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); - -static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; -static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; -static DEFINE_RWLOCK(queue_lock); -static int peer_pid __read_mostly; -static unsigned int copy_range __read_mostly; -static unsigned int queue_total; -static unsigned int queue_dropped = 0; -static unsigned int queue_user_dropped = 0; -static struct sock *ipqnl __read_mostly; -static LIST_HEAD(queue_list); -static DEFINE_MUTEX(ipqnl_mutex); - -static inline void -__ipq_enqueue_entry(struct nf_queue_entry *entry) -{ - list_add_tail(&entry->list, &queue_list); - queue_total++; -} - -static inline int -__ipq_set_mode(unsigned char mode, unsigned int range) -{ - int status = 0; - - switch(mode) { - case IPQ_COPY_NONE: - case IPQ_COPY_META: - copy_mode = mode; - copy_range = 0; - break; - - case IPQ_COPY_PACKET: - copy_mode = mode; - copy_range = range; - if (copy_range > 0xFFFF) - copy_range = 0xFFFF; - break; - - default: - status = -EINVAL; - - } - return status; -} - -static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); - -static inline void -__ipq_reset(void) -{ - peer_pid = 0; - net_disable_timestamp(); - __ipq_set_mode(IPQ_COPY_NONE, 0); - __ipq_flush(NULL, 0); -} - -static struct nf_queue_entry * -ipq_find_dequeue_entry(unsigned long id) -{ - struct nf_queue_entry *entry = NULL, *i; - - write_lock_bh(&queue_lock); - - list_for_each_entry(i, &queue_list, list) { - if ((unsigned long)i == id) { - entry = i; - break; - } - } - - if (entry) { - list_del(&entry->list); - queue_total--; - } - - write_unlock_bh(&queue_lock); - return entry; -} - -static void -__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ - struct nf_queue_entry *entry, *next; - - list_for_each_entry_safe(entry, next, &queue_list, list) { - if (!cmpfn || cmpfn(entry, data)) { - list_del(&entry->list); - queue_total--; - nf_reinject(entry, NF_DROP); - } - } -} - -static void -ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ - write_lock_bh(&queue_lock); - __ipq_flush(cmpfn, data); - write_unlock_bh(&queue_lock); -} - -static struct sk_buff * -ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) -{ - sk_buff_data_t old_tail; - size_t size = 0; - size_t data_len = 0; - struct sk_buff *skb; - struct ipq_packet_msg *pmsg; - struct nlmsghdr *nlh; - struct timeval tv; - - read_lock_bh(&queue_lock); - - switch (copy_mode) { - case IPQ_COPY_META: - case IPQ_COPY_NONE: - size = NLMSG_SPACE(sizeof(*pmsg)); - data_len = 0; - break; - - case IPQ_COPY_PACKET: - if ((entry->skb->ip_summed == CHECKSUM_PARTIAL || - entry->skb->ip_summed == CHECKSUM_COMPLETE) && - (*errp = skb_checksum_help(entry->skb))) { - read_unlock_bh(&queue_lock); - return NULL; - } - if (copy_range == 0 || copy_range > entry->skb->len) - data_len = entry->skb->len; - else - data_len = copy_range; - - size = NLMSG_SPACE(sizeof(*pmsg) + data_len); - break; - - default: - *errp = -EINVAL; - read_unlock_bh(&queue_lock); - return NULL; - } - - read_unlock_bh(&queue_lock); - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - goto nlmsg_failure; - - old_tail = skb->tail; - nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); - pmsg = NLMSG_DATA(nlh); - memset(pmsg, 0, sizeof(*pmsg)); - - pmsg->packet_id = (unsigned long )entry; - pmsg->data_len = data_len; - tv = ktime_to_timeval(entry->skb->tstamp); - pmsg->timestamp_sec = tv.tv_sec; - pmsg->timestamp_usec = tv.tv_usec; - pmsg->mark = entry->skb->mark; - pmsg->hook = entry->hook; - pmsg->hw_protocol = entry->skb->protocol; - - if (entry->indev) - strcpy(pmsg->indev_name, entry->indev->name); - else - pmsg->indev_name[0] = '\0'; - - if (entry->outdev) - strcpy(pmsg->outdev_name, entry->outdev->name); - else - pmsg->outdev_name[0] = '\0'; - - if (entry->indev && entry->skb->dev) { - pmsg->hw_type = entry->skb->dev->type; - pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); - } - - if (data_len) - if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) - BUG(); - - nlh->nlmsg_len = skb->tail - old_tail; - return skb; - -nlmsg_failure: - if (skb) - kfree_skb(skb); - *errp = -EINVAL; - printk(KERN_ERR "ip6_queue: error creating packet message\n"); - return NULL; -} - -static int -ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) -{ - int status = -EINVAL; - struct sk_buff *nskb; - - if (copy_mode == IPQ_COPY_NONE) - return -EAGAIN; - - nskb = ipq_build_packet_message(entry, &status); - if (nskb == NULL) - return status; - - write_lock_bh(&queue_lock); - - if (!peer_pid) - goto err_out_free_nskb; - - if (queue_total >= queue_maxlen) { - queue_dropped++; - status = -ENOSPC; - if (net_ratelimit()) - printk (KERN_WARNING "ip6_queue: fill at %d entries, " - "dropping packet(s). Dropped: %d\n", queue_total, - queue_dropped); - goto err_out_free_nskb; - } - - /* netlink_unicast will either free the nskb or attach it to a socket */ - status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); - if (status < 0) { - queue_user_dropped++; - goto err_out_unlock; - } - - __ipq_enqueue_entry(entry); - - write_unlock_bh(&queue_lock); - return status; - -err_out_free_nskb: - kfree_skb(nskb); - -err_out_unlock: - write_unlock_bh(&queue_lock); - return status; -} - -static int -ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e) -{ - int diff; - struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload; - struct sk_buff *nskb; - - if (v->data_len < sizeof(*user_iph)) - return 0; - diff = v->data_len - e->skb->len; - if (diff < 0) { - if (pskb_trim(e->skb, v->data_len)) - return -ENOMEM; - } else if (diff > 0) { - if (v->data_len > 0xFFFF) - return -EINVAL; - if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, 0, - diff - skb_tailroom(e->skb), - GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "ip6_queue: OOM " - "in mangle, dropping packet\n"); - return -ENOMEM; - } - kfree_skb(e->skb); - e->skb = nskb; - } - skb_put(e->skb, diff); - } - if (!skb_make_writable(e->skb, v->data_len)) - return -ENOMEM; - skb_copy_to_linear_data(e->skb, v->payload, v->data_len); - e->skb->ip_summed = CHECKSUM_NONE; - - return 0; -} - -static int -ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) -{ - struct nf_queue_entry *entry; - - if (vmsg->value > NF_MAX_VERDICT) - return -EINVAL; - - entry = ipq_find_dequeue_entry(vmsg->id); - if (entry == NULL) - return -ENOENT; - else { - int verdict = vmsg->value; - - if (vmsg->data_len && vmsg->data_len == len) - if (ipq_mangle_ipv6(vmsg, entry) < 0) - verdict = NF_DROP; - - nf_reinject(entry, verdict); - return 0; - } -} - -static int -ipq_set_mode(unsigned char mode, unsigned int range) -{ - int status; - - write_lock_bh(&queue_lock); - status = __ipq_set_mode(mode, range); - write_unlock_bh(&queue_lock); - return status; -} - -static int -ipq_receive_peer(struct ipq_peer_msg *pmsg, - unsigned char type, unsigned int len) -{ - int status = 0; - - if (len < sizeof(*pmsg)) - return -EINVAL; - - switch (type) { - case IPQM_MODE: - status = ipq_set_mode(pmsg->msg.mode.value, - pmsg->msg.mode.range); - break; - - case IPQM_VERDICT: - if (pmsg->msg.verdict.value > NF_MAX_VERDICT) - status = -EINVAL; - else - status = ipq_set_verdict(&pmsg->msg.verdict, - len - sizeof(*pmsg)); - break; - default: - status = -EINVAL; - } - return status; -} - -static int -dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) -{ - if (entry->indev) - if (entry->indev->ifindex == ifindex) - return 1; - - if (entry->outdev) - if (entry->outdev->ifindex == ifindex) - return 1; -#ifdef CONFIG_BRIDGE_NETFILTER - if (entry->skb->nf_bridge) { - if (entry->skb->nf_bridge->physindev && - entry->skb->nf_bridge->physindev->ifindex == ifindex) - return 1; - if (entry->skb->nf_bridge->physoutdev && - entry->skb->nf_bridge->physoutdev->ifindex == ifindex) - return 1; - } -#endif - return 0; -} - -static void -ipq_dev_drop(int ifindex) -{ - ipq_flush(dev_cmp, ifindex); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static inline void -__ipq_rcv_skb(struct sk_buff *skb) -{ - int status, type, pid, flags, nlmsglen, skblen; - struct nlmsghdr *nlh; - - skblen = skb->len; - if (skblen < sizeof(*nlh)) - return; - - nlh = nlmsg_hdr(skb); - nlmsglen = nlh->nlmsg_len; - if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) - return; - - pid = nlh->nlmsg_pid; - flags = nlh->nlmsg_flags; - - if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) - RCV_SKB_FAIL(-EINVAL); - - if (flags & MSG_TRUNC) - RCV_SKB_FAIL(-ECOMM); - - type = nlh->nlmsg_type; - if (type < NLMSG_NOOP || type >= IPQM_MAX) - RCV_SKB_FAIL(-EINVAL); - - if (type <= IPQM_BASE) - return; - - if (security_netlink_recv(skb, CAP_NET_ADMIN)) - RCV_SKB_FAIL(-EPERM); - - write_lock_bh(&queue_lock); - - if (peer_pid) { - if (peer_pid != pid) { - write_unlock_bh(&queue_lock); - RCV_SKB_FAIL(-EBUSY); - } - } else { - net_enable_timestamp(); - peer_pid = pid; - } - - write_unlock_bh(&queue_lock); - - status = ipq_receive_peer(NLMSG_DATA(nlh), type, - nlmsglen - NLMSG_LENGTH(0)); - if (status < 0) - RCV_SKB_FAIL(status); - - if (flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - return; -} - -static void -ipq_rcv_skb(struct sk_buff *skb) -{ - mutex_lock(&ipqnl_mutex); - __ipq_rcv_skb(skb); - mutex_unlock(&ipqnl_mutex); -} - -static int -ipq_rcv_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = ptr; - - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - - /* Drop any packets associated with the downed device */ - if (event == NETDEV_DOWN) - ipq_dev_drop(dev->ifindex); - return NOTIFY_DONE; -} - -static struct notifier_block ipq_dev_notifier = { - .notifier_call = ipq_rcv_dev_event, -}; - -static int -ipq_rcv_nl_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct netlink_notify *n = ptr; - - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_IP6_FW && n->pid) { - write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) - __ipq_reset(); - write_unlock_bh(&queue_lock); - } - return NOTIFY_DONE; -} - -static struct notifier_block ipq_nl_notifier = { - .notifier_call = ipq_rcv_nl_event, -}; - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *ipq_sysctl_header; - -static ctl_table ipq_table[] = { - { - .ctl_name = NET_IPQ_QMAX, - .procname = NET_IPQ_QMAX_NAME, - .data = &queue_maxlen, - .maxlen = sizeof(queue_maxlen), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .ctl_name = 0 } -}; -#endif - -#ifdef CONFIG_PROC_FS -static int ip6_queue_show(struct seq_file *m, void *v) -{ - read_lock_bh(&queue_lock); - - seq_printf(m, - "Peer PID : %d\n" - "Copy mode : %hu\n" - "Copy range : %u\n" - "Queue length : %u\n" - "Queue max. length : %u\n" - "Queue dropped : %u\n" - "Netfilter dropped : %u\n", - peer_pid, - copy_mode, - copy_range, - queue_total, - queue_maxlen, - queue_dropped, - queue_user_dropped); - - read_unlock_bh(&queue_lock); - return 0; -} - -static int ip6_queue_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip6_queue_show, NULL); -} - -static const struct file_operations ip6_queue_proc_fops = { - .open = ip6_queue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; -#endif - -static const struct nf_queue_handler nfqh = { - .name = "ip6_queue", - .outfn = &ipq_enqueue_packet, -}; - -static int __init ip6_queue_init(void) -{ - int status = -ENOMEM; - struct proc_dir_entry *proc __maybe_unused; - - netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, - ipq_rcv_skb, NULL, THIS_MODULE); - if (ipqnl == NULL) { - printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } - -#ifdef CONFIG_PROC_FS - proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, - &ip6_queue_proc_fops); - if (!proc) { - printk(KERN_ERR "ip6_queue: failed to create proc entry\n"); - goto cleanup_ipqnl; - } -#endif - register_netdevice_notifier(&ipq_dev_notifier); -#ifdef CONFIG_SYSCTL - ipq_sysctl_header = register_sysctl_paths(net_ipv6_ctl_path, ipq_table); -#endif - status = nf_register_queue_handler(PF_INET6, &nfqh); - if (status < 0) { - printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); - goto cleanup_sysctl; - } - return status; - -cleanup_sysctl: -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ipq_sysctl_header); -#endif - unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - -cleanup_ipqnl: __maybe_unused - netlink_kernel_release(ipqnl); - mutex_lock(&ipqnl_mutex); - mutex_unlock(&ipqnl_mutex); - -cleanup_netlink_notifier: - netlink_unregister_notifier(&ipq_nl_notifier); - return status; -} - -static void __exit ip6_queue_fini(void) -{ - nf_unregister_queue_handlers(&nfqh); - synchronize_net(); - ipq_flush(NULL, 0); - -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ipq_sysctl_header); -#endif - unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - - netlink_kernel_release(ipqnl); - mutex_lock(&ipqnl_mutex); - mutex_unlock(&ipqnl_mutex); - - netlink_unregister_notifier(&ipq_nl_notifier); -} - -MODULE_DESCRIPTION("IPv6 packet queue handler"); -MODULE_LICENSE("GPL"); - -module_init(ip6_queue_init); -module_exit(ip6_queue_fini); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index bf9bb6e55bb..e080fbbbc0e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -3,12 +3,13 @@ * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/in.h> #include <linux/skbuff.h> @@ -29,6 +30,7 @@ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> +#include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -39,24 +41,19 @@ MODULE_DESCRIPTION("IPv6 packet filter"); /*#define DEBUG_IP_FIREWALL_USER*/ #ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) printk(format , ## args) +#define dprintf(format, args...) pr_info(format , ## args) #else #define dprintf(format, args...) #endif #ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) printk(format , ## args) +#define duprintf(format, args...) pr_info(format , ## args) #else #define duprintf(format, args...) #endif #ifdef CONFIG_NETFILTER_DEBUG -#define IP_NF_ASSERT(x) \ -do { \ - if (!(x)) \ - printk("IP_NF_ASSERT: %s:%s:%u\n", \ - __FUNCTION__, __FILE__, __LINE__); \ -} while(0) +#define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif @@ -67,6 +64,12 @@ do { \ #define inline #endif +void *ip6t_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(ip6t, IP6T); +} +EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); + /* We keep a set of rules for each CPU, so we can avoid write-locking them in the softirq when updating the counters and therefore @@ -76,19 +79,6 @@ do { \ Hence the start of any table is given by get_table() below. */ -/* Check for an extension */ -int -ip6t_ext_hdr(u8 nexthdr) -{ - return ( (nexthdr == IPPROTO_HOPOPTS) || - (nexthdr == IPPROTO_ROUTING) || - (nexthdr == IPPROTO_FRAGMENT) || - (nexthdr == IPPROTO_ESP) || - (nexthdr == IPPROTO_AH) || - (nexthdr == IPPROTO_NONE) || - (nexthdr == IPPROTO_DSTOPTS) ); -} - /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -99,16 +89,15 @@ ip6_packet_match(const struct sk_buff *skb, unsigned int *protoff, int *fragoff, bool *hotdrop) { - size_t i; unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); #define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg))) if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, - &ip6info->src), IP6T_INV_SRCIP) - || FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, - &ip6info->dst), IP6T_INV_DSTIP)) { + &ip6info->src), IP6T_INV_SRCIP) || + FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, + &ip6info->dst), IP6T_INV_DSTIP)) { dprintf("Source or dest mismatch.\n"); /* dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, @@ -120,12 +109,7 @@ ip6_packet_match(const struct sk_buff *skb, return false; } - /* Look for ifname matches; this should unroll nicely. */ - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { - ret |= (((const unsigned long *)indev)[i] - ^ ((const unsigned long *)ip6info->iniface)[i]) - & ((const unsigned long *)ip6info->iniface_mask)[i]; - } + ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { dprintf("VIA in mismatch (%s vs %s).%s\n", @@ -134,11 +118,7 @@ ip6_packet_match(const struct sk_buff *skb, return false; } - for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { - ret |= (((const unsigned long *)outdev)[i] - ^ ((const unsigned long *)ip6info->outiface)[i]) - & ((const unsigned long *)ip6info->outiface_mask)[i]; - } + ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { dprintf("VIA out mismatch (%s vs %s).%s\n", @@ -154,7 +134,7 @@ ip6_packet_match(const struct sk_buff *skb, int protohdr; unsigned short _frag_off; - protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off); + protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL); if (protohdr < 0) { if (_frag_off == 0) *hotdrop = true; @@ -200,59 +180,35 @@ ip6_checkentry(const struct ip6t_ip6 *ipv6) } static unsigned int -ip6t_error(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) { - if (net_ratelimit()) - printk("ip6_tables: error: `%s'\n", (char *)targinfo); + net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } -/* Performance critical - called for every packet */ -static inline bool -do_match(struct ip6t_entry_match *m, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int offset, - unsigned int protoff, - bool *hotdrop) -{ - /* Stop iteration if it doesn't match */ - if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, - offset, protoff, hotdrop)) - return true; - else - return false; -} - static inline struct ip6t_entry * -get_entry(void *base, unsigned int offset) +get_entry(const void *base, unsigned int offset) { return (struct ip6t_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline int -unconditional(const struct ip6t_ip6 *ipv6) +static inline bool unconditional(const struct ip6t_ip6 *ipv6) { - unsigned int i; + static const struct ip6t_ip6 uncond; - for (i = 0; i < sizeof(*ipv6); i++) - if (((char *)ipv6)[i]) - break; + return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; +} - return (i == sizeof(*ipv6)); +static inline const struct xt_entry_target * +ip6t_get_target_c(const struct ip6t_entry *e) +{ + return ip6t_get_target((struct ip6t_entry *)e); } -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* This cries for unification! */ static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", @@ -286,28 +242,28 @@ static struct nf_loginfo trace_loginfo = { /* Mildly perf critical (only if packet tracing is on) */ static inline int -get_chainname_rulenum(struct ip6t_entry *s, struct ip6t_entry *e, - char *hookname, char **chainname, - char **comment, unsigned int *rulenum) +get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, + const char *hookname, const char **chainname, + const char **comment, unsigned int *rulenum) { - struct ip6t_standard_target *t = (void *)ip6t_get_target(s); + const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); - if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) { + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ip6t_entry) - && strcmp(t->target.u.kernel.target->name, - IP6T_STANDARD_TARGET) == 0 - && t->verdict < 0 - && unconditional(&s->ipv6)) { + if (s->target_offset == sizeof(struct ip6t_entry) && + strcmp(t->target.u.kernel.target->name, + XT_STANDARD_TARGET) == 0 && + t->verdict < 0 && + unconditional(&s->ipv6)) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname - ? (char *)comments[NF_IP6_TRACE_COMMENT_POLICY] - : (char *)comments[NF_IP6_TRACE_COMMENT_RETURN]; + ? comments[NF_IP6_TRACE_COMMENT_POLICY] + : comments[NF_IP6_TRACE_COMMENT_RETURN]; } return 1; } else @@ -316,36 +272,44 @@ get_chainname_rulenum(struct ip6t_entry *s, struct ip6t_entry *e, return 0; } -static void trace_packet(struct sk_buff *skb, +static void trace_packet(const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, - struct xt_table_info *private, - struct ip6t_entry *e) + const struct xt_table_info *private, + const struct ip6t_entry *e) { - void *table_base; - struct ip6t_entry *root; - char *hookname, *chainname, *comment; + const void *table_base; + const struct ip6t_entry *root; + const char *hookname, *chainname, *comment; + const struct ip6t_entry *iter; unsigned int rulenum = 0; + struct net *net = dev_net(in ? in : out); - table_base = (void *)private->entries[smp_processor_id()]; + table_base = private->entries[smp_processor_id()]; root = get_entry(table_base, private->hook_entry[hook]); - hookname = chainname = (char *)hooknames[hook]; - comment = (char *)comments[NF_IP6_TRACE_COMMENT_RULE]; + hookname = chainname = hooknames[hook]; + comment = comments[NF_IP6_TRACE_COMMENT_RULE]; - IP6T_ENTRY_ITERATE(root, - private->size - private->hook_entry[hook], - get_chainname_rulenum, - e, hookname, &chainname, &comment, &rulenum); + xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) + if (get_chainname_rulenum(iter, e, hookname, + &chainname, &comment, &rulenum) != 0) + break; - nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo, + nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } #endif +static inline __pure struct ip6t_entry * +ip6t_next_entry(const struct ip6t_entry *entry) +{ + return (void *)entry + entry->next_offset; +} + /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ip6t_do_table(struct sk_buff *skb, @@ -355,15 +319,15 @@ ip6t_do_table(struct sk_buff *skb, struct xt_table *table) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); - int offset = 0; - unsigned int protoff = 0; - bool hotdrop = false; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; - void *table_base; - struct ip6t_entry *e, *back; - struct xt_table_info *private; + const void *table_base; + struct ip6t_entry *e, **jumpstack; + unsigned int *stackptr, origptr, cpu; + const struct xt_table_info *private; + struct xt_action_param acpar; + unsigned int addend; /* Initialization */ indev = in ? in->name : nulldevname; @@ -374,116 +338,112 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + acpar.hotdrop = false; + acpar.in = in; + acpar.out = out; + acpar.family = NFPROTO_IPV6; + acpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + + local_bh_disable(); + addend = xt_write_recseq_begin(); private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; - e = get_entry(table_base, private->hook_entry[hook]); + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); + cpu = smp_processor_id(); + table_base = private->entries[cpu]; + jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); + origptr = *stackptr; - /* For return from builtin chain */ - back = get_entry(table_base, private->underflow[hook]); + e = get_entry(table_base, private->hook_entry[hook]); do { + const struct xt_entry_target *t; + const struct xt_entry_match *ematch; + IP_NF_ASSERT(e); - IP_NF_ASSERT(back); - if (ip6_packet_match(skb, indev, outdev, &e->ipv6, - &protoff, &offset, &hotdrop)) { - struct ip6t_entry_target *t; - - if (IP6T_MATCH_ITERATE(e, do_match, - skb, in, out, - offset, protoff, &hotdrop) != 0) - goto no_match; + acpar.thoff = 0; + if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, + &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { + no_match: + e = ip6t_next_entry(e); + continue; + } - ADD_COUNTER(e->counters, - ntohs(ipv6_hdr(skb)->payload_len) + - sizeof(struct ipv6hdr), 1); + xt_ematch_foreach(ematch, e) { + acpar.match = ematch->u.kernel.match; + acpar.matchinfo = ematch->data; + if (!acpar.match->match(skb, &acpar)) + goto no_match; + } - t = ip6t_get_target(e); - IP_NF_ASSERT(t->u.kernel.target); + ADD_COUNTER(e->counters, skb->len, 1); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) - /* The packet is traced: log it */ - if (unlikely(skb->nf_trace)) - trace_packet(skb, hook, in, out, - table->name, private, e); -#endif - /* Standard target? */ - if (!t->u.kernel.target->target) { - int v; - - v = ((struct ip6t_standard_target *)t)->verdict; - if (v < 0) { - /* Pop from stack? */ - if (v != IP6T_RETURN) { - verdict = (unsigned)(-v) - 1; - break; - } - e = back; - back = get_entry(table_base, - back->comefrom); - continue; - } - if (table_base + v != (void *)e + e->next_offset - && !(e->ipv6.flags & IP6T_F_GOTO)) { - /* Save old back ptr in next entry */ - struct ip6t_entry *next - = (void *)e + e->next_offset; - next->comefrom - = (void *)back - table_base; - /* set back pointer to next entry */ - back = next; - } + t = ip6t_get_target_c(e); + IP_NF_ASSERT(t->u.kernel.target); - e = get_entry(table_base, v); - } else { - /* Targets which reenter must return - abs. verdicts */ -#ifdef CONFIG_NETFILTER_DEBUG - ((struct ip6t_entry *)table_base)->comefrom - = 0xeeeeeeec; +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) + /* The packet is traced: log it */ + if (unlikely(skb->nf_trace)) + trace_packet(skb, hook, in, out, + table->name, private, e); #endif - verdict = t->u.kernel.target->target(skb, - in, out, - hook, - t->u.kernel.target, - t->data); - -#ifdef CONFIG_NETFILTER_DEBUG - if (((struct ip6t_entry *)table_base)->comefrom - != 0xeeeeeeec - && verdict == IP6T_CONTINUE) { - printk("Target %s reentered!\n", - t->u.kernel.target->name); - verdict = NF_DROP; + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct xt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != XT_RETURN) { + verdict = (unsigned int)(-v) - 1; + break; } - ((struct ip6t_entry *)table_base)->comefrom - = 0x57acc001; -#endif - if (verdict == IP6T_CONTINUE) - e = (void *)e + e->next_offset; + if (*stackptr <= origptr) + e = get_entry(table_base, + private->underflow[hook]); else - /* Verdict */ + e = ip6t_next_entry(jumpstack[--*stackptr]); + continue; + } + if (table_base + v != ip6t_next_entry(e) && + !(e->ipv6.flags & IP6T_F_GOTO)) { + if (*stackptr >= private->stacksize) { + verdict = NF_DROP; break; + } + jumpstack[(*stackptr)++] = e; } - } else { - no_match: - e = (void *)e + e->next_offset; + e = get_entry(table_base, v); + continue; } - } while (!hotdrop); -#ifdef CONFIG_NETFILTER_DEBUG - ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; -#endif - read_unlock_bh(&table->lock); + acpar.target = t->u.kernel.target; + acpar.targinfo = t->data; + + verdict = t->u.kernel.target->target(skb, &acpar); + if (verdict == XT_CONTINUE) + e = ip6t_next_entry(e); + else + /* Verdict */ + break; + } while (!acpar.hotdrop); + + *stackptr = origptr; + + xt_write_recseq_end(addend); + local_bh_enable(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else - if (hotdrop) + if (acpar.hotdrop) return NF_DROP; else return verdict; #endif @@ -492,7 +452,7 @@ ip6t_do_table(struct sk_buff *skb, /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -510,26 +470,28 @@ mark_source_chains(struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - struct ip6t_standard_target *t - = (void *)ip6t_get_target(e); + const struct xt_standard_target *t + = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - printk("iptables: loop hook %u pos %u %08X.\n", + pr_err("iptables: loop hook %u pos %u %08X.\n", hook, pos, e->comefrom); return 0; } e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ip6t_entry) - && (strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0) - && t->verdict < 0 - && unconditional(&e->ipv6)) || visited) { + if ((e->target_offset == sizeof(struct ip6t_entry) && + (strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < 0 && + unconditional(&e->ipv6)) || visited) { unsigned int oldpos, size; - if (t->verdict < -NF_MAX_VERDICT - 1) { + if ((strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", t->verdict); @@ -571,8 +533,8 @@ mark_source_chains(struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0 - && newpos >= 0) { + XT_STANDARD_TARGET) == 0 && + newpos >= 0) { if (newpos > newinfo->size - sizeof(struct ip6t_entry)) { duprintf("mark_source_chains: " @@ -599,82 +561,73 @@ mark_source_chains(struct xt_table_info *newinfo, return 1; } -static int -cleanup_match(struct ip6t_entry_match *m, unsigned int *i) +static void cleanup_match(struct xt_entry_match *m, struct net *net) { - if (i && (*i)-- == 0) - return 1; - - if (m->u.kernel.match->destroy) - m->u.kernel.match->destroy(m->u.kernel.match, m->data); - module_put(m->u.kernel.match->me); - return 0; + struct xt_mtdtor_param par; + + par.net = net; + par.match = m->u.kernel.match; + par.matchinfo = m->data; + par.family = NFPROTO_IPV6; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); } static int -check_entry(struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e, const char *name) { - struct ip6t_entry_target *t; + const struct xt_entry_target *t; if (!ip6_checkentry(&e->ipv6)) { duprintf("ip_tables: ip check failed %p %s.\n", e, name); return -EINVAL; } - if (e->target_offset + sizeof(struct ip6t_entry_target) > + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; - t = ip6t_get_target(e); + t = ip6t_get_target_c(e); if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; return 0; } -static int check_match(struct ip6t_entry_match *m, const char *name, - const struct ip6t_ip6 *ipv6, - unsigned int hookmask, unsigned int *i) +static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { - struct xt_match *match; + const struct ip6t_ip6 *ipv6 = par->entryinfo; int ret; - match = m->u.kernel.match; - ret = xt_check_match(match, AF_INET6, m->u.match_size - sizeof(*m), - name, hookmask, ipv6->proto, - ipv6->invflags & IP6T_INV_PROTO); - if (!ret && m->u.kernel.match->checkentry - && !m->u.kernel.match->checkentry(name, ipv6, match, m->data, - hookmask)) { + par->match = m->u.kernel.match; + par->matchinfo = m->data; + + ret = xt_check_match(par, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", - m->u.kernel.match->name); - ret = -EINVAL; + par.match->name); + return ret; } - if (!ret) - (*i)++; - return ret; + return 0; } static int -find_check_match(struct ip6t_entry_match *m, - const char *name, - const struct ip6t_ip6 *ipv6, - unsigned int hookmask, - unsigned int *i) +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; - match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name, - m->u.user.revision), - "ip6t_%s", m->u.user.name); - if (IS_ERR(match) || !match) { + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { duprintf("find_check_match: `%s' not found\n", m->u.user.name); - return match ? PTR_ERR(match) : -ENOENT; + return PTR_ERR(match); } m->u.kernel.match = match; - ret = check_match(m, name, ipv6, hookmask, i); + ret = check_match(m, par); if (ret) goto err; @@ -684,90 +637,118 @@ err: return ret; } -static int check_target(struct ip6t_entry *e, const char *name) +static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { - struct ip6t_entry_target *t; - struct xt_target *target; + struct xt_entry_target *t = ip6t_get_target(e); + struct xt_tgchk_param par = { + .net = net, + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + .family = NFPROTO_IPV6, + }; int ret; t = ip6t_get_target(e); - target = t->u.kernel.target; - ret = xt_check_target(target, AF_INET6, t->u.target_size - sizeof(*t), - name, e->comefrom, e->ipv6.proto, - e->ipv6.invflags & IP6T_INV_PROTO); - if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); + if (ret < 0) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); - ret = -EINVAL; + return ret; } - return ret; + return 0; } static int -find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, + unsigned int size) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; + struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; ret = check_entry(e, name); if (ret) return ret; j = 0; - ret = IP6T_MATCH_ITERATE(e, find_check_match, name, &e->ipv6, - e->comefrom, &j); - if (ret != 0) - goto cleanup_matches; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; + xt_ematch_foreach(ematch, e) { + ret = find_check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } t = ip6t_get_target(e); - target = try_then_request_module(xt_find_target(AF_INET6, - t->u.user.name, - t->u.user.revision), - "ip6t_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("find_check_entry: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; - - (*i)++; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } +static bool check_underflow(const struct ip6t_entry *e) +{ + const struct xt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->ipv6)) + return false; + t = ip6t_get_target_c(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct xt_standard_target *)t)->verdict; + verdict = -verdict - 1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + static int check_entry_size_and_hooks(struct ip6t_entry *e, struct xt_table_info *newinfo, - unsigned char *base, - unsigned char *limit, + const unsigned char *base, + const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, - unsigned int *i) + unsigned int valid_hooks) { unsigned int h; - if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 - || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } if (e->next_offset - < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) { + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; @@ -775,57 +756,59 @@ check_entry_size_and_hooks(struct ip6t_entry *e, /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; - if ((unsigned char *)e - base == underflows[h]) + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } newinfo->underflow[h] = underflows[h]; + } } - /* FIXME: underflows must be unconditional, standard verdicts - < 0 (not IP6T_RETURN). --RR */ - /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; - - (*i)++; return 0; } -static int -cleanup_entry(struct ip6t_entry *e, unsigned int *i) +static void cleanup_entry(struct ip6t_entry *e, struct net *net) { - struct ip6t_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_tgdtor_param par; + struct xt_entry_target *t; + struct xt_entry_match *ematch; /* Cleanup all matches */ - IP6T_MATCH_ITERATE(e, cleanup_match, NULL); + xt_ematch_foreach(ematch, e) + cleanup_match(ematch, net); t = ip6t_get_target(e); - if (t->u.kernel.target->destroy) - t->u.kernel.target->destroy(t->u.kernel.target, t->data); - module_put(t->u.kernel.target->me); - return 0; + + par.net = net; + par.target = t->u.kernel.target; + par.targinfo = t->data; + par.family = NFPROTO_IPV6; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info *newinfo, - void *entry0, - unsigned int size, - unsigned int number, - const unsigned int *hook_entries, - const unsigned int *underflows) +translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, + const struct ip6t_replace *repl) { + struct ip6t_entry *iter; unsigned int i; - int ret; + int ret = 0; - newinfo->size = size; - newinfo->number = number; + newinfo->size = repl->size; + newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -836,49 +819,61 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, - check_entry_size_and_hooks, - newinfo, - entry0, - entry0 + size, - hook_entries, underflows, &i); - if (ret != 0) - return ret; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + return ret; + ++i; + if (strcmp(ip6t_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } - if (i != number) { + if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", - i, number); + i, repl->num_entries); return -EINVAL; } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + i, repl->hook_entry[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", - i, underflows[i]); + i, repl->underflow[i]); return -EINVAL; } } - if (!mark_source_chains(newinfo, valid_hooks, entry0)) + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, net, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } if (ret != 0) { - IP6T_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter, net); + } return ret; } @@ -891,97 +886,65 @@ translate_table(const char *name, return ret; } -/* Gets counters. */ -static inline int -add_entry_to_counter(const struct ip6t_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - -static inline int -set_entry_to_counter(const struct ip6t_entry *e, - struct ip6t_counters total[], - unsigned int *i) -{ - SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { + struct ip6t_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu; - - /* Instead of clearing (by a previous call to memset()) - * the counters and using adds, we set the counters - * with data used by 'current' CPU - * We dont care about preemption here. - */ - curcpu = raw_smp_processor_id(); - - i = 0; - IP6T_ENTRY_ITERATE(t->entries[curcpu], - t->size, - set_entry_to_counter, - counters, - &i); for_each_possible_cpu(cpu) { - if (cpu == curcpu) - continue; + seqcount_t *s = &per_cpu(xt_recseq, cpu); + i = 0; - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[cpu], t->size) { + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqcount_begin(s); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqcount_retry(s, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); + ++i; + } } } -static struct xt_counters *alloc_counters(struct xt_table *table) +static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); - /* First, sum counters... */ - write_lock_bh(&table->lock); get_counters(private, counters); - write_unlock_bh(&table->lock); return counters; } static int copy_entries_to_user(unsigned int total_size, - struct xt_table *table, + const struct xt_table *table, void __user *userptr) { unsigned int off, num; - struct ip6t_entry *e; + const struct ip6t_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1001,8 +964,8 @@ copy_entries_to_user(unsigned int total_size, /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; - struct ip6t_entry_match *m; - struct ip6t_entry_target *t; + const struct xt_entry_match *m; + const struct xt_entry_target *t; e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -1019,7 +982,7 @@ copy_entries_to_user(unsigned int total_size, m = (void *)e + i; if (copy_to_user(userptr + off + i - + offsetof(struct ip6t_entry_match, + + offsetof(struct xt_entry_match, u.user.name), m->u.kernel.match->name, strlen(m->u.kernel.match->name)+1) @@ -1029,9 +992,9 @@ copy_entries_to_user(unsigned int total_size, } } - t = ip6t_get_target(e); + t = ip6t_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct ip6t_entry_target, + + offsetof(struct xt_entry_target, u.user.name), t->u.kernel.target->name, strlen(t->u.kernel.target->name)+1) != 0) { @@ -1046,7 +1009,7 @@ copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -static void compat_standard_from_user(void *dst, void *src) +static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -1055,7 +1018,7 @@ static void compat_standard_from_user(void *dst, void *src) memcpy(dst, &v, sizeof(v)); } -static int compat_standard_to_user(void __user *dst, void *src) +static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; @@ -1064,25 +1027,20 @@ static int compat_standard_to_user(void __user *dst, void *src) return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } -static inline int -compat_calc_match(struct ip6t_entry_match *m, int *size) -{ - *size += xt_compat_match_offset(m->u.kernel.match); - return 0; -} - -static int compat_calc_entry(struct ip6t_entry *e, +static int compat_calc_entry(const struct ip6t_entry *e, const struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) + const void *base, struct xt_table_info *newinfo) { - struct ip6t_entry_target *t; + const struct xt_entry_match *ematch; + const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - base; - IP6T_MATCH_ITERATE(e, compat_calc_match, &off); - t = ip6t_get_target(e); + xt_ematch_foreach(ematch, e) + off += xt_compat_match_offset(ematch->u.kernel.match); + t = ip6t_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); @@ -1103,7 +1061,9 @@ static int compat_calc_entry(struct ip6t_entry *e, static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { + struct ip6t_entry *iter; void *loc_cpu_entry; + int ret; if (!newinfo || !info) return -EINVAL; @@ -1112,15 +1072,20 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; - return IP6T_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, - newinfo); + xt_compat_init_offsets(AF_INET6, info->number); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; } #endif -static int get_info(struct net *net, void __user *user, int *len, int compat) +static int get_info(struct net *net, void __user *user, + const int *len, int compat) { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; @@ -1133,25 +1098,26 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; - name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; + name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(AF_INET6); #endif t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), "ip6table_%s", name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct ip6t_getinfo info; - struct xt_table_info *private = t->private; - + const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); @@ -1178,7 +1144,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) } static int -get_entries(struct net *net, struct ip6t_get_entries __user *uptr, int *len) +get_entries(struct net *net, struct ip6t_get_entries __user *uptr, + const int *len) { int ret; struct ip6t_get_entries get; @@ -1197,7 +1164,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, int *len) } t = xt_find_table_lock(net, AF_INET6, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) @@ -1206,7 +1173,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, int *len) else { duprintf("get_entries: I've got %u not %u!\n", private->size, get.size); - ret = -EINVAL; + ret = -EAGAIN; } module_put(t->me); xt_table_unlock(t); @@ -1225,11 +1192,11 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_old_entry; + const void *loc_cpu_old_entry; + struct ip6t_entry *iter; ret = 0; - counters = vmalloc_node(num_counters * sizeof(struct xt_counters), - numa_node_id()); + counters = vzalloc(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1237,7 +1204,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), "ip6table_%s", name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } @@ -1264,16 +1231,20 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter, net); + xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; @@ -1288,12 +1259,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, void __user *user, unsigned int len) +do_replace(struct net *net, const void __user *user, unsigned int len) { int ret; struct ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ip6t_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1301,6 +1273,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1314,9 +1287,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1329,39 +1300,18 @@ do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static inline int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int -do_add_counters(struct net *net, void __user *user, unsigned int len, +do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1369,9 +1319,11 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int size; void *ptmp; struct xt_table *t; - struct xt_table_info *private; + const struct xt_table_info *private; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; + struct ip6t_entry *iter; + unsigned int addend; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1402,7 +1354,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = vmalloc(len - size); if (!paddc) return -ENOMEM; @@ -1412,12 +1364,13 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, } t = xt_find_table_lock(net, AF_INET6, name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } - write_lock_bh(&t->lock); + + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; @@ -1426,14 +1379,17 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_entry, - private->size, - add_counter_to_entry, - paddc, - &i); + curcpu = smp_processor_id(); + addend = xt_write_recseq_begin(); + loc_cpu_entry = private->entries[curcpu]; + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } + xt_write_recseq_end(addend); + unlock_up_free: - write_unlock_bh(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: @@ -1444,130 +1400,109 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, #ifdef CONFIG_COMPAT struct compat_ip6t_replace { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; - compat_uptr_t counters; /* struct ip6t_counters * */ + compat_uptr_t counters; /* struct xt_counters * */ struct compat_ip6t_entry entries[0]; }; static int compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, - unsigned int *i) + unsigned int i) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct compat_ip6t_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; - int ret; + const struct xt_entry_match *ematch; + int ret = 0; - ret = -EFAULT; origsize = *size; ce = (struct compat_ip6t_entry __user *)*dstptr; - if (copy_to_user(ce, e, sizeof(struct ip6t_entry))) - goto out; - - if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) - goto out; + if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; *dstptr += sizeof(struct compat_ip6t_entry); *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); - ret = IP6T_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_to_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } target_offset = e->target_offset - (origsize - *size); - if (ret) - goto out; t = ip6t_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) - goto out; - ret = -EFAULT; + return ret; next_offset = e->next_offset - (origsize - *size); - if (put_user(target_offset, &ce->target_offset)) - goto out; - if (put_user(next_offset, &ce->next_offset)) - goto out; - - (*i)++; + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; return 0; -out: - return ret; } static int -compat_find_calc_match(struct ip6t_entry_match *m, +compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, unsigned int hookmask, - int *size, unsigned int *i) + int *size) { struct xt_match *match; - match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name, - m->u.user.revision), - "ip6t_%s", m->u.user.name); - if (IS_ERR(match) || !match) { + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { duprintf("compat_check_calc_match: `%s' not found\n", m->u.user.name); - return match ? PTR_ERR(match) : -ENOENT; + return PTR_ERR(match); } m->u.kernel.match = match; *size += xt_compat_match_offset(match); - - (*i)++; return 0; } -static int -compat_release_match(struct ip6t_entry_match *m, unsigned int *i) +static void compat_release_entry(struct compat_ip6t_entry *e) { - if (i && (*i)-- == 0) - return 1; - - module_put(m->u.kernel.match->me); - return 0; -} - -static int -compat_release_entry(struct compat_ip6t_entry *e, unsigned int *i) -{ - struct ip6t_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_entry_target *t; + struct xt_entry_match *ematch; /* Cleanup all matches */ - COMPAT_IP6T_MATCH_ITERATE(e, compat_release_match, NULL); + xt_ematch_foreach(ematch, e) + module_put(ematch->u.kernel.match->me); t = compat_ip6t_get_target(e); module_put(t->u.kernel.target->me); - return 0; } static int check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, struct xt_table_info *newinfo, unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, const char *name) { - struct ip6t_entry_target *t; + struct xt_entry_match *ematch; + struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; int ret, off, h; duprintf("check_compat_entry_size_and_hooks %p\n", e); - if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 - || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1587,20 +1522,21 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - (void *)base; j = 0; - ret = COMPAT_IP6T_MATCH_ITERATE(e, compat_find_calc_match, name, - &e->ipv6, e->comefrom, &off, &j); - if (ret != 0) - goto release_matches; + xt_ematch_foreach(ematch, e) { + ret = compat_find_calc_match(ematch, name, + &e->ipv6, e->comefrom, &off); + if (ret != 0) + goto release_matches; + ++j; + } t = compat_ip6t_get_target(e); - target = try_then_request_module(xt_find_target(AF_INET6, - t->u.user.name, - t->u.user.revision), - "ip6t_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; @@ -1622,14 +1558,16 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, /* Clear counters and comefrom */ memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; - - (*i)++; return 0; out: module_put(t->u.kernel.target->me); release_matches: - IP6T_MATCH_ITERATE(e, compat_release_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + module_put(ematch->u.kernel.match->me); + } return ret; } @@ -1638,11 +1576,11 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, unsigned int *size, const char *name, struct xt_table_info *newinfo, unsigned char *base) { - struct ip6t_entry_target *t; - struct xt_target *target; + struct xt_entry_target *t; struct ip6t_entry *de; unsigned int origsize; int ret, h; + struct xt_entry_match *ematch; ret = 0; origsize = *size; @@ -1653,13 +1591,13 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, *dstptr += sizeof(struct ip6t_entry); *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); - ret = COMPAT_IP6T_MATCH_ITERATE(e, xt_compat_match_from_user, - dstptr, size); - if (ret) - return ret; + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_from_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } de->target_offset = e->target_offset - (origsize - *size); t = compat_ip6t_get_target(e); - target = t->u.kernel.target; xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); @@ -1672,32 +1610,44 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, return ret; } -static int compat_check_entry(struct ip6t_entry *e, const char *name, - unsigned int *i) +static int compat_check_entry(struct ip6t_entry *e, struct net *net, + const char *name) { unsigned int j; - int ret; + int ret = 0; + struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; j = 0; - ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, - e->comefrom, &j); - if (ret) - goto cleanup_matches; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; + xt_ematch_foreach(ematch, e) { + ret = check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; - - (*i)++; return 0; cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1709,8 +1659,10 @@ translate_compat_table(const char *name, unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; + struct compat_ip6t_entry *iter0; + struct ip6t_entry *iter1; unsigned int size; - int ret; + int ret = 0; info = *pinfo; entry0 = *pentry0; @@ -1726,14 +1678,19 @@ translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); + xt_compat_init_offsets(AF_INET6, number); /* Walk through entries, checking offsets. */ - ret = COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); - if (ret != 0) - goto out_unlock; + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } ret = -EINVAL; if (j != number) { @@ -1772,9 +1729,12 @@ translate_compat_table(const char *name, entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; size = total_size; - ret = COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, - &pos, &size, name, newinfo, entry1); + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); if (ret) @@ -1785,13 +1745,35 @@ translate_compat_table(const char *name, goto free_newinfo; i = 0; - ret = IP6T_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = compat_check_entry(iter1, net, name); + if (ret != 0) + break; + ++i; + if (strcmp(ip6t_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. + */ + int skip = i; j -= i; - COMPAT_IP6T_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, - compat_release_entry, &j); - IP6T_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1, net); + } xt_free_table_info(newinfo); return ret; } @@ -1809,7 +1791,11 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } return ret; out_unlock: xt_compat_flush_offsets(AF_INET6); @@ -1824,6 +1810,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) struct compat_ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ip6t_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1833,6 +1820,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1846,7 +1834,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -1862,7 +1850,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1874,16 +1863,16 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_SET_REPLACE: - ret = compat_do_replace(sk->sk_net, user, len); + ret = compat_do_replace(sock_net(sk), user, len); break; case IP6T_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sk->sk_net, user, len, 1); + ret = do_add_counters(sock_net(sk), user, len, 1); break; default: @@ -1895,7 +1884,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, } struct compat_ip6t_get_entries { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ip6t_entry entrytable[0]; }; @@ -1905,12 +1894,13 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; - void *loc_cpu_entry; + const void *loc_cpu_entry; unsigned int i = 0; + struct ip6t_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1923,9 +1913,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - ret = IP6T_ENTRY_ITERATE(loc_cpu_entry, total_size, - compat_copy_entry_to_user, - &pos, &size, counters, &i); + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } vfree(counters); return ret; @@ -1955,8 +1948,8 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); - if (t && !IS_ERR(t)) { - struct xt_table_info *private = t->private; + if (!IS_ERR_OR_NULL(t)) { + const struct xt_table_info *private = t->private; struct xt_table_info info; duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); @@ -1966,7 +1959,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, } else if (!ret) { duprintf("compat_get_entries: I've got %u not %u!\n", private->size, get.size); - ret = -EINVAL; + ret = -EAGAIN; } xt_compat_flush_offsets(AF_INET6); module_put(t->me); @@ -1985,15 +1978,15 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_GET_INFO: - ret = get_info(sk->sk_net, user, len, 1); + ret = get_info(sock_net(sk), user, len, 1); break; case IP6T_SO_GET_ENTRIES: - ret = compat_get_entries(sk->sk_net, user, len); + ret = compat_get_entries(sock_net(sk), user, len); break; default: ret = do_ip6t_get_ctl(sk, cmd, user, len); @@ -2007,16 +2000,16 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_SET_REPLACE: - ret = do_replace(sk->sk_net, user, len); + ret = do_replace(sock_net(sk), user, len); break; case IP6T_SO_SET_ADD_COUNTERS: - ret = do_add_counters(sk->sk_net, user, len, 0); + ret = do_add_counters(sock_net(sk), user, len, 0); break; default: @@ -2032,21 +2025,21 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_GET_INFO: - ret = get_info(sk->sk_net, user, len, 0); + ret = get_info(sock_net(sk), user, len, 0); break; case IP6T_SO_GET_ENTRIES: - ret = get_entries(sk->sk_net, user, len); + ret = get_entries(sock_net(sk), user, len); break; case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { - struct ip6t_get_revision rev; + struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { @@ -2057,6 +2050,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; if (cmd == IP6T_SO_GET_REVISION_TARGET) target = 1; @@ -2078,13 +2072,13 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ip6t_register_table(struct net *net, struct xt_table *table, +struct xt_table *ip6t_register_table(struct net *net, + const struct xt_table *table, const struct ip6t_replace *repl) { int ret; struct xt_table_info *newinfo; - struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; @@ -2098,11 +2092,7 @@ struct xt_table *ip6t_register_table(struct net *net, struct xt_table *table, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, - newinfo, loc_cpu_entry, repl->size, - repl->num_entries, - repl->hook_entry, - repl->underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) goto out_free; @@ -2119,17 +2109,19 @@ out: return ERR_PTR(ret); } -void ip6t_unregister_table(struct xt_table *table) +void ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; + struct ip6t_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); @@ -2146,29 +2138,23 @@ icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, } static bool -icmp6_match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) { - struct icmp6hdr _icmph, *ic; - const struct ip6t_icmp *icmpinfo = matchinfo; + const struct icmp6hdr *ic; + struct icmp6hdr _icmph; + const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (ic == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ICMP tinygram.\n"); - *hotdrop = true; + par->hotdrop = true; return false; } @@ -2180,36 +2166,32 @@ icmp6_match(const struct sk_buff *skb, } /* Called when user tries to insert an entry of this type. */ -static bool -icmp6_checkentry(const char *tablename, - const void *entry, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static int icmp6_checkentry(const struct xt_mtchk_param *par) { - const struct ip6t_icmp *icmpinfo = matchinfo; + const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must specify no unknown invflags */ - return !(icmpinfo->invflags & ~IP6T_ICMP_INV); + return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; } /* The built-in targets: standard (NULL) and error. */ -static struct xt_target ip6t_standard_target __read_mostly = { - .name = IP6T_STANDARD_TARGET, - .targetsize = sizeof(int), - .family = AF_INET6, +static struct xt_target ip6t_builtin_tg[] __read_mostly = { + { + .name = XT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_IPV6, #ifdef CONFIG_COMPAT - .compatsize = sizeof(compat_int_t), - .compat_from_user = compat_standard_from_user, - .compat_to_user = compat_standard_to_user, + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, #endif -}; - -static struct xt_target ip6t_error_target __read_mostly = { - .name = IP6T_ERROR_TARGET, - .target = ip6t_error, - .targetsize = IP6T_FUNCTION_MAXNAMELEN, - .family = AF_INET6, + }, + { + .name = XT_ERROR_TARGET, + .target = ip6t_error, + .targetsize = XT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_IPV6, + }, }; static struct nf_sockopt_ops ip6t_sockopts = { @@ -2229,23 +2211,25 @@ static struct nf_sockopt_ops ip6t_sockopts = { .owner = THIS_MODULE, }; -static struct xt_match icmp6_matchstruct __read_mostly = { - .name = "icmp6", - .match = icmp6_match, - .matchsize = sizeof(struct ip6t_icmp), - .checkentry = icmp6_checkentry, - .proto = IPPROTO_ICMPV6, - .family = AF_INET6, +static struct xt_match ip6t_builtin_mt[] __read_mostly = { + { + .name = "icmp6", + .match = icmp6_match, + .matchsize = sizeof(struct ip6t_icmp), + .checkentry = icmp6_checkentry, + .proto = IPPROTO_ICMPV6, + .family = NFPROTO_IPV6, + }, }; static int __net_init ip6_tables_net_init(struct net *net) { - return xt_proto_init(net, AF_INET6); + return xt_proto_init(net, NFPROTO_IPV6); } static void __net_exit ip6_tables_net_exit(struct net *net) { - xt_proto_fini(net, AF_INET6); + xt_proto_fini(net, NFPROTO_IPV6); } static struct pernet_operations ip6_tables_net_ops = { @@ -2261,14 +2245,11 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err1; - /* Noone else will be downing sem now, so we won't sleep */ - ret = xt_register_target(&ip6t_standard_target); + /* No one else will be downing sem now, so we won't sleep */ + ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); if (ret < 0) goto err2; - ret = xt_register_target(&ip6t_error_target); - if (ret < 0) - goto err3; - ret = xt_register_match(&icmp6_matchstruct); + ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); if (ret < 0) goto err4; @@ -2277,15 +2258,13 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err5; - printk(KERN_INFO "ip6_tables: (C) 2000-2006 Netfilter Core Team\n"); + pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: - xt_unregister_match(&icmp6_matchstruct); + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); err4: - xt_unregister_target(&ip6t_error_target); -err3: - xt_unregister_target(&ip6t_standard_target); + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); err2: unregister_pernet_subsys(&ip6_tables_net_ops); err1: @@ -2296,95 +2275,14 @@ static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); - xt_unregister_match(&icmp6_matchstruct); - xt_unregister_target(&ip6t_error_target); - xt_unregister_target(&ip6t_standard_target); - + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); unregister_pernet_subsys(&ip6_tables_net_ops); } -/* - * find the offset to specified header or the protocol number of last header - * if target < 0. "last header" is transport protocol header, ESP, or - * "No next header". - * - * If target header is found, its offset is set in *offset and return protocol - * number. Otherwise, return -1. - * - * If the first fragment doesn't contain the final protocol header or - * NEXTHDR_NONE it is considered invalid. - * - * Note that non-1st fragment is special case that "the protocol number - * of last header" is "next header" field in Fragment header. In this case, - * *offset is meaningless and fragment offset is stored in *fragoff if fragoff - * isn't NULL. - * - */ -int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, - int target, unsigned short *fragoff) -{ - unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len = skb->len - start; - - if (fragoff) - *fragoff = 0; - - while (nexthdr != target) { - struct ipv6_opt_hdr _hdr, *hp; - unsigned int hdrlen; - - if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { - if (target < 0) - break; - return -ENOENT; - } - - hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); - if (hp == NULL) - return -EBADMSG; - if (nexthdr == NEXTHDR_FRAGMENT) { - unsigned short _frag_off; - __be16 *fp; - fp = skb_header_pointer(skb, - start+offsetof(struct frag_hdr, - frag_off), - sizeof(_frag_off), - &_frag_off); - if (fp == NULL) - return -EBADMSG; - - _frag_off = ntohs(*fp) & ~0x7; - if (_frag_off) { - if (target < 0 && - ((!ipv6_ext_hdr(hp->nexthdr)) || - hp->nexthdr == NEXTHDR_NONE)) { - if (fragoff) - *fragoff = _frag_off; - return hp->nexthdr; - } - return -ENOENT; - } - hdrlen = 8; - } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hp->hdrlen + 2) << 2; - else - hdrlen = ipv6_optlen(hp); - - nexthdr = hp->nexthdr; - len -= hdrlen; - start += hdrlen; - } - - *offset = start; - return nexthdr; -} - EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table); EXPORT_SYMBOL(ip6t_do_table); -EXPORT_SYMBOL(ip6t_ext_hdr); -EXPORT_SYMBOL(ipv6_find_hdr); module_init(ip6_tables_init); module_exit(ip6_tables_fini); diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c deleted file mode 100644 index d5f8fd5f29d..00000000000 --- a/net/ipv6/netfilter/ip6t_HL.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Hop Limit modification target for ip6tables - * Maciej Soltysiak <solt@dns.toxicfilms.tv> - * Based on HW's TTL module - * - * This software is distributed under the terms of GNU GPL - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/ipv6.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv6/ip6t_HL.h> - -MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); -MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field modification target"); -MODULE_LICENSE("GPL"); - -static unsigned int -hl_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) -{ - struct ipv6hdr *ip6h; - const struct ip6t_HL_info *info = targinfo; - int new_hl; - - if (!skb_make_writable(skb, skb->len)) - return NF_DROP; - - ip6h = ipv6_hdr(skb); - - switch (info->mode) { - case IP6T_HL_SET: - new_hl = info->hop_limit; - break; - case IP6T_HL_INC: - new_hl = ip6h->hop_limit + info->hop_limit; - if (new_hl > 255) - new_hl = 255; - break; - case IP6T_HL_DEC: - new_hl = ip6h->hop_limit - info->hop_limit; - if (new_hl < 0) - new_hl = 0; - break; - default: - new_hl = ip6h->hop_limit; - break; - } - - ip6h->hop_limit = new_hl; - - return XT_CONTINUE; -} - -static bool -hl_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) -{ - const struct ip6t_HL_info *info = targinfo; - - if (info->mode > IP6T_HL_MAXMODE) { - printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", - info->mode); - return false; - } - if (info->mode != IP6T_HL_SET && info->hop_limit == 0) { - printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't " - "make sense with value 0\n"); - return false; - } - return true; -} - -static struct xt_target hl_tg6_reg __read_mostly = { - .name = "HL", - .family = AF_INET6, - .target = hl_tg6, - .targetsize = sizeof(struct ip6t_HL_info), - .table = "mangle", - .checkentry = hl_tg6_check, - .me = THIS_MODULE -}; - -static int __init hl_tg6_init(void) -{ - return xt_register_target(&hl_tg6_reg); -} - -static void __exit hl_tg6_exit(void) -{ - xt_unregister_target(&hl_tg6_reg); -} - -module_init(hl_tg6_init); -module_exit(hl_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c deleted file mode 100644 index 86a613810b6..00000000000 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ /dev/null @@ -1,505 +0,0 @@ -/* - * This is a module which is used for logging packets. - */ - -/* (C) 2001 Jan Rekorajski <baggins@pld.org.pl> - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <linux/spinlock.h> -#include <linux/icmpv6.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/ipv6.h> -#include <linux/netfilter.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv6/ip6_tables.h> -#include <net/netfilter/nf_log.h> - -MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); -MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog"); -MODULE_LICENSE("GPL"); - -struct in_device; -#include <net/route.h> -#include <linux/netfilter_ipv6/ip6t_LOG.h> - -/* Use lock to serialize, so printks don't overlap */ -static DEFINE_SPINLOCK(log_lock); - -/* One level of recursion won't kill us */ -static void dump_packet(const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int ip6hoff, - int recurse) -{ - u_int8_t currenthdr; - int fragment; - struct ipv6hdr _ip6h; - const struct ipv6hdr *ih; - unsigned int ptr; - unsigned int hdrlen = 0; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_MASK; - - ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); - if (ih == NULL) { - printk("TRUNCATED"); - return; - } - - /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ - printk("SRC=" NIP6_FMT " DST=" NIP6_FMT " ", NIP6(ih->saddr), NIP6(ih->daddr)); - - /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", - ntohs(ih->payload_len) + sizeof(struct ipv6hdr), - (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, - ih->hop_limit, - (ntohl(*(__be32 *)ih) & 0x000fffff)); - - fragment = 0; - ptr = ip6hoff + sizeof(struct ipv6hdr); - currenthdr = ih->nexthdr; - while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { - struct ipv6_opt_hdr _hdr; - const struct ipv6_opt_hdr *hp; - - hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - if (hp == NULL) { - printk("TRUNCATED"); - return; - } - - /* Max length: 48 "OPT (...) " */ - if (logflags & IP6T_LOG_IPOPT) - printk("OPT ( "); - - switch (currenthdr) { - case IPPROTO_FRAGMENT: { - struct frag_hdr _fhdr; - const struct frag_hdr *fh; - - printk("FRAG:"); - fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), - &_fhdr); - if (fh == NULL) { - printk("TRUNCATED "); - return; - } - - /* Max length: 6 "65535 " */ - printk("%u ", ntohs(fh->frag_off) & 0xFFF8); - - /* Max length: 11 "INCOMPLETE " */ - if (fh->frag_off & htons(0x0001)) - printk("INCOMPLETE "); - - printk("ID:%08x ", ntohl(fh->identification)); - - if (ntohs(fh->frag_off) & 0xFFF8) - fragment = 1; - - hdrlen = 8; - - break; - } - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_HOPOPTS: - if (fragment) { - if (logflags & IP6T_LOG_IPOPT) - printk(")"); - return; - } - hdrlen = ipv6_optlen(hp); - break; - /* Max Length */ - case IPPROTO_AH: - if (logflags & IP6T_LOG_IPOPT) { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - /* Max length: 3 "AH " */ - printk("AH "); - - if (fragment) { - printk(")"); - return; - } - - ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), - &_ahdr); - if (ah == NULL) { - /* - * Max length: 26 "INCOMPLETE [65535 - * bytes] )" - */ - printk("INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 15 "SPI=0xF1234567 */ - printk("SPI=0x%x ", ntohl(ah->spi)); - - } - - hdrlen = (hp->hdrlen+2)<<2; - break; - case IPPROTO_ESP: - if (logflags & IP6T_LOG_IPOPT) { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 4 "ESP " */ - printk("ESP "); - - if (fragment) { - printk(")"); - return; - } - - /* - * Max length: 26 "INCOMPLETE [65535 bytes] )" - */ - eh = skb_header_pointer(skb, ptr, sizeof(_esph), - &_esph); - if (eh == NULL) { - printk("INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 16 "SPI=0xF1234567 )" */ - printk("SPI=0x%x )", ntohl(eh->spi) ); - - } - return; - default: - /* Max length: 20 "Unknown Ext Hdr 255" */ - printk("Unknown Ext Hdr %u", currenthdr); - return; - } - if (logflags & IP6T_LOG_IPOPT) - printk(") "); - - currenthdr = hp->nexthdr; - ptr += hdrlen; - } - - switch (currenthdr) { - case IPPROTO_TCP: { - struct tcphdr _tcph; - const struct tcphdr *th; - - /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); - if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", - ntohs(th->source), ntohs(th->dest)); - /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & IP6T_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", - ntohl(th->seq), ntohl(th->ack_seq)); - /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); - /* Max length: 9 "RES=0x3C " */ - printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (th->cwr) - printk("CWR "); - if (th->ece) - printk("ECE "); - if (th->urg) - printk("URG "); - if (th->ack) - printk("ACK "); - if (th->psh) - printk("PSH "); - if (th->rst) - printk("RST "); - if (th->syn) - printk("SYN "); - if (th->fin) - printk("FIN "); - /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); - - if ((logflags & IP6T_LOG_TCPOPT) - && th->doff * 4 > sizeof(struct tcphdr)) { - u_int8_t _opt[60 - sizeof(struct tcphdr)]; - const u_int8_t *op; - unsigned int i; - unsigned int optsize = th->doff * 4 - - sizeof(struct tcphdr); - - op = skb_header_pointer(skb, - ptr + sizeof(struct tcphdr), - optsize, _opt); - if (op == NULL) { - printk("OPT (TRUNCATED)"); - return; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); - for (i =0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); - } - break; - } - case IPPROTO_UDP: - case IPPROTO_UDPLITE: { - struct udphdr _udph; - const struct udphdr *uh; - - if (currenthdr == IPPROTO_UDP) - /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); - else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); - if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", - ntohs(uh->source), ntohs(uh->dest), - ntohs(uh->len)); - break; - } - case IPPROTO_ICMPV6: { - struct icmp6hdr _icmp6h; - const struct icmp6hdr *ic; - - /* Max length: 13 "PROTO=ICMPv6 " */ - printk("PROTO=ICMPv6 "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); - if (ic == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); - - switch (ic->icmp6_type) { - case ICMPV6_ECHO_REQUEST: - case ICMPV6_ECHO_REPLY: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", - ntohs(ic->icmp6_identifier), - ntohs(ic->icmp6_sequence)); - break; - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - break; - - case ICMPV6_PARAMPROB: - /* Max length: 17 "POINTER=ffffffff " */ - printk("POINTER=%08x ", ntohl(ic->icmp6_pointer)); - /* Fall through */ - case ICMPV6_DEST_UNREACH: - case ICMPV6_PKT_TOOBIG: - case ICMPV6_TIME_EXCEED: - /* Max length: 3+maxlen */ - if (recurse) { - printk("["); - dump_packet(info, skb, ptr + sizeof(_icmp6h), - 0); - printk("] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) - printk("MTU=%u ", ntohl(ic->icmp6_mtu)); - } - break; - } - /* Max length: 10 "PROTO=255 " */ - default: - printk("PROTO=%u ", currenthdr); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u", - skb->sk->sk_socket->file->f_uid, - skb->sk->sk_socket->file->f_gid); - read_unlock_bh(&skb->sk->sk_callback_lock); - } -} - -static struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = 0, - .logflags = NF_LOG_MASK, - }, - }, -}; - -static void -ip6t_log_packet(unsigned int pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - if (!loginfo) - loginfo = &default_loginfo; - - spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, - prefix, - in ? in->name : "", - out ? out->name : ""); - if (in && !out) { - unsigned int len; - /* MAC logging for input chain only. */ - printk("MAC="); - if (skb->dev && (len = skb->dev->hard_header_len) && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - int i; - - if (skb->dev->type == ARPHRD_SIT && - (p -= ETH_HLEN) < skb->head) - p = NULL; - - if (p != NULL) { - for (i = 0; i < len; i++) - printk("%02x%s", p[i], - i == len - 1 ? "" : ":"); - } - printk(" "); - - if (skb->dev->type == ARPHRD_SIT) { - const struct iphdr *iph = - (struct iphdr *)skb_mac_header(skb); - printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ", - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - } else - printk(" "); - } - - dump_packet(loginfo, skb, skb_network_offset(skb), 1); - printk("\n"); - spin_unlock_bh(&log_lock); -} - -static unsigned int -log_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) -{ - const struct ip6t_log_info *loginfo = targinfo; - struct nf_loginfo li; - - li.type = NF_LOG_TYPE_LOG; - li.u.log.level = loginfo->level; - li.u.log.logflags = loginfo->logflags; - - ip6t_log_packet(PF_INET6, hooknum, skb, in, out, &li, loginfo->prefix); - return XT_CONTINUE; -} - - -static bool -log_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) -{ - const struct ip6t_log_info *loginfo = targinfo; - - if (loginfo->level >= 8) { - pr_debug("LOG: level %u >= 8\n", loginfo->level); - return false; - } - if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { - pr_debug("LOG: prefix term %i\n", - loginfo->prefix[sizeof(loginfo->prefix)-1]); - return false; - } - return true; -} - -static struct xt_target log_tg6_reg __read_mostly = { - .name = "LOG", - .family = AF_INET6, - .target = log_tg6, - .targetsize = sizeof(struct ip6t_log_info), - .checkentry = log_tg6_check, - .me = THIS_MODULE, -}; - -static const struct nf_logger ip6t_logger = { - .name = "ip6t_LOG", - .logfn = &ip6t_log_packet, - .me = THIS_MODULE, -}; - -static int __init log_tg6_init(void) -{ - int ret; - - ret = xt_register_target(&log_tg6_reg); - if (ret < 0) - return ret; - nf_log_register(PF_INET6, &ip6t_logger); - return 0; -} - -static void __exit log_tg6_exit(void) -{ - nf_log_unregister(&ip6t_logger); - xt_unregister_target(&log_tg6_reg); -} - -module_init(log_tg6_init); -module_exit(log_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c new file mode 100644 index 00000000000..3e4e92d5e15 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6 + * NAT funded by Astaro. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> +#include <net/addrconf.h> +#include <net/ipv6.h> + +static unsigned int +masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct in6_addr src; + struct nf_conn *ct; + struct nf_nat_range newrange; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + if (ipv6_dev_get_saddr(dev_net(par->out), par->out, + &ipv6_hdr(skb)->daddr, 0, &src) < 0) + return NF_DROP; + + nfct_nat(ct)->masq_index = par->out->ifindex; + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = src; + newrange.max_addr.in6 = src; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} + +static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) + return -EINVAL; + return 0; +} + +static int device_cmp(struct nf_conn *ct, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, ifa->idev->dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static struct xt_target masquerade_tg6_reg __read_mostly = { + .name = "MASQUERADE", + .family = NFPROTO_IPV6, + .checkentry = masquerade_tg6_checkentry, + .target = masquerade_tg6, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .me = THIS_MODULE, +}; + +static int __init masquerade_tg6_init(void) +{ + int err; + + err = xt_register_target(&masquerade_tg6_reg); + if (err == 0) { + register_netdevice_notifier(&masq_dev_notifier); + register_inet6addr_notifier(&masq_inet_notifier); + } + + return err; +} +static void __exit masquerade_tg6_exit(void) +{ + unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_netdevice_notifier(&masq_dev_notifier); + xt_unregister_target(&masquerade_tg6_reg); +} + +module_init(masquerade_tg6_init); +module_exit(masquerade_tg6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: automatic address SNAT"); diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c new file mode 100644 index 00000000000..590f767db5d --- /dev/null +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6t_NPT.h> +#include <linux/netfilter/x_tables.h> + +static int ip6t_npt_checkentry(const struct xt_tgchk_param *par) +{ + struct ip6t_npt_tginfo *npt = par->targinfo; + struct in6_addr pfx; + __wsum src_sum, dst_sum; + + if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64) + return -EINVAL; + + /* Ensure that LSB of prefix is zero */ + ipv6_addr_prefix(&pfx, &npt->src_pfx.in6, npt->src_pfx_len); + if (!ipv6_addr_equal(&pfx, &npt->src_pfx.in6)) + return -EINVAL; + ipv6_addr_prefix(&pfx, &npt->dst_pfx.in6, npt->dst_pfx_len); + if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6)) + return -EINVAL; + + src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0); + dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0); + + npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum)); + return 0; +} + +static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt, + struct in6_addr *addr) +{ + unsigned int pfx_len; + unsigned int i, idx; + __be32 mask; + __sum16 sum; + + pfx_len = max(npt->src_pfx_len, npt->dst_pfx_len); + for (i = 0; i < pfx_len; i += 32) { + if (pfx_len - i >= 32) + mask = 0; + else + mask = htonl((1 << (i - pfx_len + 32)) - 1); + + idx = i / 32; + addr->s6_addr32[idx] &= mask; + addr->s6_addr32[idx] |= ~mask & npt->dst_pfx.in6.s6_addr32[idx]; + } + + if (pfx_len <= 48) + idx = 3; + else { + for (idx = 4; idx < ARRAY_SIZE(addr->s6_addr16); idx++) { + if ((__force __sum16)addr->s6_addr16[idx] != + CSUM_MANGLED_0) + break; + } + if (idx == ARRAY_SIZE(addr->s6_addr16)) + return false; + } + + sum = ~csum_fold(csum_add(csum_unfold((__force __sum16)addr->s6_addr16[idx]), + csum_unfold(npt->adjustment))); + if (sum == CSUM_MANGLED_0) + sum = 0; + *(__force __sum16 *)&addr->s6_addr16[idx] = sum; + + return true; +} + +static unsigned int +ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ip6t_npt_tginfo *npt = par->targinfo; + + if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) { + icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, saddr)); + return NF_DROP; + } + return XT_CONTINUE; +} + +static unsigned int +ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ip6t_npt_tginfo *npt = par->targinfo; + + if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) { + icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, daddr)); + return NF_DROP; + } + return XT_CONTINUE; +} + +static struct xt_target ip6t_npt_target_reg[] __read_mostly = { + { + .name = "SNPT", + .table = "mangle", + .target = ip6t_snpt_tg, + .targetsize = sizeof(struct ip6t_npt_tginfo), + .checkentry = ip6t_npt_checkentry, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "DNPT", + .table = "mangle", + .target = ip6t_dnpt_tg, + .targetsize = sizeof(struct ip6t_npt_tginfo), + .checkentry = ip6t_npt_checkentry, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, +}; + +static int __init ip6t_npt_init(void) +{ + return xt_register_targets(ip6t_npt_target_reg, + ARRAY_SIZE(ip6t_npt_target_reg)); +} + +static void __exit ip6t_npt_exit(void) +{ + xt_unregister_targets(ip6t_npt_target_reg, + ARRAY_SIZE(ip6t_npt_target_reg)); +} + +module_init(ip6t_npt_init); +module_exit(ip6t_npt_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6-to-IPv6 Network Prefix Translation (RFC 6296)"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ip6t_SNPT"); +MODULE_ALIAS("ip6t_DNPT"); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index b23baa635fe..544b0a9da1b 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -7,6 +7,8 @@ * Authors: * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> * + * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net> + * * Based on net/ipv4/netfilter/ipt_REJECT.c * * This program is free software; you can redistribute it and/or @@ -14,229 +16,85 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/gfp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <linux/netdevice.h> -#include <net/ipv6.h> -#include <net/tcp.h> #include <net/icmp.h> -#include <net/ip6_checksum.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> #include <net/flow.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_REJECT.h> +#include <net/netfilter/ipv6/nf_reject.h> + MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); -/* Send RST reply */ -static void send_reset(struct sk_buff *oldskb) -{ - struct sk_buff *nskb; - struct tcphdr otcph, *tcph; - unsigned int otcplen, hh_len; - int tcphoff, needs_ack; - struct ipv6hdr *oip6h = ipv6_hdr(oldskb), *ip6h; - struct dst_entry *dst = NULL; - u8 proto; - struct flowi fl; - - if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || - (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { - pr_debug("ip6t_REJECT: addr is not unicast.\n"); - return; - } - - proto = oip6h->nexthdr; - tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto); - - if ((tcphoff < 0) || (tcphoff > oldskb->len)) { - pr_debug("ip6t_REJECT: Can't get TCP header.\n"); - return; - } - - otcplen = oldskb->len - tcphoff; - - /* IP header checks: fragment, too short. */ - if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { - pr_debug("ip6t_REJECT: proto(%d) != IPPROTO_TCP, " - "or too short. otcplen = %d\n", - proto, otcplen); - return; - } - - if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) - BUG(); - - /* No RST for RST. */ - if (otcph.rst) { - pr_debug("ip6t_REJECT: RST is set\n"); - return; - } - - /* Check checksum. */ - if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP, - skb_checksum(oldskb, tcphoff, otcplen, 0))) { - pr_debug("ip6t_REJECT: TCP checksum is invalid\n"); - return; - } - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr); - ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr); - fl.fl_ip_sport = otcph.dest; - fl.fl_ip_dport = otcph.source; - security_skb_classify_flow(oldskb, &fl); - dst = ip6_route_output(NULL, &fl); - if (dst == NULL) - return; - if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0)) - return; - - hh_len = (dst->dev->hard_header_len + 15)&~15; - nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) + dst->trailer_len, - GFP_ATOMIC); - - if (!nskb) { - if (net_ratelimit()) - printk("ip6t_REJECT: Can't alloc skb\n"); - dst_release(dst); - return; - } - - nskb->dst = dst; - - skb_reserve(nskb, hh_len + dst->header_len); - - skb_put(nskb, sizeof(struct ipv6hdr)); - skb_reset_network_header(nskb); - ip6h = ipv6_hdr(nskb); - ip6h->version = 6; - ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT); - ip6h->nexthdr = IPPROTO_TCP; - ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); - ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); - - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - /* Truncate to length (no data) */ - tcph->doff = sizeof(struct tcphdr)/4; - tcph->source = otcph.dest; - tcph->dest = otcph.source; - - if (otcph.ack) { - needs_ack = 0; - tcph->seq = otcph.ack_seq; - tcph->ack_seq = 0; - } else { - needs_ack = 1; - tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin - + otcplen - (otcph.doff<<2)); - tcph->seq = 0; - } - - /* Reset flags */ - ((u_int8_t *)tcph)[13] = 0; - tcph->rst = 1; - tcph->ack = needs_ack; - tcph->window = 0; - tcph->urg_ptr = 0; - tcph->check = 0; - - /* Adjust TCP checksum */ - tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, - &ipv6_hdr(nskb)->daddr, - sizeof(struct tcphdr), IPPROTO_TCP, - csum_partial(tcph, - sizeof(struct tcphdr), 0)); - - nf_ct_attach(nskb, oldskb); - - ip6_local_out(nskb); -} - -static inline void -send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum) -{ - if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) - skb_in->dev = init_net.loopback_dev; - - icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL); -} static unsigned int -reject_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct ip6t_reject_info *reject = targinfo; + const struct ip6t_reject_info *reject = par->targinfo; + struct net *net = dev_net((par->in != NULL) ? par->in : par->out); - pr_debug("%s: medium point\n", __FUNCTION__); - /* WARNING: This code causes reentry within ip6tables. - This means that the ip6tables jump stack is now crap. We - must return an absolute verdict. --RR */ + pr_debug("%s: medium point\n", __func__); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: - send_unreach(skb, ICMPV6_NOROUTE, hooknum); + nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum); break; case IP6T_ICMP6_ADM_PROHIBITED: - send_unreach(skb, ICMPV6_ADM_PROHIBITED, hooknum); + nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); break; case IP6T_ICMP6_NOT_NEIGHBOUR: - send_unreach(skb, ICMPV6_NOT_NEIGHBOUR, hooknum); + nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); break; case IP6T_ICMP6_ADDR_UNREACH: - send_unreach(skb, ICMPV6_ADDR_UNREACH, hooknum); + nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); break; case IP6T_ICMP6_PORT_UNREACH: - send_unreach(skb, ICMPV6_PORT_UNREACH, hooknum); + nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); break; case IP6T_ICMP6_ECHOREPLY: /* Do nothing */ break; case IP6T_TCP_RESET: - send_reset(skb); + nf_send_reset6(net, skb, par->hooknum); break; default: - if (net_ratelimit()) - printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with); + net_info_ratelimited("case %u not handled yet\n", reject->with); break; } return NF_DROP; } -static bool -reject_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static int reject_tg6_check(const struct xt_tgchk_param *par) { - const struct ip6t_reject_info *rejinfo = targinfo; - const struct ip6t_entry *e = entry; + const struct ip6t_reject_info *rejinfo = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { - printk("ip6t_REJECT: ECHOREPLY is not supported.\n"); - return false; + pr_info("ECHOREPLY is not supported.\n"); + return -EINVAL; } else if (rejinfo->with == IP6T_TCP_RESET) { /* Must specify that it's a TCP packet */ - if (e->ipv6.proto != IPPROTO_TCP - || (e->ipv6.invflags & XT_INV_PROTO)) { - printk("ip6t_REJECT: TCP_RESET illegal for non-tcp\n"); - return false; + if (e->ipv6.proto != IPPROTO_TCP || + (e->ipv6.invflags & XT_INV_PROTO)) { + pr_info("TCP_RESET illegal for non-tcp\n"); + return -EINVAL; } } - return true; + return 0; } static struct xt_target reject_tg6_reg __read_mostly = { .name = "REJECT", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = reject_tg6, .targetsize = sizeof(struct ip6t_reject_info), .table = "filter", diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c new file mode 100644 index 00000000000..a0d17270117 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip6_checksum.h> +#include <net/ip6_route.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +static struct ipv6hdr * +synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = 64; //XXX + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct net *net = nf_ct_net((struct nf_conn *)nfct); + struct dst_entry *dst; + struct flowi6 fl6; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + goto free_nskb; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip6_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= XT_SYNPROXY_OPT_MSS; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + if (!synproxy_parse_options(skb, par->thoff, th, &opts)) + return NF_DROP; + + if (th->syn && !(th->ack || th->fin || th->rst)) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + return NF_DROP; + + } else if (th->ack && !(th->fin || th->rst || th->syn)) { + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + return NF_DROP; + } + + return XT_CONTINUE; +} + +static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_entry *e = par->entryinfo; + + if (!(e->ipv6.flags & IP6T_F_PROTO) || + e->ipv6.proto != IPPROTO_TCP || + e->ipv6.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg6_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), + .target = synproxy_tg6, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg6_check, + .destroy = synproxy_tg6_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg6_init(void) +{ + int err; + + err = nf_register_hooks(ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg6_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg6_exit(void) +{ + xt_unregister_target(&synproxy_tg6_reg); + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +} + +module_init(synproxy_tg6_init); +module_exit(synproxy_tg6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index 429629fd63b..04099ab7d2e 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> @@ -29,35 +29,32 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { bool r; - pr_debug("ah spi_match:%c 0x%x <= 0x%x <= 0x%x", + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } -static bool -ah_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ip_auth_hdr _ah; const struct ip_auth_hdr *ah; - const struct ip6t_ah *ahinfo = matchinfo; - unsigned int ptr; + const struct ip6t_ah *ahinfo = par->matchinfo; + unsigned int ptr = 0; unsigned int hdrlen = 0; int err; - err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL); + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + par->hotdrop = true; return false; } ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); if (ah == NULL) { - *hotdrop = true; + par->hotdrop = true; return false; } @@ -80,37 +77,30 @@ ah_mt6(const struct sk_buff *skb, const struct net_device *in, ahinfo->hdrres, ah->reserved, !(ahinfo->hdrres && ah->reserved)); - return (ah != NULL) - && - spi_match(ahinfo->spis[0], ahinfo->spis[1], - ntohl(ah->spi), - !!(ahinfo->invflags & IP6T_AH_INV_SPI)) - && - (!ahinfo->hdrlen || - (ahinfo->hdrlen == hdrlen) ^ - !!(ahinfo->invflags & IP6T_AH_INV_LEN)) - && - !(ahinfo->hdrres && ah->reserved); + return (ah != NULL) && + spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI)) && + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN)) && + !(ahinfo->hdrres && ah->reserved); } -/* Called when user tries to insert an entry of this type. */ -static bool -ah_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int ah_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_ah *ahinfo = matchinfo; + const struct ip6t_ah *ahinfo = par->matchinfo; if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { - pr_debug("ip6t_ah: unknown flags %X\n", ahinfo->invflags); - return false; + pr_debug("unknown flags %X\n", ahinfo->invflags); + return -EINVAL; } - return true; + return 0; } static struct xt_match ah_mt6_reg __read_mostly = { .name = "ah", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = ah_mt6, .matchsize = sizeof(struct ip6t_ah), .checkentry = ah_mt6_check, diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index 8f331f12b2e..aab0706908c 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -20,18 +20,14 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool -eui64_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par) { unsigned char eui64[8]; - int i = 0; if (!(skb_mac_header(skb) >= skb->head && skb_mac_header(skb) + ETH_HLEN <= skb->data) && - offset != 0) { - *hotdrop = true; + par->fragoff != 0) { + par->hotdrop = true; return false; } @@ -45,12 +41,8 @@ eui64_mt6(const struct sk_buff *skb, const struct net_device *in, eui64[4] = 0xfe; eui64[0] ^= 0x02; - i = 0; - while (ipv6_hdr(skb)->saddr.s6_addr[8 + i] == eui64[i] - && i < 8) - i++; - - if (i == 8) + if (!memcmp(ipv6_hdr(skb)->saddr.s6_addr + 8, eui64, + sizeof(eui64))) return true; } } @@ -60,7 +52,7 @@ eui64_mt6(const struct sk_buff *skb, const struct net_device *in, static struct xt_match eui64_mt6_reg __read_mostly = { .name = "eui64", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = eui64_mt6, .matchsize = sizeof(int), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index e2bbc63dba5..3b5735e56bf 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> @@ -27,7 +27,7 @@ static inline bool id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { bool r; - pr_debug("frag id_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ', + pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, id, max); r = (id >= min && id <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); @@ -35,27 +35,24 @@ id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) } static bool -frag_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct frag_hdr _frag; const struct frag_hdr *fh; - const struct ip6t_frag *fraginfo = matchinfo; - unsigned int ptr; + const struct ip6t_frag *fraginfo = par->matchinfo; + unsigned int ptr = 0; int err; - err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL); + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + par->hotdrop = true; return false; } fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); if (fh == NULL) { - *hotdrop = true; + par->hotdrop = true; return false; } @@ -73,61 +70,52 @@ frag_mt6(const struct sk_buff *skb, const struct net_device *in, pr_debug("res %02X %02X%04X %02X ", fraginfo->flags & IP6T_FRAG_RES, fh->reserved, ntohs(fh->frag_off) & 0x6, - !((fraginfo->flags & IP6T_FRAG_RES) - && (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); + !((fraginfo->flags & IP6T_FRAG_RES) && + (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); pr_debug("first %02X %02X %02X ", fraginfo->flags & IP6T_FRAG_FST, ntohs(fh->frag_off) & ~0x7, - !((fraginfo->flags & IP6T_FRAG_FST) - && (ntohs(fh->frag_off) & ~0x7))); + !((fraginfo->flags & IP6T_FRAG_FST) && + (ntohs(fh->frag_off) & ~0x7))); pr_debug("mf %02X %02X %02X ", fraginfo->flags & IP6T_FRAG_MF, ntohs(fh->frag_off) & IP6_MF, - !((fraginfo->flags & IP6T_FRAG_MF) - && !((ntohs(fh->frag_off) & IP6_MF)))); + !((fraginfo->flags & IP6T_FRAG_MF) && + !((ntohs(fh->frag_off) & IP6_MF)))); pr_debug("last %02X %02X %02X\n", fraginfo->flags & IP6T_FRAG_NMF, ntohs(fh->frag_off) & IP6_MF, - !((fraginfo->flags & IP6T_FRAG_NMF) - && (ntohs(fh->frag_off) & IP6_MF))); - - return (fh != NULL) - && - id_match(fraginfo->ids[0], fraginfo->ids[1], - ntohl(fh->identification), - !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) - && - !((fraginfo->flags & IP6T_FRAG_RES) - && (fh->reserved || (ntohs(fh->frag_off) & 0x6))) - && - !((fraginfo->flags & IP6T_FRAG_FST) - && (ntohs(fh->frag_off) & ~0x7)) - && - !((fraginfo->flags & IP6T_FRAG_MF) - && !(ntohs(fh->frag_off) & IP6_MF)) - && - !((fraginfo->flags & IP6T_FRAG_NMF) - && (ntohs(fh->frag_off) & IP6_MF)); + !((fraginfo->flags & IP6T_FRAG_NMF) && + (ntohs(fh->frag_off) & IP6_MF))); + + return (fh != NULL) && + id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) && + !((fraginfo->flags & IP6T_FRAG_RES) && + (fh->reserved || (ntohs(fh->frag_off) & 0x6))) && + !((fraginfo->flags & IP6T_FRAG_FST) && + (ntohs(fh->frag_off) & ~0x7)) && + !((fraginfo->flags & IP6T_FRAG_MF) && + !(ntohs(fh->frag_off) & IP6_MF)) && + !((fraginfo->flags & IP6T_FRAG_NMF) && + (ntohs(fh->frag_off) & IP6_MF)); } -/* Called when user tries to insert an entry of this type. */ -static bool -frag_mt6_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int frag_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_frag *fraginfo = matchinfo; + const struct ip6t_frag *fraginfo = par->matchinfo; if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { - pr_debug("ip6t_frag: unknown flags %X\n", fraginfo->invflags); - return false; + pr_debug("unknown flags %X\n", fraginfo->invflags); + return -EINVAL; } - return true; + return 0; } static struct xt_match frag_mt6_reg __read_mostly = { .name = "frag", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = frag_mt6, .matchsize = sizeof(struct ip6t_frag), .checkentry = frag_mt6_check, diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index 62e39ace058..01df142bb02 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> @@ -41,17 +41,16 @@ MODULE_ALIAS("ip6t_dst"); * 5 -> RTALERT 2 x x */ +static struct xt_match hbh_mt6_reg[] __read_mostly; + static bool -hbh_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_opt_hdr _optsh; const struct ipv6_opt_hdr *oh; - const struct ip6t_opts *optinfo = matchinfo; + const struct ip6t_opts *optinfo = par->matchinfo; unsigned int temp; - unsigned int ptr; + unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; u8 _opttype; @@ -61,16 +60,18 @@ hbh_mt6(const struct sk_buff *skb, const struct net_device *in, unsigned int optlen; int err; - err = ipv6_find_hdr(skb, &ptr, match->data, NULL); + err = ipv6_find_hdr(skb, &ptr, + (par->match == &hbh_mt6_reg[0]) ? + NEXTHDR_HOP : NEXTHDR_DEST, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + par->hotdrop = true; return false; } oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); if (oh == NULL) { - *hotdrop = true; + par->hotdrop = true; return false; } @@ -97,8 +98,6 @@ hbh_mt6(const struct sk_buff *skb, const struct net_device *in, hdrlen -= 2; if (!(optinfo->flags & IP6T_OPTS_OPTS)) { return ret; - } else if (optinfo->flags & IP6T_OPTS_NSTRICT) { - pr_debug("Not strict - not implemented"); } else { pr_debug("Strict "); pr_debug("#%d ", optinfo->optsnr); @@ -146,11 +145,11 @@ hbh_mt6(const struct sk_buff *skb, const struct net_device *in, } /* Step to the next */ - pr_debug("len%04X \n", optlen); + pr_debug("len%04X\n", optlen); if ((ptr > skb->len - optlen || hdrlen < optlen) && temp < optinfo->optsnr - 1) { - pr_debug("new pointer is too large! \n"); + pr_debug("new pointer is too large!\n"); break; } ptr += optlen; @@ -165,39 +164,40 @@ hbh_mt6(const struct sk_buff *skb, const struct net_device *in, return false; } -/* Called when user tries to insert an entry of this type. */ -static bool -hbh_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int hbh_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_opts *optsinfo = matchinfo; + const struct ip6t_opts *optsinfo = par->matchinfo; if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { - pr_debug("ip6t_opts: unknown flags %X\n", optsinfo->invflags); - return false; + pr_debug("unknown flags %X\n", optsinfo->invflags); + return -EINVAL; } - return true; + + if (optsinfo->flags & IP6T_OPTS_NSTRICT) { + pr_debug("Not strict - not implemented"); + return -EINVAL; + } + + return 0; } static struct xt_match hbh_mt6_reg[] __read_mostly = { { + /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */ .name = "hbh", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, - .data = NEXTHDR_HOP, }, { .name = "dst", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, - .data = NEXTHDR_DEST, }, }; diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c deleted file mode 100644 index 34567167384..00000000000 --- a/net/ipv6/netfilter/ip6t_hl.c +++ /dev/null @@ -1,71 +0,0 @@ -/* Hop Limit matching module */ - -/* (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv> - * Based on HW's ttl module - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/ipv6.h> -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter_ipv6/ip6t_hl.h> -#include <linux/netfilter/x_tables.h> - -MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); -MODULE_DESCRIPTION("Xtables: IPv6 Hop Limit field match"); -MODULE_LICENSE("GPL"); - -static bool -hl_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) -{ - const struct ip6t_hl_info *info = matchinfo; - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - - switch (info->mode) { - case IP6T_HL_EQ: - return ip6h->hop_limit == info->hop_limit; - break; - case IP6T_HL_NE: - return ip6h->hop_limit != info->hop_limit; - break; - case IP6T_HL_LT: - return ip6h->hop_limit < info->hop_limit; - break; - case IP6T_HL_GT: - return ip6h->hop_limit > info->hop_limit; - break; - default: - printk(KERN_WARNING "ip6t_hl: unknown mode %d\n", - info->mode); - return false; - } - - return false; -} - -static struct xt_match hl_mt6_reg __read_mostly = { - .name = "hl", - .family = AF_INET6, - .match = hl_mt6, - .matchsize = sizeof(struct ip6t_hl_info), - .me = THIS_MODULE, -}; - -static int __init hl_mt6_init(void) -{ - return xt_register_match(&hl_mt6_reg); -} - -static void __exit hl_mt6_exit(void) -{ - xt_unregister_match(&hl_mt6_reg); -} - -module_init(hl_mt6_init); -module_exit(hl_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 3a940171f82..54bd9790603 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -27,12 +27,9 @@ MODULE_DESCRIPTION("Xtables: IPv6 header types match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool -ipv6header_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) { - const struct ip6t_ipv6header_info *info = matchinfo; + const struct ip6t_ipv6header_info *info = par->matchinfo; unsigned int temp; int len; u8 nexthdr; @@ -49,17 +46,18 @@ ipv6header_mt6(const struct sk_buff *skb, const struct net_device *in, temp = 0; while (ip6t_ext_hdr(nexthdr)) { - struct ipv6_opt_hdr _hdr, *hp; + const struct ipv6_opt_hdr *hp; + struct ipv6_opt_hdr _hdr; int hdrlen; - /* Is there enough space for the next ext header? */ - if (len < (int)sizeof(struct ipv6_opt_hdr)) - return false; /* No more exthdr -> evaluate */ if (nexthdr == NEXTHDR_NONE) { temp |= MASK_NONE; break; } + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return false; /* ESP -> evaluate */ if (nexthdr == NEXTHDR_ESP) { temp |= MASK_ESP; @@ -120,24 +118,21 @@ ipv6header_mt6(const struct sk_buff *skb, const struct net_device *in, } } -static bool -ipv6header_mt6_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int ipv6header_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_ipv6header_info *info = matchinfo; + const struct ip6t_ipv6header_info *info = par->matchinfo; /* invflags is 0 or 0xff in hard mode */ if ((!info->modeflag) && info->invflags != 0x00 && info->invflags != 0xFF) - return false; + return -EINVAL; - return true; + return 0; } static struct xt_match ipv6header_mt6_reg __read_mostly = { .name = "ipv6header", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = ipv6header_mt6, .matchsize = sizeof(struct ip6t_ipv6header_info), .checkentry = ipv6header_mt6_check, diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c index e06678d07ec..0c90c66b199 100644 --- a/net/ipv6/netfilter/ip6t_mh.c +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -11,6 +11,7 @@ * Based on net/netfilter/xt_tcpudp.c * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> @@ -24,12 +25,6 @@ MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match"); MODULE_LICENSE("GPL"); -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) -#endif - /* Returns 1 if the type is matched by the range, 0 otherwise */ static inline bool type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) @@ -37,32 +32,29 @@ type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) return (type >= min && type <= max) ^ invert; } -static bool -mh_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ip6_mh _mh; const struct ip6_mh *mh; - const struct ip6t_mh *mhinfo = matchinfo; + const struct ip6t_mh *mhinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - mh = skb_header_pointer(skb, protoff, sizeof(_mh), &_mh); + mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh); if (mh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ - duprintf("Dropping evil MH tinygram.\n"); - *hotdrop = true; + pr_debug("Dropping evil MH tinygram.\n"); + par->hotdrop = true; return false; } if (mh->ip6mh_proto != IPPROTO_NONE) { - duprintf("Dropping invalid MH Payload Proto: %u\n", + pr_debug("Dropping invalid MH Payload Proto: %u\n", mh->ip6mh_proto); - *hotdrop = true; + par->hotdrop = true; return false; } @@ -70,21 +62,17 @@ mh_mt6(const struct sk_buff *skb, const struct net_device *in, !!(mhinfo->invflags & IP6T_MH_INV_TYPE)); } -/* Called when user tries to insert an entry of this type. */ -static bool -mh_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int mh_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_mh *mhinfo = matchinfo; + const struct ip6t_mh *mhinfo = par->matchinfo; /* Must specify no unknown invflags */ - return !(mhinfo->invflags & ~IP6T_MH_INV_MASK); + return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0; } static struct xt_match mh_mt6_reg __read_mostly = { .name = "mh", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = mh_mt6_check, .match = mh_mt6, .matchsize = sizeof(struct ip6t_mh), diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c new file mode 100644 index 00000000000..790e0c6b19e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2011 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/route.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> + +#include <linux/netfilter/xt_rpfilter.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("Xtables: IPv6 reverse path filter match"); + +static bool rpfilter_addr_unicast(const struct in6_addr *addr) +{ + int addr_type = ipv6_addr_type(addr); + return addr_type & IPV6_ADDR_UNICAST; +} + +static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, + const struct net_device *dev, u8 flags) +{ + struct rt6_info *rt; + struct ipv6hdr *iph = ipv6_hdr(skb); + bool ret = false; + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, + .flowi6_proto = iph->nexthdr, + .daddr = iph->saddr, + }; + int lookup_flags; + + if (rpfilter_addr_unicast(&iph->daddr)) { + memcpy(&fl6.saddr, &iph->daddr, sizeof(struct in6_addr)); + lookup_flags = RT6_LOOKUP_F_HAS_SADDR; + } else { + lookup_flags = 0; + } + + fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; + if ((flags & XT_RPFILTER_LOOSE) == 0) { + fl6.flowi6_oif = dev->ifindex; + lookup_flags |= RT6_LOOKUP_F_IFACE; + } + + rt = (void *) ip6_route_lookup(dev_net(dev), &fl6, lookup_flags); + if (rt->dst.error) + goto out; + + if (rt->rt6i_flags & (RTF_REJECT|RTF_ANYCAST)) + goto out; + + if (rt->rt6i_flags & RTF_LOCAL) { + ret = flags & XT_RPFILTER_ACCEPT_LOCAL; + goto out; + } + + if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE)) + ret = true; + out: + ip6_rt_put(rt); + return ret; +} + +static bool rpfilter_is_local(const struct sk_buff *skb) +{ + const struct rt6_info *rt = (const void *) skb_dst(skb); + return rt && (rt->rt6i_flags & RTF_LOCAL); +} + +static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rpfilter_info *info = par->matchinfo; + int saddrtype; + struct ipv6hdr *iph; + bool invert = info->flags & XT_RPFILTER_INVERT; + + if (rpfilter_is_local(skb)) + return true ^ invert; + + iph = ipv6_hdr(skb); + saddrtype = ipv6_addr_type(&iph->saddr); + if (unlikely(saddrtype == IPV6_ADDR_ANY)) + return true ^ invert; /* not routable: forward path will drop it */ + + return rpfilter_lookup_reverse6(skb, par->in, info->flags) ^ invert; +} + +static int rpfilter_check(const struct xt_mtchk_param *par) +{ + const struct xt_rpfilter_info *info = par->matchinfo; + unsigned int options = ~XT_RPFILTER_OPTION_MASK; + + if (info->flags & options) { + pr_info("unknown options encountered"); + return -EINVAL; + } + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "raw") != 0) { + pr_info("match only valid in the \'raw\' " + "or \'mangle\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } + + return 0; +} + +static struct xt_match rpfilter_mt_reg __read_mostly = { + .name = "rpfilter", + .family = NFPROTO_IPV6, + .checkentry = rpfilter_check, + .match = rpfilter_mt, + .matchsize = sizeof(struct xt_rpfilter_info), + .hooks = (1 << NF_INET_PRE_ROUTING), + .me = THIS_MODULE +}; + +static int __init rpfilter_mt_init(void) +{ + return xt_register_match(&rpfilter_mt_reg); +} + +static void __exit rpfilter_mt_exit(void) +{ + xt_unregister_match(&rpfilter_mt_reg); +} + +module_init(rpfilter_mt_init); +module_exit(rpfilter_mt_exit); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 12a9efe9886..2c99b94eeca 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> @@ -29,39 +29,36 @@ static inline bool segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { bool r; - pr_debug("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x", + pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, id, max); r = (id >= min && id <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } -static bool -rt_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_rt_hdr _route; const struct ipv6_rt_hdr *rh; - const struct ip6t_rt *rtinfo = matchinfo; + const struct ip6t_rt *rtinfo = par->matchinfo; unsigned int temp; - unsigned int ptr; + unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; struct in6_addr _addr; const struct in6_addr *ap; int err; - err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL); + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *hotdrop = true; + par->hotdrop = true; return false; } rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); if (rh == NULL) { - *hotdrop = true; + par->hotdrop = true; return false; } @@ -95,22 +92,20 @@ rt_mt6(const struct sk_buff *skb, const struct net_device *in, !((rtinfo->flags & IP6T_RT_RES) && (((const struct rt0_hdr *)rh)->reserved))); - ret = (rh != NULL) - && + ret = (rh != NULL) && (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, - !!(rtinfo->invflags & IP6T_RT_INV_SGS))) - && + !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && (!(rtinfo->flags & IP6T_RT_LEN) || ((rtinfo->hdrlen == hdrlen) ^ - !!(rtinfo->invflags & IP6T_RT_INV_LEN))) - && + !!(rtinfo->invflags & IP6T_RT_INV_LEN))) && (!(rtinfo->flags & IP6T_RT_TYP) || ((rtinfo->rt_type == rh->type) ^ !!(rtinfo->invflags & IP6T_RT_INV_TYP))); if (ret && (rtinfo->flags & IP6T_RT_RES)) { - u_int32_t *rp, _reserved; + const u_int32_t *rp; + u_int32_t _reserved; rp = skb_header_pointer(skb, ptr + offsetof(struct rt0_hdr, reserved), @@ -188,32 +183,28 @@ rt_mt6(const struct sk_buff *skb, const struct net_device *in, return false; } -/* Called when user tries to insert an entry of this type. */ -static bool -rt_mt6_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static int rt_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_rt *rtinfo = matchinfo; + const struct ip6t_rt *rtinfo = par->matchinfo; if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { - pr_debug("ip6t_rt: unknown flags %X\n", rtinfo->invflags); - return false; + pr_debug("unknown flags %X\n", rtinfo->invflags); + return -EINVAL; } if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) && (!(rtinfo->flags & IP6T_RT_TYP) || (rtinfo->rt_type != 0) || (rtinfo->invflags & IP6T_RT_INV_TYP))) { pr_debug("`--rt-type 0' required before `--rt-0-*'"); - return false; + return -EINVAL; } - return true; + return 0; } static struct xt_match rt_mt6_reg __read_mostly = { .name = "rt", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = rt_mt6, .matchsize = sizeof(struct ip6t_rt), .checkentry = rt_mt6_check, diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 2d9cd095a72..ca7f6c12808 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -21,116 +22,52 @@ MODULE_DESCRIPTION("ip6tables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static struct -{ - struct ip6t_replace repl; - struct ip6t_standard entries[3]; - struct ip6t_error term; -} initial_table __net_initdata = { - .repl = { - .name = "filter", - .valid_hooks = FILTER_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), - .hook_entry = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ip6t_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 - }, - .underflow = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ip6t_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 - }, - }, - .entries = { - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IP6T_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table packet_filter = { +static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, .me = THIS_MODULE, - .af = AF_INET6, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_FILTER, }; /* The work comes in here from netfilter.c. */ static unsigned int -ip6t_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_filter); -} + const struct net *net = dev_net((in != NULL) ? in : out); -static unsigned int -ip6t_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ -#if 0 - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) - || ip_hdrlen(skb) < sizeof(struct iphdr)) { - if (net_ratelimit()) - printk("ip6t_hook: happy cracking.\n"); - return NF_ACCEPT; - } -#endif - - return ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_filter); + return ip6t_do_table(skb, ops->hooknum, in, out, + net->ipv6.ip6table_filter); } -static struct nf_hook_ops ip6t_ops[] __read_mostly = { - { - .hook = ip6t_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_FILTER, - }, - { - .hook = ip6t_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP6_PRI_FILTER, - }, - { - .hook = ip6t_local_out_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_FILTER, - }, -}; +static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static int forward = NF_ACCEPT; +static bool forward = true; module_param(forward, bool, 0000); static int __net_init ip6table_filter_net_init(struct net *net) { - /* Register table */ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + /* Entry 1 is the FORWARD hook */ + ((struct ip6t_standard *)repl->entries)[1].target.verdict = + forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + net->ipv6.ip6table_filter = - ip6t_register_table(net, &packet_filter, &initial_table.repl); - if (IS_ERR(net->ipv6.ip6table_filter)) - return PTR_ERR(net->ipv6.ip6table_filter); - return 0; + ip6t_register_table(net, &packet_filter, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter); } static void __net_exit ip6table_filter_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_filter); + ip6t_unregister_table(net, net->ipv6.ip6table_filter); } static struct pernet_operations ip6table_filter_net_ops = { @@ -142,22 +79,16 @@ static int __init ip6table_filter_init(void) { int ret; - if (forward < 0 || forward > NF_MAX_VERDICT) { - printk("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; - ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) return ret; /* Register hooks */ - ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - if (ret < 0) + filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook); + if (IS_ERR(filter_ops)) { + ret = PTR_ERR(filter_ops); goto cleanup_table; + } return ret; @@ -168,7 +99,7 @@ static int __init ip6table_filter_init(void) static void __exit ip6table_filter_fini(void) { - nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); + xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&ip6table_filter_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 035343a90ff..307bbb782d1 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -10,6 +10,8 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> +#include <net/ipv6.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -21,80 +23,27 @@ MODULE_DESCRIPTION("ip6tables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -static struct -{ - struct ip6t_replace repl; - struct ip6t_standard entries[5]; - struct ip6t_error term; -} initial_table __net_initdata = { - .repl = { - .name = "mangle", - .valid_hooks = MANGLE_VALID_HOOKS, - .num_entries = 6, - .size = sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ip6t_standard), - [NF_INET_FORWARD] = sizeof(struct ip6t_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ip6t_standard), - [NF_INET_FORWARD] = sizeof(struct ip6t_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, - }, - }, - .entries = { - IP6T_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */ - }, - .term = IP6T_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table packet_mangler = { +static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, .me = THIS_MODULE, - .af = AF_INET6, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_MANGLE, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ip6t_route_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_mangle); -} - static unsigned int -ip6t_local_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) { - unsigned int ret; struct in6_addr saddr, daddr; u_int8_t hop_limit; u_int32_t flowlabel, mark; - + int err; #if 0 /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) - || ip_hdrlen(skb) < sizeof(struct iphdr)) { - if (net_ratelimit()) - printk("ip6t_hook: happy cracking.\n"); + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) { + net_warn_ratelimited("ip6t_hook: happy cracking\n"); return NF_ACCEPT; } #endif @@ -108,69 +57,56 @@ ip6t_local_hook(unsigned int hook, /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_mangle); - - if (ret != NF_DROP && ret != NF_STOLEN - && (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) - || memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) - || skb->mark != mark - || ipv6_hdr(skb)->hop_limit != hop_limit)) - return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, + dev_net(out)->ipv6.ip6table_mangle); + + if (ret != NF_DROP && ret != NF_STOLEN && + (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || + !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) || + skb->mark != mark || + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } -static struct nf_hook_ops ip6t_ops[] __read_mostly = { - { - .hook = ip6t_route_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_local_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_route_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_local_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_route_hook, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_MANGLE, - }, -}; +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (ops->hooknum == NF_INET_LOCAL_OUT) + return ip6t_mangle_out(skb, out); + if (ops->hooknum == NF_INET_POST_ROUTING) + return ip6t_do_table(skb, ops->hooknum, in, out, + dev_net(out)->ipv6.ip6table_mangle); + /* INPUT/FORWARD */ + return ip6t_do_table(skb, ops->hooknum, in, out, + dev_net(in)->ipv6.ip6table_mangle); +} +static struct nf_hook_ops *mangle_ops __read_mostly; static int __net_init ip6table_mangle_net_init(struct net *net) { - /* Register table */ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_mangler); + if (repl == NULL) + return -ENOMEM; net->ipv6.ip6table_mangle = - ip6t_register_table(net, &packet_mangler, &initial_table.repl); - if (IS_ERR(net->ipv6.ip6table_mangle)) - return PTR_ERR(net->ipv6.ip6table_mangle); - return 0; + ip6t_register_table(net, &packet_mangler, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_mangle); + ip6t_unregister_table(net, net->ipv6.ip6table_mangle); } static struct pernet_operations ip6table_mangle_net_ops = { @@ -187,9 +123,11 @@ static int __init ip6table_mangle_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - if (ret < 0) + mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); goto cleanup_table; + } return ret; @@ -200,7 +138,7 @@ static int __init ip6table_mangle_init(void) static void __exit ip6table_mangle_fini(void) { - nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); + xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&ip6table_mangle_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c new file mode 100644 index 00000000000..387d8b8fc18 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 NAT code. Development of IPv6 NAT + * funded by Astaro. + */ + +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> + +static const struct xt_table nf_nat_ipv6_table = { + .name = "nat", + .valid_hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + .af = NFPROTO_IPV6, +}; + +static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + */ + struct nf_nat_range range; + + range.flags = 0; + pr_debug("Allocating NULL binding for %p (%pI6)\n", ct, + HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6 : + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6); + + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); +} + +static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + unsigned int ret; + + ret = ip6t_do_table(skb, hooknum, in, out, net->ipv6.ip6table_nat); + if (ret == NF_ACCEPT) { + if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) + ret = alloc_null_binding(ct, hooknum); + } + return ret; +} + +static unsigned int +nf_nat_ipv6_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + __be16 frag_off; + int hdrlen; + u8 nexthdr; + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + nexthdr = ipv6_hdr(skb)->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, + ops->hooknum, + hdrlen)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} + +static unsigned int +nf_nat_ipv6_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + struct in6_addr daddr = ipv6_hdr(skb)->daddr; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) + skb_dst_drop(skb); + + return ret; +} + +static unsigned int +nf_nat_ipv6_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} + +static unsigned int +nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, + &ct->tuplehash[!dir].tuple.src.u3)) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} + +static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv6_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv6_out, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC, + }, + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv6_local_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv6_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC, + }, +}; + +static int __net_init ip6table_nat_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat); +} + +static void __net_exit ip6table_nat_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_nat); +} + +static struct pernet_operations ip6table_nat_net_ops = { + .init = ip6table_nat_net_init, + .exit = ip6table_nat_net_exit, +}; + +static int __init ip6table_nat_init(void) +{ + int err; + + err = register_pernet_subsys(&ip6table_nat_net_ops); + if (err < 0) + goto err1; + + err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); + if (err < 0) + goto err2; + return 0; + +err2: + unregister_pernet_subsys(&ip6table_nat_net_ops); +err1: + return err; +} + +static void __exit ip6table_nat_exit(void) +{ + nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); + unregister_pernet_subsys(&ip6table_nat_net_ops); +} + +module_init(ip6table_nat_init); +module_exit(ip6table_nat_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 5cd84203abf..5274740acec 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -5,85 +5,48 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static struct -{ - struct ip6t_replace repl; - struct ip6t_standard entries[2]; - struct ip6t_error term; -} initial_table __net_initdata = { - .repl = { - .name = "raw", - .valid_hooks = RAW_VALID_HOOKS, - .num_entries = 3, - .size = sizeof(struct ip6t_standard) * 2 + sizeof(struct ip6t_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) - }, - }, - .entries = { - IP6T_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IP6T_ERROR_INIT, /* ERROR */ -}; - -static struct xt_table packet_raw = { +static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, - .lock = RW_LOCK_UNLOCKED, .me = THIS_MODULE, - .af = AF_INET6, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_RAW, }; /* The work comes in here from netfilter.c. */ static unsigned int -ip6t_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(skb, hook, in, out, init_net.ipv6.ip6table_raw); + const struct net *net = dev_net((in != NULL) ? in : out); + + return ip6t_do_table(skb, ops->hooknum, in, out, + net->ipv6.ip6table_raw); } -static struct nf_hook_ops ip6t_ops[] __read_mostly = { - { - .hook = ip6t_hook, - .pf = PF_INET6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_FIRST, - .owner = THIS_MODULE, - }, - { - .hook = ip6t_hook, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_FIRST, - .owner = THIS_MODULE, - }, -}; +static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init ip6table_raw_net_init(struct net *net) { - /* Register table */ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_raw); + if (repl == NULL) + return -ENOMEM; net->ipv6.ip6table_raw = - ip6t_register_table(net, &packet_raw, &initial_table.repl); - if (IS_ERR(net->ipv6.ip6table_raw)) - return PTR_ERR(net->ipv6.ip6table_raw); - return 0; + ip6t_register_table(net, &packet_raw, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw); } static void __net_exit ip6table_raw_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_raw); + ip6t_unregister_table(net, net->ipv6.ip6table_raw); } static struct pernet_operations ip6table_raw_net_ops = { @@ -100,9 +63,11 @@ static int __init ip6table_raw_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - if (ret < 0) + rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook); + if (IS_ERR(rawtable_ops)) { + ret = PTR_ERR(rawtable_ops); goto cleanup_table; + } return ret; @@ -113,7 +78,7 @@ static int __init ip6table_raw_init(void) static void __exit ip6table_raw_fini(void) { - nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); + xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&ip6table_raw_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c new file mode 100644 index 00000000000..ab3b0219ecf --- /dev/null +++ b/net/ipv6/netfilter/ip6table_security.c @@ -0,0 +1,103 @@ +/* + * "security" table for IPv6 + * + * This is for use by Mandatory Access Control (MAC) security models, + * which need to be able to manage security policy in separate context + * to DAC. + * + * Based on iptable_mangle.c + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org> + * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>"); +MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); + +#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) + +static const struct xt_table security_table = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_SECURITY, +}; + +static unsigned int +ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? in : out); + + return ip6t_do_table(skb, ops->hooknum, in, out, + net->ipv6.ip6table_security); +} + +static struct nf_hook_ops *sectbl_ops __read_mostly; + +static int __net_init ip6table_security_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&security_table); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_security = + ip6t_register_table(net, &security_table, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security); +} + +static void __net_exit ip6table_security_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_security); +} + +static struct pernet_operations ip6table_security_net_ops = { + .init = ip6table_security_net_init, + .exit = ip6table_security_net_exit, +}; + +static int __init ip6table_security_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_security_net_ops); + if (ret < 0) + return ret; + + sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook); + if (IS_ERR(sectbl_ops)) { + ret = PTR_ERR(sectbl_ops); + goto cleanup_table; + } + + return ret; + +cleanup_table: + unregister_pernet_subsys(&ip6table_security_net_ops); + return ret; +} + +static void __exit ip6table_security_fini(void) +{ + xt_hook_unlink(&security_table, sectbl_ops); + unregister_pernet_subsys(&ip6table_security_net_ops); +} + +module_init(ip6table_security_init); +module_exit(ip6table_security_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 3717bdf34f6..4cbc6b290dd 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -16,19 +16,26 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> -#include <linux/sysctl.h> #include <net/ipv6.h> #include <net/inet_frag.h> +#include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> - -static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/netfilter/nf_log.h> + +static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) { const u_int32_t *ap; u_int32_t _addrs[8]; @@ -36,209 +43,142 @@ static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), sizeof(_addrs), _addrs); if (ap == NULL) - return 0; + return false; memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); - return 1; + return true; } -static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); - return 1; + return true; } static int ipv6_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " ", - NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), - NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); -} - -/* - * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c - * - * This function parses (probably truncated) exthdr set "hdr" - * of length "len". "nexthdrp" initially points to some place, - * where type of the first header can be found. - * - * It skips all well-known exthdrs, and returns pointer to the start - * of unparsable area i.e. the first header with unknown type. - * if success, *nexthdr is updated by type/protocol of this header. - * - * NOTES: - it may return pointer pointing beyond end of packet, - * if the last recognized header is truncated in the middle. - * - if packet is truncated, so that all parsed headers are skipped, - * it returns -1. - * - if packet is fragmented, return pointer of the fragment header. - * - ESP is unparsable for now and considered like - * normal payload protocol. - * - Note also special handling of AUTH header. Thanks to IPsec wizards. - */ - -static int nf_ct_ipv6_skip_exthdr(const struct sk_buff *skb, int start, - u8 *nexthdrp, int len) -{ - u8 nexthdr = *nexthdrp; - - while (ipv6_ext_hdr(nexthdr)) { - struct ipv6_opt_hdr hdr; - int hdrlen; - - if (len < (int)sizeof(struct ipv6_opt_hdr)) - return -1; - if (nexthdr == NEXTHDR_NONE) - break; - if (nexthdr == NEXTHDR_FRAGMENT) - break; - if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) - BUG(); - if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hdr.hdrlen+2)<<2; - else - hdrlen = ipv6_optlen(&hdr); - - nexthdr = hdr.nexthdr; - len -= hdrlen; - start += hdrlen; - } - - *nexthdrp = nexthdr; - return start; + return seq_printf(s, "src=%pI6 dst=%pI6 ", + tuple->src.u3.ip6, tuple->dst.u3.ip6); } static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { unsigned int extoff = nhoff + sizeof(struct ipv6hdr); - unsigned char pnum; + __be16 frag_off; int protoff; + u8 nexthdr; if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), - &pnum, sizeof(pnum)) != 0) { + &nexthdr, sizeof(nexthdr)) != 0) { pr_debug("ip6_conntrack_core: can't get nexthdr\n"); return -NF_ACCEPT; } - protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, skb->len - extoff); + protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); /* - * (protoff == skb->len) mean that the packet doesn't have no data - * except of IPv6 & ext headers. but it's tracked anyway. - YK + * (protoff == skb->len) means the packet has not data, just + * IPv6 and possibly extensions headers, but it is tracked anyway */ - if ((protoff < 0) || (protoff > skb->len)) { + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("ip6_conntrack_core: can't find proto in pkt\n"); return -NF_ACCEPT; } *dataoff = protoff; - *protonum = pnum; + *protonum = nexthdr; return NF_ACCEPT; } -static unsigned int ipv6_confirm(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ipv6_helper(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { struct nf_conn *ct; const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; enum ip_conntrack_info ctinfo; - unsigned int ret, protoff; - unsigned int extoff = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - + __be16 frag_off; + int protoff; + u8 nexthdr; /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) - goto out; + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; help = nfct_help(ct); if (!help) - goto out; + return NF_ACCEPT; /* rcu_read_lock()ed by nf_hook_slow */ helper = rcu_dereference(help->helper); if (!helper) - goto out; + return NF_ACCEPT; - protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, - skb->len - extoff); - if (protoff > skb->len || pnum == NEXTHDR_FRAGMENT) { + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } - ret = helper->help(skb, protoff, ct, ctinfo); - if (ret != NF_ACCEPT) - return ret; -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); + return helper->help(skb, protoff, ct, ctinfo); } -static unsigned int ipv6_defrag(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ipv6_confirm(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct sk_buff *reasm; - - /* Previously seen (loopback)? */ - if (skb->nfct) - return NF_ACCEPT; - - reasm = nf_ct_frag6_gather(skb); - - /* queued */ - if (reasm == NULL) - return NF_STOLEN; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + int protoff; + __be16 frag_off; - /* error occured or not fragmented */ - if (reasm == skb) - return NF_ACCEPT; + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; - nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, - (struct net_device *)out, okfn); + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + goto out; + } - return NF_STOLEN; + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); } -static unsigned int ipv6_conntrack_in(unsigned int hooknum, +static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct sk_buff *reasm = skb->nfct_reasm; - - /* This packet is fragmented and has reassembled packet. */ - if (reasm) { - /* Reassembled packet isn't parsed yet ? */ - if (!reasm->nfct) { - unsigned int ret; - - ret = nf_conntrack_in(PF_INET6, hooknum, reasm); - if (ret != NF_ACCEPT) - return ret; - } - nf_conntrack_get(reasm->nfct); - skb->nfct = reasm->nfct; - skb->nfctinfo = reasm->nfctinfo; - return NF_ACCEPT; - } - - return nf_conntrack_in(PF_INET6, hooknum, skb); + return nf_conntrack_in(dev_net(in), PF_INET6, ops->hooknum, skb); } -static unsigned int ipv6_conntrack_local(unsigned int hooknum, +static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -246,59 +186,103 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum, { /* root is playing with raw sockets. */ if (skb->len < sizeof(struct ipv6hdr)) { - if (net_ratelimit()) - printk("ipv6_conntrack_local: packet too short\n"); + net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); return NF_ACCEPT; } - return ipv6_conntrack_in(hooknum, skb, in, out, okfn); + return nf_conntrack_in(dev_net(out), PF_INET6, ops->hooknum, skb); } static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { { - .hook = ipv6_defrag, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, - }, - { .hook = ipv6_conntrack_in, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK, }, { .hook = ipv6_conntrack_local, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_defrag, + .hook = ipv6_helper, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, }, { .hook = ipv6_confirm, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { + .hook = ipv6_helper, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { .hook = ipv6_confirm, .owner = THIS_MODULE, - .pf = PF_INET6, + .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST-1, }, }; -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +static int +ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *inet6 = inet6_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct sockaddr_in6 sin6; + struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; + struct nf_conn *ct; + + tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.in6 = sk->sk_v6_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.dst.protonum = sk->sk_protocol; + + if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) + return -ENOPROTOOPT; + + if (*len < 0 || (unsigned int) *len < sizeof(sin6)) + return -EINVAL; + + h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + if (!h) { + pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", + &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + + sin6.sin6_family = AF_INET6; + sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + sin6.sin6_flowinfo = inet6->flow_label & IPV6_FLOWINFO_MASK; + memcpy(&sin6.sin6_addr, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, + sizeof(sin6.sin6_addr)); + + nf_ct_put(ct); + sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, + sk->sk_bound_dev_if); + return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> @@ -306,10 +290,11 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { static int ipv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NLA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4, - &tuple->src.u3.ip6); - NLA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, - &tuple->dst.u3.ip6); + if (nla_put(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4, + &tuple->src.u3.ip6) || + nla_put(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, + &tuple->dst.u3.ip6)) + goto nla_put_failure; return 0; nla_put_failure: @@ -334,6 +319,11 @@ static int ipv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } + +static int ipv6_nlattr_tuple_size(void) +{ + return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1); +} #endif struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { @@ -343,15 +333,12 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .invert_tuple = ipv6_invert_tuple, .print_tuple = ipv6_print_tuple, .get_l4proto = ipv6_get_l4proto, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv6_tuple_to_nlattr, + .nlattr_tuple_size = ipv6_nlattr_tuple_size, .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_path = nf_net_netfilter_sysctl_path, - .ctl_table = nf_ct_ipv6_sysctl_table, -#endif .me = THIS_MODULE, }; @@ -359,72 +346,137 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); +static struct nf_sockopt_ops so_getorigdst6 = { + .pf = NFPROTO_IPV6, + .get_optmin = IP6T_SO_ORIGINAL_DST, + .get_optmax = IP6T_SO_ORIGINAL_DST + 1, + .get = ipv6_getorigdst, + .owner = THIS_MODULE, +}; + +static int ipv6_net_init(struct net *net) +{ + int ret = 0; + + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp6); + if (ret < 0) { + pr_err("nf_conntrack_tcp6: pernet registration failed\n"); + goto out; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp6); + if (ret < 0) { + pr_err("nf_conntrack_udp6: pernet registration failed\n"); + goto cleanup_tcp6; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmpv6); + if (ret < 0) { + pr_err("nf_conntrack_icmp6: pernet registration failed\n"); + goto cleanup_udp6; + } + ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); + goto cleanup_icmpv6; + } + return 0; + cleanup_icmpv6: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6); + cleanup_udp6: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6); + cleanup_tcp6: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6); + out: + return ret; +} + +static void ipv6_net_exit(struct net *net) +{ + nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6); +} + +static struct pernet_operations ipv6_net_ops = { + .init = ipv6_net_init, + .exit = ipv6_net_exit, +}; + static int __init nf_conntrack_l3proto_ipv6_init(void) { int ret = 0; need_conntrack(); + nf_defrag_ipv6_enable(); - ret = nf_ct_frag6_init(); + ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't initialize frag6.\n"); + pr_err("Unable to register netfilter socket option\n"); return ret; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); + + ret = register_pernet_subsys(&ipv6_net_ops); + if (ret < 0) + goto cleanup_sockopt; + + ret = nf_register_hooks(ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register tcp.\n"); - goto cleanup_frag6; + pr_err("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); + goto cleanup_pernet; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register udp.\n"); - goto cleanup_tcp; + pr_err("nf_conntrack_ipv6: can't register tcp6 proto.\n"); + goto cleanup_hooks; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register icmpv6.\n"); - goto cleanup_udp; + pr_err("nf_conntrack_ipv6: can't register udp6 proto.\n"); + goto cleanup_tcp6; } - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmpv6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register ipv6\n"); - goto cleanup_icmpv6; + pr_err("nf_conntrack_ipv6: can't register icmpv6 proto.\n"); + goto cleanup_udp6; } - ret = nf_register_hooks(ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); + ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register pre-routing defrag " - "hook.\n"); - goto cleanup_ipv6; + pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n"); + goto cleanup_icmpv6; } return ret; - cleanup_ipv6: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); cleanup_icmpv6: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - cleanup_frag6: - nf_ct_frag6_cleanup(); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); + cleanup_udp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6); + cleanup_tcp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + cleanup_hooks: + nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); + cleanup_pernet: + unregister_pernet_subsys(&ipv6_net_ops); + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst6); return ret; } static void __exit nf_conntrack_l3proto_ipv6_fini(void) { synchronize_net(); + nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - nf_ct_frag6_cleanup(); + unregister_pernet_subsys(&ipv6_net_ops); + nf_unregister_sockopt(&so_getorigdst6); } module_init(nf_conntrack_l3proto_ipv6_init); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 0897d0f4c4a..b3807c5cb88 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -23,47 +23,64 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> -static unsigned long nf_ct_icmpv6_timeout __read_mostly = 30*HZ; +static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ; -static int icmpv6_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6; +} + +static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) { const struct icmp6hdr *hp; struct icmp6hdr _hdr; hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hp == NULL) - return 0; + return false; tuple->dst.u.icmp.type = hp->icmp6_type; tuple->src.u.icmp.id = hp->icmp6_identifier; tuple->dst.u.icmp.code = hp->icmp6_code; - return 1; + return true; } /* Add 1; spaces filled with 0. */ static const u_int8_t invmap[] = { [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, - [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, - [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1 }; -static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static const u_int8_t noct_valid_new[] = { + [ICMPV6_MGM_QUERY - 130] = 1, + [ICMPV6_MGM_REPORT -130] = 1, + [ICMPV6_MGM_REDUCTION - 130] = 1, + [NDISC_ROUTER_SOLICITATION - 130] = 1, + [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, + [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, + [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, + [ICMPV6_MLD2_REPORT - 130] = 1 +}; + +static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) - return 0; + return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ @@ -76,35 +93,31 @@ static int icmpv6_print_tuple(struct seq_file *s, ntohs(tuple->src.u.icmp.id)); } +static unsigned int *icmpv6_get_timeouts(struct net *net) +{ + return &icmpv6_pernet(net)->timeout; +} + /* Returns verdict for packet, or -1 for invalid. */ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, - unsigned int hooknum) + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeout) { - /* Try to delete connection immediately after all replies: - won't actually vanish as we still have skb, and del_timer - means this will only run once even if count hits zero twice - (theoretically possible with SMP) */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - if (atomic_dec_and_test(&ct->proto.icmp.count) - && del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); - } else { - atomic_inc(&ct->proto.icmp.count); - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); - } + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ -static int icmpv6_new(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff) +static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) { static const u_int8_t valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, @@ -116,15 +129,20 @@ static int icmpv6_new(struct nf_conn *ct, /* Can't create a new ICMPv6 `conn' with this. */ pr_debug("icmpv6: can't create new conn with type %u\n", type + 128); - NF_CT_DUMP_TUPLE(&ct->tuplehash[0].tuple); - return 0; + nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) + nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL, + NULL, NULL, + "nf_ct_icmpv6: invalid new with type %d ", + type + 128); + return false; } - atomic_set(&ct->proto.icmp.count, 0); - return 1; + return true; } static int -icmpv6_error_message(struct sk_buff *skb, +icmpv6_error_message(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int icmp6off, enum ip_conntrack_info *ctinfo, unsigned int hooknum) @@ -132,6 +150,7 @@ icmpv6_error_message(struct sk_buff *skb, struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; NF_CT_ASSERT(skb->nfct == NULL); @@ -158,7 +177,7 @@ icmpv6_error_message(struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(&intuple); + h = nf_conntrack_find_get(net, zone, &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; @@ -170,49 +189,61 @@ icmpv6_error_message(struct sk_buff *skb, /* Update skb to refer to this connection */ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; skb->nfctinfo = *ctinfo; - return -NF_ACCEPT; + return NF_ACCEPT; } static int -icmpv6_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +icmpv6_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; + int type; icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - if (LOG_INVALID(IPPROTO_ICMPV6)) - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + if (LOG_INVALID(net, IPPROTO_ICMPV6)) + nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: short packet "); return -NF_ACCEPT; } - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, - "nf_ct_icmpv6: ICMPv6 checksum failed\n"); + if (LOG_INVALID(net, IPPROTO_ICMPV6)) + nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: ICMPv6 checksum failed "); return -NF_ACCEPT; } + type = icmp6h->icmp6_type - 130; + if (type >= 0 && type < sizeof(noct_valid_new) && + noct_valid_new[type]) { + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + return NF_ACCEPT; + } + /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); + return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum); } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NLA_PUT_BE16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id); - NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type); - NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code); - + if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) + goto nla_put_failure; return 0; nla_put_failure: @@ -228,40 +259,111 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple) { - if (!tb[CTA_PROTO_ICMPV6_TYPE] - || !tb[CTA_PROTO_ICMPV6_CODE] - || !tb[CTA_PROTO_ICMPV6_ID]) + if (!tb[CTA_PROTO_ICMPV6_TYPE] || + !tb[CTA_PROTO_ICMPV6_CODE] || + !tb[CTA_PROTO_ICMPV6_ID]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - if (tuple->dst.u.icmp.type < 128 - || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) - || !invmap[tuple->dst.u.icmp.type - 128]) + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) return -EINVAL; return 0; } + +static int icmpv6_nlattr_tuple_size(void) +{ + return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); +} #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_icmp_net *in = icmpv6_pernet(net); + + if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; + } else { + /* Set default ICMPv6 timeout. */ + *timeout = in->timeout; + } + return 0; +} + +static int +icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { + [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static struct ctl_table_header *icmpv6_sysctl_header; static struct ctl_table icmpv6_sysctl_table[] = { { .procname = "nf_conntrack_icmpv6_timeout", - .data = &nf_ct_icmpv6_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_SYSCTL */ +static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmpv6_sysctl_table, + sizeof(icmpv6_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmpv6_init_net(struct net *net, u_int16_t proto) +{ + struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmpv6_timeout; + + return icmpv6_kmemdup_sysctl_table(pn, in); +} + +static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = { .l3proto = PF_INET6, @@ -271,15 +373,24 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = .invert_tuple = icmpv6_invert_tuple, .print_tuple = icmpv6_print_tuple, .packet = icmpv6_packet, + .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, .error = icmpv6_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, + .nlattr_tuple_size = icmpv6_nlattr_tuple_size, .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_header = &icmpv6_sysctl_header, - .ctl_table = icmpv6_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, + .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_ICMP_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = icmpv6_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = icmpv6_init_net, + .get_net_proto = icmpv6_get_net_proto, }; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 24c0d03095b..0d5279fd852 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -14,6 +14,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv6-nf: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> @@ -27,7 +29,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> -#include <linux/jhash.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> @@ -39,16 +41,15 @@ #include <net/rawv6.h> #include <net/ndisc.h> #include <net/addrconf.h> +#include <net/inet_ecn.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/kernel.h> #include <linux/module.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> -#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */ -#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ -#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT struct nf_ct_frag6_skb_cb { @@ -59,181 +60,172 @@ struct nf_ct_frag6_skb_cb #define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) -struct nf_ct_frag6_queue -{ - struct inet_frag_queue q; - - __be32 id; /* fragment id */ - struct in6_addr saddr; - struct in6_addr daddr; - - unsigned int csum; - __u16 nhoffset; -}; - static struct inet_frags nf_frags; -static struct netns_frags nf_init_frags; #ifdef CONFIG_SYSCTL -struct ctl_table nf_ct_ipv6_sysctl_table[] = { +static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &nf_init_frags.timeout, + .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH, .procname = "nf_conntrack_frag6_low_thresh", - .data = &nf_init_frags.low_thresh, + .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, .procname = "nf_conntrack_frag6_high_thresh", - .data = &nf_init_frags.high_thresh, + .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; -#endif -static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr, - struct in6_addr *daddr) +static int nf_ct_frag6_sysctl_register(struct net *net) { - u32 a, b, c; - - a = (__force u32)saddr->s6_addr32[0]; - b = (__force u32)saddr->s6_addr32[1]; - c = (__force u32)saddr->s6_addr32[2]; - - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += nf_frags.rnd; - __jhash_mix(a, b, c); + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = nf_ct_frag6_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table), + GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->nf_frag.frags.timeout; + table[1].data = &net->nf_frag.frags.low_thresh; + table[2].data = &net->nf_frag.frags.high_thresh; + } - a += (__force u32)saddr->s6_addr32[3]; - b += (__force u32)daddr->s6_addr32[0]; - c += (__force u32)daddr->s6_addr32[1]; - __jhash_mix(a, b, c); + hdr = register_net_sysctl(net, "net/netfilter", table); + if (hdr == NULL) + goto err_reg; - a += (__force u32)daddr->s6_addr32[2]; - b += (__force u32)daddr->s6_addr32[3]; - c += (__force u32)id; - __jhash_mix(a, b, c); + net->nf_frag.sysctl.frags_hdr = hdr; + return 0; - return c & (INETFRAGS_HASHSZ - 1); +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; } -static unsigned int nf_hashfn(struct inet_frag_queue *q) +static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { - struct nf_ct_frag6_queue *nq; + struct ctl_table *table; - nq = container_of(q, struct nf_ct_frag6_queue, q); - return ip6qhashfn(nq->id, &nq->saddr, &nq->daddr); + table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr); + if (!net_eq(net, &init_net)) + kfree(table); } -static void nf_skb_free(struct sk_buff *skb) +#else +static int nf_ct_frag6_sysctl_register(struct net *net) { - if (NFCT_FRAG6_CB(skb)->orig) - kfree_skb(NFCT_FRAG6_CB(skb)->orig); + return 0; } - -/* Memory Tracking Functions. */ -static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) +static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { - if (work) - *work -= skb->truesize; - atomic_sub(skb->truesize, &nf_init_frags.mem); - nf_skb_free(skb); - kfree_skb(skb); } +#endif -/* Destruction primitives. */ +static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) +{ + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); +} -static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) +static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, + const struct in6_addr *daddr) { - inet_frag_put(&fq->q, &nf_frags); + u32 c; + + net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); + c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, nf_frags.rnd); + return c & (INETFRAGS_HASHSZ - 1); } -/* Kill fq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) + +static unsigned int nf_hashfn(struct inet_frag_queue *q) { - inet_frag_kill(&fq->q, &nf_frags); + const struct frag_queue *nq; + + nq = container_of(q, struct frag_queue, q); + return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); } -static void nf_ct_frag6_evictor(void) +static void nf_skb_free(struct sk_buff *skb) { - local_bh_disable(); - inet_frag_evictor(&nf_init_frags, &nf_frags); - local_bh_enable(); + if (NFCT_FRAG6_CB(skb)->orig) + kfree_skb(NFCT_FRAG6_CB(skb)->orig); } static void nf_ct_frag6_expire(unsigned long data) { - struct nf_ct_frag6_queue *fq; - - fq = container_of((struct inet_frag_queue *)data, - struct nf_ct_frag6_queue, q); - - spin_lock(&fq->q.lock); - - if (fq->q.last_in & COMPLETE) - goto out; + struct frag_queue *fq; + struct net *net; - fq_kill(fq); + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + net = container_of(fq->q.net, struct net, nf_frag.frags); -out: - spin_unlock(&fq->q.lock); - fq_put(fq); + ip6_expire_frag_queue(net, fq, &nf_frags); } /* Creation primitives. */ - -static __inline__ struct nf_ct_frag6_queue * -fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) +static inline struct frag_queue *fq_find(struct net *net, __be32 id, + u32 user, struct in6_addr *src, + struct in6_addr *dst, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; unsigned int hash; arg.id = id; + arg.user = user; arg.src = src; arg.dst = dst; - hash = ip6qhashfn(id, src, dst); - - q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); - if (q == NULL) - goto oom; + arg.ecn = ecn; - return container_of(q, struct nf_ct_frag6_queue, q); + read_lock_bh(&nf_frags.lock); + hash = nf_hash_frag(id, src, dst); -oom: - pr_debug("Can't alloc new queue\n"); - return NULL; + q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); + local_bh_enable(); + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } + return container_of(q, struct frag_queue, q); } -static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr, int nhoff) +static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, + const struct frag_hdr *fhdr, int nhoff) { struct sk_buff *prev, *next; + unsigned int payload_len; int offset, end; + u8 ecn; - if (fq->q.last_in & COMPLETE) { - pr_debug("Allready completed\n"); + if (fq->q.last_in & INET_FRAG_COMPLETE) { + pr_debug("Already completed\n"); goto err; } + payload_len = ntohs(ipv6_hdr(skb)->payload_len); + offset = ntohs(fhdr->frag_off) & ~0x7; - end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + end = offset + (payload_len - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { @@ -241,6 +233,8 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, return -1; } + ecn = ip6_frag_ecn(ipv6_hdr(skb)); + if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, @@ -254,11 +248,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, * or have different end, the segment is corrupted. */ if (end < fq->q.len || - ((fq->q.last_in & LAST_IN) && end != fq->q.len)) { + ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) { pr_debug("already received last fragment\n"); goto err; } - fq->q.last_in |= LAST_IN; + fq->q.last_in |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. @@ -273,7 +267,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ - if (fq->q.last_in & LAST_IN) { + if (fq->q.last_in & INET_FRAG_LAST_IN) { pr_debug("last packet already reached.\n"); goto err; } @@ -298,6 +292,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, * in the chain of fragments so far. We must know where to put * this fragment, right? */ + prev = fq->q.fragments_tail; + if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { if (NFCT_FRAG6_CB(next)->offset >= offset) @@ -305,93 +304,59 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, prev = next; } - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. +found: + /* RFC5722, Section 4: + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments, including those not yet received) MUST be silently + * discarded. */ - if (prev) { - int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; - if (i > 0) { - offset += i; - if (end <= offset) { - pr_debug("overlap\n"); - goto err; - } - if (!pskb_pull(skb, i)) { - pr_debug("Can't pull\n"); - goto err; - } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } - - /* Look for overlap with succeeding segments. - * If we can merge fragments, do it. - */ - while (next && NFCT_FRAG6_CB(next)->offset < end) { - /* overlap is 'i' bytes */ - int i = end - NFCT_FRAG6_CB(next)->offset; - - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - pr_debug("Eat head of the overlapped parts.: %d", i); - if (!pskb_pull(next, i)) - goto err; - - /* next fragment */ - NFCT_FRAG6_CB(next)->offset += i; - fq->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragmnet is completely overridden with - * new one drop it. - */ - next = next->next; + /* Check for overlap with preceding fragment. */ + if (prev && + (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; - if (prev) - prev->next = next; - else - fq->q.fragments = next; - - fq->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); - } - } + /* Look for overlap with succeeding segment. */ + if (next && NFCT_FRAG6_CB(next)->offset < end) + goto discard_fq; NFCT_FRAG6_CB(skb)->offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; + if (!next) + fq->q.fragments_tail = skb; if (prev) prev->next = skb; else fq->q.fragments = skb; - skb->dev = NULL; + if (skb->dev) { + fq->iif = skb->dev->ifindex; + skb->dev = NULL; + } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &nf_init_frags.mem); + fq->ecn |= ecn; + if (payload_len > fq->q.max_size) + fq->q.max_size = payload_len; + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; - fq->q.last_in |= FIRST_IN; + fq->q.last_in |= INET_FRAG_FIRST_IN; } - write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); - write_unlock(&nf_frags.lock); + + inet_frag_lru_move(&fq->q); return 0; +discard_fq: + inet_frag_kill(&fq->q, &nf_frags); err: return -1; } @@ -406,15 +371,20 @@ err: * the last and the first frames arrived and all the bits are here. */ static struct sk_buff * -nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) { struct sk_buff *fp, *op, *head = fq->q.fragments; int payload_len; + u8 ecn; - fq_kill(fq); + inet_frag_kill(&fq->q, &nf_frags); + + WARN_ON(head == NULL); + WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); - BUG_TRAP(head != NULL); - BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0); + ecn = ip_frag_ecn_table[fq->ecn]; + if (unlikely(ecn == 0xff)) + goto out_fail; /* Unfragmented part is taken from the first segment. */ payload_len = ((head->data - skb_network_header(head)) - @@ -426,7 +396,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) } /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + if (skb_unclone(head, GFP_ATOMIC)) { pr_debug("skb is cloned but can't expand head"); goto out_oom; } @@ -434,20 +404,20 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_shinfo(head)->frag_list) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; - if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) { - pr_debug("Can't alloc skb\n"); + clone = alloc_skb(0, GFP_ATOMIC); + if (clone == NULL) goto out_oom; - } + clone->next = head->next; head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_shinfo(head)->frag_list = NULL; - for (i=0; i<skb_shinfo(head)->nr_frags; i++) - plen += skb_shinfo(head)->frags[i].size; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; @@ -455,7 +425,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &nf_init_frags.mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -469,7 +439,6 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &nf_init_frags.mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -479,13 +448,16 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &nf_init_frags.mem); } + sub_frag_mem_limit(&fq->q, head->truesize); + head->ignore_df = 1; head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); + IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) @@ -494,10 +466,11 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) head->csum); fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ fp = skb_shinfo(head)->frag_list; - if (NFCT_FRAG6_CB(fp)->orig == NULL) + if (fp && NFCT_FRAG6_CB(fp)->orig == NULL) /* at above code, head skb is divided into two skbs. */ fp = fp->next; @@ -513,12 +486,11 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) return head; out_oversize: - if (net_ratelimit()) - printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); + net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", + payload_len); goto out_fail; out_oom: - if (net_ratelimit()) - printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); + net_dbg_ratelimited("nf_ct_frag6_reasm: no memory for reassembly\n"); out_fail: return NULL; } @@ -553,14 +525,14 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) if (!ipv6_ext_hdr(nexthdr)) { return -1; } - if (len < (int)sizeof(struct ipv6_opt_hdr)) { - pr_debug("too short\n"); - return -1; - } if (nexthdr == NEXTHDR_NONE) { pr_debug("next header is none\n"); return -1; } + if (len < (int)sizeof(struct ipv6_opt_hdr)) { + pr_debug("too short\n"); + return -1; + } if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) BUG(); if (nexthdr == NEXTHDR_AUTH) @@ -586,12 +558,14 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) return 0; } -struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) +struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) { struct sk_buff *clone; struct net_device *dev = skb->dev; + struct net *net = skb_dst(skb) ? dev_net(skb_dst(skb)->dev) + : dev_net(skb->dev); struct frag_hdr *fhdr; - struct nf_ct_frag6_queue *fq; + struct frag_queue *fq; struct ipv6hdr *hdr; int fhoff, nhoff; u8 prevhdr; @@ -623,38 +597,35 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) hdr = ipv6_hdr(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone); - if (!(fhdr->frag_off & htons(0xFFF9))) { - pr_debug("Invalid fragment offset\n"); - /* It is not a fragmented frame */ - goto ret_orig; - } - - if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh) - nf_ct_frag6_evictor(); + local_bh_disable(); + inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false); + local_bh_enable(); - fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); + fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, + ip6_frag_ecn(hdr)); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; } - spin_lock(&fq->q.lock); + spin_lock_bh(&fq->q.lock); if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { - spin_unlock(&fq->q.lock); + spin_unlock_bh(&fq->q.lock); pr_debug("Can't insert skb to queue\n"); - fq_put(fq); + inet_frag_put(&fq->q, &nf_frags); goto ret_orig; } - if (fq->q.last_in == (FIRST_IN|LAST_IN) && fq->q.meat == fq->q.len) { + if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { ret_skb = nf_ct_frag6_reasm(fq, dev); if (ret_skb == NULL) pr_debug("Can't reassemble fragmented packets\n"); } - spin_unlock(&fq->q.lock); + spin_unlock_bh(&fq->q.lock); - fq_put(fq); + inet_frag_put(&fq->q, &nf_frags); return ret_skb; ret_orig: @@ -662,50 +633,62 @@ ret_orig: return skb; } -void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, - struct net_device *in, struct net_device *out, - int (*okfn)(struct sk_buff *)) +void nf_ct_frag6_consume_orig(struct sk_buff *skb) { struct sk_buff *s, *s2; for (s = NFCT_FRAG6_CB(skb)->orig; s;) { - nf_conntrack_put_reasm(s->nfct_reasm); - nf_conntrack_get_reasm(skb); - s->nfct_reasm = skb; - s2 = s->next; s->next = NULL; - - NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn, - NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + consume_skb(s); s = s2; } - nf_conntrack_put_reasm(skb); } +static int nf_ct_net_init(struct net *net) +{ + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; + inet_frags_init_net(&net->nf_frag.frags); + + return nf_ct_frag6_sysctl_register(net); +} + +static void nf_ct_net_exit(struct net *net) +{ + nf_ct_frags6_sysctl_unregister(net); + inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); +} + +static struct pernet_operations nf_ct_net_ops = { + .init = nf_ct_net_init, + .exit = nf_ct_net_exit, +}; + int nf_ct_frag6_init(void) { + int ret = 0; + nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; nf_frags.skb_free = nf_skb_free; - nf_frags.qsize = sizeof(struct nf_ct_frag6_queue); + nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.secret_interval = 10 * 60 * HZ; - nf_init_frags.timeout = IPV6_FRAG_TIMEOUT; - nf_init_frags.high_thresh = 256 * 1024; - nf_init_frags.low_thresh = 192 * 1024; - inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); - return 0; + ret = register_pernet_subsys(&nf_ct_net_ops); + if (ret) + inet_frags_fini(&nf_frags); + + return ret; } void nf_ct_frag6_cleanup(void) { + unregister_pernet_subsys(&nf_ct_net_ops); inet_frags_fini(&nf_frags); - - nf_init_frags.low_thresh = 0; - nf_ct_frag6_evictor(); } diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c new file mode 100644 index 00000000000..7b9a748c6ba --- /dev/null +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -0,0 +1,140 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ipv6.h> +#include <net/inet_frag.h> + +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_bridge.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> +#endif +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> + +static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); +#endif + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP6_DEFRAG_CONNTRACK_IN + zone; + else + return IP6_DEFRAG_CONNTRACK_OUT + zone; + +} + +static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Previously seen (loopback)? */ + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + return NF_ACCEPT; +#endif + + reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb)); + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occurred or not fragmented */ + if (reasm == skb) + return NF_ACCEPT; + + nf_ct_frag6_consume_orig(reasm); + + NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, reasm, + (struct net_device *) in, (struct net_device *) out, + okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + + return NF_STOLEN; +} + +static struct nf_hook_ops ipv6_defrag_ops[] = { + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + int ret = 0; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); + return ret; + } + ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't register hooks\n"); + goto cleanup_frag6; + } + return ret; + +cleanup_frag6: + nf_ct_frag6_cleanup(); + return ret; + +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + nf_ct_frag6_cleanup(); +} + +void nf_defrag_ipv6_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c new file mode 100644 index 00000000000..abfe75a2e31 --- /dev/null +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of IPv6 NAT funded by Astaro. + */ +#include <linux/types.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <net/secure_seq.h> +#include <net/checksum.h> +#include <net/ip6_checksum.h> +#include <net/ip6_route.h> +#include <net/ipv6.h> + +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv6; + +#ifdef CONFIG_XFRM +static void nf_nat_ipv6_decode_session(struct sk_buff *skb, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + unsigned long statusbit, + struct flowi *fl) +{ + const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; + struct flowi6 *fl6 = &fl->u.ip6; + + if (ct->status & statusbit) { + fl6->daddr = t->dst.u3.in6; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl6->fl6_dport = t->dst.u.all; + } + + statusbit ^= IPS_NAT_MASK; + + if (ct->status & statusbit) { + fl6->saddr = t->src.u3.in6; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl6->fl6_sport = t->src.u.all; + } +} +#endif + +static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, + const struct nf_nat_range *range) +{ + return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && + ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; +} + +static u32 nf_nat_ipv6_secure_port(const struct nf_conntrack_tuple *t, + __be16 dport) +{ + return secure_ipv6_port_ephemeral(t->src.u3.ip6, t->dst.u3.ip6, dport); +} + +static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_nat_l4proto *l4proto, + const struct nf_conntrack_tuple *target, + enum nf_nat_manip_type maniptype) +{ + struct ipv6hdr *ipv6h; + __be16 frag_off; + int hdroff; + u8 nexthdr; + + if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h))) + return false; + + ipv6h = (void *)skb->data + iphdroff; + nexthdr = ipv6h->nexthdr; + hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), + &nexthdr, &frag_off); + if (hdroff < 0) + goto manip_addr; + + if ((frag_off & htons(~0x7)) == 0 && + !l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff, + target, maniptype)) + return false; +manip_addr: + if (maniptype == NF_NAT_MANIP_SRC) + ipv6h->saddr = target->src.u3.in6; + else + ipv6h->daddr = target->dst.u3.in6; + + return true; +} + +static void nf_nat_ipv6_csum_update(struct sk_buff *skb, + unsigned int iphdroff, __sum16 *check, + const struct nf_conntrack_tuple *t, + enum nf_nat_manip_type maniptype) +{ + const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); + const struct in6_addr *oldip, *newip; + + if (maniptype == NF_NAT_MANIP_SRC) { + oldip = &ipv6h->saddr; + newip = &t->src.u3.in6; + } else { + oldip = &ipv6h->daddr; + newip = &t->dst.u3.in6; + } + inet_proto_csum_replace16(check, skb, oldip->s6_addr32, + newip->s6_addr32, 1); +} + +static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, + u8 proto, void *data, __sum16 *check, + int datalen, int oldlen) +{ + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + if (!(rt->rt6i_flags & RTF_LOCAL) && + (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + + skb_network_offset(skb) + + (data - (void *)skb->data); + skb->csum_offset = (void *)check - data; + *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, proto, 0); + } else { + *check = 0; + *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, proto, + csum_partial(data, datalen, + 0)); + if (proto == IPPROTO_UDP && !*check) + *check = CSUM_MANGLED_0; + } + } else + inet_proto_csum_replace2(check, skb, + htons(oldlen), htons(datalen), 1); +} + +static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range *range) +{ + if (tb[CTA_NAT_V6_MINIP]) { + nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], + sizeof(struct in6_addr)); + range->flags |= NF_NAT_RANGE_MAP_IPS; + } + + if (tb[CTA_NAT_V6_MAXIP]) + nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], + sizeof(struct in6_addr)); + else + range->max_addr = range->min_addr; + + return 0; +} + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { + .l3proto = NFPROTO_IPV6, + .secure_port = nf_nat_ipv6_secure_port, + .in_range = nf_nat_ipv6_in_range, + .manip_pkt = nf_nat_ipv6_manip_pkt, + .csum_update = nf_nat_ipv6_csum_update, + .csum_recalc = nf_nat_ipv6_csum_recalc, + .nlattr_to_range = nf_nat_ipv6_nlattr_to_range, +#ifdef CONFIG_XFRM + .decode_session = nf_nat_ipv6_decode_session, +#endif +}; + +int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + unsigned int hdrlen) +{ + struct { + struct icmp6hdr icmp6; + struct ipv6hdr ip6; + } *inside; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); + const struct nf_nat_l4proto *l4proto; + struct nf_conntrack_tuple target; + unsigned long statusbit; + + NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); + + if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + return 0; + if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) + return 0; + + inside = (void *)skb->data + hdrlen; + if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { + if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) + return 0; + if (ct->status & IPS_NAT_MASK) + return 0; + } + + if (manip == NF_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply direction */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + + l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, inside->ip6.nexthdr); + if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), + l4proto, &ct->tuplehash[!dir].tuple, !manip)) + return 0; + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + inside = (void *)skb->data + hdrlen; + inside->icmp6.icmp6_cksum = 0; + inside->icmp6.icmp6_cksum = + csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + skb->len - hdrlen, IPPROTO_ICMPV6, + csum_partial(&inside->icmp6, + skb->len - hdrlen, 0)); + } + + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, IPPROTO_ICMPV6); + if (!nf_nat_ipv6_manip_pkt(skb, 0, l4proto, &target, manip)) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); + +static int __init nf_nat_l3proto_ipv6_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); + if (err < 0) + goto err1; + err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv6); + if (err < 0) + goto err2; + return err; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); +err1: + return err; +} + +static void __exit nf_nat_l3proto_ipv6_exit(void) +{ + nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv6); + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); +} + +MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf-nat-" __stringify(AF_INET6)); + +module_init(nf_nat_l3proto_ipv6_init); +module_exit(nf_nat_l3proto_ipv6_exit); diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c new file mode 100644 index 00000000000..2205e8eeeac --- /dev/null +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2011 Patrick Mchardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 ICMP NAT code. Development of IPv6 + * NAT funded by Astaro. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/icmpv6.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static bool +icmpv6_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && + ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); +} + +static void +icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + static u16 id; + unsigned int range_size; + unsigned int i; + + range_size = ntohs(range->max_proto.icmp.id) - + ntohs(range->min_proto.icmp.id) + 1; + + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) + range_size = 0xffff; + + for (i = 0; ; ++id) { + tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + + (id % range_size)); + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; + } +} + +static bool +icmpv6_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct icmp6hdr *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct icmp6hdr *)(skb->data + hdroff); + l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum, + tuple, maniptype); + if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || + hdr->icmp6_type == ICMPV6_ECHO_REPLY) { + inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, + hdr->icmp6_identifier, + tuple->src.u.icmp.id, 0); + hdr->icmp6_identifier = tuple->src.u.icmp.id; + } + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_icmpv6 = { + .l4proto = IPPROTO_ICMPV6, + .manip_pkt = icmpv6_manip_pkt, + .in_range = icmpv6_in_range, + .unique_tuple = icmpv6_unique_tuple, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c new file mode 100644 index 00000000000..0d812b31277 --- /dev/null +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ipv6.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv6.h> + +static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + /* malformed packet, drop it */ + if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) + return NF_DROP; + + return nft_do_chain(&pkt, ops); +} + +static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (unlikely(skb->len < sizeof(struct ipv6hdr))) { + if (net_ratelimit()) + pr_info("nf_tables_ipv6: ignoring short SOCK_RAW " + "packet\n"); + return NF_ACCEPT; + } + + return nft_do_chain_ipv6(ops, skb, in, out, okfn); +} + +struct nft_af_info nft_af_ipv6 __read_mostly = { + .family = NFPROTO_IPV6, + .nhooks = NF_INET_NUMHOOKS, + .owner = THIS_MODULE, + .nops = 1, + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, + [NF_INET_LOCAL_OUT] = nft_ipv6_output, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, + }, +}; +EXPORT_SYMBOL_GPL(nft_af_ipv6); + +static int nf_tables_ipv6_init_net(struct net *net) +{ + net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.ipv6 == NULL) + return -ENOMEM; + + memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6)); + + if (nft_register_afinfo(net, net->nft.ipv6) < 0) + goto err; + + return 0; +err: + kfree(net->nft.ipv6); + return -ENOMEM; +} + +static void nf_tables_ipv6_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.ipv6); + kfree(net->nft.ipv6); +} + +static struct pernet_operations nf_tables_ipv6_net_ops = { + .init = nf_tables_ipv6_init_net, + .exit = nf_tables_ipv6_exit_net, +}; + +static const struct nf_chain_type filter_ipv6 = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_ipv6_init(void) +{ + int ret; + + nft_register_chain_type(&filter_ipv6); + ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_ipv6); + + return ret; +} + +static void __exit nf_tables_ipv6_exit(void) +{ + unregister_pernet_subsys(&nf_tables_ipv6_net_ops); + nft_unregister_chain_type(&filter_ipv6); +} + +module_init(nf_tables_ipv6_init); +module_exit(nf_tables_ipv6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(AF_INET6); diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c new file mode 100644 index 00000000000..d189fcb437f --- /dev/null +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ipv6.h> + +/* + * IPv6 NAT chains + */ + +static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + __be16 frag_off; + int hdrlen; + u8 nexthdr; + struct nft_pktinfo pkt; + unsigned int ret; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED + IP_CT_IS_REPLY: + nexthdr = ipv6_hdr(skb)->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, + ops->hooknum, + hdrlen)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall through */ + case IP_CT_NEW: + if (nf_nat_initialized(ct, maniptype)) + break; + + nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); + + ret = nft_do_chain(&pkt, ops); + if (ret != NF_ACCEPT) + return ret; + if (!nf_nat_initialized(ct, maniptype)) { + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } + default: + break; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); +} + +static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct in6_addr daddr = ipv6_hdr(skb)->daddr; + unsigned int ret; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) + skb_dst_drop(skb); + + return ret; +} + +static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo __maybe_unused; + const struct nf_conn *ct __maybe_unused; + unsigned int ret; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3) || + (ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) + if (nf_xfrm_me_harder(skb, AF_INET6) < 0) + ret = NF_DROP; + } +#endif + return ret; +} + +static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int ret; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, + &ct->tuplehash[!dir].tuple.src.u3)) { + if (ip6_route_me_harder(skb)) + ret = NF_DROP; + } +#ifdef CONFIG_XFRM + else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) + if (nf_xfrm_me_harder(skb, AF_INET6)) + ret = NF_DROP; +#endif + } + return ret; +} + +static const struct nf_chain_type nft_chain_nat_ipv6 = { + .name = "nat", + .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .hooks = { + [NF_INET_PRE_ROUTING] = nf_nat_ipv6_prerouting, + [NF_INET_POST_ROUTING] = nf_nat_ipv6_postrouting, + [NF_INET_LOCAL_OUT] = nf_nat_ipv6_output, + [NF_INET_LOCAL_IN] = nf_nat_ipv6_fn, + }, +}; + +static int __init nft_chain_nat_ipv6_init(void) +{ + int err; + + err = nft_register_chain_type(&nft_chain_nat_ipv6); + if (err < 0) + return err; + + return 0; +} + +static void __exit nft_chain_nat_ipv6_exit(void) +{ + nft_unregister_chain_type(&nft_chain_nat_ipv6); +} + +module_init(nft_chain_nat_ipv6_init); +module_exit(nft_chain_nat_ipv6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat"); diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c new file mode 100644 index 00000000000..42031299585 --- /dev/null +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/route.h> + +static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + struct nft_pktinfo pkt; + struct in6_addr saddr, daddr; + u_int8_t hop_limit; + u32 mark, flowlabel; + + /* malformed packet, drop it */ + if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) + return NF_DROP; + + /* save source/dest address, mark, hoplimit, flowlabel, priority */ + memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); + memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); + mark = skb->mark; + hop_limit = ipv6_hdr(skb)->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either */ + flowlabel = *((u32 *)ipv6_hdr(skb)); + + ret = nft_do_chain(&pkt, ops); + if (ret != NF_DROP && ret != NF_QUEUE && + (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || + memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + skb->mark != mark || + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) + return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + + return ret; +} + +static const struct nf_chain_type nft_chain_route_ipv6 = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook, + }, +}; + +static int __init nft_chain_route_init(void) +{ + return nft_register_chain_type(&nft_chain_route_ipv6); +} + +static void __exit nft_chain_route_exit(void) +{ + nft_unregister_chain_type(&nft_chain_route_ipv6); +} + +module_init(nft_chain_route_init); +module_exit(nft_chain_route_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route"); diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c new file mode 100644 index 00000000000..0bc19fa8782 --- /dev/null +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} +EXPORT_SYMBOL_GPL(nft_reject_ipv6_eval); + +static struct nft_expr_type nft_reject_ipv6_type; +static const struct nft_expr_ops nft_reject_ipv6_ops = { + .type = &nft_reject_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv6_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "reject", + .ops = &nft_reject_ipv6_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv6_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv6_type); +} + +static void __exit nft_reject_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv6_type); +} + +module_init(nft_reject_ipv6_module_init); +module_exit(nft_reject_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject"); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c new file mode 100644 index 00000000000..5ec867e4a8b --- /dev/null +++ b/net/ipv6/output_core.c @@ -0,0 +1,98 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. These functions are needed by GSO/GRO implementation. + */ +#include <linux/export.h> +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/addrconf.h> +#include <net/secure_seq.h> + +int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); + int found_rhdr = 0; + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + found_rhdr = 1; + break; + case NEXTHDR_DEST: +#if IS_ENABLED(CONFIG_IPV6_MIP6) + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) + break; +#endif + if (found_rhdr) + return offset; + break; + default : + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); + } + + return offset; +} +EXPORT_SYMBOL(ip6_find_1stfragopt); + +#if IS_ENABLED(CONFIG_IPV6) +int ip6_dst_hoplimit(struct dst_entry *dst) +{ + int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + if (hoplimit == 0) { + struct net_device *dev = dst->dev; + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) + hoplimit = idev->cnf.hop_limit; + else + hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + rcu_read_unlock(); + } + return hoplimit; +} +EXPORT_SYMBOL(ip6_dst_hoplimit); +#endif + +int __ip6_local_out(struct sk_buff *skb) +{ + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + ipv6_hdr(skb)->payload_len = htons(len); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); +} +EXPORT_SYMBOL_GPL(__ip6_local_out); + +int ip6_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip6_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip6_local_out); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c new file mode 100644 index 00000000000..5b7a1ed2aba --- /dev/null +++ b/net/ipv6/ping.c @@ -0,0 +1,275 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * "Ping" sockets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on ipv4/ping.c code. + * + * Authors: Lorenzo Colitti (IPv6 support) + * Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6), + * Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32) + * + */ + +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/protocol.h> +#include <net/udp.h> +#include <net/transp_v6.h> +#include <net/ping.h> + +struct proto pingv6_prot = { + .name = "PINGv6", + .owner = THIS_MODULE, + .init = ping_init_sock, + .close = ping_close, + .connect = ip6_datagram_connect_v6_only, + .disconnect = udp_disconnect, + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .sendmsg = ping_v6_sendmsg, + .recvmsg = ping_recvmsg, + .bind = ping_bind, + .backlog_rcv = ping_queue_rcv_skb, + .hash = ping_hash, + .unhash = ping_unhash, + .get_port = ping_get_port, + .obj_size = sizeof(struct raw6_sock), +}; +EXPORT_SYMBOL_GPL(pingv6_prot); + +static struct inet_protosw pingv6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_ICMPV6, + .prot = &pingv6_prot, + .ops = &inet6_dgram_ops, + .flags = INET_PROTOSW_REUSE, +}; + + +/* Compatibility glue so we can support IPv6 when it's compiled as a module */ +static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, + int *addr_len) +{ + return -EAFNOSUPPORT; +} +static void dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ +} +static int dummy_icmpv6_err_convert(u8 type, u8 code, int *err) +{ + return -EAFNOSUPPORT; +} +static void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) {} +static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict) +{ + return 0; +} + +int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct icmp6hdr user_icmph; + int addr_type; + struct in6_addr *daddr; + int iif = 0; + struct flowi6 fl6; + int err; + int hlimit; + struct dst_entry *dst; + struct rt6_info *rt; + struct pingfakehdr pfh; + + pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph, + sizeof(user_icmph)); + if (err) + return err; + + if (msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); + if (msg->msg_namelen < sizeof(struct sockaddr_in6) || + u->sin6_family != AF_INET6) { + return -EINVAL; + } + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != u->sin6_scope_id) { + return -EINVAL; + } + daddr = &(u->sin6_addr); + iif = u->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &sk->sk_v6_daddr; + } + + if (!iif) + iif = sk->sk_bound_dev_if; + + addr_type = ipv6_addr_type(daddr); + if (__ipv6_addr_needs_scope_id(addr_type) && !iif) + return -EINVAL; + if (addr_type & IPV6_ADDR_MAPPED) + return -EINVAL; + + /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ + + memset(&fl6, 0, sizeof(fl6)); + + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.saddr = np->saddr; + fl6.daddr = *daddr; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_icmp_type = user_icmph.icmp6_type; + fl6.fl6_icmp_code = user_icmph.icmp6_code; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); + if (IS_ERR(dst)) + return PTR_ERR(dst); + rt = (struct rt6_info *) dst; + + np = inet6_sk(sk); + if (!np) + return -EBADF; + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + pfh.icmph.type = user_icmph.icmp6_type; + pfh.icmph.code = user_icmph.icmp6_code; + pfh.icmph.checksum = 0; + pfh.icmph.un.echo.id = inet->inet_sport; + pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence; + pfh.iov = msg->msg_iov; + pfh.wcheck = 0; + pfh.family = AF_INET6; + + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + + lock_sock(sk); + err = ip6_append_data(sk, ping_getfrag, &pfh, len, + 0, hlimit, + np->tclass, NULL, &fl6, rt, + MSG_DONTWAIT, np->dontfrag); + + if (err) { + ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, + ICMP6_MIB_OUTERRORS); + ip6_flush_pending_frames(sk); + } else { + err = icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *) &pfh.icmph, + len); + } + release_sock(sk); + + if (err) + return err; + + return len; +} + +#ifdef CONFIG_PROC_FS +static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos) +{ + return ping_seq_start(seq, pos, AF_INET6); +} + +static int ping_v6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + int bucket = ((struct ping_iter_state *) seq->private)->bucket; + struct inet_sock *inet = inet_sk(v); + __u16 srcp = ntohs(inet->inet_sport); + __u16 destp = ntohs(inet->inet_dport); + ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + } + return 0; +} + +static struct ping_seq_afinfo ping_v6_seq_afinfo = { + .name = "icmp6", + .family = AF_INET6, + .seq_fops = &ping_seq_fops, + .seq_ops = { + .start = ping_v6_seq_start, + .show = ping_v6_seq_show, + .next = ping_seq_next, + .stop = ping_seq_stop, + }, +}; + +static int __net_init ping_v6_proc_init_net(struct net *net) +{ + return ping_proc_register(net, &ping_v6_seq_afinfo); +} + +static void __net_init ping_v6_proc_exit_net(struct net *net) +{ + return ping_proc_unregister(net, &ping_v6_seq_afinfo); +} + +static struct pernet_operations ping_v6_net_ops = { + .init = ping_v6_proc_init_net, + .exit = ping_v6_proc_exit_net, +}; +#endif + +int __init pingv6_init(void) +{ +#ifdef CONFIG_PROC_FS + int ret = register_pernet_subsys(&ping_v6_net_ops); + if (ret) + return ret; +#endif + pingv6_ops.ipv6_recv_error = ipv6_recv_error; + pingv6_ops.ip6_datagram_recv_common_ctl = ip6_datagram_recv_common_ctl; + pingv6_ops.ip6_datagram_recv_specific_ctl = + ip6_datagram_recv_specific_ctl; + pingv6_ops.icmpv6_err_convert = icmpv6_err_convert; + pingv6_ops.ipv6_icmp_error = ipv6_icmp_error; + pingv6_ops.ipv6_chk_addr = ipv6_chk_addr; + return inet6_register_protosw(&pingv6_protosw); +} + +/* This never gets called because it's not possible to unload the ipv6 module, + * but just in case. + */ +void pingv6_exit(void) +{ + pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error; + pingv6_ops.ip6_datagram_recv_common_ctl = dummy_ip6_datagram_recv_ctl; + pingv6_ops.ip6_datagram_recv_specific_ctl = dummy_ip6_datagram_recv_ctl; + pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert; + pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error; + pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr; +#ifdef CONFIG_PROC_FS + unregister_pernet_subsys(&ping_v6_net_ops); +#endif + inet6_unregister_protosw(&pingv6_protosw); +} diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 199ef379e50..3317440ea34 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,8 +7,6 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.17 2002/02/01 22:01:04 davem Exp $ - * * Authors: David S. Miller (davem@caip.rutgers.edu) * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * @@ -23,6 +21,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> +#include <linux/export.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> @@ -31,26 +30,39 @@ #include <net/transp_v6.h> #include <net/ipv6.h> -static struct proc_dir_entry *proc_net_devsnmp6; - static int sockstat6_seq_show(struct seq_file *seq, void *v) { + struct net *net = seq->private; + seq_printf(seq, "TCP6: inuse %d\n", - sock_prot_inuse_get(&tcpv6_prot)); + sock_prot_inuse_get(net, &tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", - sock_prot_inuse_get(&udpv6_prot)); + sock_prot_inuse_get(net, &udpv6_prot)); seq_printf(seq, "UDPLITE6: inuse %d\n", - sock_prot_inuse_get(&udplitev6_prot)); + sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", - sock_prot_inuse_get(&rawv6_prot)); + sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %d memory %d\n", - ip6_frag_nqueues(&init_net), ip6_frag_mem(&init_net)); + ip6_frag_nqueues(net), ip6_frag_mem(net)); return 0; } -static struct snmp_mib snmp6_ipstats_list[] = { +static int sockstat6_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, sockstat6_seq_show); +} + +static const struct file_operations sockstat6_seq_fops = { + .owner = THIS_MODULE, + .open = sockstat6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static const struct snmp_mib snmp6_ipstats_list[] = { /* ipv6 mib according to RFC 2465 */ - SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INRECEIVES), + SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), @@ -60,7 +72,7 @@ static struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), - SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), + SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS), SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), @@ -72,19 +84,32 @@ static struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), + SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS), + SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS), + SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS), + SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), + SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), + SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ + SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp6_icmp6_list[] = { +static const struct snmp_mib snmp6_icmp6_list[] = { /* icmpv6 mib according to RFC 2466 */ SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), + SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), + SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; /* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ -static char *icmp6type2name[256] = { +static const char *const icmp6type2name[256] = { [ICMPV6_DEST_UNREACH] = "DestUnreachs", [ICMPV6_PKT_TOOBIG] = "PktTooBigs", [ICMPV6_TIME_EXCEED] = "TimeExcds", @@ -103,105 +128,139 @@ static char *icmp6type2name[256] = { }; -static struct snmp_mib snmp6_udp6_list[] = { +static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp6_udplite6_list[] = { +static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; -static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void **mib) +static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) { - static char name[32]; + char name[32]; int i; /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { int icmptype; - char *p; + const char *p; icmptype = i & 0xff; p = icmp6type2name[icmptype]; if (!p) /* don't print un-named types here */ continue; - (void) snprintf(name, sizeof(name)-1, "Icmp6%s%s", + snprintf(name, sizeof(name), "Icmp6%s%s", i & 0x100 ? "Out" : "In", p); seq_printf(seq, "%-32s\t%lu\n", name, - snmp_fold_field(mib, i)); + atomic_long_read(smib + i)); } /* print by number (nonzero only) - ICMPMsgStat format */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { unsigned long val; - val = snmp_fold_field(mib, i); + val = atomic_long_read(smib + i); if (!val) continue; - (void) snprintf(name, sizeof(name)-1, "Icmp6%sType%u", + snprintf(name, sizeof(name), "Icmp6%sType%u", i & 0x100 ? "Out" : "In", i & 0xff); seq_printf(seq, "%-32s\t%lu\n", name, val); } - return; } -static inline void -snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist) +/* can be called either with percpu mib (pcpumib != NULL), + * or shared one (smib != NULL) + */ +static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, + atomic_long_t *smib, + const struct snmp_mib *itemlist) { int i; - for (i=0; itemlist[i].name; i++) - seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, - snmp_fold_field(mib, itemlist[i].entry)); + unsigned long val; + + for (i = 0; itemlist[i].name; i++) { + val = pcpumib ? + snmp_fold_field(pcpumib, itemlist[i].entry) : + atomic_long_read(smib + itemlist[i].entry); + seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val); + } } -static int snmp6_seq_show(struct seq_file *seq, void *v) +static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, + const struct snmp_mib *itemlist, size_t syncpoff) { - struct inet6_dev *idev = (struct inet6_dev *)seq->private; + int i; - if (idev) { - seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); - snmp6_seq_show_item(seq, (void **)idev->stats.ipv6, snmp6_ipstats_list); - snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg); - } else { - snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); - snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics); - snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); - snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list); - } + for (i = 0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, + snmp_fold_field64(mib, itemlist[i].entry, syncpoff)); +} + +static int snmp6_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = (struct net *)seq->private; + + snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, + snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, + NULL, snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); + snmp6_seq_show_item(seq, net->mib.udp_stats_in6, + NULL, snmp6_udp6_list); + snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, + NULL, snmp6_udplite6_list); return 0; } -static int sockstat6_seq_open(struct inode *inode, struct file *file) +static int snmp6_seq_open(struct inode *inode, struct file *file) { - return single_open(file, sockstat6_seq_show, NULL); + return single_open_net(inode, file, snmp6_seq_show); } -static const struct file_operations sockstat6_seq_fops = { +static const struct file_operations snmp6_seq_fops = { .owner = THIS_MODULE, - .open = sockstat6_seq_open, + .open = snmp6_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = single_release_net, }; -static int snmp6_seq_open(struct inode *inode, struct file *file) +static int snmp6_dev_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_dev *idev = (struct inet6_dev *)seq->private; + + seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); + snmp6_seq_show_item64(seq, idev->stats.ipv6, + snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, + snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); + return 0; +} + +static int snmp6_dev_seq_open(struct inode *inode, struct file *file) { - return single_open(file, snmp6_seq_show, PDE(inode)->data); + return single_open(file, snmp6_dev_seq_show, PDE_DATA(inode)); } -static const struct file_operations snmp6_seq_fops = { +static const struct file_operations snmp6_dev_seq_fops = { .owner = THIS_MODULE, - .open = snmp6_seq_open, + .open = snmp6_dev_seq_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, @@ -210,65 +269,77 @@ static const struct file_operations snmp6_seq_fops = { int snmp6_register_dev(struct inet6_dev *idev) { struct proc_dir_entry *p; + struct net *net; if (!idev || !idev->dev) return -EINVAL; - if (!proc_net_devsnmp6) + net = dev_net(idev->dev); + if (!net->mib.proc_net_devsnmp6) return -ENOENT; - p = proc_create(idev->dev->name, S_IRUGO, - proc_net_devsnmp6, &snmp6_seq_fops); + p = proc_create_data(idev->dev->name, S_IRUGO, + net->mib.proc_net_devsnmp6, + &snmp6_dev_seq_fops, idev); if (!p) return -ENOMEM; - p->data = idev; - idev->stats.proc_dir_entry = p; return 0; } int snmp6_unregister_dev(struct inet6_dev *idev) { - if (!proc_net_devsnmp6) + struct net *net = dev_net(idev->dev); + if (!net->mib.proc_net_devsnmp6) return -ENOENT; - if (!idev || !idev->stats.proc_dir_entry) + if (!idev->stats.proc_dir_entry) return -EINVAL; - remove_proc_entry(idev->stats.proc_dir_entry->name, - proc_net_devsnmp6); + proc_remove(idev->stats.proc_dir_entry); idev->stats.proc_dir_entry = NULL; return 0; } -int __init ipv6_misc_proc_init(void) +static int __net_init ipv6_proc_init_net(struct net *net) { - int rc = 0; + if (!proc_create("sockstat6", S_IRUGO, net->proc_net, + &sockstat6_seq_fops)) + return -ENOMEM; - if (!proc_net_fops_create(&init_net, "snmp6", S_IRUGO, &snmp6_seq_fops)) + if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops)) goto proc_snmp6_fail; - proc_net_devsnmp6 = proc_mkdir("dev_snmp6", init_net.proc_net); - if (!proc_net_devsnmp6) + net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); + if (!net->mib.proc_net_devsnmp6) goto proc_dev_snmp6_fail; + return 0; - if (!proc_net_fops_create(&init_net, "sockstat6", S_IRUGO, &sockstat6_seq_fops)) - goto proc_sockstat6_fail; -out: - return rc; - -proc_sockstat6_fail: - proc_net_remove(&init_net, "dev_snmp6"); proc_dev_snmp6_fail: - proc_net_remove(&init_net, "snmp6"); + remove_proc_entry("snmp6", net->proc_net); proc_snmp6_fail: - rc = -ENOMEM; - goto out; + remove_proc_entry("sockstat6", net->proc_net); + return -ENOMEM; +} + +static void __net_exit ipv6_proc_exit_net(struct net *net) +{ + remove_proc_entry("sockstat6", net->proc_net); + remove_proc_entry("dev_snmp6", net->proc_net); + remove_proc_entry("snmp6", net->proc_net); +} + +static struct pernet_operations ipv6_proc_ops = { + .init = ipv6_proc_init_net, + .exit = ipv6_proc_exit_net, +}; + +int __init ipv6_misc_proc_init(void) +{ + return register_pernet_subsys(&ipv6_proc_ops); } void ipv6_misc_proc_exit(void) { - proc_net_remove(&init_net, "sockstat6"); - proc_net_remove(&init_net, "dev_snmp6"); - proc_net_remove(&init_net, "snmp6"); + unregister_pernet_subsys(&ipv6_proc_ops); } diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index f929f47b925..e048cf1bb6a 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -5,8 +5,6 @@ * * PF_INET6 protocol dispatch tables. * - * Version: $Id: protocol.c,v 1.10 2001/05/18 02:25:49 davem Exp $ - * * Authors: Pedro Roque <roque@di.fc.ul.pt> * * This program is free software; you can redistribute it and/or @@ -22,68 +20,54 @@ * - Removed unused variable 'inet6_protocol_base' * - Modified inet6_del_protocol() to correctly maintain copy bit. */ - -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/in6.h> +#include <linux/module.h> #include <linux/netdevice.h> -#include <linux/if_arp.h> - -#include <net/sock.h> -#include <net/snmp.h> - -#include <net/ipv6.h> +#include <linux/spinlock.h> #include <net/protocol.h> -struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; -static DEFINE_SPINLOCK(inet6_proto_lock); - +#if IS_ENABLED(CONFIG_IPV6) +const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet6_protos); -int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol) +int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); + return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet6_add_protocol); - spin_lock_bh(&inet6_proto_lock); +int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol) +{ + int ret; - if (inet6_protos[hash]) { - ret = -1; - } else { - inet6_protos[hash] = prot; - ret = 0; - } + ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol], + prot, NULL) == prot) ? 0 : -1; - spin_unlock_bh(&inet6_proto_lock); + synchronize_net(); return ret; } +EXPORT_SYMBOL(inet6_del_protocol); +#endif -EXPORT_SYMBOL(inet6_add_protocol); - -/* - * Remove a protocol from the hash tables. - */ +const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly; -int inet6_del_protocol(struct inet6_protocol *prot, unsigned char protocol) +int inet6_add_offload(const struct net_offload *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet6_proto_lock); + return !cmpxchg((const struct net_offload **)&inet6_offloads[protocol], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet6_add_offload); - if (inet6_protos[hash] != prot) { - ret = -1; - } else { - inet6_protos[hash] = NULL; - ret = 0; - } +int inet6_del_offload(const struct net_offload *prot, unsigned char protocol) +{ + int ret; - spin_unlock_bh(&inet6_proto_lock); + ret = (cmpxchg((const struct net_offload **)&inet6_offloads[protocol], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); return ret; } - -EXPORT_SYMBOL(inet6_del_protocol); +EXPORT_SYMBOL(inet6_del_offload); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8897ccf8086..b2dc60b0c76 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,8 +7,6 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.51 2002/02/01 22:01:04 davem Exp $ - * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) @@ -23,6 +21,7 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> +#include <linux/slab.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> @@ -32,6 +31,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/skbuff.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -50,9 +50,10 @@ #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif +#include <linux/mroute6.h> #include <net/raw.h> #include <net/rawv6.h> @@ -60,45 +61,35 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/export.h> + +#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ static struct raw_hashinfo raw_v6_hashinfo = { - .lock = __RW_LOCK_UNLOCKED(), + .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock), }; -static void raw_v6_hash(struct sock *sk) -{ - raw_hash_sk(sk, &raw_v6_hashinfo); -} - -static void raw_v6_unhash(struct sock *sk) -{ - raw_unhash_sk(sk, &raw_v6_hashinfo); -} - - static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, - unsigned short num, struct in6_addr *loc_addr, - struct in6_addr *rmt_addr, int dif) + unsigned short num, const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif) { - struct hlist_node *node; - int is_multicast = ipv6_addr_is_multicast(loc_addr); + bool is_multicast = ipv6_addr_is_multicast(loc_addr); - sk_for_each_from(sk, node) - if (inet_sk(sk)->num == num) { - struct ipv6_pinfo *np = inet6_sk(sk); + sk_for_each_from(sk) + if (inet_sk(sk)->inet_num == num) { - if (sk->sk_net != net) + if (!net_eq(sock_net(sk), net)) continue; - if (!ipv6_addr_any(&np->daddr) && - !ipv6_addr_equal(&np->daddr, rmt_addr)) + if (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) goto found; if (is_multicast && inet6_mc_check(sk, loc_addr, rmt_addr)) @@ -116,38 +107,40 @@ found: * 0 - deliver * 1 - block */ -static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) +static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) { - struct icmp6hdr *icmph; - struct raw6_sock *rp = raw6_sk(sk); + struct icmp6hdr _hdr; + const struct icmp6hdr *hdr; - if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) { - __u32 *data = &rp->filter.data[0]; - int bit_nr; - - icmph = (struct icmp6hdr *) skb->data; - bit_nr = icmph->icmp6_type; + /* We require only the four bytes of the ICMPv6 header, not any + * additional bytes of message body in "struct icmp6hdr". + */ + hdr = skb_header_pointer(skb, skb_transport_offset(skb), + ICMPV6_HDRLEN, &_hdr); + if (hdr) { + const __u32 *data = &raw6_sk(sk)->filter.data[0]; + unsigned int type = hdr->icmp6_type; - return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0; + return (data[type >> 5] & (1U << (type & 31))) != 0; } - return 0; + return 1; } -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) -static int (*mh_filter)(struct sock *sock, struct sk_buff *skb); +#if IS_ENABLED(CONFIG_IPV6_MIP6) +typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); + +static mh_filter_t __rcu *mh_filter __read_mostly; -int rawv6_mh_filter_register(int (*filter)(struct sock *sock, - struct sk_buff *skb)) +int rawv6_mh_filter_register(mh_filter_t filter) { rcu_assign_pointer(mh_filter, filter); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_register); -int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock, - struct sk_buff *skb)) +int rawv6_mh_filter_unregister(mh_filter_t filter) { - rcu_assign_pointer(mh_filter, NULL); + RCU_INIT_POINTER(mh_filter, NULL); synchronize_rcu(); return 0; } @@ -162,44 +155,39 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); * * Caller owns SKB so we must make clones. */ -static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { - struct in6_addr *saddr; - struct in6_addr *daddr; + const struct in6_addr *saddr; + const struct in6_addr *daddr; struct sock *sk; - int delivered = 0; + bool delivered = false; __u8 hash; struct net *net; saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; - hash = nexthdr & (MAX_INET_PROTOS - 1); + hash = nexthdr & (RAW_HTABLE_SIZE - 1); read_lock(&raw_v6_hashinfo.lock); sk = sk_head(&raw_v6_hashinfo.ht[hash]); - /* - * The first socket found will be delivered after - * delivery to transport protocols. - */ - if (sk == NULL) goto out; - net = skb->dev->nd_net; + net = dev_net(skb->dev); sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif); while (sk) { int filtered; - delivered = 1; + delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); break; -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: { /* XXX: To validate MH only once for each packet, @@ -208,10 +196,10 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) * policy is placed in rawv6_rcv() because it is * required for each socket. */ - int (*filter)(struct sock *sock, struct sk_buff *skb); + mh_filter_t *filter; filter = rcu_dereference(mh_filter); - filtered = filter ? filter(sk, skb) : 0; + filtered = filter ? (*filter)(sk, skb) : 0; break; } #endif @@ -239,11 +227,11 @@ out: return delivered; } -int raw6_local_deliver(struct sk_buff *skb, int nexthdr) +bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { struct sock *raw_sk; - raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]); + raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]); if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) raw_sk = NULL; @@ -262,11 +250,15 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); /* Raw sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) - return(-EADDRNOTAVAIL); + return -EADDRNOTAVAIL; lock_sock(sk); @@ -274,11 +266,12 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (sk->sk_state != TCP_CLOSE) goto out; + rcu_read_lock(); /* Check if the address belongs to the host. */ if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; - if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another @@ -289,13 +282,13 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) - goto out; + goto out_unlock; - dev = dev_get_by_index(sk->sk_net, sk->sk_bound_dev_if); - if (!dev) { - err = -ENODEV; - goto out; - } + err = -ENODEV; + dev = dev_get_by_index_rcu(sock_net(sk), + sk->sk_bound_dev_if); + if (!dev) + goto out_unlock; } /* ipv4 addr of the socket is invalid. Only the @@ -304,22 +297,20 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { err = -EADDRNOTAVAIL; - if (!ipv6_chk_addr(sk->sk_net, &addr->sin6_addr, + if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { - if (dev) - dev_put(dev); - goto out; + goto out_unlock; } } - if (dev) - dev_put(dev); } - inet->rcv_saddr = inet->saddr = v4addr; - ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + inet->inet_rcv_saddr = inet->inet_saddr = v4addr; + sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) - ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + np->saddr = addr->sin6_addr; err = 0; +out_unlock: + rcu_read_unlock(); out: release_sock(sk); return err; @@ -327,7 +318,7 @@ out: static void rawv6_err(struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -343,9 +334,14 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, return; harderr = icmpv6_err_convert(type, code, &err); - if (type == ICMPV6_PKT_TOOBIG) + if (type == ICMPV6_PKT_TOOBIG) { + ip6_sk_update_pmtu(skb, sk, info); harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); - + } + if (type == NDISC_REDIRECT) { + ip6_sk_redirect(skb, sk); + return; + } if (np->recverr) { u8 *payload = skb->data; if (!inet->hdrincl) @@ -360,11 +356,11 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, } void raw6_icmp_error(struct sk_buff *skb, int nexthdr, - int type, int code, int inner_offset, __be32 info) + u8 type, u8 code, int inner_offset, __be32 info) { struct sock *sk; int hash; - struct in6_addr *saddr, *daddr; + const struct in6_addr *saddr, *daddr; struct net *net; hash = nexthdr & (RAW_HTABLE_SIZE - 1); @@ -372,9 +368,11 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, read_lock(&raw_v6_hashinfo.lock); sk = sk_head(&raw_v6_hashinfo.ht[hash]); if (sk != NULL) { - saddr = &ipv6_hdr(skb)->saddr; - daddr = &ipv6_hdr(skb)->daddr; - net = skb->dev->nd_net; + /* Note: ipv6_hdr(skb) != skb->data */ + const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; + saddr = &ip6h->saddr; + daddr = &ip6h->daddr; + net = dev_net(skb->dev); while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, IP6CB(skb)->iif))) { @@ -386,20 +384,20 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, read_unlock(&raw_v6_hashinfo.lock); } -static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) +static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { - if ((raw6_sk(sk)->checksum || sk->sk_filter) && + if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb(skb); - return 0; + return NET_RX_DROP; } /* Charge it to the socket. */ - if (sock_queue_rcv_skb(sk,skb)<0) { - atomic_inc(&sk->sk_drops); + skb_dst_drop(skb); + if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); - return 0; + return NET_RX_DROP; } return 0; @@ -431,20 +429,20 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) skb_network_header_len(skb)); if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, - skb->len, inet->num, skb->csum)) + skb->len, inet->inet_num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } if (!skb_csum_unnecessary(skb)) skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, - inet->num, 0)); + inet->inet_num, 0)); if (inet->hdrincl) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb(skb); - return 0; + return NET_RX_DROP; } } @@ -463,7 +461,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, int noblock, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct sk_buff *skb; size_t copied; int err; @@ -471,11 +469,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (flags & MSG_OOB) return -EOPNOTSUPP; - if (addr_len) - *addr_len=sizeof(*sin6); - if (flags & MSG_ERRQUEUE) - return ipv6_recv_error(sk, msg, len); + return ipv6_recv_error(sk, msg, len, addr_len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) @@ -505,17 +503,17 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; - ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); + sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = IP6CB(skb)->iif; + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); + *addr_len = sizeof(*sin6); } - sock_recv_timestamp(msg, sk, skb); + sock_recv_ts_and_drops(msg, sk, skb); if (np->rxopt.all) - datagram_recv_ctl(sk, msg, skb); + ip6_datagram_recv_ctl(sk, msg, skb); err = copied; if (flags & MSG_TRUNC) @@ -533,11 +531,10 @@ csum_copy_err: as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; - atomic_inc(&sk->sk_drops); goto out; } -static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, +static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { struct sk_buff *skb; @@ -555,8 +552,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, goto out; offset = rp->offset; - total_len = inet_sk(sk)->cork.length - (skb_network_header(skb) - - skb->data); + total_len = inet_sk(sk)->cork.base.length; if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); @@ -599,11 +595,10 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, if (unlikely(csum)) tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); - csum = csum_ipv6_magic(&fl->fl6_src, - &fl->fl6_dst, - total_len, fl->proto, tmp_csum); + csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + total_len, fl6->flowi6_proto, tmp_csum); - if (csum == 0 && fl->proto == IPPROTO_UDP) + if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; if (skb_store_bits(skb, offset, &csum, 2)) @@ -616,33 +611,36 @@ out: } static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, - struct flowi *fl, struct rt6_info *rt, + struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *iph; struct sk_buff *skb; - unsigned int hh_len; int err; + struct rt6_info *rt = (struct rt6_info *)*dstp; + int hlen = LL_RESERVED_SPACE(rt->dst.dev); + int tlen = rt->dst.dev->needed_tailroom; - if (length > rt->u.dst.dev->mtu) { - ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu); + if (length > rt->dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } if (flags&MSG_PROBE) goto out; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); - - skb = sock_alloc_send_skb(sk, length+hh_len+15, - flags&MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, + length + hlen + tlen + 15, + flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, hh_len); + skb_reserve(skb, hlen); + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb->dst = dst_clone(&rt->u.dst); + skb_dst_set(skb, &rt->dst); + *dstp = NULL; skb_put(skb, length); skb_reset_network_header(skb); @@ -655,11 +653,11 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, if (err) goto error_fault; - IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + rt->dst.dev, dst_output); if (err > 0) - err = np->recverr ? net_xmit_errno(err) : 0; + err = net_xmit_errno(err); if (err) goto error; out: @@ -669,11 +667,13 @@ error_fault: err = -EFAULT; kfree_skb(skb); error: - IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + if (err == -ENOBUFS && !np->recverr) + err = 0; return err; } -static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg) { struct iovec *iov; u8 __user *type = NULL; @@ -690,7 +690,7 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) if (!iov) continue; - switch (fl->proto) { + switch (fl6->flowi6_proto) { case IPPROTO_ICMPV6: /* check if one-byte field is readable or not. */ if (iov->iov_base && iov->iov_len < 1) @@ -705,8 +705,8 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) code = iov->iov_base; if (type && code) { - if (get_user(fl->fl_icmp_type, type) || - get_user(fl->fl_icmp_code, code)) + if (get_user(fl6->fl6_icmp_type, type) || + get_user(fl6->fl6_icmp_code, code)) return -EFAULT; probed = 1; } @@ -717,7 +717,7 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) /* check if type field is readable or not. */ if (iov->iov_len > 2 - len) { u8 __user *p = iov->iov_base; - if (get_user(fl->fl_mh_type, &p[2 - len])) + if (get_user(fl6->fl6_mh_type, &p[2 - len])) return -EFAULT; probed = 1; } else @@ -738,18 +738,19 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; - struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct in6_addr *daddr, *final_p = NULL, final; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); + struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct raw6_sock *rp = raw6_sk(sk); struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; - struct flowi fl; + struct flowi6 fl6; int addr_len = msg->msg_namelen; int hlimit = -1; int tclass = -1; + int dontfrag = -1; u16 proto; int err; @@ -766,36 +767,35 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, /* * Get and verify the address. */ - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); - fl.mark = sk->sk_mark; + fl6.flowi6_mark = sk->sk_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (sin6->sin6_family && sin6->sin6_family != AF_INET6) - return(-EAFNOSUPPORT); + return -EAFNOSUPPORT; /* port is the proto value [0..255] carried in nexthdr */ proto = ntohs(sin6->sin6_port); if (!proto) - proto = inet->num; - else if (proto != inet->num) - return(-EINVAL); + proto = inet->inet_num; + else if (proto != inet->inet_num) + return -EINVAL; if (proto > 255) - return(-EINVAL); + return -EINVAL; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } @@ -804,46 +804,38 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && - ipv6_addr_equal(daddr, &np->daddr)) - daddr = &np->daddr; + ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) + daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && - ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl.oif = sin6->sin6_scope_id; + __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) + fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - proto = inet->num; - daddr = &np->daddr; - fl.fl6_flowlabel = np->flow_label; - } - - if (ipv6_addr_any(daddr)) { - /* - * unspecified destination address - * treated as error... is this correct ? - */ - fl6_sock_release(flowlabel); - return(-EINVAL); + proto = inet->inet_num; + daddr = &sk->sk_v6_daddr; + fl6.flowlabel = np->flow_label; } - if (fl.oif == 0) - fl.oif = sk->sk_bound_dev_if; + if (fl6.flowi6_oif == 0) + fl6.flowi6_oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); - err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, + &hlimit, &tclass, &dontfrag); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -856,73 +848,56 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); - fl.proto = proto; - err = rawv6_probe_proto_opt(&fl, msg); + fl6.flowi6_proto = proto; + err = rawv6_probe_proto_opt(&fl6, msg); if (err) goto out; - ipv6_addr_copy(&fl.fl6_dst, daddr); - if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl.fl6_src, &np->saddr); + if (!ipv6_addr_any(daddr)) + fl6.daddr = *daddr; + else + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + fl6.saddr = np->saddr; - /* merge ip6_build_xmit from ip6_output */ - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + final_p = fl6_update_dst(&fl6, opt, &final); - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) - fl.oif = np->mcast_oif; - security_sk_classify_flow(sk, &fl); + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); goto out; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto out; } + if (hlimit < 0) + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = dst_metric(dst, RTAX_HOPLIMIT); - if (hlimit < 0) - hlimit = ipv6_get_hoplimit(dst->dev); - } - - if (tclass < 0) { + if (tclass < 0) tclass = np->tclass; - if (tclass < 0) - tclass = 0; - } + + if (dontfrag < 0) + dontfrag = np->dontfrag; if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: - if (inet->hdrincl) { - err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags); - } else { + if (inet->hdrincl) + err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags); + else { lock_sock(sk); err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, - len, 0, hlimit, tclass, opt, &fl, (struct rt6_info*)dst, - msg->msg_flags); + len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, + msg->msg_flags, dontfrag); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) - err = rawv6_push_pending_frames(sk, &fl, rp); + err = rawv6_push_pending_frames(sk, &fl6, rp); release_sock(sk); } done: @@ -982,7 +957,7 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; @@ -991,44 +966,54 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; switch (optname) { - case IPV6_CHECKSUM: - /* You may get strange result with a positive odd offset; - RFC2292bis agrees with me. */ - if (val > 0 && (val&1)) - return(-EINVAL); - if (val < 0) { - rp->checksum = 0; - } else { - rp->checksum = 1; - rp->offset = val; - } + case IPV6_CHECKSUM: + if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && + level == IPPROTO_IPV6) { + /* + * RFC3542 tells that IPV6_CHECKSUM socket + * option in the IPPROTO_IPV6 level is not + * allowed on ICMPv6 sockets. + * If you want to set it, use IPPROTO_RAW + * level IPV6_CHECKSUM socket option + * (Linux extension). + */ + return -EINVAL; + } - return 0; - break; + /* You may get strange result with a positive odd offset; + RFC2292bis agrees with me. */ + if (val > 0 && (val&1)) + return -EINVAL; + if (val < 0) { + rp->checksum = 0; + } else { + rp->checksum = 1; + rp->offset = val; + } - default: - return(-ENOPROTOOPT); + return 0; + + default: + return -ENOPROTOOPT; } } static int rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { - switch(level) { - case SOL_RAW: - break; + switch (level) { + case SOL_RAW: + break; - case SOL_ICMPV6: - if (inet_sk(sk)->num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_seticmpfilter(sk, level, optname, optval, - optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM) - break; - default: - return ipv6_setsockopt(sk, level, optname, optval, - optlen); + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_setsockopt(sk, level, optname, optval, optlen); } return do_rawv6_setsockopt(sk, level, optname, optval, optlen); @@ -1036,13 +1021,13 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_COMPAT static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: - if (inet_sk(sk)->num != IPPROTO_ICMPV6) + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: @@ -1067,6 +1052,11 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case IPV6_CHECKSUM: + /* + * We allow getsockopt() for IPPROTO_IPV6-level + * IPV6_CHECKSUM socket option on ICMPv6 sockets + * since RFC3542 is silent about it. + */ if (rp->checksum == 0) val = -1; else @@ -1089,21 +1079,19 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, static int rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - switch(level) { - case SOL_RAW: - break; + switch (level) { + case SOL_RAW: + break; - case SOL_ICMPV6: - if (inet_sk(sk)->num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_geticmpfilter(sk, level, optname, optval, - optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM) - break; - default: - return ipv6_getsockopt(sk, level, optname, optval, - optlen); + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, optlen); } return do_rawv6_getsockopt(sk, level, optname, optval, optlen); @@ -1117,7 +1105,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, case SOL_RAW: break; case SOL_ICMPV6: - if (inet_sk(sk)->num != IPPROTO_ICMPV6) + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: @@ -1133,43 +1121,73 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) { - switch(cmd) { - case SIOCOUTQ: - { - int amount = atomic_read(&sk->sk_wmem_alloc); - return put_user(amount, (int __user *)arg); - } - case SIOCINQ: - { - struct sk_buff *skb; - int amount = 0; - - spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) - amount = skb->tail - skb->transport_header; - spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); - } + switch (cmd) { + case SIOCOUTQ: { + int amount = sk_wmem_alloc_get(sk); - default: - return -ENOIOCTLCMD; + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb_tail_pointer(skb) - + skb_transport_header(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: +#ifdef CONFIG_IPV6_MROUTE + return ip6mr_ioctl(sk, cmd, (void __user *)arg); +#else + return -ENOIOCTLCMD; +#endif } } -static void rawv6_close(struct sock *sk, long timeout) +#ifdef CONFIG_COMPAT +static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { - if (inet_sk(sk)->num == IPPROTO_RAW) - ip6_ra_control(sk, -1, NULL); + switch (cmd) { + case SIOCOUTQ: + case SIOCINQ: + return -ENOIOCTLCMD; + default: +#ifdef CONFIG_IPV6_MROUTE + return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); +#else + return -ENOIOCTLCMD; +#endif + } +} +#endif +static void rawv6_close(struct sock *sk, long timeout) +{ + if (inet_sk(sk)->inet_num == IPPROTO_RAW) + ip6_ra_control(sk, -1); + ip6mr_sk_done(sk); sk_common_release(sk); } +static void raw6_destroy(struct sock *sk) +{ + lock_sock(sk); + ip6_flush_pending_frames(sk); + release_sock(sk); + + inet6_destroy_sock(sk); +} + static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); - switch (inet_sk(sk)->num) { + switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; rp->offset = 2; @@ -1181,75 +1199,46 @@ static int rawv6_init_sk(struct sock *sk) default: break; } - return(0); + return 0; } -DEFINE_PROTO_INUSE(rawv6) - struct proto rawv6_prot = { .name = "RAWv6", .owner = THIS_MODULE, .close = rawv6_close, - .connect = ip6_datagram_connect, + .destroy = raw6_destroy, + .connect = ip6_datagram_connect_v6_only, .disconnect = udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, - .destroy = inet6_destroy_sock, .setsockopt = rawv6_setsockopt, .getsockopt = rawv6_getsockopt, .sendmsg = rawv6_sendmsg, .recvmsg = rawv6_recvmsg, .bind = rawv6_bind, .backlog_rcv = rawv6_rcv_skb, - .hash = raw_v6_hash, - .unhash = raw_v6_unhash, + .hash = raw_hash_sk, + .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), + .h.raw_hash = &raw_v6_hashinfo, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_rawv6_setsockopt, .compat_getsockopt = compat_rawv6_getsockopt, + .compat_ioctl = compat_rawv6_ioctl, #endif - REF_PROTO_INUSE(rawv6) }; #ifdef CONFIG_PROC_FS -static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) -{ - struct ipv6_pinfo *np = inet6_sk(sp); - struct in6_addr *dest, *src; - __u16 destp, srcp; - - dest = &np->daddr; - src = &np->rcv_saddr; - destp = 0; - srcp = inet_sk(sp)->num; - seq_printf(seq, - "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", - i, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], srcp, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - atomic_read(&sp->sk_wmem_alloc), - atomic_read(&sp->sk_rmem_alloc), - 0, 0L, 0, - sock_i_uid(sp), 0, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); -} - static int raw6_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) - seq_printf(seq, - " sl " - "local_address " - "remote_address " - "st tx_queue rx_queue tr tm->when retrnsmt" - " uid timeout inode drops\n"); - else - raw6_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + struct sock *sp = v; + __u16 srcp = inet_sk(sp)->inet_num; + ip6_dgram_sock_seq_show(seq, v, srcp, 0, + raw_seq_private(seq)->bucket); + } return 0; } @@ -1273,17 +1262,17 @@ static const struct file_operations raw6_seq_fops = { .release = seq_release_net, }; -static int raw6_init_net(struct net *net) +static int __net_init raw6_init_net(struct net *net) { - if (!proc_net_fops_create(net, "raw6", S_IRUGO, &raw6_seq_fops)) + if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops)) return -ENOMEM; return 0; } -static void raw6_exit_net(struct net *net) +static void __net_exit raw6_exit_net(struct net *net) { - proc_net_remove(net, "raw6"); + remove_proc_entry("raw6", net->proc_net); } static struct pernet_operations raw6_net_ops = { @@ -1333,8 +1322,6 @@ static struct inet_protosw rawv6_protosw = { .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, - .capability = CAP_NET_RAW, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index f936d045a39..cc85a9ba501 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -5,8 +5,6 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: reassembly.c,v 1.26 2001/03/07 22:00:57 davem Exp $ - * * Based on: net/ipv4/ip_fragment.c * * This program is free software; you can redistribute it and/or @@ -28,6 +26,9 @@ * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ + +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> @@ -43,6 +44,8 @@ #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/sock.h> #include <net/snmp.h> @@ -55,6 +58,7 @@ #include <net/ndisc.h> #include <net/addrconf.h> #include <net/inet_frag.h> +#include <net/inet_ecn.h> struct ip6frag_skb_cb { @@ -64,35 +68,12 @@ struct ip6frag_skb_cb #define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) - -/* - * Equivalent of ipv4 struct ipq - */ - -struct frag_queue -{ - struct inet_frag_queue q; - - __be32 id; /* fragment id */ - struct in6_addr saddr; - struct in6_addr daddr; - - int iif; - unsigned int csum; - __u16 nhoffset; -}; - -static struct inet_frags ip6_frags; - -int ip6_frag_nqueues(struct net *net) +static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { - return net->ipv6.frags.nqueues; + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -int ip6_frag_mem(struct net *net) -{ - return atomic_read(&net->ipv6.frags.mem); -} +static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); @@ -101,29 +82,14 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, * callers should be careful not to use the hash value outside the ipfrag_lock * as doing so could race with ipfrag_hash_rnd being recalculated. */ -static unsigned int ip6qhashfn(__be32 id, struct in6_addr *saddr, - struct in6_addr *daddr) +static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, + const struct in6_addr *daddr) { - u32 a, b, c; - - a = (__force u32)saddr->s6_addr32[0]; - b = (__force u32)saddr->s6_addr32[1]; - c = (__force u32)saddr->s6_addr32[2]; - - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += ip6_frags.rnd; - __jhash_mix(a, b, c); - - a += (__force u32)saddr->s6_addr32[3]; - b += (__force u32)daddr->s6_addr32[0]; - c += (__force u32)daddr->s6_addr32[1]; - __jhash_mix(a, b, c); + u32 c; - a += (__force u32)daddr->s6_addr32[2]; - b += (__force u32)daddr->s6_addr32[3]; - c += (__force u32)id; - __jhash_mix(a, b, c); + net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); + c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, ip6_frags.rnd); return c & (INETFRAGS_HASHSZ - 1); } @@ -133,92 +99,58 @@ static unsigned int ip6_hashfn(struct inet_frag_queue *q) struct frag_queue *fq; fq = container_of(q, struct frag_queue, q); - return ip6qhashfn(fq->id, &fq->saddr, &fq->daddr); + return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); } -int ip6_frag_match(struct inet_frag_queue *q, void *a) +bool ip6_frag_match(struct inet_frag_queue *q, void *a) { struct frag_queue *fq; struct ip6_create_arg *arg = a; fq = container_of(q, struct frag_queue, q); - return (fq->id == arg->id && - ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst)); + return fq->id == arg->id && + fq->user == arg->user && + ipv6_addr_equal(&fq->saddr, arg->src) && + ipv6_addr_equal(&fq->daddr, arg->dst); } EXPORT_SYMBOL(ip6_frag_match); -/* Memory Tracking Functions. */ -static inline void frag_kfree_skb(struct netns_frags *nf, - struct sk_buff *skb, int *work) -{ - if (work) - *work -= skb->truesize; - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); -} - void ip6_frag_init(struct inet_frag_queue *q, void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); struct ip6_create_arg *arg = a; fq->id = arg->id; - ipv6_addr_copy(&fq->saddr, arg->src); - ipv6_addr_copy(&fq->daddr, arg->dst); + fq->user = arg->user; + fq->saddr = *arg->src; + fq->daddr = *arg->dst; + fq->ecn = arg->ecn; } EXPORT_SYMBOL(ip6_frag_init); -/* Destruction primitives. */ - -static __inline__ void fq_put(struct frag_queue *fq) -{ - inet_frag_put(&fq->q, &ip6_frags); -} - -/* Kill fq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static __inline__ void fq_kill(struct frag_queue *fq) -{ - inet_frag_kill(&fq->q, &ip6_frags); -} - -static void ip6_evictor(struct net *net, struct inet6_dev *idev) -{ - int evicted; - - evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags); - if (evicted) - IP6_ADD_STATS_BH(idev, IPSTATS_MIB_REASMFAILS, evicted); -} - -static void ip6_frag_expire(unsigned long data) +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, + struct inet_frags *frags) { - struct frag_queue *fq; struct net_device *dev = NULL; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); - spin_lock(&fq->q.lock); - if (fq->q.last_in & COMPLETE) + if (fq->q.last_in & INET_FRAG_COMPLETE) goto out; - fq_kill(fq); + inet_frag_kill(&fq->q, frags); - dev = dev_get_by_index(&init_net, fq->iif); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) - goto out; + goto out_rcu_unlock; - rcu_read_lock(); - IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); - IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - rcu_read_unlock(); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); /* Don't send error if the first segment did not arrive. */ - if (!(fq->q.last_in&FIRST_IN) || !fq->q.fragments) - goto out; + if (!(fq->q.last_in & INET_FRAG_FIRST_IN) || !fq->q.fragments) + goto out_rcu_unlock; /* But use as source device on which LAST ARRIVED @@ -226,36 +158,49 @@ static void ip6_frag_expire(unsigned long data) pointer directly, device might already disappeared. */ fq->q.fragments->dev = dev; - icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, dev); + icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); +out_rcu_unlock: + rcu_read_unlock(); out: - if (dev) - dev_put(dev); spin_unlock(&fq->q.lock); - fq_put(fq); + inet_frag_put(&fq->q, frags); +} +EXPORT_SYMBOL(ip6_expire_frag_queue); + +static void ip6_frag_expire(unsigned long data) +{ + struct frag_queue *fq; + struct net *net; + + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + net = container_of(fq->q.net, struct net, ipv6.frags); + + ip6_expire_frag_queue(net, fq, &ip6_frags); } static __inline__ struct frag_queue * -fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst, - struct inet6_dev *idev) +fq_find(struct net *net, __be32 id, const struct in6_addr *src, + const struct in6_addr *dst, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; unsigned int hash; arg.id = id; + arg.user = IP6_DEFRAG_LOCAL_DELIVER; arg.src = src; arg.dst = dst; - hash = ip6qhashfn(id, src, dst); + arg.ecn = ecn; - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); - if (q == NULL) - goto oom; + read_lock(&ip6_frags.lock); + hash = inet6_hash_frag(id, src, dst); + q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct frag_queue, q); - -oom: - IP6_INC_STATS_BH(idev, IPSTATS_MIB_REASMFAILS); - return NULL; } static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, @@ -264,8 +209,10 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev, *next; struct net_device *dev; int offset, end; + struct net *net = dev_net(skb_dst(skb)->dev); + u8 ecn; - if (fq->q.last_in & COMPLETE) + if (fq->q.last_in & INET_FRAG_COMPLETE) goto err; offset = ntohs(fhdr->frag_off) & ~0x7; @@ -273,7 +220,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((u8 *)&fhdr->frag_off - @@ -281,6 +228,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, return -1; } + ecn = ip6_frag_ecn(ipv6_hdr(skb)); + if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, @@ -294,9 +243,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * or have different end, the segment is corrupted. */ if (end < fq->q.len || - ((fq->q.last_in & LAST_IN) && end != fq->q.len)) + ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; - fq->q.last_in |= LAST_IN; + fq->q.last_in |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. @@ -306,7 +255,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. -DaveM */ - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, payload_len)); @@ -314,7 +263,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ - if (fq->q.last_in & LAST_IN) + if (fq->q.last_in & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } @@ -334,6 +283,11 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * in the chain of fragments so far. We must know where to put * this fragment, right? */ + prev = fq->q.fragments_tail; + if (!prev || FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } prev = NULL; for(next = fq->q.fragments; next != NULL; next = next->next) { if (FRAG6_CB(next)->offset >= offset) @@ -341,63 +295,29 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, prev = next; } - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. +found: + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. */ - if (prev) { - int i = (FRAG6_CB(prev)->offset + prev->len) - offset; - - if (i > 0) { - offset += i; - if (end <= offset) - goto err; - if (!pskb_pull(skb, i)) - goto err; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } - - /* Look for overlap with succeeding segments. - * If we can merge fragments, do it. - */ - while (next && FRAG6_CB(next)->offset < end) { - int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */ - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - if (!pskb_pull(next, i)) - goto err; - FRAG6_CB(next)->offset += i; /* next fragment */ - fq->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragment is completely overridden with - * new one drop it. - */ - next = next->next; - - if (prev) - prev->next = next; - else - fq->q.fragments = next; + /* Check for overlap with preceding fragment. */ + if (prev && + (FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; - fq->q.meat -= free_it->len; - frag_kfree_skb(fq->q.net, free_it, NULL); - } - } + /* Look for overlap with succeeding segment. */ + if (next && FRAG6_CB(next)->offset < end) + goto discard_fq; FRAG6_CB(skb)->offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; + if (!next) + fq->q.fragments_tail = skb; if (prev) prev->next = skb; else @@ -410,26 +330,37 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &fq->q.net->mem); + fq->ecn |= ecn; + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; - fq->q.last_in |= FIRST_IN; + fq->q.last_in |= INET_FRAG_FIRST_IN; } - if (fq->q.last_in == (FIRST_IN | LAST_IN) && fq->q.meat == fq->q.len) - return ip6_frag_reasm(fq, prev, dev); + if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { + int res; + unsigned long orefdst = skb->_skb_refdst; - write_lock(&ip6_frags.lock); - list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); - write_unlock(&ip6_frags.lock); + skb->_skb_refdst = 0UL; + res = ip6_frag_reasm(fq, prev, dev); + skb->_skb_refdst = orefdst; + return res; + } + + skb_dst_drop(skb); + inet_frag_lru_move(&fq->q); return -1; +discard_fq: + inet_frag_kill(&fq->q, &ip6_frags); err: - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; } @@ -446,11 +377,18 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { + struct net *net = container_of(fq->q.net, struct net, ipv6.frags); struct sk_buff *fp, *head = fq->q.fragments; int payload_len; unsigned int nhoff; + int sum_truesize; + u8 ecn; + + inet_frag_kill(&fq->q, &ip6_frags); - fq_kill(fq); + ecn = ip_frag_ecn_table[fq->ecn]; + if (unlikely(ecn == 0xff)) + goto out_fail; /* Make the one we just received the head. */ if (prev) { @@ -461,17 +399,19 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, goto out_oom; fp->next = head->next; + if (!fp->next) + fq->q.fragments_tail = fp; prev->next = fp; skb_morph(head, fq->q.fragments); head->next = fq->q.fragments->next; - kfree_skb(fq->q.fragments); + consume_skb(fq->q.fragments); fq->q.fragments = head; } - BUG_TRAP(head != NULL); - BUG_TRAP(FRAG6_CB(head)->offset == 0); + WARN_ON(head == NULL); + WARN_ON(FRAG6_CB(head)->offset != 0); /* Unfragmented part is taken from the first segment. */ payload_len = ((head->data - skb_network_header(head)) - @@ -481,13 +421,13 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_shinfo(head)->frag_list) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; @@ -496,15 +436,15 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, clone->next = head->next; head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_shinfo(head)->frag_list = NULL; - for (i=0; i<skb_shinfo(head)->nr_frags; i++) - plen += skb_shinfo(head)->frags[i].size; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -516,27 +456,41 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->mac_header += sizeof(struct frag_hdr); head->network_header += sizeof(struct frag_hdr); - skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &fq->q.net->mem); - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; + sum_truesize = head->truesize; + for (fp = head->next; fp;) { + bool headstolen; + int delta; + struct sk_buff *next = fp->next; + + sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - atomic_sub(fp->truesize, &fq->q.net->mem); + + if (skb_try_coalesce(head, fp, &headstolen, &delta)) { + kfree_skb_partial(fp, headstolen); + } else { + if (!skb_shinfo(head)->frag_list) + skb_shinfo(head)->frag_list = fp; + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + } + fp = next; } + sub_frag_mem_limit(&fq->q, sum_truesize); head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); IP6CB(head)->nhoff = nhoff; + IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) @@ -545,21 +499,20 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->csum); rcu_read_lock(); - IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMOKS); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; return 1; out_oversize: - if (net_ratelimit()) - printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len); + net_dbg_ratelimited("ip6_frag_reasm: payload len = %d\n", payload_len); goto out_fail; out_oom: - if (net_ratelimit()) - printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); + net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); - IP6_INC_STATS_BH(__in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); return -1; } @@ -568,25 +521,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb) { struct frag_hdr *fhdr; struct frag_queue *fq; - struct ipv6hdr *hdr = ipv6_hdr(skb); - struct net *net; + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net = dev_net(skb_dst(skb)->dev); + int evicted; + + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) + goto fail_hdr; - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMREQDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ - if (hdr->payload_len==0) { - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - skb_network_header_len(skb)); - return -1; - } + if (hdr->payload_len==0) + goto fail_hdr; + if (!pskb_may_pull(skb, (skb_transport_offset(skb) + - sizeof(struct frag_hdr)))) { - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - skb_network_header_len(skb)); - return -1; - } + sizeof(struct frag_hdr)))) + goto fail_hdr; hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); @@ -594,18 +544,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb) if (!(fhdr->frag_off & htons(0xFFF9))) { /* It is not a fragmented frame */ skb->transport_header += sizeof(struct frag_hdr); - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMOKS); + IP6_INC_STATS_BH(net, + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; return 1; } - net = skb->dev->nd_net; - if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh) - ip6_evictor(net, ip6_dst_idev(skb->dst)); + evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false); + if (evicted) + IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS, evicted); - if ((fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - ip6_dst_idev(skb->dst))) != NULL) { + fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, + ip6_frag_ecn(hdr)); + if (fq != NULL) { int ret; spin_lock(&fq->q.lock); @@ -613,78 +567,84 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); - fq_put(fq); + inet_frag_put(&fq->q, &ip6_frags); return ret; } - IP6_INC_STATS_BH(ip6_dst_idev(skb->dst), IPSTATS_MIB_REASMFAILS); + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; + +fail_hdr: + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); + return -1; } -static struct inet6_protocol frag_protocol = +static const struct inet6_protocol frag_protocol = { .handler = ipv6_frag_rcv, .flags = INET6_PROTO_NOPOLICY, }; #ifdef CONFIG_SYSCTL -static struct ctl_table ip6_frags_ctl_table[] = { +static struct ctl_table ip6_frags_ns_ctl_table[] = { { - .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH, .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH, .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV6_IP6FRAG_TIME, .procname = "ip6frag_time", .data = &init_net.ipv6.frags.timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, + { } +}; + +static struct ctl_table ip6_frags_ctl_table[] = { { - .ctl_name = NET_IPV6_IP6FRAG_SECRET_INTERVAL, .procname = "ip6frag_secret_interval", .data = &ip6_frags.secret_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies + .proc_handler = proc_dointvec_jiffies, }, { } }; -static int ip6_frags_sysctl_register(struct net *net) +static int __net_init ip6_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; - table = ip6_frags_ctl_table; - if (net != &init_net) { - table = kmemdup(table, sizeof(ip6_frags_ctl_table), GFP_KERNEL); + table = ip6_frags_ns_ctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; table[0].data = &net->ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; table[2].data = &net->ipv6.frags.timeout; - table[3].mode &= ~0222; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; } - hdr = register_net_sysctl_table(net, net_ipv6_ctl_path, table); + hdr = register_net_sysctl(net, "net/ipv6", table); if (hdr == NULL) goto err_reg; @@ -692,45 +652,69 @@ static int ip6_frags_sysctl_register(struct net *net) return 0; err_reg: - if (net != &init_net) + if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } -static void ip6_frags_sysctl_unregister(struct net *net) +static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); - kfree(table); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct ctl_table_header *ip6_ctl_header; + +static int ip6_frags_sysctl_register(void) +{ + ip6_ctl_header = register_net_sysctl(&init_net, "net/ipv6", + ip6_frags_ctl_table); + return ip6_ctl_header == NULL ? -ENOMEM : 0; +} + +static void ip6_frags_sysctl_unregister(void) +{ + unregister_net_sysctl_table(ip6_ctl_header); } #else -static inline int ip6_frags_sysctl_register(struct net *net) +static inline int ip6_frags_ns_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void ip6_frags_ns_sysctl_unregister(struct net *net) +{ +} + +static inline int ip6_frags_sysctl_register(void) { return 0; } -static inline void ip6_frags_sysctl_unregister(struct net *net) +static inline void ip6_frags_sysctl_unregister(void) { } #endif -static int ipv6_frags_init_net(struct net *net) +static int __net_init ipv6_frags_init_net(struct net *net) { - net->ipv6.frags.high_thresh = 256 * 1024; - net->ipv6.frags.low_thresh = 192 * 1024; + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; inet_frags_init_net(&net->ipv6.frags); - return ip6_frags_sysctl_register(net); + return ip6_frags_ns_sysctl_register(net); } -static void ipv6_frags_exit_net(struct net *net) +static void __net_exit ipv6_frags_exit_net(struct net *net) { - ip6_frags_sysctl_unregister(net); + ip6_frags_ns_sysctl_unregister(net); inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); } @@ -747,7 +731,13 @@ int __init ipv6_frag_init(void) if (ret) goto out; - register_pernet_subsys(&ip6_frags_ops); + ret = ip6_frags_sysctl_register(); + if (ret) + goto err_sysctl; + + ret = register_pernet_subsys(&ip6_frags_ops); + if (ret) + goto err_pernet; ip6_frags.hashfn = ip6_hashfn; ip6_frags.constructor = ip6_frag_init; @@ -760,11 +750,18 @@ int __init ipv6_frag_init(void) inet_frags_init(&ip6_frags); out: return ret; + +err_pernet: + ip6_frags_sysctl_unregister(); +err_sysctl: + inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + goto out; } void ipv6_frag_exit(void) { inet_frags_fini(&ip6_frags); + ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e8b241cb60b..f23fbd28a50 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,8 +5,6 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -26,8 +24,11 @@ * Fixed routing subtrees. */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/capability.h> #include <linux/errno.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/times.h> #include <linux/socket.h> @@ -36,10 +37,13 @@ #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> +#include <linux/mroute6.h> #include <linux/init.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> @@ -53,6 +57,7 @@ #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> +#include <net/nexthop.h> #include <asm/uaccess.h> @@ -60,21 +65,18 @@ #include <linux/sysctl.h> #endif -/* Set to 3 to get tracing. */ -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RDBG(x) printk x -#define RT6_TRACE(x...) printk(KERN_DEBUG x) -#else -#define RDBG(x) -#define RT6_TRACE(x...) do { ; } while (0) -#endif - -#define CLONE_OFFLINK_ROUTE 0 +enum rt6_nud_state { + RT6_NUD_FAIL_HARD = -3, + RT6_NUD_FAIL_PROBE = -2, + RT6_NUD_FAIL_DO_RR = -1, + RT6_NUD_SUCCEED = 1 +}; -static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, + const struct in6_addr *dest); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +static unsigned int ip6_default_advmss(const struct dst_entry *dst); +static unsigned int ip6_mtu(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, @@ -82,106 +84,216 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_prohibit(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); -static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex, - unsigned pref); -static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex); +static struct rt6_info *rt6_add_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned int pref); +static struct rt6_info *rt6_get_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex); #endif -static struct dst_ops ip6_dst_ops = { +static void rt6_bind_peer(struct rt6_info *rt, int create) +{ + struct inet_peer_base *base; + struct inet_peer *peer; + + base = inetpeer_base_ptr(rt->_rt6i_peer); + if (!base) + return; + + peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); + if (peer) { + if (!rt6_set_peer(rt, peer)) + inet_putpeer(peer); + } +} + +static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) +{ + if (rt6_has_peer(rt)) + return rt6_peer_ptr(rt); + + rt6_bind_peer(rt, create); + return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL); +} + +static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) +{ + return __rt6_get_peer(rt, 1); +} + +static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!(rt->dst.flags & DST_HOST)) + return NULL; + + peer = rt6_get_peer_create(rt); + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer) || + (old & DST_METRICS_FORCE_OVERWRITE)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } + } + return p; +} + +static inline const void *choose_neigh_daddr(struct rt6_info *rt, + struct sk_buff *skb, + const void *daddr) +{ + struct in6_addr *p = &rt->rt6i_gateway; + + if (!ipv6_addr_any(p)) + return (const void *) p; + else if (skb) + return &ipv6_hdr(skb)->daddr; + return daddr; +} + +static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + struct neighbour *n; + + daddr = choose_neigh_daddr(rt, skb, daddr); + n = __ipv6_neigh_lookup(dst->dev, daddr); + if (n) + return n; + return neigh_create(&nd_tbl, daddr, dst->dev); +} + +static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, - .protocol = __constant_htons(ETH_P_IPV6), + .protocol = cpu_to_be16(ETH_P_IPV6), .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, + .default_advmss = ip6_default_advmss, + .mtu = ip6_mtu, + .cow_metrics = ipv6_cow_metrics, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, - .local_out = ip6_local_out, - .entry_size = sizeof(struct rt6_info), - .entries = ATOMIC_INIT(0), + .redirect = rt6_do_redirect, + .local_out = __ip6_local_out, + .neigh_lookup = ip6_neigh_lookup, }; -static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst->dev->mtu; +} + +static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { } +static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; +} + static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, - .protocol = __constant_htons(ETH_P_IPV6), + .protocol = cpu_to_be16(ETH_P_IPV6), .destroy = ip6_dst_destroy, .check = ip6_dst_check, + .mtu = ip6_blackhole_mtu, + .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, - .entry_size = sizeof(struct rt6_info), - .entries = ATOMIC_INIT(0), + .redirect = ip6_rt_blackhole_redirect, + .cow_metrics = ip6_rt_blackhole_cow_metrics, + .neigh_lookup = ip6_neigh_lookup, }; -struct rt6_info ip6_null_entry = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -ENETUNREACH, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = ip6_pkt_discard, - .output = ip6_pkt_discard_out, - .ops = &ip6_dst_ops, - .path = (struct dst_entry*)&ip6_null_entry, - } +static const u32 ip6_template_metrics[RTAX_MAX] = { + [RTAX_HOPLIMIT - 1] = 0, +}; + +static const struct rt6_info ip6_null_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -ENETUNREACH, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES -static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sk_buff *skb); - -struct rt6_info ip6_prohibit_entry = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -EACCES, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = ip6_pkt_prohibit, - .output = ip6_pkt_prohibit_out, - .ops = &ip6_dst_ops, - .path = (struct dst_entry*)&ip6_prohibit_entry, - } +static const struct rt6_info ip6_prohibit_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EACCES, + .input = ip6_pkt_prohibit, + .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), }; -struct rt6_info ip6_blk_hole_entry = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -EINVAL, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = dst_discard, - .output = dst_discard, - .ops = &ip6_dst_ops, - .path = (struct dst_entry*)&ip6_blk_hole_entry, - } +static const struct rt6_info ip6_blk_hole_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EINVAL, + .input = dst_discard, + .output = dst_discard_sk, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, .rt6i_metric = ~(u32) 0, .rt6i_ref = ATOMIC_INIT(1), }; @@ -189,20 +301,46 @@ struct rt6_info ip6_blk_hole_entry = { #endif /* allocate dst with ip6_dst_ops */ -static __inline__ struct rt6_info *ip6_dst_alloc(void) +static inline struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) { - return (struct rt6_info *)dst_alloc(&ip6_dst_ops); + struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, + 0, DST_OBSOLETE_FORCE_CHK, flags); + + if (rt) { + struct dst_entry *dst = &rt->dst; + + memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); + rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); + rt->rt6i_genid = rt_genid_ipv6(net); + INIT_LIST_HEAD(&rt->rt6i_siblings); + } + return rt; } static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; + struct dst_entry *from = dst->from; - if (idev != NULL) { + if (!(rt->dst.flags & DST_HOST)) + dst_destroy_metrics_generic(dst); + + if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); } + + dst->from = NULL; + dst_release(from); + + if (rt6_has_peer(rt)) { + struct inet_peer *peer = rt6_peer_ptr(rt); + inet_putpeer(peer); + } } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -211,50 +349,116 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; struct net_device *loopback_dev = - dev->nd_net->loopback_dev; - - if (dev != loopback_dev && idev != NULL && idev->dev == dev) { - struct inet6_dev *loopback_idev = - in6_dev_get(loopback_dev); - if (loopback_idev != NULL) { - rt->rt6i_idev = loopback_idev; - in6_dev_put(idev); + dev_net(dev)->loopback_dev; + + if (dev != loopback_dev) { + if (idev && idev->dev == dev) { + struct inet6_dev *loopback_idev = + in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); + } } } } -static __inline__ int rt6_check_expired(const struct rt6_info *rt) +static bool rt6_check_expired(const struct rt6_info *rt) { - return (rt->rt6i_flags & RTF_EXPIRES && - time_after(jiffies, rt->rt6i_expires)); + if (rt->rt6i_flags & RTF_EXPIRES) { + if (time_after(jiffies, rt->dst.expires)) + return true; + } else if (rt->dst.from) { + return rt6_check_expired((struct rt6_info *) rt->dst.from); + } + return false; } -static inline int rt6_need_strict(struct in6_addr *daddr) +/* Multipath route selection: + * Hash based function using packet header and flowlabel. + * Adapted from fib_info_hashfn() + */ +static int rt6_info_hash_nhsfn(unsigned int candidate_count, + const struct flowi6 *fl6) { - return (ipv6_addr_type(daddr) & - (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); + unsigned int val = fl6->flowi6_proto; + + val ^= ipv6_addr_hash(&fl6->daddr); + val ^= ipv6_addr_hash(&fl6->saddr); + + /* Work only if this not encapsulated */ + switch (fl6->flowi6_proto) { + case IPPROTO_UDP: + case IPPROTO_TCP: + case IPPROTO_SCTP: + val ^= (__force u16)fl6->fl6_sport; + val ^= (__force u16)fl6->fl6_dport; + break; + + case IPPROTO_ICMPV6: + val ^= (__force u16)fl6->fl6_icmp_type; + val ^= (__force u16)fl6->fl6_icmp_code; + break; + } + /* RFC6438 recommands to use flowlabel */ + val ^= (__force u32)fl6->flowlabel; + + /* Perhaps, we need to tune, this function? */ + val = val ^ (val >> 7) ^ (val >> 12); + return val % candidate_count; +} + +static struct rt6_info *rt6_multipath_select(struct rt6_info *match, + struct flowi6 *fl6, int oif, + int strict) +{ + struct rt6_info *sibling, *next_sibling; + int route_choosen; + + route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6); + /* Don't change the route, if route_choosen == 0 + * (siblings does not include ourself) + */ + if (route_choosen) + list_for_each_entry_safe(sibling, next_sibling, + &match->rt6i_siblings, rt6i_siblings) { + route_choosen--; + if (route_choosen == 0) { + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } + } + return match; } /* * Route lookup. Any table->tb6_lock is implied. */ -static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, +static inline struct rt6_info *rt6_device_match(struct net *net, + struct rt6_info *rt, + const struct in6_addr *saddr, int oif, - int strict) + int flags) { struct rt6_info *local = NULL; struct rt6_info *sprt; - if (oif) { - for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) { - struct net_device *dev = sprt->rt6i_dev; + if (!oif && ipv6_addr_any(saddr)) + goto out; + + for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + struct net_device *dev = sprt->dst.dev; + + if (oif) { if (dev->ifindex == oif) return sprt; if (dev->flags & IFF_LOOPBACK) { - if (sprt->rt6i_idev == NULL || + if (!sprt->rt6i_idev || sprt->rt6i_idev->dev->ifindex != oif) { - if (strict && oif) + if (flags & RT6_LOOKUP_F_IFACE && oif) continue; if (local && (!oif || local->rt6i_idev->dev->ifindex == oif)) @@ -262,21 +466,46 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, } local = sprt; } + } else { + if (ipv6_chk_addr(net, saddr, dev, + flags & RT6_LOOKUP_F_IFACE)) + return sprt; } + } + if (oif) { if (local) return local; - if (strict) - return &ip6_null_entry; + if (flags & RT6_LOOKUP_F_IFACE) + return net->ipv6.ip6_null_entry; } +out: return rt; } #ifdef CONFIG_IPV6_ROUTER_PREF +struct __rt6_probe_work { + struct work_struct work; + struct in6_addr target; + struct net_device *dev; +}; + +static void rt6_probe_deferred(struct work_struct *w) +{ + struct in6_addr mcaddr; + struct __rt6_probe_work *work = + container_of(w, struct __rt6_probe_work, work); + + addrconf_addr_solict_mult(&work->target, &mcaddr); + ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL); + dev_put(work->dev); + kfree(w); +} + static void rt6_probe(struct rt6_info *rt) { - struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL; + struct neighbour *neigh; /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it @@ -285,27 +514,44 @@ static void rt6_probe(struct rt6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ - if (!neigh || (neigh->nud_state & NUD_VALID)) + if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) return; - read_lock_bh(&neigh->lock); - if (!(neigh->nud_state & NUD_VALID) && + rcu_read_lock_bh(); + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) { + write_lock(&neigh->lock); + if (neigh->nud_state & NUD_VALID) + goto out; + } + + if (!neigh || time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { - struct in6_addr mcaddr; - struct in6_addr *target; + struct __rt6_probe_work *work; - neigh->updated = jiffies; - read_unlock_bh(&neigh->lock); + work = kmalloc(sizeof(*work), GFP_ATOMIC); - target = (struct in6_addr *)&neigh->primary_key; - addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL); - } else - read_unlock_bh(&neigh->lock); + if (neigh && work) + __neigh_set_probe_once(neigh); + + if (neigh) + write_unlock(&neigh->lock); + + if (work) { + INIT_WORK(&work->work, rt6_probe_deferred); + work->target = rt->rt6i_gateway; + dev_hold(rt->dst.dev); + work->dev = rt->dst.dev; + schedule_work(&work->work); + } + } else { +out: + write_unlock(&neigh->lock); + } + rcu_read_unlock_bh(); } #else static inline void rt6_probe(struct rt6_info *rt) { - return; } #endif @@ -314,7 +560,7 @@ static inline void rt6_probe(struct rt6_info *rt) */ static inline int rt6_check_dev(struct rt6_info *rt, int oif) { - struct net_device *dev = rt->rt6i_dev; + struct net_device *dev = rt->dst.dev; if (!oif || dev->ifindex == oif) return 2; if ((dev->flags & IFF_LOOPBACK) && @@ -323,85 +569,102 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) return 0; } -static inline int rt6_check_neigh(struct rt6_info *rt) +static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) { - struct neighbour *neigh = rt->rt6i_nexthop; - int m; + struct neighbour *neigh; + enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; + if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) - m = 1; - else if (neigh) { - read_lock_bh(&neigh->lock); + return RT6_NUD_SUCCEED; + + rcu_read_lock_bh(); + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) { + read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) - m = 2; + ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF - else if (neigh->nud_state & NUD_FAILED) - m = 0; -#endif + else if (!(neigh->nud_state & NUD_FAILED)) + ret = RT6_NUD_SUCCEED; else - m = 1; - read_unlock_bh(&neigh->lock); - } else - m = 0; - return m; + ret = RT6_NUD_FAIL_PROBE; +#endif + read_unlock(&neigh->lock); + } else { + ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? + RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; + } + rcu_read_unlock_bh(); + + return ret; } static int rt6_score_route(struct rt6_info *rt, int oif, int strict) { - int m, n; + int m; m = rt6_check_dev(rt, oif); if (!m && (strict & RT6_LOOKUP_F_IFACE)) - return -1; + return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; #endif - n = rt6_check_neigh(rt); - if (!n && (strict & RT6_LOOKUP_F_REACHABLE)) - return -1; + if (strict & RT6_LOOKUP_F_REACHABLE) { + int n = rt6_check_neigh(rt); + if (n < 0) + return n; + } return m; } static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, - int *mpri, struct rt6_info *match) + int *mpri, struct rt6_info *match, + bool *do_rr) { int m; + bool match_do_rr = false; if (rt6_check_expired(rt)) goto out; m = rt6_score_route(rt, oif, strict); - if (m < 0) + if (m == RT6_NUD_FAIL_DO_RR) { + match_do_rr = true; + m = 0; /* lowest valid score */ + } else if (m == RT6_NUD_FAIL_HARD) { goto out; + } + + if (strict & RT6_LOOKUP_F_REACHABLE) + rt6_probe(rt); + /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { - if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(match); + *do_rr = match_do_rr; *mpri = m; match = rt; - } else if (strict & RT6_LOOKUP_F_REACHABLE) { - rt6_probe(rt); } - out: return match; } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, struct rt6_info *rr_head, - u32 metric, int oif, int strict) + u32 metric, int oif, int strict, + bool *do_rr) { struct rt6_info *rt, *match; int mpri = -1; match = NULL; for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->u.dst.rt6_next) - match = find_match(rt, oif, strict, &mpri, match); + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match, do_rr); for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->u.dst.rt6_next) - match = find_match(rt, oif, strict, &mpri, match); + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; } @@ -409,19 +672,18 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) { struct rt6_info *match, *rt0; - - RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n", - __FUNCTION__, fn->leaf, oif); + struct net *net; + bool do_rr = false; rt0 = fn->rr_ptr; if (!rt0) fn->rr_ptr = rt0 = fn->leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); + match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + &do_rr); - if (!match && - (strict & RT6_LOOKUP_F_REACHABLE)) { - struct rt6_info *next = rt0->u.dst.rt6_next; + if (do_rr) { + struct rt6_info *next = rt0->dst.rt6_next; /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -431,20 +693,19 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) fn->rr_ptr = next; } - RT6_TRACE("%s() => %p\n", - __FUNCTION__, match); - - return (match ? match : &ip6_null_entry); + net = dev_net(rt0->dst.dev); + return match ? match : net->ipv6.ip6_null_entry; } #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, - struct in6_addr *gwaddr) + const struct in6_addr *gwaddr) { + struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; unsigned int pref; - u32 lifetime; + unsigned long lifetime; struct rt6_info *rt; if (len < sizeof(struct route_info)) { @@ -468,15 +729,9 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, pref = rinfo->route_pref; if (pref == ICMPV6_ROUTER_PREF_INVALID) - pref = ICMPV6_ROUTER_PREF_MEDIUM; + return -EINVAL; - lifetime = ntohl(rinfo->lifetime); - if (lifetime == 0xffffffff) { - /* infinity */ - } else if (lifetime > 0x7fffffff/HZ) { - /* Avoid arithmetic overflow */ - lifetime = 0x7fffffff/HZ - 1; - } + lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); if (rinfo->length == 3) prefix = (struct in6_addr *)rinfo->prefix; @@ -488,7 +743,11 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, prefix = &prefix_buf; } - rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex); + if (rinfo->prefix_len == 0) + rt = rt6_get_dflt_router(gwaddr, dev); + else + rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, + gwaddr, dev->ifindex); if (rt && !lifetime) { ip6_del_rt(rt); @@ -496,28 +755,27 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } if (!rt && lifetime) - rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex, + rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex, pref); else if (rt) rt->rt6i_flags = RTF_ROUTEINFO | (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (lifetime == 0xffffffff) { - rt->rt6i_flags &= ~RTF_EXPIRES; - } else { - rt->rt6i_expires = jiffies + HZ * lifetime; - rt->rt6i_flags |= RTF_EXPIRES; - } - dst_release(&rt->u.dst); + if (!addrconf_finite_timeout(lifetime)) + rt6_clean_expires(rt); + else + rt6_set_expires(rt, jiffies + HZ * lifetime); + + ip6_rt_put(rt); } return 0; } #endif -#define BACKTRACK(saddr) \ +#define BACKTRACK(__net, saddr) \ do { \ - if (rt == &ip6_null_entry) { \ + if (rt == __net->ipv6.ip6_null_entry) { \ struct fib6_node *pn; \ while (1) { \ if (fn->fn_flags & RTN_TL_ROOT) \ @@ -531,47 +789,53 @@ do { \ goto restart; \ } \ } \ -} while(0) +} while (0) -static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table, - struct flowi *fl, int flags) +static struct rt6_info *ip6_pol_route_lookup(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, int flags) { struct fib6_node *fn; struct rt6_info *rt; read_lock_bh(&table->tb6_lock); - fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: rt = fn->leaf; - rt = rt6_device_match(rt, fl->oif, flags); - BACKTRACK(&fl->fl6_src); + rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + BACKTRACK(net, &fl6->saddr); out: - dst_use(&rt->u.dst, jiffies); + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); return rt; } -struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, - int oif, int strict) +struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6, + int flags) { - struct flowi fl = { - .oif = oif, - .nl_u = { - .ip6_u = { - .daddr = *daddr, - }, - }, + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); +} +EXPORT_SYMBOL_GPL(ip6_route_lookup); + +struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, + const struct in6_addr *saddr, int oif, int strict) +{ + struct flowi6 fl6 = { + .flowi6_oif = oif, + .daddr = *daddr, }; struct dst_entry *dst; int flags = strict ? RT6_LOOKUP_F_IFACE : 0; if (saddr) { - memcpy(&fl.fl6_src, saddr, sizeof(*saddr)); + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); flags |= RT6_LOOKUP_F_HAS_SADDR; } - dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup); + dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); if (dst->error == 0) return (struct rt6_info *) dst; @@ -588,14 +852,15 @@ EXPORT_SYMBOL(rt6_lookup); be destroyed. */ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, + struct nlattr *mx, int mx_len) { int err; struct fib6_table *table; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info); + err = fib6_add(&table->tb6_root, rt, info, mx, mx_len); write_unlock_bh(&table->tb6_lock); return err; @@ -604,13 +869,14 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) int ip6_ins_rt(struct rt6_info *rt) { struct nl_info info = { - .nl_net = &init_net, + .nl_net = dev_net(rt->dst.dev), }; - return __ip6_ins_rt(rt, &info); + return __ip6_ins_rt(rt, &info, NULL, 0); } -static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr, - struct in6_addr *saddr) +static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_info *rt; @@ -618,57 +884,45 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad * Clone the route. */ - rt = ip6_rt_copy(ort); + rt = ip6_rt_copy(ort, daddr); if (rt) { - if (!(rt->rt6i_flags&RTF_GATEWAY)) { - if (rt->rt6i_dst.plen != 128 && - ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)) - rt->rt6i_flags |= RTF_ANYCAST; - ipv6_addr_copy(&rt->rt6i_gateway, daddr); - } + if (ort->rt6i_dst.plen != 128 && + ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; - ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); - rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; - rt->u.dst.flags |= DST_HOST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { - ipv6_addr_copy(&rt->rt6i_src.addr, saddr); + rt->rt6i_src.addr = *saddr; rt->rt6i_src.plen = 128; } #endif - - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - } return rt; } -static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr) +static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, + const struct in6_addr *daddr) { - struct rt6_info *rt = ip6_rt_copy(ort); - if (rt) { - ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); - rt->rt6i_dst.plen = 128; + struct rt6_info *rt = ip6_rt_copy(ort, daddr); + + if (rt) rt->rt6i_flags |= RTF_CACHE; - rt->u.dst.flags |= DST_HOST; - rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop); - } return rt; } -static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif, - struct flowi *fl, int flags) +static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, + struct flowi6 *fl6, int flags) { struct fib6_node *fn; struct rt6_info *rt, *nrt; int strict = 0; int attempts = 3; int err; - int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; + int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -676,32 +930,31 @@ relookup: read_lock_bh(&table->tb6_lock); restart_2: - fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: rt = rt6_select(fn, oif, strict | reachable); - BACKTRACK(&fl->fl6_src); - if (rt == &ip6_null_entry || + if (rt->rt6i_nsiblings) + rt = rt6_multipath_select(rt, fl6, oif, strict | reachable); + BACKTRACK(net, &fl6->saddr); + if (rt == net->ipv6.ip6_null_entry || rt->rt6i_flags & RTF_CACHE) goto out; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) - nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src); - else { -#if CLONE_OFFLINK_ROUTE - nrt = rt6_alloc_clone(rt, &fl->fl6_dst); -#else + if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) + nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); + else if (!(rt->dst.flags & DST_HOST)) + nrt = rt6_alloc_clone(rt, &fl6->daddr); + else goto out2; -#endif - } - dst_release(&rt->u.dst); - rt = nrt ? : &ip6_null_entry; + ip6_rt_put(rt); + rt = nrt ? : net->ipv6.ip6_null_entry; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); if (nrt) { err = ip6_ins_rt(nrt); if (!err) @@ -715,7 +968,7 @@ restart: * Race condition! In the gap, when table->tb6_lock was * released someone could insert this route. Relookup. */ - dst_release(&rt->u.dst); + ip6_rt_put(rt); goto relookup; out: @@ -723,91 +976,100 @@ out: reachable = 0; goto restart_2; } - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); out2: - rt->u.dst.lastuse = jiffies; - rt->u.dst.__use++; + rt->dst.lastuse = jiffies; + rt->dst.__use++; return rt; } -static struct rt6_info *ip6_pol_route_input(struct fib6_table *table, - struct flowi *fl, int flags) +static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, + struct flowi6 *fl6, int flags) { - return ip6_pol_route(table, fl->iif, fl, flags); + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); +} + +static struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) +{ + if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) + flags |= RT6_LOOKUP_F_IFACE; + + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } void ip6_route_input(struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR; - struct flowi fl = { - .iif = skb->dev->ifindex, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK, - }, - }, - .mark = skb->mark, - .proto = iph->nexthdr, + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_mark = skb->mark, + .flowi6_proto = iph->nexthdr, }; - if (rt6_need_strict(&iph->daddr)) - flags |= RT6_LOOKUP_F_IFACE; - - skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input); + skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } -static struct rt6_info *ip6_pol_route_output(struct fib6_table *table, - struct flowi *fl, int flags) +static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, + struct flowi6 *fl6, int flags) { - return ip6_pol_route(table, fl->oif, fl, flags); + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) +struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, + struct flowi6 *fl6) { int flags = 0; - if (rt6_need_strict(&fl->fl6_dst)) + fl6->flowi6_iif = LOOPBACK_IFINDEX; + + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) flags |= RT6_LOOKUP_F_IFACE; - if (!ipv6_addr_any(&fl->fl6_src)) + if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; + else if (sk) + flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); - return fib6_rule_lookup(fl, flags, ip6_pol_route_output); + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } EXPORT_SYMBOL(ip6_route_output); -int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl) +struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rt6_info *ort = (struct rt6_info *) *dstp; - struct rt6_info *rt = (struct rt6_info *) - dst_alloc(&ip6_dst_blackhole_ops); + struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; struct dst_entry *new = NULL; + rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); if (rt) { - new = &rt->u.dst; + new = &rt->dst; + + memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); + rt6_init_peer(rt, net->ipv6.peers); - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); - new->dev = ort->u.dst.dev; - if (new->dev) - dev_hold(new->dev); + if (dst_metrics_read_only(&ort->dst)) + new->_metrics = ort->dst._metrics; + else + dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->rt6i_expires = 0; - ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); - rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -818,11 +1080,9 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl dst_free(new); } - dst_release(*dstp); - *dstp = new; - return (new ? 0 : -ENOMEM); + dst_release(dst_orig); + return new ? new : ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(ip6_dst_blackhole); /* * Destination cache support functions @@ -834,10 +1094,20 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt = (struct rt6_info *) dst; - if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) - return dst; + /* All IPV6 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + */ + if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) + return NULL; - return NULL; + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return dst; } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -845,53 +1115,213 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) struct rt6_info *rt = (struct rt6_info *) dst; if (rt) { - if (rt->rt6i_flags & RTF_CACHE) - ip6_del_rt(rt); - else + if (rt->rt6i_flags & RTF_CACHE) { + if (rt6_check_expired(rt)) { + ip6_del_rt(rt); + dst = NULL; + } + } else { dst_release(dst); + dst = NULL; + } } - return NULL; + return dst; } static void ip6_link_failure(struct sk_buff *skb) { struct rt6_info *rt; - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - rt = (struct rt6_info *) skb->dst; + rt = (struct rt6_info *) skb_dst(skb); if (rt) { - if (rt->rt6i_flags&RTF_CACHE) { - dst_set_expires(&rt->u.dst, 0); - rt->rt6i_flags |= RTF_EXPIRES; - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + if (rt->rt6i_flags & RTF_CACHE) { + dst_hold(&rt->dst); + if (ip6_del_rt(rt)) + dst_free(&rt->dst); + } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; + } } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct rt6_info *rt6 = (struct rt6_info*)dst; + dst_confirm(dst); if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { + struct net *net = dev_net(dst->dev); + rt6->rt6i_flags |= RTF_MODIFIED; if (mtu < IPV6_MIN_MTU) { + u32 features = dst_metric(dst, RTAX_FEATURES); mtu = IPV6_MIN_MTU; - dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(dst, RTAX_FEATURES, features); } - dst->metrics[RTAX_MTU-1] = mtu; - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + dst_metric_set(dst, RTAX_MTU, mtu); + rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); + } +} + +void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, + int oif, u32 mark) +{ + const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); + fl6.daddr = iph->daddr; + fl6.saddr = iph->saddr; + fl6.flowlabel = ip6_flowinfo(iph); + + dst = ip6_route_output(net, NULL, &fl6); + if (!dst->error) + ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); + dst_release(dst); +} +EXPORT_SYMBOL_GPL(ip6_update_pmtu); + +void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) +{ + ip6_update_pmtu(skb, sock_net(sk), mtu, + sk->sk_bound_dev_if, sk->sk_mark); +} +EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); + +/* Handle redirects */ +struct ip6rd_flowi { + struct flowi6 fl6; + struct in6_addr gateway; +}; + +static struct rt6_info *__ip6_route_redirect(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + int flags) +{ + struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; + struct rt6_info *rt; + struct fib6_node *fn; + + /* Get the "current" route for this destination and + * check if the redirect has come from approriate router. + * + * RFC 4861 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way the routes are chosen, this notion + * is a bit fuzzy and one might need to check all possible + * routes. + */ + + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt6_check_expired(rt)) + continue; + if (rt->dst.error) + break; + if (!(rt->rt6i_flags & RTF_GATEWAY)) + continue; + if (fl6->flowi6_oif != rt->dst.dev->ifindex) + continue; + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + continue; + break; } + + if (!rt) + rt = net->ipv6.ip6_null_entry; + else if (rt->dst.error) { + rt = net->ipv6.ip6_null_entry; + goto out; + } + BACKTRACK(net, &fl6->saddr); +out: + dst_hold(&rt->dst); + + read_unlock_bh(&table->tb6_lock); + + return rt; +}; + +static struct dst_entry *ip6_route_redirect(struct net *net, + const struct flowi6 *fl6, + const struct in6_addr *gateway) +{ + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip6rd_flowi rdfl; + + rdfl.fl6 = *fl6; + rdfl.gateway = *gateway; + + return fib6_rule_lookup(net, &rdfl.fl6, + flags, __ip6_route_redirect); } -static int ipv6_get_mtu(struct net_device *dev); +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) +{ + const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.daddr = iph->daddr; + fl6.saddr = iph->saddr; + fl6.flowlabel = ip6_flowinfo(iph); + + dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + rt6_do_redirect(dst, NULL, skb); + dst_release(dst); +} +EXPORT_SYMBOL_GPL(ip6_redirect); -static inline unsigned int ipv6_advmss(unsigned int mtu) +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, + u32 mark) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.daddr = msg->dest; + fl6.saddr = iph->daddr; + + dst = ip6_route_redirect(net, &fl6, &iph->saddr); + rt6_do_redirect(dst, NULL, skb); + dst_release(dst); +} + +void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); +} +EXPORT_SYMBOL_GPL(ip6_sk_redirect); + +static unsigned int ip6_default_advmss(const struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + unsigned int mtu = dst_mtu(dst); + struct net *net = dev_net(dev); + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); - if (mtu < init_net.ipv6.sysctl.ip6_rt_min_advmss) - mtu = init_net.ipv6.sysctl.ip6_rt_min_advmss; + if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) + mtu = net->ipv6.sysctl.ip6_rt_min_advmss; /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and @@ -904,138 +1334,134 @@ static inline unsigned int ipv6_advmss(unsigned int mtu) return mtu; } -static struct dst_entry *ndisc_dst_gc_list; -static DEFINE_SPINLOCK(ndisc_lock); +static unsigned int ip6_mtu(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); -struct dst_entry *ndisc_dst_alloc(struct net_device *dev, - struct neighbour *neigh, - struct in6_addr *addr, - int (*output)(struct sk_buff *)) + if (mtu) + goto out; + + mtu = IPV6_MIN_MTU; + + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + +out: + return min_t(unsigned int, mtu, IP6_MAX_MTU); +} + +static struct dst_entry *icmp6_dst_gc_list; +static DEFINE_SPINLOCK(icmp6_dst_lock); + +struct dst_entry *icmp6_dst_alloc(struct net_device *dev, + struct flowi6 *fl6) { + struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); + struct net *net = dev_net(dev); - if (unlikely(idev == NULL)) - return NULL; + if (unlikely(!idev)) + return ERR_PTR(-ENODEV); - rt = ip6_dst_alloc(); - if (unlikely(rt == NULL)) { + rt = ip6_dst_alloc(net, dev, 0, NULL); + if (unlikely(!rt)) { in6_dev_put(idev); + dst = ERR_PTR(-ENOMEM); goto out; } - dev_hold(dev); - if (neigh) - neigh_hold(neigh); - else - neigh = ndisc_get_neigh(dev, addr); - - rt->rt6i_dev = dev; - rt->rt6i_idev = idev; - rt->rt6i_nexthop = neigh; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); - rt->u.dst.output = output; - -#if 0 /* there's no chance to use these for ndisc */ - rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST - ? DST_HOST - : 0; - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->dst.flags |= DST_HOST; + rt->dst.output = ip6_output; + atomic_set(&rt->dst.__refcnt, 1); + rt->rt6i_gateway = fl6->daddr; + rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; -#endif + rt->rt6i_idev = idev; + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); - spin_lock_bh(&ndisc_lock); - rt->u.dst.next = ndisc_dst_gc_list; - ndisc_dst_gc_list = &rt->u.dst; - spin_unlock_bh(&ndisc_lock); + spin_lock_bh(&icmp6_dst_lock); + rt->dst.next = icmp6_dst_gc_list; + icmp6_dst_gc_list = &rt->dst; + spin_unlock_bh(&icmp6_dst_lock); - fib6_force_start_gc(); + fib6_force_start_gc(net); + + dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); out: - return &rt->u.dst; + return dst; } -int ndisc_dst_gc(int *more) +int icmp6_dst_gc(void) { - struct dst_entry *dst, *next, **pprev; - int freed; - - next = NULL; - freed = 0; + struct dst_entry *dst, **pprev; + int more = 0; - spin_lock_bh(&ndisc_lock); - pprev = &ndisc_dst_gc_list; + spin_lock_bh(&icmp6_dst_lock); + pprev = &icmp6_dst_gc_list; while ((dst = *pprev) != NULL) { if (!atomic_read(&dst->__refcnt)) { *pprev = dst->next; dst_free(dst); - freed++; } else { pprev = &dst->next; - (*more)++; + ++more; } } - spin_unlock_bh(&ndisc_lock); + spin_unlock_bh(&icmp6_dst_lock); - return freed; + return more; } -static int ip6_dst_gc(struct dst_ops *ops) +static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), + void *arg) { - static unsigned expire = 30*HZ; - static unsigned long last_gc; - unsigned long now = jiffies; - - if (time_after(last_gc + init_net.ipv6.sysctl.ip6_rt_gc_min_interval, now) && - atomic_read(&ip6_dst_ops.entries) <= init_net.ipv6.sysctl.ip6_rt_max_size) - goto out; + struct dst_entry *dst, **pprev; - expire++; - fib6_run_gc(expire); - last_gc = now; - if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh) - expire = init_net.ipv6.sysctl.ip6_rt_gc_timeout>>1; - -out: - expire -= expire>>init_net.ipv6.sysctl.ip6_rt_gc_elasticity; - return (atomic_read(&ip6_dst_ops.entries) > init_net.ipv6.sysctl.ip6_rt_max_size); -} - -/* Clean host part of a prefix. Not necessary in radix tree, - but results in cleaner routing tables. - - Remove it only when all the things will work! - */ - -static int ipv6_get_mtu(struct net_device *dev) -{ - int mtu = IPV6_MIN_MTU; - struct inet6_dev *idev; - - idev = in6_dev_get(dev); - if (idev) { - mtu = idev->cnf.mtu6; - in6_dev_put(idev); + spin_lock_bh(&icmp6_dst_lock); + pprev = &icmp6_dst_gc_list; + while ((dst = *pprev) != NULL) { + struct rt6_info *rt = (struct rt6_info *) dst; + if (func(rt, arg)) { + *pprev = dst->next; + dst_free(dst); + } else { + pprev = &dst->next; + } } - return mtu; + spin_unlock_bh(&icmp6_dst_lock); } -int ipv6_get_hoplimit(struct net_device *dev) +static int ip6_dst_gc(struct dst_ops *ops) { - int hoplimit = ipv6_devconf.hop_limit; - struct inet6_dev *idev; + struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); + int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; + int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; + int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; + int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; + unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + int entries; + + entries = dst_entries_get_fast(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && + entries <= rt_max_size) + goto out; - idev = in6_dev_get(dev); - if (idev) { - hoplimit = idev->cnf.hop_limit; - in6_dev_put(idev); - } - return hoplimit; + net->ipv6.ip6_rt_gc_expire++; + fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + entries = dst_entries_get_slow(ops); + if (entries < ops->gc_thresh) + net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; +out: + net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + return entries > rt_max_size; } /* @@ -1045,6 +1471,7 @@ int ipv6_get_hoplimit(struct net_device *dev) int ip6_route_add(struct fib6_config *cfg) { int err; + struct net *net = cfg->fc_nlinfo.nl_net; struct rt6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; @@ -1059,7 +1486,7 @@ int ip6_route_add(struct fib6_config *cfg) #endif if (cfg->fc_ifindex) { err = -ENODEV; - dev = dev_get_by_index(&init_net, cfg->fc_ifindex); + dev = dev_get_by_index(net, cfg->fc_ifindex); if (!dev) goto out; idev = in6_dev_get(dev); @@ -1070,21 +1497,33 @@ int ip6_route_add(struct fib6_config *cfg) if (cfg->fc_metric == 0) cfg->fc_metric = IP6_RT_PRIO_USER; - table = fib6_new_table(cfg->fc_table); - if (table == NULL) { - err = -ENOBUFS; - goto out; + err = -ENOBUFS; + if (cfg->fc_nlinfo.nlh && + !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { + table = fib6_get_table(net, cfg->fc_table); + if (!table) { + pr_warn("NLM_F_CREATE should be specified when creating new route\n"); + table = fib6_new_table(net, cfg->fc_table); + } + } else { + table = fib6_new_table(net, cfg->fc_table); } - rt = ip6_dst_alloc(); + if (!table) + goto out; + + rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table); - if (rt == NULL) { + if (!rt) { err = -ENOMEM; goto out; } - rt->u.dst.obsolete = -1; - rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires); + if (cfg->fc_flags & RTF_EXPIRES) + rt6_set_expires(rt, jiffies + + clock_t_to_jiffies(cfg->fc_expires)); + else + rt6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -1093,16 +1532,20 @@ int ip6_route_add(struct fib6_config *cfg) addr_type = ipv6_addr_type(&cfg->fc_dst); if (addr_type & IPV6_ADDR_MULTICAST) - rt->u.dst.input = ip6_mc_input; + rt->dst.input = ip6_mc_input; + else if (cfg->fc_flags & RTF_LOCAL) + rt->dst.input = ip6_input; else - rt->u.dst.input = ip6_forward; + rt->dst.input = ip6_forward; - rt->u.dst.output = ip6_output; + rt->dst.output = ip6_output; ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) - rt->u.dst.flags = DST_HOST; + if (rt->rt6i_dst.plen == 128) { + rt->dst.flags |= DST_HOST; + dst_metrics_set_force_overwrite(&rt->dst); + } #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -1115,14 +1558,16 @@ int ip6_route_add(struct fib6_config *cfg) they would result in kernel looping; promote them to reject routes */ if ((cfg->fc_flags & RTF_REJECT) || - (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + (dev && (dev->flags & IFF_LOOPBACK) && + !(addr_type & IPV6_ADDR_LOOPBACK) && + !(cfg->fc_flags & RTF_LOCAL))) { /* hold loopback dev/idev if we haven't done so. */ - if (dev != init_net.loopback_dev) { + if (dev != net->loopback_dev) { if (dev) { dev_put(dev); in6_dev_put(idev); } - dev = init_net.loopback_dev; + dev = net->loopback_dev; dev_hold(dev); idev = in6_dev_get(dev); if (!idev) { @@ -1130,19 +1575,35 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } } - rt->u.dst.output = ip6_pkt_discard_out; - rt->u.dst.input = ip6_pkt_discard; - rt->u.dst.error = -ENETUNREACH; rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + switch (cfg->fc_type) { + case RTN_BLACKHOLE: + rt->dst.error = -EINVAL; + rt->dst.output = dst_discard_sk; + rt->dst.input = dst_discard; + break; + case RTN_PROHIBIT: + rt->dst.error = -EACCES; + rt->dst.output = ip6_pkt_prohibit_out; + rt->dst.input = ip6_pkt_prohibit; + break; + case RTN_THROW: + default: + rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN + : -ENETUNREACH; + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + break; + } goto install_route; } if (cfg->fc_flags & RTF_GATEWAY) { - struct in6_addr *gw_addr; + const struct in6_addr *gw_addr; int gwa_type; gw_addr = &cfg->fc_gateway; - ipv6_addr_copy(&rt->rt6i_gateway, gw_addr); + rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { @@ -1156,81 +1617,61 @@ int ip6_route_add(struct fib6_config *cfg) some exceptions. --ANK */ err = -EINVAL; - if (!(gwa_type&IPV6_ADDR_UNICAST)) + if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1); + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); err = -EHOSTUNREACH; - if (grt == NULL) + if (!grt) goto out; if (dev) { - if (dev != grt->rt6i_dev) { - dst_release(&grt->u.dst); + if (dev != grt->dst.dev) { + ip6_rt_put(grt); goto out; } } else { - dev = grt->rt6i_dev; + dev = grt->dst.dev; idev = grt->rt6i_idev; dev_hold(dev); in6_dev_hold(grt->rt6i_idev); } - if (!(grt->rt6i_flags&RTF_GATEWAY)) + if (!(grt->rt6i_flags & RTF_GATEWAY)) err = 0; - dst_release(&grt->u.dst); + ip6_rt_put(grt); if (err) goto out; } err = -EINVAL; - if (dev == NULL || (dev->flags&IFF_LOOPBACK)) + if (!dev || (dev->flags & IFF_LOOPBACK)) goto out; } err = -ENODEV; - if (dev == NULL) + if (!dev) goto out; - if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { - rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); - if (IS_ERR(rt->rt6i_nexthop)) { - err = PTR_ERR(rt->rt6i_nexthop); - rt->rt6i_nexthop = NULL; + if (!ipv6_addr_any(&cfg->fc_prefsrc)) { + if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { + err = -EINVAL; goto out; } - } + rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; + rt->rt6i_prefsrc.plen = 128; + } else + rt->rt6i_prefsrc.plen = 0; rt->rt6i_flags = cfg->fc_flags; install_route: - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - if (type > RTAX_MAX) { - err = -EINVAL; - goto out; - } - - rt->u.dst.metrics[type - 1] = nla_get_u32(nla); - } - } - } - - if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; - if (!rt->u.dst.metrics[RTAX_MTU-1]) - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); - if (!rt->u.dst.metrics[RTAX_ADVMSS-1]) - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); - rt->u.dst.dev = dev; + rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; - return __ip6_ins_rt(rt, &cfg->fc_nlinfo); + + cfg->fc_nlinfo.nl_net = dev_net(dev); + + return __ip6_ins_rt(rt, &cfg->fc_nlinfo, cfg->fc_mx, cfg->fc_mx_len); out: if (dev) @@ -1238,7 +1679,7 @@ out: if (idev) in6_dev_put(idev); if (rt) - dst_free(&rt->u.dst); + dst_free(&rt->dst); return err; } @@ -1246,25 +1687,27 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) { int err; struct fib6_table *table; + struct net *net = dev_net(rt->dst.dev); - if (rt == &ip6_null_entry) - return -ENOENT; + if (rt == net->ipv6.ip6_null_entry) { + err = -ENOENT; + goto out; + } table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_del(rt, info); - dst_release(&rt->u.dst); - write_unlock_bh(&table->tb6_lock); +out: + ip6_rt_put(rt); return err; } int ip6_del_rt(struct rt6_info *rt) { struct nl_info info = { - .nl_net = &init_net, + .nl_net = dev_net(rt->dst.dev), }; return __ip6_del_rt(rt, &info); } @@ -1276,8 +1719,8 @@ static int ip6_route_del(struct fib6_config *cfg) struct rt6_info *rt; int err = -ESRCH; - table = fib6_get_table(cfg->fc_table); - if (table == NULL) + table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); + if (!table) return err; read_lock_bh(&table->tb6_lock); @@ -1287,17 +1730,17 @@ static int ip6_route_del(struct fib6_config *cfg) &cfg->fc_src, cfg->fc_src_len); if (fn) { - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (cfg->fc_ifindex && - (rt->rt6i_dev == NULL || - rt->rt6i_dev->ifindex != cfg->fc_ifindex)) + (!rt->dst.dev || + rt->dst.dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) continue; if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) continue; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); return __ip6_del_rt(rt, &cfg->fc_nlinfo); @@ -1308,107 +1751,83 @@ static int ip6_route_del(struct fib6_config *cfg) return err; } -/* - * Handle redirects - */ -struct ip6rd_flowi { - struct flowi fl; - struct in6_addr gateway; -}; - -static struct rt6_info *__ip6_route_redirect(struct fib6_table *table, - struct flowi *fl, - int flags) +static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { - struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl; - struct rt6_info *rt; - struct fib6_node *fn; + struct net *net = dev_net(skb->dev); + struct netevent_redirect netevent; + struct rt6_info *rt, *nrt = NULL; + struct ndisc_options ndopts; + struct inet6_dev *in6_dev; + struct neighbour *neigh; + struct rd_msg *msg; + int optlen, on_link; + u8 *lladdr; + + optlen = skb_tail_pointer(skb) - skb_transport_header(skb); + optlen -= sizeof(*msg); + + if (optlen < 0) { + net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); + return; + } - /* - * Get the "current" route for this destination and - * check if the redirect has come from approriate router. - * - * RFC 2461 specifies that redirects should only be - * accepted if they come from the nexthop to the target. - * Due to the way the routes are chosen, this notion - * is a bit fuzzy and one might need to check all possible - * routes. - */ + msg = (struct rd_msg *)icmp6_hdr(skb); - read_lock_bh(&table->tb6_lock); - fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); -restart: - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { - /* - * Current route is on-link; redirect is always invalid. - * - * Seems, previous statement is not true. It could - * be node, which looks for us as on-link (f.e. proxy ndisc) - * But then router serving it might decide, that we should - * know truth 8)8) --ANK (980726). - */ - if (rt6_check_expired(rt)) - continue; - if (!(rt->rt6i_flags & RTF_GATEWAY)) - continue; - if (fl->oif != rt->rt6i_dev->ifindex) - continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) - continue; - break; + if (ipv6_addr_is_multicast(&msg->dest)) { + net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); + return; } - if (!rt) - rt = &ip6_null_entry; - BACKTRACK(&fl->fl6_src); -out: - dst_hold(&rt->u.dst); - - read_unlock_bh(&table->tb6_lock); + on_link = 0; + if (ipv6_addr_equal(&msg->dest, &msg->target)) { + on_link = 1; + } else if (ipv6_addr_type(&msg->target) != + (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { + net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); + return; + } - return rt; -}; + in6_dev = __in6_dev_get(skb->dev); + if (!in6_dev) + return; + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) + return; -static struct rt6_info *ip6_route_redirect(struct in6_addr *dest, - struct in6_addr *src, - struct in6_addr *gateway, - struct net_device *dev) -{ - int flags = RT6_LOOKUP_F_HAS_SADDR; - struct ip6rd_flowi rdfl = { - .fl = { - .oif = dev->ifindex, - .nl_u = { - .ip6_u = { - .daddr = *dest, - .saddr = *src, - }, - }, - }, - .gateway = *gateway, - }; + /* RFC2461 8.1: + * The IP source address of the Redirect MUST be the same as the current + * first-hop router for the specified ICMP Destination Address. + */ - if (rt6_need_strict(dest)) - flags |= RT6_LOOKUP_F_IFACE; + if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) { + net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); + return; + } - return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect); -} + lladdr = NULL; + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, + skb->dev); + if (!lladdr) { + net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); + return; + } + } -void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, - struct in6_addr *saddr, - struct neighbour *neigh, u8 *lladdr, int on_link) -{ - struct rt6_info *rt, *nrt = NULL; - struct netevent_redirect netevent; + rt = (struct rt6_info *) dst; + if (rt == net->ipv6.ip6_null_entry) { + net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); + return; + } - rt = ip6_route_redirect(dest, src, saddr, neigh->dev); + /* Redirect received -> path was valid. + * Look, redirects are sent only in response to data packets, + * so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->dst); - if (rt == &ip6_null_entry) { - if (net_ratelimit()) - printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " - "for redirect target\n"); - goto out; - } + neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); + if (!neigh) + return; /* * We have finally decided to accept it. @@ -1421,202 +1840,113 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, NEIGH_UPDATE_F_ISROUTER)) ); - /* - * Redirect received -> path was valid. - * Look, redirects are sent only in response to data packets, - * so that this nexthop apparently is reachable. --ANK - */ - dst_confirm(&rt->u.dst); - - /* Duplicate redirect: silently ignore. */ - if (neigh == rt->u.dst.neighbour) - goto out; - - nrt = ip6_rt_copy(rt); - if (nrt == NULL) + nrt = ip6_rt_copy(rt, &msg->dest); + if (!nrt) goto out; nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; - ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); - nrt->rt6i_dst.plen = 128; - nrt->u.dst.flags |= DST_HOST; - - ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); - nrt->rt6i_nexthop = neigh_clone(neigh); - /* Reset pmtu, it may be better */ - nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); - nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); + nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; if (ip6_ins_rt(nrt)) goto out; - netevent.old = &rt->u.dst; - netevent.new = &nrt->u.dst; + netevent.old = &rt->dst; + netevent.new = &nrt->dst; + netevent.daddr = &msg->dest; + netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags&RTF_CACHE) { - ip6_del_rt(rt); - return; - } - -out: - dst_release(&rt->u.dst); - return; -} - -/* - * Handle ICMP "packet too big" messages - * i.e. Path MTU discovery - */ - -void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, - struct net_device *dev, u32 pmtu) -{ - struct rt6_info *rt, *nrt; - int allfrag = 0; - - rt = rt6_lookup(daddr, saddr, dev->ifindex, 0); - if (rt == NULL) - return; - - if (pmtu >= dst_mtu(&rt->u.dst)) - goto out; - - if (pmtu < IPV6_MIN_MTU) { - /* - * According to RFC2460, PMTU is set to the IPv6 Minimum Link - * MTU (1280) and a fragment header should always be included - * after a node receiving Too Big message reporting PMTU is - * less than the IPv6 Minimum Link MTU. - */ - pmtu = IPV6_MIN_MTU; - allfrag = 1; - } - - /* New mtu received -> path was valid. - They are sent only in response to data packets, - so that this nexthop apparently is reachable. --ANK - */ - dst_confirm(&rt->u.dst); - - /* Host route. If it is static, it would be better - not to override it, but add new one, so that - when cache entry will expire old pmtu - would return automatically. - */ if (rt->rt6i_flags & RTF_CACHE) { - rt->u.dst.metrics[RTAX_MTU-1] = pmtu; - if (allfrag) - rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; - dst_set_expires(&rt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires); - rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; - goto out; + rt = (struct rt6_info *) dst_clone(&rt->dst); + ip6_del_rt(rt); } - /* Network route. - Two cases are possible: - 1. It is connected route. Action: COW - 2. It is gatewayed route or NONEXTHOP route. Action: clone it. - */ - if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) - nrt = rt6_alloc_cow(rt, daddr, saddr); - else - nrt = rt6_alloc_clone(rt, daddr); - - if (nrt) { - nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; - if (allfrag) - nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; - - /* According to RFC 1981, detecting PMTU increase shouldn't be - * happened within 5 mins, the recommended timer is 10 mins. - * Here this route expiration time is set to ip6_rt_mtu_expires - * which is 10 mins. After 10 mins the decreased pmtu is expired - * and detecting PMTU increase will be automatically happened. - */ - dst_set_expires(&nrt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires); - nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; - - ip6_ins_rt(nrt); - } out: - dst_release(&rt->u.dst); + neigh_release(neigh); } /* * Misc support functions */ -static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, + const struct in6_addr *dest) { - struct rt6_info *rt = ip6_dst_alloc(); + struct net *net = dev_net(ort->dst.dev); + struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0, + ort->rt6i_table); if (rt) { - rt->u.dst.input = ort->u.dst.input; - rt->u.dst.output = ort->u.dst.output; - - memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); - rt->u.dst.error = ort->u.dst.error; - rt->u.dst.dev = ort->u.dst.dev; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->dst.flags |= DST_HOST; + + rt->rt6i_dst.addr = *dest; + rt->rt6i_dst.plen = 128; + dst_copy_metrics(&rt->dst, &ort->dst); + rt->dst.error = ort->dst.error; rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->u.dst.lastuse = jiffies; - rt->rt6i_expires = 0; + rt->dst.lastuse = jiffies; - ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); - rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + if (ort->rt6i_flags & RTF_GATEWAY) + rt->rt6i_gateway = ort->rt6i_gateway; + else + rt->rt6i_gateway = *dest; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); rt->rt6i_metric = 0; - memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif + memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); rt->rt6i_table = ort->rt6i_table; } return rt; } #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex) +static struct rt6_info *rt6_get_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex) { struct fib6_node *fn; struct rt6_info *rt = NULL; struct fib6_table *table; - table = fib6_get_table(RT6_TABLE_INFO); - if (table == NULL) + table = fib6_get_table(net, RT6_TABLE_INFO); + if (!table) return NULL; - write_lock_bh(&table->tb6_lock); + read_lock_bh(&table->tb6_lock); fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { - if (rt->rt6i_dev->ifindex != ifindex) + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt->dst.dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); break; } out: - write_unlock_bh(&table->tb6_lock); + read_unlock_bh(&table->tb6_lock); return rt; } -static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex, - unsigned pref) +static struct rt6_info *rt6_add_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned int pref) { struct fib6_config cfg = { .fc_table = RT6_TABLE_INFO, @@ -1625,10 +1955,13 @@ static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixle .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), + .fc_nlinfo.portid = 0, + .fc_nlinfo.nlh = NULL, + .fc_nlinfo.nl_net = net, }; - ipv6_addr_copy(&cfg.fc_dst, prefix); - ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + cfg.fc_dst = *prefix; + cfg.fc_gateway = *gwaddr; /* We should treat it as a default route if prefix length is 0. */ if (!prefixlen) @@ -1636,35 +1969,33 @@ static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixle ip6_route_add(&cfg); - return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex); + return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex); } #endif -struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev) +struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) { struct rt6_info *rt; struct fib6_table *table; - table = fib6_get_table(RT6_TABLE_DFLT); - if (table == NULL) + table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT); + if (!table) return NULL; - write_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) { - if (dev == rt->rt6i_dev && + read_lock_bh(&table->tb6_lock); + for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { + if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) break; } if (rt) - dst_hold(&rt->u.dst); - write_unlock_bh(&table->tb6_lock); + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); return rt; } -EXPORT_SYMBOL(rt6_get_dflt_router); - -struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { @@ -1674,30 +2005,34 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), + .fc_nlinfo.portid = 0, + .fc_nlinfo.nlh = NULL, + .fc_nlinfo.nl_net = dev_net(dev), }; - ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + cfg.fc_gateway = *gwaddr; ip6_route_add(&cfg); return rt6_get_dflt_router(gwaddr, dev); } -void rt6_purge_dflt_routers(void) +void rt6_purge_dflt_routers(struct net *net) { struct rt6_info *rt; struct fib6_table *table; /* NOTE: Keep consistent with rt6_get_dflt_router */ - table = fib6_get_table(RT6_TABLE_DFLT); - if (table == NULL) + table = fib6_get_table(net, RT6_TABLE_DFLT); + if (!table) return; restart: read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { - dst_hold(&rt->u.dst); + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && + (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); ip6_del_rt(rt); goto restart; @@ -1706,7 +2041,8 @@ restart: read_unlock_bh(&table->tb6_lock); } -static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg, +static void rtmsg_to_fib6_config(struct net *net, + struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { memset(cfg, 0, sizeof(*cfg)); @@ -1719,14 +2055,14 @@ static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg, cfg->fc_src_len = rtmsg->rtmsg_src_len; cfg->fc_flags = rtmsg->rtmsg_flags; - cfg->fc_nlinfo.nl_net = &init_net; + cfg->fc_nlinfo.nl_net = net; - ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst); - ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src); - ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway); + cfg->fc_dst = rtmsg->rtmsg_dst; + cfg->fc_src = rtmsg->rtmsg_src; + cfg->fc_gateway = rtmsg->rtmsg_gateway; } -int ipv6_route_ioctl(unsigned int cmd, void __user *arg) +int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct fib6_config cfg; struct in6_rtmsg rtmsg; @@ -1735,14 +2071,14 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) switch(cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = copy_from_user(&rtmsg, arg, sizeof(struct in6_rtmsg)); if (err) return -EFAULT; - rtmsg_to_fib6_config(&rtmsg, &cfg); + rtmsg_to_fib6_config(net, &rtmsg, &cfg); rtnl_lock(); switch (cmd) { @@ -1767,22 +2103,25 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) * Drop the packet on the floor */ -static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes) +static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { int type; + struct dst_entry *dst = skb_dst(skb); switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); - if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) { - IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS); + if (type == IPV6_ADDR_ANY) { + IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IPSTATS_MIB_INADDRERRORS); break; } /* FALLTHROUGH */ case IPSTATS_MIB_OUTNOROUTES: - IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes); + IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + ipstats_mib_noroutes); break; } - icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb(skb); return 0; } @@ -1792,92 +2131,161 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) { - skb->dev = skb->dst->dev; + skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - static int ip6_pkt_prohibit(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) { - skb->dev = skb->dst->dev; + skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } -#endif - /* * Allocate a dst for local (unicast / anycast) address. */ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, const struct in6_addr *addr, - int anycast) + bool anycast) { - struct rt6_info *rt = ip6_dst_alloc(); - - if (rt == NULL) + struct net *net = dev_net(idev->dev); + struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, + DST_NOCOUNT, NULL); + if (!rt) return ERR_PTR(-ENOMEM); - dev_hold(init_net.loopback_dev); in6_dev_hold(idev); - rt->u.dst.flags = DST_HOST; - rt->u.dst.input = ip6_input; - rt->u.dst.output = ip6_output; - rt->rt6i_dev = init_net.loopback_dev; + rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; + rt->dst.output = ip6_output; rt->rt6i_idev = idev; - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; - rt->u.dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) rt->rt6i_flags |= RTF_ANYCAST; else rt->rt6i_flags |= RTF_LOCAL; - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - if (rt->rt6i_nexthop == NULL) { - dst_free(&rt->u.dst); - return ERR_PTR(-ENOMEM); - } - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_gateway = *addr; + rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; - rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL); + rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); - atomic_set(&rt->u.dst.__refcnt, 1); + atomic_set(&rt->dst.__refcnt, 1); return rt; } -static int fib6_ifdown(struct rt6_info *rt, void *arg) +int ip6_route_get_saddr(struct net *net, + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr) +{ + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + int err = 0; + if (rt->rt6i_prefsrc.plen) + *saddr = rt->rt6i_prefsrc.addr; + else + err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, + daddr, prefs, saddr); + return err; +} + +/* remove deleted ip from prefsrc entries */ +struct arg_dev_net_ip { + struct net_device *dev; + struct net *net; + struct in6_addr *addr; +}; + +static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +{ + struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; + struct net *net = ((struct arg_dev_net_ip *)arg)->net; + struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; + + if (((void *)rt->dst.dev == dev || !dev) && + rt != net->ipv6.ip6_null_entry && + ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + /* remove prefsrc entry */ + rt->rt6i_prefsrc.plen = 0; + } + return 0; +} + +void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) +{ + struct net *net = dev_net(ifp->idev->dev); + struct arg_dev_net_ip adni = { + .dev = ifp->idev->dev, + .net = net, + .addr = &ifp->addr, + }; + fib6_clean_all(net, fib6_remove_prefsrc, &adni); +} + +#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +/* Remove routers and update dst entries when gateway turn into host. */ +static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { - if (((void*)rt->rt6i_dev == arg || arg == NULL) && - rt != &ip6_null_entry) { - RT6_TRACE("deleted by ifdown %p\n", rt); + struct in6_addr *gateway = (struct in6_addr *)arg; + + if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || + ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } return 0; } -void rt6_ifdown(struct net_device *dev) +void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) { - fib6_clean_all(fib6_ifdown, 0, dev); + fib6_clean_all(net, fib6_clean_tohost, gateway); } -struct rt6_mtu_change_arg +struct arg_dev_net { + struct net_device *dev; + struct net *net; +}; + +static int fib6_ifdown(struct rt6_info *rt, void *arg) { + const struct arg_dev_net *adn = arg; + const struct net_device *dev = adn->dev; + + if ((rt->dst.dev == dev || !dev) && + rt != adn->net->ipv6.ip6_null_entry) + return -1; + + return 0; +} + +void rt6_ifdown(struct net *net, struct net_device *dev) +{ + struct arg_dev_net adn = { + .dev = dev, + .net = net, + }; + + fib6_clean_all(net, fib6_ifdown, &adn); + icmp6_clean_all(fib6_ifdown, &adn); +} + +struct rt6_mtu_change_arg { struct net_device *dev; - unsigned mtu; + unsigned int mtu; }; static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) @@ -1892,7 +2300,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) */ idev = __in6_dev_get(arg->dev); - if (idev == NULL) + if (!idev) return 0; /* For administrative MTU increase, there is no way to discover @@ -1909,25 +2317,24 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) also have the lowest MTU, TOO BIG MESSAGE will be lead to PMTU discouvery. */ - if (rt->rt6i_dev == arg->dev && - !dst_metric_locked(&rt->u.dst, RTAX_MTU) && - (dst_mtu(&rt->u.dst) >= arg->mtu || - (dst_mtu(&rt->u.dst) < arg->mtu && - dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) { - rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu; - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu); + if (rt->dst.dev == arg->dev && + !dst_metric_locked(&rt->dst, RTAX_MTU) && + (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6))) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } return 0; } -void rt6_mtu_change(struct net_device *dev, unsigned mtu) +void rt6_mtu_change(struct net_device *dev, unsigned int mtu) { struct rt6_mtu_change_arg arg = { .dev = dev, .mtu = mtu, }; - fib6_clean_all(rt6_mtu_change_route, 0, &arg); + fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { @@ -1936,6 +2343,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_IIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1958,13 +2366,20 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_src_len = rtm->rtm_src_len; cfg->fc_flags = RTF_UP; cfg->fc_protocol = rtm->rtm_protocol; + cfg->fc_type = rtm->rtm_type; - if (rtm->rtm_type == RTN_UNREACHABLE) + if (rtm->rtm_type == RTN_UNREACHABLE || + rtm->rtm_type == RTN_BLACKHOLE || + rtm->rtm_type == RTN_PROHIBIT || + rtm->rtm_type == RTN_THROW) cfg->fc_flags |= RTF_REJECT; - cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; + if (rtm->rtm_type == RTN_LOCAL) + cfg->fc_flags |= RTF_LOCAL; + + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; - cfg->fc_nlinfo.nl_net = skb->sk->sk_net; + cfg->fc_nlinfo.nl_net = sock_net(skb->sk); if (tb[RTA_GATEWAY]) { nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16); @@ -1989,6 +2404,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); } + if (tb[RTA_PREFSRC]) + nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16); + if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); @@ -2003,41 +2421,99 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_TABLE]) cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); + if (tb[RTA_MULTIPATH]) { + cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); + cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); + } + err = 0; errout: return err; } -static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int ip6_route_multipath(struct fib6_config *cfg, int add) +{ + struct fib6_config r_cfg; + struct rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 0, last_err = 0; + +beginning: + rtnh = (struct rtnexthop *)cfg->fc_mp; + remaining = cfg->fc_mp_len; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + if (err) { + last_err = err; + /* If we are trying to remove a route, do not stop the + * loop when ip6_route_del() fails (because next hop is + * already gone), we should try to remove all next hops. + */ + if (add) { + /* If add fails, we should try to delete all + * next hops that have been already added. + */ + add = 0; + goto beginning; + } + } + /* Because each route is added like a single route we remove + * this flag after the first nexthop (if there is a collision, + * we have already fail to add the first nexthop: + * fib6_add_rt2node() has reject it). + */ + cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL; + rtnh = rtnh_next(rtnh, &remaining); + } + + return last_err; +} + +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) { - struct net *net = skb->sk->sk_net; struct fib6_config cfg; int err; - if (net != &init_net) - return -EINVAL; - err = rtm_to_fib6_config(skb, nlh, &cfg); if (err < 0) return err; - return ip6_route_del(&cfg); + if (cfg.fc_mp) + return ip6_route_multipath(&cfg, 0); + else + return ip6_route_del(&cfg); } -static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh) { - struct net *net = skb->sk->sk_net; struct fib6_config cfg; int err; - if (net != &init_net) - return -EINVAL; - err = rtm_to_fib6_config(skb, nlh, &cfg); if (err < 0) return err; - return ip6_route_add(&cfg); + if (cfg.fc_mp) + return ip6_route_multipath(&cfg, 1); + else + return ip6_route_add(&cfg); } static inline size_t rt6_nlmsg_size(void) @@ -2055,10 +2531,11 @@ static inline size_t rt6_nlmsg_size(void) + nla_total_size(sizeof(struct rta_cacheinfo)); } -static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, +static int rt6_fill_node(struct net *net, + struct sk_buff *skb, struct rt6_info *rt, struct in6_addr *dst, struct in6_addr *src, - int iif, int type, u32 pid, u32 seq, - int prefix, unsigned int flags) + int iif, int type, u32 portid, u32 seq, + int prefix, int nowait, unsigned int flags) { struct rtmsg *rtm; struct nlmsghdr *nlh; @@ -2072,8 +2549,8 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, } } - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags); - if (nlh == NULL) + nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); + if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); @@ -2086,60 +2563,110 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table; - NLA_PUT_U32(skb, RTA_TABLE, table); - if (rt->rt6i_flags&RTF_REJECT) - rtm->rtm_type = RTN_UNREACHABLE; - else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + if (nla_put_u32(skb, RTA_TABLE, table)) + goto nla_put_failure; + if (rt->rt6i_flags & RTF_REJECT) { + switch (rt->dst.error) { + case -EINVAL: + rtm->rtm_type = RTN_BLACKHOLE; + break; + case -EACCES: + rtm->rtm_type = RTN_PROHIBIT; + break; + case -EAGAIN: + rtm->rtm_type = RTN_THROW; + break; + default: + rtm->rtm_type = RTN_UNREACHABLE; + break; + } + } + else if (rt->rt6i_flags & RTF_LOCAL) + rtm->rtm_type = RTN_LOCAL; + else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) rtm->rtm_type = RTN_LOCAL; else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; - if (rt->rt6i_flags&RTF_DYNAMIC) + if (rt->rt6i_flags & RTF_DYNAMIC) rtm->rtm_protocol = RTPROT_REDIRECT; - else if (rt->rt6i_flags & RTF_ADDRCONF) - rtm->rtm_protocol = RTPROT_KERNEL; - else if (rt->rt6i_flags&RTF_DEFAULT) - rtm->rtm_protocol = RTPROT_RA; + else if (rt->rt6i_flags & RTF_ADDRCONF) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO)) + rtm->rtm_protocol = RTPROT_RA; + else + rtm->rtm_protocol = RTPROT_KERNEL; + } - if (rt->rt6i_flags&RTF_CACHE) + if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dst) { - NLA_PUT(skb, RTA_DST, 16, dst); + if (nla_put(skb, RTA_DST, 16, dst)) + goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); + if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr)) + goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { - NLA_PUT(skb, RTA_SRC, 16, src); + if (nla_put(skb, RTA_SRC, 16, src)) + goto nla_put_failure; rtm->rtm_src_len = 128; - } else if (rtm->rtm_src_len) - NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); + } else if (rtm->rtm_src_len && + nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr)) + goto nla_put_failure; #endif - if (iif) - NLA_PUT_U32(skb, RTA_IIF, iif); - else if (dst) { + if (iif) { +#ifdef CONFIG_IPV6_MROUTE + if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { + int err = ip6mr_get_route(net, skb, rtm, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nla_put_failure; + } else { + if (err == -EMSGSIZE) + goto nla_put_failure; + } + } + } else +#endif + if (nla_put_u32(skb, RTA_IIF, iif)) + goto nla_put_failure; + } else if (dst) { + struct in6_addr saddr_buf; + if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && + nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + goto nla_put_failure; + } + + if (rt->rt6i_prefsrc.plen) { struct in6_addr saddr_buf; - if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0) - NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + saddr_buf = rt->rt6i_prefsrc.addr; + if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + goto nla_put_failure; } - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) goto nla_put_failure; - if (rt->u.dst.neighbour) - NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); + if (rt->rt6i_flags & RTF_GATEWAY) { + if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0) + goto nla_put_failure; + } - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex); + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; + if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) + goto nla_put_failure; - NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); + expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; - expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0; - if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0, - expires, rt->u.dst.error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -2160,62 +2687,78 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) } else prefix = 0; - return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, - prefix, NLM_F_MULTI); + return rt6_fill_node(arg->net, + arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, + prefix, 0, NLM_F_MULTI); } -static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) { - struct net *net = in_skb->sk->sk_net; + struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi fl; - int err, iif = 0; - - if (net != &init_net) - return -EINVAL; + struct flowi6 fl6; + int err, iif = 0, oif = 0; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); if (err < 0) goto errout; err = -EINVAL; - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) goto errout; - ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC])); + fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); } if (tb[RTA_DST]) { if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) goto errout; - ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST])); + fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); } if (tb[RTA_IIF]) iif = nla_get_u32(tb[RTA_IIF]); if (tb[RTA_OIF]) - fl.oif = nla_get_u32(tb[RTA_OIF]); + oif = nla_get_u32(tb[RTA_OIF]); + + if (tb[RTA_MARK]) + fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); if (iif) { struct net_device *dev; - dev = __dev_get_by_index(&init_net, iif); + int flags = 0; + + dev = __dev_get_by_index(net, iif); if (!dev) { err = -ENODEV; goto errout; } + + fl6.flowi6_iif = iif; + + if (!ipv6_addr_any(&fl6.saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6, + flags); + } else { + fl6.flowi6_oif = oif; + + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (skb == NULL) { + if (!skb) { + ip6_rt_put(rt); err = -ENOBUFS; goto errout; } @@ -2226,18 +2769,17 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb_reset_mac_header(skb); skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); - rt = (struct rt6_info*) ip6_route_output(NULL, &fl); - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->dst); - err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).pid, - nlh->nlmsg_seq, 0, 0); + err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0, 0, 0); if (err < 0) { kfree_skb(skb); goto errout; } - err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; } @@ -2245,109 +2787,85 @@ errout: void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) { struct sk_buff *skb; + struct net *net = info->nl_net; u32 seq; int err; err = -ENOBUFS; - seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0; + seq = info->nlh ? info->nlh->nlmsg_seq : 0; skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); - if (skb == NULL) + if (!skb) goto errout; - err = rt6_fill_node(skb, rt, NULL, NULL, 0, - event, info->pid, seq, 0, 0); + err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, + event, info->portid, seq, 0, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, &init_net, info->pid, - RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; errout: if (err < 0) - rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_ROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } -/* - * /proc - */ - -#ifdef CONFIG_PROC_FS - -#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1) - -struct rt6_proc_arg -{ - char *buffer; - int offset; - int length; - int skip; - int len; -}; - -static int rt6_info_route(struct rt6_info *rt, void *p_arg) +static int ip6_route_dev_notify(struct notifier_block *this, + unsigned long event, void *ptr) { - struct seq_file *m = p_arg; - - seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr), - rt->rt6i_dst.plen); + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); -#ifdef CONFIG_IPV6_SUBTREES - seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr), - rt->rt6i_src.plen); -#else - seq_puts(m, "00000000000000000000000000000000 00 "); + if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { + net->ipv6.ip6_null_entry->dst.dev = dev; + net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.ip6_prohibit_entry->dst.dev = dev; + net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); + net->ipv6.ip6_blk_hole_entry->dst.dev = dev; + net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif - - if (rt->rt6i_nexthop) { - seq_printf(m, NIP6_SEQFMT, - NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key))); - } else { - seq_puts(m, "00000000000000000000000000000000"); } - seq_printf(m, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt), - rt->u.dst.__use, rt->rt6i_flags, - rt->rt6i_dev ? rt->rt6i_dev->name : ""); - return 0; -} -static int ipv6_route_show(struct seq_file *m, void *v) -{ - fib6_clean_all(rt6_info_route, 0, m); - return 0; + return NOTIFY_OK; } -static int ipv6_route_open(struct inode *inode, struct file *file) -{ - return single_open(file, ipv6_route_show, NULL); -} +/* + * /proc + */ + +#ifdef CONFIG_PROC_FS static const struct file_operations ipv6_route_proc_fops = { .owner = THIS_MODULE, .open = ipv6_route_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_net, }; static int rt6_stats_seq_show(struct seq_file *seq, void *v) { + struct net *net = (struct net *)seq->private; seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", - rt6_stats.fib_nodes, rt6_stats.fib_route_nodes, - rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries, - rt6_stats.fib_rt_cache, - atomic_read(&ip6_dst_ops.entries), - rt6_stats.fib_discarded_routes); + net->ipv6.rt6_stats->fib_nodes, + net->ipv6.rt6_stats->fib_route_nodes, + net->ipv6.rt6_stats->fib_rt_alloc, + net->ipv6.rt6_stats->fib_rt_entries, + net->ipv6.rt6_stats->fib_rt_cache, + dst_entries_get_slow(&net->ipv6.ip6_dst_ops), + net->ipv6.rt6_stats->fib_discarded_routes); return 0; } static int rt6_stats_seq_open(struct inode *inode, struct file *file) { - return single_open(file, rt6_stats_seq_show, NULL); + return single_open_net(inode, file, rt6_stats_seq_show); } static const struct file_operations rt6_stats_seq_fops = { @@ -2355,218 +2873,364 @@ static const struct file_operations rt6_stats_seq_fops = { .open = rt6_stats_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = single_release_net, }; - -static int ipv6_route_proc_init(struct net *net) -{ - int ret = -ENOMEM; - if (!proc_net_fops_create(net, "ipv6_route", - 0, &ipv6_route_proc_fops)) - goto out; - - if (!proc_net_fops_create(net, "rt6_stats", - S_IRUGO, &rt6_stats_seq_fops)) - goto out_ipv6_route; - - ret = 0; -out: - return ret; -out_ipv6_route: - proc_net_remove(net, "ipv6_route"); - goto out; -} - -static void ipv6_route_proc_fini(struct net *net) -{ - proc_net_remove(net, "ipv6_route"); - proc_net_remove(net, "rt6_stats"); -} -#else -static inline int ipv6_route_proc_init(struct net *net) -{ - return 0; -} -static inline void ipv6_route_proc_fini(struct net *net) -{ - return ; -} #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL static -int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, +int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int delay = init_net.ipv6.sysctl.flush_delay; - if (write) { - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay); - return 0; - } else + struct net *net; + int delay; + if (!write) return -EINVAL; + + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; + proc_dointvec(ctl, write, buffer, lenp, ppos); + fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); + return 0; } -ctl_table ipv6_route_table_template[] = { +struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, .maxlen = sizeof(int), .mode = 0200, - .proc_handler = &ipv6_sysctl_rtcache_flush + .proc_handler = ipv6_sysctl_rtcache_flush }, { - .ctl_name = NET_IPV6_ROUTE_GC_THRESH, .procname = "gc_thresh", - .data = &ip6_dst_ops.gc_thresh, + .data = &ip6_dst_ops_template.gc_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ROUTE_MAX_SIZE, .procname = "max_size", .data = &init_net.ipv6.sysctl.ip6_rt_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL, .procname = "gc_min_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT, .procname = "gc_timeout", .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL, .procname = "gc_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY, .procname = "gc_elasticity", .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES, .procname = "mtu_expires", .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS, .procname = "min_adv_mss", .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, .procname = "gc_min_interval_ms", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_ms_jiffies, - .strategy = &sysctl_ms_jiffies, + .proc_handler = proc_dointvec_ms_jiffies, }, - { .ctl_name = 0 } + { } }; -struct ctl_table *ipv6_route_sysctl_init(struct net *net) +struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_route_table_template, sizeof(ipv6_route_table_template), GFP_KERNEL); + + if (table) { + table[0].data = &net->ipv6.sysctl.flush_delay; + table[0].extra1 = net; + table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; + table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; + table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; + table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; + table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; + table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; + table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; + table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; + } + return table; } #endif +static int __net_init ip6_route_net_init(struct net *net) +{ + int ret = -ENOMEM; + + memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, + sizeof(net->ipv6.ip6_dst_ops)); + + if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) + goto out_ip6_dst_ops; + + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, + sizeof(*net->ipv6.ip6_null_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_null_entry) + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_null_entry; + net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_null_entry->dst, + ip6_template_metrics, true); + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, + sizeof(*net->ipv6.ip6_prohibit_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_prohibit_entry) + goto out_ip6_null_entry; + net->ipv6.ip6_prohibit_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_prohibit_entry; + net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, + ip6_template_metrics, true); + + net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, + sizeof(*net->ipv6.ip6_blk_hole_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_blk_hole_entry) + goto out_ip6_prohibit_entry; + net->ipv6.ip6_blk_hole_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; + net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, + ip6_template_metrics, true); +#endif + + net->ipv6.sysctl.flush_delay = 0; + net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; + net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; + net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; + net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; + net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; + net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + + net->ipv6.ip6_rt_gc_expire = 30*HZ; + + ret = 0; +out: + return ret; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES +out_ip6_prohibit_entry: + kfree(net->ipv6.ip6_prohibit_entry); +out_ip6_null_entry: + kfree(net->ipv6.ip6_null_entry); +#endif +out_ip6_dst_entries: + dst_entries_destroy(&net->ipv6.ip6_dst_ops); +out_ip6_dst_ops: + goto out; +} + +static void __net_exit ip6_route_net_exit(struct net *net) +{ + kfree(net->ipv6.ip6_null_entry); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + kfree(net->ipv6.ip6_prohibit_entry); + kfree(net->ipv6.ip6_blk_hole_entry); +#endif + dst_entries_destroy(&net->ipv6.ip6_dst_ops); +} + +static int __net_init ip6_route_net_init_late(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); + proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); +#endif + return 0; +} + +static void __net_exit ip6_route_net_exit_late(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipv6_route", net->proc_net); + remove_proc_entry("rt6_stats", net->proc_net); +#endif +} + +static struct pernet_operations ip6_route_net_ops = { + .init = ip6_route_net_init, + .exit = ip6_route_net_exit, +}; + +static int __net_init ipv6_inetpeer_init(struct net *net) +{ + struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); + + if (!bp) + return -ENOMEM; + inet_peer_base_init(bp); + net->ipv6.peers = bp; + return 0; +} + +static void __net_exit ipv6_inetpeer_exit(struct net *net) +{ + struct inet_peer_base *bp = net->ipv6.peers; + + net->ipv6.peers = NULL; + inetpeer_invalidate_tree(bp); + kfree(bp); +} + +static struct pernet_operations ipv6_inetpeer_ops = { + .init = ipv6_inetpeer_init, + .exit = ipv6_inetpeer_exit, +}; + +static struct pernet_operations ip6_route_net_late_ops = { + .init = ip6_route_net_init_late, + .exit = ip6_route_net_exit_late, +}; + +static struct notifier_block ip6_route_dev_notifier = { + .notifier_call = ip6_route_dev_notify, + .priority = 0, +}; + int __init ip6_route_init(void) { int ret; - ip6_dst_ops.kmem_cachep = + ret = -ENOMEM; + ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ip6_dst_ops.kmem_cachep) - return -ENOMEM; - - ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep; + if (!ip6_dst_ops_template.kmem_cachep) + goto out; - ret = fib6_init(); + ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; - ret = ipv6_route_proc_init(&init_net); + ret = register_pernet_subsys(&ipv6_inetpeer_ops); if (ret) - goto out_fib6_init; + goto out_dst_entries; + + ret = register_pernet_subsys(&ip6_route_net_ops); + if (ret) + goto out_register_inetpeer; + + ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; + + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif + ret = fib6_init(); + if (ret) + goto out_register_subsys; ret = xfrm6_init(); if (ret) - goto out_proc_init; + goto out_fib6_init; ret = fib6_rules_init(); if (ret) goto xfrm6_init; - ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL)) + ret = register_pernet_subsys(&ip6_route_net_late_ops); + if (ret) goto fib6_rules_init; - ret = 0; + ret = -ENOBUFS; + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || + __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) + goto out_register_late_subsys; + + ret = register_netdevice_notifier(&ip6_route_dev_notifier); + if (ret) + goto out_register_late_subsys; + out: return ret; +out_register_late_subsys: + unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); xfrm6_init: xfrm6_fini(); -out_proc_init: - ipv6_route_proc_fini(&init_net); out_fib6_init: - rt6_ifdown(NULL); fib6_gc_cleanup(); +out_register_subsys: + unregister_pernet_subsys(&ip6_route_net_ops); +out_register_inetpeer: + unregister_pernet_subsys(&ipv6_inetpeer_ops); +out_dst_entries: + dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: - kmem_cache_destroy(ip6_dst_ops.kmem_cachep); + kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; } void ip6_route_cleanup(void) { + unregister_netdevice_notifier(&ip6_route_dev_notifier); + unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); - ipv6_route_proc_fini(&init_net); xfrm6_fini(); - rt6_ifdown(NULL); fib6_gc_cleanup(); - kmem_cache_destroy(ip6_dst_ops.kmem_cachep); + unregister_pernet_subsys(&ipv6_inetpeer_ops); + unregister_pernet_subsys(&ip6_route_net_ops); + dst_entries_destroy(&ip6_dst_blackhole_ops); + kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1656c003b98..4f408176dc6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,8 +6,6 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.53 2001/09/25 05:09:53 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -16,9 +14,11 @@ * Changes: * Roger Venning <r.venning@telstra.com>: 6to4 support * Nate Thompson <nate@thebog.net>: 6to4 support - * Fred L. Templin <fltemplin@acm.org>: isatap support + * Fred Templin <fred.l.templin@boeing.com>: isatap support */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> @@ -30,6 +30,7 @@ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmp.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> @@ -48,10 +49,12 @@ #include <net/ip.h> #include <net/udp.h> #include <net/icmp.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/dsfield.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -62,49 +65,70 @@ #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) -static int ipip6_fb_tunnel_init(struct net_device *dev); +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); + static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); +static void ipip6_dev_free(struct net_device *dev); +static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, + __be32 *v4dst); +static struct rtnl_link_ops sit_link_ops __read_mostly; + +static int sit_net_id __read_mostly; +struct sit_net { + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; + + struct net_device *fb_tunnel_dev; +}; -static struct net_device *ipip6_fb_tunnel_dev; - -static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; -static struct ip_tunnel *tunnels_r[HASH_SIZE]; -static struct ip_tunnel *tunnels_l[HASH_SIZE]; -static struct ip_tunnel *tunnels_wc[1]; -static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; - -static DEFINE_RWLOCK(ipip6_lock); - -static struct ip_tunnel * ipip6_tunnel_lookup(__be32 remote, __be32 local) +/* + * Must be invoked with rcu_read_lock + */ +static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, + struct net_device *dev, __be32 remote, __be32 local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip_tunnel *t; + struct sit_net *sitn = net_generic(net, sit_net_id); - for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && - remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + remote == t->parms.iph.daddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) return t; } - for (t = tunnels_r[h0]; t; t = t->next) { - if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { + if (remote == t->parms.iph.daddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) return t; } - for (t = tunnels_l[h1]; t; t = t->next) { - if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { + if (local == t->parms.iph.saddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) return t; } - if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + t = rcu_dereference(sitn->tunnels_wc[0]); + if ((t != NULL) && (t->dev->flags & IFF_UP)) return t; return NULL; } -static struct ip_tunnel **__ipip6_bucket(struct ip_tunnel_parm *parms) +static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, + struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - unsigned h = 0; + unsigned int h = 0; int prio = 0; if (remote) { @@ -115,49 +139,108 @@ static struct ip_tunnel **__ipip6_bucket(struct ip_tunnel_parm *parms) prio |= 1; h ^= HASH(local); } - return &tunnels[prio][h]; + return &sitn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip6_bucket(struct ip_tunnel *t) +static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, + struct ip_tunnel *t) { - return __ipip6_bucket(&t->parms); + return __ipip6_bucket(sitn, &t->parms); } -static void ipip6_tunnel_unlink(struct ip_tunnel *t) +static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipip6_bucket(t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - write_lock_bh(&ipip6_lock); - *tp = t->next; - write_unlock_bh(&ipip6_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip6_bucket(sitn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } } -static void ipip6_tunnel_link(struct ip_tunnel *t) +static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip6_bucket(t); + struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} - t->next = *tp; - write_lock_bh(&ipip6_lock); - *tp = t; - write_unlock_bh(&ipip6_lock); +static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) +{ +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel *t = netdev_priv(dev); + + if (t->dev == sitn->fb_tunnel_dev) { + ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); + t->ip6rd.relay_prefix = 0; + t->ip6rd.prefixlen = 16; + t->ip6rd.relay_prefixlen = 0; + } else { + struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev); + memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd)); + } +#endif } -static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create) +static int ipip6_tunnel_create(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + int err; + + err = ipip6_tunnel_init(dev); + if (err < 0) + goto out; + ipip6_tunnel_clone_6rd(dev, sitn); + + if ((__force u16)t->parms.i_flags & SIT_ISATAP) + dev->priv_flags |= IFF_ISATAP; + + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(t->parms.name, dev->name); + dev->rtnl_link_ops = &sit_link_ops; + + dev_hold(dev); + + ipip6_tunnel_link(sitn, t); + return 0; + +out: + return err; +} + +static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; + struct sit_net *sitn = net_generic(net, sit_net_id); - for (tp = __ipip6_bucket(parms); (t = *tp) != NULL; tp = &t->next) { - if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) - return t; + for (tp = __ipip6_bucket(sitn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + parms->link == t->parms.link) { + if (create) + return NULL; + else + return t; + } } if (!create) goto failed; @@ -165,61 +248,276 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else - sprintf(name, "sit%%d"); + strcpy(name, "sit%d"); dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); if (dev == NULL) return NULL; - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } + dev_net_set(dev, net); nt = netdev_priv(dev); - dev->init = ipip6_tunnel_init; - nt->parms = *parms; - - if (parms->i_flags & SIT_ISATAP) - dev->priv_flags |= IFF_ISATAP; - if (register_netdevice(dev) < 0) + nt->parms = *parms; + if (ipip6_tunnel_create(dev) < 0) goto failed_free; - dev_hold(dev); - - ipip6_tunnel_link(nt); return nt; failed_free: - free_netdev(dev); + ipip6_dev_free(dev); failed: return NULL; } -static void ipip6_tunnel_uninit(struct net_device *dev) +#define for_each_prl_rcu(start) \ + for (prl = rcu_dereference(start); \ + prl; \ + prl = rcu_dereference(prl->next)) + +static struct ip_tunnel_prl_entry * +__ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) +{ + struct ip_tunnel_prl_entry *prl; + + for_each_prl_rcu(t->prl) + if (prl->addr == addr) + break; + return prl; + +} + +static int ipip6_tunnel_get_prl(struct ip_tunnel *t, + struct ip_tunnel_prl __user *a) +{ + struct ip_tunnel_prl kprl, *kp; + struct ip_tunnel_prl_entry *prl; + unsigned int cmax, c = 0, ca, len; + int ret = 0; + + if (copy_from_user(&kprl, a, sizeof(kprl))) + return -EFAULT; + cmax = kprl.datalen / sizeof(kprl); + if (cmax > 1 && kprl.addr != htonl(INADDR_ANY)) + cmax = 1; + + /* For simple GET or for root users, + * we try harder to allocate. + */ + kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? + kcalloc(cmax, sizeof(*kp), GFP_KERNEL) : + NULL; + + rcu_read_lock(); + + ca = t->prl_count < cmax ? t->prl_count : cmax; + + if (!kp) { + /* We don't try hard to allocate much memory for + * non-root users. + * For root users, retry allocating enough memory for + * the answer. + */ + kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC); + if (!kp) { + ret = -ENOMEM; + goto out; + } + } + + c = 0; + for_each_prl_rcu(t->prl) { + if (c >= cmax) + break; + if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr) + continue; + kp[c].addr = prl->addr; + kp[c].flags = prl->flags; + c++; + if (kprl.addr != htonl(INADDR_ANY)) + break; + } +out: + rcu_read_unlock(); + + len = sizeof(*kp) * c; + ret = 0; + if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen)) + ret = -EFAULT; + + kfree(kp); + + return ret; +} + +static int +ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) { - if (dev == ipip6_fb_tunnel_dev) { - write_lock_bh(&ipip6_lock); - tunnels_wc[0] = NULL; - write_unlock_bh(&ipip6_lock); - dev_put(dev); + struct ip_tunnel_prl_entry *p; + int err = 0; + + if (a->addr == htonl(INADDR_ANY)) + return -EINVAL; + + ASSERT_RTNL(); + + for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { + if (p->addr == a->addr) { + if (chg) { + p->flags = a->flags; + goto out; + } + err = -EEXIST; + goto out; + } + } + + if (chg) { + err = -ENXIO; + goto out; + } + + p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL); + if (!p) { + err = -ENOBUFS; + goto out; + } + + p->next = t->prl; + p->addr = a->addr; + p->flags = a->flags; + t->prl_count++; + rcu_assign_pointer(t->prl, p); +out: + return err; +} + +static void prl_list_destroy_rcu(struct rcu_head *head) +{ + struct ip_tunnel_prl_entry *p, *n; + + p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); + do { + n = rcu_dereference_protected(p->next, 1); + kfree(p); + p = n; + } while (p); +} + +static int +ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) +{ + struct ip_tunnel_prl_entry *x; + struct ip_tunnel_prl_entry __rcu **p; + int err = 0; + + ASSERT_RTNL(); + + if (a && a->addr != htonl(INADDR_ANY)) { + for (p = &t->prl; + (x = rtnl_dereference(*p)) != NULL; + p = &x->next) { + if (x->addr == a->addr) { + *p = x->next; + kfree_rcu(x, rcu_head); + t->prl_count--; + goto out; + } + } + err = -ENXIO; } else { - ipip6_tunnel_unlink(netdev_priv(dev)); - dev_put(dev); + x = rtnl_dereference(t->prl); + if (x) { + t->prl_count = 0; + call_rcu(&x->rcu_head, prl_list_destroy_rcu); + t->prl = NULL; + } } +out: + return err; } +static int +isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) +{ + struct ip_tunnel_prl_entry *p; + int ok = 1; + + rcu_read_lock(); + p = __ipip6_tunnel_locate_prl(t, iph->saddr); + if (p) { + if (p->flags & PRL_DEFAULT) + skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT; + else + skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT; + } else { + const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; -static int ipip6_err(struct sk_buff *skb, u32 info) + if (ipv6_addr_is_isatap(addr6) && + (addr6->s6_addr32[3] == iph->saddr) && + ipv6_chk_prefix(addr6, t->dev)) + skb->ndisc_nodetype = NDISC_NODETYPE_HOST; + else + ok = 0; + } + rcu_read_unlock(); + return ok; +} + +static void ipip6_tunnel_uninit(struct net_device *dev) { -#ifndef I_WISH_WORLD_WERE_PERFECT + struct ip_tunnel *tunnel = netdev_priv(dev); + struct sit_net *sitn = net_generic(tunnel->net, sit_net_id); + + if (dev == sitn->fb_tunnel_dev) { + RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL); + } else { + ipip6_tunnel_unlink(sitn, tunnel); + ipip6_tunnel_del_prl(tunnel, NULL); + } + ip_tunnel_dst_reset_all(tunnel); + dev_put(dev); +} -/* It is not :-( All the routers (except for Linux) return only - 8 bytes of packet payload. It means, that precise relaying of - ICMP in the real Internet is absolutely infeasible. +/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH + * if sufficient data bytes are available */ - struct iphdr *iph = (struct iphdr*)skb->data; +static int ipip6_err_gen_icmpv6_unreach(struct sk_buff *skb) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct rt6_info *rt; + struct sk_buff *skb2; + + if (!pskb_may_pull(skb, iph->ihl * 4 + sizeof(struct ipv6hdr) + 8)) + return 1; + + skb2 = skb_clone(skb, GFP_ATOMIC); + + if (!skb2) + return 1; + + skb_dst_drop(skb2); + skb_pull(skb2, iph->ihl * 4); + skb_reset_network_header(skb2); + + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + + if (rt && rt->dst.dev) + skb2->dev = rt->dst.dev; + + icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); + + if (rt) + ip6_rt_put(rt); + + kfree_skb(skb2); + + return 0; +} + +static int ipip6_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; @@ -233,12 +531,8 @@ static int ipip6_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; - case ICMP_FRAG_NEEDED: - /* Soft state for pmtu is maintained by IP core. */ - return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -251,218 +545,256 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return 0; break; + case ICMP_REDIRECT: + break; } err = -ENOENT; - read_lock(&ipip6_lock); - t = ipip6_tunnel_lookup(iph->daddr, iph->saddr); - if (t == NULL || t->parms.iph.daddr == 0) + t = ipip6_tunnel_lookup(dev_net(skb->dev), + skb->dev, + iph->daddr, + iph->saddr); + if (t == NULL) + goto out; + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + t->parms.link, 0, IPPROTO_IPV6, 0); + err = 0; + goto out; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, + IPPROTO_IPV6, 0); + err = 0; + goto out; + } + + if (t->parms.iph.daddr == 0) goto out; err = 0; + if (!ipip6_err_gen_icmpv6_unreach(skb)) + goto out; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; - if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: - read_unlock(&ipip6_lock); return err; -#else - struct iphdr *iph = (struct iphdr*)dp; - int hlen = iph->ihl<<2; - struct ipv6hdr *iph6; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - int rel_type = 0; - int rel_code = 0; - int rel_info = 0; - struct sk_buff *skb2; - struct rt6_info *rt6i; - - if (len < hlen + sizeof(struct ipv6hdr)) - return; - iph6 = (struct ipv6hdr*)(dp + hlen); - - switch (type) { - default: - return; - case ICMP_PARAMETERPROB: - if (icmp_hdr(skb)->un.gateway < hlen) - return; - - /* So... This guy found something strange INSIDE encapsulated - packet. Well, he is fool, but what can we do ? - */ - rel_type = ICMPV6_PARAMPROB; - rel_info = icmp_hdr(skb)->un.gateway - hlen; - break; - - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: - /* Impossible event. */ - return; - case ICMP_FRAG_NEEDED: - /* Too complicated case ... */ - return; - default: - /* All others are translated to HOST_UNREACH. - rfc2003 contains "deep thoughts" about NET_UNREACH, - I believe, it is just ether pollution. --ANK - */ - rel_type = ICMPV6_DEST_UNREACH; - rel_code = ICMPV6_ADDR_UNREACH; - break; - } - break; - case ICMP_TIME_EXCEEDED: - if (code != ICMP_EXC_TTL) - return; - rel_type = ICMPV6_TIME_EXCEED; - rel_code = ICMPV6_EXC_HOPLIMIT; - break; - } - - /* Prepare fake skb to feed it to icmpv6_send */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL) - return 0; - dst_release(skb2->dst); - skb2->dst = NULL; - skb_pull(skb2, skb->data - (u8*)iph6); - skb_reset_network_header(skb2); +} - /* Try to guess incoming interface */ - rt6i = rt6_lookup(&iph6->saddr, NULL, NULL, 0); - if (rt6i && rt6i->rt6i_dev) { - skb2->dev = rt6i->rt6i_dev; +static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, + const struct in6_addr *v6addr) +{ + __be32 v4embed = 0; + if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed) + return true; + return false; +} - rt6i = rt6_lookup(&iph6->daddr, &iph6->saddr, NULL, 0); +/* Checks if an address matches an address on the tunnel interface. + * Used to detect the NAT of proto 41 packets and let them pass spoofing test. + * Long story: + * This function is called after we considered the packet as spoofed + * in is_spoofed_6rd. + * We may have a router that is doing NAT for proto 41 packets + * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb + * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd + * function will return true, dropping the packet. + * But, we can still check if is spoofed against the IP + * addresses associated with the interface. + */ +static bool only_dnatted(const struct ip_tunnel *tunnel, + const struct in6_addr *v6dst) +{ + int prefix_len; - if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) { - struct ip_tunnel *t = netdev_priv(rt6i->rt6i_dev); - if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) { - rel_type = ICMPV6_DEST_UNREACH; - rel_code = ICMPV6_ADDR_UNREACH; - } - icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); - } - } - kfree_skb(skb2); - return 0; +#ifdef CONFIG_IPV6_SIT_6RD + prefix_len = tunnel->ip6rd.prefixlen + 32 + - tunnel->ip6rd.relay_prefixlen; +#else + prefix_len = 48; #endif + return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev); } -static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +/* Returns true if a packet is spoofed */ +static bool packet_is_spoofed(struct sk_buff *skb, + const struct iphdr *iph, + struct ip_tunnel *tunnel) { - if (INET_ECN_is_ce(iph->tos)) - IP6_ECN_set_ce(ipv6_hdr(skb)); -} + const struct ipv6hdr *ipv6h; -/* ISATAP (RFC4214) - check source address */ -static int -isatap_srcok(struct sk_buff *skb, struct iphdr *iph, struct net_device *dev) -{ - struct neighbour *neigh; - struct dst_entry *dst; - struct rt6_info *rt; - struct flowi fl; - struct in6_addr *addr6; - struct in6_addr rtr; - struct ipv6hdr *iph6; - int ok = 0; - - /* from onlink default router */ - ipv6_addr_set(&rtr, htonl(0xFE800000), 0, 0, 0); - ipv6_isatap_eui64(rtr.s6_addr + 8, iph->saddr); - if ((rt = rt6_get_dflt_router(&rtr, dev))) { - dst_release(&rt->u.dst); - return 1; - } + if (tunnel->dev->priv_flags & IFF_ISATAP) { + if (!isatap_chksrc(skb, iph, tunnel)) + return true; - iph6 = ipv6_hdr(skb); - memset(&fl, 0, sizeof(fl)); - fl.proto = iph6->nexthdr; - ipv6_addr_copy(&fl.fl6_dst, &iph6->saddr); - fl.oif = dev->ifindex; - security_skb_classify_flow(skb, &fl); + return false; + } - dst = ip6_route_output(NULL, &fl); - if (!dst->error && (dst->dev == dev) && (neigh = dst->neighbour)) { + if (tunnel->dev->flags & IFF_POINTOPOINT) + return false; - addr6 = (struct in6_addr*)&neigh->primary_key; + ipv6h = ipv6_hdr(skb); - /* from correct previous hop */ - if (ipv6_addr_is_isatap(addr6) && - (addr6->s6_addr32[3] == iph->saddr)) - ok = 1; + if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) { + net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; } - dst_release(dst); - return ok; + + if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr))) + return false; + + if (only_dnatted(tunnel, &ipv6h->daddr)) + return false; + + net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; } static int ipip6_rcv(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; + int err; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto out; + tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_sw_netstats *tstats; - iph = ip_hdr(skb); + if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && + tunnel->parms.iph.protocol != 0) + goto out; - read_lock(&ipip6_lock); - if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; - if ((tunnel->dev->priv_flags & IFF_ISATAP) && - !isatap_srcok(skb, iph, tunnel->dev)) { - tunnel->stat.rx_errors++; - read_unlock(&ipip6_lock); - kfree_skb(skb); - return 0; + if (packet_is_spoofed(skb, iph, tunnel)) { + tunnel->dev->stats.rx_errors++; + goto out; + } + + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + + err = IP_ECN_decapsulate(iph, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &iph->saddr, iph->tos); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto out; + } } - tunnel->stat.rx_packets++; - tunnel->stat.rx_bytes += skb->len; - skb->dev = tunnel->dev; - dst_release(skb->dst); - skb->dst = NULL; - nf_reset(skb); - ipip6_ecn_decapsulate(iph, skb); + + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + netif_rx(skb); - read_unlock(&ipip6_lock); + return 0; } - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - kfree_skb(skb); - read_unlock(&ipip6_lock); + /* no tunnel matched, let upstream know, ipsec may handle it */ + return 1; out: + kfree_skb(skb); return 0; } -/* Returns the embedded IPv4 address if the IPv6 address - comes from 6to4 (RFC 3056) addr space */ +static const struct tnl_ptk_info tpi = { + /* no tunnel info required for ipip. */ + .proto = htons(ETH_P_IP), +}; -static inline __be32 try_6to4(struct in6_addr *v6dst) +static int ipip_rcv(struct sk_buff *skb) { - __be32 dst = 0; + const struct iphdr *iph; + struct ip_tunnel *tunnel; + + iph = ip_hdr(skb); + tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->saddr, iph->daddr); + if (tunnel != NULL) { + if (tunnel->parms.iph.protocol != IPPROTO_IPIP && + tunnel->parms.iph.protocol != 0) + goto drop; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; + return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + } + + return 1; + +drop: + kfree_skb(skb); + return 0; +} +/* + * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function + * stores the embedded IPv4 address in v4dst and returns true. + */ +static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, + __be32 *v4dst) +{ +#ifdef CONFIG_IPV6_SIT_6RD + if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, + tunnel->ip6rd.prefixlen)) { + unsigned int pbw0, pbi0; + int pbi1; + u32 d; + + pbw0 = tunnel->ip6rd.prefixlen >> 5; + pbi0 = tunnel->ip6rd.prefixlen & 0x1f; + + d = (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> + tunnel->ip6rd.relay_prefixlen; + + pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen; + if (pbi1 > 0) + d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> + (32 - pbi1); + + *v4dst = tunnel->ip6rd.relay_prefix | htonl(d); + return true; + } +#else if (v6dst->s6_addr16[0] == htons(0x2002)) { /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ - memcpy(&dst, &v6dst->s6_addr16[1], 4); + memcpy(v4dst, &v6dst->s6_addr16[1], 4); + return true; } +#endif + return false; +} + +static inline __be32 try_6rd(struct ip_tunnel *tunnel, + const struct in6_addr *v6dst) +{ + __be32 dst = 0; + check_6rd(tunnel, v6dst, &dst); return dst; } @@ -471,69 +803,74 @@ static inline __be32 try_6to4(struct in6_addr *v6dst) * and that skb is filled properly by that function. */ -static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->stat; - struct iphdr *tiph = &tunnel->parms.iph; - struct ipv6hdr *iph6 = ipv6_hdr(skb); + const struct iphdr *tiph = &tunnel->parms.iph; + const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; + __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *iph; /* Our new IP header */ + struct net_device *tdev; /* Device to other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; + struct flowi4 fl4; int mtu; - struct in6_addr *addr6; + const struct in6_addr *addr6; int addr_type; - - if (tunnel->recursion++) { - tunnel->stat.collisions++; - goto tx_error; - } + u8 ttl; + int err; if (skb->protocol != htons(ETH_P_IPV6)) goto tx_error; + if (tos == 1) + tos = ipv6_get_dsfield(iph6); + /* ISATAP (RFC4214) - must come before 6to4 */ if (dev->priv_flags & IFF_ISATAP) { struct neighbour *neigh = NULL; + bool do_tx_error = false; - if (skb->dst) - neigh = skb->dst->neighbour; + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (neigh == NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "sit: nexthop == NULL\n"); + net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } - addr6 = (struct in6_addr*)&neigh->primary_key; + addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if ((addr_type & IPV6_ADDR_UNICAST) && ipv6_addr_is_isatap(addr6)) dst = addr6->s6_addr32[3]; else + do_tx_error = true; + + neigh_release(neigh); + if (do_tx_error) goto tx_error; } if (!dst) - dst = try_6to4(&iph6->daddr); + dst = try_6rd(tunnel, &iph6->daddr); if (!dst) { struct neighbour *neigh = NULL; + bool do_tx_error = false; - if (skb->dst) - neigh = skb->dst->neighbour; + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (neigh == NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "sit: nexthop == NULL\n"); + net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } - addr6 = (struct in6_addr*)&neigh->primary_key; + addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { @@ -541,60 +878,65 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) addr_type = ipv6_addr_type(addr6); } - if ((addr_type & IPV6_ADDR_COMPATv4) == 0) - goto tx_error_icmp; + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) + dst = addr6->s6_addr32[3]; + else + do_tx_error = true; - dst = addr6->s6_addr32[3]; + neigh_release(neigh); + if (do_tx_error) + goto tx_error; } - { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .oif = tunnel->parms.link, - .proto = IPPROTO_IPV6 }; - if (ip_route_output_key(&init_net, &rt, &fl)) { - tunnel->stat.tx_carrier_errors++; - goto tx_error_icmp; - } + rt = ip_route_output_ports(tunnel->net, &fl4, NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPV6, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; } if (rt->rt_type != RTN_UNICAST) { ip_rt_put(rt); - tunnel->stat.tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); - tunnel->stat.collisions++; + dev->stats.collisions++; goto tx_error; } - if (tiph->frag_off) - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - else - mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + if (df) { + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); - if (mtu < 68) { - tunnel->stat.collisions++; - ip_rt_put(rt); - goto tx_error; - } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - if (tunnel->parms.iph.daddr && skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); + if (mtu < 68) { + dev->stats.collisions++; + ip_rt_put(rt); + goto tx_error; + } - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); - ip_rt_put(rt); - goto tx_error; + if (mtu < IPV6_MIN_MTU) { + mtu = IPV6_MIN_MTU; + df = 0; + } + + if (tunnel->parms.iph.daddr && skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + + if (skb->len > mtu && !skb_is_gso(skb)) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ip_rt_put(rt); + goto tx_error; + } } if (tunnel->err_count > 0) { - if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + if (time_before(jiffies, + tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else @@ -611,10 +953,9 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - stats->tx_dropped++; - dev_kfree_skb(skb); - tunnel->recursion--; - return 0; + dev->stats.tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); @@ -622,76 +963,99 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb = new_skb; iph6 = ipv6_hdr(skb); } + ttl = tiph->ttl; + if (ttl == 0) + ttl = iph6->hop_limit; + tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - skb->transport_header = skb->network_header; - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags = 0; - dst_release(skb->dst); - skb->dst = &rt->u.dst; + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + if (IS_ERR(skb)) { + ip_rt_put(rt); + goto out; + } - /* - * Push down and install the IPIP header. - */ + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, + IPPROTO_IPV6, tos, ttl, df, + !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return NETDEV_TX_OK; - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - if (mtu > IPV6_MIN_MTU) - iph->frag_off = htons(IP_DF); - else - iph->frag_off = 0; +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); +out: + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} - iph->protocol = IPPROTO_IPV6; - iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *tiph = &tunnel->parms.iph; - if ((iph->ttl = tiph->ttl) == 0) - iph->ttl = iph6->hop_limit; + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + if (IS_ERR(skb)) + goto out; - nf_reset(skb); + ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); + return NETDEV_TX_OK; +out: + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} - IPTUNNEL_XMIT(); - tunnel->recursion--; - return 0; +static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + ipip_tunnel_xmit(skb, dev); + break; + case htons(ETH_P_IPV6): + ipip6_tunnel_xmit(skb, dev); + break; + default: + goto tx_err; + } + + return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; -tx_error_icmp: - dst_link_failure(skb); -tx_error: - stats->tx_errors++; - dev_kfree_skb(skb); - tunnel->recursion--; - return 0; } static void ipip6_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; + const struct iphdr *iph; + struct flowi4 fl4; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .oif = tunnel->parms.link, - .proto = IPPROTO_IPV6 }; - struct rtable *rt; - if (!ip_route_output_key(&init_net, &rt, &fl)) { - tdev = rt->u.dst.dev; + struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4, + NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPV6, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); @@ -702,34 +1066,113 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) dev->iflink = tunnel->parms.link; } +static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) +{ + struct net *net = t->net; + struct sit_net *sitn = net_generic(net, sit_net_id); + + ipip6_tunnel_unlink(sitn, t); + synchronize_net(); + t->parms.iph.saddr = p->iph.saddr; + t->parms.iph.daddr = p->iph.daddr; + memcpy(t->dev->dev_addr, &p->iph.saddr, 4); + memcpy(t->dev->broadcast, &p->iph.daddr, 4); + ipip6_tunnel_link(sitn, t); + t->parms.iph.ttl = p->iph.ttl; + t->parms.iph.tos = p->iph.tos; + if (t->parms.link != p->link) { + t->parms.link = p->link; + ipip6_tunnel_bind_dev(t->dev); + } + ip_tunnel_dst_reset_all(t); + netdev_state_change(t->dev); +} + +#ifdef CONFIG_IPV6_SIT_6RD +static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, + struct ip_tunnel_6rd *ip6rd) +{ + struct in6_addr prefix; + __be32 relay_prefix; + + if (ip6rd->relay_prefixlen > 32 || + ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64) + return -EINVAL; + + ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen); + if (!ipv6_addr_equal(&prefix, &ip6rd->prefix)) + return -EINVAL; + if (ip6rd->relay_prefixlen) + relay_prefix = ip6rd->relay_prefix & + htonl(0xffffffffUL << + (32 - ip6rd->relay_prefixlen)); + else + relay_prefix = 0; + if (relay_prefix != ip6rd->relay_prefix) + return -EINVAL; + + t->ip6rd.prefix = prefix; + t->ip6rd.relay_prefix = relay_prefix; + t->ip6rd.prefixlen = ip6rd->prefixlen; + t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; + ip_tunnel_dst_reset_all(t); + netdev_state_change(t->dev); + return 0; +} +#endif + static int ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; - struct ip_tunnel *t; + struct ip_tunnel_prl prl; + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; + struct sit_net *sitn = net_generic(net, sit_net_id); +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel_6rd ip6rd; +#endif switch (cmd) { case SIOCGETTUNNEL: - t = NULL; - if (dev == ipip6_fb_tunnel_dev) { +#ifdef CONFIG_IPV6_SIT_6RD + case SIOCGET6RD: +#endif + if (dev == sitn->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } - t = ipip6_tunnel_locate(&p, 0); + t = ipip6_tunnel_locate(net, &p, 0); + if (t == NULL) + t = netdev_priv(dev); } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; + + err = -EFAULT; + if (cmd == SIOCGETTUNNEL) { + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, + sizeof(p))) + goto done; +#ifdef CONFIG_IPV6_SIT_6RD + } else { + ip6rd.prefix = t->ip6rd.prefix; + ip6rd.relay_prefix = t->ip6rd.relay_prefix; + ip6rd.prefixlen = t->ip6rd.prefixlen; + ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, + sizeof(ip6rd))) + goto done; +#endif + } + err = 0; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; @@ -737,15 +1180,19 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) goto done; err = -EINVAL; - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + if (p.iph.protocol != IPPROTO_IPV6 && + p.iph.protocol != IPPROTO_IPIP && + p.iph.protocol != 0) + goto done; + if (p.iph.version != 4 || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) goto done; if (p.iph.ttl) p.iph.frag_off |= htons(IP_DF); - t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - if (dev != ipip6_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { err = -EEXIST; @@ -758,27 +1205,13 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } t = netdev_priv(dev); - ipip6_tunnel_unlink(t); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - ipip6_tunnel_link(t); - netdev_state_change(dev); } + + ipip6_tunnel_update(t, &p); } if (t) { err = 0; - if (cmd == SIOCCHGTUNNEL) { - t->parms.iph.ttl = p.iph.ttl; - t->parms.iph.tos = p.iph.tos; - if (t->parms.link != p.link) { - t->parms.link = p.link; - ipip6_tunnel_bind_dev(dev); - netdev_state_change(dev); - } - } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; } else @@ -787,18 +1220,18 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; - if (dev == ipip6_fb_tunnel_dev) { + if (dev == sitn->fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; err = -ENOENT; - if ((t = ipip6_tunnel_locate(&p, 0)) == NULL) + if ((t = ipip6_tunnel_locate(net, &p, 0)) == NULL) goto done; err = -EPERM; - if (t == netdev_priv(ipip6_fb_tunnel_dev)) + if (t == netdev_priv(sitn->fb_tunnel_dev)) goto done; dev = t->dev; } @@ -806,6 +1239,63 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = 0; break; + case SIOCGETPRL: + err = -EINVAL; + if (dev == sitn->fb_tunnel_dev) + goto done; + err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); + break; + + case SIOCADDPRL: + case SIOCDELPRL: + case SIOCCHGPRL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + err = -EINVAL; + if (dev == sitn->fb_tunnel_dev) + goto done; + err = -EFAULT; + if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + goto done; + + switch (cmd) { + case SIOCDELPRL: + err = ipip6_tunnel_del_prl(t, &prl); + break; + case SIOCADDPRL: + case SIOCCHGPRL: + err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); + break; + } + ip_tunnel_dst_reset_all(t); + netdev_state_change(dev); + break; + +#ifdef CONFIG_IPV6_SIT_6RD + case SIOCADD6RD: + case SIOCCHG6RD: + case SIOCDEL6RD: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, + sizeof(ip6rd))) + goto done; + + if (cmd != SIOCDEL6RD) { + err = ipip6_tunnel_update_6rd(t, &ip6rd); + if (err < 0) + goto done; + } else + ipip6_tunnel_clone_6rd(dev, sitn); + + err = 0; + break; +#endif + default: err = -EINVAL; } @@ -814,11 +1304,6 @@ done: return err; } -static struct net_device_stats *ipip6_tunnel_get_stats(struct net_device *dev) -{ - return &(((struct ip_tunnel*)netdev_priv(dev))->stat); -} - static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) @@ -827,46 +1312,79 @@ static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) return 0; } +static const struct net_device_ops ipip6_netdev_ops = { + .ndo_uninit = ipip6_tunnel_uninit, + .ndo_start_xmit = sit_tunnel_xmit, + .ndo_do_ioctl = ipip6_tunnel_ioctl, + .ndo_change_mtu = ipip6_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +static void ipip6_dev_free(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + free_percpu(tunnel->dst_cache); + free_percpu(dev->tstats); + free_netdev(dev); +} + +#define SIT_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) + static void ipip6_tunnel_setup(struct net_device *dev) { - dev->uninit = ipip6_tunnel_uninit; - dev->destructor = free_netdev; - dev->hard_start_xmit = ipip6_tunnel_xmit; - dev->get_stats = ipip6_tunnel_get_stats; - dev->do_ioctl = ipip6_tunnel_ioctl; - dev->change_mtu = ipip6_tunnel_change_mtu; + dev->netdev_ops = &ipip6_netdev_ops; + dev->destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); dev->flags = IFF_NOARP; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->iflink = 0; dev->addr_len = 4; + dev->features |= NETIF_F_LLTX; + dev->features |= SIT_FEATURES; + dev->hw_features |= SIT_FEATURES; } static int ipip6_tunnel_init(struct net_device *dev) { - struct ip_tunnel *tunnel; - - tunnel = netdev_priv(dev); + struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + tunnel->net = dev_net(dev); memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip6_tunnel_bind_dev(dev); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; + } return 0; } -static int __init ipip6_fb_tunnel_init(struct net_device *dev) +static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); iph->version = 4; @@ -874,74 +1392,441 @@ static int __init ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; + } + dev_hold(dev); - tunnels_wc[0] = tunnel; + rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); + return 0; +} + +static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + u8 proto; + + if (!data || !data[IFLA_IPTUN_PROTO]) + return 0; + + proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (proto != IPPROTO_IPV6 && + proto != IPPROTO_IPIP && + proto != 0) + return -EINVAL; + + return 0; +} + +static void ipip6_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + parms->iph.version = 4; + parms->iph.protocol = IPPROTO_IPV6; + parms->iph.ihl = 5; + parms->iph.ttl = 64; + + if (!data) + return; + + if (data[IFLA_IPTUN_LINK]) + parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); + + if (data[IFLA_IPTUN_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); + + if (data[IFLA_IPTUN_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); + + if (data[IFLA_IPTUN_TTL]) { + parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); + if (parms->iph.ttl) + parms->iph.frag_off = htons(IP_DF); + } + + if (data[IFLA_IPTUN_TOS]) + parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); + + if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) + parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_IPTUN_FLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + + if (data[IFLA_IPTUN_PROTO]) + parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + +} + +#ifdef CONFIG_IPV6_SIT_6RD +/* This function returns true when 6RD attributes are present in the nl msg */ +static bool ipip6_netlink_6rd_parms(struct nlattr *data[], + struct ip_tunnel_6rd *ip6rd) +{ + bool ret = false; + memset(ip6rd, 0, sizeof(*ip6rd)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_6RD_PREFIX]) { + ret = true; + nla_memcpy(&ip6rd->prefix, data[IFLA_IPTUN_6RD_PREFIX], + sizeof(struct in6_addr)); + } + + if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { + ret = true; + ip6rd->relay_prefix = + nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]); + } + + if (data[IFLA_IPTUN_6RD_PREFIXLEN]) { + ret = true; + ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]); + } + + if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) { + ret = true; + ip6rd->relay_prefixlen = + nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]); + } + + return ret; +} +#endif + +static int ipip6_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net *net = dev_net(dev); + struct ip_tunnel *nt; +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel_6rd ip6rd; +#endif + int err; + + nt = netdev_priv(dev); + ipip6_netlink_parms(data, &nt->parms); + + if (ipip6_tunnel_locate(net, &nt->parms, 0)) + return -EEXIST; + + err = ipip6_tunnel_create(dev); + if (err < 0) + return err; + +#ifdef CONFIG_IPV6_SIT_6RD + if (ipip6_netlink_6rd_parms(data, &ip6rd)) + err = ipip6_tunnel_update_6rd(nt, &ip6rd); +#endif + + return err; +} + +static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm p; + struct net *net = t->net; + struct sit_net *sitn = net_generic(net, sit_net_id); +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel_6rd ip6rd; +#endif + + if (dev == sitn->fb_tunnel_dev) + return -EINVAL; + + ipip6_netlink_parms(data, &p); + + if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) + return -EINVAL; + + t = ipip6_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else + t = netdev_priv(dev); + + ipip6_tunnel_update(t, &p); + +#ifdef CONFIG_IPV6_SIT_6RD + if (ipip6_netlink_6rd_parms(data, &ip6rd)) + return ipip6_tunnel_update_6rd(t, &ip6rd); +#endif + return 0; } -static struct xfrm_tunnel sit_handler = { +static size_t ipip6_get_size(const struct net_device *dev) +{ + return + /* IFLA_IPTUN_LINK */ + nla_total_size(4) + + /* IFLA_IPTUN_LOCAL */ + nla_total_size(4) + + /* IFLA_IPTUN_REMOTE */ + nla_total_size(4) + + /* IFLA_IPTUN_TTL */ + nla_total_size(1) + + /* IFLA_IPTUN_TOS */ + nla_total_size(1) + + /* IFLA_IPTUN_PMTUDISC */ + nla_total_size(1) + + /* IFLA_IPTUN_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_PROTO */ + nla_total_size(1) + +#ifdef CONFIG_IPV6_SIT_6RD + /* IFLA_IPTUN_6RD_PREFIX */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_IPTUN_6RD_RELAY_PREFIX */ + nla_total_size(4) + + /* IFLA_IPTUN_6RD_PREFIXLEN */ + nla_total_size(2) + + /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ + nla_total_size(2) + +#endif + 0; +} + +static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_parm *parm = &tunnel->parms; + + if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || + nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || + nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || + nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || + nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || + nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, + !!(parm->iph.frag_off & htons(IP_DF))) || + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || + nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags)) + goto nla_put_failure; + +#ifdef CONFIG_IPV6_SIT_6RD + if (nla_put(skb, IFLA_IPTUN_6RD_PREFIX, sizeof(struct in6_addr), + &tunnel->ip6rd.prefix) || + nla_put_be32(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, + tunnel->ip6rd.relay_prefix) || + nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN, + tunnel->ip6rd.prefixlen) || + nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, + tunnel->ip6rd.relay_prefixlen)) + goto nla_put_failure; +#endif + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { + [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, + [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, + [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, + [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, + [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, + [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, +#ifdef CONFIG_IPV6_SIT_6RD + [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) }, + [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 }, + [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, + [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, +#endif +}; + +static void ipip6_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + + if (dev != sitn->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + +static struct rtnl_link_ops sit_link_ops __read_mostly = { + .kind = "sit", + .maxtype = IFLA_IPTUN_MAX, + .policy = ipip6_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipip6_tunnel_setup, + .validate = ipip6_validate, + .newlink = ipip6_newlink, + .changelink = ipip6_changelink, + .get_size = ipip6_get_size, + .fill_info = ipip6_fill_info, + .dellink = ipip6_dellink, +}; + +static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, }; -static void __exit sit_destroy_tunnels(void) +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = ipip_rcv, + .err_handler = ipip6_err, + .priority = 2, +}; + +static void __net_exit sit_destroy_tunnels(struct net *net, + struct list_head *head) { + struct sit_net *sitn = net_generic(net, sit_net_id); + struct net_device *dev, *aux; int prio; + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &sit_link_ops) + unregister_netdevice_queue(dev, head); + for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { struct ip_tunnel *t; - while ((t = tunnels[prio][h]) != NULL) - unregister_netdevice(t->dev); + + t = rtnl_dereference(sitn->tunnels[prio][h]); + while (t != NULL) { + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, + head); + t = rtnl_dereference(t->next); + } } } } -static void __exit sit_cleanup(void) +static int __net_init sit_init_net(struct net *net) { - xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + struct sit_net *sitn = net_generic(net, sit_net_id); + struct ip_tunnel *t; + int err; + + sitn->tunnels[0] = sitn->tunnels_wc; + sitn->tunnels[1] = sitn->tunnels_l; + sitn->tunnels[2] = sitn->tunnels_r; + sitn->tunnels[3] = sitn->tunnels_r_l; + + sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + ipip6_tunnel_setup); + if (!sitn->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(sitn->fb_tunnel_dev, net); + sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops; + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + + err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); + if (err) + goto err_dev_free; + + ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); + + if ((err = register_netdev(sitn->fb_tunnel_dev))) + goto err_reg_dev; + + t = netdev_priv(sitn->fb_tunnel_dev); + + strcpy(t->parms.name, sitn->fb_tunnel_dev->name); + return 0; + +err_reg_dev: + dev_put(sitn->fb_tunnel_dev); +err_dev_free: + ipip6_dev_free(sitn->fb_tunnel_dev); +err_alloc_dev: + return err; +} + +static void __net_exit sit_exit_net(struct net *net) +{ + LIST_HEAD(list); rtnl_lock(); - sit_destroy_tunnels(); - unregister_netdevice(ipip6_fb_tunnel_dev); + sit_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } +static struct pernet_operations sit_net_ops = { + .init = sit_init_net, + .exit = sit_exit_net, + .id = &sit_net_id, + .size = sizeof(struct sit_net), +}; + +static void __exit sit_cleanup(void) +{ + rtnl_link_unregister(&sit_link_ops); + xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); + + unregister_pernet_device(&sit_net_ops); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + static int __init sit_init(void) { int err; - printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + pr_info("IPv6 over IPv4 tunneling driver\n"); - if (xfrm4_tunnel_register(&sit_handler, AF_INET6) < 0) { - printk(KERN_INFO "sit init: Can't add protocol\n"); - return -EAGAIN; + err = register_pernet_device(&sit_net_ops); + if (err < 0) + return err; + err = xfrm4_tunnel_register(&sit_handler, AF_INET6); + if (err < 0) { + pr_info("%s: can't register ip6ip4\n", __func__); + goto xfrm_tunnel_failed; } - - ipip6_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", - ipip6_tunnel_setup); - if (!ipip6_fb_tunnel_dev) { - err = -ENOMEM; - goto err1; + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + pr_info("%s: can't register ip4ip4\n", __func__); + goto xfrm_tunnel4_failed; } + err = rtnl_link_register(&sit_link_ops); + if (err < 0) + goto rtnl_link_failed; - ipip6_fb_tunnel_dev->init = ipip6_fb_tunnel_init; - - if ((err = register_netdev(ipip6_fb_tunnel_dev))) - goto err2; - - out: +out: return err; - err2: - free_netdev(ipip6_fb_tunnel_dev); - err1: + +rtnl_link_failed: + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); +xfrm_tunnel_failed: + unregister_pernet_device(&sit_net_ops); goto out; } module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); -MODULE_ALIAS("sit0"); +MODULE_ALIAS_RTNL_LINK("sit"); +MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c new file mode 100644 index 00000000000..a822b880689 --- /dev/null +++ b/net/ipv6/syncookies.c @@ -0,0 +1,272 @@ +/* + * IPv6 Syncookies implementation for the Linux kernel + * + * Authors: + * Glenn Griffin <ggriffin.kernel@gmail.com> + * + * Based on IPv4 implementation by Andi Kleen + * linux/net/ipv4/syncookies.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/tcp.h> +#include <linux/random.h> +#include <linux/cryptohash.h> +#include <linux/kernel.h> +#include <net/ipv6.h> +#include <net/tcp.h> + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS]; + +/* RFC 2460, Section 8.3: + * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] + * + * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows + * using higher values than ipv4 tcp syncookies. + * The other values are chosen based on ethernet (1500 and 9k MTU), plus + * one that accounts for common encap (PPPoe) overhead. Table must be sorted. + */ +static __u16 const msstab[] = { + 1280 - 60, /* IPV6_MIN_MTU - 60 */ + 1480 - 60, + 1500 - 60, + 9000 - 60, +}; + +static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *child; + + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + if (child) + inet_csk_reqsk_queue_add(sk, req, child); + else + reqsk_free(req); + + return child; +} + +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); + +static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, + __be16 sport, __be16 dport, u32 count, int c) +{ + __u32 *tmp; + + net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); + + tmp = __get_cpu_var(ipv6_cookie_scratch); + + /* + * we have 320 bits of information to hash, copy in the remaining + * 192 bits required for sha_transform, from the syncookie6_secret + * and overwrite the digest with the secret + */ + memcpy(tmp + 10, syncookie6_secret[c], 44); + memcpy(tmp, saddr, 16); + memcpy(tmp + 4, daddr, 16); + tmp[8] = ((__force u32)sport << 16) + (__force u32)dport; + tmp[9] = count; + sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); + + return tmp[17]; +} + +static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __be16 sport, __be16 dport, __u32 sseq, + __u32 data) +{ + u32 count = tcp_cookie_time(); + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, + const struct in6_addr *daddr, __be16 sport, + __be16 dport, __u32 sseq) +{ + __u32 diff, count = tcp_cookie_time(); + + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); + if (diff >= MAX_SYNCOOKIE_AGE) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; +} + +u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, __u16 *mssp) +{ + int mssind; + const __u16 mss = *mssp; + + for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) + if (mss >= msstab[mssind]) + break; + + *mssp = msstab[mssind]; + + return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, + th->dest, ntohl(th->seq), mssind); +} +EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); + +__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return __cookie_v6_init_sequence(iph, th, mssp); +} + +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, + __u32 cookie) +{ + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, + th->source, th->dest, seq); + + return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; +} +EXPORT_SYMBOL_GPL(__cookie_v6_check); + +struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tcp_opt; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = tcp_hdr(skb); + __u32 cookie = ntohl(th->ack_seq) - 1; + struct sock *ret = sk; + struct request_sock *req; + int mss; + struct dst_entry *dst; + __u8 rcv_wscale; + bool ecn_ok = false; + + if (!sysctl_tcp_syncookies || !th->ack || th->rst) + goto out; + + if (tcp_synq_no_recent_overflow(sk) || + (mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + goto out; + } + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); + tcp_parse_options(skb, &tcp_opt, 0, NULL); + + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + goto out; + + ret = NULL; + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + if (!req) + goto out; + + ireq = inet_rsk(req); + treq = tcp_rsk(req); + treq->listener = NULL; + + if (security_inet_conn_request(sk, skb, req)) + goto out_free; + + req->mss = mss; + ireq->ir_rmt_port = th->source; + ireq->ir_num = ntohs(th->dest); + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq->pktopts = skb; + } + + ireq->ir_iif = sk->sk_bound_dev_if; + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq->ir_iif = inet6_iif(skb); + + ireq->ir_mark = inet_request_mark(sk, skb); + + req->expires = 0UL; + req->num_retrans = 0; + ireq->ecn_ok = ecn_ok; + ireq->snd_wscale = tcp_opt.snd_wscale; + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->rcv_isn = ntohl(th->seq) - 1; + treq->snt_isn = cookie; + + /* + * We need to lookup the dst_entry to get the correct window size. + * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten + * me if there is a preferred way. + */ + { + struct in6_addr *final_p, final; + struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = ireq->ir_v6_rmt_addr; + final_p = fl6_update_dst(&fl6, np->opt, &final); + fl6.saddr = ireq->ir_v6_loc_addr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = ireq->ir_mark; + fl6.fl6_dport = ireq->ir_rmt_port; + fl6.fl6_sport = inet_sk(sk)->inet_sport; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) + goto out_free; + } + + req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + tcp_select_initial_window(tcp_full_space(sk), req->mss, + &req->rcv_wnd, &req->window_clamp, + ireq->wscale_ok, &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + + ireq->rcv_wscale = rcv_wscale; + + ret = get_cookie_sock(sk, skb, req, dst); +out: + return ret; +out_free: + reqsk_free(req); + return NULL; +} + diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index d6d3e68086f..058f3eca2e5 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -9,53 +9,57 @@ #include <linux/sysctl.h> #include <linux/in6.h> #include <linux/ipv6.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/ndisc.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/inet_frag.h> -static ctl_table ipv6_table_template[] = { +static struct ctl_table ipv6_table_template[] = { { - .ctl_name = NET_IPV6_ROUTE, - .procname = "route", - .maxlen = 0, - .mode = 0555, - .child = ipv6_route_table_template + .procname = "bindv6only", + .data = &init_net.ipv6.sysctl.bindv6only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV6_ICMP, - .procname = "icmp", - .maxlen = 0, - .mode = 0555, - .child = ipv6_icmp_table_template + .procname = "anycast_src_echo_reply", + .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV6_BINDV6ONLY, - .procname = "bindv6only", - .data = &init_net.ipv6.sysctl.bindv6only, + .procname = "flowlabel_consistency", + .data = &init_net.ipv6.sysctl.flowlabel_consistency, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV6_MLD_MAX_MSF, - .procname = "mld_max_msf", - .data = &sysctl_mld_max_msf, + .procname = "fwmark_reflect", + .data = &init_net.ipv6.sysctl.fwmark_reflect, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec }, - { .ctl_name = 0 } + { } }; -struct ctl_path net_ipv6_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv6", .ctl_name = NET_IPV6, }, - { }, +static struct ctl_table ipv6_rotable[] = { + { + .procname = "mld_max_msf", + .data = &sysctl_mld_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } }; -EXPORT_SYMBOL_GPL(net_ipv6_ctl_path); -static int ipv6_sysctl_net_init(struct net *net) +static int __net_init ipv6_sysctl_net_init(struct net *net) { struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; @@ -67,6 +71,9 @@ static int ipv6_sysctl_net_init(struct net *net) GFP_KERNEL); if (!ipv6_table) goto out; + ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; + ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; + ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) @@ -76,38 +83,27 @@ static int ipv6_sysctl_net_init(struct net *net) if (!ipv6_icmp_table) goto out_ipv6_route_table; - ipv6_route_table[0].data = &net->ipv6.sysctl.flush_delay; - /* ipv6_route_table[1].data will be handled when we have - routes per namespace */ - ipv6_route_table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; - ipv6_route_table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; - ipv6_route_table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; - ipv6_route_table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; - ipv6_route_table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; - ipv6_route_table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; - ipv6_route_table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; - ipv6_table[0].child = ipv6_route_table; - - ipv6_icmp_table[0].data = &net->ipv6.sysctl.icmpv6_time; - ipv6_table[1].child = ipv6_icmp_table; - - ipv6_table[2].data = &net->ipv6.sysctl.bindv6only; - - /* We don't want this value to be per namespace, it should be global - to all namespaces, so make it read-only when we are not in the - init network namespace */ - if (net != &init_net) - ipv6_table[3].mode = 0444; - - net->ipv6.sysctl.table = register_net_sysctl_table(net, net_ipv6_ctl_path, - ipv6_table); - if (!net->ipv6.sysctl.table) + net->ipv6.sysctl.hdr = register_net_sysctl(net, "net/ipv6", ipv6_table); + if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; + net->ipv6.sysctl.route_hdr = + register_net_sysctl(net, "net/ipv6/route", ipv6_route_table); + if (!net->ipv6.sysctl.route_hdr) + goto out_unregister_ipv6_table; + + net->ipv6.sysctl.icmp_hdr = + register_net_sysctl(net, "net/ipv6/icmp", ipv6_icmp_table); + if (!net->ipv6.sysctl.icmp_hdr) + goto out_unregister_route_table; + err = 0; out: return err; - +out_unregister_route_table: + unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); +out_unregister_ipv6_table: + unregister_net_sysctl_table(net->ipv6.sysctl.hdr); out_ipv6_icmp_table: kfree(ipv6_icmp_table); out_ipv6_route_table: @@ -117,17 +113,19 @@ out_ipv6_table: goto out; } -static void ipv6_sysctl_net_exit(struct net *net) +static void __net_exit ipv6_sysctl_net_exit(struct net *net) { struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; - ipv6_table = net->ipv6.sysctl.table->ctl_table_arg; - ipv6_route_table = ipv6_table[0].child; - ipv6_icmp_table = ipv6_table[1].child; + ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; + ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; + ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->ipv6.sysctl.table); + unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr); + unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); + unregister_net_sysctl_table(net->ipv6.sysctl.hdr); kfree(ipv6_table); kfree(ipv6_route_table); @@ -139,12 +137,29 @@ static struct pernet_operations ipv6_sysctl_net_ops = { .exit = ipv6_sysctl_net_exit, }; +static struct ctl_table_header *ip6_header; + int ipv6_sysctl_register(void) { - return register_pernet_subsys(&ipv6_sysctl_net_ops); + int err = -ENOMEM; + + ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable); + if (ip6_header == NULL) + goto out; + + err = register_pernet_subsys(&ipv6_sysctl_net_ops); + if (err) + goto err_pernet; +out: + return err; + +err_pernet: + unregister_net_sysctl_table(ip6_header); + goto out; } void ipv6_sysctl_unregister(void) { + unregister_net_sysctl_table(ip6_header); unregister_pernet_subsys(&ipv6_sysctl_net_ops); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 12750f2b05a..229239ad96b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,8 +5,6 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.144 2002/02/01 22:01:04 davem Exp $ - * * Based on: * linux/net/ipv4/tcp.c * linux/net/ipv4/tcp_input.c @@ -25,6 +23,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/bottom_half.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -39,7 +38,8 @@ #include <linux/jhash.h> #include <linux/ipsec.h> #include <linux/times.h> - +#include <linux/slab.h> +#include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> @@ -60,8 +60,10 @@ #include <net/dsfield.h> #include <net/timewait_sock.h> #include <net/netdma.h> - -#include <asm/uaccess.h> +#include <net/inet_common.h> +#include <net/secure_seq.h> +#include <net/tcp_memcontrol.h> +#include <net/busy_poll.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -69,23 +71,37 @@ #include <linux/crypto.h> #include <linux/scatterlist.h> -/* Socket used for sending RSTs and ACKs */ -static struct socket *tcp6_socket; - static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); -static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); -static void tcp_v6_send_check(struct sock *sk, int len, - struct sk_buff *skb); +static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static struct inet_connection_sock_af_ops ipv6_mapped; -static struct inet_connection_sock_af_ops ipv6_specific; +static const struct inet_connection_sock_af_ops ipv6_mapped; +static const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_sock_af_ops tcp_sock_ipv6_specific; -static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; +static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; +static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; +#else +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, + const struct in6_addr *addr) +{ + return NULL; +} #endif +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { @@ -94,20 +110,12 @@ static void tcp_v6_hash(struct sock *sk) return; } local_bh_disable(); - __inet6_hash(sk); + __inet6_hash(sk, NULL); local_bh_enable(); } } -static __inline__ __sum16 tcp_v6_check(struct tcphdr *th, int len, - struct in6_addr *saddr, - struct in6_addr *daddr, - __wsum base) -{ - return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); -} - -static __u32 tcp_v6_init_sequence(struct sk_buff *skb) +static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) { return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, @@ -123,8 +131,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct in6_addr *saddr = NULL, *final_p = NULL, final; - struct flowi fl; + struct in6_addr *saddr = NULL, *final_p, final; + struct rt6_info *rt; + struct flowi6 fl6; struct dst_entry *dst; int addr_type; int err; @@ -133,19 +142,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, return -EINVAL; if (usin->sin6_family != AF_INET6) - return(-EAFNOSUPPORT); + return -EAFNOSUPPORT; - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (np->sndflow) { - fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl.fl6_flowlabel); - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl6.flowlabel); + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); fl6_sock_release(flowlabel); } } @@ -154,12 +162,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * connect() to INADDR_ANY means loopback (BSD'ism). */ - if(ipv6_addr_any(&usin->sin6_addr)) + if (ipv6_addr_any(&usin->sin6_addr)) usin->sin6_addr.s6_addr[15] = 0x1; addr_type = ipv6_addr_type(&usin->sin6_addr); - if(addr_type & IPV6_ADDR_MULTICAST) + if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { @@ -181,14 +189,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } if (tp->rx_opt.ts_recent_stamp && - !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) { + !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; tp->write_seq = 0; } - ipv6_addr_copy(&np->daddr, &usin->sin6_addr); - np->flow_label = fl.fl6_flowlabel; + sk->sk_v6_daddr = usin->sin6_addr; + np->flow_label = fl6.flowlabel; /* * TCP over IPv4 @@ -224,60 +232,53 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, #endif goto failure; } else { - ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF), - inet->saddr); - ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF), - inet->rcv_saddr); + ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); + ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, + &sk->sk_v6_rcv_saddr); } return err; } - if (!ipv6_addr_any(&np->rcv_saddr)) - saddr = &np->rcv_saddr; - - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, - (saddr ? saddr : &np->saddr)); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = usin->sin6_port; - fl.fl_ip_sport = inet->sport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + saddr = &sk->sk_v6_rcv_saddr; - security_sk_classify_flow(sk, &fl); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = sk->sk_v6_daddr; + fl6.saddr = saddr ? *saddr : np->saddr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = usin->sin6_port; + fl6.fl6_sport = inet->inet_sport; - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) - goto failure; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); + final_p = fl6_update_dst(&fl6, np->opt, &final); - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto failure; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto failure; } if (saddr == NULL) { - saddr = &fl.fl6_src; - ipv6_addr_copy(&np->rcv_saddr, saddr); + saddr = &fl6.saddr; + sk->sk_v6_rcv_saddr = *saddr; } /* set the source address */ - ipv6_addr_copy(&np->saddr, saddr); - inet->rcv_saddr = LOOPBACK4_IPV6; + np->saddr = *saddr; + inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(sk, dst, NULL, NULL); + rt = (struct rt6_info *) dst; + if (tcp_death_row.sysctl_tw_recycle && + !tp->rx_opt.ts_recent_stamp && + ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr)) + tcp_fetch_timewait_stamp(sk, dst); + icsk->icsk_ext_hdr_len = 0; if (np->opt) icsk->icsk_ext_hdr_len = (np->opt->opt_flen + @@ -285,18 +286,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); - inet->dport = usin->sin6_port; + inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(&tcp_death_row, sk); if (err) goto late_failure; - if (!tp->write_seq) + if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, - np->daddr.s6_addr32, - inet->sport, - inet->dport); + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); err = tcp_connect(sk); if (err) @@ -308,27 +309,47 @@ late_failure: tcp_set_state(sk, TCP_CLOSE); __sk_dst_reset(sk); failure: - inet->dport = 0; + inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } +static void tcp_v6_mtu_reduced(struct sock *sk) +{ + struct dst_entry *dst; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + return; + + dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); + if (!dst) + return; + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } +} + static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { - struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; struct sock *sk; int err; struct tcp_sock *tp; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; + struct net *net = dev_net(skb->dev); - sk = inet6_lookup(skb->dev->nd_net, &tcp_hashinfo, &hdr->daddr, + sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); if (sk == NULL) { - ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -338,68 +359,55 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; + if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + tp = tcp_sk(sk); seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + !between(seq, snd_una, tp->snd_nxt)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = inet6_sk(sk); - if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst = NULL; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) - goto out; - - /* icmp should have updated the destination cache entry */ - dst = __sk_dst_check(sk, np->dst_cookie); + if (type == NDISC_REDIRECT) { + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct flowi fl; - - /* BUGGG_FUTURE: Again, it is not clear how - to handle rthdr case. Ignore this complexity - for now. - */ - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; - security_skb_classify_flow(skb, &fl); - - if ((err = ip6_dst_lookup(sk, &dst, &fl))) { - sk->sk_err_soft = -err; - goto out; - } + if (dst) + dst->ops->redirect(dst, sk, skb); + goto out; + } - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - goto out; - } + if (type == ICMPV6_PKT_TOOBIG) { + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; - } else - dst_hold(dst); + if (!ip6_sk_accept_pmtu(sk)) + goto out; - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { - tcp_sync_mss(sk, dst_mtu(dst)); - tcp_simple_retransmit(sk); - } /* else let the usual retransmit timer handle it */ - dst_release(dst); + tp->mtu_info = ntohl(info); + if (!sock_owned_by_user(sk)) + tcp_v6_mtu_reduced(sk); + else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, + &tp->tsq_flags)) + sock_hold(sk); goto out; } @@ -420,19 +428,25 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* ICMPs are not backlogged, hence we cannot get * an established socket here. */ - BUG_TRAP(req->sk == NULL); + WARN_ON(req->sk != NULL); if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } inet_csk_reqsk_queue_drop(sk, req, prev); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can, it SYNs are crossed. --ANK */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -455,231 +469,82 @@ out: } -static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, - struct dst_entry *dst) +static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi6 *fl6, + struct request_sock *req, + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { - struct inet6_request_sock *treq = inet6_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff * skb; - struct ipv6_txoptions *opt = NULL; - struct in6_addr * final_p = NULL, final; - struct flowi fl; - int err = -1; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr); - ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr); - fl.fl6_flowlabel = 0; - fl.oif = treq->iif; - fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_sk(sk)->sport; - security_req_classify_flow(req, &fl); - - if (dst == NULL) { - opt = np->opt; - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + struct sk_buff *skb; + int err = -ENOMEM; - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) - goto done; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) - goto done; - } + /* First, grab a route. */ + if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) + goto done; + + skb = tcp_make_synack(sk, dst, req, foc); - skb = tcp_make_synack(sk, dst, req); if (skb) { - struct tcphdr *th = tcp_hdr(skb); + __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, + &ireq->ir_v6_rmt_addr); - th->check = tcp_v6_check(th, skb->len, - &treq->loc_addr, &treq->rmt_addr, - csum_partial((char *)th, skb->len, skb->csum)); + fl6->daddr = ireq->ir_v6_rmt_addr; + if (np->repflow && (ireq->pktopts != NULL)) + fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr); - err = ip6_xmit(sk, skb, &fl, opt, 0); + skb_set_queue_mapping(skb, queue_mapping); + err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); } done: - if (opt && opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); - dst_release(dst); return err; } +static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) +{ + struct flowi6 fl6; + int res; + + res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); + if (!res) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } + return res; +} + static void tcp_v6_reqsk_destructor(struct request_sock *req) { - if (inet6_rsk(req)->pktopts) - kfree_skb(inet6_rsk(req)->pktopts); + kfree_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, - struct in6_addr *addr) + const struct in6_addr *addr) { - struct tcp_sock *tp = tcp_sk(sk); - int i; - - BUG_ON(tp == NULL); - - if (!tp->md5sig_info || !tp->md5sig_info->entries6) - return NULL; - - for (i = 0; i < tp->md5sig_info->entries6; i++) { - if (ipv6_addr_cmp(&tp->md5sig_info->keys6[i].addr, addr) == 0) - return &tp->md5sig_info->keys6[i].base; - } - return NULL; + return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, struct sock *addr_sk) { - return tcp_v6_md5_do_lookup(sk, &inet6_sk(addr_sk)->daddr); + return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); } static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk, struct request_sock *req) { - return tcp_v6_md5_do_lookup(sk, &inet6_rsk(req)->rmt_addr); -} - -static int tcp_v6_md5_do_add(struct sock *sk, struct in6_addr *peer, - char *newkey, u8 newkeylen) -{ - /* Add key to the list */ - struct tcp_md5sig_key *key; - struct tcp_sock *tp = tcp_sk(sk); - struct tcp6_md5sig_key *keys; - - key = tcp_v6_md5_do_lookup(sk, peer); - if (key) { - /* modify existing entry - just update that one */ - kfree(key->key); - key->key = newkey; - key->keylen = newkeylen; - } else { - /* reallocate new list if current one is full. */ - if (!tp->md5sig_info) { - tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC); - if (!tp->md5sig_info) { - kfree(newkey); - return -ENOMEM; - } - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - } - if (tcp_alloc_md5sig_pool() == NULL) { - kfree(newkey); - return -ENOMEM; - } - if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) { - keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) * - (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC); - - if (!keys) { - tcp_free_md5sig_pool(); - kfree(newkey); - return -ENOMEM; - } - - if (tp->md5sig_info->entries6) - memmove(keys, tp->md5sig_info->keys6, - (sizeof (tp->md5sig_info->keys6[0]) * - tp->md5sig_info->entries6)); - - kfree(tp->md5sig_info->keys6); - tp->md5sig_info->keys6 = keys; - tp->md5sig_info->alloced6++; - } - - ipv6_addr_copy(&tp->md5sig_info->keys6[tp->md5sig_info->entries6].addr, - peer); - tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.key = newkey; - tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.keylen = newkeylen; - - tp->md5sig_info->entries6++; - } - return 0; -} - -static int tcp_v6_md5_add_func(struct sock *sk, struct sock *addr_sk, - u8 *newkey, __u8 newkeylen) -{ - return tcp_v6_md5_do_add(sk, &inet6_sk(addr_sk)->daddr, - newkey, newkeylen); -} - -static int tcp_v6_md5_do_del(struct sock *sk, struct in6_addr *peer) -{ - struct tcp_sock *tp = tcp_sk(sk); - int i; - - for (i = 0; i < tp->md5sig_info->entries6; i++) { - if (ipv6_addr_cmp(&tp->md5sig_info->keys6[i].addr, peer) == 0) { - /* Free the key */ - kfree(tp->md5sig_info->keys6[i].base.key); - tp->md5sig_info->entries6--; - - if (tp->md5sig_info->entries6 == 0) { - kfree(tp->md5sig_info->keys6); - tp->md5sig_info->keys6 = NULL; - tp->md5sig_info->alloced6 = 0; - } else { - /* shrink the database */ - if (tp->md5sig_info->entries6 != i) - memmove(&tp->md5sig_info->keys6[i], - &tp->md5sig_info->keys6[i+1], - (tp->md5sig_info->entries6 - i) - * sizeof (tp->md5sig_info->keys6[0])); - } - tcp_free_md5sig_pool(); - return 0; - } - } - return -ENOENT; -} - -static void tcp_v6_clear_md5_list (struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - int i; - - if (tp->md5sig_info->entries6) { - for (i = 0; i < tp->md5sig_info->entries6; i++) - kfree(tp->md5sig_info->keys6[i].base.key); - tp->md5sig_info->entries6 = 0; - tcp_free_md5sig_pool(); - } - - kfree(tp->md5sig_info->keys6); - tp->md5sig_info->keys6 = NULL; - tp->md5sig_info->alloced6 = 0; - - if (tp->md5sig_info->entries4) { - for (i = 0; i < tp->md5sig_info->entries4; i++) - kfree(tp->md5sig_info->keys4[i].base.key); - tp->md5sig_info->entries4 = 0; - tcp_free_md5sig_pool(); - } - - kfree(tp->md5sig_info->keys4); - tp->md5sig_info->keys4 = NULL; - tp->md5sig_info->alloced4 = 0; + return tcp_v6_md5_do_lookup(sk, &inet_rsk(req)->ir_v6_rmt_addr); } -static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, - int optlen) +static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, + int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; - u8 *newkey; if (optlen < sizeof(cmd)) return -EINVAL; @@ -691,320 +556,207 @@ static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, return -EINVAL; if (!cmd.tcpm_keylen) { - if (!tcp_sk(sk)->md5sig_info) - return -ENOENT; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - return tcp_v4_md5_do_del(sk, sin6->sin6_addr.s6_addr32[3]); - return tcp_v6_md5_do_del(sk, &sin6->sin6_addr); + return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], + AF_INET); + return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, + AF_INET6); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - if (!tcp_sk(sk)->md5sig_info) { - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_md5sig_info *p; + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], + AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); - p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL); - if (!p) - return -ENOMEM; + return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, + AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); +} - tp->md5sig_info = p; - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - } +static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, int nbytes) +{ + struct tcp6_pseudohdr *bp; + struct scatterlist sg; - newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); - if (!newkey) - return -ENOMEM; - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { - return tcp_v4_md5_do_add(sk, sin6->sin6_addr.s6_addr32[3], - newkey, cmd.tcpm_keylen); - } - return tcp_v6_md5_do_add(sk, &sin6->sin6_addr, newkey, cmd.tcpm_keylen); + bp = &hp->md5_blk.ip6; + /* 1. TCP pseudo-header (RFC2460) */ + bp->saddr = *saddr; + bp->daddr = *daddr; + bp->protocol = cpu_to_be32(IPPROTO_TCP); + bp->len = cpu_to_be32(nbytes); + + sg_init_one(&sg, bp, sizeof(*bp)); + return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); } -static int tcp_v6_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - struct in6_addr *saddr, - struct in6_addr *daddr, - struct tcphdr *th, int protocol, - unsigned int tcplen) +static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + const struct in6_addr *daddr, struct in6_addr *saddr, + const struct tcphdr *th) { - struct scatterlist sg[4]; - __u16 data_len; - int block = 0; - __sum16 cksum; struct tcp_md5sig_pool *hp; - struct tcp6_pseudohdr *bp; struct hash_desc *desc; - int err; - unsigned int nbytes = 0; hp = tcp_get_md5sig_pool(); - if (!hp) { - printk(KERN_WARNING "%s(): hash pool not found...\n", __FUNCTION__); + if (!hp) goto clear_hash_noput; - } - bp = &hp->md5_blk.ip6; desc = &hp->md5_desc; - /* 1. TCP pseudo-header (RFC2460) */ - ipv6_addr_copy(&bp->saddr, saddr); - ipv6_addr_copy(&bp->daddr, daddr); - bp->len = htonl(tcplen); - bp->protocol = htonl(protocol); - - sg_init_table(sg, 4); - - sg_set_buf(&sg[block++], bp, sizeof(*bp)); - nbytes += sizeof(*bp); - - /* 2. TCP header, excluding options */ - cksum = th->check; - th->check = 0; - sg_set_buf(&sg[block++], th, sizeof(*th)); - nbytes += sizeof(*th); - - /* 3. TCP segment data (if any) */ - data_len = tcplen - (th->doff << 2); - if (data_len > 0) { - u8 *data = (u8 *)th + (th->doff << 2); - sg_set_buf(&sg[block++], data, data_len); - nbytes += data_len; - } - - /* 4. shared key */ - sg_set_buf(&sg[block++], key->key, key->keylen); - nbytes += key->keylen; - - sg_mark_end(&sg[block - 1]); - - /* Now store the hash into the packet */ - err = crypto_hash_init(desc); - if (err) { - printk(KERN_WARNING "%s(): hash_init failed\n", __FUNCTION__); + if (crypto_hash_init(desc)) goto clear_hash; - } - err = crypto_hash_update(desc, sg, nbytes); - if (err) { - printk(KERN_WARNING "%s(): hash_update failed\n", __FUNCTION__); + if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) goto clear_hash; - } - err = crypto_hash_final(desc, md5_hash); - if (err) { - printk(KERN_WARNING "%s(): hash_final failed\n", __FUNCTION__); + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) goto clear_hash; - } - /* Reset header, and free up the crypto */ tcp_put_md5sig_pool(); - th->check = cksum; -out: return 0; + clear_hash: tcp_put_md5sig_pool(); clear_hash_noput: memset(md5_hash, 0, 16); - goto out; + return 1; } -static int tcp_v6_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, - struct sock *sk, - struct dst_entry *dst, - struct request_sock *req, - struct tcphdr *th, int protocol, - unsigned int tcplen) +static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, + const struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - struct in6_addr *saddr, *daddr; + const struct in6_addr *saddr, *daddr; + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + const struct tcphdr *th = tcp_hdr(skb); if (sk) { saddr = &inet6_sk(sk)->saddr; - daddr = &inet6_sk(sk)->daddr; + daddr = &sk->sk_v6_daddr; + } else if (req) { + saddr = &inet_rsk(req)->ir_v6_loc_addr; + daddr = &inet_rsk(req)->ir_v6_rmt_addr; } else { - saddr = &inet6_rsk(req)->loc_addr; - daddr = &inet6_rsk(req)->rmt_addr; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + saddr = &ip6h->saddr; + daddr = &ip6h->daddr; } - return tcp_v6_do_calc_md5_hash(md5_hash, key, - saddr, daddr, - th, protocol, tcplen); + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + + if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; } -static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb) +static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) { - __u8 *hash_location = NULL; + const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; - struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct tcphdr *th = tcp_hdr(skb); - int length = (th->doff << 2) - sizeof (*th); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); int genhash; - u8 *ptr; u8 newhash[16]; hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr); + hash_location = tcp_parse_md5sig_option(th); - /* If the TCP option is too short, we can short cut */ - if (length < TCPOLEN_MD5SIG) - return hash_expected ? 1 : 0; - - /* parse options */ - ptr = (u8*)(th + 1); - while (length > 0) { - int opcode = *ptr++; - int opsize; - - switch(opcode) { - case TCPOPT_EOL: - goto done_opts; - case TCPOPT_NOP: - length--; - continue; - default: - opsize = *ptr++; - if (opsize < 2 || opsize > length) - goto done_opts; - if (opcode == TCPOPT_MD5SIG) { - hash_location = ptr; - goto done_opts; - } - } - ptr += opsize - 2; - length -= opsize; - } + /* We've parsed the options - do we have a hash? */ + if (!hash_expected && !hash_location) + return 0; -done_opts: - /* do we have a hash as expected? */ - if (!hash_expected) { - if (!hash_location) - return 0; - if (net_ratelimit()) { - printk(KERN_INFO "MD5 Hash NOT expected but found " - "(" NIP6_FMT ", %u)->" - "(" NIP6_FMT ", %u)\n", - NIP6(ip6h->saddr), ntohs(th->source), - NIP6(ip6h->daddr), ntohs(th->dest)); - } + if (hash_expected && !hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return 1; } - if (!hash_location) { - if (net_ratelimit()) { - printk(KERN_INFO "MD5 Hash expected but NOT found " - "(" NIP6_FMT ", %u)->" - "(" NIP6_FMT ", %u)\n", - NIP6(ip6h->saddr), ntohs(th->source), - NIP6(ip6h->daddr), ntohs(th->dest)); - } + if (!hash_expected && hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return 1; } /* check the signature */ - genhash = tcp_v6_do_calc_md5_hash(newhash, - hash_expected, - &ip6h->saddr, &ip6h->daddr, - th, sk->sk_protocol, - skb->len); + genhash = tcp_v6_md5_hash_skb(newhash, + hash_expected, + NULL, NULL, skb); + if (genhash || memcmp(hash_location, newhash, 16) != 0) { - if (net_ratelimit()) { - printk(KERN_INFO "MD5 Hash %s for " - "(" NIP6_FMT ", %u)->" - "(" NIP6_FMT ", %u)\n", - genhash ? "failed" : "mismatch", - NIP6(ip6h->saddr), ntohs(th->source), - NIP6(ip6h->daddr), ntohs(th->dest)); - } + net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", + genhash ? "failed" : "mismatch", + &ip6h->saddr, ntohs(th->source), + &ip6h->daddr, ntohs(th->dest)); return 1; } return 0; } #endif -static struct request_sock_ops tcp6_request_sock_ops __read_mostly = { +struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_v6_send_synack, + .rtx_syn_ack = tcp_v6_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, - .send_reset = tcp_v6_send_reset + .send_reset = tcp_v6_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, }; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .md5_lookup = tcp_v6_reqsk_md5_lookup, + .calc_md5_hash = tcp_v6_md5_hash_skb, }; #endif -static struct timewait_sock_ops tcp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, - .twsk_destructor= tcp_twsk_destructor, -}; - -static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcphdr *th = tcp_hdr(skb); - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, - csum_partial((char *)th, th->doff<<2, - skb->csum)); - } -} - -static int tcp_v6_gso_send_check(struct sk_buff *skb) -{ - struct ipv6hdr *ipv6h; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - ipv6h = ipv6_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, - IPPROTO_TCP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - return 0; -} - -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, + u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, int rst, u8 tclass, + u32 label) { - struct tcphdr *th = tcp_hdr(skb), *t1; + const struct tcphdr *th = tcp_hdr(skb); + struct tcphdr *t1; struct sk_buff *buff; - struct flowi fl; - unsigned int tot_len = sizeof(*th); -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; -#endif - - if (th->rst) - return; - - if (!ipv6_unicast_destination(skb)) - return; + struct flowi6 fl6; + struct net *net = dev_net(skb_dst(skb)->dev); + struct sock *ctl_sk = net->ipv6.tcp_sk; + unsigned int tot_len = sizeof(struct tcphdr); + struct dst_entry *dst; + __be32 *topt; + if (tsecr) + tot_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG - if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); - else - key = NULL; - if (key) tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif - /* - * We need to grab some memory, and put together an RST, - * and then put it into the queue to be sent. - */ - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (buff == NULL) @@ -1013,162 +765,150 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); t1 = (struct tcphdr *) skb_push(buff, tot_len); + skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; - t1->rst = 1; + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = !rst || !th->ack; + t1->rst = rst; + t1->window = htons(win); - if(th->ack) { - t1->seq = th->ack_seq; - } else { - t1->ack = 1; - t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin - + skb->len - (th->doff<<2)); + topt = (__be32 *)(t1 + 1); + + if (tsecr) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *topt++ = htonl(tsval); + *topt++ = htonl(tsecr); } #ifdef CONFIG_TCP_MD5SIG if (key) { - __be32 *opt = (__be32*)(t1 + 1); - opt[0] = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - tcp_v6_do_calc_md5_hash((__u8 *)&opt[1], key, - &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr, - t1, IPPROTO_TCP, tot_len); + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); + tcp_v6_md5_hash_hdr((__u8 *)topt, key, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, t1); } #endif - buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); - - memset(&fl, 0, sizeof(fl)); - ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr); - - t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, - sizeof(*t1), IPPROTO_TCP, - buff->csum); + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = ipv6_hdr(skb)->saddr; + fl6.saddr = ipv6_hdr(skb)->daddr; + fl6.flowlabel = label; - fl.proto = IPPROTO_TCP; - fl.oif = inet6_iif(skb); - fl.fl_ip_dport = t1->dest; - fl.fl_ip_sport = t1->source; - security_skb_classify_flow(skb, &fl); + buff->ip_summed = CHECKSUM_PARTIAL; + buff->csum = 0; - /* sk = NULL, but it is safe for now. RST socket required. */ - if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { + __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); - if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { - ip6_xmit(tcp6_socket->sk, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); - return; - } + fl6.flowi6_proto = IPPROTO_TCP; + if (rt6_need_strict(&fl6.daddr) && !oif) + fl6.flowi6_oif = inet6_iif(skb); + else + fl6.flowi6_oif = oif; + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); + fl6.fl6_dport = t1->dest; + fl6.fl6_sport = t1->source; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + /* Pass a socket to ip6_dst_lookup either it is for RST + * Underlying function will use this to retrieve the network + * namespace + */ + dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + if (!IS_ERR(dst)) { + skb_dst_set(buff, dst); + ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + if (rst) + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + return; } kfree_skb(buff); } -static void tcp_v6_send_ack(struct tcp_timewait_sock *tw, - struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) +static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = tcp_hdr(skb), *t1; - struct sk_buff *buff; - struct flowi fl; - unsigned int tot_len = sizeof(struct tcphdr); - __be32 *topt; -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; - struct tcp_md5sig_key tw_key; -#endif - -#ifdef CONFIG_TCP_MD5SIG - if (!tw && skb->sk) { - key = tcp_v6_md5_do_lookup(skb->sk, &ipv6_hdr(skb)->daddr); - } else if (tw && tw->tw_md5_keylen) { - tw_key.key = tw->tw_md5_key; - tw_key.keylen = tw->tw_md5_keylen; - key = &tw_key; - } else { - key = NULL; - } -#endif - - if (ts) - tot_len += TCPOLEN_TSTAMP_ALIGNED; + const struct tcphdr *th = tcp_hdr(skb); + u32 seq = 0, ack_seq = 0; + struct tcp_md5sig_key *key = NULL; #ifdef CONFIG_TCP_MD5SIG - if (key) - tot_len += TCPOLEN_MD5SIG_ALIGNED; + const __u8 *hash_location = NULL; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + unsigned char newhash[16]; + int genhash; + struct sock *sk1 = NULL; #endif + int oif; - buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, - GFP_ATOMIC); - if (buff == NULL) + if (th->rst) return; - skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); - - t1 = (struct tcphdr *) skb_push(buff,tot_len); - - /* Swap the send and the receive. */ - memset(t1, 0, sizeof(*t1)); - t1->dest = th->source; - t1->source = th->dest; - t1->doff = tot_len/4; - t1->seq = htonl(seq); - t1->ack_seq = htonl(ack); - t1->ack = 1; - t1->window = htons(win); + if (!ipv6_unicast_destination(skb)) + return; - topt = (__be32 *)(t1 + 1); +#ifdef CONFIG_TCP_MD5SIG + hash_location = tcp_parse_md5sig_option(th); + if (!sk && hash_location) { + /* + * active side is lost. Try to find listening socket through + * source port, and then find md5 key through listening socket. + * we are not loose security here: + * Incoming packet is checked with md5 hash with finding key, + * no RST generated if md5 hash doesn't match. + */ + sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + &tcp_hashinfo, &ipv6h->saddr, + th->source, &ipv6h->daddr, + ntohs(th->source), inet6_iif(skb)); + if (!sk1) + return; - if (ts) { - *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *topt++ = htonl(tcp_time_stamp); - *topt = htonl(ts); - } + rcu_read_lock(); + key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); + if (!key) + goto release_sk1; -#ifdef CONFIG_TCP_MD5SIG - if (key) { - *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); - tcp_v6_do_calc_md5_hash((__u8 *)topt, key, - &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr, - t1, IPPROTO_TCP, tot_len); + genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, NULL, skb); + if (genhash || memcmp(hash_location, newhash, 16) != 0) + goto release_sk1; + } else { + key = sk ? tcp_v6_md5_do_lookup(sk, &ipv6h->saddr) : NULL; } #endif - buff->csum = csum_partial((char *)t1, tot_len, 0); - - memset(&fl, 0, sizeof(fl)); - ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr); - - t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, - tot_len, IPPROTO_TCP, - buff->csum); + if (th->ack) + seq = ntohl(th->ack_seq); + else + ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - + (th->doff << 2); - fl.proto = IPPROTO_TCP; - fl.oif = inet6_iif(skb); - fl.fl_ip_dport = t1->dest; - fl.fl_ip_sport = t1->source; - security_skb_classify_flow(skb, &fl); + oif = sk ? sk->sk_bound_dev_if : 0; + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); - if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { - if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) { - ip6_xmit(tcp6_socket->sk, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - return; - } +#ifdef CONFIG_TCP_MD5SIG +release_sk1: + if (sk1) { + rcu_read_unlock(); + sock_put(sk1); } +#endif +} - kfree_skb(buff); +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, u8 tclass, + u32 label) +{ + tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass, + label); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1176,20 +916,31 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v6_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcptw->tw_ts_recent); + tcp_time_stamp + tcptw->tw_ts_offset, + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), + tw->tw_tclass, (tw->tw_flowlabel << 12)); inet_twsk_put(tw); } -static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) +static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) { - tcp_v6_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent); + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. + */ + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + tcp_rsk(req)->rcv_nxt, + req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + 0, 0); } -static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) { struct request_sock *req, **prev; const struct tcphdr *th = tcp_hdr(skb); @@ -1200,9 +951,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, inet6_iif(skb)); if (req) - return tcp_check_req(sk, skb, req, prev); + return tcp_check_req(sk, skb, req, prev, false); - nsk = __inet6_lookup_established(sk->sk_net, &tcp_hashinfo, + nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); @@ -1215,9 +966,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) return NULL; } -#if 0 /*def CONFIG_SYN_COOKIES*/ - if (!th->rst && !th->syn && th->ack) - sk = cookie_v6_check(sk, skb, &(IPCB(skb)->opt)); +#ifdef CONFIG_SYN_COOKIES + if (!th->syn) + sk = cookie_v6_check(sk, skb); #endif return sk; } @@ -1227,12 +978,17 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct inet6_request_sock *treq; - struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_options_received tmp_opt; + struct request_sock *req; + struct inet_request_sock *ireq; + struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct request_sock *req = NULL; __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; + struct tcp_fastopen_cookie foc = { .len = -1 }; + bool want_cookie = false, fastopen; + struct flowi6 fl6; + int err; if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); @@ -1240,17 +996,17 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; - /* - * There are no SYN attacks on IPv6, yet... - */ - if (inet_csk_reqsk_queue_is_full(sk) && !isn) { - if (net_ratelimit()) - printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); - goto drop; + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { + want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); + if (!want_cookie) + goto drop; } - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; + } req = inet6_reqsk_alloc(&tcp6_request_sock_ops); if (req == NULL) @@ -1263,64 +1019,126 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); - tcp_parse_options(skb, &tmp_opt, 0); + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); - treq = inet6_rsk(req); - ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr); - TCP_ECN_create_request(req, tcp_hdr(skb)); - treq->pktopts = NULL; - if (ipv6_opt_accepted(sk, skb) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { - atomic_inc(&skb->users); - treq->pktopts = skb; - } - treq->iif = sk->sk_bound_dev_if; + ireq = inet_rsk(req); + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, skb, sock_net(sk)); + + ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_mark = inet_request_mark(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && - ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) - treq->iif = inet6_iif(skb); + ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq->ir_iif = inet6_iif(skb); + + if (!isn) { + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || + np->repflow) { + atomic_inc(&skb->users); + ireq->pktopts = skb; + } - if (isn == 0) - isn = tcp_v6_init_sequence(skb); + if (want_cookie) { + isn = cookie_v6_init_sequence(sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + goto have_isn; + } - tcp_rsk(req)->snt_isn = isn; + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && + tcp_death_row.sysctl_tw_recycle && + (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { + if (!tcp_peer_is_proven(req, dst, true)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", + &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source)); + goto drop_and_release; + } - security_inet_conn_request(sk, skb, req); + isn = tcp_v6_init_sequence(skb); + } +have_isn: - if (tcp_v6_send_synack(sk, req, NULL)) - goto drop; + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_release; + + if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) + goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v6_send_synack(sk, dst, &fl6, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } return 0; +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); drop: - if (req) - reqsk_free(req); - + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; /* don't send reset */ } -static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { - struct inet6_request_sock *treq = inet6_rsk(req); + struct inet_request_sock *ireq; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; - struct ipv6_txoptions *opt; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* @@ -1341,13 +1159,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), - newinet->daddr); + ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr); - ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), - newinet->saddr); + ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + newsk->sk_v6_rcv_saddr = newnp->saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -1355,10 +1171,15 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); + if (np->repflow) + newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count @@ -1375,43 +1196,20 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; } - opt = np->opt; + ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto out_overflow; - if (dst == NULL) { - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr); - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_sk(sk)->sport; - security_req_classify_flow(req, &fl); - - if (ip6_dst_lookup(sk, &dst, &fl)) - goto out; - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + if (!dst) { + dst = inet6_csk_route_req(sk, &fl6, req); + if (!dst) goto out; } newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto out; + goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -1421,6 +1219,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(newsk, dst, NULL, NULL); + inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; @@ -1431,16 +1230,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr); - ipv6_addr_copy(&newnp->saddr, &treq->loc_addr); - ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr); - newsk->sk_bound_dev_if = treq->iif; + newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; + newnp->saddr = ireq->ir_v6_loc_addr; + newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; + newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ - newinet->opt = NULL; + newinet->inet_opt = NULL; + newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; /* Clone RX bits */ @@ -1448,16 +1248,20 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Clone pktoptions received with SYN */ newnp->pktoptions = NULL; - if (treq->pktopts != NULL) { - newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); - kfree_skb(treq->pktopts); - treq->pktopts = NULL; + if (ireq->pktopts != NULL) { + newnp->pktoptions = skb_clone(ireq->pktopts, + sk_gfp_atomic(sk, GFP_ATOMIC)); + consume_skb(ireq->pktopts); + ireq->pktopts = NULL; if (newnp->pktoptions) skb_set_owner_r(newnp->pktoptions, newsk); } newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); + if (np->repflow) + newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Clone native IPv6 options from listening socket (if any) @@ -1465,74 +1269,58 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, but we make one more one thing there: reattach optmem to newsk. */ - if (opt) { - newnp->opt = ipv6_dup_options(newsk, opt); - if (opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); - } + if (np->opt) + newnp->opt = ipv6_dup_options(newsk, np->opt); inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + newnp->opt->opt_flen); - tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); - newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; + newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; + newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ - if ((key = tcp_v6_md5_do_lookup(sk, &newnp->daddr)) != NULL) { + key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr); + if (key != NULL) { /* We're using one, so create a matching key * on the newsk structure. If we fail to get * memory, then we end up not copying the key * across. Shucks. */ - char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); - if (newkey != NULL) - tcp_v6_md5_do_add(newsk, &inet6_sk(sk)->daddr, - newkey, key->keylen); + tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, + AF_INET6, key->key, key->keylen, + sk_gfp_atomic(sk, GFP_ATOMIC)); } #endif - __inet6_hash(newsk); - inet_inherit_port(sk, newsk); + if (__inet_inherit_port(sk, newsk) < 0) { + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto out; + } + __inet6_hash(newsk, NULL); return newsk; out_overflow: - NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); -out: - NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); - if (opt && opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: dst_release(dst); +out: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; } -static __sum16 tcp_v6_checksum_init(struct sk_buff *skb) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v6_check(tcp_hdr(skb), skb->len, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - } - - skb->csum = ~csum_unfold(tcp_v6_check(tcp_hdr(skb), skb->len, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, 0)); - - if (skb->len <= 76) { - return __skb_checksum_complete(skb); - } - return 0; -} - /* The socket must have it's spinlock held when we get * here. * @@ -1559,7 +1347,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return tcp_v4_do_rcv(sk, skb); #ifdef CONFIG_TCP_MD5SIG - if (tcp_v6_inbound_md5_hash (sk, skb)) + if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard; #endif @@ -1585,13 +1373,21 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - TCP_CHECK_TIMER(sk); - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) - goto reset; - TCP_CHECK_TIMER(sk); + struct dst_entry *dst = sk->sk_rx_dst; + + sock_rps_save_rxhash(sk, skb); + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + + tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); if (opt_skb) goto ipv6_pktoptions; return 0; @@ -1610,19 +1406,19 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * otherwise we just shortcircuit this and continue with * the new socket.. */ - if(nsk != sk) { + if (nsk != sk) { + sock_rps_save_rxhash(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) __kfree_skb(opt_skb); return 0; } - } + } else + sock_rps_save_rxhash(sk, skb); - TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) goto reset; - TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; return 0; @@ -1635,7 +1431,8 @@ discard: kfree_skb(skb); return 0; csum_err: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1654,6 +1451,10 @@ ipv6_pktoptions: np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) + np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); + if (np->repflow) + np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); @@ -1663,16 +1464,17 @@ ipv6_pktoptions: } } - if (opt_skb) - kfree_skb(opt_skb); + kfree_skb(opt_skb); return 0; } static int tcp_v6_rcv(struct sk_buff *skb) { - struct tcphdr *th; + const struct tcphdr *th; + const struct ipv6hdr *hdr; struct sock *sk; int ret; + struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -1680,7 +1482,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) /* * Count it even if it's bad. */ - TCP_INC_STATS_BH(TCP_MIB_INSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; @@ -1692,23 +1494,20 @@ static int tcp_v6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; - if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb)) - goto bad_packet; + if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) + goto csum_error; th = tcp_hdr(skb); + hdr = ipv6_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(ipv6_hdr(skb)); + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; - sk = __inet6_lookup(skb->dev->nd_net, &tcp_hashinfo, - &ipv6_hdr(skb)->saddr, th->source, - &ipv6_hdr(skb)->daddr, ntohs(th->dest), - inet6_iif(skb)); - + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; @@ -1716,12 +1515,18 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_napi_id(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); @@ -1730,7 +1535,7 @@ process: #ifdef CONFIG_NET_DMA struct tcp_sock *tp = tcp_sk(sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = net_dma_find_channel(); if (tp->ucopy.dma_chan) ret = tcp_v6_do_rcv(sk, skb); else @@ -1739,8 +1544,12 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); } - } else - sk_add_backlog(sk, skb); + } else if (unlikely(sk_add_backlog(sk, skb, + sk->sk_rcvbuf + sk->sk_sndbuf))) { + bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + goto discard_and_relse; + } bh_unlock_sock(sk); sock_put(sk); @@ -1751,18 +1560,15 @@ no_tcp_socket: goto discard_it; if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +csum_error: + TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb); } discard_it: - - /* - * Discard frame - */ - kfree_skb(skb); return 0; @@ -1776,10 +1582,13 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(TCP_MIB_INERRS); + if (skb->len < (th->doff<<2)) { inet_twsk_put(inet_twsk(sk)); - goto discard_it; + goto bad_packet; + } + if (tcp_checksum_complete(skb)) { + inet_twsk_put(inet_twsk(sk)); + goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { @@ -1787,7 +1596,8 @@ do_time_wait: { struct sock *sk2; - sk2 = inet6_lookup_listener(skb->dev->nd_net, &tcp_hashinfo, + sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); if (sk2 != NULL) { @@ -1804,25 +1614,64 @@ do_time_wait: break; case TCP_TW_RST: goto no_tcp_socket; - case TCP_TW_SUCCESS:; + case TCP_TW_SUCCESS: + ; } goto discard_it; } -static int tcp_v6_remember_stamp(struct sock *sk) +static void tcp_v6_early_demux(struct sk_buff *skb) { - /* Alas, not yet... */ - return 0; + const struct ipv6hdr *hdr; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) + return; + + hdr = ipv6_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + inet6_iif(skb)); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + + if (dst) + dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); + if (dst && + inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) + skb_dst_set_noref(skb, dst); + } + } } -static struct inet_connection_sock_af_ops ipv6_specific = { +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor = tcp_twsk_destructor, +}; + +static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, + .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .remember_stamp = tcp_v6_remember_stamp, .net_header_len = sizeof(struct ipv6hdr), + .net_frag_header_len = sizeof(struct frag_hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, @@ -1835,10 +1684,9 @@ static struct inet_connection_sock_af_ops ipv6_specific = { }; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_sock_af_ops tcp_sock_ipv6_specific = { +static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { .md5_lookup = tcp_v6_md5_lookup, - .calc_md5_hash = tcp_v6_calc_md5_hash, - .md5_add = tcp_v6_md5_add_func, + .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, }; #endif @@ -1846,14 +1694,13 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_specific = { /* * TCP over IPv4 via INET6 API */ - -static struct inet_connection_sock_af_ops ipv6_mapped = { +static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .remember_stamp = tcp_v4_remember_stamp, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, @@ -1867,10 +1714,9 @@ static struct inet_connection_sock_af_ops ipv6_mapped = { }; #ifdef CONFIG_TCP_MD5SIG -static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { +static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .md5_lookup = tcp_v4_md5_lookup, - .calc_md5_hash = tcp_v4_calc_md5_hash, - .md5_add = tcp_v6_md5_add_func, + .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, }; #endif @@ -1881,90 +1727,52 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - skb_queue_head_init(&tp->out_of_order_queue); - tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); - - icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - tp->snd_cwnd = 2; - - /* See draft-stevens-tcpca-spec-01 for discussion of the - * initialization of these values. - */ - tp->snd_ssthresh = 0x7fffffff; - tp->snd_cwnd_clamp = ~0; - tp->mss_cache = 536; - - tp->reordering = sysctl_tcp_reordering; - - sk->sk_state = TCP_CLOSE; + tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; - icsk->icsk_ca_ops = &tcp_init_congestion_ops; - icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_write_space = sk_stream_write_space; - sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); #ifdef CONFIG_TCP_MD5SIG - tp->af_specific = &tcp_sock_ipv6_specific; + tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; - - atomic_inc(&tcp_sockets_allocated); - return 0; } -static int tcp_v6_destroy_sock(struct sock *sk) +static void tcp_v6_destroy_sock(struct sock *sk) { -#ifdef CONFIG_TCP_MD5SIG - /* Clean up the MD5 key list */ - if (tcp_sk(sk)->md5sig_info) - tcp_v6_clear_md5_list(sk); -#endif tcp_v4_destroy_sock(sk); - return inet6_destroy_sock(sk); + inet6_destroy_sock(sk); } #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - struct sock *sk, struct request_sock *req, int i, int uid) + const struct sock *sk, struct request_sock *req, int i, kuid_t uid) { int ttd = req->expires - jiffies; - struct in6_addr *src = &inet6_rsk(req)->loc_addr; - struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; + const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; + const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], - ntohs(inet_sk(sk)->sport), + inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], - ntohs(inet_rsk(req)->rmt_port), + ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, - 0,0, /* could print option size, but that is af dependent. */ + 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), - req->retrans, - uid, + req->num_timeout, + from_kuid_munged(seq_user_ns(seq), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1972,19 +1780,19 @@ static void get_openreq6(struct seq_file *seq, static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { - struct in6_addr *dest, *src; + const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; - struct inet_sock *inet = inet_sk(sp); - struct tcp_sock *tp = tcp_sk(sp); + const struct inet_sock *inet = inet_sk(sp); + const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); - struct ipv6_pinfo *np = inet6_sk(sp); + struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; - dest = &np->daddr; - src = &np->rcv_saddr; - destp = ntohs(inet->dport); - srcp = ntohs(inet->sport); + dest = &sp->sk_v6_daddr; + src = &sp->sk_v6_rcv_saddr; + destp = ntohs(inet->inet_dport); + srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; @@ -2002,7 +1810,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, @@ -2012,51 +1820,51 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) tp->write_seq-tp->snd_una, (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), timer_active, - jiffies_to_clock_t(timer_expires - jiffies), + jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sp), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - icsk->icsk_rto, - icsk->icsk_ack.ato, - (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, - tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh + jiffies_to_clock_t(icsk->icsk_rto), + jiffies_to_clock_t(icsk->icsk_ack.ato), + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, + tp->snd_cwnd, + sp->sk_state == TCP_LISTEN ? + (fastopenq ? fastopenq->max_qlen : 0) : + (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { - struct in6_addr *dest, *src; + const struct in6_addr *dest, *src; __u16 destp, srcp; - struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); - int ttd = tw->tw_ttd - jiffies; - - if (ttd < 0) - ttd = 0; + s32 delta = tw->tw_ttd - inet_tw_time_stamp(); - dest = &tw6->tw_v6_daddr; - src = &tw6->tw_v6_rcv_saddr; + dest = &tw->tw_v6_daddr; + src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, tw->tw_substate, 0, 0, - 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, atomic_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; + struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -2072,40 +1880,57 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_ESTABLISHED: - get_tcp6_sock(seq, v, st->num); + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait6_sock(seq, v, st->num); + else + get_tcp6_sock(seq, v, st->num); break; case TCP_SEQ_STATE_OPENREQ: get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); break; - case TCP_SEQ_STATE_TIME_WAIT: - get_timewait6_sock(seq, v, st->num); - break; } out: return 0; } -static struct file_operations tcp6_seq_fops; +static const struct file_operations tcp6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = tcp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct tcp_seq_afinfo tcp6_seq_afinfo = { - .owner = THIS_MODULE, .name = "tcp6", .family = AF_INET6, - .seq_show = tcp6_seq_show, - .seq_fops = &tcp6_seq_fops, + .seq_fops = &tcp6_afinfo_seq_fops, + .seq_ops = { + .show = tcp6_seq_show, + }, }; -int __init tcp6_proc_init(void) +int __net_init tcp6_proc_init(struct net *net) { - return tcp_proc_register(&tcp6_seq_afinfo); + return tcp_proc_register(net, &tcp6_seq_afinfo); } -void tcp6_proc_exit(void) +void tcp6_proc_exit(struct net *net) { - tcp_proc_unregister(&tcp6_seq_afinfo); + tcp_proc_unregister(net, &tcp6_seq_afinfo); } #endif -DEFINE_PROTO_INUSE(tcpv6) +static void tcp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); + + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} struct proto tcpv6_prot = { .name = "TCPv6", @@ -2121,11 +1946,16 @@ struct proto tcpv6_prot = { .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, + .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, @@ -2135,21 +1965,25 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, - .hashinfo = &tcp_hashinfo, + .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif - REF_PROTO_INUSE(tcpv6) +#ifdef CONFIG_MEMCG_KMEM + .proto_cgroup = tcp_proto_cgroup, +#endif + .clear_sk = tcp_v6_clear_sk, }; -static struct inet6_protocol tcpv6_protocol = { +static const struct inet6_protocol tcpv6_protocol = { + .early_demux = tcp_v6_early_demux, .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, - .gso_send_check = tcp_v6_gso_send_check, - .gso_segment = tcp_tso_segment, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; @@ -2158,12 +1992,32 @@ static struct inet_protosw tcpv6_protosw = { .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, - .capability = -1, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; +static int __net_init tcpv6_net_init(struct net *net) +{ + return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, + SOCK_RAW, IPPROTO_TCP, net); +} + +static void __net_exit tcpv6_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.tcp_sk); +} + +static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); +} + +static struct pernet_operations tcpv6_net_ops = { + .init = tcpv6_net_init, + .exit = tcpv6_net_exit, + .exit_batch = tcpv6_net_exit_batch, +}; + int __init tcpv6_init(void) { int ret; @@ -2177,23 +2031,22 @@ int __init tcpv6_init(void) if (ret) goto out_tcpv6_protocol; - ret = inet_csk_ctl_sock_create(&tcp6_socket, PF_INET6, - SOCK_RAW, IPPROTO_TCP); + ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; out: return ret; -out_tcpv6_protocol: - inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); +out_tcpv6_protocol: + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); goto out; } void tcpv6_exit(void) { - sock_release(tcp6_socket); + unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); } diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c new file mode 100644 index 00000000000..01b0ff9a0c2 --- /dev/null +++ b/net/ipv6/tcpv6_offload.c @@ -0,0 +1,93 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * TCPv6 GSO/GRO support + */ +#include <linux/skbuff.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip6_checksum.h> +#include "ip6_offload.h" + +static int tcp_v6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return -EINVAL; + + ipv6h = ipv6_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); + return 0; +} + +static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + __wsum wsum; + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (NAPI_GRO_CB(skb)->flush) + goto skip_csum; + + wsum = NAPI_GRO_CB(skb)->csum; + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), + wsum); + + /* fall through */ + + case CHECKSUM_COMPLETE: + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, + wsum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + +skip_csum: + return tcp_gro_receive(head, skb); +} + +static int tcp6_gro_complete(struct sk_buff *skb, int thoff) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, + &iph->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + + return tcp_gro_complete(skb); +} + +static const struct net_offload tcpv6_offload = { + .callbacks = { + .gso_send_check = tcp_v6_gso_send_check, + .gso_segment = tcp_gso_segment, + .gro_receive = tcp6_gro_receive, + .gro_complete = tcp6_gro_complete, + }, +}; + +int __init tcpv6_offload_init(void) +{ + return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP); +} diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 6323921b40b..2c4e4c5c761 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -12,45 +12,50 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm6_tunnel *tunnel6_handlers; -static struct xfrm6_tunnel *tunnel46_handlers; +static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; @@ -64,14 +69,17 @@ EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; @@ -87,6 +95,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) EXPORT_SYMBOL(xfrm6_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -94,11 +107,11 @@ static int tunnel6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -109,14 +122,14 @@ static int tunnel46_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel46_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -124,22 +137,22 @@ drop: } static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) break; } -static struct inet6_protocol tunnel6_protocol = { +static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; -static struct inet6_protocol tunnel46_protocol = { +static const struct inet6_protocol tunnel46_protocol = { .handler = tunnel46_rcv, .err_handler = tunnel6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, @@ -148,11 +161,11 @@ static struct inet6_protocol tunnel46_protocol = { static int __init tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { - printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) { - printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } @@ -162,9 +175,9 @@ static int __init tunnel6_init(void) static void __exit tunnel6_fini(void) { if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) - printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); + pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) - printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); + pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 53739de829d..7092ff78fd8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,8 +7,6 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.65 2002/02/01 22:01:04 davem Exp $ - * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which @@ -36,6 +34,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <net/ndisc.h> @@ -46,68 +45,333 @@ #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> +#include <net/inet6_hashtables.h> +#include <net/busy_poll.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <trace/events/skb.h> #include "udp_impl.h" -static inline int udp_v6_get_port(struct sock *sk, unsigned short snum) +static unsigned int udp6_ehashfn(struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { - return udp_get_port(sk, snum, ipv6_rcv_saddr_equal); + static u32 udp6_ehash_secret __read_mostly; + static u32 udp_ipv6_hash_secret __read_mostly; + + u32 lhash, fhash; + + net_get_random_once(&udp6_ehash_secret, + sizeof(udp6_ehash_secret)); + net_get_random_once(&udp_ipv6_hash_secret, + sizeof(udp_ipv6_hash_secret)); + + lhash = (__force u32)laddr->s6_addr32[3]; + fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret); + + return __inet6_ehashfn(lhash, lport, fhash, fport, + udp_ipv6_hash_secret + net_hash_mix(net)); } -static struct sock *__udp6_lib_lookup(struct net *net, - struct in6_addr *saddr, __be16 sport, - struct in6_addr *daddr, __be16 dport, - int dif, struct hlist_head udptable[]) +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { - struct sock *sk, *result = NULL; - struct hlist_node *node; - unsigned short hnum = ntohs(dport); - int badness = -1; + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk_ipv6only = ipv6_only_sock(sk); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) + return (!sk2_ipv6only && + (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr || + sk->sk_rcv_saddr == sk2->sk_rcv_saddr)); + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && + !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + return 1; + + return 0; +} + +static unsigned int udp6_portaddr_hash(struct net *net, + const struct in6_addr *addr6, + unsigned int port) +{ + unsigned int hash, mix = net_hash_mix(net); + + if (ipv6_addr_any(addr6)) + hash = jhash_1word(0, mix); + else if (ipv6_addr_v4mapped(addr6)) + hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); + else + hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); + + return hash ^ port; +} + + +int udp_v6_get_port(struct sock *sk, unsigned short snum) +{ + unsigned int hash2_nulladdr = + udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); + unsigned int hash2_partial = + udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); + + /* precompute partial secondary hash */ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); +} + +static void udp_v6_rehash(struct sock *sk) +{ + u16 new_hash = udp6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); - read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + udp_lib_rehash(sk, new_hash); +} + +static inline int compute_score(struct sock *sk, struct net *net, + unsigned short hnum, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && + sk->sk_family == PF_INET6) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_net == net && sk->sk_hash == hnum && - sk->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - int score = 0; - if (inet->dport) { - if (inet->dport != sport) - continue; - score++; - } - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) - continue; - score++; - } - if (!ipv6_addr_any(&np->daddr)) { - if (!ipv6_addr_equal(&np->daddr, saddr)) - continue; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score++; - } - if (score == 4) { - result = sk; - break; - } else if (score > badness) { + score = 0; + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; + } + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + score++; + } + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + +#define SCORE2_MAX (1 + 1 + 1) +static inline int compute_score2(struct sock *sk, struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, unsigned short hnum, + int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && + sk->sk_family == PF_INET6) { + struct inet_sock *inet = inet_sk(sk); + + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return -1; + score = 0; + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; + } + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + + +/* called with read_rcu_lock() */ +static struct sock *udp6_lib_lookup2(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, unsigned int hnum, int dif, + struct udp_hslot *hslot2, unsigned int slot2) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; + +begin: + result = NULL; + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + score = compute_score2(sk, net, saddr, sport, + daddr, hnum, dif); + if (score > badness) { + result = sk; + badness = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = udp6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } else if (score == SCORE2_MAX) + goto exact_match; + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) result = sk; - badness = score; + hash = next_pseudo_random32(hash); + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot2) + goto begin; + + if (result) { +exact_match: + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score2(result, net, saddr, sport, + daddr, hnum, dif) < badness)) { + sock_put(result); + goto begin; + } + } + return result; +} + +struct sock *__udp6_lib_lookup(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, struct udp_table *udptable) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(dport); + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; + + rcu_read_lock(); + if (hslot->count > 10) { + hash2 = udp6_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp6_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, + hslot2, slot2); + if (!result) { + hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp6_lib_lookup2(net, saddr, sport, + &in6addr_any, hnum, dif, + hslot2, slot2); + } + rcu_read_unlock(); + return result; + } +begin: + result = NULL; + badness = -1; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); + if (score > badness) { + result = sk; + badness = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = udp6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; } + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } - if (result) - sock_hold(result); - read_unlock(&udp_hash_lock); + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begin; + + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, saddr, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); return result; } +EXPORT_SYMBOL_GPL(__udp6_lib_lookup); + +static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct udp_table *udptable) +{ + struct sock *sk; + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + udptable); +} + +struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) +{ + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); +} +EXPORT_SYMBOL_GPL(udp6_lib_lookup); + /* * This should be easy, if there is something there we @@ -122,19 +386,21 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsigned int ulen, copied; - int peeked; + int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); - - if (addr_len) - *addr_len=sizeof(struct sockaddr_in6); + int is_udp4; + bool slow; if (flags & MSG_ERRQUEUE) - return ipv6_recv_error(sk, msg, len); + return ipv6_recv_error(sk, msg, len, addr_len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), - &peeked, &err); + &peeked, &off, &err); if (!skb) goto out; @@ -145,6 +411,8 @@ try_again: else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; + is_udp4 = (skb->protocol == htons(ETH_P_IP)); + /* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial @@ -158,47 +426,67 @@ try_again: if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied ); + msg->msg_iov, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) goto csum_copy_err; } - if (err) + if (unlikely(err)) { + trace_kfree_skb(skb, udpv6_recvmsg); + if (!peeked) { + atomic_inc(&sk->sk_drops); + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + } goto out_free; + } + if (!peeked) { + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + } - if (!peeked) - UDP6_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite); - - sock_recv_timestamp(msg, sk, skb); + sock_recv_ts_and_drops(msg, sk, skb); /* Copy the address. */ if (msg->msg_name) { - struct sockaddr_in6 *sin6; - - sin6 = (struct sockaddr_in6 *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); sin6->sin6_family = AF_INET6; sin6->sin6_port = udp_hdr(skb)->source; sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - - if (skb->protocol == htons(ETH_P_IP)) - ipv6_addr_set(&sin6->sin6_addr, 0, 0, - htonl(0xffff), ip_hdr(skb)->saddr); - else { - ipv6_addr_copy(&sin6->sin6_addr, - &ipv6_hdr(skb)->saddr); - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = IP6CB(skb)->iif; - } + if (is_udp4) { + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, + &sin6->sin6_addr); + sin6->sin6_scope_id = 0; + } else { + sin6->sin6_addr = ipv6_hdr(skb)->saddr; + sin6->sin6_scope_id = + ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); + } + *addr_len = sizeof(*sin6); } - if (skb->protocol == htons(ETH_P_IP)) { + + if (np->rxopt.all) + ip6_datagram_recv_common_ctl(sk, msg, skb); + + if (is_udp4) { if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } else { if (np->rxopt.all) - datagram_recv_ctl(sk, msg, skb); + ip6_datagram_recv_specific_ctl(sk, msg, skb); } err = copied; @@ -206,40 +494,62 @@ try_again: err = ulen; out_free: - lock_sock(sk); - skb_free_datagram(sk, skb); - release_sock(sk); + skb_free_datagram_locked(sk, skb); out: return err; csum_copy_err: - lock_sock(sk); - if (!skb_kill_datagram(sk, skb, flags)) - UDP6_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); - release_sock(sk); + slow = lock_sock_fast(sk); + if (!skb_kill_datagram(sk, skb, flags)) { + if (is_udp4) { + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } else { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } + } + unlock_sock_fast(sk, slow); - if (flags & MSG_DONTWAIT) + if (noblock) return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; goto try_again; } void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info, - struct hlist_head udptable[] ) + u8 type, u8 code, int offset, __be32 info, + struct udp_table *udptable) { struct ipv6_pinfo *np; - struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; - struct in6_addr *saddr = &hdr->saddr; - struct in6_addr *daddr = &hdr->daddr; + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; + const struct in6_addr *saddr = &hdr->saddr; + const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr*)(skb->data+offset); struct sock *sk; int err; - sk = __udp6_lib_lookup(skb->dev->nd_net, daddr, uh->dest, + sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest, saddr, uh->source, inet6_iif(skb), udptable); if (sk == NULL) return; + if (type == ICMPV6_PKT_TOOBIG) { + if (!ip6_sk_accept_pmtu(sk)) + goto out; + ip6_sk_update_pmtu(skb, sk, info); + } + if (type == NDISC_REDIRECT) { + ip6_sk_redirect(skb, sk); + goto out; + } + np = inet6_sk(sk); if (!icmpv6_err_convert(type, code, &err) && !np->recverr) @@ -257,14 +567,46 @@ out: sock_put(sk); } +static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int rc; + + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); + } + + rc = sock_queue_rcv_skb(sk, skb); + if (rc < 0) { + int is_udplite = IS_UDPLITE(sk); + + /* Note that an ENOMEM error is charged twice */ + if (rc == -ENOMEM) + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; + } + return 0; +} + static __inline__ void udpv6_err(struct sk_buff *skb, - struct inet6_skb_parm *opt, int type, - int code, int offset, __be32 info ) + struct inet6_skb_parm *opt, u8 type, + u8 code, int offset, __be32 info ) +{ + __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); +} + +static struct static_key udpv6_encap_needed __read_mostly; +void udpv6_encap_enable(void) { - __udp6_lib_err(skb, opt, type, code, offset, info, udp_hash); + if (!static_key_enabled(&udpv6_encap_needed)) + static_key_slow_inc(&udpv6_encap_needed); } +EXPORT_SYMBOL(udpv6_encap_enable); -int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; @@ -273,6 +615,41 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; + if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + + /* + * This is an encapsulation socket so pass the skb to + * the socket's udp_encap_rcv() hook. Otherwise, just + * fall through and pass this up the UDP socket. + * up->encap_rcv() returns the following value: + * =0 if skb was successfully passed to the encap + * handler or was discarded by it. + * >0 if skb should be passed on to UDP. + * <0 if skb should be resubmitted as proto -N + */ + + /* if we're overly short, let UDP handle it */ + encap_rcv = ACCESS_ONCE(up->encap_rcv); + if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { + int ret; + + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + + ret = encap_rcv(sk, skb); + if (ret <= 0) { + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); + return -ret; + } + } + + /* FALLTHROUGH -- it's a UDP Packet */ + } + /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). */ @@ -292,151 +669,179 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_access_pointer(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) - goto drop; + goto csum_error; } - if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { - /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) - UDP6_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; } - return 0; + skb_dst_drop(skb); + + bh_lock_sock(sk); + rc = 0; + if (!sock_owned_by_user(sk)) + rc = __udpv6_queue_rcv_skb(sk, skb); + else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + bh_unlock_sock(sk); + goto drop; + } + bh_unlock_sock(sk); + + return rc; + +csum_error: + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: - UDP6_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; } -static struct sock *udp_v6_mcast_next(struct sock *sk, - __be16 loc_port, struct in6_addr *loc_addr, - __be16 rmt_port, struct in6_addr *rmt_addr, +static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, int dif) { - struct hlist_node *node; - struct sock *s = sk; + struct hlist_nulls_node *node; unsigned short num = ntohs(loc_port); - sk_for_each_from(s, node) { - struct inet_sock *inet = inet_sk(s); + sk_nulls_for_each_from(sk, node) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; - if (s->sk_hash == num && s->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(s); - if (inet->dport) { - if (inet->dport != rmt_port) + if (udp_sk(sk)->udp_port_hash == num && + sk->sk_family == PF_INET6) { + if (inet->inet_dport) { + if (inet->inet_dport != rmt_port) continue; } - if (!ipv6_addr_any(&np->daddr) && - !ipv6_addr_equal(&np->daddr, rmt_addr)) + if (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) continue; } - if (!inet6_mc_check(s, loc_addr, rmt_addr)) + if (!inet6_mc_check(sk, loc_addr, rmt_addr)) continue; - return s; + return sk; } } return NULL; } +static void flush_stack(struct sock **stack, unsigned int count, + struct sk_buff *skb, unsigned int final) +{ + struct sk_buff *skb1 = NULL; + struct sock *sk; + unsigned int i; + + for (i = 0; i < count; i++) { + sk = stack[i]; + if (likely(skb1 == NULL)) + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + if (!skb1) { + atomic_inc(&sk->sk_drops); + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + } + + if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) + skb1 = NULL; + } + if (unlikely(skb1)) + kfree_skb(skb1); +} + +static void udp6_csum_zero_error(struct sk_buff *skb) +{ + /* RFC 2460 section 8.1 says that we SHOULD log + * this error. Well, it is reasonable. + */ + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n", + &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source), + &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest)); +} + /* * Note: called only from the BH handler context, * so we don't need to lock the hashes. */ -static int __udp6_lib_mcast_deliver(struct sk_buff *skb, struct in6_addr *saddr, - struct in6_addr *daddr, struct hlist_head udptable[]) +static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, const struct in6_addr *daddr, + struct udp_table *udptable) { - struct sock *sk, *sk2; + struct sock *sk, *stack[256 / sizeof(struct sock *)]; const struct udphdr *uh = udp_hdr(skb); + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); int dif; + unsigned int i, count = 0; - read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + spin_lock(&hslot->lock); + sk = sk_nulls_head(&hslot->head); dif = inet6_iif(skb); - sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); - if (!sk) { - kfree_skb(skb); - goto out; - } + sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); + while (sk) { + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + if (uh->check || udp_sk(sk)->no_check6_rx) + stack[count++] = sk; - sk2 = sk; - while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr, - uh->source, saddr, dif))) { - struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); - if (buff) { - bh_lock_sock_nested(sk2); - if (!sock_owned_by_user(sk2)) - udpv6_queue_rcv_skb(sk2, buff); - else - sk_add_backlog(sk2, buff); - bh_unlock_sock(sk2); + sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, + uh->source, saddr, dif); + if (unlikely(count == ARRAY_SIZE(stack))) { + if (!sk) + break; + flush_stack(stack, count, skb, ~0); + count = 0; } } - bh_lock_sock_nested(sk); - if (!sock_owned_by_user(sk)) - udpv6_queue_rcv_skb(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); -out: - read_unlock(&udp_hash_lock); - return 0; -} - -static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, - int proto) -{ - int err; + /* + * before releasing the lock, we must take reference on sockets + */ + for (i = 0; i < count; i++) + sock_hold(stack[i]); - UDP_SKB_CB(skb)->partial_cov = 0; - UDP_SKB_CB(skb)->cscov = skb->len; + spin_unlock(&hslot->lock); - if (proto == IPPROTO_UDPLITE) { - err = udplite_checksum_init(skb, uh); - if (err) - return err; - } + if (count) { + flush_stack(stack, count, skb, count - 1); - if (uh->check == 0) { - /* RFC 2460 section 8.1 says that we SHOULD log - this error. Well, it is reasonable. - */ - LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); - return 1; + for (i = 0; i < count; i++) + sock_put(stack[i]); + } else { + kfree_skb(skb); } - if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, - skb->len, proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (!skb_csum_unnecessary(skb)) - skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len, proto, 0)); - return 0; } -int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], +int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { + struct net *net = dev_net(skb->dev); struct sock *sk; struct udphdr *uh; - struct net_device *dev = skb->dev; - struct in6_addr *saddr, *daddr; + const struct in6_addr *saddr, *daddr; u32 ulen = 0; if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto short_packet; + goto discard; saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -466,13 +871,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], } if (udp6_csum_init(skb, uh, proto)) - goto discard; + goto csum_error; /* * Multicast receive code */ if (ipv6_addr_is_multicast(daddr)) - return __udp6_lib_mcast_deliver(skb, saddr, daddr, udptable); + return __udp6_lib_mcast_deliver(net, skb, + saddr, daddr, udptable); /* Unicast */ @@ -480,48 +886,66 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], * check socket cache ... must talk to Alan about his plans * for sock caches... i'll skip this for now. */ - sk = __udp6_lib_lookup(skb->dev->nd_net, saddr, uh->source, - daddr, uh->dest, inet6_iif(skb), udptable); - - if (sk == NULL) { - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard; + sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + if (sk != NULL) { + int ret; + + if (!uh->check && !udp_sk(sk)->no_check6_rx) { + sock_put(sk); + udp6_csum_zero_error(skb); + goto csum_error; + } - if (udp_lib_checksum_complete(skb)) - goto discard; - UDP6_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + ret = udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; - kfree_skb(skb); return 0; } - /* deliver */ + if (!uh->check) { + udp6_csum_zero_error(skb); + goto csum_error; + } - bh_lock_sock_nested(sk); - if (!sock_owned_by_user(sk)) - udpv6_queue_rcv_skb(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + if (udp_lib_checksum_complete(skb)) + goto csum_error; + + UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: %d/%u\n", + LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", proto == IPPROTO_UDPLITE ? "-Lite" : "", - ulen, skb->len); - + saddr, + ntohs(uh->source), + ulen, + skb->len, + daddr, + ntohs(uh->dest)); + goto discard; +csum_error: + UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: - UDP6_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } static __inline__ int udpv6_rcv(struct sk_buff *skb) { - return __udp6_lib_rcv(skb, udp_hash, IPPROTO_UDP); + return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); } /* @@ -531,13 +955,56 @@ static void udp_v6_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); - if (up->pending) { + if (up->pending == AF_INET) + udp_flush_pending_frames(sk); + else if (up->pending) { up->len = 0; up->pending = 0; ip6_flush_pending_frames(sk); } } +/** + * udp6_hwcsum_outgoing - handle outgoing HW checksumming + * @sk: socket we are sending on + * @skb: sk_buff containing the filled-in UDP header + * (checksum field must be zeroed out) + */ +static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + unsigned int offset; + struct udphdr *uh = udp_hdr(skb); + __wsum csum = 0; + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* Only one fragment on the socket. */ + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0); + } else { + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together + */ + offset = skb_transport_offset(skb); + skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + + skb->ip_summed = CHECKSUM_NONE; + + skb_queue_walk(&sk->sk_write_queue, skb) { + csum = csum_add(csum, skb->csum); + } + + uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, + csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + /* * Sending */ @@ -548,11 +1015,16 @@ static int udp_v6_push_pending_frames(struct sock *sk) struct udphdr *uh; struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); - struct flowi *fl = &inet->cork.fl; + struct flowi6 *fl6; int err = 0; int is_udplite = IS_UDPLITE(sk); __wsum csum = 0; + if (up->pending == AF_INET) + return udp_push_pending_frames(sk); + + fl6 = &inet->cork.fl.u.ip6; + /* Grab the skbuff where UDP header space exists. */ if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) goto out; @@ -561,28 +1033,43 @@ static int udp_v6_push_pending_frames(struct sock *sk) * Create a UDP header */ uh = udp_hdr(skb); - uh->source = fl->fl_ip_sport; - uh->dest = fl->fl_ip_dport; + uh->source = fl6->fl6_sport; + uh->dest = fl6->fl6_dport; uh->len = htons(up->len); uh->check = 0; if (is_udplite) csum = udplite_csum_outgoing(sk, skb); - else + else if (up->no_check6_tx) { /* UDP csum disabled */ + skb->ip_summed = CHECKSUM_NONE; + goto send; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ + udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, + up->len); + goto send; + } else csum = udp_csum_outgoing(sk, skb); /* add protocol-dependent pseudo-header */ - uh->check = csum_ipv6_magic(&fl->fl6_src, &fl->fl6_dst, - up->len, fl->proto, csum ); + uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + up->len, fl6->flowi6_proto, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; +send: err = ip6_push_pending_frames(sk); + if (err) { + if (err == -ENOBUFS && !inet6_sk(sk)->recverr) { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + err = 0; + } + } else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); out: up->len = 0; up->pending = 0; - if (!err) - UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -593,16 +1080,17 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct in6_addr *daddr, *final_p = NULL, final; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); + struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; - struct flowi fl; + struct flowi6 fl6; struct dst_entry *dst; int addr_len = msg->msg_namelen; int ulen = len; int hlimit = -1; int tclass = -1; + int dontfrag = -1; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; int connected = 0; @@ -633,7 +1121,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, } else if (!up->pending) { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - daddr = &np->daddr; + daddr = &sk->sk_v6_daddr; } else daddr = NULL; @@ -641,7 +1129,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (ipv6_addr_v4mapped(daddr)) { struct sockaddr_in sin; sin.sin_family = AF_INET; - sin.sin_port = sin6 ? sin6->sin6_port : inet->dport; + sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport; sin.sin_addr.s_addr = daddr->s6_addr32[3]; msg->msg_name = &sin; msg->msg_namelen = sizeof(sin); @@ -679,22 +1167,21 @@ do_udp_sendmsg: } ulen += sizeof(struct udphdr); - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (sin6) { if (sin6->sin6_port == 0) return -EINVAL; - fl.fl_ip_dport = sin6->sin6_port; + fl6.fl6_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } @@ -703,38 +1190,44 @@ do_udp_sendmsg: * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && - ipv6_addr_equal(daddr, &np->daddr)) - daddr = &np->daddr; + ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) + daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && - ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl.oif = sin6->sin6_scope_id; + __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) + fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - fl.fl_ip_dport = inet->dport; - daddr = &np->daddr; - fl.fl6_flowlabel = np->flow_label; + fl6.fl6_dport = inet->inet_dport; + daddr = &sk->sk_v6_daddr; + fl6.flowlabel = np->flow_label; connected = 1; } - if (!fl.oif) - fl.oif = sk->sk_bound_dev_if; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = sk->sk_bound_dev_if; + + if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + + fl6.flowi6_mark = sk->sk_mark; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); - err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, + &hlimit, &tclass, &dontfrag); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -748,57 +1241,39 @@ do_udp_sendmsg: opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); - fl.proto = sk->sk_protocol; - ipv6_addr_copy(&fl.fl6_dst, daddr); - if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl_ip_sport = inet->sport; - - /* merge ip6_build_xmit from ip6_output */ - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; + fl6.flowi6_proto = sk->sk_protocol; + if (!ipv6_addr_any(daddr)) + fl6.daddr = *daddr; + else + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + fl6.saddr = np->saddr; + fl6.fl6_sport = inet->inet_sport; + + final_p = fl6_update_dst(&fl6, opt, &final); + if (final_p) connected = 0; - } - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) { - fl.oif = np->mcast_oif; + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) { + fl6.flowi6_oif = np->mcast_oif; connected = 0; - } + } else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, &fl); + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - err = ip6_sk_dst_lookup(sk, &dst, &fl); - if (err) + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto out; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT)) < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto out; } - if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = dst_metric(dst, RTAX_HOPLIMIT); - if (hlimit < 0) - hlimit = ipv6_get_hoplimit(dst->dev); - } + if (hlimit < 0) + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (tclass < 0) { + if (tclass < 0) tclass = np->tclass; - if (tclass < 0) - tclass = 0; - } if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -818,12 +1293,14 @@ back_from_confirm: up->pending = AF_INET6; do_append_data: + if (dontfrag < 0) + dontfrag = np->dontfrag; up->len += ulen; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, &fl, + sizeof(struct udphdr), hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) @@ -834,22 +1311,24 @@ do_append_data: if (dst) { if (connected) { ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? - &np->daddr : NULL, + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? + &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl.fl6_src, &np->saddr) ? + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? &np->saddr : #endif NULL); } else { dst_release(dst); } + dst = NULL; } if (err > 0) err = np->recverr ? net_xmit_errno(err) : 0; release_sock(sk); out: + dst_release(dst); fl6_sock_release(flowlabel); if (!err) return len; @@ -861,7 +1340,8 @@ out: * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP6_INC_STATS_USER(UDP_MIB_SNDBUFERRORS, is_udplite); + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; @@ -873,22 +1353,28 @@ do_confirm: goto out; } -int udpv6_destroy_sock(struct sock *sk) +void udpv6_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); lock_sock(sk); udp_v6_flush_pending_frames(sk); release_sock(sk); - inet6_destroy_sock(sk); + if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } - return 0; + inet6_destroy_sock(sk); } /* * Socket option code for UDP */ int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_setsockopt(sk, level, optname, optval, optlen, @@ -898,7 +1384,7 @@ int udpv6_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_COMPAT int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_setsockopt(sk, level, optname, optval, optlen, @@ -925,7 +1411,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif -static struct inet6_protocol udpv6_protocol = { +static const struct inet6_protocol udpv6_protocol = { .handler = udpv6_rcv, .err_handler = udpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, @@ -933,72 +1419,60 @@ static struct inet6_protocol udpv6_protocol = { /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS - -static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket) -{ - struct inet_sock *inet = inet_sk(sp); - struct ipv6_pinfo *np = inet6_sk(sp); - struct in6_addr *dest, *src; - __u16 destp, srcp; - - dest = &np->daddr; - src = &np->rcv_saddr; - destp = ntohs(inet->dport); - srcp = ntohs(inet->sport); - seq_printf(seq, - "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n", - bucket, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], srcp, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - atomic_read(&sp->sk_wmem_alloc), - atomic_read(&sp->sk_rmem_alloc), - 0, 0L, 0, - sock_i_uid(sp), 0, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp); -} - int udp6_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) - seq_printf(seq, - " sl " - "local_address " - "remote_address " - "st tx_queue rx_queue tr tm->when retrnsmt" - " uid timeout inode\n"); - else - udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket); + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + int bucket = ((struct udp_iter_state *)seq->private)->bucket; + struct inet_sock *inet = inet_sk(v); + __u16 srcp = ntohs(inet->inet_sport); + __u16 destp = ntohs(inet->inet_dport); + ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + } return 0; } -static struct file_operations udp6_seq_fops; +static const struct file_operations udp6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct udp_seq_afinfo udp6_seq_afinfo = { - .owner = THIS_MODULE, .name = "udp6", .family = AF_INET6, - .hashtable = udp_hash, - .seq_show = udp6_seq_show, - .seq_fops = &udp6_seq_fops, + .udp_table = &udp_table, + .seq_fops = &udp6_afinfo_seq_fops, + .seq_ops = { + .show = udp6_seq_show, + }, }; -int __init udp6_proc_init(void) +int __net_init udp6_proc_init(struct net *net) { - return udp_proc_register(&udp6_seq_afinfo); + return udp_proc_register(net, &udp6_seq_afinfo); } -void udp6_proc_exit(void) { - udp_proc_unregister(&udp6_seq_afinfo); +void udp6_proc_exit(struct net *net) { + udp_proc_unregister(net, &udp6_seq_afinfo); } #endif /* CONFIG_PROC_FS */ -/* ------------------------------------------------------------------------ */ +void udp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); -DEFINE_PROTO_INUSE(udpv6) + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} + +/* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { .name = "UDPv6", @@ -1012,20 +1486,23 @@ struct proto udpv6_prot = { .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, - .backlog_rcv = udpv6_queue_rcv_skb, + .backlog_rcv = __udpv6_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - REF_PROTO_INUSE(udpv6) + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udpv6_protosw = { @@ -1033,8 +1510,6 @@ static struct inet_protosw udpv6_protosw = { .protocol = IPPROTO_UDP, .prot = &udpv6_prot, .ops = &inet6_dgram_ops, - .capability =-1, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 21be3a83e7b..c779c3c90b9 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -7,29 +7,32 @@ #include <net/inet_common.h> #include <net/transp_v6.h> -extern int __udp6_lib_rcv(struct sk_buff *, struct hlist_head [], int ); -extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, - int , int , int , __be32 , struct hlist_head []); +int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); +void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, + __be32, struct udp_table *); -extern int udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -extern int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen); +int udp_v6_get_port(struct sock *sk, unsigned short snum); + +int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); #ifdef CONFIG_COMPAT -extern int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen); -extern int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #endif -extern int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len); -extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, - int noblock, int flags, int *addr_len); -extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); -extern int udpv6_destroy_sock(struct sock *sk); +int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len); +int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len); +int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +void udpv6_destroy_sock(struct sock *sk); + +void udp_v6_clear_sk(struct sock *sk, int size); #ifdef CONFIG_PROC_FS -extern int udp6_seq_show(struct seq_file *seq, void *v); +int udp6_seq_show(struct seq_file *seq, void *v); #endif #endif /* _UDP6_IMPL_H */ diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c new file mode 100644 index 00000000000..0ae3d98f83e --- /dev/null +++ b/net/ipv6/udp_offload.c @@ -0,0 +1,140 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * UDPv6 GSO support + */ +#include <linux/skbuff.h> +#include <net/protocol.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> +#include "ip6_offload.h" + +static int udp6_ufo_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + if (likely(!skb->encapsulation)) { + ipv6h = ipv6_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } + + return 0; +} + +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *packet_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + int offset; + __wsum csum; + int tnl_hlen; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPIP | + SKB_GSO_SIT | + SKB_GSO_MPLS) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + if (skb->encapsulation && skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) + segs = skb_udp_tunnel_segment(skb, features); + else { + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Check if there is enough headroom to insert fragment header. */ + tnl_hlen = skb_tnl_header_len(skb); + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + fptr->identification = skb_shinfo(skb)->ip6_frag_id; + + /* Fragment the skb. ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + } + +out: + return segs; +} +static const struct net_offload udpv6_offload = { + .callbacks = { + .gso_send_check = udp6_ufo_send_check, + .gso_segment = udp6_ufo_fragment, + }, +}; + +int __init udp_offload_init(void) +{ + return inet6_add_offload(&udpv6_offload, IPPROTO_UDP); +} diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 87d4202522e..9cf097e206e 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -2,8 +2,6 @@ * UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6. * See also net/ipv4/udplite.c * - * Version: $Id: udplite.c,v 1.9 2006/10/19 08:28:10 gerrit Exp $ - * * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> * * Changes: @@ -13,35 +11,27 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include <linux/export.h> #include "udp_impl.h" -DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6) __read_mostly; - static int udplitev6_rcv(struct sk_buff *skb) { - return __udp6_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); + return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } static void udplitev6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { - __udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash); + __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); } -static struct inet6_protocol udplitev6_protocol = { +static const struct inet6_protocol udplitev6_protocol = { .handler = udplitev6_rcv, .err_handler = udplitev6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; -static int udplite_v6_get_port(struct sock *sk, unsigned short snum) -{ - return udplite_get_port(sk, snum, ipv6_rcv_saddr_equal); -} - -DEFINE_PROTO_INUSE(udplitev6) - struct proto udplitev6_prot = { .name = "UDPLITEv6", .owner = THIS_MODULE, @@ -58,13 +48,15 @@ struct proto udplitev6_prot = { .backlog_rcv = udpv6_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, - .get_port = udplite_v6_get_port, + .get_port = udp_v6_get_port, .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udplite_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - REF_PROTO_INUSE(udplitev6) + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udplite6_protosw = { @@ -72,8 +64,6 @@ static struct inet_protosw udplite6_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplitev6_prot, .ops = &inet6_dgram_ops, - .capability = -1, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT, }; @@ -103,23 +93,47 @@ void udplitev6_exit(void) } #ifdef CONFIG_PROC_FS -static struct file_operations udplite6_seq_fops; + +static const struct file_operations udplite6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct udp_seq_afinfo udplite6_seq_afinfo = { - .owner = THIS_MODULE, .name = "udplite6", .family = AF_INET6, - .hashtable = udplite_hash, - .seq_show = udp6_seq_show, - .seq_fops = &udplite6_seq_fops, + .udp_table = &udplite_table, + .seq_fops = &udplite6_afinfo_seq_fops, + .seq_ops = { + .show = udp6_seq_show, + }, +}; + +static int __net_init udplite6_proc_init_net(struct net *net) +{ + return udp_proc_register(net, &udplite6_seq_afinfo); +} + +static void __net_exit udplite6_proc_exit_net(struct net *net) +{ + udp_proc_unregister(net, &udplite6_seq_afinfo); +} + +static struct pernet_operations udplite6_net_ops = { + .init = udplite6_proc_init_net, + .exit = udplite6_proc_exit_net, }; int __init udplite6_proc_init(void) { - return udp_proc_register(&udplite6_seq_afinfo); + return register_pernet_subsys(&udplite6_net_ops); } void udplite6_proc_exit(void) { - udp_proc_unregister(&udplite6_seq_afinfo); + unregister_pernet_subsys(&udplite6_net_ops); } #endif diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index a4714d76ae6..f8c3cf842f5 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -42,7 +42,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) ipv6_hdr(skb)->payload_len = htons(skb->len); __skb_push(skb, skb->data - skb_network_header(skb)); - NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, ip6_rcv_finish); return -1; } @@ -58,10 +58,8 @@ EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { + struct net *net = dev_net(skb->dev); struct xfrm_state *x = NULL; - int wildcard = 0; - xfrm_address_t *xany; - int nh = 0; int i = 0; /* Allocate new secpath or COW existing one. */ @@ -70,7 +68,7 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, sp = secpath_dup(skb->sp); if (!sp) { - XFRM_INC_STATS(LINUX_MIB_XFRMINERROR); + XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } if (skb->sp) @@ -79,14 +77,13 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, } if (1 + skb->sp->len == XFRM_MAX_DEPTH) { - XFRM_INC_STATS(LINUX_MIB_XFRMINBUFFERERROR); + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } - xany = (xfrm_address_t *)&in6addr_any; - for (i = 0; i < 3; i++) { xfrm_address_t *dst, *src; + switch (i) { case 0: dst = daddr; @@ -94,62 +91,39 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, break; case 1: /* lookup state with wild-card source address */ - wildcard = 1; dst = daddr; - src = xany; + src = (xfrm_address_t *)&in6addr_any; break; - case 2: default: /* lookup state with wild-card addresses */ - wildcard = 1; /* XXX */ - dst = xany; - src = xany; + dst = (xfrm_address_t *)&in6addr_any; + src = (xfrm_address_t *)&in6addr_any; break; } - x = xfrm_state_lookup_byaddr(dst, src, proto, AF_INET6); + x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6); if (!x) continue; spin_lock(&x->lock); - if (wildcard) { - if ((x->props.flags & XFRM_STATE_WILDRECV) == 0) { - spin_unlock(&x->lock); - xfrm_state_put(x); - x = NULL; - continue; - } - } - - if (unlikely(x->km.state != XFRM_STATE_VALID)) { + if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && + likely(x->km.state == XFRM_STATE_VALID) && + !xfrm_state_check_expire(x)) { spin_unlock(&x->lock); - xfrm_state_put(x); - x = NULL; - continue; - } - if (xfrm_state_check_expire(x)) { + if (x->type->input(x, skb) > 0) { + /* found a valid state */ + break; + } + } else spin_unlock(&x->lock); - xfrm_state_put(x); - x = NULL; - continue; - } - - spin_unlock(&x->lock); - - nh = x->type->input(x, skb); - if (nh <= 0) { - xfrm_state_put(x); - x = NULL; - continue; - } - /* Found a state */ - break; + xfrm_state_put(x); + x = NULL; } if (!x) { - XFRM_INC_STATS(LINUX_MIB_XFRMINNOSTATES); + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound_simple(skb, AF_INET6); goto drop; } diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index d6ce400f585..9949a356d62 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -40,26 +40,46 @@ static void xfrm6_beet_make_header(struct sk_buff *skb) static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *top_iph; + struct ip_beet_phdr *ph; + int optlen, hdr_len; - skb_set_network_header(skb, -x->props.header_len); + hdr_len = 0; + optlen = XFRM_MODE_SKB_CB(skb)->optlen; + if (unlikely(optlen)) + hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4); + + skb_set_network_header(skb, -x->props.header_len - hdr_len); + if (x->sel.family != AF_INET6) + skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); skb->transport_header = skb->network_header + sizeof(*top_iph); - __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl); + ph = (struct ip_beet_phdr *)__skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl-hdr_len); xfrm6_beet_make_header(skb); top_iph = ipv6_hdr(skb); + if (unlikely(optlen)) { + + BUG_ON(optlen < 0); + + ph->padlen = 4 - (optlen & 4); + ph->hdrlen = optlen / 8; + ph->nexthdr = top_iph->nexthdr; + if (ph->padlen) + memset(ph + 1, IPOPT_NOP, ph->padlen); - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + top_iph->nexthdr = IPPROTO_BEETPH; + } + + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; - const unsigned char *old_mac; int size = sizeof(struct ipv6hdr); int err; @@ -69,17 +89,14 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) __skb_push(skb, size); skb_reset_network_header(skb); - - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); - ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); - ipv6_addr_copy(&ip6h->saddr, (struct in6_addr *) &x->sel.saddr.a6); + ip6h->daddr = *(struct in6_addr *)&x->sel.daddr.a6; + ip6h->saddr = *(struct in6_addr *)&x->sel.saddr.a6; err = 0; out: return err; diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index 63d5d493098..0e015906f9c 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -15,8 +15,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ /* * Authors: diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index e20529b4c82..901ef6f8add 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -5,6 +5,7 @@ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -13,12 +14,13 @@ #include <net/dsfield.h> #include <net/dst.h> #include <net/inet_ecn.h> +#include <net/ip6_route.h> #include <net/ipv6.h> #include <net/xfrm.h> static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) { - struct ipv6hdr *outer_iph = ipv6_hdr(skb); + const struct ipv6hdr *outer_iph = ipv6_hdr(skb); struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) @@ -31,7 +33,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) */ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *top_iph; int dsfield; @@ -45,31 +47,39 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, sizeof(top_iph->flow_lbl)); - top_iph->nexthdr = xfrm_af2proto(skb->dst->ops->family); + top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); - dsfield = XFRM_MODE_SKB_CB(skb)->tos; - dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) + dsfield = 0; + else + dsfield = XFRM_MODE_SKB_CB(skb)->tos; + dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos); if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } +#define for_each_input_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) + + static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; - const unsigned char *old_mac; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) goto out; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; - if (skb_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_unclone(skb, GFP_ATOMIC); + if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) @@ -78,10 +88,9 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 0af823cf7f1..433672d07d0 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -17,6 +17,7 @@ #include <linux/netfilter_ipv6.h> #include <net/dst.h> #include <net/ipv6.h> +#include <net/ip6_route.h> #include <net/xfrm.h> int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, @@ -27,18 +28,65 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, EXPORT_SYMBOL(xfrm6_find_1stfragopt); +static int xfrm6_local_dontfrag(struct sk_buff *skb) +{ + int proto; + struct sock *sk = skb->sk; + + if (sk) { + if (sk->sk_family != AF_INET6) + return 0; + + proto = sk->sk_protocol; + if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) + return inet6_sk(sk)->dontfrag; + } + + return 0; +} + +static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) +{ + struct flowi6 fl6; + struct sock *sk = skb->sk; + + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.daddr = ipv6_hdr(skb)->daddr; + + ipv6_local_rxpmtu(sk, &fl6, mtu); +} + +void xfrm6_local_error(struct sk_buff *skb, u32 mtu) +{ + struct flowi6 fl6; + const struct ipv6hdr *hdr; + struct sock *sk = skb->sk; + + hdr = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb); + fl6.fl6_dport = inet_sk(sk)->inet_dport; + fl6.daddr = hdr->daddr; + + ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); +} + static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->local_df && skb->len > mtu) { + if (!skb->ignore_df && skb->len > mtu) { skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + + if (xfrm6_local_dontfrag(skb)) + xfrm6_local_rxpmtu(skb, mtu); + else if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; } @@ -66,29 +114,61 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); -#ifdef CONFIG_NETFILTER - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif - - skb->protocol = htons(ETH_P_IPV6); + skb->ignore_df = 1; return x->outer_mode->output2(x, skb); } EXPORT_SYMBOL(xfrm6_prepare_output); -static int xfrm6_output_finish(struct sk_buff *skb) +int xfrm6_output_finish(struct sk_buff *skb) { + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + skb->protocol = htons(ETH_P_IPV6); + #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IPV6); return xfrm_output(skb); } -int xfrm6_output(struct sk_buff *skb) +static int __xfrm6_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + int mtu; + +#ifdef CONFIG_NETFILTER + if (!x) { + IP6CB(skb)->flags |= IP6SKB_REROUTED; + return dst_output(skb); + } +#endif + + if (skb->protocol == htons(ETH_P_IPV6)) + mtu = ip6_skb_dst_mtu(skb); + else + mtu = dst_mtu(skb_dst(skb)); + + if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { + xfrm6_local_rxpmtu(skb, mtu); + return -EMSGSIZE; + } else if (!skb->ignore_df && skb->len > mtu && skb->sk) { + xfrm_local_error(skb, mtu); + return -EMSGSIZE; + } + + if (x->props.mode == XFRM_MODE_TUNNEL && + ((skb->len > mtu && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb)))) { + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); + } + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm6_output(struct sock *sk, struct sk_buff *skb) { - return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dst->dev, - xfrm6_output_finish); + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, + NULL, skb_dst(skb)->dev, __xfrm6_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 7d20199ee1f..2a0bbda2c76 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -20,25 +20,26 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif -static struct dst_ops xfrm6_dst_ops; static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static struct dst_entry *xfrm6_dst_lookup(int tos, xfrm_address_t *saddr, - xfrm_address_t *daddr) +static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) { - struct flowi fl = {}; + struct flowi6 fl6; struct dst_entry *dst; int err; - memcpy(&fl.fl6_dst, daddr, sizeof(fl.fl6_dst)); + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) - memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src)); + memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); - dst = ip6_route_output(NULL, &fl); + dst = ip6_route_output(net, NULL, &fl6); err = dst->error; if (dst->error) { @@ -49,53 +50,34 @@ static struct dst_entry *xfrm6_dst_lookup(int tos, xfrm_address_t *saddr, return dst; } -static int xfrm6_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) +static int xfrm6_get_saddr(struct net *net, + xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; + struct net_device *dev; - dst = xfrm6_dst_lookup(0, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; - ipv6_get_saddr(dst, (struct in6_addr *)&daddr->a6, - (struct in6_addr *)&saddr->a6); + dev = ip6_dst_idev(dst)->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, + (struct in6_addr *)&daddr->a6, 0, + (struct in6_addr *)&saddr->a6); dst_release(dst); return 0; } -static struct dst_entry * -__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +static int xfrm6_get_tos(const struct flowi *fl) { - struct dst_entry *dst; - - /* Still not clear if we should set fl->fl6_{src,dst}... */ - read_lock_bh(&policy->lock); - for (dst = policy->bundles; dst; dst = dst->next) { - struct xfrm_dst *xdst = (struct xfrm_dst*)dst; - struct in6_addr fl_dst_prefix, fl_src_prefix; - - ipv6_addr_prefix(&fl_dst_prefix, - &fl->fl6_dst, - xdst->u.rt6.rt6i_dst.plen); - ipv6_addr_prefix(&fl_src_prefix, - &fl->fl6_src, - xdst->u.rt6.rt6i_src.plen); - if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && - ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && - xfrm_bundle_ok(policy, xdst, fl, AF_INET6, - (xdst->u.rt6.rt6i_dst.plen != 128 || - xdst->u.rt6.rt6i_src.plen != 128))) { - dst_clone(dst); - break; - } - } - read_unlock_bh(&policy->lock); - return dst; + return 0; } -static int xfrm6_get_tos(struct flowi *fl) +static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) { - return 0; + struct rt6_info *rt = (struct rt6_info *)xdst; + + rt6_init_peer(rt, net->ipv6.peers); } static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, @@ -112,16 +94,21 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, return 0; } -static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) +static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + const struct flowi *fl) { struct rt6_info *rt = (struct rt6_info*)xdst->route; xdst->u.dst.dev = dev; dev_hold(dev); - xdst->u.rt6.rt6i_idev = in6_dev_get(rt->u.dst.dev); - if (!xdst->u.rt6.rt6i_idev) + xdst->u.rt6.rt6i_idev = in6_dev_get(dev); + if (!xdst->u.rt6.rt6i_idev) { + dev_put(dev); return -ENODEV; + } + + rt6_transfer_peer(&xdst->u.rt6, rt); /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ @@ -141,21 +128,33 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) static inline void _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) { + struct flowi6 *fl6 = &fl->u.ip6; + int onlyproto = 0; u16 offset = skb_network_header_len(skb); - struct ipv6hdr *hdr = ipv6_hdr(skb); + const struct ipv6hdr *hdr = ipv6_hdr(skb); struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u8 nexthdr = nh[IP6CB(skb)->nhoff]; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; + + memset(fl6, 0, sizeof(struct flowi6)); + fl6->flowi6_mark = skb->mark; + fl6->flowi6_oif = reverse ? skb->skb_iif : oif; - memset(fl, 0, sizeof(struct flowi)); - ipv6_addr_copy(&fl->fl6_dst, reverse ? &hdr->saddr : &hdr->daddr); - ipv6_addr_copy(&fl->fl6_src, reverse ? &hdr->daddr : &hdr->saddr); + fl6->daddr = reverse ? hdr->saddr : hdr->daddr; + fl6->saddr = reverse ? hdr->daddr : hdr->saddr; - while (pskb_may_pull(skb, nh + offset + 1 - skb->data)) { + while (nh + offset + 1 < skb->data || + pskb_may_pull(skb, nh + offset + 1 - skb->data)) { nh = skb_network_header(skb); exthdr = (struct ipv6_opt_hdr *)(nh + offset); switch (nexthdr) { + case NEXTHDR_FRAGMENT: + onlyproto = 1; case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: @@ -169,34 +168,35 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: - if (pskb_may_pull(skb, nh + offset + 4 - skb->data)) { + if (!onlyproto && (nh + offset + 4 < skb->data || + pskb_may_pull(skb, nh + offset + 4 - skb->data))) { __be16 *ports = (__be16 *)exthdr; - fl->fl_ip_sport = ports[!!reverse]; - fl->fl_ip_dport = ports[!reverse]; + fl6->fl6_sport = ports[!!reverse]; + fl6->fl6_dport = ports[!reverse]; } - fl->proto = nexthdr; + fl6->flowi6_proto = nexthdr; return; case IPPROTO_ICMPV6: - if (pskb_may_pull(skb, nh + offset + 2 - skb->data)) { + if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) { u8 *icmp = (u8 *)exthdr; - fl->fl_icmp_type = icmp[0]; - fl->fl_icmp_code = icmp[1]; + fl6->fl6_icmp_type = icmp[0]; + fl6->fl6_icmp_code = icmp[1]; } - fl->proto = nexthdr; + fl6->flowi6_proto = nexthdr; return; -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: - if (pskb_may_pull(skb, nh + offset + 3 - skb->data)) { + if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { struct ip6_mh *mh; mh = (struct ip6_mh *)exthdr; - fl->fl_mh_type = mh->ip6mh_type; + fl6->fl6_mh_type = mh->ip6mh_type; } - fl->proto = nexthdr; + fl6->flowi6_proto = nexthdr; return; #endif @@ -205,8 +205,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) case IPPROTO_ESP: case IPPROTO_COMP: default: - fl->fl_ipsec_spi = 0; - fl->proto = nexthdr; + fl6->fl6_ipsec_spi = 0; + fl6->flowi6_proto = nexthdr; return; } } @@ -214,16 +214,28 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) static inline int xfrm6_garbage_collect(struct dst_ops *ops) { - xfrm6_policy_afinfo.garbage_collect(); - return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); + struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); + + xfrm6_policy_afinfo.garbage_collect(net); + return dst_entries_get_fast(ops) > ops->gc_thresh * 2; +} + +static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, sk, skb, mtu); } -static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, mtu); + path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) @@ -232,6 +244,11 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); + dst_destroy_metrics_generic(dst); + if (rt6_has_peer(&xdst->u.rt6)) { + struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6); + inet_putpeer(peer); + } xfrm_dst_destroy(xdst); } @@ -246,7 +263,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = - in6_dev_get(dev->nd_net->loopback_dev); + in6_dev_get(dev_net(dev)->loopback_dev); BUG_ON(!loopback_idev); do { @@ -264,15 +281,15 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm6_dst_ops = { .family = AF_INET6, - .protocol = __constant_htons(ETH_P_IPV6), + .protocol = cpu_to_be16(ETH_P_IPV6), .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, + .redirect = xfrm6_redirect, + .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = 1024, - .entry_size = sizeof(struct xfrm_dst), - .entries = ATOMIC_INIT(0), + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { @@ -280,11 +297,12 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, - .find_bundle = __xfrm6_find_bundle, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, + .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, + .blackhole_route = ip6_blackhole_route, }; static int __init xfrm6_policy_init(void) @@ -297,19 +315,91 @@ static void xfrm6_policy_fini(void) xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } +#ifdef CONFIG_SYSCTL +static struct ctl_table xfrm6_policy_table[] = { + { + .procname = "xfrm6_gc_thresh", + .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static int __net_init xfrm6_net_init(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = xfrm6_policy_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; + } + + hdr = register_net_sysctl(net, "net/ipv6", table); + if (!hdr) + goto err_reg; + + net->ipv6.sysctl.xfrm6_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit xfrm6_net_exit(struct net *net) +{ + struct ctl_table *table; + + if (net->ipv6.sysctl.xfrm6_hdr == NULL) + return; + + table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations xfrm6_net_ops = { + .init = xfrm6_net_init, + .exit = xfrm6_net_exit, +}; +#endif + int __init xfrm6_init(void) { int ret; + dst_entries_init(&xfrm6_dst_ops); + ret = xfrm6_policy_init(); - if (ret) + if (ret) { + dst_entries_destroy(&xfrm6_dst_ops); goto out; - + } ret = xfrm6_state_init(); if (ret) goto out_policy; + + ret = xfrm6_protocol_init(); + if (ret) + goto out_state; + +#ifdef CONFIG_SYSCTL + register_pernet_subsys(&xfrm6_net_ops); +#endif out: return ret; +out_state: + xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; @@ -317,7 +407,11 @@ out_policy: void xfrm6_fini(void) { - //xfrm6_input_fini(); +#ifdef CONFIG_SYSCTL + unregister_pernet_subsys(&xfrm6_net_ops); +#endif + xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); + dst_entries_destroy(&xfrm6_dst_ops); } diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c new file mode 100644 index 00000000000..54d13f8dbba --- /dev/null +++ b/net/ipv6/xfrm6_protocol.c @@ -0,0 +1,279 @@ +/* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/xfrm4_protocol.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <linux/icmpv6.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly; +static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly; +static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly; +static DEFINE_MUTEX(xfrm6_protocol_mutex); + +static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp6_handlers; + case IPPROTO_AH: + return &ah6_handlers; + case IPPROTO_COMP: + return &ipcomp6_handlers; + } + + return NULL; +} + +#define for_each_protocol_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ + int ret; + struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; + + for_each_protocol_rcu(*proto_handlers(protocol), handler) + if ((ret = handler->cb_handler(skb, err)) <= 0) + return ret; + + return 0; +} +EXPORT_SYMBOL(xfrm6_rcv_cb); + +static int xfrm6_esp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(esp6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(esp6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static int xfrm6_ah_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(ah6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(ah6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static int xfrm6_ipcomp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(ipcomp6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(ipcomp6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static const struct inet6_protocol esp6_protocol = { + .handler = xfrm6_esp_rcv, + .err_handler = xfrm6_esp_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static const struct inet6_protocol ah6_protocol = { + .handler = xfrm6_ah_rcv, + .err_handler = xfrm6_ah_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static const struct inet6_protocol ipcomp6_protocol = { + .handler = xfrm6_ipcomp_rcv, + .err_handler = xfrm6_ipcomp_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static struct xfrm_input_afinfo xfrm6_input_afinfo = { + .family = AF_INET6, + .owner = THIS_MODULE, + .callback = xfrm6_rcv_cb, +}; + +static inline const struct inet6_protocol *netproto(unsigned char protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp6_protocol; + case IPPROTO_AH: + return &ah6_protocol; + case IPPROTO_COMP: + return &ipcomp6_protocol; + } + + return NULL; +} + +int xfrm6_protocol_register(struct xfrm6_protocol *handler, + unsigned char protocol) +{ + struct xfrm6_protocol __rcu **pprev; + struct xfrm6_protocol *t; + bool add_netproto = false; + int ret = -EEXIST; + int priority = handler->priority; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm6_protocol_mutex); + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm6_protocol_mutex))) + add_netproto = true; + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t->priority < priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm6_protocol_mutex); + + if (add_netproto) { + if (inet6_add_protocol(netproto(protocol), protocol)) { + pr_err("%s: can't add protocol\n", __func__); + ret = -EAGAIN; + } + } + + return ret; +} +EXPORT_SYMBOL(xfrm6_protocol_register); + +int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, + unsigned char protocol) +{ + struct xfrm6_protocol __rcu **pprev; + struct xfrm6_protocol *t; + int ret = -ENOENT; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm6_protocol_mutex); + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm6_protocol_mutex))) { + if (inet6_del_protocol(netproto(protocol), protocol) < 0) { + pr_err("%s: can't remove protocol\n", __func__); + ret = -EAGAIN; + } + } + + mutex_unlock(&xfrm6_protocol_mutex); + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(xfrm6_protocol_deregister); + +int __init xfrm6_protocol_init(void) +{ + return xfrm_input_register_afinfo(&xfrm6_input_afinfo); +} + +void xfrm6_protocol_fini(void) +{ + xfrm_input_unregister_afinfo(&xfrm6_input_afinfo); +} diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index ff1e1db8e23..3fc970135fc 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -15,29 +15,35 @@ #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/netfilter_ipv6.h> +#include <linux/export.h> #include <net/dsfield.h> #include <net/ipv6.h> #include <net/addrconf.h> -static struct xfrm_state_afinfo xfrm6_state_afinfo; - static void -__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { + const struct flowi6 *fl6 = &fl->u.ip6; + /* Initialize temporary selector matching only * to current session. */ - ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst); - ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src); - x->sel.dport = xfrm_flowi_dport(fl); - x->sel.dport_mask = htons(0xffff); - x->sel.sport = xfrm_flowi_sport(fl); - x->sel.sport_mask = htons(0xffff); - x->sel.prefixlen_d = 128; - x->sel.prefixlen_s = 128; - x->sel.proto = fl->proto; - x->sel.ifindex = fl->oif; + *(struct in6_addr *)&sel->daddr = fl6->daddr; + *(struct in6_addr *)&sel->saddr = fl6->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl6->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl6->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET6; + sel->prefixlen_d = 128; + sel->prefixlen_s = 128; + sel->proto = fl6->flowi6_proto; + sel->ifindex = fl6->flowi6_oif; +} + +static void +xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) +{ x->id = tmpl->id; if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); @@ -49,125 +55,102 @@ __xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, x->props.family = AF_INET6; } +/* distribution counting sort function for xfrm_state and xfrm_tmpl */ static int -__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n) +__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) { int i; - int j = 0; + int class[XFRM_MAX_DEPTH]; + int count[maxclass]; - /* Rule 1: select IPsec transport except AH */ - for (i = 0; i < n; i++) { - if (src[i]->props.mode == XFRM_MODE_TRANSPORT && - src[i]->id.proto != IPPROTO_AH) { - dst[j++] = src[i]; - src[i] = NULL; - } - } - if (j == n) - goto end; + memset(count, 0, sizeof(count)); - /* Rule 2: select MIPv6 RO or inbound trigger */ -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) for (i = 0; i < n; i++) { - if (src[i] && - (src[i]->props.mode == XFRM_MODE_ROUTEOPTIMIZATION || - src[i]->props.mode == XFRM_MODE_IN_TRIGGER)) { - dst[j++] = src[i]; - src[i] = NULL; - } + int c; + class[i] = c = cmp(src[i]); + count[c]++; } - if (j == n) - goto end; -#endif - /* Rule 3: select IPsec transport AH */ - for (i = 0; i < n; i++) { - if (src[i] && - src[i]->props.mode == XFRM_MODE_TRANSPORT && - src[i]->id.proto == IPPROTO_AH) { - dst[j++] = src[i]; - src[i] = NULL; - } - } - if (j == n) - goto end; + for (i = 2; i < maxclass; i++) + count[i] += count[i - 1]; - /* Rule 4: select IPsec tunnel */ for (i = 0; i < n; i++) { - if (src[i] && - (src[i]->props.mode == XFRM_MODE_TUNNEL || - src[i]->props.mode == XFRM_MODE_BEET)) { - dst[j++] = src[i]; - src[i] = NULL; - } + dst[count[class[i] - 1]++] = src[i]; + src[i] = NULL; } - if (likely(j == n)) - goto end; - /* Final rule */ - for (i = 0; i < n; i++) { - if (src[i]) { - dst[j++] = src[i]; - src[i] = NULL; - } - } - - end: return 0; } -static int -__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) +/* + * Rule for xfrm_state: + * + * rule 1: select IPsec transport except AH + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec transport AH + * rule 4: select IPsec tunnel + * rule 5: others + */ +static int __xfrm6_state_sort_cmp(void *p) { - int i; - int j = 0; - - /* Rule 1: select IPsec transport */ - for (i = 0; i < n; i++) { - if (src[i]->mode == XFRM_MODE_TRANSPORT) { - dst[j++] = src[i]; - src[i] = NULL; - } - } - if (j == n) - goto end; - - /* Rule 2: select MIPv6 RO or inbound trigger */ -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) - for (i = 0; i < n; i++) { - if (src[i] && - (src[i]->mode == XFRM_MODE_ROUTEOPTIMIZATION || - src[i]->mode == XFRM_MODE_IN_TRIGGER)) { - dst[j++] = src[i]; - src[i] = NULL; - } - } - if (j == n) - goto end; + struct xfrm_state *v = p; + + switch (v->props.mode) { + case XFRM_MODE_TRANSPORT: + if (v->id.proto != IPPROTO_AH) + return 1; + else + return 3; +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; #endif - - /* Rule 3: select IPsec tunnel */ - for (i = 0; i < n; i++) { - if (src[i] && - (src[i]->mode == XFRM_MODE_TUNNEL || - src[i]->mode == XFRM_MODE_BEET)) { - dst[j++] = src[i]; - src[i] = NULL; - } + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 4; } - if (likely(j == n)) - goto end; + return 5; +} - /* Final rule */ - for (i = 0; i < n; i++) { - if (src[i]) { - dst[j++] = src[i]; - src[i] = NULL; - } +static int +__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n) +{ + return __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_state_sort_cmp, 6); +} + +/* + * Rule for xfrm_tmpl: + * + * rule 1: select IPsec transport + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec tunnel + * rule 4: others + */ +static int __xfrm6_tmpl_sort_cmp(void *p) +{ + struct xfrm_tmpl *v = p; + switch (v->mode) { + case XFRM_MODE_TRANSPORT: + return 1; +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 3; } + return 4; +} - end: - return 0; +static int +__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) +{ + return __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_tmpl_sort_cmp, 5); } int xfrm6_extract_header(struct sk_buff *skb) @@ -192,12 +175,15 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .eth_proto = htons(ETH_P_IPV6), .owner = THIS_MODULE, .init_tempsel = __xfrm6_init_tempsel, + .init_temprop = xfrm6_init_temprop, .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, + .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, + .local_error = xfrm6_local_error, }; int __init xfrm6_state_init(void) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 639fe8a6ff1..1c66465a42d 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -12,8 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> @@ -23,46 +22,56 @@ */ #include <linux/module.h> #include <linux/xfrm.h> -#include <linux/list.h> +#include <linux/slab.h> +#include <linux/rculist.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/mutex.h> +#include <net/netns/generic.h> + +#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 +#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 + +#define XFRM6_TUNNEL_SPI_MIN 1 +#define XFRM6_TUNNEL_SPI_MAX 0xffffffff + +struct xfrm6_tunnel_net { + struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; + struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; + u32 spi; +}; + +static int xfrm6_tunnel_net_id __read_mostly; +static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) +{ + return net_generic(net, xfrm6_tunnel_net_id); +} /* * xfrm_tunnel_spi things are for allocating unique id ("spi") * per xfrm_address_t. */ struct xfrm6_tunnel_spi { - struct hlist_node list_byaddr; - struct hlist_node list_byspi; - xfrm_address_t addr; - u32 spi; - atomic_t refcnt; + struct hlist_node list_byaddr; + struct hlist_node list_byspi; + xfrm_address_t addr; + u32 spi; + atomic_t refcnt; + struct rcu_head rcu_head; }; -static DEFINE_RWLOCK(xfrm6_tunnel_spi_lock); - -static u32 xfrm6_tunnel_spi; - -#define XFRM6_TUNNEL_SPI_MIN 1 -#define XFRM6_TUNNEL_SPI_MAX 0xffffffff +static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; -#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 -#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 - -static struct hlist_head xfrm6_tunnel_spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; -static struct hlist_head xfrm6_tunnel_spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; - -static inline unsigned xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) +static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) { - unsigned h; + unsigned int h; - h = (__force u32)(addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]); + h = ipv6_addr_hash((const struct in6_addr *)addr); h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; @@ -70,116 +79,82 @@ static inline unsigned xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) return h; } -static inline unsigned xfrm6_tunnel_spi_hash_byspi(u32 spi) +static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi) { return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; } - -static int xfrm6_tunnel_spi_init(void) -{ - int i; - - xfrm6_tunnel_spi = 0; - xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", - sizeof(struct xfrm6_tunnel_spi), - 0, SLAB_HWCACHE_ALIGN, - NULL); - if (!xfrm6_tunnel_spi_kmem) - return -ENOMEM; - - for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) - INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byaddr[i]); - for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) - INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byspi[i]); - return 0; -} - -static void xfrm6_tunnel_spi_fini(void) -{ - int i; - - for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) { - if (!hlist_empty(&xfrm6_tunnel_spi_byaddr[i])) - return; - } - for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) { - if (!hlist_empty(&xfrm6_tunnel_spi_byspi[i])) - return; - } - kmem_cache_destroy(xfrm6_tunnel_spi_kmem); - xfrm6_tunnel_spi_kmem = NULL; -} - -static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; - struct hlist_node *pos; - hlist_for_each_entry(x6spi, pos, - &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + hlist_for_each_entry_rcu(x6spi, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) + if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } return NULL; } -__be32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; - read_lock_bh(&xfrm6_tunnel_spi_lock); - x6spi = __xfrm6_tunnel_spi_lookup(saddr); + rcu_read_lock_bh(); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); spi = x6spi ? x6spi->spi : 0; - read_unlock_bh(&xfrm6_tunnel_spi_lock); + rcu_read_unlock_bh(); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); -static u32 __xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + struct xfrm6_tunnel_spi *x6spi; + int index = xfrm6_tunnel_spi_hash_byspi(spi); + + hlist_for_each_entry(x6spi, + &xfrm6_tn->spi_byspi[index], + list_byspi) { + if (x6spi->spi == spi) + return -1; + } + return index; +} + +static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); u32 spi; struct xfrm6_tunnel_spi *x6spi; - struct hlist_node *pos; - unsigned index; + int index; - if (xfrm6_tunnel_spi < XFRM6_TUNNEL_SPI_MIN || - xfrm6_tunnel_spi >= XFRM6_TUNNEL_SPI_MAX) - xfrm6_tunnel_spi = XFRM6_TUNNEL_SPI_MIN; + if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || + xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) + xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; else - xfrm6_tunnel_spi++; - - for (spi = xfrm6_tunnel_spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { - index = xfrm6_tunnel_spi_hash_byspi(spi); - hlist_for_each_entry(x6spi, pos, - &xfrm6_tunnel_spi_byspi[index], - list_byspi) { - if (x6spi->spi == spi) - goto try_next_1; - } - xfrm6_tunnel_spi = spi; - goto alloc_spi; -try_next_1:; + xfrm6_tn->spi++; + + for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); + if (index >= 0) + goto alloc_spi; } - for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tunnel_spi; spi++) { - index = xfrm6_tunnel_spi_hash_byspi(spi); - hlist_for_each_entry(x6spi, pos, - &xfrm6_tunnel_spi_byspi[index], - list_byspi) { - if (x6spi->spi == spi) - goto try_next_2; - } - xfrm6_tunnel_spi = spi; - goto alloc_spi; -try_next_2:; + for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); + if (index >= 0) + goto alloc_spi; } spi = 0; goto out; alloc_spi: + xfrm6_tn->spi = spi; x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); if (!x6spi) goto out; @@ -188,58 +163,63 @@ alloc_spi: x6spi->spi = spi; atomic_set(&x6spi->refcnt, 1); - hlist_add_head(&x6spi->list_byspi, &xfrm6_tunnel_spi_byspi[index]); + hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); index = xfrm6_tunnel_spi_hash_byaddr(saddr); - hlist_add_head(&x6spi->list_byaddr, &xfrm6_tunnel_spi_byaddr[index]); + hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); out: return spi; } -__be32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; - write_lock_bh(&xfrm6_tunnel_spi_lock); - x6spi = __xfrm6_tunnel_spi_lookup(saddr); + spin_lock_bh(&xfrm6_tunnel_spi_lock); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); if (x6spi) { atomic_inc(&x6spi->refcnt); spi = x6spi->spi; } else - spi = __xfrm6_tunnel_alloc_spi(saddr); - write_unlock_bh(&xfrm6_tunnel_spi_lock); + spi = __xfrm6_tunnel_alloc_spi(net, saddr); + spin_unlock_bh(&xfrm6_tunnel_spi_lock); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); -void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) +static void x6spi_destroy_rcu(struct rcu_head *head) { + kmem_cache_free(xfrm6_tunnel_spi_kmem, + container_of(head, struct xfrm6_tunnel_spi, rcu_head)); +} + +static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; - struct hlist_node *pos, *n; + struct hlist_node *n; - write_lock_bh(&xfrm6_tunnel_spi_lock); + spin_lock_bh(&xfrm6_tunnel_spi_lock); - hlist_for_each_entry_safe(x6spi, pos, n, - &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + hlist_for_each_entry_safe(x6spi, n, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + if (xfrm6_addr_equal(&x6spi->addr, saddr)) { if (atomic_dec_and_test(&x6spi->refcnt)) { - hlist_del(&x6spi->list_byaddr); - hlist_del(&x6spi->list_byspi); - kmem_cache_free(xfrm6_tunnel_spi_kmem, x6spi); + hlist_del_rcu(&x6spi->list_byaddr); + hlist_del_rcu(&x6spi->list_byspi); + call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu); break; } } } - write_unlock_bh(&xfrm6_tunnel_spi_lock); + spin_unlock_bh(&xfrm6_tunnel_spi_lock); } -EXPORT_SYMBOL(xfrm6_tunnel_free_spi); - static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); @@ -253,15 +233,16 @@ static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_tunnel_rcv(struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = ipv6_hdr(skb); __be32 spi; - spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi) > 0 ? : 0; + spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - int type, int code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { /* xfrm6_tunnel native err handling */ switch (type) { @@ -316,7 +297,9 @@ static int xfrm6_tunnel_init_state(struct xfrm_state *x) static void xfrm6_tunnel_destroy(struct xfrm_state *x) { - xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); + struct net *net = xs_net(x); + + xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); } static const struct xfrm_type xfrm6_tunnel_type = { @@ -329,47 +312,85 @@ static const struct xfrm_type xfrm6_tunnel_type = { .output = xfrm6_tunnel_output, }; -static struct xfrm6_tunnel xfrm6_tunnel_handler = { +static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, }; -static struct xfrm6_tunnel xfrm46_tunnel_handler = { +static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, }; +static int __net_init xfrm6_tunnel_net_init(struct net *net) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); + xfrm6_tn->spi = 0; + + return 0; +} + +static void __net_exit xfrm6_tunnel_net_exit(struct net *net) +{ +} + +static struct pernet_operations xfrm6_tunnel_net_ops = { + .init = xfrm6_tunnel_net_init, + .exit = xfrm6_tunnel_net_exit, + .id = &xfrm6_tunnel_net_id, + .size = sizeof(struct xfrm6_tunnel_net), +}; + static int __init xfrm6_tunnel_init(void) { - if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) - return -EAGAIN; + int rv; - if (xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6)) { - xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); - return -EAGAIN; - } - if (xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET)) { - xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); - xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); - return -EAGAIN; - } - if (xfrm6_tunnel_spi_init() < 0) { - xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); - xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); - xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); - return -EAGAIN; - } + xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", + sizeof(struct xfrm6_tunnel_spi), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!xfrm6_tunnel_spi_kmem) + return -ENOMEM; + rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); + if (rv < 0) + goto out_pernet; + rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); + if (rv < 0) + goto out_type; + rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); + if (rv < 0) + goto out_xfrm6; + rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); + if (rv < 0) + goto out_xfrm46; return 0; + +out_xfrm46: + xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); +out_xfrm6: + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); +out_type: + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); +out_pernet: + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); + return rv; } static void __exit xfrm6_tunnel_fini(void) { - xfrm6_tunnel_spi_fini(); xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } module_init(xfrm6_tunnel_init); |
