diff options
Diffstat (limited to 'net/ipv6')
93 files changed, 20912 insertions, 11459 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index a578096152a..438a73aa777 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -11,7 +11,7 @@ menuconfig IPV6 You will still be able to do traditional IPv4 networking as well. For general information about IPv6, see - <http://playground.sun.com/pub/ipng/html/ipng-main.html>. + <https://en.wikipedia.org/wiki/IPv6>. For Linux IPv6 development information, see <http://www.linux-ipv6.org>. For specific information about IPv6 under Linux, read the HOWTO at <http://www.bieringer.de/linux/IPv6/>. @@ -21,24 +21,6 @@ menuconfig IPV6 if IPV6 -config IPV6_PRIVACY - bool "IPv6: Privacy Extensions (RFC 3041) support" - ---help--- - Privacy Extensions for Stateless Address Autoconfiguration in IPv6 - support. With this option, additional periodically-altered - pseudo-random global-scope unicast address(es) will be assigned to - your interface(s). - - We use our standard pseudo-random algorithm to generate the - randomized interface identifier, instead of one described in RFC 3041. - - By default the kernel does not generate temporary addresses. - To use temporary addresses, do - - echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr - - See <file:Documentation/networking/ip-sysctl.txt> for details. - config IPV6_ROUTER_PREF bool "IPv6: Router Preference (RFC 4191) support" ---help--- @@ -50,16 +32,15 @@ config IPV6_ROUTER_PREF If unsure, say N. config IPV6_ROUTE_INFO - bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)" - depends on IPV6_ROUTER_PREF && EXPERIMENTAL + bool "IPv6: Route Information (RFC 4191) support" + depends on IPV6_ROUTER_PREF ---help--- This is experimental support of Route Information. If unsure, say N. config IPV6_OPTIMISTIC_DAD - bool "IPv6: Enable RFC 4429 Optimistic DAD (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "IPv6: Enable RFC 4429 Optimistic DAD" ---help--- This is experimental support for optimistic Duplicate Address Detection. It allows for autoconfigured addresses @@ -69,7 +50,7 @@ config IPV6_OPTIMISTIC_DAD config INET6_AH tristate "IPv6: AH transformation" - select XFRM + select XFRM_ALGO select CRYPTO select CRYPTO_HMAC select CRYPTO_MD5 @@ -81,7 +62,7 @@ config INET6_AH config INET6_ESP tristate "IPv6: ESP transformation" - select XFRM + select XFRM_ALGO select CRYPTO select CRYPTO_AUTHENC select CRYPTO_HMAC @@ -105,8 +86,7 @@ config INET6_IPCOMP If unsure, say Y. config IPV6_MIP6 - tristate "IPv6: Mobility (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "IPv6: Mobility" select XFRM ---help--- Support for IPv6 Mobility described in RFC 3775. @@ -150,15 +130,27 @@ config INET6_XFRM_MODE_BEET If unsure, say Y. config INET6_XFRM_MODE_ROUTEOPTIMIZATION - tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "IPv6: MIPv6 route optimization mode" select XFRM ---help--- Support for MIPv6 route optimization mode. +config IPV6_VTI +tristate "Virtual (secure) IPv6: tunneling" + select IPV6_TUNNEL + select NET_IP_TUNNEL + depends on INET6_XFRM_MODE_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This can be used with xfrm mode tunnel to give + the notion of a secure tunnel for IPSEC and then use routing protocol + on top. + config IPV6_SIT tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)" select INET_TUNNEL + select NET_IP_TUNNEL select IPV6_NDISC_NODETYPE default y ---help--- @@ -171,8 +163,8 @@ config IPV6_SIT Saying M here will produce a module called sit. If unsure, say Y. config IPV6_SIT_6RD - bool "IPv6: IPv6 Rapid Deployment (6RD) (EXPERIMENTAL)" - depends on IPV6_SIT && EXPERIMENTAL + bool "IPv6: IPv6 Rapid Deployment (6RD)" + depends on IPV6_SIT default n ---help--- IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon @@ -201,9 +193,25 @@ config IPV6_TUNNEL If unsure, say N. +config IPV6_GRE + tristate "IPv6: GRE tunnel" + select IPV6_TUNNEL + select NET_IP_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + GRE (Generic Routing Encapsulation) and at this time allows + encapsulating of IPv4 or IPv6 over existing IPv6 infrastructure. + This driver is useful if the other endpoint is a Cisco router: Cisco + likes GRE much better than the other Linux tunneling driver ("IP + tunneling" above). In addition, GRE allows multicast redistribution + through the tunnel. + + Saying M here will produce a module called ip6_gre. If unsure, say N. + config IPV6_MULTIPLE_TABLES bool "IPv6: Multiple Routing Tables" - depends on EXPERIMENTAL select FIB_RULES ---help--- Support multiple routing tables. @@ -223,14 +231,28 @@ config IPV6_SUBTREES If unsure, say N. config IPV6_MROUTE - bool "IPv6: multicast routing (EXPERIMENTAL)" - depends on IPV6 && EXPERIMENTAL + bool "IPv6: multicast routing" + depends on IPV6 ---help--- Experimental support for IPv6 multicast forwarding. If unsure, say N. +config IPV6_MROUTE_MULTIPLE_TABLES + bool "IPv6: multicast policy routing" + depends on IPV6_MROUTE + select FIB_RULES + help + Normally, a multicast router runs a userspace daemon and decides + what to do with a multicast packet based on the source and + destination addresses. If you say Y here, the multicast router + will also be able to take interfaces and packet marks into + account and run multiple instances of userspace daemons + simultaneously, each one handling a single table. + + If unsure, say N. + config IPV6_PIMSM_V2 - bool "IPv6: PIM-SM version 2 support (EXPERIMENTAL)" + bool "IPv6: PIM-SM version 2 support" depends on IPV6_MROUTE ---help--- Support for IPv6 PIM multicast routing protocol PIM-SMv2. diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 686934acfac..2fe68364bb2 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -7,14 +7,16 @@ obj-$(CONFIG_IPV6) += ipv6.o ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ addrlabel.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ - raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ + raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o +ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o + ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ - xfrm6_output.o + xfrm6_output.o xfrm6_protocol.o ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o @@ -34,9 +36,12 @@ obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_IPV6_VTI) += ip6_vti.o obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o +obj-$(CONFIG_IPV6_GRE) += ip6_gre.o -obj-y += addrconf_core.o exthdrs_core.o +obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o +obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index de7a194a64a..5667b3003af 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -38,6 +38,8 @@ * status etc. */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> @@ -53,6 +55,7 @@ #include <linux/route.h> #include <linux/inetdevice.h> #include <linux/init.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -60,11 +63,14 @@ #include <linux/delay.h> #include <linux/notifier.h> #include <linux/string.h> +#include <linux/hash.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> +#include <net/af_ieee802154.h> +#include <net/firewire.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> @@ -76,28 +82,30 @@ #include <net/pkt_sched.h> #include <linux/if_tunnel.h> #include <linux/rtnetlink.h> - -#ifdef CONFIG_IPV6_PRIVACY +#include <linux/netconf.h> #include <linux/random.h> -#endif - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/export.h> /* Set to 3 to get tracing... */ #define ACONF_DEBUG 2 #if ACONF_DEBUG >= 3 -#define ADBG(x) printk x +#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define ADBG(x) +#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0) #endif #define INFINITY_LIFE_TIME 0xFFFFFFFF -#define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) + +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} #ifdef CONFIG_SYSCTL static void addrconf_sysctl_register(struct inet6_dev *idev); @@ -112,37 +120,40 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) } #endif -#ifdef CONFIG_IPV6_PRIVACY -static int __ipv6_regen_rndid(struct inet6_dev *idev); -static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void __ipv6_regen_rndid(struct inet6_dev *idev); +static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); static void ipv6_regen_rndid(unsigned long data); -static int desync_factor = MAX_DESYNC_FACTOR * HZ; -#endif - static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(struct inet6_dev *idev); /* * Configured unicast address hash table */ -static struct inet6_ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE]; -static DEFINE_RWLOCK(addrconf_hash_lock); +static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(addrconf_hash_lock); -static void addrconf_verify(unsigned long); +static void addrconf_verify(void); +static void addrconf_verify_rtnl(void); +static void addrconf_verify_work(struct work_struct *); -static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0); -static DEFINE_SPINLOCK(addrconf_verify_lock); +static struct workqueue_struct *addrconf_wq; +static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work); static void addrconf_join_anycast(struct inet6_ifaddr *ifp); static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); -static void addrconf_bonding_change(struct net_device *dev, - unsigned long event); +static void addrconf_type_change(struct net_device *dev, + unsigned long event); static int addrconf_ifdown(struct net_device *dev, int how); -static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); -static void addrconf_dad_timer(unsigned long data); +static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, + int plen, + const struct net_device *dev, + u32 flags, u32 noflags); + +static void addrconf_dad_start(struct inet6_ifaddr *ifp); +static void addrconf_dad_work(struct work_struct *w); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); static void addrconf_dad_run(struct inet6_dev *idev); static void addrconf_rs_timer(unsigned long data); @@ -151,10 +162,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo); -static int ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev); - -static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev); static struct ipv6_devconf ipv6_devconf __read_mostly = { .forwarding = 0, @@ -164,17 +173,17 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_redirects = 1, .autoconf = 1, .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, -#ifdef CONFIG_IPV6_PRIVACY .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, -#endif .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_pinfo = 1, @@ -189,6 +198,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -198,17 +208,18 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra = 1, .accept_redirects = 1, .autoconf = 1, + .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, -#ifdef CONFIG_IPV6_PRIVACY .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, -#endif .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_pinfo = 1, @@ -223,118 +234,78 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; -/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ -const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; -const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; -const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; -const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; - /* Check if a valid qdisc is available */ static inline bool addrconf_qdisc_ok(const struct net_device *dev) { return !qdisc_tx_is_noop(dev); } -/* Check if a route is valid prefix route */ -static inline int addrconf_is_prefix_route(const struct rt6_info *rt) +static void addrconf_del_rs_timer(struct inet6_dev *idev) { - return ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0); + if (del_timer(&idev->rs_timer)) + __in6_dev_put(idev); } -static void addrconf_del_timer(struct inet6_ifaddr *ifp) +static void addrconf_del_dad_work(struct inet6_ifaddr *ifp) { - if (del_timer(&ifp->timer)) + if (cancel_delayed_work(&ifp->dad_work)) __in6_ifa_put(ifp); } -enum addrconf_timer_t +static void addrconf_mod_rs_timer(struct inet6_dev *idev, + unsigned long when) { - AC_NONE, - AC_DAD, - AC_RS, -}; + if (!timer_pending(&idev->rs_timer)) + in6_dev_hold(idev); + mod_timer(&idev->rs_timer, jiffies + when); +} -static void addrconf_mod_timer(struct inet6_ifaddr *ifp, - enum addrconf_timer_t what, - unsigned long when) +static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp, + unsigned long delay) { - if (!del_timer(&ifp->timer)) + if (!delayed_work_pending(&ifp->dad_work)) in6_ifa_hold(ifp); - - switch (what) { - case AC_DAD: - ifp->timer.function = addrconf_dad_timer; - break; - case AC_RS: - ifp->timer.function = addrconf_rs_timer; - break; - default:; - } - ifp->timer.expires = jiffies + when; - add_timer(&ifp->timer); + mod_delayed_work(addrconf_wq, &ifp->dad_work, delay); } static int snmp6_alloc_dev(struct inet6_dev *idev) { - if (snmp_mib_init((void **)idev->stats.ipv6, - sizeof(struct ipstats_mib)) < 0) + int i; + + idev->stats.ipv6 = alloc_percpu(struct ipstats_mib); + if (!idev->stats.ipv6) goto err_ip; - if (snmp_mib_init((void **)idev->stats.icmpv6, - sizeof(struct icmpv6_mib)) < 0) + + for_each_possible_cpu(i) { + struct ipstats_mib *addrconf_stats; + addrconf_stats = per_cpu_ptr(idev->stats.ipv6, i); + u64_stats_init(&addrconf_stats->syncp); + } + + + idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6dev) goto err_icmp; - if (snmp_mib_init((void **)idev->stats.icmpv6msg, - sizeof(struct icmpv6msg_mib)) < 0) + idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6msgdev) goto err_icmpmsg; return 0; err_icmpmsg: - snmp_mib_free((void **)idev->stats.icmpv6); + kfree(idev->stats.icmpv6dev); err_icmp: - snmp_mib_free((void **)idev->stats.ipv6); + free_percpu(idev->stats.ipv6); err_ip: return -ENOMEM; } -static void snmp6_free_dev(struct inet6_dev *idev) -{ - snmp_mib_free((void **)idev->stats.icmpv6msg); - snmp_mib_free((void **)idev->stats.icmpv6); - snmp_mib_free((void **)idev->stats.ipv6); -} - -/* Nobody refers to this device, we may destroy it. */ - -static void in6_dev_finish_destroy_rcu(struct rcu_head *head) -{ - struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu); - kfree(idev); -} - -void in6_dev_finish_destroy(struct inet6_dev *idev) -{ - struct net_device *dev = idev->dev; - - WARN_ON(idev->addr_list != NULL); - WARN_ON(idev->mc_list != NULL); - -#ifdef NET_REFCNT_DEBUG - printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? dev->name : "NIL"); -#endif - dev_put(dev); - if (!idev->dead) { - printk("Freeing alive inet6 device %p\n", idev); - return; - } - snmp6_free_dev(idev); - call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); -} - -EXPORT_SYMBOL(in6_dev_finish_destroy); - -static struct inet6_dev * ipv6_add_dev(struct net_device *dev) +static struct inet6_dev *ipv6_add_dev(struct net_device *dev) { struct inet6_dev *ndev; @@ -350,6 +321,9 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) rwlock_init(&ndev->lock); ndev->dev = dev; + INIT_LIST_HEAD(&ndev->addr_list); + setup_timer(&ndev->rs_timer, addrconf_rs_timer, + (unsigned long)ndev); memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); ndev->cnf.mtu6 = dev->mtu; ndev->cnf.sysctl = NULL; @@ -364,19 +338,19 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) dev_hold(dev); if (snmp6_alloc_dev(ndev) < 0) { - ADBG((KERN_WARNING - "%s(): cannot allocate memory for statistics; dev=%s.\n", - __func__, dev->name)); + ADBG(KERN_WARNING + "%s: cannot allocate memory for statistics; dev=%s.\n", + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); - ndev->dead = 1; - in6_dev_finish_destroy(ndev); + dev_put(dev); + kfree(ndev); return NULL; } if (snmp6_register_dev(ndev) < 0) { - ADBG((KERN_WARNING - "%s(): cannot create /proc/net/dev_snmp6/%s\n", - __func__, dev->name)); + ADBG(KERN_WARNING + "%s: cannot create /proc/net/dev_snmp6/%s\n", + __func__, dev->name); neigh_parms_release(&nd_tbl, ndev->nd_parms); ndev->dead = 1; in6_dev_finish_destroy(ndev); @@ -391,31 +365,27 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) ndev->cnf.accept_dad = -1; -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) { - printk(KERN_INFO - "%s: Disabled Multicast RS\n", - dev->name); + pr_info("%s: Disabled Multicast RS\n", dev->name); ndev->cnf.rtr_solicits = 0; } #endif -#ifdef CONFIG_IPV6_PRIVACY + INIT_LIST_HEAD(&ndev->tempaddr_list); setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev); if ((dev->flags&IFF_LOOPBACK) || dev->type == ARPHRD_TUNNEL || dev->type == ARPHRD_TUNNEL6 || dev->type == ARPHRD_SIT || dev->type == ARPHRD_NONE) { - printk(KERN_INFO - "%s: Disabled Privacy Extensions\n", - dev->name); ndev->cnf.use_tempaddr = -1; } else { in6_dev_hold(ndev); ipv6_regen_rndid((unsigned long) ndev); } -#endif + + ndev->token = in6addr_any; if (netif_running(dev) && addrconf_qdisc_ok(dev)) ndev->if_flags |= IF_READY; @@ -426,20 +396,29 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) /* protected by rtnl_lock */ rcu_assign_pointer(dev->ip6_ptr, ndev); + /* Join interface-local all-node multicast group */ + ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes); + /* Join all-node multicast group */ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); + /* Join all-router multicast group if forwarding is set */ + if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST)) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + return ndev; } -static struct inet6_dev * ipv6_find_idev(struct net_device *dev) +static struct inet6_dev *ipv6_find_idev(struct net_device *dev) { struct inet6_dev *idev; ASSERT_RTNL(); - if ((idev = __in6_dev_get(dev)) == NULL) { - if ((idev = ipv6_add_dev(dev)) == NULL) + idev = __in6_dev_get(dev); + if (!idev) { + idev = ipv6_add_dev(dev); + if (!idev) return NULL; } @@ -448,6 +427,226 @@ static struct inet6_dev * ipv6_find_idev(struct net_device *dev) return idev; } +static int inet6_netconf_msgsize_devconf(int type) +{ + int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + + nla_total_size(4); /* NETCONFA_IFINDEX */ + + /* type -1 is used for ALL */ + if (type == -1 || type == NETCONFA_FORWARDING) + size += nla_total_size(4); +#ifdef CONFIG_IPV6_MROUTE + if (type == -1 || type == NETCONFA_MC_FORWARDING) + size += nla_total_size(4); +#endif + if (type == -1 || type == NETCONFA_PROXY_NEIGH) + size += nla_total_size(4); + + return size; +} + +static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, + struct ipv6_devconf *devconf, u32 portid, + u32 seq, int event, unsigned int flags, + int type) +{ + struct nlmsghdr *nlh; + struct netconfmsg *ncm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), + flags); + if (nlh == NULL) + return -EMSGSIZE; + + ncm = nlmsg_data(nlh); + ncm->ncm_family = AF_INET6; + + if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) + goto nla_put_failure; + + /* type -1 is used for ALL */ + if ((type == -1 || type == NETCONFA_FORWARDING) && + nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0) + goto nla_put_failure; +#ifdef CONFIG_IPV6_MROUTE + if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + nla_put_s32(skb, NETCONFA_MC_FORWARDING, + devconf->mc_forwarding) < 0) + goto nla_put_failure; +#endif + if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, + struct ipv6_devconf *devconf) +{ + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, + RTM_NEWNETCONF, 0, type); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC); + return; +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err); +} + +static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { + [NETCONFA_IFINDEX] = { .len = sizeof(int) }, + [NETCONFA_FORWARDING] = { .len = sizeof(int) }, + [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, +}; + +static int inet6_netconf_get_devconf(struct sk_buff *in_skb, + struct nlmsghdr *nlh) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NETCONFA_MAX+1]; + struct netconfmsg *ncm; + struct sk_buff *skb; + struct ipv6_devconf *devconf; + struct inet6_dev *in6_dev; + struct net_device *dev; + int ifindex; + int err; + + err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, + devconf_ipv6_policy); + if (err < 0) + goto errout; + + err = EINVAL; + if (!tb[NETCONFA_IFINDEX]) + goto errout; + + ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); + switch (ifindex) { + case NETCONFA_IFINDEX_ALL: + devconf = net->ipv6.devconf_all; + break; + case NETCONFA_IFINDEX_DEFAULT: + devconf = net->ipv6.devconf_dflt; + break; + default: + dev = __dev_get_by_index(net, ifindex); + if (dev == NULL) + goto errout; + in6_dev = __in6_dev_get(dev); + if (in6_dev == NULL) + goto errout; + devconf = &in6_dev->cnf; + break; + } + + err = -ENOBUFS; + skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_netconf_fill_devconf(skb, ifindex, devconf, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWNETCONF, 0, + -1); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +errout: + return err; +} + +static int inet6_netconf_dump_devconf(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, s_idx; + struct net_device *dev; + struct inet6_dev *idev; + struct hlist_head *head; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ + net->dev_base_seq; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + + if (inet6_netconf_fill_devconf(skb, dev->ifindex, + &idev->cnf, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + -1) <= 0) { + rcu_read_unlock(); + goto done; + } + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + rcu_read_unlock(); + } + if (h == NETDEV_HASHENTRIES) { + if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } + if (h == NETDEV_HASHENTRIES + 1) { + if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + -1) <= 0) + goto done; + else + h++; + } +done: + cb->args[0] = h; + cb->args[1] = idx; + + return skb->len; +} + #ifdef CONFIG_SYSCTL static void dev_forward_change(struct inet6_dev *idev) { @@ -459,13 +658,19 @@ static void dev_forward_change(struct inet6_dev *idev) dev = idev->dev; if (idev->cnf.forwarding) dev_disable_lro(dev); - if (dev && (dev->flags & IFF_MULTICAST)) { - if (idev->cnf.forwarding) + if (dev->flags & IFF_MULTICAST) { + if (idev->cnf.forwarding) { ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); - else + ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters); + ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters); + } else { ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters); + ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters); + ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters); + } } - for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) { + + list_for_each_entry(ifa, &idev->addr_list, if_list) { if (ifa->flags&IFA_F_TENTATIVE) continue; if (idev->cnf.forwarding) @@ -473,6 +678,8 @@ static void dev_forward_change(struct inet6_dev *idev) else addrconf_leave_anycast(ifa); } + inet6_netconf_notify_devconf(dev_net(dev), NETCONFA_FORWARDING, + dev->ifindex, &idev->cnf); } @@ -481,8 +688,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -491,107 +697,105 @@ static void addrconf_forward_change(struct net *net, __s32 newf) dev_forward_change(idev); } } - rcu_read_unlock(); } -static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) { struct net *net; - - net = (struct net *)table->extra2; - if (p == &net->ipv6.devconf_dflt->forwarding) - return 0; + int old; if (!rtnl_trylock()) return restart_syscall(); + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->forwarding) { + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + rtnl_unlock(); + return 0; + } + if (p == &net->ipv6.devconf_all->forwarding) { - __s32 newf = net->ipv6.devconf_all->forwarding; net->ipv6.devconf_dflt->forwarding = newf; addrconf_forward_change(net, newf); - } else if ((!*p) ^ (!old)) + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } else if ((!newf) ^ (!old)) dev_forward_change((struct inet6_dev *)table->extra1); rtnl_unlock(); - if (*p) + if (newf) rt6_purge_dflt_routers(net); return 1; } #endif /* Nobody refers to this ifaddr, destroy it */ - void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) { - WARN_ON(ifp->if_next != NULL); - WARN_ON(ifp->lst_next != NULL); + WARN_ON(!hlist_unhashed(&ifp->addr_lst)); #ifdef NET_REFCNT_DEBUG - printk(KERN_DEBUG "inet6_ifa_finish_destroy\n"); + pr_debug("%s\n", __func__); #endif in6_dev_put(ifp->idev); - if (del_timer(&ifp->timer)) - printk("Timer is still running, when freeing ifa=%p\n", ifp); + if (cancel_delayed_work(&ifp->dad_work)) + pr_notice("delayed DAD work was pending while freeing ifa=%p\n", + ifp); - if (!ifp->dead) { - printk("Freeing alive inet6 address %p\n", ifp); + if (ifp->state != INET6_IFADDR_STATE_DEAD) { + pr_warn("Freeing alive inet6 address %p\n", ifp); return; } - dst_release(&ifp->rt->u.dst); + ip6_rt_put(ifp->rt); - kfree(ifp); + kfree_rcu(ifp, rcu); } static void ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - struct inet6_ifaddr *ifa, **ifap; + struct list_head *p; int ifp_scope = ipv6_addr_src_scope(&ifp->addr); /* * Each device address list is sorted in order of scope - * global before linklocal. */ - for (ifap = &idev->addr_list; (ifa = *ifap) != NULL; - ifap = &ifa->if_next) { + list_for_each(p, &idev->addr_list) { + struct inet6_ifaddr *ifa + = list_entry(p, struct inet6_ifaddr, if_list); if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) break; } - ifp->if_next = *ifap; - *ifap = ifp; + list_add_tail(&ifp->if_list, p); } -/* - * Hash function taken from net_alias.c - */ -static u8 ipv6_addr_hash(const struct in6_addr *addr) +static u32 inet6_addr_hash(const struct in6_addr *addr) { - __u32 word; - - /* - * We perform the hash function over the last 64 bits of the address - * This will include the IEEE address token on links that support it. - */ - - word = (__force u32)(addr->s6_addr32[2] ^ addr->s6_addr32[3]); - word ^= (word >> 16); - word ^= (word >> 8); - - return ((word ^ (word >> 4)) & 0x0f); + return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT); } /* On success it returns ifp with increased reference count */ static struct inet6_ifaddr * -ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, - int scope, u32 flags) +ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, + const struct in6_addr *peer_addr, int pfxlen, + int scope, u32 flags, u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; - int hash; + unsigned int hash; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -612,11 +816,11 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, goto out2; } - write_lock(&addrconf_hash_lock); + spin_lock(&addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { - ADBG(("ipv6_add_addr: already assigned\n")); + ADBG("ipv6_add_addr: already assigned\n"); err = -EEXIST; goto out; } @@ -624,63 +828,56 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); if (ifa == NULL) { - ADBG(("ipv6_add_addr: malloc failed\n")); + ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; goto out; } - rt = addrconf_dst_alloc(idev, addr, 0); + rt = addrconf_dst_alloc(idev, addr, false); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto out; } - ipv6_addr_copy(&ifa->addr, addr); + neigh_parms_data_state_setall(idev->nd_parms); + + ifa->addr = *addr; + if (peer_addr) + ifa->peer_addr = *peer_addr; spin_lock_init(&ifa->lock); - init_timer(&ifa->timer); - ifa->timer.data = (unsigned long) ifa; + spin_lock_init(&ifa->state_lock); + INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work); + INIT_HLIST_NODE(&ifa->addr_lst); ifa->scope = scope; ifa->prefix_len = pfxlen; ifa->flags = flags | IFA_F_TENTATIVE; + ifa->valid_lft = valid_lft; + ifa->prefered_lft = prefered_lft; ifa->cstamp = ifa->tstamp = jiffies; + ifa->tokenized = false; ifa->rt = rt; - /* - * part one of RFC 4429, section 3.3 - * We should not configure an address as - * optimistic if we do not yet know the link - * layer address of our nexhop router - */ - - if (rt->rt6i_nexthop == NULL) - ifa->flags &= ~IFA_F_OPTIMISTIC; - ifa->idev = idev; in6_dev_hold(idev); /* For caller */ in6_ifa_hold(ifa); /* Add to big hash table */ - hash = ipv6_addr_hash(addr); + hash = inet6_addr_hash(addr); - ifa->lst_next = inet6_addr_lst[hash]; - inet6_addr_lst[hash] = ifa; - in6_ifa_hold(ifa); - write_unlock(&addrconf_hash_lock); + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + spin_unlock(&addrconf_hash_lock); write_lock(&idev->lock); /* Add to inet6_dev unicast addr list. */ ipv6_link_dev_addr(idev, ifa); -#ifdef CONFIG_IPV6_PRIVACY if (ifa->flags&IFA_F_TEMPORARY) { - ifa->tmp_next = idev->tempaddr_list; - idev->tempaddr_list = ifa; + list_add(&ifa->tmp_list, &idev->tempaddr_list); in6_ifa_hold(ifa); } -#endif in6_ifa_hold(ifa); write_unlock(&idev->lock); @@ -688,7 +885,7 @@ out2: rcu_read_unlock_bh(); if (likely(err == 0)) - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + inet6addr_notifier_call_chain(NETDEV_UP, ifa); else { kfree(ifa); ifa = ERR_PTR(err); @@ -696,152 +893,162 @@ out2: return ifa; out: - write_unlock(&addrconf_hash_lock); + spin_unlock(&addrconf_hash_lock); goto out2; } -/* This function wants to get referenced ifp and releases it before return */ +enum cleanup_prefix_rt_t { + CLEANUP_PREFIX_RT_NOP, /* no cleanup action for prefix route */ + CLEANUP_PREFIX_RT_DEL, /* delete the prefix route */ + CLEANUP_PREFIX_RT_EXPIRE, /* update the lifetime of the prefix route */ +}; -static void ipv6_del_addr(struct inet6_ifaddr *ifp) +/* + * Check, whether the prefix for ifp would still need a prefix route + * after deleting ifp. The function returns one of the CLEANUP_PREFIX_RT_* + * constants. + * + * 1) we don't purge prefix if address was not permanent. + * prefix is managed by its own lifetime. + * 2) we also don't purge, if the address was IFA_F_NOPREFIXROUTE. + * 3) if there are no addresses, delete prefix. + * 4) if there are still other permanent address(es), + * corresponding prefix is still permanent. + * 5) if there are still other addresses with IFA_F_NOPREFIXROUTE, + * don't purge the prefix, assume user space is managing it. + * 6) otherwise, update prefix lifetime to the + * longest valid lifetime among the corresponding + * addresses on the device. + * Note: subsequent RA will update lifetime. + **/ +static enum cleanup_prefix_rt_t +check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) { - struct inet6_ifaddr *ifa, **ifap; + struct inet6_ifaddr *ifa; struct inet6_dev *idev = ifp->idev; - int hash; - int deleted = 0, onlink = 0; - unsigned long expires = jiffies; + unsigned long lifetime; + enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_DEL; - hash = ipv6_addr_hash(&ifp->addr); + *expires = jiffies; - ifp->dead = 1; + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (ifa == ifp) + continue; + if (!ipv6_prefix_equal(&ifa->addr, &ifp->addr, + ifp->prefix_len)) + continue; + if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE)) + return CLEANUP_PREFIX_RT_NOP; - write_lock_bh(&addrconf_hash_lock); - for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL; - ifap = &ifa->lst_next) { - if (ifa == ifp) { - *ifap = ifa->lst_next; - __in6_ifa_put(ifp); - ifa->lst_next = NULL; - break; - } + action = CLEANUP_PREFIX_RT_EXPIRE; + + spin_lock(&ifa->lock); + + lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); + /* + * Note: Because this address is + * not permanent, lifetime < + * LONG_MAX / HZ here. + */ + if (time_before(*expires, ifa->tstamp + lifetime * HZ)) + *expires = ifa->tstamp + lifetime * HZ; + spin_unlock(&ifa->lock); } - write_unlock_bh(&addrconf_hash_lock); - write_lock_bh(&idev->lock); -#ifdef CONFIG_IPV6_PRIVACY - if (ifp->flags&IFA_F_TEMPORARY) { - for (ifap = &idev->tempaddr_list; (ifa=*ifap) != NULL; - ifap = &ifa->tmp_next) { - if (ifa == ifp) { - *ifap = ifa->tmp_next; - if (ifp->ifpub) { - in6_ifa_put(ifp->ifpub); - ifp->ifpub = NULL; - } - __in6_ifa_put(ifp); - ifa->tmp_next = NULL; - break; - } + return action; +} + +static void +cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) +{ + struct rt6_info *rt; + + rt = addrconf_get_prefix_route(&ifp->addr, + ifp->prefix_len, + ifp->idev->dev, + 0, RTF_GATEWAY | RTF_DEFAULT); + if (rt) { + if (del_rt) + ip6_del_rt(rt); + else { + if (!(rt->rt6i_flags & RTF_EXPIRES)) + rt6_set_expires(rt, expires); + ip6_rt_put(rt); } } -#endif +} - for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;) { - if (ifa == ifp) { - *ifap = ifa->if_next; - __in6_ifa_put(ifp); - ifa->if_next = NULL; - if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) - break; - deleted = 1; - continue; - } else if (ifp->flags & IFA_F_PERMANENT) { - if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, - ifp->prefix_len)) { - if (ifa->flags & IFA_F_PERMANENT) { - onlink = 1; - if (deleted) - break; - } else { - unsigned long lifetime; - - if (!onlink) - onlink = -1; - - spin_lock(&ifa->lock); - - lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); - /* - * Note: Because this address is - * not permanent, lifetime < - * LONG_MAX / HZ here. - */ - if (time_before(expires, - ifa->tstamp + lifetime * HZ)) - expires = ifa->tstamp + lifetime * HZ; - spin_unlock(&ifa->lock); - } - } + +/* This function wants to get referenced ifp and releases it before return */ + +static void ipv6_del_addr(struct inet6_ifaddr *ifp) +{ + int state; + enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP; + unsigned long expires; + + ASSERT_RTNL(); + + spin_lock_bh(&ifp->state_lock); + state = ifp->state; + ifp->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifp->state_lock); + + if (state == INET6_IFADDR_STATE_DEAD) + goto out; + + spin_lock_bh(&addrconf_hash_lock); + hlist_del_init_rcu(&ifp->addr_lst); + spin_unlock_bh(&addrconf_hash_lock); + + write_lock_bh(&ifp->idev->lock); + + if (ifp->flags&IFA_F_TEMPORARY) { + list_del(&ifp->tmp_list); + if (ifp->ifpub) { + in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; } - ifap = &ifa->if_next; + __in6_ifa_put(ifp); } - write_unlock_bh(&idev->lock); - addrconf_del_timer(ifp); + if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) + action = check_cleanup_prefix_route(ifp, &expires); - ipv6_ifa_notify(RTM_DELADDR, ifp); + list_del_init(&ifp->if_list); + __in6_ifa_put(ifp); - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp); + write_unlock_bh(&ifp->idev->lock); - /* - * Purge or update corresponding prefix - * - * 1) we don't purge prefix here if address was not permanent. - * prefix is managed by its own lifetime. - * 2) if there're no addresses, delete prefix. - * 3) if there're still other permanent address(es), - * corresponding prefix is still permanent. - * 4) otherwise, update prefix lifetime to the - * longest valid lifetime among the corresponding - * addresses on the device. - * Note: subsequent RA will update lifetime. - * - * --yoshfuji - */ - if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { - struct in6_addr prefix; - struct rt6_info *rt; - struct net *net = dev_net(ifp->idev->dev); - ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); - rt = rt6_lookup(net, &prefix, NULL, ifp->idev->dev->ifindex, 1); + addrconf_del_dad_work(ifp); - if (rt && addrconf_is_prefix_route(rt)) { - if (onlink == 0) { - ip6_del_rt(rt); - rt = NULL; - } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { - rt->rt6i_expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; - } - } - dst_release(&rt->u.dst); + ipv6_ifa_notify(RTM_DELADDR, ifp); + + inet6addr_notifier_call_chain(NETDEV_DOWN, ifp); + + if (action != CLEANUP_PREFIX_RT_NOP) { + cleanup_prefix_route(ifp, expires, + action == CLEANUP_PREFIX_RT_DEL); } + /* clean up prefsrc entries */ + rt6_remove_prefsrc(ifp); +out: in6_ifa_put(ifp); } -#ifdef CONFIG_IPV6_PRIVACY static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) { struct inet6_dev *idev = ifp->idev; struct in6_addr addr, *tmpaddr; - unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age; unsigned long regen_advance; int tmp_plen; int ret = 0; - int max_addresses; u32 addr_flags; + unsigned long now = jiffies; - write_lock(&idev->lock); + write_lock_bh(&idev->lock); if (ift) { spin_lock_bh(&ift->lock); memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); @@ -853,9 +1060,8 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i retry: in6_dev_hold(idev); if (idev->cnf.use_tempaddr <= 0) { - write_unlock(&idev->lock); - printk(KERN_INFO - "ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); + write_unlock_bh(&idev->lock); + pr_info("%s: use_tempaddr is disabled\n", __func__); in6_dev_put(idev); ret = -1; goto out; @@ -864,49 +1070,43 @@ retry: if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { idev->cnf.use_tempaddr = -1; /*XXX*/ spin_unlock_bh(&ifp->lock); - write_unlock(&idev->lock); - printk(KERN_WARNING - "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n"); + write_unlock_bh(&idev->lock); + pr_warn("%s: regeneration time exceeded - disabled temporary address support\n", + __func__); in6_dev_put(idev); ret = -1; goto out; } in6_ifa_hold(ifp); memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); - if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { - spin_unlock_bh(&ifp->lock); - write_unlock(&idev->lock); - printk(KERN_WARNING - "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); - in6_ifa_put(ifp); - in6_dev_put(idev); - ret = -1; - goto out; - } + __ipv6_try_regen_rndid(idev, tmpaddr); memcpy(&addr.s6_addr[8], idev->rndid, 8); + age = (now - ifp->tstamp) / HZ; tmp_valid_lft = min_t(__u32, ifp->valid_lft, - idev->cnf.temp_valid_lft); + idev->cnf.temp_valid_lft + age); tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, - idev->cnf.temp_prefered_lft - desync_factor / HZ); + idev->cnf.temp_prefered_lft + age - + idev->cnf.max_desync_factor); tmp_plen = ifp->prefix_len; - max_addresses = idev->cnf.max_addresses; - tmp_cstamp = ifp->cstamp; tmp_tstamp = ifp->tstamp; spin_unlock_bh(&ifp->lock); regen_advance = idev->cnf.regen_max_retry * idev->cnf.dad_transmits * - idev->nd_parms->retrans_time / HZ; - write_unlock(&idev->lock); + NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + write_unlock_bh(&idev->lock); /* A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. In particular, * an implementation must not create a temporary address with a zero * Preferred Lifetime. + * Use age calculation as in addrconf_verify to avoid unnecessary + * temporary addresses being generated. */ - if (tmp_prefered_lft <= regen_advance) { + age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + if (tmp_prefered_lft <= regen_advance + age) { in6_ifa_put(ifp); in6_dev_put(idev); ret = -1; @@ -918,36 +1118,30 @@ retry: if (ifp->flags & IFA_F_OPTIMISTIC) addr_flags |= IFA_F_OPTIMISTIC; - ift = !max_addresses || - ipv6_count_addresses(idev) < max_addresses ? - ipv6_add_addr(idev, &addr, tmp_plen, - ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, - addr_flags) : NULL; - if (!ift || IS_ERR(ift)) { + ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen, + ipv6_addr_scope(&addr), addr_flags, + tmp_valid_lft, tmp_prefered_lft); + if (IS_ERR(ift)) { in6_ifa_put(ifp); in6_dev_put(idev); - printk(KERN_INFO - "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); + pr_info("%s: retry temporary address regeneration\n", __func__); tmpaddr = &addr; - write_lock(&idev->lock); + write_lock_bh(&idev->lock); goto retry; } spin_lock_bh(&ift->lock); ift->ifpub = ifp; - ift->valid_lft = tmp_valid_lft; - ift->prefered_lft = tmp_prefered_lft; - ift->cstamp = tmp_cstamp; + ift->cstamp = now; ift->tstamp = tmp_tstamp; spin_unlock_bh(&ift->lock); - addrconf_dad_start(ift, 0); + addrconf_dad_start(ift); in6_ifa_put(ift); in6_dev_put(idev); out: return ret; } -#endif /* * Choose an appropriate source address (RFC3484) @@ -962,9 +1156,7 @@ enum { #endif IPV6_SADDR_RULE_OIF, IPV6_SADDR_RULE_LABEL, -#ifdef CONFIG_IPV6_PRIVACY IPV6_SADDR_RULE_PRIVACY, -#endif IPV6_SADDR_RULE_ORCHID, IPV6_SADDR_RULE_PREFIX, IPV6_SADDR_RULE_MAX @@ -989,8 +1181,7 @@ struct ipv6_saddr_dst { static inline int ipv6_saddr_preferred(int type) { - if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| - IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK)) return 1; return 0; } @@ -1036,7 +1227,7 @@ static int ipv6_get_saddr_eval(struct net *net, * | d is scope of the destination. * B-d | \ * | \ <- smaller scope is better if - * B-15 | \ if scope is enough for destinaion. + * B-15 | \ if scope is enough for destination. * | ret = B - scope (-1 <= scope >= d <= 15). * d-C-1 | / * |/ <- greater is better @@ -1079,11 +1270,10 @@ static int ipv6_get_saddr_eval(struct net *net, &score->ifa->addr, score->addr_type, score->ifa->idev->dev->ifindex) == dst->label; break; -#ifdef CONFIG_IPV6_PRIVACY case IPV6_SADDR_RULE_PRIVACY: { /* Rule 7: Prefer public address - * Note: prefer temprary address if use_tempaddr >= 2 + * Note: prefer temporary address if use_tempaddr >= 2 */ int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ? !!(dst->prefs & IPV6_PREFER_SRC_TMP) : @@ -1091,7 +1281,6 @@ static int ipv6_get_saddr_eval(struct net *net, ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp; break; } -#endif case IPV6_SADDR_RULE_ORCHID: /* Rule 8-: Prefer ORCHID vs ORCHID or * non-ORCHID vs non-ORCHID @@ -1101,8 +1290,10 @@ static int ipv6_get_saddr_eval(struct net *net, break; case IPV6_SADDR_RULE_PREFIX: /* Rule 8: Use longest matching prefix */ - score->matchlen = ret = ipv6_addr_diff(&score->ifa->addr, - dst->addr); + ret = ipv6_addr_diff(&score->ifa->addr, dst->addr); + if (ret > score->ifa->prefix_len) + ret = score->ifa->prefix_len; + score->matchlen = ret; break; default: ret = 0; @@ -1115,7 +1306,7 @@ out: return ret; } -int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, +int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { @@ -1162,7 +1353,7 @@ int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, continue; read_lock_bh(&idev->lock); - for (score->ifa = idev->addr_list; score->ifa; score->ifa = score->ifa->if_next) { + list_for_each_entry(score->ifa, &idev->addr_list, if_list) { int i; /* @@ -1236,31 +1427,42 @@ try_nextdev: if (!hiscore->ifa) return -EADDRNOTAVAIL; - ipv6_addr_copy(saddr, &hiscore->ifa->addr); + *saddr = hiscore->ifa->addr; in6_ifa_put(hiscore->ifa); return 0; } - EXPORT_SYMBOL(ipv6_dev_get_saddr); +int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) +{ + struct inet6_ifaddr *ifp; + int err = -EADDRNOTAVAIL; + + list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) { + if (ifp->scope > IFA_LINK) + break; + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + return err; +} + int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, - unsigned char banned_flags) + u32 banned_flags) { struct inet6_dev *idev; int err = -EADDRNOTAVAIL; rcu_read_lock(); - if ((idev = __in6_dev_get(dev)) != NULL) { - struct inet6_ifaddr *ifp; - + idev = __in6_dev_get(dev); + if (idev) { read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == IFA_LINK && !(ifp->flags & banned_flags)) { - ipv6_addr_copy(addr, &ifp->addr); - err = 0; - break; - } - } + err = __ipv6_get_lladdr(idev, addr, banned_flags); read_unlock_bh(&idev->lock); } rcu_read_unlock(); @@ -1273,53 +1475,81 @@ static int ipv6_count_addresses(struct inet6_dev *idev) struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) + list_for_each_entry(ifp, &idev->addr_list, if_list) cnt++; read_unlock_bh(&idev->lock); return cnt; } -int ipv6_chk_addr(struct net *net, struct in6_addr *addr, - struct net_device *dev, int strict) +int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict) { - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); + struct inet6_ifaddr *ifp; + unsigned int hash = inet6_addr_hash(addr); - read_lock_bh(&addrconf_hash_lock); - for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + rcu_read_lock_bh(); + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr) && - !(ifp->flags&IFA_F_TENTATIVE)) { - if (dev == NULL || ifp->idev->dev == dev || - !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) - break; + !(ifp->flags&IFA_F_TENTATIVE) && + (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { + rcu_read_unlock_bh(); + return 1; } } - read_unlock_bh(&addrconf_hash_lock); - return ifp != NULL; + + rcu_read_unlock_bh(); + return 0; } EXPORT_SYMBOL(ipv6_chk_addr); -static -int ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev) +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev) { - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); + unsigned int hash = inet6_addr_hash(addr); + struct inet6_ifaddr *ifp; - for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { if (dev == NULL || ifp->idev->dev == dev) + return true; + } + } + return false; +} + +/* Compares an address/prefix_len with addresses on device @dev. + * If one is found it returns true. + */ +bool ipv6_chk_custom_prefix(const struct in6_addr *addr, + const unsigned int prefix_len, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + bool ret = false; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len); + if (ret) break; } + read_unlock_bh(&idev->lock); } - return ifp != NULL; + rcu_read_unlock(); + + return ret; } +EXPORT_SYMBOL(ipv6_chk_custom_prefix); -int ipv6_chk_prefix(struct in6_addr *addr, struct net_device *dev) +int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; @@ -1330,7 +1560,7 @@ int ipv6_chk_prefix(struct in6_addr *addr, struct net_device *dev) idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); - for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { + list_for_each_entry(ifa, &idev->addr_list, if_list) { onlink = ipv6_prefix_equal(addr, &ifa->addr, ifa->prefix_len); if (onlink) @@ -1341,30 +1571,30 @@ int ipv6_chk_prefix(struct in6_addr *addr, struct net_device *dev) rcu_read_unlock(); return onlink; } - EXPORT_SYMBOL(ipv6_chk_prefix); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict) { - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); + struct inet6_ifaddr *ifp, *result = NULL; + unsigned int hash = inet6_addr_hash(addr); - read_lock_bh(&addrconf_hash_lock); - for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr)) { if (dev == NULL || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + result = ifp; in6_ifa_hold(ifp); break; } } } - read_unlock_bh(&addrconf_hash_lock); + rcu_read_unlock_bh(); - return ifp; + return result; } /* Gets referenced address, destroys ifaddr */ @@ -1373,13 +1603,14 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) { if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); - addrconf_del_timer(ifp); + addrconf_del_dad_work(ifp); ifp->flags |= IFA_F_TENTATIVE; if (dad_failed) ifp->flags |= IFA_F_DADFAILED; spin_unlock_bh(&ifp->lock); + if (dad_failed) + ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); -#ifdef CONFIG_IPV6_PRIVACY } else if (ifp->flags&IFA_F_TEMPORARY) { struct inet6_ifaddr *ifpub; spin_lock_bh(&ifp->lock); @@ -1393,18 +1624,36 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) spin_unlock_bh(&ifp->lock); } ipv6_del_addr(ifp); -#endif - } else + } else { ipv6_del_addr(ifp); + } +} + +static int addrconf_dad_end(struct inet6_ifaddr *ifp) +{ + int err = -ENOENT; + + spin_lock_bh(&ifp->state_lock); + if (ifp->state == INET6_IFADDR_STATE_DAD) { + ifp->state = INET6_IFADDR_STATE_POSTDAD; + err = 0; + } + spin_unlock_bh(&ifp->state_lock); + + return err; } void addrconf_dad_failure(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; - if (net_ratelimit()) - printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n", - ifp->idev->dev->name, &ifp->addr); + if (addrconf_dad_end(ifp)) { + in6_ifa_put(ifp); + return; + } + + net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n", + ifp->idev->dev->name, &ifp->addr); if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { struct in6_addr addr; @@ -1417,20 +1666,27 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) /* DAD failed for link-local based on MAC address */ idev->cnf.disable_ipv6 = 1; - printk(KERN_INFO "%s: IPv6 being disabled!\n", + pr_info("%s: IPv6 being disabled!\n", ifp->idev->dev->name); } } - addrconf_dad_stop(ifp, 1); + spin_lock_bh(&ifp->state_lock); + /* transition from _POSTDAD to _ERRDAD */ + ifp->state = INET6_IFADDR_STATE_ERRDAD; + spin_unlock_bh(&ifp->state_lock); + + addrconf_mod_dad_work(ifp, 0); } /* Join to solicited addr multicast group. */ -void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr) +void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; + ASSERT_RTNL(); + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1438,10 +1694,12 @@ void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr) ipv6_dev_mc_inc(dev, &maddr); } -void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) +void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; + ASSERT_RTNL(); + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) return; @@ -1452,6 +1710,11 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; + + ASSERT_RTNL(); + + if (ifp->prefix_len >= 127) /* RFC 6164 */ + return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) return; @@ -1461,6 +1724,11 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; + + ASSERT_RTNL(); + + if (ifp->prefix_len >= 127) /* RFC 6164 */ + return; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); if (ipv6_addr_any(&addr)) return; @@ -1498,13 +1766,36 @@ static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) return 0; } +static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) +{ + if (dev->addr_len != IEEE802154_ADDR_LEN) + return -1; + memcpy(eui, dev->dev_addr, 8); + eui[0] ^= 2; + return 0; +} + +static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev) +{ + union fwnet_hwaddr *ha; + + if (dev->addr_len != FWNET_ALEN) + return -1; + + ha = (union fwnet_hwaddr *)dev->dev_addr; + + memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id)); + eui[0] ^= 2; + return 0; +} + static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev) { /* XXX: inherit EUI-64 from other interface -- yoshfuji */ if (dev->addr_len != ARCNET_ALEN) return -1; memset(eui, 0, 7); - eui[7] = *(u8*)dev->dev_addr; + eui[7] = *(u8 *)dev->dev_addr; return 0; } @@ -1517,7 +1808,7 @@ static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev) return 0; } -int __ipv6_isatap_ifid(u8 *eui, __be32 addr) +static int __ipv6_isatap_ifid(u8 *eui, __be32 addr) { if (addr == 0) return -1; @@ -1533,7 +1824,6 @@ int __ipv6_isatap_ifid(u8 *eui, __be32 addr) memcpy(eui + 4, &addr, 4); return 0; } -EXPORT_SYMBOL(__ipv6_isatap_ifid); static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) { @@ -1542,12 +1832,26 @@ static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) return -1; } +static int addrconf_ifid_gre(u8 *eui, struct net_device *dev) +{ + return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); +} + +static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev) +{ + memcpy(eui, dev->perm_addr, 3); + memcpy(eui + 5, dev->perm_addr + 3, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; +} + static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: - case ARPHRD_IEEE802_TR: return addrconf_ifid_eui48(eui, dev); case ARPHRD_ARCNET: return addrconf_ifid_arcnet(eui, dev); @@ -1555,6 +1859,15 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) return addrconf_ifid_infiniband(eui, dev); case ARPHRD_SIT: return addrconf_ifid_sit(eui, dev); + case ARPHRD_IPGRE: + return addrconf_ifid_gre(eui, dev); + case ARPHRD_6LOWPAN: + case ARPHRD_IEEE802154: + return addrconf_ifid_eui64(eui, dev); + case ARPHRD_IEEE1394: + return addrconf_ifid_ieee1394(eui, dev); + case ARPHRD_TUNNEL6: + return addrconf_ifid_ip6tnl(eui, dev); } return -1; } @@ -1565,7 +1878,9 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) { + if (ifp->scope > IFA_LINK) + break; if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { memcpy(eui, ifp->addr.s6_addr+8, 8); err = 0; @@ -1576,9 +1891,8 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) return err; } -#ifdef CONFIG_IPV6_PRIVACY /* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ -static int __ipv6_regen_rndid(struct inet6_dev *idev) +static void __ipv6_regen_rndid(struct inet6_dev *idev) { regen: get_random_bytes(idev->rndid, sizeof(idev->rndid)); @@ -1605,8 +1919,6 @@ regen: if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) goto regen; } - - return 0; } static void ipv6_regen_rndid(unsigned long data) @@ -1620,16 +1932,16 @@ static void ipv6_regen_rndid(unsigned long data) if (idev->dead) goto out; - if (__ipv6_regen_rndid(idev) < 0) - goto out; + __ipv6_regen_rndid(idev); expires = jiffies + idev->cnf.temp_prefered_lft * HZ - - idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor; + idev->cnf.regen_max_retry * idev->cnf.dad_transmits * + NEIGH_VAR(idev->nd_parms, RETRANS_TIME) - + idev->cnf.max_desync_factor * HZ; if (time_before(expires, jiffies)) { - printk(KERN_WARNING - "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", - idev->dev->name); + pr_warn("%s: too short regeneration interval; timer disabled for %s\n", + __func__, idev->dev->name); goto out; } @@ -1642,14 +1954,11 @@ out: in6_dev_put(idev); } -static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { - int ret = 0; - +static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) +{ if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) - ret = __ipv6_regen_rndid(idev); - return ret; + __ipv6_regen_rndid(idev); } -#endif /* * Add prefix route. @@ -1670,13 +1979,13 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_protocol = RTPROT_KERNEL, }; - ipv6_addr_copy(&cfg.fc_dst, pfx); + cfg.fc_dst = *pfx; /* Prevent useless cloning on PtP SIT. This thing is done here expecting that the whole class of non-broadcast devices need not cloning. */ -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT)) cfg.fc_flags |= RTF_NONEXTHOP; #endif @@ -1684,6 +1993,40 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, ip6_route_add(&cfg); } + +static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, + int plen, + const struct net_device *dev, + u32 flags, u32 noflags) +{ + struct fib6_node *fn; + struct rt6_info *rt = NULL; + struct fib6_table *table; + + table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX); + if (table == NULL) + return NULL; + + read_lock_bh(&table->tb6_lock); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + if (!fn) + goto out; + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt->dst.dev->ifindex != dev->ifindex) + continue; + if ((rt->rt6i_flags & flags) != flags) + continue; + if ((rt->rt6i_flags & noflags) != 0) + continue; + dst_hold(&rt->dst); + break; + } +out: + read_unlock_bh(&table->tb6_lock); + return rt; +} + + /* Create "default" multicast route to the interface */ static void addrconf_add_mroute(struct net_device *dev) @@ -1702,49 +2045,94 @@ static void addrconf_add_mroute(struct net_device *dev) ip6_route_add(&cfg); } -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) -static void sit_route_add(struct net_device *dev) +static struct inet6_dev *addrconf_add_dev(struct net_device *dev) { - struct fib6_config cfg = { - .fc_table = RT6_TABLE_MAIN, - .fc_metric = IP6_RT_PRIO_ADDRCONF, - .fc_ifindex = dev->ifindex, - .fc_dst_len = 96, - .fc_flags = RTF_UP | RTF_NONEXTHOP, - .fc_nlinfo.nl_net = dev_net(dev), - }; + struct inet6_dev *idev; - /* prefix length - 96 bits "::d.d.d.d" */ - ip6_route_add(&cfg); -} -#endif + ASSERT_RTNL(); -static void addrconf_add_lroute(struct net_device *dev) -{ - struct in6_addr addr; + idev = ipv6_find_idev(dev); + if (!idev) + return ERR_PTR(-ENOBUFS); - ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - addrconf_prefix_route(&addr, 64, dev, 0, 0); + if (idev->cnf.disable_ipv6) + return ERR_PTR(-EACCES); + + /* Add default multicast route */ + if (!(dev->flags & IFF_LOOPBACK)) + addrconf_add_mroute(dev); + + return idev; } -static struct inet6_dev *addrconf_add_dev(struct net_device *dev) +static void manage_tempaddrs(struct inet6_dev *idev, + struct inet6_ifaddr *ifp, + __u32 valid_lft, __u32 prefered_lft, + bool create, unsigned long now) { - struct inet6_dev *idev; + u32 flags; + struct inet6_ifaddr *ift; - ASSERT_RTNL(); + read_lock_bh(&idev->lock); + /* update all temporary addresses in the list */ + list_for_each_entry(ift, &idev->tempaddr_list, tmp_list) { + int age, max_valid, max_prefered; - if ((idev = ipv6_find_idev(dev)) == NULL) - return NULL; + if (ifp != ift->ifpub) + continue; - /* Add default multicast route */ - addrconf_add_mroute(dev); + /* RFC 4941 section 3.3: + * If a received option will extend the lifetime of a public + * address, the lifetimes of temporary addresses should + * be extended, subject to the overall constraint that no + * temporary addresses should ever remain "valid" or "preferred" + * for a time longer than (TEMP_VALID_LIFETIME) or + * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively. + */ + age = (now - ift->cstamp) / HZ; + max_valid = idev->cnf.temp_valid_lft - age; + if (max_valid < 0) + max_valid = 0; + + max_prefered = idev->cnf.temp_prefered_lft - + idev->cnf.max_desync_factor - age; + if (max_prefered < 0) + max_prefered = 0; + + if (valid_lft > max_valid) + valid_lft = max_valid; + + if (prefered_lft > max_prefered) + prefered_lft = max_prefered; + + spin_lock(&ift->lock); + flags = ift->flags; + ift->valid_lft = valid_lft; + ift->prefered_lft = prefered_lft; + ift->tstamp = now; + if (prefered_lft > 0) + ift->flags &= ~IFA_F_DEPRECATED; + + spin_unlock(&ift->lock); + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ift); + } - /* Add link local route */ - addrconf_add_lroute(dev); - return idev; + if ((create || list_empty(&idev->tempaddr_list)) && + idev->cnf.use_tempaddr > 0) { + /* When a new public address is created as described + * in [ADDRCONF], also create a new temporary address. + * Also create a temporary address if it's enabled but + * no temporary address currently exists. + */ + read_unlock_bh(&idev->lock); + ipv6_create_tempaddr(ifp, NULL); + } else { + read_unlock_bh(&idev->lock); + } } -void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) +void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; __u32 valid_lft; @@ -1756,7 +2144,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) pinfo = (struct prefix_info *) opt; if (len < sizeof(struct prefix_info)) { - ADBG(("addrconf: prefix option too short\n")); + ADBG("addrconf: prefix option too short\n"); return; } @@ -1773,16 +2161,15 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) prefered_lft = ntohl(pinfo->prefered); if (prefered_lft > valid_lft) { - if (net_ratelimit()) - printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n"); + net_warn_ratelimited("addrconf: prefix option has invalid lifetime\n"); return; } in6_dev = in6_dev_get(dev); if (in6_dev == NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + net_dbg_ratelimited("addrconf: device %s not configured\n", + dev->name); return; } @@ -1809,21 +2196,22 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) if (addrconf_finite_timeout(rt_expires)) rt_expires *= HZ; - rt = rt6_lookup(net, &pinfo->prefix, NULL, - dev->ifindex, 1); + rt = addrconf_get_prefix_route(&pinfo->prefix, + pinfo->prefix_len, + dev, + RTF_ADDRCONF | RTF_PREFIX_RT, + RTF_GATEWAY | RTF_DEFAULT); - if (rt && addrconf_is_prefix_route(rt)) { + if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { ip6_del_rt(rt); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ - rt->rt6i_expires = jiffies + rt_expires; - rt->rt6i_flags |= RTF_EXPIRES; + rt6_set_expires(rt, jiffies + rt_expires); } else { - rt->rt6i_flags &= ~RTF_EXPIRES; - rt->rt6i_expires = 0; + rt6_clean_expires(rt); } } else if (valid_lft) { clock_t expires = 0; @@ -1836,29 +2224,35 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, dev, expires, flags); } - if (rt) - dst_release(&rt->u.dst); + ip6_rt_put(rt); } /* Try to figure out our local address for this prefix */ if (pinfo->autoconf && in6_dev->cnf.autoconf) { - struct inet6_ifaddr * ifp; + struct inet6_ifaddr *ifp; struct in6_addr addr; int create = 0, update_lft = 0; + bool tokenized = false; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); - if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && - ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { + + if (!ipv6_addr_any(&in6_dev->token)) { + read_lock_bh(&in6_dev->lock); + memcpy(addr.s6_addr + 8, + in6_dev->token.s6_addr + 8, 8); + read_unlock_bh(&in6_dev->lock); + tokenized = true; + } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { in6_dev_put(in6_dev); return; } goto ok; } - if (net_ratelimit()) - printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", - pinfo->prefix_len); + net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n", + pinfo->prefix_len); in6_dev_put(in6_dev); return; @@ -1872,7 +2266,7 @@ ok: #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && - !net->ipv6.devconf_all->forwarding) + !net->ipv6.devconf_all->forwarding && sllao) addr_flags = IFA_F_OPTIMISTIC; #endif @@ -1881,26 +2275,30 @@ ok: */ if (!max_addresses || ipv6_count_addresses(in6_dev) < max_addresses) - ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, + ifp = ipv6_add_addr(in6_dev, &addr, NULL, + pinfo->prefix_len, addr_type&IPV6_ADDR_SCOPE_MASK, - addr_flags); + addr_flags, valid_lft, + prefered_lft); - if (!ifp || IS_ERR(ifp)) { + if (IS_ERR_OR_NULL(ifp)) { in6_dev_put(in6_dev); return; } - update_lft = create = 1; + update_lft = 0; + create = 1; + spin_lock_bh(&ifp->lock); + ifp->flags |= IFA_F_MANAGETEMPADDR; ifp->cstamp = jiffies; - addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); + ifp->tokenized = tokenized; + spin_unlock_bh(&ifp->lock); + addrconf_dad_start(ifp); } if (ifp) { - int flags; + u32 flags; unsigned long now; -#ifdef CONFIG_IPV6_PRIVACY - struct inet6_ifaddr *ift; -#endif u32 stored_lft; /* update lifetime (RFC2462 5.5.3 e) */ @@ -1910,44 +2308,22 @@ ok: stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!update_lft && stored_lft) { - if (valid_lft > MIN_VALID_LIFETIME || - valid_lft > stored_lft) - update_lft = 1; - else if (stored_lft <= MIN_VALID_LIFETIME) { - /* valid_lft <= stored_lft is always true */ - /* - * RFC 4862 Section 5.5.3e: - * "Note that the preferred lifetime of - * the corresponding address is always - * reset to the Preferred Lifetime in - * the received Prefix Information - * option, regardless of whether the - * valid lifetime is also reset or - * ignored." - * - * So if the preferred lifetime in - * this advertisement is different - * than what we have stored, but the - * valid lifetime is invalid, just - * reset prefered_lft. - * - * We must set the valid lifetime - * to the stored lifetime since we'll - * be updating the timestamp below, - * else we'll set it back to the - * minumum. - */ - if (prefered_lft != ifp->prefered_lft) { - valid_lft = stored_lft; - update_lft = 1; - } - } else { - valid_lft = MIN_VALID_LIFETIME; - if (valid_lft < prefered_lft) - prefered_lft = valid_lft; - update_lft = 1; - } + if (!update_lft && !create && stored_lft) { + const u32 minimum_lft = min( + stored_lft, (u32)MIN_VALID_LIFETIME); + valid_lft = max(valid_lft, minimum_lft); + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = 1; } if (update_lft) { @@ -1963,46 +2339,11 @@ ok: } else spin_unlock(&ifp->lock); -#ifdef CONFIG_IPV6_PRIVACY - read_lock_bh(&in6_dev->lock); - /* update all temporary addresses in the list */ - for (ift=in6_dev->tempaddr_list; ift; ift=ift->tmp_next) { - /* - * When adjusting the lifetimes of an existing - * temporary address, only lower the lifetimes. - * Implementations must not increase the - * lifetimes of an existing temporary address - * when processing a Prefix Information Option. - */ - if (ifp != ift->ifpub) - continue; - - spin_lock(&ift->lock); - flags = ift->flags; - if (ift->valid_lft > valid_lft && - ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ) - ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ; - if (ift->prefered_lft > prefered_lft && - ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ) - ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ; - spin_unlock(&ift->lock); - if (!(flags&IFA_F_TENTATIVE)) - ipv6_ifa_notify(0, ift); - } + manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, + create, now); - if (create && in6_dev->cnf.use_tempaddr > 0) { - /* - * When a new public address is created as described in [ADDRCONF], - * also create a new temporary address. - */ - read_unlock_bh(&in6_dev->lock); - ipv6_create_tempaddr(ifp, NULL); - } else { - read_unlock_bh(&in6_dev->lock); - } -#endif in6_ifa_put(ifp); - addrconf_verify(0); + addrconf_verify(); } } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); @@ -2032,7 +2373,7 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) if (dev == NULL) goto err_exit; -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) if (dev->type == ARPHRD_SIT) { const struct net_device_ops *ops = dev->netdev_ops; struct ifreq ifr; @@ -2078,9 +2419,11 @@ err_exit: /* * Manual configuration of address on an interface */ -static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, - unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, - __u32 valid_lft) +static int inet6_addr_add(struct net *net, int ifindex, + const struct in6_addr *pfx, + const struct in6_addr *peer_pfx, + unsigned int plen, __u32 ifa_flags, + __u32 prefered_lft, __u32 valid_lft) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; @@ -2099,12 +2442,16 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, if (!valid_lft || prefered_lft > valid_lft) return -EINVAL; + if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64) + return -EINVAL; + dev = __dev_get_by_index(net, ifindex); if (!dev) return -ENODEV; - if ((idev = addrconf_add_dev(dev)) == NULL) - return -ENOBUFS; + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) + return PTR_ERR(idev); scope = ipv6_addr_scope(pfx); @@ -2126,33 +2473,34 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, prefered_lft = timeout; } - ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); + ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags, + valid_lft, prefered_lft); if (!IS_ERR(ifp)) { - spin_lock_bh(&ifp->lock); - ifp->valid_lft = valid_lft; - ifp->prefered_lft = prefered_lft; - ifp->tstamp = jiffies; - spin_unlock_bh(&ifp->lock); + if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, + expires, flags); + } - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - expires, flags); /* * Note that section 3.1 of RFC 4429 indicates * that the Optimistic flag should not be set for * manually configured addresses */ - addrconf_dad_start(ifp, 0); + addrconf_dad_start(ifp); + if (ifa_flags & IFA_F_MANAGETEMPADDR) + manage_tempaddrs(idev, ifp, valid_lft, prefered_lft, + true, jiffies); in6_ifa_put(ifp); - addrconf_verify(0); + addrconf_verify_rtnl(); return 0; } return PTR_ERR(ifp); } -static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, - unsigned int plen) +static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, + const struct in6_addr *pfx, unsigned int plen) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; @@ -2169,19 +2517,18 @@ static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, return -ENXIO; read_lock_bh(&idev->lock); - for (ifp = idev->addr_list; ifp; ifp=ifp->if_next) { + list_for_each_entry(ifp, &idev->addr_list, if_list) { if (ifp->prefix_len == plen && ipv6_addr_equal(pfx, &ifp->addr)) { in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); + if (!(ifp->flags & IFA_F_TEMPORARY) && + (ifa_flags & IFA_F_MANAGETEMPADDR)) + manage_tempaddrs(idev, ifp, 0, 0, false, + jiffies); ipv6_del_addr(ifp); - - /* If the last address is deleted administratively, - disable IPv6 on this interface. - */ - if (idev->addr_list == NULL) - addrconf_ifdown(idev->dev, 1); + addrconf_verify_rtnl(); return 0; } } @@ -2195,14 +2542,14 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; rtnl_lock(); - err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL, ireq.ifr6_prefixlen, IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); rtnl_unlock(); @@ -2214,14 +2561,14 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) struct in6_ifreq ireq; int err; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; rtnl_lock(); - err = inet6_addr_del(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, ireq.ifr6_prefixlen); rtnl_unlock(); return err; @@ -2232,7 +2579,9 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, { struct inet6_ifaddr *ifp; - ifp = ipv6_add_addr(idev, addr, plen, scope, IFA_F_PERMANENT); + ifp = ipv6_add_addr(idev, addr, NULL, plen, + scope, IFA_F_PERMANENT, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { spin_lock_bh(&ifp->lock); ifp->flags &= ~IFA_F_TENTATIVE; @@ -2242,13 +2591,14 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, } } -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) static void sit_add_v4_addrs(struct inet6_dev *idev) { struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope; + int scope, plen; + u32 pflags = 0; ASSERT_RTNL(); @@ -2258,24 +2608,27 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (idev->dev->flags&IFF_POINTOPOINT) { addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; + plen = 64; } else { scope = IPV6_ADDR_COMPATv4; + plen = 96; + pflags |= RTF_NONEXTHOP; } if (addr.s6_addr32[3]) { - add_addr(idev, &addr, 128, scope); + add_addr(idev, &addr, plen, scope); + addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags); return; } for_each_netdev(net, dev) { - struct in_device * in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { - struct in_ifaddr * ifa; + struct in_ifaddr *ifa; int flag = scope; for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - int plen; addr.s6_addr32[3] = ifa->ifa_local; @@ -2286,12 +2639,10 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) continue; flag |= IFA_HOST; } - if (idev->dev->flags&IFF_POINTOPOINT) - plen = 64; - else - plen = 96; add_addr(idev, &addr, plen, flag); + addrconf_prefix_route(&addr, plen, idev->dev, 0, + pflags); } } } @@ -2301,22 +2652,64 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) static void init_loopback(struct net_device *dev) { struct inet6_dev *idev; + struct net_device *sp_dev; + struct inet6_ifaddr *sp_ifa; + struct rt6_info *sp_rt; /* ::1 */ ASSERT_RTNL(); if ((idev = ipv6_find_idev(dev)) == NULL) { - printk(KERN_DEBUG "init loopback: add_dev failed\n"); + pr_debug("%s: add_dev failed\n", __func__); return; } add_addr(idev, &in6addr_loopback, 128, IFA_HOST); + + /* Add routes to other interface's IPv6 addresses */ + for_each_netdev(dev_net(dev), sp_dev) { + if (!strcmp(sp_dev->name, dev->name)) + continue; + + idev = __in6_dev_get(sp_dev); + if (!idev) + continue; + + read_lock_bh(&idev->lock); + list_for_each_entry(sp_ifa, &idev->addr_list, if_list) { + + if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE)) + continue; + + if (sp_ifa->rt) { + /* This dst has been added to garbage list when + * lo device down, release this obsolete dst and + * reallocate a new router for ifa. + */ + if (sp_ifa->rt->dst.obsolete > 0) { + ip6_rt_put(sp_ifa->rt); + sp_ifa->rt = NULL; + } else { + continue; + } + } + + sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false); + + /* Failure cases are ignored */ + if (!IS_ERR(sp_rt)) { + sp_ifa->rt = sp_rt; + ip6_ins_rt(sp_rt); + } + } + read_unlock_bh(&idev->lock); + } } -static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) +static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) { - struct inet6_ifaddr * ifp; + struct inet6_ifaddr *ifp; u32 addr_flags = IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD @@ -2326,10 +2719,11 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr #endif - ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, addr_flags); + ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); if (!IS_ERR(ifp)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); - addrconf_dad_start(ifp, 0); + addrconf_dad_start(ifp); in6_ifa_put(ifp); } } @@ -2337,21 +2731,24 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr static void addrconf_dev_config(struct net_device *dev) { struct in6_addr addr; - struct inet6_dev * idev; + struct inet6_dev *idev; ASSERT_RTNL(); if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_FDDI) && - (dev->type != ARPHRD_IEEE802_TR) && (dev->type != ARPHRD_ARCNET) && - (dev->type != ARPHRD_INFINIBAND)) { + (dev->type != ARPHRD_INFINIBAND) && + (dev->type != ARPHRD_IEEE802154) && + (dev->type != ARPHRD_IEEE1394) && + (dev->type != ARPHRD_TUNNEL6) && + (dev->type != ARPHRD_6LOWPAN)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } idev = addrconf_add_dev(dev); - if (idev == NULL) + if (IS_ERR(idev)) return; memset(&addr, 0, sizeof(struct in6_addr)); @@ -2361,7 +2758,7 @@ static void addrconf_dev_config(struct net_device *dev) addrconf_add_linklocal(idev, &addr); } -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +#if IS_ENABLED(CONFIG_IPV6_SIT) static void addrconf_sit_config(struct net_device *dev) { struct inet6_dev *idev; @@ -2375,7 +2772,7 @@ static void addrconf_sit_config(struct net_device *dev) */ if ((idev = ipv6_find_idev(dev)) == NULL) { - printk(KERN_DEBUG "init sit: add_dev failed\n"); + pr_debug("%s: add_dev failed\n", __func__); return; } @@ -2383,7 +2780,6 @@ static void addrconf_sit_config(struct net_device *dev) struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - addrconf_prefix_route(&addr, 64, dev, 0, 0); if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) addrconf_add_linklocal(idev, &addr); return; @@ -2391,72 +2787,41 @@ static void addrconf_sit_config(struct net_device *dev) sit_add_v4_addrs(idev); - if (dev->flags&IFF_POINTOPOINT) { + if (dev->flags&IFF_POINTOPOINT) addrconf_add_mroute(dev); - addrconf_add_lroute(dev); - } else - sit_route_add(dev); } #endif -static inline int -ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) -{ - struct in6_addr lladdr; - - if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) { - addrconf_add_linklocal(idev, &lladdr); - return 0; - } - return -1; -} - -static void ip6_tnl_add_linklocal(struct inet6_dev *idev) -{ - struct net_device *link_dev; - struct net *net = dev_net(idev->dev); - - /* first try to inherit the link-local address from the link device */ - if (idev->dev->iflink && - (link_dev = __dev_get_by_index(net, idev->dev->iflink))) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - /* then try to inherit it from any device */ - for_each_netdev(net, link_dev) { - if (!ipv6_inherit_linklocal(idev, link_dev)) - return; - } - printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n"); -} - -/* - * Autoconfigure tunnel with a link-local address so routing protocols, - * DHCPv6, MLD etc. can be run over the virtual link - */ - -static void addrconf_ip6_tnl_config(struct net_device *dev) +#if IS_ENABLED(CONFIG_NET_IPGRE) +static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; + struct in6_addr addr; ASSERT_RTNL(); - if ((idev = addrconf_add_dev(dev)) == NULL) { - printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n"); + if ((idev = ipv6_find_idev(dev)) == NULL) { + pr_debug("%s: add_dev failed\n", __func__); return; } - ip6_tnl_add_linklocal(idev); + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) + addrconf_add_linklocal(idev, &addr); + else + addrconf_prefix_route(&addr, 64, dev, 0, 0); } +#endif static int addrconf_notify(struct notifier_block *this, unsigned long event, - void * data) + void *ptr) { - struct net_device *dev = (struct net_device *) data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); int run_pending = 0; int err; - switch(event) { + switch (event) { case NETDEV_REGISTER: if (!idev && dev->mtu >= IPV6_MIN_MTU) { idev = ipv6_add_dev(dev); @@ -2464,6 +2829,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, return notifier_from_errno(-ENOMEM); } break; + case NETDEV_UP: case NETDEV_CHANGE: if (dev->flags & IFF_SLAVE) @@ -2472,9 +2838,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (event == NETDEV_UP) { if (!addrconf_qdisc_ok(dev)) { /* device is not ready yet. */ - printk(KERN_INFO - "ADDRCONF(NETDEV_UP): %s: " - "link is not ready\n", + pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); break; } @@ -2493,30 +2857,29 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } if (idev) { - if (idev->if_flags & IF_READY) { + if (idev->if_flags & IF_READY) /* device is already configured. */ break; - } idev->if_flags |= IF_READY; } - printk(KERN_INFO - "ADDRCONF(NETDEV_CHANGE): %s: " - "link becomes ready\n", - dev->name); + pr_info("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n", + dev->name); run_pending = 1; } - switch(dev->type) { -#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + switch (dev->type) { +#if IS_ENABLED(CONFIG_IPV6_SIT) case ARPHRD_SIT: addrconf_sit_config(dev); break; #endif - case ARPHRD_TUNNEL6: - addrconf_ip6_tnl_config(dev); +#if IS_ENABLED(CONFIG_NET_IPGRE) + case ARPHRD_IPGRE: + addrconf_gre_config(dev); break; +#endif case ARPHRD_LOOPBACK: init_loopback(dev); break; @@ -2525,25 +2888,30 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, addrconf_dev_config(dev); break; } + if (idev) { if (run_pending) addrconf_dad_run(idev); - /* If the MTU changed during the interface down, when the - interface up, the changed MTU must be reflected in the - idev as well as routers. + /* + * If the MTU changed during the interface down, + * when the interface up, the changed MTU must be + * reflected in the idev as well as routers. */ - if (idev->cnf.mtu6 != dev->mtu && dev->mtu >= IPV6_MIN_MTU) { + if (idev->cnf.mtu6 != dev->mtu && + dev->mtu >= IPV6_MIN_MTU) { rt6_mtu_change(dev, dev->mtu); idev->cnf.mtu6 = dev->mtu; } idev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, idev); - /* If the changed mtu during down is lower than IPV6_MIN_MTU - stop IPv6 on this interface. + + /* + * If the changed mtu during down is lower than + * IPV6_MIN_MTU stop IPv6 on this interface. */ if (dev->mtu < IPV6_MIN_MTU) - addrconf_ifdown(dev, event != NETDEV_DOWN); + addrconf_ifdown(dev, 1); } break; @@ -2560,7 +2928,10 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; } - /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */ + /* + * if MTU under IPV6_MIN_MTU. + * Stop IPv6 on this interface. + */ case NETDEV_DOWN: case NETDEV_UNREGISTER: @@ -2580,9 +2951,10 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, return notifier_from_errno(err); } break; - case NETDEV_BONDING_OLDTYPE: - case NETDEV_BONDING_NEWTYPE: - addrconf_bonding_change(dev, event); + + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + addrconf_type_change(dev, event); break; } @@ -2594,28 +2966,27 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, */ static struct notifier_block ipv6_dev_notf = { .notifier_call = addrconf_notify, - .priority = 0 }; -static void addrconf_bonding_change(struct net_device *dev, unsigned long event) +static void addrconf_type_change(struct net_device *dev, unsigned long event) { struct inet6_dev *idev; ASSERT_RTNL(); idev = __in6_dev_get(dev); - if (event == NETDEV_BONDING_NEWTYPE) + if (event == NETDEV_POST_TYPE_CHANGE) ipv6_mc_remap(idev); - else if (event == NETDEV_BONDING_OLDTYPE) + else if (event == NETDEV_PRE_TYPE_CHANGE) ipv6_mc_unmap(idev); } static int addrconf_ifdown(struct net_device *dev, int how) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa, **bifa; struct net *net = dev_net(dev); - int i; + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int state, i; ASSERT_RTNL(); @@ -2626,14 +2997,15 @@ static int addrconf_ifdown(struct net_device *dev, int how) if (idev == NULL) return -ENODEV; - /* Step 1: remove reference to ipv6 device from parent device. - Do not dev_put! + /* + * Step 1: remove reference to ipv6 device from parent device. + * Do not dev_put! */ if (how) { idev->dead = 1; /* protected by rtnl_lock */ - rcu_assign_pointer(dev->ip6_ptr, NULL); + RCU_INIT_POINTER(dev->ip6_ptr, NULL); /* Step 1.5: remove snmp6 entry */ snmp6_unregister_dev(idev); @@ -2641,39 +3013,37 @@ static int addrconf_ifdown(struct net_device *dev, int how) } /* Step 2: clear hash table */ - for (i=0; i<IN6_ADDR_HSIZE; i++) { - bifa = &inet6_addr_lst[i]; + for (i = 0; i < IN6_ADDR_HSIZE; i++) { + struct hlist_head *h = &inet6_addr_lst[i]; - write_lock_bh(&addrconf_hash_lock); - while ((ifa = *bifa) != NULL) { + spin_lock_bh(&addrconf_hash_lock); + restart: + hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { - *bifa = ifa->lst_next; - ifa->lst_next = NULL; - addrconf_del_timer(ifa); - in6_ifa_put(ifa); - continue; + hlist_del_init_rcu(&ifa->addr_lst); + addrconf_del_dad_work(ifa); + goto restart; } - bifa = &ifa->lst_next; } - write_unlock_bh(&addrconf_hash_lock); + spin_unlock_bh(&addrconf_hash_lock); } write_lock_bh(&idev->lock); - /* Step 3: clear flags for stateless addrconf */ + addrconf_del_rs_timer(idev); + + /* Step 2: clear flags for stateless addrconf */ if (!how) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); - /* Step 4: clear address list */ -#ifdef CONFIG_IPV6_PRIVACY if (how && del_timer(&idev->regen_timer)) in6_dev_put(idev); - /* clear tempaddr list */ - while ((ifa = idev->tempaddr_list) != NULL) { - idev->tempaddr_list = ifa->tmp_next; - ifa->tmp_next = NULL; - ifa->dead = 1; + /* Step 3: clear tempaddr list */ + while (!list_empty(&idev->tempaddr_list)) { + ifa = list_first_entry(&idev->tempaddr_list, + struct inet6_ifaddr, tmp_list); + list_del(&ifa->tmp_list); write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); @@ -2685,24 +3055,33 @@ static int addrconf_ifdown(struct net_device *dev, int how) in6_ifa_put(ifa); write_lock_bh(&idev->lock); } -#endif - while ((ifa = idev->addr_list) != NULL) { - idev->addr_list = ifa->if_next; - ifa->if_next = NULL; - ifa->dead = 1; - addrconf_del_timer(ifa); + + while (!list_empty(&idev->addr_list)) { + ifa = list_first_entry(&idev->addr_list, + struct inet6_ifaddr, if_list); + addrconf_del_dad_work(ifa); + + list_del(&ifa->if_list); + write_unlock_bh(&idev->lock); - __ipv6_ifa_notify(RTM_DELADDR, ifa); - atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa); + spin_lock_bh(&ifa->state_lock); + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifa->state_lock); + + if (state != INET6_IFADDR_STATE_DEAD) { + __ipv6_ifa_notify(RTM_DELADDR, ifa); + inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); + } in6_ifa_put(ifa); write_lock_bh(&idev->lock); } + write_unlock_bh(&idev->lock); /* Step 5: Discard multicast list */ - if (how) ipv6_mc_destroy_dev(idev); else @@ -2710,8 +3089,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) idev->tstamp = jiffies; - /* Shot the device (if unregistered) */ - + /* Last: Shot the device (if unregistered) */ if (how) { addrconf_sysctl_unregister(idev); neigh_parms_release(&nd_tbl, idev->nd_parms); @@ -2723,41 +3101,47 @@ static int addrconf_ifdown(struct net_device *dev, int how) static void addrconf_rs_timer(unsigned long data) { - struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = (struct inet6_dev *)data; + struct net_device *dev = idev->dev; + struct in6_addr lladdr; - if (ifp->idev->cnf.forwarding) + write_lock(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) goto out; - if (ifp->idev->if_flags & IF_RA_RCVD) { - /* - * Announcement received after solicitation - * was sent - */ + if (!ipv6_accept_ra(idev)) goto out; - } - spin_lock(&ifp->lock); - if (ifp->probes++ < ifp->idev->cnf.rtr_solicits) { - /* The wait after the last probe can be shorter */ - addrconf_mod_timer(ifp, AC_RS, - (ifp->probes == ifp->idev->cnf.rtr_solicits) ? - ifp->idev->cnf.rtr_solicit_delay : - ifp->idev->cnf.rtr_solicit_interval); - spin_unlock(&ifp->lock); + /* Announcement received after solicitation was sent */ + if (idev->if_flags & IF_RA_RCVD) + goto out; - ndisc_send_rs(ifp->idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + write_unlock(&idev->lock); + if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) + ndisc_send_rs(dev, &lladdr, + &in6addr_linklocal_allrouters); + else + goto put; + + write_lock(&idev->lock); + /* The wait after the last probe can be shorter */ + addrconf_mod_rs_timer(idev, (idev->rs_probes == + idev->cnf.rtr_solicits) ? + idev->cnf.rtr_solicit_delay : + idev->cnf.rtr_solicit_interval); } else { - spin_unlock(&ifp->lock); /* * Note: we do not support deprecated "all on-link" * assumption any longer. */ - printk(KERN_DEBUG "%s: no IPv6 routers present\n", - ifp->idev->dev->name); + pr_debug("%s: no IPv6 routers present\n", idev->dev->name); } out: - in6_ifa_put(ifp); + write_unlock(&idev->lock); +put: + in6_dev_put(idev); } /* @@ -2771,32 +3155,32 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); - ifp->probes = idev->cnf.dad_transmits; - addrconf_mod_timer(ifp, AC_DAD, rand_num); + ifp->dad_probes = idev->cnf.dad_transmits; + addrconf_mod_dad_work(ifp, rand_num); } -static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) +static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; addrconf_join_solict(dev, &ifp->addr); - net_srandom(ifp->addr.s6_addr32[3]); + prandom_seed((__force u32) ifp->addr.s6_addr32[3]); read_lock_bh(&idev->lock); - if (ifp->dead) + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) goto out; - spin_lock_bh(&ifp->lock); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); - spin_unlock_bh(&ifp->lock); + spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); addrconf_dad_completed(ifp); @@ -2804,7 +3188,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) } if (!(idev->if_flags & IF_READY)) { - spin_unlock_bh(&ifp->lock); + spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); /* * If the device is not ready: @@ -2820,56 +3204,133 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) * Optimistic nodes can start receiving * Frames right away */ - if(ifp->flags & IFA_F_OPTIMISTIC) + if (ifp->flags & IFA_F_OPTIMISTIC) ip6_ins_rt(ifp->rt); addrconf_dad_kick(ifp); - spin_unlock_bh(&ifp->lock); out: + spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); } -static void addrconf_dad_timer(unsigned long data) +static void addrconf_dad_start(struct inet6_ifaddr *ifp) +{ + bool begin_dad = false; + + spin_lock_bh(&ifp->state_lock); + if (ifp->state != INET6_IFADDR_STATE_DEAD) { + ifp->state = INET6_IFADDR_STATE_PREDAD; + begin_dad = true; + } + spin_unlock_bh(&ifp->state_lock); + + if (begin_dad) + addrconf_mod_dad_work(ifp, 0); +} + +static void addrconf_dad_work(struct work_struct *w) { - struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_ifaddr *ifp = container_of(to_delayed_work(w), + struct inet6_ifaddr, + dad_work); struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; - read_lock_bh(&idev->lock); - if (idev->dead) { - read_unlock_bh(&idev->lock); + enum { + DAD_PROCESS, + DAD_BEGIN, + DAD_ABORT, + } action = DAD_PROCESS; + + rtnl_lock(); + + spin_lock_bh(&ifp->state_lock); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) { + action = DAD_BEGIN; + ifp->state = INET6_IFADDR_STATE_DAD; + } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) { + action = DAD_ABORT; + ifp->state = INET6_IFADDR_STATE_POSTDAD; + } + spin_unlock_bh(&ifp->state_lock); + + if (action == DAD_BEGIN) { + addrconf_dad_begin(ifp); + goto out; + } else if (action == DAD_ABORT) { + addrconf_dad_stop(ifp, 1); goto out; } - spin_lock_bh(&ifp->lock); - if (ifp->probes == 0) { + + if (!ifp->dad_probes && addrconf_dad_end(ifp)) + goto out; + + write_lock_bh(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) { + write_unlock_bh(&idev->lock); + goto out; + } + + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) { + spin_unlock(&ifp->lock); + write_unlock_bh(&idev->lock); + goto out; + } + + if (ifp->dad_probes == 0) { /* * DAD was successful */ ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); - spin_unlock_bh(&ifp->lock); - read_unlock_bh(&idev->lock); + spin_unlock(&ifp->lock); + write_unlock_bh(&idev->lock); addrconf_dad_completed(ifp); goto out; } - ifp->probes--; - addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time); - spin_unlock_bh(&ifp->lock); - read_unlock_bh(&idev->lock); + ifp->dad_probes--; + addrconf_mod_dad_work(ifp, + NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); + spin_unlock(&ifp->lock); + write_unlock_bh(&idev->lock); /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); + rtnl_unlock(); +} + +/* ifp->idev must be at least read locked */ +static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ifpiter; + struct inet6_dev *idev = ifp->idev; + + list_for_each_entry_reverse(ifpiter, &idev->addr_list, if_list) { + if (ifpiter->scope > IFA_LINK) + break; + if (ifp != ifpiter && ifpiter->scope == IFA_LINK && + (ifpiter->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE| + IFA_F_OPTIMISTIC|IFA_F_DADFAILED)) == + IFA_F_PERMANENT) + return false; + } + return true; } static void addrconf_dad_completed(struct inet6_ifaddr *ifp) { - struct net_device * dev = ifp->idev->dev; + struct net_device *dev = ifp->idev->dev; + struct in6_addr lladdr; + bool send_rs, send_mld; + + addrconf_del_dad_work(ifp); /* * Configure the address for reception. Now it is valid. @@ -2877,41 +3338,56 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) ipv6_ifa_notify(RTM_NEWADDR, ifp); - /* If added prefix is link local and forwarding is off, - start sending router solicitations. + /* If added prefix is link local and we are prepared to process + router advertisements, start sending router solicitations. + */ + + read_lock_bh(&ifp->idev->lock); + send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); + send_rs = send_mld && + ipv6_accept_ra(ifp->idev) && + ifp->idev->cnf.rtr_solicits > 0 && + (dev->flags&IFF_LOOPBACK) == 0; + read_unlock_bh(&ifp->idev->lock); + + /* While dad is in progress mld report's source address is in6_addrany. + * Resend with proper ll now. */ + if (send_mld) + ipv6_mc_dad_complete(ifp->idev); - if (ifp->idev->cnf.forwarding == 0 && - ifp->idev->cnf.rtr_solicits > 0 && - (dev->flags&IFF_LOOPBACK) == 0 && - (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { + if (send_rs) { /* * If a host as already performed a random delay * [...] as part of DAD [...] there is no need * to delay again before sending the first RS */ - ndisc_send_rs(ifp->idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + if (ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) + return; + ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters); - spin_lock_bh(&ifp->lock); - ifp->probes = 1; + write_lock_bh(&ifp->idev->lock); + spin_lock(&ifp->lock); + ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); - spin_unlock_bh(&ifp->lock); + addrconf_mod_rs_timer(ifp->idev, + ifp->idev->cnf.rtr_solicit_interval); + spin_unlock(&ifp->lock); + write_unlock_bh(&ifp->idev->lock); } } -static void addrconf_dad_run(struct inet6_dev *idev) { +static void addrconf_dad_run(struct inet6_dev *idev) +{ struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { - spin_lock_bh(&ifp->lock); - if (!(ifp->flags & IFA_F_TENTATIVE)) { - spin_unlock_bh(&ifp->lock); - continue; - } - spin_unlock_bh(&ifp->lock); - addrconf_dad_kick(ifp); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + spin_lock(&ifp->lock); + if (ifp->flags & IFA_F_TENTATIVE && + ifp->state == INET6_IFADDR_STATE_DAD) + addrconf_dad_kick(ifp); + spin_unlock(&ifp->lock); } read_unlock_bh(&idev->lock); } @@ -2920,62 +3396,75 @@ static void addrconf_dad_run(struct inet6_dev *idev) { struct if6_iter_state { struct seq_net_private p; int bucket; + int offset; }; -static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) +static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) { struct inet6_ifaddr *ifa = NULL; struct if6_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + int p = 0; - for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { - ifa = inet6_addr_lst[state->bucket]; + /* initial bucket if pos is 0 */ + if (pos == 0) { + state->bucket = 0; + state->offset = 0; + } - while (ifa && !net_eq(dev_net(ifa->idev->dev), net)) - ifa = ifa->lst_next; - if (ifa) - break; + for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { + hlist_for_each_entry_rcu_bh(ifa, &inet6_addr_lst[state->bucket], + addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; + /* sync with offset */ + if (p < state->offset) { + p++; + continue; + } + state->offset++; + return ifa; + } + + /* prepare for next bucket */ + state->offset = 0; + p = 0; } - return ifa; + return NULL; } -static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct inet6_ifaddr *ifa) +static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, + struct inet6_ifaddr *ifa) { struct if6_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - ifa = ifa->lst_next; -try_again: - if (ifa) { - if (!net_eq(dev_net(ifa->idev->dev), net)) { - ifa = ifa->lst_next; - goto try_again; - } + hlist_for_each_entry_continue_rcu_bh(ifa, addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; + state->offset++; + return ifa; } - if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { - ifa = inet6_addr_lst[state->bucket]; - goto try_again; + while (++state->bucket < IN6_ADDR_HSIZE) { + state->offset = 0; + hlist_for_each_entry_rcu_bh(ifa, + &inet6_addr_lst[state->bucket], addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; + state->offset++; + return ifa; + } } - return ifa; -} - -static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos) -{ - struct inet6_ifaddr *ifa = if6_get_first(seq); - - if (ifa) - while(pos && (ifa = if6_get_next(seq, ifa)) != NULL) - --pos; - return pos ? NULL : ifa; + return NULL; } static void *if6_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(addrconf_hash_lock) + __acquires(rcu_bh) { - read_lock_bh(&addrconf_hash_lock); - return if6_get_idx(seq, *pos); + rcu_read_lock_bh(); + return if6_get_first(seq, *pos); } static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -2988,9 +3477,9 @@ static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void if6_seq_stop(struct seq_file *seq, void *v) - __releases(addrconf_hash_lock) + __releases(rcu_bh) { - read_unlock_bh(&addrconf_hash_lock); + rcu_read_unlock_bh(); } static int if6_seq_show(struct seq_file *seq, void *v) @@ -3001,7 +3490,7 @@ static int if6_seq_show(struct seq_file *seq, void *v) ifp->idev->dev->ifindex, ifp->prefix_len, ifp->scope, - ifp->flags, + (u8) ifp->flags, ifp->idev->dev->name); return 0; } @@ -3027,16 +3516,16 @@ static const struct file_operations if6_fops = { .release = seq_release_net, }; -static int if6_proc_net_init(struct net *net) +static int __net_init if6_proc_net_init(struct net *net) { - if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops)) + if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops)) return -ENOMEM; return 0; } -static void if6_proc_net_exit(struct net *net) +static void __net_exit if6_proc_net_exit(struct net *net) { - proc_net_remove(net, "if_inet6"); + remove_proc_entry("if_inet6", net->proc_net); } static struct pernet_operations if6_proc_net_ops = { @@ -3055,15 +3544,16 @@ void if6_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) /* Check if address is a home address configured on any interface. */ -int ipv6_chk_home_addr(struct net *net, struct in6_addr *addr) +int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) { int ret = 0; - struct inet6_ifaddr * ifp; - u8 hash = ipv6_addr_hash(addr); - read_lock_bh(&addrconf_hash_lock); - for (ifp = inet6_addr_lst[hash]; ifp; ifp = ifp->lst_next) { + struct inet6_ifaddr *ifp = NULL; + unsigned int hash = inet6_addr_hash(addr); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; if (ipv6_addr_equal(&ifp->addr, addr) && @@ -3072,7 +3562,7 @@ int ipv6_chk_home_addr(struct net *net, struct in6_addr *addr) break; } } - read_unlock_bh(&addrconf_hash_lock); + rcu_read_unlock_bh(); return ret; } #endif @@ -3081,45 +3571,41 @@ int ipv6_chk_home_addr(struct net *net, struct in6_addr *addr) * Periodic address status verification */ -static void addrconf_verify(unsigned long foo) +static void addrconf_verify_rtnl(void) { + unsigned long now, next, next_sec, next_sched; struct inet6_ifaddr *ifp; - unsigned long now, next; int i; - spin_lock_bh(&addrconf_verify_lock); - now = jiffies; - next = now + ADDR_CHECK_FREQUENCY; + ASSERT_RTNL(); - del_timer(&addr_chk_timer); + rcu_read_lock_bh(); + now = jiffies; + next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); - for (i=0; i < IN6_ADDR_HSIZE; i++) { + cancel_delayed_work(&addr_chk_work); + for (i = 0; i < IN6_ADDR_HSIZE; i++) { restart: - read_lock(&addrconf_hash_lock); - for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { + hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) { unsigned long age; -#ifdef CONFIG_IPV6_PRIVACY - unsigned long regen_advance; -#endif - if (ifp->flags & IFA_F_PERMANENT) + /* When setting preferred_lft to a value not zero or + * infinity, while valid_lft is infinity + * IFA_F_PERMANENT has a non-infinity life time. + */ + if ((ifp->flags & IFA_F_PERMANENT) && + (ifp->prefered_lft == INFINITY_LIFE_TIME)) continue; spin_lock(&ifp->lock); - age = (now - ifp->tstamp) / HZ; - -#ifdef CONFIG_IPV6_PRIVACY - regen_advance = ifp->idev->cnf.regen_max_retry * - ifp->idev->cnf.dad_transmits * - ifp->idev->nd_parms->retrans_time / HZ; -#endif + /* We try to batch several events at once. */ + age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifp->valid_lft != INFINITY_LIFE_TIME && age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); - read_unlock(&addrconf_hash_lock); ipv6_del_addr(ifp); goto restart; } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { @@ -3134,22 +3620,25 @@ restart: ifp->flags |= IFA_F_DEPRECATED; } - if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)) + if ((ifp->valid_lft != INFINITY_LIFE_TIME) && + (time_before(ifp->tstamp + ifp->valid_lft * HZ, next))) next = ifp->tstamp + ifp->valid_lft * HZ; spin_unlock(&ifp->lock); if (deprecate) { in6_ifa_hold(ifp); - read_unlock(&addrconf_hash_lock); ipv6_ifa_notify(0, ifp); in6_ifa_put(ifp); goto restart; } -#ifdef CONFIG_IPV6_PRIVACY } else if ((ifp->flags&IFA_F_TEMPORARY) && !(ifp->flags&IFA_F_TENTATIVE)) { + unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * + ifp->idev->cnf.dad_transmits * + NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ; + if (age >= ifp->prefered_lft - regen_advance) { struct inet6_ifaddr *ifpub = ifp->ifpub; if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) @@ -3159,7 +3648,7 @@ restart: in6_ifa_hold(ifp); in6_ifa_hold(ifpub); spin_unlock(&ifp->lock); - read_unlock(&addrconf_hash_lock); + spin_lock(&ifpub->lock); ifpub->regen_count = 0; spin_unlock(&ifpub->lock); @@ -3171,7 +3660,6 @@ restart: } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; spin_unlock(&ifp->lock); -#endif } else { /* ifp->prefered_lft <= ifp->valid_lft */ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) @@ -3179,26 +3667,51 @@ restart: spin_unlock(&ifp->lock); } } - read_unlock(&addrconf_hash_lock); } - addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next; - add_timer(&addr_chk_timer); - spin_unlock_bh(&addrconf_verify_lock); + next_sec = round_jiffies_up(next); + next_sched = next; + + /* If rounded timeout is accurate enough, accept it. */ + if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) + next_sched = next_sec; + + /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ + if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) + next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; + + ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched); + mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now); + rcu_read_unlock_bh(); +} + +static void addrconf_verify_work(struct work_struct *w) +{ + rtnl_lock(); + addrconf_verify_rtnl(); + rtnl_unlock(); +} + +static void addrconf_verify(void) +{ + mod_delayed_work(addrconf_wq, &addr_chk_work, 0); } -static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local) +static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local, + struct in6_addr **peer_pfx) { struct in6_addr *pfx = NULL; + *peer_pfx = NULL; + if (addr) pfx = nla_data(addr); if (local) { if (pfx && nla_memcmp(local, pfx, sizeof(*pfx))) - pfx = NULL; - else - pfx = nla_data(local); + *peer_pfx = pfx; + pfx = nla_data(local); } return pfx; @@ -3208,15 +3721,17 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_ADDRESS] = { .len = sizeof(struct in6_addr) }, [IFA_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, + [IFA_FLAGS] = { .len = sizeof(u32) }, }; static int -inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *pfx; + struct in6_addr *pfx, *peer_pfx; + u32 ifa_flags; int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); @@ -3224,23 +3739,37 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return err; ifm = nlmsg_data(nlh); - pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); if (pfx == NULL) return -EINVAL; - return inet6_addr_del(net, ifm->ifa_index, pfx, ifm->ifa_prefixlen); + ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + + /* We ignore other flags so far. */ + ifa_flags &= IFA_F_MANAGETEMPADDR; + + return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, + ifm->ifa_prefixlen); } -static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, +static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, u32 prefered_lft, u32 valid_lft) { u32 flags; clock_t expires; unsigned long timeout; + bool was_managetempaddr; + bool had_prefixroute; + + ASSERT_RTNL(); if (!valid_lft || (prefered_lft > valid_lft)) return -EINVAL; + if (ifa_flags & IFA_F_MANAGETEMPADDR && + (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) + return -EINVAL; + timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) { expires = jiffies_to_clock_t(timeout * HZ); @@ -3260,7 +3789,13 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, } spin_lock_bh(&ifp->lock); - ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags; + was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR; + had_prefixroute = ifp->flags & IFA_F_PERMANENT && + !(ifp->flags & IFA_F_NOPREFIXROUTE); + ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | + IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | + IFA_F_NOPREFIXROUTE); + ifp->flags |= ifa_flags; ifp->tstamp = jiffies; ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; @@ -3269,24 +3804,46 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, if (!(ifp->flags&IFA_F_TENTATIVE)) ipv6_ifa_notify(0, ifp); - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - expires, flags); - addrconf_verify(0); + if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, + expires, flags); + } else if (had_prefixroute) { + enum cleanup_prefix_rt_t action; + unsigned long rt_expires; + + write_lock_bh(&ifp->idev->lock); + action = check_cleanup_prefix_route(ifp, &rt_expires); + write_unlock_bh(&ifp->idev->lock); + + if (action != CLEANUP_PREFIX_RT_NOP) { + cleanup_prefix_route(ifp, rt_expires, + action == CLEANUP_PREFIX_RT_DEL); + } + } + + if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { + if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) + valid_lft = prefered_lft = 0; + manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft, + !was_managetempaddr, jiffies); + } + + addrconf_verify_rtnl(); return 0; } static int -inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *pfx; + struct in6_addr *pfx, *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; - u8 ifa_flags; + u32 ifa_flags; int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); @@ -3294,7 +3851,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return err; ifm = nlmsg_data(nlh); - pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx); if (pfx == NULL) return -EINVAL; @@ -3313,16 +3870,19 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (dev == NULL) return -ENODEV; + ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + /* We ignore other flags so far. */ - ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS); + ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | + IFA_F_NOPREFIXROUTE; ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (ifa == NULL) { /* * It would be best to check for !NLM_F_CREATE here but - * userspace alreay relies on not having to provide this. + * userspace already relies on not having to provide this. */ - return inet6_addr_add(net, ifm->ifa_index, pfx, + return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx, ifm->ifa_prefixlen, ifa_flags, preferred_lft, valid_lft); } @@ -3338,7 +3898,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return err; } -static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags, +static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u32 flags, u8 scope, int ifindex) { struct ifaddrmsg *ifm; @@ -3356,10 +3916,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, { struct ifa_cacheinfo ci; - ci.cstamp = (u32)(TIME_DELTA(cstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(cstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.tstamp = (u32)(TIME_DELTA(tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; @@ -3381,24 +3939,27 @@ static inline int rt_scope(int ifa_scope) static inline int inet6_ifaddr_msgsize(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(16) /* IFA_LOCAL */ + nla_total_size(16) /* IFA_ADDRESS */ - + nla_total_size(sizeof(struct ifa_cacheinfo)); + + nla_total_size(sizeof(struct ifa_cacheinfo)) + + nla_total_size(4) /* IFA_FLAGS */; } static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, - u32 pid, u32 seq, int event, unsigned int flags) + u32 portid, u32 seq, int event, unsigned int flags) { struct nlmsghdr *nlh; u32 preferred, valid; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); if (nlh == NULL) return -EMSGSIZE; put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), ifa->idev->dev->ifindex); - if (!(ifa->flags&IFA_F_PERMANENT)) { + if (!((ifa->flags&IFA_F_PERMANENT) && + (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; valid = ifa->valid_lft; if (preferred != INFINITY_LIFE_TIME) { @@ -3407,25 +3968,41 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, preferred -= tval; else preferred = 0; - if (valid != INFINITY_LIFE_TIME) - valid -= tval; + if (valid != INFINITY_LIFE_TIME) { + if (valid > tval) + valid -= tval; + else + valid = 0; + } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } - if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 || - put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } + if (!ipv6_addr_any(&ifa->peer_addr)) { + if (nla_put(skb, IFA_LOCAL, 16, &ifa->addr) < 0 || + nla_put(skb, IFA_ADDRESS, 16, &ifa->peer_addr) < 0) + goto error; + } else + if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0) + goto error; + + if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) + goto error; + + if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) + goto error; return nlmsg_end(skb, nlh); + +error: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; } static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, - u32 pid, u32 seq, int event, u16 flags) + u32 portid, u32 seq, int event, u16 flags) { struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; @@ -3434,7 +4011,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3450,7 +4027,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, } static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, - u32 pid, u32 seq, int event, unsigned int flags) + u32 portid, u32 seq, int event, unsigned int flags) { struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; @@ -3459,7 +4036,7 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3474,8 +4051,7 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, return nlmsg_end(skb, nlh); } -enum addr_type_t -{ +enum addr_type_t { UNICAST_ADDR, MULTICAST_ADDR, ANYCAST_ADDR, @@ -3486,7 +4062,6 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type, int s_ip_idx, int *p_ip_idx) { - struct inet6_ifaddr *ifa; struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; int err = 1; @@ -3494,21 +4069,24 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, read_lock_bh(&idev->lock); switch (type) { - case UNICAST_ADDR: + case UNICAST_ADDR: { + struct inet6_ifaddr *ifa; + /* unicast address incl. temp addr */ - for (ifa = idev->addr_list; ifa; - ifa = ifa->if_next, ip_idx++) { - if (ip_idx < s_ip_idx) + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (++ip_idx < s_ip_idx) continue; err = inet6_fill_ifaddr(skb, ifa, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDR, NLM_F_MULTI); if (err <= 0) break; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } break; + } case MULTICAST_ADDR: /* multicast address */ for (ifmca = idev->mc_list; ifmca; @@ -3516,7 +4094,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, if (ip_idx < s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_GETMULTICAST, NLM_F_MULTI); @@ -3531,7 +4109,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, if (ip_idx < s_ip_idx) continue; err = inet6_fill_ifacaddr(skb, ifaca, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_GETANYCAST, NLM_F_MULTI); @@ -3557,23 +4135,24 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev; struct inet6_dev *idev; struct hlist_head *head; - struct hlist_node *node; s_h = cb->args[0]; s_idx = idx = cb->args[1]; s_ip_idx = ip_idx = cb->args[2]; rcu_read_lock(); + cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; - if (idx > s_idx) + if (h > s_h || idx > s_idx) s_ip_idx = 0; ip_idx = 0; - if ((idev = __in6_dev_get(dev)) == NULL) + idev = __in6_dev_get(dev); + if (!idev) goto cont; if (in6_dump_addrs(idev, skb, cb, type, @@ -3614,13 +4193,12 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) return inet6_dump_addr(skb, cb, type); } -static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, - void *arg) +static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; - struct in6_addr *addr = NULL; + struct in6_addr *addr = NULL, *peer; struct net_device *dev = NULL; struct inet6_ifaddr *ifa; struct sk_buff *skb; @@ -3630,7 +4208,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, if (err < 0) goto errout; - addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); if (addr == NULL) { err = -EINVAL; goto errout; @@ -3640,17 +4218,19 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, if (ifm->ifa_index) dev = __dev_get_by_index(net, ifm->ifa_index); - if ((ifa = ipv6_get_ifaddr(net, addr, dev, 1)) == NULL) { + ifa = ipv6_get_ifaddr(net, addr, dev, 1); + if (!ifa) { err = -EADDRNOTAVAIL; goto errout; } - if ((skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL)) == NULL) { + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL); + if (!skb) { err = -ENOBUFS; goto errout_ifa; } - err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid, + err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWADDR, 0); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ @@ -3658,7 +4238,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, kfree_skb(skb); goto errout_ifa; } - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_ifa: in6_ifa_put(ifa); errout: @@ -3703,22 +4283,27 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_AUTOCONF] = cnf->autoconf; array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; - array[DEVCONF_RTR_SOLICIT_INTERVAL] = cnf->rtr_solicit_interval; - array[DEVCONF_RTR_SOLICIT_DELAY] = cnf->rtr_solicit_delay; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_DELAY] = + jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; -#ifdef CONFIG_IPV6_PRIVACY + array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval); + array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval); array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; -#endif array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; - array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval; + array[DEVCONF_RTR_PROBE_INTERVAL] = + jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif @@ -3734,6 +4319,18 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; + array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; + array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; +} + +static inline size_t inet6_ifla6_size(void) +{ + return nla_total_size(4) /* IFLA_INET6_FLAGS */ + + nla_total_size(sizeof(struct ifla_cacheinfo)) + + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ + + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */ } static inline size_t inet6_if_nlmsg_size(void) @@ -3743,17 +4340,26 @@ static inline size_t inet6_if_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ - + nla_total_size( /* IFLA_PROTINFO */ - nla_total_size(4) /* IFLA_INET6_FLAGS */ - + nla_total_size(sizeof(struct ifla_cacheinfo)) - + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ - + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ - + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ - ); + + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */ +} + +static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, + int items, int bytes) +{ + int i; + int pad = bytes - sizeof(u64) * items; + BUG_ON(pad < 0); + + /* Use put_unaligned() because stats may not be aligned for u64. */ + put_unaligned(items, &stats[0]); + for (i = 1; i < items; i++) + put_unaligned(atomic_long_read(&mib[i]), &stats[i]); + + memset(&stats[items], 0, pad); } -static inline void __snmp6_fill_stats(u64 *stats, void **mib, int items, - int bytes) +static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, + int items, int bytes, size_t syncpoff) { int i; int pad = bytes - sizeof(u64) * items; @@ -3762,7 +4368,7 @@ static inline void __snmp6_fill_stats(u64 *stats, void **mib, int items, /* Use put_unaligned() because stats may not be aligned for u64. */ put_unaligned(items, &stats[0]); for (i = 1; i < items; i++) - put_unaligned(snmp_fold_field(mib, i), &stats[i]); + put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); memset(&stats[items], 0, pad); } @@ -3770,27 +4376,169 @@ static inline void __snmp6_fill_stats(u64 *stats, void **mib, int items, static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, int bytes) { - switch(attrtype) { + switch (attrtype) { case IFLA_INET6_STATS: - __snmp6_fill_stats(stats, (void **)idev->stats.ipv6, IPSTATS_MIB_MAX, bytes); + __snmp6_fill_stats64(stats, idev->stats.ipv6, + IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: - __snmp6_fill_stats(stats, (void **)idev->stats.icmpv6, ICMP6_MIB_MAX, bytes); + __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); break; } } +static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) +{ + struct nlattr *nla; + struct ifla_cacheinfo ci; + + if (nla_put_u32(skb, IFLA_INET6_FLAGS, idev->if_flags)) + goto nla_put_failure; + ci.max_reasm_len = IPV6_MAXPLEN; + ci.tstamp = cstamp_delta(idev->tstamp); + ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); + ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME)); + if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci)) + goto nla_put_failure; + nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); + if (nla == NULL) + goto nla_put_failure; + ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); + + /* XXX - MC not implemented */ + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); + if (nla == NULL) + goto nla_put_failure; + read_lock_bh(&idev->lock); + memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); + read_unlock_bh(&idev->lock); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static size_t inet6_get_link_af_size(const struct net_device *dev) +{ + if (!__in6_dev_get(dev)) + return 0; + + return inet6_ifla6_size(); +} + +static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (!idev) + return -ENODATA; + + if (inet6_fill_ifla6_attrs(skb, idev) < 0) + return -EMSGSIZE; + + return 0; +} + +static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) +{ + struct inet6_ifaddr *ifp; + struct net_device *dev = idev->dev; + bool update_rs = false; + struct in6_addr ll_addr; + + ASSERT_RTNL(); + + if (token == NULL) + return -EINVAL; + if (ipv6_addr_any(token)) + return -EINVAL; + if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) + return -EINVAL; + if (!ipv6_accept_ra(idev)) + return -EINVAL; + if (idev->cnf.rtr_solicits <= 0) + return -EINVAL; + + write_lock_bh(&idev->lock); + + BUILD_BUG_ON(sizeof(token->s6_addr) != 16); + memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8); + + write_unlock_bh(&idev->lock); + + if (!idev->dead && (idev->if_flags & IF_READY) && + !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE | + IFA_F_OPTIMISTIC)) { + + /* If we're not ready, then normal ifup will take care + * of this. Otherwise, we need to request our rs here. + */ + ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters); + update_rs = true; + } + + write_lock_bh(&idev->lock); + + if (update_rs) { + idev->if_flags |= IF_RS_SENT; + idev->rs_probes = 1; + addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); + } + + /* Well, that's kinda nasty ... */ + list_for_each_entry(ifp, &idev->addr_list, if_list) { + spin_lock(&ifp->lock); + if (ifp->tokenized) { + ifp->valid_lft = 0; + ifp->prefered_lft = 0; + } + spin_unlock(&ifp->lock); + } + + write_unlock_bh(&idev->lock); + addrconf_verify_rtnl(); + return 0; +} + +static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) +{ + int err = -EINVAL; + struct inet6_dev *idev = __in6_dev_get(dev); + struct nlattr *tb[IFLA_INET6_MAX + 1]; + + if (!idev) + return -EAFNOSUPPORT; + + if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) + BUG(); + + if (tb[IFLA_INET6_TOKEN]) + err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN])); + + return err; +} + static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, - u32 pid, u32 seq, int event, unsigned int flags) + u32 portid, u32 seq, int event, unsigned int flags) { struct net_device *dev = idev->dev; - struct nlattr *nla; struct ifinfomsg *hdr; struct nlmsghdr *nlh; void *protoinfo; - struct ifla_cacheinfo ci; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3802,44 +4550,19 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, hdr->ifi_flags = dev_get_flags(dev); hdr->ifi_change = 0; - NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); - - if (dev->addr_len) - NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); - - NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); - if (dev->ifindex != dev->iflink) - NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); - + if (nla_put_string(skb, IFLA_IFNAME, dev->name) || + (dev->addr_len && + nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || + nla_put_u32(skb, IFLA_MTU, dev->mtu) || + (dev->ifindex != dev->iflink && + nla_put_u32(skb, IFLA_LINK, dev->iflink))) + goto nla_put_failure; protoinfo = nla_nest_start(skb, IFLA_PROTINFO); if (protoinfo == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags); - - ci.max_reasm_len = IPV6_MAXPLEN; - ci.tstamp = (__u32)(TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) / HZ * 100 - + TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); - ci.reachable_time = idev->nd_parms->reachable_time; - ci.retrans_time = idev->nd_parms->retrans_time; - NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); - - nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); - if (nla == NULL) - goto nla_put_failure; - ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); - - /* XXX - MC not implemented */ - - nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (nla == NULL) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); - - nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (nla == NULL) + if (inet6_fill_ifla6_attrs(skb, idev) < 0) goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); nla_nest_end(skb, protoinfo); return nlmsg_end(skb, nlh); @@ -3857,7 +4580,6 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct inet6_dev *idev; struct hlist_head *head; - struct hlist_node *node; s_h = cb->args[0]; s_idx = cb->args[1]; @@ -3866,14 +4588,14 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + hlist_for_each_entry_rcu(dev, head, index_hlist) { if (idx < s_idx) goto cont; idev = __in6_dev_get(dev); if (!idev) goto cont; if (inet6_fill_ifinfo(skb, idev, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI) <= 0) goto out; @@ -3906,11 +4628,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); return; errout: if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); } static inline size_t inet6_prefix_nlmsg_size(void) @@ -3921,14 +4643,14 @@ static inline size_t inet6_prefix_nlmsg_size(void) } static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, - struct prefix_info *pinfo, u32 pid, u32 seq, + struct prefix_info *pinfo, u32 portid, u32 seq, int event, unsigned int flags) { struct prefixmsg *pmsg; struct nlmsghdr *nlh; struct prefix_cacheinfo ci; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*pmsg), flags); if (nlh == NULL) return -EMSGSIZE; @@ -3946,12 +4668,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, if (pinfo->autoconf) pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; - NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix); - + if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix)) + goto nla_put_failure; ci.preferred_time = ntohl(pinfo->prefered); ci.valid_time = ntohl(pinfo->valid); - NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci); - + if (nla_put(skb, PREFIX_CACHEINFO, sizeof(ci), &ci)) + goto nla_put_failure; return nlmsg_end(skb, nlh); nla_put_failure: @@ -3986,6 +4708,11 @@ errout: static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) { + struct net *net = dev_net(ifp->idev->dev); + + if (event) + ASSERT_RTNL(); + inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); switch (event) { @@ -4000,16 +4727,34 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); + if (!ipv6_addr_any(&ifp->peer_addr)) + addrconf_prefix_route(&ifp->peer_addr, 128, + ifp->idev->dev, 0, 0); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); - dst_hold(&ifp->rt->u.dst); + if (!ipv6_addr_any(&ifp->peer_addr)) { + struct rt6_info *rt; + struct net_device *dev = ifp->idev->dev; + + rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL, + dev->ifindex, 1); + if (rt) { + dst_hold(&rt->dst); + if (ip6_del_rt(rt)) + dst_free(&rt->dst); + } + } + dst_hold(&ifp->rt->dst); + if (ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->u.dst); + dst_free(&ifp->rt->dst); break; } + atomic_inc(&net->ipv6.dev_addr_genid); + rt_genid_bump_ipv6(net); } static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -4023,29 +4768,43 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL static -int addrconf_sysctl_forward(ctl_table *ctl, int write, +int addrconf_sysctl_forward(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + /* + * ctl->data points to idev->cnf.forwarding, we should + * not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); if (write) ret = addrconf_fixup_forwarding(ctl, valp, val); + if (ret) + *ppos = pos; return ret; } static void dev_disable_change(struct inet6_dev *idev) { + struct netdev_notifier_info info; + if (!idev || !idev->dev) return; + netdev_notifier_info_init(&info, idev->dev); if (idev->cnf.disable_ipv6) - addrconf_notify(NULL, NETDEV_DOWN, idev->dev); + addrconf_notify(NULL, NETDEV_DOWN, &info); else - addrconf_notify(NULL, NETDEV_UP, idev->dev); + addrconf_notify(NULL, NETDEV_UP, &info); } static void addrconf_disable_change(struct net *net, __s32 newf) @@ -4066,23 +4825,27 @@ static void addrconf_disable_change(struct net *net, __s32 newf) rcu_read_unlock(); } -static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) +static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) { struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); net = (struct net *)table->extra2; + old = *p; + *p = newf; - if (p == &net->ipv6.devconf_dflt->disable_ipv6) + if (p == &net->ipv6.devconf_dflt->disable_ipv6) { + rtnl_unlock(); return 0; - - if (!rtnl_trylock()) - return restart_syscall(); + } if (p == &net->ipv6.devconf_all->disable_ipv6) { - __s32 newf = net->ipv6.devconf_all->disable_ipv6; net->ipv6.devconf_dflt->disable_ipv6 = newf; addrconf_disable_change(net, newf); - } else if ((!*p) ^ (!old)) + } else if ((!newf) ^ (!old)) dev_disable_change((struct inet6_dev *)table->extra1); rtnl_unlock(); @@ -4090,234 +4853,298 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) } static -int addrconf_sysctl_disable(ctl_table *ctl, int write, +int addrconf_sysctl_disable(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + /* + * ctl->data points to idev->cnf.disable_ipv6, we should + * not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); if (write) ret = addrconf_disable_ipv6(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + +static +int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int ret; + int old, new; + + old = *valp; + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + new = *valp; + + if (write && old != new) { + struct net *net = ctl->extra2; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (valp == &net->ipv6.devconf_dflt->proxy_ndp) + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + else if (valp == &net->ipv6.devconf_all->proxy_ndp) + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + else { + struct inet6_dev *idev = ctl->extra1; + + inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + idev->dev->ifindex, + &idev->cnf); + } + rtnl_unlock(); + } + return ret; } + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table addrconf_vars[DEVCONF_MAX+1]; - char *dev_name; + struct ctl_table addrconf_vars[DEVCONF_MAX+1]; } addrconf_sysctl __read_mostly = { .sysctl_header = NULL, .addrconf_vars = { { - .procname = "forwarding", - .data = &ipv6_devconf.forwarding, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_forward, + .procname = "forwarding", + .data = &ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_forward, }, { - .procname = "hop_limit", - .data = &ipv6_devconf.hop_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "hop_limit", + .data = &ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "mtu", - .data = &ipv6_devconf.mtu6, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "mtu", + .data = &ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "accept_ra", - .data = &ipv6_devconf.accept_ra, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_ra", + .data = &ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "accept_redirects", - .data = &ipv6_devconf.accept_redirects, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_redirects", + .data = &ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "autoconf", - .data = &ipv6_devconf.autoconf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "autoconf", + .data = &ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "dad_transmits", - .data = &ipv6_devconf.dad_transmits, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "dad_transmits", + .data = &ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "router_solicitations", - .data = &ipv6_devconf.rtr_solicits, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "router_solicitations", + .data = &ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "router_solicitation_interval", - .data = &ipv6_devconf.rtr_solicit_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .procname = "router_solicitation_interval", + .data = &ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { - .procname = "router_solicitation_delay", - .data = &ipv6_devconf.rtr_solicit_delay, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .procname = "router_solicitation_delay", + .data = &ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { - .procname = "force_mld_version", - .data = &ipv6_devconf.force_mld_version, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "force_mld_version", + .data = &ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, -#ifdef CONFIG_IPV6_PRIVACY { - .procname = "use_tempaddr", - .data = &ipv6_devconf.use_tempaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "mldv1_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv1_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, }, { - .procname = "temp_valid_lft", - .data = &ipv6_devconf.temp_valid_lft, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "mldv2_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv2_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, }, { - .procname = "temp_prefered_lft", - .data = &ipv6_devconf.temp_prefered_lft, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "use_tempaddr", + .data = &ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "regen_max_retry", - .data = &ipv6_devconf.regen_max_retry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "temp_valid_lft", + .data = &ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "max_desync_factor", - .data = &ipv6_devconf.max_desync_factor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "temp_prefered_lft", + .data = &ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "regen_max_retry", + .data = &ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_desync_factor", + .data = &ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, -#endif { - .procname = "max_addresses", - .data = &ipv6_devconf.max_addresses, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "max_addresses", + .data = &ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "accept_ra_defrtr", - .data = &ipv6_devconf.accept_ra_defrtr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_ra_defrtr", + .data = &ipv6_devconf.accept_ra_defrtr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "accept_ra_pinfo", - .data = &ipv6_devconf.accept_ra_pinfo, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_ra_pinfo", + .data = &ipv6_devconf.accept_ra_pinfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_IPV6_ROUTER_PREF { - .procname = "accept_ra_rtr_pref", - .data = &ipv6_devconf.accept_ra_rtr_pref, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_ra_rtr_pref", + .data = &ipv6_devconf.accept_ra_rtr_pref, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { - .procname = "router_probe_interval", - .data = &ipv6_devconf.rtr_probe_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .procname = "router_probe_interval", + .data = &ipv6_devconf.rtr_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_IPV6_ROUTE_INFO { - .procname = "accept_ra_rt_info_max_plen", - .data = &ipv6_devconf.accept_ra_rt_info_max_plen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_ra_rt_info_max_plen", + .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #endif #endif { - .procname = "proxy_ndp", - .data = &ipv6_devconf.proxy_ndp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "proxy_ndp", + .data = &ipv6_devconf.proxy_ndp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_proxy_ndp, }, { - .procname = "accept_source_route", - .data = &ipv6_devconf.accept_source_route, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_source_route", + .data = &ipv6_devconf.accept_source_route, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_IPV6_OPTIMISTIC_DAD { - .procname = "optimistic_dad", - .data = &ipv6_devconf.optimistic_dad, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "optimistic_dad", + .data = &ipv6_devconf.optimistic_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_IPV6_MROUTE { - .procname = "mc_forwarding", - .data = &ipv6_devconf.mc_forwarding, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, + .procname = "mc_forwarding", + .data = &ipv6_devconf.mc_forwarding, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, }, #endif { - .procname = "disable_ipv6", - .data = &ipv6_devconf.disable_ipv6, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_disable, + .procname = "disable_ipv6", + .data = &ipv6_devconf.disable_ipv6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable, }, { - .procname = "accept_dad", - .data = &ipv6_devconf.accept_dad, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "accept_dad", + .data = &ipv6_devconf.accept_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, }, { .procname = "force_tllao", @@ -4327,6 +5154,20 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec }, { + .procname = "ndisc_notify", + .data = &ipv6_devconf.ndisc_notify, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "suppress_frag_ndisc", + .data = &ipv6_devconf.suppress_frag_ndisc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { /* sentinel */ } }, @@ -4337,49 +5178,27 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, { int i; struct addrconf_sysctl_table *t; - -#define ADDRCONF_CTL_PATH_DEV 3 - - struct ctl_path addrconf_ctl_path[] = { - { .procname = "net", }, - { .procname = "ipv6", }, - { .procname = "conf", }, - { /* to be set */ }, - { }, - }; - + char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); if (t == NULL) goto out; - for (i=0; t->addrconf_vars[i].data; i++) { - t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; + for (i = 0; t->addrconf_vars[i].data; i++) { + t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf; t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ t->addrconf_vars[i].extra2 = net; } - /* - * Make a copy of dev_name, because '.procname' is regarded as const - * by sysctl and we wouldn't want anyone to change it under our feet - * (see SIOCSIFNAME). - */ - t->dev_name = kstrdup(dev_name, GFP_KERNEL); - if (!t->dev_name) - goto free; - - addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name; + snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); - t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path, - t->addrconf_vars); + t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars); if (t->sysctl_header == NULL) - goto free_procname; + goto free; p->sysctl = t; return 0; -free_procname: - kfree(t->dev_name); free: kfree(t); out: @@ -4395,15 +5214,13 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) t = p->sysctl; p->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); - kfree(t->dev_name); + unregister_net_sysctl_table(t->sysctl_header); kfree(t); } static void addrconf_sysctl_register(struct inet6_dev *idev) { - neigh_sysctl_register(idev->dev, idev->nd_parms, NET_IPV6, - NET_IPV6_NEIGH, "ipv6", + neigh_sysctl_register(idev->dev, idev->nd_parms, &ndisc_ifinfo_sysctl_change); __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, idev, &idev->cnf); @@ -4418,28 +5235,22 @@ static void addrconf_sysctl_unregister(struct inet6_dev *idev) #endif -static int addrconf_init_net(struct net *net) +static int __net_init addrconf_init_net(struct net *net) { - int err; + int err = -ENOMEM; struct ipv6_devconf *all, *dflt; - err = -ENOMEM; - all = &ipv6_devconf; - dflt = &ipv6_devconf_dflt; + all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; - if (!net_eq(net, &init_net)) { - all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); - if (all == NULL) - goto err_alloc_all; + dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; - dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); - if (dflt == NULL) - goto err_alloc_dflt; - } else { - /* these will be inherited by all namespaces */ - dflt->autoconf = ipv6_defaults.autoconf; - dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; - } + /* these will be inherited by all namespaces */ + dflt->autoconf = ipv6_defaults.autoconf; + dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; net->ipv6.devconf_all = all; net->ipv6.devconf_dflt = dflt; @@ -4467,7 +5278,7 @@ err_alloc_all: return err; } -static void addrconf_exit_net(struct net *net) +static void __net_exit addrconf_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL __addrconf_sysctl_unregister(net->ipv6.devconf_dflt); @@ -4484,23 +5295,12 @@ static struct pernet_operations addrconf_ops = { .exit = addrconf_exit_net, }; -/* - * Device notifier - */ - -int register_inet6addr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&inet6addr_chain, nb); -} - -EXPORT_SYMBOL(register_inet6addr_notifier); - -int unregister_inet6addr_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&inet6addr_chain,nb); -} - -EXPORT_SYMBOL(unregister_inet6addr_notifier); +static struct rtnl_af_ops inet6_ops = { + .family = AF_INET6, + .fill_link_af = inet6_fill_link_af, + .get_link_af_size = inet6_get_link_af_size, + .set_link_af = inet6_set_link_af, +}; /* * Init / cleanup code @@ -4508,15 +5308,24 @@ EXPORT_SYMBOL(unregister_inet6addr_notifier); int __init addrconf_init(void) { - int err; + int i, err; - if ((err = ipv6_addr_label_init()) < 0) { - printk(KERN_CRIT "IPv6 Addrconf: cannot initialize default policy table: %d.\n", - err); - return err; + err = ipv6_addr_label_init(); + if (err < 0) { + pr_crit("%s: cannot initialize default policy table: %d\n", + __func__, err); + goto out; } - register_pernet_subsys(&addrconf_ops); + err = register_pernet_subsys(&addrconf_ops); + if (err < 0) + goto out_addrlabel; + + addrconf_wq = create_workqueue("ipv6_addrconf"); + if (!addrconf_wq) { + err = -ENOMEM; + goto out_nowq; + } /* The addrconf netdev notifier requires that loopback_dev * has it's ipv6 private information allocated and setup @@ -4543,43 +5352,61 @@ int __init addrconf_init(void) if (err) goto errlo; + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_addr_lst[i]); + register_netdevice_notifier(&ipv6_dev_notf); - addrconf_verify(0); + addrconf_verify(); + + rtnl_af_register(&inet6_ops); - err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo); + err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, + NULL); if (err < 0) goto errout; /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL); - __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL); - __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, inet6_dump_ifaddr); - __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr); - __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr); + __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, + inet6_dump_ifaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, + inet6_dump_ifmcaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, + inet6_dump_ifacaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, + inet6_netconf_dump_devconf, NULL); ipv6_addr_label_rtnl_register(); return 0; errout: + rtnl_af_unregister(&inet6_ops); unregister_netdevice_notifier(&ipv6_dev_notf); errlo: + destroy_workqueue(addrconf_wq); +out_nowq: unregister_pernet_subsys(&addrconf_ops); - +out_addrlabel: + ipv6_addr_label_cleanup(); +out: return err; } void addrconf_cleanup(void) { - struct inet6_ifaddr *ifa; struct net_device *dev; int i; unregister_netdevice_notifier(&ipv6_dev_notf); unregister_pernet_subsys(&addrconf_ops); + ipv6_addr_label_cleanup(); rtnl_lock(); + __rtnl_af_unregister(&inet6_ops); + /* clean dev list */ for_each_netdev(&init_net, dev) { if (__in6_dev_get(dev) == NULL) @@ -4591,21 +5418,12 @@ void addrconf_cleanup(void) /* * Check hash table. */ - write_lock_bh(&addrconf_hash_lock); - for (i=0; i < IN6_ADDR_HSIZE; i++) { - for (ifa=inet6_addr_lst[i]; ifa; ) { - struct inet6_ifaddr *bifa; - - bifa = ifa; - ifa = ifa->lst_next; - printk(KERN_DEBUG "bug: IPv6 address leakage detected: ifa=%p\n", bifa); - /* Do not free it; something is wrong. - Now we can investigate it with debugger. - */ - } - } - write_unlock_bh(&addrconf_hash_lock); - - del_timer(&addr_chk_timer); + spin_lock_bh(&addrconf_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_addr_lst[i])); + spin_unlock_bh(&addrconf_hash_lock); + cancel_delayed_work(&addr_chk_work); rtnl_unlock(); + + destroy_workqueue(addrconf_wq); } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 3f82e9542ed..e6960457f62 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -3,13 +3,16 @@ * not configured or static. */ +#include <linux/export.h> #include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/ip.h> #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) -static inline unsigned ipv6_addr_scope2type(unsigned scope) +static inline unsigned int ipv6_addr_scope2type(unsigned int scope) { - switch(scope) { + switch (scope) { case IPV6_ADDR_SCOPE_NODELOCAL: return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | IPV6_ADDR_LOOPBACK); @@ -72,8 +75,76 @@ int __ipv6_addr_type(const struct in6_addr *addr) IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } - return (IPV6_ADDR_RESERVED | + return (IPV6_ADDR_UNICAST | IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } EXPORT_SYMBOL(__ipv6_addr_type); +static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); + +int register_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(register_inet6addr_notifier); + +int unregister_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(unregister_inet6addr_notifier); + +int inet6addr_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&inet6addr_chain, val, v); +} +EXPORT_SYMBOL(inet6addr_notifier_call_chain); + +const struct ipv6_stub *ipv6_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_stub); + +/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ +const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; +EXPORT_SYMBOL(in6addr_loopback); +const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +EXPORT_SYMBOL(in6addr_any); +const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +EXPORT_SYMBOL(in6addr_linklocal_allnodes); +const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_linklocal_allrouters); +const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT; +EXPORT_SYMBOL(in6addr_interfacelocal_allnodes); +const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_interfacelocal_allrouters); +const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT; +EXPORT_SYMBOL(in6addr_sitelocal_allrouters); + +static void snmp6_free_dev(struct inet6_dev *idev) +{ + kfree(idev->stats.icmpv6msgdev); + kfree(idev->stats.icmpv6dev); + free_percpu(idev->stats.ipv6); +} + +/* Nobody refers to this device, we may destroy it. */ + +void in6_dev_finish_destroy(struct inet6_dev *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(!list_empty(&idev->addr_list)); + WARN_ON(idev->mc_list != NULL); + WARN_ON(timer_pending(&idev->rs_timer)); + +#ifdef NET_REFCNT_DEBUG + pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) { + pr_warn("Freeing alive inet6 device %p\n", idev); + return; + } + snmp6_free_dev(idev); + kfree_rcu(idev, rcu); +} +EXPORT_SYMBOL(in6_dev_finish_destroy); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 6ff73c4c126..731e1e1722d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -6,13 +6,14 @@ */ /* * Author: - * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> + * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org> */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/in6.h> +#include <linux/slab.h> #include <net/addrconf.h> #include <linux/if_addrlabel.h> #include <linux/netlink.h> @@ -21,14 +22,13 @@ #if 0 #define ADDRLABEL(x...) printk(x) #else -#define ADDRLABEL(x...) do { ; } while(0) +#define ADDRLABEL(x...) do { ; } while (0) #endif /* * Policy Table */ -struct ip6addrlbl_entry -{ +struct ip6addrlbl_entry { #ifdef CONFIG_NET_NS struct net *lbl_net; #endif @@ -52,15 +52,11 @@ static struct ip6addrlbl_table static inline struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) { -#ifdef CONFIG_NET_NS - return lbl->lbl_net; -#else - return &init_net; -#endif + return read_pnet(&lbl->lbl_net); } /* - * Default policy table (RFC3484 + extensions) + * Default policy table (RFC6724 + extensions) * * prefix addr_type label * ------------------------------------------------------------------------- @@ -72,13 +68,17 @@ struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) * fc00::/7 N/A 5 ULA (RFC 4193) * 2001::/32 N/A 6 Teredo (RFC 4380) * 2001:10::/28 N/A 7 ORCHID (RFC 4843) + * fec0::/10 N/A 11 Site-local + * (deprecated by RFC3879) + * 3ffe::/16 N/A 12 6bone * * Note: 0xffffffff is used if we do not have any policies. + * Note: Labels for ULA and 6to4 are different from labels listed in RFC6724. */ #define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL -static const __net_initdata struct ip6addrlbl_init_table +static const __net_initconst struct ip6addrlbl_init_table { const struct in6_addr *prefix; int prefixlen; @@ -87,31 +87,39 @@ static const __net_initdata struct ip6addrlbl_init_table { /* ::/0 */ .prefix = &in6addr_any, .label = 1, - },{ /* fc00::/7 */ - .prefix = &(struct in6_addr){{{ 0xfc }}}, + }, { /* fc00::/7 */ + .prefix = &(struct in6_addr){ { { 0xfc } } } , .prefixlen = 7, .label = 5, - },{ /* 2002::/16 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x02 }}}, + }, { /* fec0::/10 */ + .prefix = &(struct in6_addr){ { { 0xfe, 0xc0 } } }, + .prefixlen = 10, + .label = 11, + }, { /* 2002::/16 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x02 } } }, .prefixlen = 16, .label = 2, - },{ /* 2001::/32 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x01 }}}, + }, { /* 3ffe::/16 */ + .prefix = &(struct in6_addr){ { { 0x3f, 0xfe } } }, + .prefixlen = 16, + .label = 12, + }, { /* 2001::/32 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x01 } } }, .prefixlen = 32, .label = 6, - },{ /* 2001:10::/28 */ - .prefix = &(struct in6_addr){{{ 0x20, 0x01, 0x00, 0x10 }}}, + }, { /* 2001:10::/28 */ + .prefix = &(struct in6_addr){ { { 0x20, 0x01, 0x00, 0x10 } } }, .prefixlen = 28, .label = 7, - },{ /* ::ffff:0:0 */ - .prefix = &(struct in6_addr){{{ [10] = 0xff, [11] = 0xff }}}, + }, { /* ::ffff:0:0 */ + .prefix = &(struct in6_addr){ { { [10] = 0xff, [11] = 0xff } } }, .prefixlen = 96, .label = 4, - },{ /* ::/96 */ + }, { /* ::/96 */ .prefix = &in6addr_any, .prefixlen = 96, .label = 3, - },{ /* ::1/128 */ + }, { /* ::1/128 */ .prefix = &in6addr_loopback, .prefixlen = 128, .label = 0, @@ -132,7 +140,7 @@ static void ip6addrlbl_free_rcu(struct rcu_head *h) ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); } -static inline int ip6addrlbl_hold(struct ip6addrlbl_entry *p) +static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p) { return atomic_inc_not_zero(&p->refcnt); } @@ -144,29 +152,28 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) } /* Find label */ -static int __ip6addrlbl_match(struct net *net, - struct ip6addrlbl_entry *p, - const struct in6_addr *addr, - int addrtype, int ifindex) +static bool __ip6addrlbl_match(struct net *net, + const struct ip6addrlbl_entry *p, + const struct in6_addr *addr, + int addrtype, int ifindex) { if (!net_eq(ip6addrlbl_net(p), net)) - return 0; + return false; if (p->ifindex && p->ifindex != ifindex) - return 0; + return false; if (p->addrtype && p->addrtype != addrtype) - return 0; + return false; if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen)) - return 0; - return 1; + return false; + return true; } static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex) { - struct hlist_node *pos; struct ip6addrlbl_entry *p; - hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { if (__ip6addrlbl_match(net, p, addr, type, ifindex)) return p; } @@ -243,38 +250,36 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, /* add a label */ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) { + struct hlist_node *n; + struct ip6addrlbl_entry *last = NULL, *p = NULL; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", - __func__, - newp, replace); + ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, + replace); - if (hlist_empty(&ip6addrlbl_table.head)) { - hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); - } else { - struct hlist_node *pos, *n; - struct ip6addrlbl_entry *p = NULL; - hlist_for_each_entry_safe(p, pos, n, - &ip6addrlbl_table.head, list) { - if (p->prefixlen == newp->prefixlen && - net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && - p->ifindex == newp->ifindex && - ipv6_addr_equal(&p->prefix, &newp->prefix)) { - if (!replace) { - ret = -EEXIST; - goto out; - } - hlist_replace_rcu(&p->list, &newp->list); - ip6addrlbl_put(p); - goto out; - } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || - (p->prefixlen < newp->prefixlen)) { - hlist_add_before_rcu(&newp->list, &p->list); + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + if (p->prefixlen == newp->prefixlen && + net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && + p->ifindex == newp->ifindex && + ipv6_addr_equal(&p->prefix, &newp->prefix)) { + if (!replace) { + ret = -EEXIST; goto out; } + hlist_replace_rcu(&p->list, &newp->list); + ip6addrlbl_put(p); + goto out; + } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || + (p->prefixlen < newp->prefixlen)) { + hlist_add_before_rcu(&newp->list, &p->list); + goto out; } - hlist_add_after_rcu(&p->list, &newp->list); + last = p; } + if (last) + hlist_add_after_rcu(&last->list, &newp->list); + else + hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: if (!ret) ip6addrlbl_table.seq++; @@ -310,13 +315,13 @@ static int __ip6addrlbl_del(struct net *net, int ifindex) { struct ip6addrlbl_entry *p = NULL; - struct hlist_node *pos, *n; + struct hlist_node *n; int ret = -ESRCH; ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && @@ -353,7 +358,7 @@ static int __net_init ip6addrlbl_net_init(struct net *net) int err = 0; int i; - ADDRLABEL(KERN_DEBUG "%s()\n", __func__); + ADDRLABEL(KERN_DEBUG "%s\n", __func__); for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { int ret = ip6addrlbl_add(net, @@ -371,11 +376,11 @@ static int __net_init ip6addrlbl_net_init(struct net *net) static void __net_exit ip6addrlbl_net_exit(struct net *net) { struct ip6addrlbl_entry *p = NULL; - struct hlist_node *pos, *n; + struct hlist_node *n; /* Remove all labels belonging to the exiting net */ spin_lock(&ip6addrlbl_table.lock); - hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { if (net_eq(ip6addrlbl_net(p), net)) { hlist_del_rcu(&p->list); ip6addrlbl_put(p); @@ -396,13 +401,17 @@ int __init ipv6_addr_label_init(void) return register_pernet_subsys(&ipv6_addr_label_ops); } +void ipv6_addr_label_cleanup(void) +{ + unregister_pernet_subsys(&ipv6_addr_label_ops); +} + static const struct nla_policy ifal_policy[IFAL_MAX+1] = { [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, [IFAL_LABEL] = { .len = sizeof(u32), }, }; -static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, - void *arg) +static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ifaddrlblmsg *ifal; @@ -421,16 +430,9 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, ifal->ifal_prefixlen > 128) return -EINVAL; - if (ifal->ifal_index && - !__dev_get_by_index(net, ifal->ifal_index)) - return -EINVAL; - if (!tb[IFAL_ADDRESS]) return -EINVAL; - pfx = nla_data(tb[IFAL_ADDRESS]); - if (!pfx) - return -EINVAL; if (!tb[IFAL_LABEL]) return -EINVAL; @@ -438,8 +440,12 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, if (label == IPV6_ADDR_LABEL_DEFAULT) return -EINVAL; - switch(nlh->nlmsg_type) { + switch (nlh->nlmsg_type) { case RTM_NEWADDRLABEL: + if (ifal->ifal_index && + !__dev_get_by_index(net, ifal->ifal_index)) + return -EINVAL; + err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen, ifal->ifal_index, label, nlh->nlmsg_flags & NLM_F_REPLACE); @@ -454,8 +460,8 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh, - int prefixlen, int ifindex, u32 lseq) +static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, + int prefixlen, int ifindex, u32 lseq) { struct ifaddrlblmsg *ifal = nlmsg_data(nlh); ifal->ifal_family = AF_INET6; @@ -468,10 +474,10 @@ static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh, static int ip6addrlbl_fill(struct sk_buff *skb, struct ip6addrlbl_entry *p, u32 lseq, - u32 pid, u32 seq, int event, + u32 portid, u32 seq, int event, unsigned int flags) { - struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq, event, + struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrlblmsg), flags); if (!nlh) return -EMSGSIZE; @@ -491,20 +497,20 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; - struct hlist_node *pos; int idx = 0, s_idx = cb->args[0]; int err; rcu_read_lock(); - hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { if (idx >= s_idx && net_eq(ip6addrlbl_net(p), net)) { - if ((err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_NEWADDRLABEL, - NLM_F_MULTI)) <= 0) + err = ip6addrlbl_fill(skb, p, + ip6addrlbl_table.seq, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWADDRLABEL, + NLM_F_MULTI); + if (err <= 0) break; } idx++; @@ -516,14 +522,12 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) static inline int ip6addrlbl_msgsize(void) { - return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + nla_total_size(16) /* IFAL_ADDRESS */ - + nla_total_size(4) /* IFAL_LABEL */ - ); + + nla_total_size(4); /* IFAL_LABEL */ } -static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, - void *arg) +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct ifaddrlblmsg *ifal; @@ -550,10 +554,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, if (!tb[IFAL_ADDRESS]) return -EINVAL; - addr = nla_data(tb[IFAL_ADDRESS]); - if (!addr) - return -EINVAL; rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); @@ -567,13 +568,14 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, goto out; } - if (!(skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL))) { + skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); + if (!skb) { ip6addrlbl_put(p); return -ENOBUFS; } err = ip6addrlbl_fill(skb, p, lseq, - NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, 0); ip6addrlbl_put(p); @@ -584,15 +586,18 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, goto out; } - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; } void __init ipv6_addr_label_rtnl_register(void) { - __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, NULL); - __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, NULL); - __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, ip6addrlbl_dump); + __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, + NULL, NULL); + __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, + NULL, NULL); + __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, + ip6addrlbl_dump, NULL); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 12e69d364dd..7cb4392690d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -18,6 +18,7 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <linux/capability.h> @@ -36,6 +37,7 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/inet.h> #include <linux/netdevice.h> @@ -47,19 +49,19 @@ #include <net/udp.h> #include <net/udplite.h> #include <net/tcp.h> -#include <net/ipip.h> +#include <net/ping.h> #include <net/protocol.h> #include <net/inet_common.h> #include <net/route.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> +#include <net/ndisc.h> #ifdef CONFIG_IPV6_TUNNEL #include <net/ip6_tunnel.h> #endif #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/mroute6.h> MODULE_AUTHOR("Cast of dozens"); @@ -77,7 +79,7 @@ struct ipv6_params ipv6_defaults = { .autoconf = 1, }; -static int disable_ipv6_mod = 0; +static int disable_ipv6_mod; module_param_named(disable, disable_ipv6_mod, int, 0444); MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); @@ -104,15 +106,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, struct inet_protosw *answer; struct proto *answer_prot; unsigned char answer_flags; - char answer_no_check; int try_loading_module = 0; int err; - if (sock->type != SOCK_RAW && - sock->type != SOCK_DGRAM && - !inet_ehash_secret) - build_ehash_secret(); - /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; @@ -159,12 +155,12 @@ lookup_protocol: } err = -EPERM; - if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + if (sock->type == SOCK_RAW && !kern && + !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; - answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock(); @@ -178,9 +174,8 @@ lookup_protocol: sock_init_data(sock, sk); err = 0; - sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) - sk->sk_reuse = 1; + sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; @@ -199,7 +194,7 @@ lookup_protocol: inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); np->hop_limit = -1; - np->mcast_hops = -1; + np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->ipv6only = net->ipv6.sysctl.bindv6only; @@ -213,8 +208,9 @@ lookup_protocol: inet->mc_ttl = 1; inet->mc_index = 0; inet->mc_list = NULL; + inet->rcv_tos = 0; - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; @@ -255,7 +251,7 @@ out_rcu_unlock: /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr; + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -271,12 +267,16 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + addr_type = ipv6_addr_type(&addr->sin6_addr); if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; lock_sock(sk); @@ -299,7 +299,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - /* Reproduce AF_INET checks to make the bindings consitant */ + /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); if (!sysctl_ip_nonlocal_bind && @@ -316,7 +316,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net_device *dev = NULL; rcu_read_lock(); - if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another one @@ -342,7 +342,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!ipv6_chk_addr(net, &addr->sin6_addr, + if (!(inet->freebind || inet->transparent) && + !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; goto out_unlock; @@ -355,10 +356,10 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_rcv_saddr = v4addr; inet->inet_saddr = v4addr; - ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) - ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + np->saddr = addr->sin6_addr; /* Make sure we are allowed to bind here. */ if (sk->sk_prot->get_port(sk, snum)) { @@ -384,7 +385,6 @@ out_unlock: rcu_read_unlock(); goto out; } - EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) @@ -402,7 +402,6 @@ int inet6_release(struct socket *sock) return inet_release(sock); } - EXPORT_SYMBOL(inet6_release); void inet6_destroy_sock(struct sock *sk) @@ -413,7 +412,12 @@ void inet6_destroy_sock(struct sock *sk) /* Release rx options */ - if ((skb = xchg(&np->pktoptions, NULL)) != NULL) + skb = xchg(&np->pktoptions, NULL); + if (skb != NULL) + kfree_skb(skb); + + skb = xchg(&np->rxpmtu, NULL); + if (skb != NULL) kfree_skb(skb); /* Free flowlabels */ @@ -421,10 +425,10 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ - if ((opt = xchg(&np->opt, NULL)) != NULL) + opt = xchg(&np->opt, NULL); + if (opt != NULL) sock_kfree_s(sk, opt, opt->tot_len); } - EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* @@ -434,7 +438,7 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) { - struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr; + struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -449,23 +453,22 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, peer == 1) return -ENOTCONN; sin->sin6_port = inet->inet_dport; - ipv6_addr_copy(&sin->sin6_addr, &np->daddr); + sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; } else { - if (ipv6_addr_any(&np->rcv_saddr)) - ipv6_addr_copy(&sin->sin6_addr, &np->saddr); + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + sin->sin6_addr = np->saddr; else - ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr); + sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; } - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = sk->sk_bound_dev_if; + sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, + sk->sk_bound_dev_if); *uaddr_len = sizeof(*sin); - return(0); + return 0; } - EXPORT_SYMBOL(inet6_getname); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) @@ -473,8 +476,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct sock *sk = sock->sk; struct net *net = sock_net(sk); - switch(cmd) - { + switch (cmd) { case SIOCGSTAMP: return sock_get_timestamp(sk, (struct timeval __user *)arg); @@ -484,7 +486,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCADDRT: case SIOCDELRT: - return(ipv6_route_ioctl(net, cmd, (void __user *)arg)); + return ipv6_route_ioctl(net, cmd, (void __user *)arg); case SIOCSIFADDR: return addrconf_add_ifaddr(net, (void __user *) arg); @@ -498,9 +500,8 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return sk->sk_prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ - return(0); + return 0; } - EXPORT_SYMBOL(inet6_ioctl); const struct proto_ops inet6_stream_ops = { @@ -518,10 +519,10 @@ const struct proto_ops inet6_stream_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = tcp_sendmsg, /* ok */ - .recvmsg = sock_common_recvmsg, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ .mmap = sock_no_mmap, - .sendpage = tcp_sendpage, + .sendpage = inet_sendpage, .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, @@ -545,7 +546,7 @@ const struct proto_ops inet6_dgram_ops = { .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = sock_common_recvmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT @@ -606,25 +607,21 @@ out: return ret; out_permanent: - printk(KERN_ERR "Attempt to override permanent protocol %d.\n", - protocol); + pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: - printk(KERN_ERR - "Ignoring attempt to register invalid socket type %d.\n", + pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } - EXPORT_SYMBOL(inet6_register_protosw); void inet6_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { - printk(KERN_ERR - "Attempt to unregister permanent protocol %d.\n", + pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw6_lock); @@ -634,51 +631,38 @@ inet6_unregister_protosw(struct inet_protosw *p) synchronize_net(); } } - EXPORT_SYMBOL(inet6_unregister_protosw); int inet6_sk_rebuild_header(struct sock *sk) { - int err; - struct dst_entry *dst; struct ipv6_pinfo *np = inet6_sk(sk); + struct dst_entry *dst; dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = sk->sk_protocol; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; - fl.fl_ip_dport = inet->inet_dport; - fl.fl_ip_sport = inet->inet_sport; - security_sk_classify_flow(sk, &fl); - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) { + struct in6_addr *final_p, final; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = sk->sk_protocol; + fl6.daddr = sk->sk_v6_daddr; + fl6.saddr = np->saddr; + fl6.flowlabel = np->flow_label; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { sk->sk_route_caps = 0; - return err; - } - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - return err; + sk->sk_err_soft = -PTR_ERR(dst); + return PTR_ERR(dst); } __ip6_dst_store(sk, dst, NULL, NULL); @@ -686,276 +670,31 @@ int inet6_sk_rebuild_header(struct sock *sk) return 0; } - EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & - *(__be32 *)skb_network_header(skb)) && + (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) - return 1; + return true; } - return 0; + return false; } - EXPORT_SYMBOL_GPL(ipv6_opt_accepted); -static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) -{ - const struct inet6_protocol *ops = NULL; - - for (;;) { - struct ipv6_opt_hdr *opth; - int len; - - if (proto != NEXTHDR_HOP) { - ops = rcu_dereference(inet6_protos[proto]); - - if (unlikely(!ops)) - break; - - if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) - break; - } - - if (unlikely(!pskb_may_pull(skb, 8))) - break; - - opth = (void *)skb->data; - len = ipv6_optlen(opth); - - if (unlikely(!pskb_may_pull(skb, len))) - break; - - proto = opth->nexthdr; - __skb_pull(skb, len); - } - - return proto; -} - -static int ipv6_gso_send_check(struct sk_buff *skb) -{ - struct ipv6hdr *ipv6h; - const struct inet6_protocol *ops; - int err = -EINVAL; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - err = -EPROTONOSUPPORT; - - rcu_read_lock(); - ops = rcu_dereference(inet6_protos[ - ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); - - if (likely(ops && ops->gso_send_check)) { - skb_reset_transport_header(skb); - err = ops->gso_send_check(skb); - } - rcu_read_unlock(); - -out: - return err; -} - -static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct ipv6hdr *ipv6h; - const struct inet6_protocol *ops; - int proto; - struct frag_hdr *fptr; - unsigned int unfrag_ip6hlen; - u8 *prevhdr; - int offset = 0; - - if (!(features & NETIF_F_V6_CSUM)) - features &= ~NETIF_F_SG; - - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCPV6 | - 0))) - goto out; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - goto out; - - ipv6h = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*ipv6h)); - segs = ERR_PTR(-EPROTONOSUPPORT); - - proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); - rcu_read_lock(); - ops = rcu_dereference(inet6_protos[proto]); - if (likely(ops && ops->gso_segment)) { - skb_reset_transport_header(skb); - segs = ops->gso_segment(skb, features); - } - rcu_read_unlock(); - - if (unlikely(IS_ERR(segs))) - goto out; - - for (skb = segs; skb; skb = skb->next) { - ipv6h = ipv6_hdr(skb); - ipv6h->payload_len = htons(skb->len - skb->mac_len - - sizeof(*ipv6h)); - if (proto == IPPROTO_UDP) { - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - fptr = (struct frag_hdr *)(skb_network_header(skb) + - unfrag_ip6hlen); - fptr->frag_off = htons(offset); - if (skb->next != NULL) - fptr->frag_off |= htons(IP6_MF); - offset += (ntohs(ipv6h->payload_len) - - sizeof(struct frag_hdr)); - } - } - -out: - return segs; -} - -struct ipv6_gro_cb { - struct napi_gro_cb napi; - int proto; -}; - -#define IPV6_GRO_CB(skb) ((struct ipv6_gro_cb *)(skb)->cb) - -static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - const struct inet6_protocol *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; - struct ipv6hdr *iph; - unsigned int nlen; - unsigned int hlen; - unsigned int off; - int flush = 1; - int proto; - __wsum csum; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*iph); - iph = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - iph = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!iph)) - goto out; - } - - skb_gro_pull(skb, sizeof(*iph)); - skb_set_transport_header(skb, skb_gro_offset(skb)); - - flush += ntohs(iph->payload_len) != skb_gro_len(skb); - - rcu_read_lock(); - proto = iph->nexthdr; - ops = rcu_dereference(inet6_protos[proto]); - if (!ops || !ops->gro_receive) { - __pskb_pull(skb, skb_gro_offset(skb)); - proto = ipv6_gso_pull_exthdrs(skb, proto); - skb_gro_pull(skb, -skb_transport_offset(skb)); - skb_reset_transport_header(skb); - __skb_push(skb, skb_gro_offset(skb)); - - if (!ops || !ops->gro_receive) - goto out_unlock; - - iph = ipv6_hdr(skb); - } - - IPV6_GRO_CB(skb)->proto = proto; - - flush--; - nlen = skb_network_header_len(skb); - - for (p = *head; p; p = p->next) { - struct ipv6hdr *iph2; - - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - iph2 = ipv6_hdr(p); - - /* All fields must match except length. */ - if (nlen != skb_network_header_len(p) || - memcmp(iph, iph2, offsetof(struct ipv6hdr, payload_len)) || - memcmp(&iph->nexthdr, &iph2->nexthdr, - nlen - offsetof(struct ipv6hdr, nexthdr))) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - NAPI_GRO_CB(p)->flush |= flush; - } - - NAPI_GRO_CB(skb)->flush |= flush; - - csum = skb->csum; - skb_postpull_rcsum(skb, iph, skb_network_header_len(skb)); - - pp = ops->gro_receive(head, skb); - - skb->csum = csum; - -out_unlock: - rcu_read_unlock(); - -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -static int ipv6_gro_complete(struct sk_buff *skb) -{ - const struct inet6_protocol *ops; - struct ipv6hdr *iph = ipv6_hdr(skb); - int err = -ENOSYS; - - iph->payload_len = htons(skb->len - skb_network_offset(skb) - - sizeof(*iph)); - - rcu_read_lock(); - ops = rcu_dereference(inet6_protos[IPV6_GRO_CB(skb)->proto]); - if (WARN_ON(!ops || !ops->gro_complete)) - goto out_unlock; - - err = ops->gro_complete(skb); - -out_unlock: - rcu_read_unlock(); - - return err; -} - static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, - .gso_send_check = ipv6_gso_send_check, - .gso_segment = ipv6_gso_segment, - .gro_receive = ipv6_gro_receive, - .gro_complete = ipv6_gro_complete, }; static int __init ipv6_packet_init(void) @@ -971,41 +710,52 @@ static void ipv6_packet_cleanup(void) static int __net_init ipv6_init_mibs(struct net *net) { - if (snmp_mib_init((void **)net->mib.udp_stats_in6, - sizeof (struct udp_mib)) < 0) + int i; + + net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udp_stats_in6) return -ENOMEM; - if (snmp_mib_init((void **)net->mib.udplite_stats_in6, - sizeof (struct udp_mib)) < 0) + net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib); + if (!net->mib.udplite_stats_in6) goto err_udplite_mib; - if (snmp_mib_init((void **)net->mib.ipv6_statistics, - sizeof(struct ipstats_mib)) < 0) + net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib); + if (!net->mib.ipv6_statistics) goto err_ip_mib; - if (snmp_mib_init((void **)net->mib.icmpv6_statistics, - sizeof(struct icmpv6_mib)) < 0) + + for_each_possible_cpu(i) { + struct ipstats_mib *af_inet6_stats; + af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i); + u64_stats_init(&af_inet6_stats->syncp); + } + + + net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib); + if (!net->mib.icmpv6_statistics) goto err_icmp_mib; - if (snmp_mib_init((void **)net->mib.icmpv6msg_statistics, - sizeof(struct icmpv6msg_mib)) < 0) + net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), + GFP_KERNEL); + if (!net->mib.icmpv6msg_statistics) goto err_icmpmsg_mib; return 0; err_icmpmsg_mib: - snmp_mib_free((void **)net->mib.icmpv6_statistics); + free_percpu(net->mib.icmpv6_statistics); err_icmp_mib: - snmp_mib_free((void **)net->mib.ipv6_statistics); + free_percpu(net->mib.ipv6_statistics); err_ip_mib: - snmp_mib_free((void **)net->mib.udplite_stats_in6); + free_percpu(net->mib.udplite_stats_in6); err_udplite_mib: - snmp_mib_free((void **)net->mib.udp_stats_in6); + free_percpu(net->mib.udp_stats_in6); return -ENOMEM; } -static void __net_exit ipv6_cleanup_mibs(struct net *net) +static void ipv6_cleanup_mibs(struct net *net) { - snmp_mib_free((void **)net->mib.udp_stats_in6); - snmp_mib_free((void **)net->mib.udplite_stats_in6); - snmp_mib_free((void **)net->mib.ipv6_statistics); - snmp_mib_free((void **)net->mib.icmpv6_statistics); - snmp_mib_free((void **)net->mib.icmpv6msg_statistics); + free_percpu(net->mib.udp_stats_in6); + free_percpu(net->mib.udplite_stats_in6); + free_percpu(net->mib.ipv6_statistics); + free_percpu(net->mib.icmpv6_statistics); + kfree(net->mib.icmpv6msg_statistics); } static int __net_init inet6_net_init(struct net *net) @@ -1014,6 +764,8 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; + net->ipv6.sysctl.flowlabel_consistency = 1; + atomic_set(&net->ipv6.rt_genid, 0); err = ipv6_init_mibs(net); if (err) @@ -1042,7 +794,7 @@ out: #endif } -static void inet6_net_exit(struct net *net) +static void __net_exit inet6_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS udp6_proc_exit(net); @@ -1057,22 +809,28 @@ static struct pernet_operations inet6_net_ops = { .exit = inet6_net_exit, }; +static const struct ipv6_stub ipv6_stub_impl = { + .ipv6_sock_mc_join = ipv6_sock_mc_join, + .ipv6_sock_mc_drop = ipv6_sock_mc_drop, + .ipv6_dst_lookup = ip6_dst_lookup, + .udpv6_encap_enable = udpv6_encap_enable, + .ndisc_send_na = ndisc_send_na, + .nd_tbl = &nd_tbl, +}; + static int __init inet6_init(void) { - struct sk_buff *dummy_skb; struct list_head *r; int err = 0; - BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); + BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); /* Register the socket-side information for inet6_create. */ - for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); if (disable_ipv6_mod) { - printk(KERN_INFO - "IPv6: Loaded, but administratively disabled, " - "reboot required to enable\n"); + pr_info("Loaded, but administratively disabled, reboot required to enable\n"); goto out; } @@ -1092,6 +850,9 @@ static int __init inet6_init(void) if (err) goto out_unregister_udplite_proto; + err = proto_register(&pingv6_prot, 1); + if (err) + goto out_unregister_ping_proto; /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. @@ -1107,15 +868,10 @@ static int __init inet6_init(void) if (err) goto out_sock_register_fail; -#ifdef CONFIG_SYSCTL - err = ipv6_static_sysctl_register(); - if (err) - goto static_sysctl_fail; -#endif /* * ipngwg API draft makes clear that the correct semantics * for TCP and UDP is to consider one TCP and UDP instance - * in a host availiable by both INET and INET6 APIs and + * in a host available by both INET and INET6 APIs and * able to communicate via both network protocols. */ @@ -1134,6 +890,9 @@ static int __init inet6_init(void) err = igmp6_init(); if (err) goto igmp_fail; + + ipv6_stub = &ipv6_stub_impl; + err = ipv6_netfilter_init(); if (err) goto netfilter_fail; @@ -1152,6 +911,9 @@ static int __init inet6_init(void) err = ip6_route_init(); if (err) goto ip6_route_fail; + err = ndisc_late_init(); + if (err) + goto ndisc_late_fail; err = ip6_flowlabel_init(); if (err) goto ip6_flowlabel_fail; @@ -1185,6 +947,10 @@ static int __init inet6_init(void) if (err) goto ipv6_packet_fail; + err = pingv6_init(); + if (err) + goto pingv6_fail; + #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) @@ -1195,8 +961,10 @@ out: #ifdef CONFIG_SYSCTL sysctl_fail: - ipv6_packet_cleanup(); + pingv6_exit(); #endif +pingv6_fail: + ipv6_packet_cleanup(); ipv6_packet_fail: tcpv6_exit(); tcpv6_fail: @@ -1212,6 +980,8 @@ ipv6_exthdrs_fail: addrconf_fail: ip6_flowlabel_cleanup(); ip6_flowlabel_fail: + ndisc_late_cleanup(); +ndisc_late_fail: ip6_route_cleanup(); ip6_route_fail: #ifdef CONFIG_PROC_FS @@ -1236,14 +1006,12 @@ ipmr_fail: icmp_fail: unregister_pernet_subsys(&inet6_net_ops); register_pernet_fail: -#ifdef CONFIG_SYSCTL - ipv6_static_sysctl_unregister(); -static_sysctl_fail: -#endif sock_unregister(PF_INET6); rtnl_unregister_all(PF_INET6); out_sock_register_fail: rawv6_exit(); +out_unregister_ping_proto: + proto_unregister(&pingv6_prot); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udplite_proto: @@ -1256,56 +1024,4 @@ out_unregister_tcp_proto: } module_init(inet6_init); -static void __exit inet6_exit(void) -{ - if (disable_ipv6_mod) - return; - - /* First of all disallow new sockets creation. */ - sock_unregister(PF_INET6); - /* Disallow any further netlink messages */ - rtnl_unregister_all(PF_INET6); - -#ifdef CONFIG_SYSCTL - ipv6_sysctl_unregister(); -#endif - udpv6_exit(); - udplitev6_exit(); - tcpv6_exit(); - - /* Cleanup code parts. */ - ipv6_packet_cleanup(); - ipv6_frag_exit(); - ipv6_exthdrs_exit(); - addrconf_cleanup(); - ip6_flowlabel_cleanup(); - ip6_route_cleanup(); -#ifdef CONFIG_PROC_FS - - /* Cleanup code parts. */ - if6_proc_exit(); - ipv6_misc_proc_exit(); - udplite6_proc_exit(); - raw6_proc_exit(); -#endif - ipv6_netfilter_fini(); - igmp6_cleanup(); - ndisc_cleanup(); - ip6_mr_cleanup(); - icmpv6_cleanup(); - rawv6_exit(); - - unregister_pernet_subsys(&inet6_net_ops); -#ifdef CONFIG_SYSCTL - ipv6_static_sysctl_unregister(); -#endif - proto_unregister(&rawv6_prot); - proto_unregister(&udplitev6_prot); - proto_unregister(&udpv6_prot); - proto_unregister(&tcpv6_prot); - - rcu_barrier(); /* Wait for completion of call_rcu()'s */ -} -module_exit(inet6_exit); - MODULE_ALIAS_NETPROTO(PF_INET6); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index c2f300c314b..72a4930bdc0 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -12,8 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors * @@ -24,14 +23,18 @@ * This file is derived from net/ipv4/ah.c. */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <crypto/hash.h> #include <linux/module.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <linux/string.h> #include <linux/scatterlist.h> +#include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> @@ -40,7 +43,7 @@ #define IPV6HDR_BASELEN 8 struct tmp_ext { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) struct in6_addr saddr; #endif struct in6_addr daddr; @@ -110,7 +113,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, __alignof__(struct scatterlist)); } -static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) +static bool zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) { u8 *opt = (u8 *)opthdr; int len = ipv6_optlen(opthdr); @@ -124,7 +127,7 @@ static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) switch (opt[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; break; default: @@ -142,13 +145,13 @@ static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) len -= optlen; } if (len == 0) - return 1; + return true; bad: - return 0; + return false; } -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) /** * ipv6_rearrange_destopt - rearrange IPv6 destination options header * @iph: IPv6 header @@ -168,7 +171,7 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des switch (opt[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; break; default: @@ -188,13 +191,13 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des hao = (struct ipv6_destopt_hao *)&opt[off]; if (hao->length != sizeof(hao->addr)) { - if (net_ratelimit()) - printk(KERN_WARNING "destopt hao: invalid header length: %u\n", hao->length); + net_warn_ratelimited("destopt hao: invalid header length: %u\n", + hao->length); goto bad; } - ipv6_addr_copy(&final_addr, &hao->addr); - ipv6_addr_copy(&hao->addr, &iph->saddr); - ipv6_addr_copy(&iph->saddr, &final_addr); + final_addr = hao->addr; + hao->addr = iph->saddr; + iph->saddr = final_addr; } break; } @@ -240,13 +243,13 @@ static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) segments = rthdr->hdrlen >> 1; addrs = ((struct rt0_hdr *)rthdr)->addr; - ipv6_addr_copy(&final_addr, addrs + segments - 1); + final_addr = addrs[segments - 1]; addrs += segments - segments_left; memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); - ipv6_addr_copy(addrs, &iph->daddr); - ipv6_addr_copy(&iph->daddr, &final_addr); + addrs[0] = iph->daddr; + iph->daddr = final_addr; } static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) @@ -316,15 +319,13 @@ static void ah6_output_done(struct crypto_async_request *base, int err) memcpy(top_iph, iph_base, IPV6HDR_BASELEN); if (extlen) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) memcpy(&top_iph->saddr, iph_ext, extlen); #else memcpy(&top_iph->daddr, iph_ext, extlen); #endif } - err = ah->nexthdr; - kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb, err); } @@ -345,6 +346,10 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) struct ip_auth_hdr *ah; struct ah_data *ahp; struct tmp_ext *iph_ext; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; @@ -358,15 +363,22 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) if (extlen) extlen += sizeof(*iph_ext); + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } err = -ENOMEM; - iph_base = ah_alloc_tmp(ahash, nfrags, IPV6HDR_BASELEN + extlen); + iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN + + extlen + seqhi_len); if (!iph_base) goto out; iph_ext = ah_tmp_ext(iph_base); - icv = ah_tmp_icv(ahash, iph_ext, extlen); + seqhi = (__be32 *)((char *)iph_ext + extlen); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; ah = ip_auth_hdr(skb); memset(ah->auth_data, 0, ahp->icv_trunc_len); @@ -383,7 +395,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(iph_base, top_iph, IPV6HDR_BASELEN); if (extlen) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) memcpy(iph_ext, &top_iph->saddr, extlen); #else memcpy(iph_ext, &top_iph->daddr, extlen); @@ -408,12 +420,17 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) ah->reserved = 0; ah->spi = x->id.spi; - ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); - ahash_request_set_crypt(req, sg, icv, skb->len); + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(seqhisg, seqhi, seqhi_len); + } + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah6_output_done, skb); AH_SKB_CB(skb)->tmp = iph_base; @@ -432,7 +449,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(top_iph, iph_base, IPV6HDR_BASELEN); if (extlen) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) memcpy(&top_iph->saddr, iph_ext, extlen); #else memcpy(&top_iph->daddr, iph_ext, extlen); @@ -465,12 +482,15 @@ static void ah6_input_done(struct crypto_async_request *base, int err) if (err) goto out; + err = ah->nexthdr; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, hdr_len); __skb_pull(skb, ah_hlen + hdr_len); - skb_set_transport_header(skb, -hdr_len); - - err = ah->nexthdr; + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -hdr_len); out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); @@ -510,14 +530,17 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) int nexthdr; int nfrags; int err = -ENOMEM; + int seqhi_len = 0; + __be32 *seqhi; + int sglists = 0; + struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) goto out; /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; @@ -537,22 +560,32 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) if (!pskb_may_pull(skb, ah_hlen)) goto out; - ip6h = ipv6_hdr(skb); - - skb_push(skb, hdr_len); if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; - work_iph = ah_alloc_tmp(ahash, nfrags, hdr_len + ahp->icv_trunc_len); + ah = (struct ip_auth_hdr *)skb->data; + ip6h = ipv6_hdr(skb); + + skb_push(skb, hdr_len); + + if (x->props.flags & XFRM_STATE_ESN) { + sglists = 1; + seqhi_len = sizeof(*seqhi); + } + + work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len + + ahp->icv_trunc_len + seqhi_len); if (!work_iph) goto out; - auth_data = ah_tmp_auth(work_iph, hdr_len); - icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); + auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); + seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); + icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); + seqhisg = sg + nfrags; memcpy(work_iph, ip6h, hdr_len); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); @@ -567,10 +600,16 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) ip6h->flow_lbl[2] = 0; ip6h->hop_limit = 0; - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + sg_init_table(sg, nfrags + sglists); + skb_to_sgvec_nomark(skb, sg, 0, skb->len); + + if (x->props.flags & XFRM_STATE_ESN) { + /* Attach seqhi sg right after packet payload */ + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(seqhisg, seqhi, seqhi_len); + } - ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah6_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; @@ -580,8 +619,6 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) if (err == -EINPROGRESS) goto out; - if (err == -EBUSY) - err = NET_XMIT_DROP; goto out_free; } @@ -591,9 +628,13 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, hdr_len); - skb->transport_header = skb->network_header; __skb_pull(skb, ah_hlen + hdr_len); + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -hdr_len); + err = nexthdr; out_free: @@ -602,26 +643,29 @@ out: return err; } -static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) +static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); struct xfrm_state *x; - if (type != ICMPV6_DEST_UNREACH && - type != ICMPV6_PKT_TOOBIG) - return; + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; - x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); if (!x) - return; - - NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n", - ntohl(ah->spi), &iph->daddr); + return 0; + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static int ah6_init_state(struct xfrm_state *x) @@ -660,9 +704,9 @@ static int ah6_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { - printk(KERN_INFO "AH: %s digestsize %u != %hu\n", - x->aalg->alg_name, crypto_ahash_digestsize(ahash), - aalg_desc->uinfo.auth.icv_fullbits/8); + pr_info("AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_ahash_digestsize(ahash), + aalg_desc->uinfo.auth.icv_fullbits/8); goto error; } @@ -706,6 +750,11 @@ static void ah6_destroy(struct xfrm_state *x) kfree(ahp); } +static int ah6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type ah6_type = { .description = "AH6", @@ -719,21 +768,22 @@ static const struct xfrm_type ah6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static const struct inet6_protocol ah6_protocol = { +static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, + .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init ah6_init(void) { if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { - printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); + pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { - printk(KERN_INFO "ipv6 ah init: can't add protocol\n"); + if (xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH) < 0) { + pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah6_type, AF_INET6); return -EAGAIN; } @@ -743,11 +793,11 @@ static int __init ah6_init(void) static void __exit ah6_fini(void) { - if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) - printk(KERN_INFO "ipv6 ah close: can't remove protocol\n"); + if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) + pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) - printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n"); + pr_info("%s: can't remove xfrm type\n", __func__); } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index f1c74c8ef9d..21018324468 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -29,6 +29,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -43,17 +44,17 @@ #include <net/checksum.h> -static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr); +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); /* Big ac list lock for all the sockets */ -static DEFINE_RWLOCK(ipv6_sk_ac_lock); +static DEFINE_SPINLOCK(ipv6_sk_ac_lock); /* * socket join an anycast group */ -int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) +int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev = NULL; @@ -63,7 +64,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) int ishost = !net->ipv6.devconf_all->forwarding; int err = 0; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; @@ -74,43 +75,42 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (pac == NULL) return -ENOMEM; pac->acl_next = NULL; - ipv6_addr_copy(&pac->acl_addr, addr); + pac->acl_addr = *addr; + rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, 0); if (rt) { - dev = rt->rt6i_dev; - dev_hold(dev); - dst_release(&rt->u.dst); + dev = rt->dst.dev; + ip6_rt_put(rt); } else if (ishost) { err = -EADDRNOTAVAIL; - goto out_free_pac; + goto error; } else { /* router, no matching interface: just pick one */ - - dev = dev_get_by_flags(net, IFF_UP, IFF_UP|IFF_LOOPBACK); + dev = dev_get_by_flags_rcu(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); } } else - dev = dev_get_by_index(net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (dev == NULL) { err = -ENODEV; - goto out_free_pac; + goto error; } - idev = in6_dev_get(dev); + idev = __in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; else err = -EADDRNOTAVAIL; - goto out_dev_put; + goto error; } /* reset ishost, now that we have a specific device */ ishost = !idev->cnf.forwarding; - in6_dev_put(idev); pac->acl_ifindex = dev->ifindex; @@ -123,40 +123,36 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (ishost) err = -EADDRNOTAVAIL; if (err) - goto out_dev_put; + goto error; } err = ipv6_dev_ac_inc(dev, addr); - if (err) - goto out_dev_put; - - write_lock_bh(&ipv6_sk_ac_lock); - pac->acl_next = np->ipv6_ac_list; - np->ipv6_ac_list = pac; - write_unlock_bh(&ipv6_sk_ac_lock); - - dev_put(dev); - - return 0; + if (!err) { + spin_lock_bh(&ipv6_sk_ac_lock); + pac->acl_next = np->ipv6_ac_list; + np->ipv6_ac_list = pac; + spin_unlock_bh(&ipv6_sk_ac_lock); + pac = NULL; + } -out_dev_put: - dev_put(dev); -out_free_pac: - sock_kfree_s(sk, pac, sizeof(*pac)); +error: + rcu_read_unlock(); + if (pac) + sock_kfree_s(sk, pac, sizeof(*pac)); return err; } /* * socket leave an anycast group */ -int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) +int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct net_device *dev; struct ipv6_ac_socklist *pac, *prev_pac; struct net *net = sock_net(sk); - write_lock_bh(&ipv6_sk_ac_lock); + spin_lock_bh(&ipv6_sk_ac_lock); prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { if ((ifindex == 0 || pac->acl_ifindex == ifindex) && @@ -165,7 +161,7 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) prev_pac = pac; } if (!pac) { - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); return -ENOENT; } if (prev_pac) @@ -173,13 +169,14 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) else np->ipv6_ac_list = pac->acl_next; - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); - dev = dev_get_by_index(net, pac->acl_ifindex); - if (dev) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + if (dev) ipv6_dev_ac_dec(dev, &pac->acl_addr); - dev_put(dev); - } + rcu_read_unlock(); + sock_kfree_s(sk, pac, sizeof(*pac)); return 0; } @@ -192,19 +189,21 @@ void ipv6_sock_ac_close(struct sock *sk) struct net *net = sock_net(sk); int prev_index; - write_lock_bh(&ipv6_sk_ac_lock); + if (!np->ipv6_ac_list) + return; + + spin_lock_bh(&ipv6_sk_ac_lock); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; - write_unlock_bh(&ipv6_sk_ac_lock); + spin_unlock_bh(&ipv6_sk_ac_lock); prev_index = 0; + rcu_read_lock(); while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - if (dev) - dev_put(dev); - dev = dev_get_by_index(net, pac->acl_ifindex); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -212,44 +211,14 @@ void ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } - if (dev) - dev_put(dev); -} - -#if 0 -/* The function is not used, which is funny. Apparently, author - * supposed to use it to filter out datagrams inside udp/raw but forgot. - * - * It is OK, anycasts are not special comparing to delivery to unicasts. - */ - -int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex) -{ - struct ipv6_ac_socklist *pac; - struct ipv6_pinfo *np = inet6_sk(sk); - int found; - - found = 0; - read_lock(&ipv6_sk_ac_lock); - for (pac=np->ipv6_ac_list; pac; pac=pac->acl_next) { - if (ifindex && pac->acl_ifindex != ifindex) - continue; - found = ipv6_addr_equal(&pac->acl_addr, addr); - if (found) - break; - } - read_unlock(&ipv6_sk_ac_lock); - - return found; + rcu_read_unlock(); } -#endif - static void aca_put(struct ifacaddr6 *ac) { if (atomic_dec_and_test(&ac->aca_refcnt)) { in6_dev_put(ac->aca_idev); - dst_release(&ac->aca_rt->u.dst); + dst_release(&ac->aca_rt->dst); kfree(ac); } } @@ -257,7 +226,7 @@ static void aca_put(struct ifacaddr6 *ac) /* * device anycast group inc (add if not found) */ -int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) +int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) { struct ifacaddr6 *aca; struct inet6_dev *idev; @@ -294,14 +263,14 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) goto out; } - rt = addrconf_dst_alloc(idev, addr, 1); + rt = addrconf_dst_alloc(idev, addr, true); if (IS_ERR(rt)) { kfree(aca); err = PTR_ERR(rt); goto out; } - ipv6_addr_copy(&aca->aca_addr, addr); + aca->aca_addr = *addr; aca->aca_idev = idev; aca->aca_rt = rt; aca->aca_users = 1; @@ -329,7 +298,7 @@ out: /* * device anycast group decrement */ -int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) +int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; @@ -355,65 +324,76 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->u.dst); + dst_hold(&aca->aca_rt->dst); ip6_del_rt(aca->aca_rt); aca_put(aca); return 0; } -static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr) +/* called with rcu_read_lock() */ +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { - int ret; - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get(dev); + if (idev == NULL) return -ENODEV; - ret = __ipv6_dev_ac_dec(idev, addr); - in6_dev_put(idev); - return ret; + return __ipv6_dev_ac_dec(idev, addr); } /* * check if the interface has this anycast address + * called with rcu_read_lock() */ -static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr) +static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) { struct inet6_dev *idev; struct ifacaddr6 *aca; - idev = in6_dev_get(dev); + idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); for (aca = idev->ac_list; aca; aca = aca->aca_next) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; read_unlock_bh(&idev->lock); - in6_dev_put(idev); return aca != NULL; } - return 0; + return false; } /* * check if given interface (or any, if dev==0) has this anycast address */ -int ipv6_chk_acast_addr(struct net *net, struct net_device *dev, - struct in6_addr *addr) +bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, + const struct in6_addr *addr) { - int found = 0; + bool found = false; - if (dev) - return ipv6_chk_acast_dev(dev, addr); rcu_read_lock(); - for_each_netdev_rcu(net, dev) - if (ipv6_chk_acast_dev(dev, addr)) { - found = 1; - break; - } + if (dev) + found = ipv6_chk_acast_dev(dev, addr); + else + for_each_netdev_rcu(net, dev) + if (ipv6_chk_acast_dev(dev, addr)) { + found = true; + break; + } rcu_read_unlock(); return found; } +/* check if this anycast address is link-local on given interface or + * is global + */ +bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, + const struct in6_addr *addr) +{ + return ipv6_chk_acast_addr(net, + (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ? + dev : NULL), + addr); +} #ifdef CONFIG_PROC_FS struct ac6_iter_state { @@ -538,9 +518,9 @@ static const struct file_operations ac6_seq_fops = { .release = seq_release_net, }; -int ac6_proc_init(struct net *net) +int __net_init ac6_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "anycast6", S_IRUGO, &ac6_seq_fops)) + if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops)) return -ENOMEM; return 0; @@ -548,7 +528,7 @@ int ac6_proc_init(struct net *net) void ac6_proc_exit(struct net *net) { - proc_net_remove(net, "anycast6"); + remove_proc_entry("anycast6", net->proc_net); } #endif diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e6f9cdf780f..c3bf2d2e519 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -21,6 +21,8 @@ #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/route.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -28,19 +30,26 @@ #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> +#include <net/dsfield.h> #include <linux/errqueue.h> #include <asm/uaccess.h> +static bool ipv6_mapped_addr_any(const struct in6_addr *a) +{ + return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); +} + int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr, *final_p = NULL, final; + struct in6_addr *daddr, *final_p, final; struct dst_entry *dst; - struct flowi fl; + struct flowi6 fl6; struct ip6_flowlabel *flowlabel = NULL; + struct ipv6_txoptions *opt; int addr_type; int err; @@ -57,14 +66,13 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (np->sndflow) { - fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); } } @@ -91,26 +99,31 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sin.sin_port = usin->sin6_port; err = ip4_datagram_connect(sk, - (struct sockaddr*) &sin, + (struct sockaddr *) &sin, sizeof(sin)); ipv4_connected: if (err) goto out; - ipv6_addr_set_v4mapped(inet->inet_daddr, &np->daddr); + ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr); - if (ipv6_addr_any(&np->saddr)) + if (ipv6_addr_any(&np->saddr) || + ipv6_mapped_addr_any(&np->saddr)) ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); - if (ipv6_addr_any(&np->rcv_saddr)) + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || + ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) { ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, - &np->rcv_saddr); + &sk->sk_v6_rcv_saddr); + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } goto out; } - if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (sk->sk_bound_dev_if && @@ -131,8 +144,8 @@ ipv4_connected: } } - ipv6_addr_copy(&np->daddr, daddr); - np->flow_label = fl.fl6_flowlabel; + sk->sk_v6_daddr = *daddr; + np->flow_label = fl6.flowlabel; inet->inet_dport = usin->sin6_port; @@ -141,62 +154,46 @@ ipv4_connected: * destination cache for it. */ - fl.proto = sk->sk_protocol; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; - fl.fl_ip_dport = inet->inet_dport; - fl.fl_ip_sport = inet->inet_sport; - - if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST)) - fl.oif = np->mcast_oif; - - security_sk_classify_flow(sk, &fl); - - if (flowlabel) { - if (flowlabel->opt && flowlabel->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - } else if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + fl6.flowi6_proto = sk->sk_protocol; + fl6.daddr = sk->sk_v6_daddr; + fl6.saddr = np->saddr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) + if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) + fl6.flowi6_oif = np->mcast_oif; + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + opt = flowlabel ? flowlabel->opt : np->opt; + final_p = fl6_update_dst(&fl6, opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); goto out; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); - if (err < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto out; } /* source address lookup done in ip6_dst_lookup */ if (ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&np->saddr, &fl.fl6_src); + np->saddr = fl6.saddr; - if (ipv6_addr_any(&np->rcv_saddr)) { - ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + sk->sk_v6_rcv_saddr = fl6.saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); } ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? - &np->daddr : NULL, + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? + &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl.fl6_src, &np->saddr) ? + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? &np->saddr : #endif NULL); @@ -206,6 +203,17 @@ out: fl6_sock_release(flowlabel); return err; } +EXPORT_SYMBOL_GPL(ip6_datagram_connect); + +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); + if (sin6->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + return ip6_datagram_connect(sk, uaddr, addr_len); +} +EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) @@ -221,6 +229,8 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, if (!skb) return; + skb->protocol = htons(ETH_P_IPV6); + serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; @@ -240,7 +250,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, kfree_skb(skb); } -void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) +void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; @@ -254,10 +264,12 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) if (!skb) return; + skb->protocol = htons(ETH_P_IPV6); + skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); - ipv6_addr_copy(&iph->daddr, &fl->fl6_dst); + iph->daddr = fl6->daddr; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; @@ -268,7 +280,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); - serr->port = fl->fl_ip_dport; + serr->port = fl6->fl6_dport; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); @@ -277,15 +289,50 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) kfree_skb(skb); } +void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + struct ip6_mtuinfo *mtu_info; + + if (!np->rxopt.bits.rxpmtu) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + iph->daddr = fl6->daddr; + + mtu_info = IP6CBMTU(skb); + + mtu_info->ip6m_mtu = mtu; + mtu_info->ip6m_addr.sin6_family = AF_INET6; + mtu_info->ip6m_addr.sin6_port = 0; + mtu_info->ip6m_addr.sin6_flowinfo = 0; + mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; + mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; + + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); + + skb = xchg(&np->rxpmtu, skb); + kfree_skb(skb); +} + /* * Handle MSG_ERRQUEUE */ -int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; - struct sockaddr_in6 *sin; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; @@ -311,26 +358,26 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) serr = SKB_EXT_ERR(skb); - sin = (struct sockaddr_in6 *)msg->msg_name; if (sin) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; - sin->sin6_scope_id = 0; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { - ipv6_addr_copy(&sin->sin6_addr, - (struct in6_addr *)(nh + serr->addr_offset)); + if (skb->protocol == htons(ETH_P_IPV6)) { + const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), + struct ipv6hdr, daddr); + sin->sin6_addr = ip6h->daddr; if (np->sndflow) - sin->sin6_flowinfo = - (*(__be32 *)(nh + serr->addr_offset - 24) & - IPV6_FLOWINFO_MASK); - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = IP6CB(skb)->iif; + sin->sin6_flowinfo = ip6_flowinfo(ip6h); + sin->sin6_scope_id = + ipv6_iface_scope_id(&sin->sin6_addr, + IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), &sin->sin6_addr); + sin->sin6_scope_id = 0; } + *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); @@ -339,18 +386,22 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; - sin->sin6_scope_id = 0; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { - ipv6_addr_copy(&sin->sin6_addr, &ipv6_hdr(skb)->saddr); + sin->sin6_port = 0; + if (np->rxopt.all) + ip6_datagram_recv_common_ctl(sk, msg, skb); + if (skb->protocol == htons(ETH_P_IPV6)) { + sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) - datagram_recv_ctl(sk, msg, skb); - if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin->sin6_scope_id = IP6CB(skb)->iif; + ip6_datagram_recv_specific_ctl(sk, msg, skb); + sin->sin6_scope_id = + ipv6_iface_scope_id(&sin->sin6_addr, + IP6CB(skb)->iif); } else { struct inet_sock *inet = inet_sk(sk); ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); + sin->sin6_scope_id = 0; if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } @@ -379,22 +430,87 @@ out_free_skb: out: return err; } +EXPORT_SYMBOL_GPL(ipv6_recv_error); + +/* + * Handle IPV6_RECVPATHMTU + */ +int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, + int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ip6_mtuinfo mtu_info; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); + int err; + int copied; + + err = -EAGAIN; + skb = xchg(&np->rxpmtu, NULL); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); + + if (sin) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = 0; + sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; + sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; + *addr_len = sizeof(*sin); + } + put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); + err = copied; + +out_free_skb: + kfree_skb(skb); +out: + return err; +} -int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) + +void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - unsigned char *nh = skb_network_header(skb); + bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; - src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + if (is_ipv6) { + src_info.ipi6_ifindex = IP6CB(skb)->iif; + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; + } else { + src_info.ipi6_ifindex = + PKTINFO_SKB_CB(skb)->ipi_ifindex; + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + &src_info.ipi6_addr); + } put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } +} + +void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; @@ -402,13 +518,14 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) } if (np->rxopt.bits.rxtclass) { - int tclass = (ntohl(*(__be32 *)ipv6_hdr(skb)) >> 20) & 0xff; + int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } - if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) { - __be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK; - put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + if (np->rxopt.bits.rxflow) { + __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); + if (flowinfo) + put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ @@ -432,10 +549,10 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { - unsigned len; + unsigned int len; u8 *ptr = nh + off; - switch(nexthdr) { + switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; @@ -467,7 +584,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; - ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr); + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { @@ -490,13 +607,41 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } - return 0; + if (np->rxopt.bits.rxorigdstaddr) { + struct sockaddr_in6 sin6; + __be16 *ports = (__be16 *) skb_transport_header(skb); + + if (skb_transport_offset(skb) + 4 <= skb->len) { + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. + */ + + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ipv6_hdr(skb)->daddr; + sin6.sin6_port = ports[1]; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = + ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, + opt->iif); + + put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); + } + } } -int datagram_send_ctl(struct net *net, - struct msghdr *msg, struct flowi *fl, - struct ipv6_txoptions *opt, - int *hlimit, int *tclass) +void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + ip6_datagram_recv_common_ctl(sk, msg, skb); + ip6_datagram_recv_specific_ctl(sk, msg, skb); +} +EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); + +int ip6_datagram_send_ctl(struct net *net, struct sock *sk, + struct msghdr *msg, struct flowi6 *fl6, + struct ipv6_txoptions *opt, + int *hlimit, int *tclass, int *dontfrag) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -530,16 +675,17 @@ int datagram_send_ctl(struct net *net, src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (src_info->ipi6_ifindex) { - if (fl->oif && src_info->ipi6_ifindex != fl->oif) + if (fl6->flowi6_oif && + src_info->ipi6_ifindex != fl6->flowi6_oif) return -EINVAL; - fl->oif = src_info->ipi6_ifindex; + fl6->flowi6_oif = src_info->ipi6_ifindex; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); rcu_read_lock(); - if (fl->oif) { - dev = dev_get_by_index_rcu(net, fl->oif); + if (fl6->flowi6_oif) { + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (!dev) { rcu_read_unlock(); return -ENODEV; @@ -551,11 +697,14 @@ int datagram_send_ctl(struct net *net, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!ipv6_chk_addr(net, &src_info->ipi6_addr, - strict ? dev : NULL, 0)) + if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + !ipv6_chk_addr(net, &src_info->ipi6_addr, + strict ? dev : NULL, 0) && + !ipv6_chk_acast_addr_src(net, dev, + &src_info->ipi6_addr)) err = -EINVAL; else - ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr); + fl6->saddr = src_info->ipi6_addr; } rcu_read_unlock(); @@ -572,13 +721,13 @@ int datagram_send_ctl(struct net *net, goto exit_f; } - if (fl->fl6_flowlabel&IPV6_FLOWINFO_MASK) { - if ((fl->fl6_flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { + if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { + if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { err = -EINVAL; goto exit_f; } } - fl->fl6_flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); + fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); break; case IPV6_2292HOPOPTS: @@ -594,7 +743,7 @@ int datagram_send_ctl(struct net *net, err = -EINVAL; goto exit_f; } - if (!capable(CAP_NET_RAW)) { + if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } @@ -614,7 +763,7 @@ int datagram_send_ctl(struct net *net, err = -EINVAL; goto exit_f; } - if (!capable(CAP_NET_RAW)) { + if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } @@ -639,7 +788,7 @@ int datagram_send_ctl(struct net *net, err = -EINVAL; goto exit_f; } - if (!capable(CAP_NET_RAW)) { + if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } @@ -662,7 +811,7 @@ int datagram_send_ctl(struct net *net, rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); switch (rthdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) { @@ -723,9 +872,8 @@ int datagram_send_ctl(struct net *net, int tc; err = -EINVAL; - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; - } tc = *(int *)CMSG_DATA(cmsg); if (tc < -1 || tc > 0xff) @@ -736,6 +884,24 @@ int datagram_send_ctl(struct net *net, break; } + + case IPV6_DONTFRAG: + { + int df; + + err = -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + goto exit_f; + + df = *(int *)CMSG_DATA(cmsg); + if (df < 0 || df > 1) + goto exit_f; + + err = 0; + *dontfrag = df; + + break; + } default: LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type); @@ -747,3 +913,30 @@ int datagram_send_ctl(struct net *net, exit_f: return err; } +EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); + +void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, + __u16 srcp, __u16 destp, int bucket) +{ + const struct in6_addr *dest, *src; + + dest = &sp->sk_v6_daddr; + src = &sp->sk_v6_rcv_saddr; + seq_printf(seq, + "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n", + bucket, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops)); +} diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 668a46b655e..d15da137714 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -12,8 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors * @@ -24,6 +23,8 @@ * This file is derived from net/ipv4/esp.c */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> @@ -37,6 +38,7 @@ #include <linux/random.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> @@ -49,19 +51,25 @@ struct esp_skb_cb { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) +static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); + /* * Allocate an AEAD request structure with extra space for SG and IV. * - * For alignment considerations the IV is placed at the front, followed - * by the request and finally the SG list. + * For alignment considerations the upper 32 bits of the sequence number are + * placed at the front, if present. Followed by the IV, the request and finally + * the SG list. * * TODO: Use spare space in skb for this where possible. */ -static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) { unsigned int len; - len = crypto_aead_ivsize(aead); + len = seqihlen; + + len += crypto_aead_ivsize(aead); + if (len) { len += crypto_aead_alignmask(aead) & ~(crypto_tfm_ctx_alignment() - 1); @@ -76,10 +84,16 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) return kmalloc(len, GFP_ATOMIC); } -static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} + +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) { return crypto_aead_ivsize(aead) ? - PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; + PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } static inline struct aead_givcrypt_request *esp_tmp_givreq( @@ -140,47 +154,73 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) int blksize; int clen; int alen; + int plen; + int tfclen; int nfrags; + int assoclen; + int sglists; + int seqhilen; u8 *iv; u8 *tail; - struct esp_data *esp = x->data; + __be32 *seqhi; /* skb is pure payload to encrypt */ - err = -ENOMEM; - - /* Round to block size */ - clen = skb->len; - - aead = esp->aead; + aead = x->data; alen = crypto_aead_authsize(aead); + tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + tfclen = padto - skb->len; + } blksize = ALIGN(crypto_aead_blocksize(aead), 4); - clen = ALIGN(clen + 2, blksize); - if (esp->padlen) - clen = ALIGN(clen, esp->padlen); + clen = ALIGN(skb->len + 2 + tfclen, blksize); + plen = clen - skb->len - tfclen; - if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) + err = skb_cow_data(skb, tfclen + plen + alen, &trailer); + if (err < 0) goto error; nfrags = err; - tmp = esp_alloc_tmp(aead, nfrags + 1); - if (!tmp) + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) { + err = -ENOMEM; goto error; + } - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_givreq(aead, iv); asg = esp_givreq_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; /* Fill padding... */ tail = skb_tail_pointer(trailer); + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } do { int i; - for (i=0; i<clen-skb->len - 2; i++) + for (i = 0; i < plen - 2; i++) tail[i] = i + 1; } while (0); - tail[clen-skb->len - 2] = (clen - skb->len) - 2; - tail[clen - skb->len - 1] = *skb_mac_header(skb); + tail[plen - 2] = plen - 2; + tail[plen - 1] = *skb_mac_header(skb); pskb_put(skb, trailer, clen - skb->len + alen); skb_push(skb, -skb_network_offset(skb)); @@ -188,19 +228,27 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, esph->enc_data + crypto_aead_ivsize(aead) - skb->data, clen + alen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_givcrypt_set_callback(req, 0, esp_output_done, skb); aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); + aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output); + XFRM_SKB_CB(skb)->seq.output.low); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); @@ -219,8 +267,7 @@ error: static int esp_input_done2(struct sk_buff *skb, int err) { struct xfrm_state *x = xfrm_input_state(skb); - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int elen = skb->len - hlen; @@ -248,7 +295,10 @@ static int esp_input_done2(struct sk_buff *skb, int err) pskb_trim(skb, skb->len - alen - padlen - 2); __skb_pull(skb, hlen); - skb_set_transport_header(skb, -hdr_len); + if (x->props.mode == XFRM_MODE_TUNNEL) + skb_reset_transport_header(skb); + else + skb_set_transport_header(skb, -hdr_len); err = nexthdr[1]; @@ -270,14 +320,17 @@ static void esp_input_done(struct crypto_async_request *base, int err) static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; - struct esp_data *esp = x->data; - struct crypto_aead *aead = esp->aead; + struct crypto_aead *aead = x->data; struct aead_request *req; struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); int nfrags; + int assoclen; + int sglists; + int seqhilen; int ret = 0; void *tmp; + __be32 *seqhi; u8 *iv; struct scatterlist *sg; struct scatterlist *asg; @@ -298,15 +351,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) } ret = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags + 1); + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto out; ESP_SKB_CB(skb)->tmp = tmp; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); asg = esp_req_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; skb->ip_summed = CHECKSUM_NONE; @@ -317,11 +382,19 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_request_set_callback(req, 0, esp_input_done, skb); aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, sizeof(*esph)); + aead_request_set_assoc(req, asg, assoclen); ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) @@ -335,58 +408,57 @@ out: static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) { - struct esp_data *esp = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); - u32 align = max_t(u32, blksize, esp->padlen); - u32 rem; - - mtu -= x->props.header_len + crypto_aead_authsize(esp->aead); - rem = mtu & (align - 1); - mtu &= ~(align - 1); - - if (x->props.mode != XFRM_MODE_TUNNEL) { - u32 padsize = ((blksize - 1) & 7) + 1; - mtu -= blksize - padsize; - mtu += min_t(u32, blksize - padsize, rem); - } + struct crypto_aead *aead = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); + unsigned int net_adj; + + if (x->props.mode != XFRM_MODE_TUNNEL) + net_adj = sizeof(struct ipv6hdr); + else + net_adj = 0; - return mtu - 2; + return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - + net_adj) & ~(blksize - 1)) + net_adj - 2; } -static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) +static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); - struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + offset); struct xfrm_state *x; - if (type != ICMPV6_DEST_UNREACH && - type != ICMPV6_PKT_TOOBIG) - return; + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; - x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + esph->spi, IPPROTO_ESP, AF_INET6); if (!x) - return; - printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%pI6\n", - ntohl(esph->spi), &iph->daddr); + return 0; + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static void esp6_destroy(struct xfrm_state *x) { - struct esp_data *esp = x->data; + struct crypto_aead *aead = x->data; - if (!esp) + if (!aead) return; - crypto_free_aead(esp->aead); - kfree(esp); + crypto_free_aead(aead); } static int esp_init_aead(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; int err; @@ -395,7 +467,7 @@ static int esp_init_aead(struct xfrm_state *x) if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; err = crypto_aead_setkey(aead, x->aead->alg_key, (x->aead->alg_key_len + 7) / 8); @@ -412,7 +484,6 @@ error: static int esp_init_authenc(struct xfrm_state *x) { - struct esp_data *esp = x->data; struct crypto_aead *aead; struct crypto_authenc_key_param *param; struct rtattr *rta; @@ -427,17 +498,27 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; err = -ENAMETOOLONG; - if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", - x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) - goto error; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; - esp->aead = aead; + x->data = aead; keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); @@ -492,7 +573,6 @@ error: static int esp6_init_state(struct xfrm_state *x) { - struct esp_data *esp; struct crypto_aead *aead; u32 align; int err; @@ -500,11 +580,7 @@ static int esp6_init_state(struct xfrm_state *x) if (x->encap) return -EINVAL; - esp = kzalloc(sizeof(*esp), GFP_KERNEL); - if (esp == NULL) - return -ENOMEM; - - x->data = esp; + x->data = NULL; if (x->aead) err = esp_init_aead(x); @@ -514,9 +590,7 @@ static int esp6_init_state(struct xfrm_state *x) if (err) goto error; - aead = esp->aead; - - esp->padlen = 0; + aead = x->data; x->props.header_len = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -536,14 +610,17 @@ static int esp6_init_state(struct xfrm_state *x) } align = ALIGN(crypto_aead_blocksize(aead), 4); - if (esp->padlen) - align = max_t(u32, align, esp->padlen); - x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); + x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); error: return err; } +static int esp6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; +} + static const struct xfrm_type esp6_type = { .description = "ESP6", @@ -558,20 +635,21 @@ static const struct xfrm_type esp6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static const struct inet6_protocol esp6_protocol = { - .handler = xfrm6_rcv, +static struct xfrm6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .cb_handler = esp6_rcv_cb, .err_handler = esp6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init esp6_init(void) { if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { - printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); + pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { - printk(KERN_INFO "ipv6 esp init: can't add protocol\n"); + if (xfrm6_protocol_register(&esp6_protocol, IPPROTO_ESP) < 0) { + pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&esp6_type, AF_INET6); return -EAGAIN; } @@ -581,10 +659,10 @@ static int __init esp6_init(void) static void __exit esp6_fini(void) { - if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) - printk(KERN_INFO "ipv6 esp close: can't remove protocol\n"); + if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0) + pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) - printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n"); + pr_info("%s: can't remove xfrm type\n", __func__); } module_init(esp6_init); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index df159fffe4b..8d67900aa00 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -29,6 +29,8 @@ #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/dst.h> #include <net/sock.h> @@ -41,67 +43,23 @@ #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif #include <asm/uaccess.h> -int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) -{ - const unsigned char *nh = skb_network_header(skb); - int packet_len = skb->tail - skb->network_header; - struct ipv6_opt_hdr *hdr; - int len; - - if (offset + 2 > packet_len) - goto bad; - hdr = (struct ipv6_opt_hdr *)(nh + offset); - len = ((hdr->hdrlen + 1) << 3); - - if (offset + len > packet_len) - goto bad; - - offset += 2; - len -= 2; - - while (len > 0) { - int opttype = nh[offset]; - int optlen; - - if (opttype == type) - return offset; - - switch (opttype) { - case IPV6_TLV_PAD0: - optlen = 1; - break; - default: - optlen = nh[offset + 1] + 2; - if (optlen > len) - goto bad; - break; - } - offset += optlen; - len -= optlen; - } - /* not_found */ - bad: - return -1; -} -EXPORT_SYMBOL_GPL(ipv6_find_tlv); - /* * Parsing tlv encoded headers. * - * Parsing function "func" returns 1, if parsing succeed - * and 0, if it failed. + * Parsing function "func" returns true, if parsing succeed + * and false, if it failed. * It MUST NOT touch skb->h. */ struct tlvtype_proc { int type; - int (*func)(struct sk_buff *skb, int offset); + bool (*func)(struct sk_buff *skb, int offset); }; /********************* @@ -110,11 +68,11 @@ struct tlvtype_proc { /* An unknown option is detected, decide what to do */ -static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) { switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ - return 1; + return true; case 1: /* drop packet */ break; @@ -127,21 +85,22 @@ static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) break; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); - return 0; + return false; } kfree_skb(skb); - return 0; + return false; } /* Parse tlv encoded option header (hop-by-hop or destination) */ -static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) +static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb) { - struct tlvtype_proc *curr; + const struct tlvtype_proc *curr; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); int len = (skb_transport_header(skb)[1] + 1) << 3; + int padlen = 0; if (skb_transport_offset(skb) + len > skb_headlen(skb)) goto bad; @@ -151,13 +110,33 @@ static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) while (len > 0) { int optlen = nh[off + 1] + 2; + int i; switch (nh[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; + padlen++; + if (padlen > 7) + goto bad; break; case IPV6_TLV_PADN: + /* RFC 2460 states that the purpose of PadN is + * to align the containing header to multiples + * of 8. 7 is therefore the highest valid value. + * See also RFC 4942, Section 2.1.9.5. + */ + padlen += optlen; + if (padlen > 7) + goto bad; + /* RFC 4942 recommends receiving hosts to + * actively check PadN payload to contain + * only zeroes. + */ + for (i = 2; i < optlen; i++) { + if (nh[off + i] != 0) + goto bad; + } break; default: /* Other TLV code so scan list */ @@ -168,33 +147,35 @@ static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) /* type specific length/alignment checks will be performed in the func(). */ - if (curr->func(skb, off) == 0) - return 0; + if (curr->func(skb, off) == false) + return false; break; } } if (curr->type < 0) { if (ip6_tlvopt_unknown(skb, off) == 0) - return 0; + return false; } + padlen = 0; break; } off += optlen; len -= optlen; } + if (len == 0) - return 1; + return true; bad: kfree_skb(skb); - return 0; + return false; } /***************************** Destination options header. *****************************/ -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) -static int ipv6_dest_hao(struct sk_buff *skb, int optoff) +#if IS_ENABLED(CONFIG_IPV6_MIP6) +static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) { struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); @@ -241,23 +222,23 @@ static int ipv6_dest_hao(struct sk_buff *skb, int optoff) if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; - ipv6_addr_copy(&tmp_addr, &ipv6h->saddr); - ipv6_addr_copy(&ipv6h->saddr, &hao->addr); - ipv6_addr_copy(&hao->addr, &tmp_addr); + tmp_addr = ipv6h->saddr; + ipv6h->saddr = hao->addr; + hao->addr = tmp_addr; if (skb->tstamp.tv64 == 0) __net_timestamp(skb); - return 1; + return true; discard: kfree_skb(skb); - return 0; + return false; } #endif -static struct tlvtype_proc tlvprocdestopt_lst[] = { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +static const struct tlvtype_proc tlvprocdestopt_lst[] = { +#if IS_ENABLED(CONFIG_IPV6_MIP6) { .type = IPV6_TLV_HAO, .func = ipv6_dest_hao, @@ -269,31 +250,29 @@ static struct tlvtype_proc tlvprocdestopt_lst[] = { static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; #endif - struct dst_entry *dst; + struct dst_entry *dst = skb_dst(skb); if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } opt->lastopt = opt->dst1 = skb_network_header_len(skb); -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif - dst = dst_clone(skb_dst(skb)); if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { - dst_release(dst); skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; opt = IP6CB(skb); -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; #else opt->nhoff = opt->dst1; @@ -303,7 +282,6 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); - dst_release(dst); return -1; } @@ -311,6 +289,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) Routing header. ********************************/ +/* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); @@ -323,12 +302,9 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int accept_source_route = net->ipv6.devconf_all->accept_source_route; - idev = in6_dev_get(skb->dev); - if (idev) { - if (accept_source_route > idev->cnf.accept_source_route) - accept_source_route = idev->cnf.accept_source_route; - in6_dev_put(idev); - } + idev = __in6_dev_get(skb->dev); + if (idev && accept_source_route > idev->cnf.accept_source_route) + accept_source_route = idev->cnf.accept_source_route; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + @@ -352,7 +328,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: /* Silently discard type 2 header unless it was * processed by own @@ -378,7 +354,7 @@ looped_back: } switch (hdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (accept_source_route < 0) goto unknown_rh; @@ -435,7 +411,7 @@ looped_back: addr += i - 1; switch (hdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, @@ -464,9 +440,9 @@ looped_back: return -1; } - ipv6_addr_copy(&daddr, addr); - ipv6_addr_copy(addr, &ipv6_hdr(skb)->daddr); - ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &daddr); + daddr = *addr; + *addr = ipv6_hdr(skb)->daddr; + ipv6_hdr(skb)->daddr = daddr; skb_dst_drop(skb); ip6_route_input(skb); @@ -481,7 +457,7 @@ looped_back: IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - 0, skb->dev); + 0); kfree_skb(skb); return -1; } @@ -502,12 +478,12 @@ unknown_rh: static const struct inet6_protocol rthdr_protocol = { .handler = ipv6_rthdr_rcv, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, + .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol destopt_protocol = { .handler = ipv6_destopt_rcv, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, + .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol nodata_protocol = { @@ -533,10 +509,10 @@ int __init ipv6_exthdrs_init(void) out: return ret; -out_rthdr: - inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); out_destopt: inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); +out_rthdr: + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); goto out; }; @@ -559,29 +535,35 @@ static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb) return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev); } +static inline struct net *ipv6_skb_net(struct sk_buff *skb) +{ + return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); +} + /* Router Alert as of RFC 2711 */ -static int ipv6_hop_ra(struct sk_buff *skb, int optoff) +static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { - IP6CB(skb)->ra = optoff; - return 1; + IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; + memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); + return true; } LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", nh[optoff + 1]); kfree_skb(skb); - return 0; + return false; } /* Jumbo payload */ -static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) +static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); + struct net *net = ipv6_skb_net(skb); u32 pkt_len; - struct net *net = dev_net(skb_dst(skb)->dev); if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", @@ -596,13 +578,13 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); - return 0; + return false; } if (ipv6_hdr(skb)->payload_len) { IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); - return 0; + return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { @@ -614,14 +596,14 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; - return 1; + return true; drop: kfree_skb(skb); - return 0; + return false; } -static struct tlvtype_proc tlvprochopopt_lst[] = { +static const struct tlvtype_proc tlvprochopopt_lst[] = { { .type = IPV6_TLV_ROUTERALERT, .func = ipv6_hop_ra, @@ -688,7 +670,7 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); - ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = *proto; @@ -720,7 +702,6 @@ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, if (opt->hopopt) ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } - EXPORT_SYMBOL(ipv6_push_nfrag_opts); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) @@ -736,20 +717,19 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); if (opt2) { - long dif = (char*)opt2 - (char*)opt; + long dif = (char *)opt2 - (char *)opt; memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) - *((char**)&opt2->hopopt) += dif; + *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) - *((char**)&opt2->dst0opt) += dif; + *((char **)&opt2->dst0opt) += dif; if (opt2->dst1opt) - *((char**)&opt2->dst1opt) += dif; + *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) - *((char**)&opt2->srcrt) += dif; + *((char **)&opt2->srcrt) += dif; } return opt2; } - EXPORT_SYMBOL_GPL(ipv6_dup_options); static int ipv6_renew_option(void *ohdr, @@ -762,14 +742,14 @@ static int ipv6_renew_option(void *ohdr, if (ohdr) { memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); *hdr = (struct ipv6_opt_hdr *)*p; - *p += CMSG_ALIGN(ipv6_optlen(*(struct ipv6_opt_hdr **)hdr)); + *p += CMSG_ALIGN(ipv6_optlen(*hdr)); } } else { if (newopt) { if (copy_from_user(*p, newopt, newoptlen)) return -EFAULT; *hdr = (struct ipv6_opt_hdr *)*p; - if (ipv6_optlen(*(struct ipv6_opt_hdr **)hdr) > newoptlen) + if (ipv6_optlen(*hdr) > newoptlen) return -EINVAL; *p += CMSG_ALIGN(newoptlen); } @@ -867,4 +847,28 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, return opt; } +EXPORT_SYMBOL_GPL(ipv6_fixup_options); +/** + * fl6_update_dst - update flowi destination address with info given + * by srcrt option, if any. + * + * @fl6: flowi6 for which daddr is to be updated + * @opt: struct ipv6_txoptions in which to look for srcrt opt + * @orig: copy of original daddr address if modified + * + * Returns NULL if no txoptions or no srcrt, otherwise returns orig + * and initial value of fl6->daddr set in orig + */ +struct in6_addr *fl6_update_dst(struct flowi6 *fl6, + const struct ipv6_txoptions *opt, + struct in6_addr *orig) +{ + if (!opt || !opt->srcrt) + return NULL; + + *orig = fl6->daddr; + fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; + return orig; +} +EXPORT_SYMBOL_GPL(fl6_update_dst); diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index e1caa5d526c..8af3eb57f43 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -2,24 +2,26 @@ * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ +#include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ -int ipv6_ext_hdr(u8 nexthdr) +bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ - return ( (nexthdr == NEXTHDR_HOP) || + return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || - (nexthdr == NEXTHDR_DEST) ); + (nexthdr == NEXTHDR_DEST); } +EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. @@ -56,6 +58,9 @@ int ipv6_ext_hdr(u8 nexthdr) * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. + * - Reports the offset field of the final fragment header so it is + * possible to tell whether this is a first fragment, later fragment, + * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. @@ -63,10 +68,13 @@ int ipv6_ext_hdr(u8 nexthdr) * --ANK (980726) */ -int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, + __be16 *frag_offp) { u8 nexthdr = *nexthdrp; + *frag_offp = 0; + while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; @@ -86,7 +94,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) if (fp == NULL) return -1; - if (ntohs(*fp) & ~0x7) + *frag_offp = *fp; + if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) @@ -101,6 +110,172 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp) *nexthdrp = nexthdr; return start; } - -EXPORT_SYMBOL(ipv6_ext_hdr); EXPORT_SYMBOL(ipv6_skip_exthdr); + +int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) +{ + const unsigned char *nh = skb_network_header(skb); + int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); + struct ipv6_opt_hdr *hdr; + int len; + + if (offset + 2 > packet_len) + goto bad; + hdr = (struct ipv6_opt_hdr *)(nh + offset); + len = ((hdr->hdrlen + 1) << 3); + + if (offset + len > packet_len) + goto bad; + + offset += 2; + len -= 2; + + while (len > 0) { + int opttype = nh[offset]; + int optlen; + + if (opttype == type) + return offset; + + switch (opttype) { + case IPV6_TLV_PAD1: + optlen = 1; + break; + default: + optlen = nh[offset + 1] + 2; + if (optlen > len) + goto bad; + break; + } + offset += optlen; + len -= optlen; + } + /* not_found */ + bad: + return -1; +} +EXPORT_SYMBOL_GPL(ipv6_find_tlv); + +/* + * find the offset to specified header or the protocol number of last header + * if target < 0. "last header" is transport protocol header, ESP, or + * "No next header". + * + * Note that *offset is used as input/output parameter. an if it is not zero, + * then it must be a valid offset to an inner IPv6 header. This can be used + * to explore inner IPv6 header, eg. ICMPv6 error messages. + * + * If target header is found, its offset is set in *offset and return protocol + * number. Otherwise, return -1. + * + * If the first fragment doesn't contain the final protocol header or + * NEXTHDR_NONE it is considered invalid. + * + * Note that non-1st fragment is special case that "the protocol number + * of last header" is "next header" field in Fragment header. In this case, + * *offset is meaningless and fragment offset is stored in *fragoff if fragoff + * isn't NULL. + * + * if flags is not NULL and it's a fragment, then the frag flag + * IP6_FH_F_FRAG will be set. If it's an AH header, the + * IP6_FH_F_AUTH flag is set and target < 0, then this function will + * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this + * function will skip all those routing headers, where segements_left was 0. + */ +int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff, int *flags) +{ + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + unsigned int len; + bool found; + + if (fragoff) + *fragoff = 0; + + if (*offset) { + struct ipv6hdr _ip6, *ip6; + + ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); + if (!ip6 || (ip6->version != 6)) { + printk(KERN_ERR "IPv6 header not found\n"); + return -EBADMSG; + } + start = *offset + sizeof(struct ipv6hdr); + nexthdr = ip6->nexthdr; + } + len = skb->len - start; + + do { + struct ipv6_opt_hdr _hdr, *hp; + unsigned int hdrlen; + found = (nexthdr == target); + + if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { + if (target < 0 || found) + break; + return -ENOENT; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + return -EBADMSG; + + if (nexthdr == NEXTHDR_ROUTING) { + struct ipv6_rt_hdr _rh, *rh; + + rh = skb_header_pointer(skb, start, sizeof(_rh), + &_rh); + if (rh == NULL) + return -EBADMSG; + + if (flags && (*flags & IP6_FH_F_SKIP_RH) && + rh->segments_left == 0) + found = false; + } + + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off; + __be16 *fp; + + if (flags) /* Indicate that this is a fragment */ + *flags |= IP6_FH_F_FRAG; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -EBADMSG; + + _frag_off = ntohs(*fp) & ~0x7; + if (_frag_off) { + if (target < 0 && + ((!ipv6_ext_hdr(hp->nexthdr)) || + hp->nexthdr == NEXTHDR_NONE)) { + if (fragoff) + *fragoff = _frag_off; + return hp->nexthdr; + } + return -ENOENT; + } + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) + break; + hdrlen = (hp->hdrlen + 2) << 2; + } else + hdrlen = ipv6_optlen(hp); + + if (!found) { + nexthdr = hp->nexthdr; + len -= hdrlen; + start += hdrlen; + } + } while (!found); + + *offset = start; + return nexthdr; +} +EXPORT_SYMBOL(ipv6_find_hdr); + diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c new file mode 100644 index 00000000000..447a7fbd1bb --- /dev/null +++ b/net/ipv6/exthdrs_offload.c @@ -0,0 +1,41 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * IPV6 Extension Header GSO/GRO support + */ +#include <net/protocol.h> +#include "ip6_offload.h" + +static const struct net_offload rthdr_offload = { + .flags = INET6_PROTO_GSO_EXTHDR, +}; + +static const struct net_offload dstopt_offload = { + .flags = INET6_PROTO_GSO_EXTHDR, +}; + +int __init ipv6_exthdrs_offload_init(void) +{ + int ret; + + ret = inet6_add_offload(&rthdr_offload, IPPROTO_ROUTING); + if (ret) + goto out; + + ret = inet6_add_offload(&dstopt_offload, IPPROTO_DSTOPTS); + if (ret) + goto out_rt; + +out: + return ret; + +out_rt: + inet_del_offload(&rthdr_offload, IPPROTO_ROUTING); + goto out; +} diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b7aa7c64cc4..b4d5e1d97c1 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -14,6 +14,7 @@ */ #include <linux/netdevice.h> +#include <linux/export.h> #include <net/fib_rules.h> #include <net/ipv6.h> @@ -21,59 +22,66 @@ #include <net/ip6_route.h> #include <net/netlink.h> -struct fib6_rule -{ +struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; u8 tclass; }; -struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl, +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { struct fib_lookup_arg arg = { .lookup_ptr = lookup, + .flags = FIB_LOOKUP_NOREF, }; - fib_rules_lookup(net->ipv6.fib6_rules_ops, fl, flags, &arg); - if (arg.rule) - fib_rule_put(arg.rule); + fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); if (arg.result) return arg.result; - dst_hold(&net->ipv6.ip6_null_entry->u.dst); - return &net->ipv6.ip6_null_entry->u.dst; + dst_hold(&net->ipv6.ip6_null_entry->dst); + return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { + struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; + int err = 0; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: + err = -ENETUNREACH; rt = net->ipv6.ip6_null_entry; goto discard_pkt; default: case FR_ACT_BLACKHOLE: + err = -EINVAL; rt = net->ipv6.ip6_blk_hole_entry; goto discard_pkt; case FR_ACT_PROHIBIT: + err = -EACCES; rt = net->ipv6.ip6_prohibit_entry; goto discard_pkt; } table = fib6_get_table(net, rule->table); - if (table) - rt = lookup(net, table, flp, flags); + if (!table) { + err = -EAGAIN; + goto out; + } + rt = lookup(net, table, flp6, flags); if (rt != net->ipv6.ip6_null_entry) { struct fib6_rule *r = (struct fib6_rule *)rule; @@ -84,46 +92,67 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, if ((rule->flags & FIB_RULE_FIND_SADDR) && r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { struct in6_addr saddr; - unsigned int srcprefs = 0; - - if (flags & RT6_LOOKUP_F_SRCPREF_TMP) - srcprefs |= IPV6_PREFER_SRC_TMP; - if (flags & RT6_LOOKUP_F_SRCPREF_PUBLIC) - srcprefs |= IPV6_PREFER_SRC_PUBLIC; - if (flags & RT6_LOOKUP_F_SRCPREF_COA) - srcprefs |= IPV6_PREFER_SRC_COA; if (ipv6_dev_get_saddr(net, - ip6_dst_idev(&rt->u.dst)->dev, - &flp->fl6_dst, srcprefs, + ip6_dst_idev(&rt->dst)->dev, + &flp6->daddr, + rt6_flags2srcprefs(flags), &saddr)) goto again; if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) goto again; - ipv6_addr_copy(&flp->fl6_src, &saddr); + flp6->saddr = saddr; } goto out; } again: - dst_release(&rt->u.dst); + ip6_rt_put(rt); + err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); out: arg->result = rt; - return rt == NULL ? -EAGAIN : 0; + return err; } +static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + struct rt6_info *rt = (struct rt6_info *) arg->result; + struct net_device *dev = NULL; + + if (rt->rt6i_idev) + dev = rt->rt6i_idev->dev; + + /* do not accept result if the route does + * not meet the required prefix length + */ + if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + + return false; + +suppress_route: + ip6_rt_put(rt); + return true; +} static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; + struct flowi6 *fl6 = &fl->u.ip6; if (r->dst.plen && - !ipv6_prefix_equal(&fl->fl6_dst, &r->dst.addr, r->dst.plen)) + !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) return 0; /* @@ -133,14 +162,14 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) */ if (r->src.plen) { if (flags & RT6_LOOKUP_F_HAS_SADDR) { - if (!ipv6_prefix_equal(&fl->fl6_src, &r->src.addr, + if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, r->src.plen)) return 0; } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) return 0; } - if (r->tclass && r->tclass != ((ntohl(fl->fl6_flowlabel) >> 20) & 0xff)) + if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel)) return 0; return 1; @@ -215,19 +244,17 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, { struct fib6_rule *rule6 = (struct fib6_rule *) rule; - frh->family = AF_INET6; frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; frh->tos = rule6->tclass; - if (rule6->dst.plen) - NLA_PUT(skb, FRA_DST, sizeof(struct in6_addr), - &rule6->dst.addr); - - if (rule6->src.plen) - NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr), - &rule6->src.addr); - + if ((rule6->dst.plen && + nla_put(skb, FRA_DST, sizeof(struct in6_addr), + &rule6->dst.addr)) || + (rule6->src.plen && + nla_put(skb, FRA_SRC, sizeof(struct in6_addr), + &rule6->src.addr))) + goto nla_put_failure; return 0; nla_put_failure: @@ -245,12 +272,13 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(16); /* src */ } -static struct fib_rules_ops fib6_rules_ops_template = { +static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, + .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, @@ -262,7 +290,7 @@ static struct fib_rules_ops fib6_rules_ops_template = { .fro_net = &init_net, }; -static int fib6_rules_net_init(struct net *net) +static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; int err = -ENOMEM; @@ -291,7 +319,7 @@ out_fib6_rules_ops: goto out; } -static void fib6_rules_net_exit(struct net *net) +static void __net_exit fib6_rules_net_exit(struct net *net) { fib_rules_unregister(net->ipv6.fib6_rules_ops); } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4ae661bc367..f6c84a6eb23 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -29,6 +29,8 @@ * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -40,6 +42,7 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/netfilter.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -54,6 +57,7 @@ #include <net/ipv6.h> #include <net/ip6_checksum.h> +#include <net/ping.h> #include <net/protocol.h> #include <net/raw.h> #include <net/rawv6.h> @@ -63,14 +67,9 @@ #include <net/icmp.h> #include <net/xfrm.h> #include <net/inet_common.h> +#include <net/dsfield.h> #include <asm/uaccess.h> -#include <asm/system.h> - -DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; -EXPORT_SYMBOL(icmpv6_statistics); -DEFINE_SNMP_STAT(struct icmpv6msg_mib, icmpv6msg_statistics) __read_mostly; -EXPORT_SYMBOL(icmpv6msg_statistics); /* * The ICMP socket(s). This is the most convenient way to flow control @@ -84,10 +83,28 @@ static inline struct sock *icmpv6_sk(struct net *net) return net->ipv6.icmp_sk[smp_processor_id()]; } +static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ + struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); + struct net *net = dev_net(skb->dev); + + if (type == ICMPV6_PKT_TOOBIG) + ip6_update_pmtu(skb, net, info, 0, 0); + else if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + + if (!(type & ICMPV6_INFOMSG_MASK)) + if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) + ping_err(skb, offset, info); +} + static int icmpv6_rcv(struct sk_buff *skb); static const struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, + .err_handler = icmpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; @@ -115,15 +132,6 @@ static __inline__ void icmpv6_xmit_unlock(struct sock *sk) } /* - * Slightly more convenient version of icmpv6_send. - */ -void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) -{ - icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); - kfree_skb(skb); -} - -/* * Figure out, may we reply to this packet with icmp error. * * We do not reply, if: @@ -134,18 +142,19 @@ void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) * --ANK (980726) */ -static int is_ineligible(struct sk_buff *skb) +static bool is_ineligible(const struct sk_buff *skb) { int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; int len = skb->len - ptr; __u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; if (len < 0) - return 1; + return true; - ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr); + ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off); if (ptr < 0) - return 0; + return false; if (nexthdr == IPPROTO_ICMPV6) { u8 _type, *tp; tp = skb_header_pointer(skb, @@ -153,49 +162,53 @@ static int is_ineligible(struct sk_buff *skb) sizeof(_type), &_type); if (tp == NULL || !(*tp & ICMPV6_INFOMSG_MASK)) - return 1; + return true; } - return 0; + return false; } /* * Check the ICMP output rate limit */ -static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi *fl) +static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) { struct dst_entry *dst; struct net *net = sock_net(sk); - int res = 0; + bool res = false; /* Informational messages are not limited. */ if (type & ICMPV6_INFOMSG_MASK) - return 1; + return true; /* Do not limit pmtu discovery, it would break it. */ if (type == ICMPV6_PKT_TOOBIG) - return 1; + return true; /* * Look up the output route. * XXX: perhaps the expire for routing entries cloned by * this lookup should be more aggressive (not longer than timeout). */ - dst = ip6_route_output(net, sk, fl); + dst = ip6_route_output(net, sk, fl6); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { - res = 1; + res = true; } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; + struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - res = xrlim_allow(dst, tmo); + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); } dst_release(dst); return res; @@ -208,18 +221,19 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, u8 type, * highest-order two bits set to 10 */ -static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) +static bool opt_unrec(struct sk_buff *skb, __u32 offset) { u8 _optval, *op; offset += skb_network_offset(skb); op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); if (op == NULL) - return 1; + return true; return (*op & 0xC0) == 0x80; } -static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len) +int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; @@ -235,9 +249,9 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct if (skb_queue_len(&sk->sk_write_queue) == 1) { skb->csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), skb->csum); - icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src, - &fl->fl6_dst, - len, fl->proto, + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, skb->csum); } else { __wsum tmp_csum = 0; @@ -248,9 +262,9 @@ static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct tmp_csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), tmp_csum); - icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src, - &fl->fl6_dst, - len, fl->proto, + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, tmp_csum); } ip6_push_pending_frames(sk); @@ -278,7 +292,7 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st return 0; } -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) static void mip6_addr_swap(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); @@ -292,9 +306,9 @@ static void mip6_addr_swap(struct sk_buff *skb) if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); - ipv6_addr_copy(&tmp, &iph->saddr); - ipv6_addr_copy(&iph->saddr, &hao->addr); - ipv6_addr_copy(&hao->addr, &tmp); + tmp = iph->saddr; + iph->saddr = hao->addr; + hao->addr = tmp; } } } @@ -302,43 +316,106 @@ static void mip6_addr_swap(struct sk_buff *skb) static inline void mip6_addr_swap(struct sk_buff *skb) {} #endif +static struct dst_entry *icmpv6_route_lookup(struct net *net, + struct sk_buff *skb, + struct sock *sk, + struct flowi6 *fl6) +{ + struct dst_entry *dst, *dst2; + struct flowi6 fl2; + int err; + + err = ip6_dst_lookup(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + + /* + * We won't send icmp if the destination is known + * anycast. + */ + if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: acast source\n"); + dst_release(dst); + return ERR_PTR(-EINVAL); + } + + /* No need to clone since we're just using its address. */ + dst2 = dst; + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0); + if (!IS_ERR(dst)) { + if (dst != dst2) + return dst; + } else { + if (PTR_ERR(dst) == -EPERM) + dst = NULL; + else + return dst; + } + + err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6); + if (err) + goto relookup_failed; + + err = ip6_dst_lookup(sk, &dst2, &fl2); + if (err) + goto relookup_failed; + + dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP); + if (!IS_ERR(dst2)) { + dst_release(dst); + dst = dst2; + } else { + err = PTR_ERR(dst2); + if (err == -EPERM) { + dst_release(dst); + return dst2; + } else + goto relookup_failed; + } + +relookup_failed: + if (dst) + return dst; + return ERR_PTR(err); +} + /* * Send an ICMP message in response to a packet in error */ -void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, - struct net_device *dev) +static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { struct net *net = dev_net(skb->dev); struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; struct ipv6_pinfo *np; - struct in6_addr *saddr = NULL; + const struct in6_addr *saddr = NULL; struct dst_entry *dst; - struct dst_entry *dst2; struct icmp6hdr tmp_hdr; - struct flowi fl; - struct flowi fl2; + struct flowi6 fl6; struct icmpv6_msg msg; int iif = 0; int addr_type = 0; int len; int hlimit; int err = 0; + u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || - (skb->network_header + sizeof(*hdr)) > skb->tail) + (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) - * Rule (e.1) is enforced by not using icmpv6_send + * Rule (e.1) is enforced by not using icmp6_send * in any code that processes icmp errors. */ addr_type = ipv6_addr_type(&hdr->daddr); - if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0)) + if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) || + ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr)) saddr = &hdr->daddr; /* @@ -361,7 +438,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, * Source addr check */ - if (addr_type & IPV6_ADDR_LINKLOCAL) + if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; /* @@ -371,7 +448,7 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: addr_any/mcast source\n"); return; } @@ -379,28 +456,30 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); + LIMIT_NETDEBUG(KERN_DEBUG "icmp6_send: no reply to icmp error\n"); return; } mip6_addr_swap(skb); - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_ICMPV6; - ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr); + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.daddr = hdr->saddr; if (saddr) - ipv6_addr_copy(&fl.fl6_src, saddr); - fl.oif = iif; - fl.fl_icmp_type = type; - fl.fl_icmp_code = code; - security_skb_classify_flow(skb, &fl); + fl6.saddr = *saddr; + fl6.flowi6_mark = mark; + fl6.flowi6_oif = iif; + fl6.fl6_icmp_type = type; + fl6.fl6_icmp_code = code; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); if (sk == NULL) return; + sk->sk_mark = mark; np = inet6_sk(sk); - if (!icmpv6_xrlim_allow(sk, type, &fl)) + if (!icmpv6_xrlim_allow(sk, type, &fl6)) goto out; tmp_hdr.icmp6_type = type; @@ -408,66 +487,16 @@ void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, tmp_hdr.icmp6_cksum = 0; tmp_hdr.icmp6_pointer = htonl(info); - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) - fl.oif = np->mcast_oif; - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) - goto out; - - /* - * We won't send icmp if the destination is known - * anycast. - */ - if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { - LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); - goto out_dst_release; - } - - /* No need to clone since we're just using its address. */ - dst2 = dst; + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; - err = xfrm_lookup(net, &dst, &fl, sk, 0); - switch (err) { - case 0: - if (dst != dst2) - goto route_done; - break; - case -EPERM: - dst = NULL; - break; - default: + dst = icmpv6_route_lookup(net, skb, sk, &fl6); + if (IS_ERR(dst)) goto out; - } - - if (xfrm_decode_session_reverse(skb, &fl2, AF_INET6)) - goto relookup_failed; - - if (ip6_dst_lookup(sk, &dst2, &fl2)) - goto relookup_failed; - - err = xfrm_lookup(net, &dst2, &fl2, sk, XFRM_LOOKUP_ICMP); - switch (err) { - case 0: - dst_release(dst); - dst = dst2; - break; - case -EPERM: - goto out_dst_release; - default: -relookup_failed: - if (!dst) - goto out; - break; - } -route_done: - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -480,29 +509,35 @@ route_done: goto out_dst_release; } - idev = in6_dev_get(skb->dev); + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); err = ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), hlimit, - np->tclass, NULL, &fl, (struct rt6_info*)dst, - MSG_DONTWAIT); + np->tclass, NULL, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, np->dontfrag); if (err) { + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); - goto out_put; + } else { + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); } - err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr)); - -out_put: - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); } -EXPORT_SYMBOL(icmpv6_send); +/* Slightly more convenient version of icmp6_send. + */ +void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmp6_send(skb, ICMPV6_PARAMPROB, code, pos); + kfree_skb(skb); +} static void icmpv6_echo_reply(struct sk_buff *skb) { @@ -510,82 +545,86 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; - struct in6_addr *saddr = NULL; + const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); struct icmp6hdr tmp_hdr; - struct flowi fl; + struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; int err = 0; int hlimit; + u8 tclass; + u32 mark = IP6_REPLY_MARK(net, skb->mark); saddr = &ipv6_hdr(skb)->daddr; - if (!ipv6_unicast_destination(skb)) + if (!ipv6_unicast_destination(skb) && + !(net->ipv6.sysctl.anycast_src_echo_reply && + ipv6_anycast_destination(skb))) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_ICMPV6; - ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) - ipv6_addr_copy(&fl.fl6_src, saddr); - fl.oif = skb->dev->ifindex; - fl.fl_icmp_type = ICMPV6_ECHO_REPLY; - security_skb_classify_flow(skb, &fl); + fl6.saddr = *saddr; + fl6.flowi6_oif = skb->dev->ifindex; + fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + fl6.flowi6_mark = mark; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); if (sk == NULL) return; + sk->sk_mark = mark; np = inet6_sk(sk); - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) - fl.oif = np->mcast_oif; + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(sk, &dst, &fl); + err = ip6_dst_lookup(sk, &dst, &fl6); if (err) goto out; - if ((err = xfrm_lookup(net, &dst, &fl, sk, 0)) < 0) + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); + if (IS_ERR(dst)) goto out; - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - idev = in6_dev_get(skb->dev); + idev = __in6_dev_get(skb->dev); msg.skb = skb; msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; + tclass = ipv6_get_dsfield(ipv6_hdr(skb)); err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl, - (struct rt6_info*)dst, MSG_DONTWAIT); + sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6, + (struct rt6_info *)dst, MSG_DONTWAIT, + np->dontfrag); if (err) { + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); - goto out_put; + } else { + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); } - err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); - -out_put: - if (likely(idev != NULL)) - in6_dev_put(idev); dst_release(dst); out: icmpv6_xmit_unlock(sk); } -static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) +void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { const struct inet6_protocol *ipprot; int inner_offset; - int hash; + __be16 frag_off; u8 nexthdr; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) @@ -594,7 +633,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { /* now skip over extension headers */ - inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr); + inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); if (inner_offset<0) return; } else { @@ -612,10 +652,8 @@ static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) --ANK (980726) */ - hash = nexthdr & (MAX_INET_PROTOS - 1); - rcu_read_lock(); - ipprot = rcu_dereference(inet6_protos[hash]); + ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, NULL, type, code, inner_offset, info); rcu_read_unlock(); @@ -631,8 +669,7 @@ static int icmpv6_rcv(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); - struct in6_addr *saddr, *daddr; - struct ipv6hdr *orig_hdr; + const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; u8 type; @@ -644,7 +681,7 @@ static int icmpv6_rcv(struct sk_buff *skb) XFRM_STATE_ICMP)) goto drop_no_count; - if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(*orig_hdr))) + if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr))) goto drop_no_count; nh = skb_network_offset(skb); @@ -661,21 +698,11 @@ static int icmpv6_rcv(struct sk_buff *skb) saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; - /* Perform checksum. */ - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len, - IPPROTO_ICMPV6, 0)); - if (__skb_checksum_complete(skb)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%pI6 > %pI6]\n", - saddr, daddr); - goto discard_it; - } + if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ICMPv6 checksum failed [%pI6c > %pI6c]\n", + saddr, daddr); + goto csum_error; } if (!pskb_pull(skb, sizeof(*hdr))) @@ -693,7 +720,7 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - /* we couldn't care less */ + ping_rcv(skb); break; case ICMPV6_PKT_TOOBIG: @@ -705,9 +732,6 @@ static int icmpv6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto discard_it; hdr = icmp6_hdr(skb); - orig_hdr = (struct ipv6hdr *) (hdr + 1); - rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, - ntohl(hdr->icmp6_mtu)); /* * Drop through to notify @@ -763,6 +787,8 @@ static int icmpv6_rcv(struct sk_buff *skb) kfree_skb(skb); return 0; +csum_error: + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: @@ -770,20 +796,20 @@ drop_no_count: return 0; } -void icmpv6_flow_init(struct sock *sk, struct flowi *fl, +void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif) { - memset(fl, 0, sizeof(*fl)); - ipv6_addr_copy(&fl->fl6_src, saddr); - ipv6_addr_copy(&fl->fl6_dst, daddr); - fl->proto = IPPROTO_ICMPV6; - fl->fl_icmp_type = type; - fl->fl_icmp_code = 0; - fl->oif = oif; - security_sk_classify_flow(sk, fl); + memset(fl6, 0, sizeof(*fl6)); + fl6->saddr = *saddr; + fl6->daddr = *daddr; + fl6->flowi6_proto = IPPROTO_ICMPV6; + fl6->fl6_icmp_type = type; + fl6->fl6_icmp_code = 0; + fl6->flowi6_oif = oif; + security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); } /* @@ -805,9 +831,7 @@ static int __net_init icmpv6_sk_init(struct net *net) err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - printk(KERN_ERR - "Failed to initialize the ICMP6 control socket " - "(err %d).\n", + pr_err("Failed to initialize the ICMP6 control socket (err %d)\n", err); goto fail; } @@ -826,8 +850,7 @@ static int __net_init icmpv6_sk_init(struct net *net) /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ - sk->sk_sndbuf = - (2 * ((64 * 1024) + sizeof(struct sk_buff))); + sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); } return 0; @@ -864,16 +887,23 @@ int __init icmpv6_init(void) err = -EAGAIN; if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) goto fail; + + err = inet6_register_icmp_sender(icmp6_send); + if (err) + goto sender_reg_err; return 0; +sender_reg_err: + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); fail: - printk(KERN_ERR "Failed to register ICMP6 protocol\n"); + pr_err("Failed to register ICMP6 protocol\n"); unregister_pernet_subsys(&icmpv6_sk_ops); return err; } void icmpv6_cleanup(void) { + inet6_unregister_icmp_sender(icmp6_send); unregister_pernet_subsys(&icmpv6_sk_ops); inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); } @@ -903,6 +933,14 @@ static const struct icmp6_err { .err = ECONNREFUSED, .fatal = 1, }, + { /* POLICY_FAIL */ + .err = EACCES, + .fatal = 1, + }, + { /* REJECT_ROUTE */ + .err = EACCES, + .fatal = 1, + }, }; int icmpv6_err_convert(u8 type, u8 code, int *err) @@ -914,7 +952,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) switch (type) { case ICMPV6_DEST_UNREACH: fatal = 1; - if (code <= ICMPV6_PORT_UNREACH) { + if (code < ARRAY_SIZE(tab_unreach)) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; } @@ -936,11 +974,10 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) return fatal; } - EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL -ctl_table ipv6_icmp_table_template[] = { +static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", .data = &init_net.ipv6.sysctl.icmpv6_time, @@ -951,7 +988,7 @@ ctl_table ipv6_icmp_table_template[] = { { }, }; -struct ctl_table *ipv6_icmp_sysctl_init(struct net *net) +struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) { struct ctl_table *table; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 3516e6fe2e5..a245e5ddffb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -17,6 +17,7 @@ #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/jhash.h> +#include <linux/slab.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> @@ -27,50 +28,87 @@ #include <net/inet6_connection_sock.h> int inet6_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) + const struct inet_bind_bucket *tb, bool relax) { const struct sock *sk2; - const struct hlist_node *node; + int reuse = sk->sk_reuse; + int reuseport = sk->sk_reuseport; + kuid_t uid = sock_i_uid((struct sock *)sk); /* We must walk the whole port owner list in this case. -DaveM */ /* * See comment in inet_csk_bind_conflict about sock lookup * vs net namespaces issues. */ - sk_for_each_bound(sk2, node, &tb->owners) { + sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if ((!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + (!reuseport || !sk2->sk_reuseport || + (sk2->sk_state != TCP_TIME_WAIT && + !uid_eq(uid, + sock_i_uid((struct sock *)sk2))))) { + if (ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + if (!relax && reuse && sk2->sk_reuse && + sk2->sk_state != TCP_LISTEN && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } } - return node != NULL; + return sk2 != NULL; } EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); +struct dst_entry *inet6_csk_route_req(struct sock *sk, + struct flowi6 *fl6, + const struct request_sock *req) +{ + struct inet_request_sock *ireq = inet_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *final_p, final; + struct dst_entry *dst; + + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = IPPROTO_TCP; + fl6->daddr = ireq->ir_v6_rmt_addr; + final_p = fl6_update_dst(fl6, np->opt, &final); + fl6->saddr = ireq->ir_v6_loc_addr; + fl6->flowi6_oif = ireq->ir_iif; + fl6->flowi6_mark = ireq->ir_mark; + fl6->fl6_dport = ireq->ir_rmt_port; + fl6->fl6_sport = htons(ireq->ir_num); + security_req_classify_flow(req, flowi6_to_flowi(fl6)); + + dst = ip6_dst_lookup_flow(sk, fl6, final_p); + if (IS_ERR(dst)) + return NULL; + + return dst; +} + /* * request_sock (formerly open request) hash tables. */ static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, - const u32 rnd, const u16 synq_hsize) + const u32 rnd, const u32 synq_hsize) { - u32 a = (__force u32)raddr->s6_addr32[0]; - u32 b = (__force u32)raddr->s6_addr32[1]; - u32 c = (__force u32)raddr->s6_addr32[2]; + u32 c; - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); + c = jhash_3words((__force u32)raddr->s6_addr32[0], + (__force u32)raddr->s6_addr32[1], + (__force u32)raddr->s6_addr32[2], + rnd); - a += (__force u32)raddr->s6_addr32[3]; - b += (__force u32)rport; - __jhash_mix(a, b, c); + c = jhash_2words((__force u32)raddr->s6_addr32[3], + (__force u32)rport, + c); return c & (synq_hsize - 1); } @@ -91,13 +129,13 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk, lopt->nr_table_entries)]; (req = *prev) != NULL; prev = &req->dl_next) { - const struct inet6_request_sock *treq = inet6_rsk(req); + const struct inet_request_sock *ireq = inet_rsk(req); - if (inet_rsk(req)->rmt_port == rport && + if (ireq->ir_rmt_port == rport && req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&treq->rmt_addr, raddr) && - ipv6_addr_equal(&treq->loc_addr, laddr) && - (!treq->iif || treq->iif == iif)) { + ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) && + ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) && + (!ireq->ir_iif || ireq->ir_iif == iif)) { WARN_ON(req->sk != NULL); *prevp = prev; return req; @@ -115,8 +153,8 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, - inet_rsk(req)->rmt_port, + const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr, + inet_rsk(req)->ir_rmt_port, lopt->hash_rnd, lopt->nr_table_entries); reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); @@ -127,113 +165,102 @@ EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) { - struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_addr = sk->sk_v6_daddr; sin6->sin6_port = inet_sk(sk)->inet_dport; /* We do not store received flowlabel for TCP */ sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + sk->sk_bound_dev_if); } EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, - struct in6_addr *daddr, struct in6_addr *saddr) + const struct in6_addr *daddr, + const struct in6_addr *saddr) { __ip6_dst_store(sk, dst, daddr, saddr); - -#ifdef CONFIG_XFRM - { - struct rt6_info *rt = (struct rt6_info *)dst; - rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); - } -#endif } static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { - struct dst_entry *dst; - - dst = __sk_dst_check(sk, cookie); - -#ifdef CONFIG_XFRM - if (dst) { - struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { - __sk_dst_reset(sk); - dst = NULL; - } - } -#endif - - return dst; + return __sk_dst_check(sk, cookie); } -int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) +static struct dst_entry *inet6_csk_route_socket(struct sock *sk, + struct flowi6 *fl6) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi fl; + struct in6_addr *final_p, final; struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; - - memset(&fl, 0, sizeof(fl)); - fl.proto = sk->sk_protocol; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; - fl.fl_ip_sport = inet->inet_sport; - fl.fl_ip_dport = inet->inet_dport; - security_sk_classify_flow(sk, &fl); - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - dst = __inet6_csk_dst_check(sk, np->dst_cookie); + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = sk->sk_protocol; + fl6->daddr = sk->sk_v6_daddr; + fl6->saddr = np->saddr; + fl6->flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl6->flowlabel); + fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_mark = sk->sk_mark; + fl6->fl6_sport = inet->inet_sport; + fl6->fl6_dport = inet->inet_dport; + security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - if (dst == NULL) { - int err = ip6_dst_lookup(sk, &dst, &fl); + final_p = fl6_update_dst(fl6, np->opt, &final); - if (err) { - sk->sk_err_soft = -err; - kfree_skb(skb); - return err; - } - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); + dst = __inet6_csk_dst_check(sk, np->dst_cookie); + if (!dst) { + dst = ip6_dst_lookup_flow(sk, fl6, final_p); - if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) { - sk->sk_route_caps = 0; - kfree_skb(skb); - return err; - } + if (!IS_ERR(dst)) + __inet6_csk_dst_store(sk, dst, NULL, NULL); + } + return dst; +} - __inet6_csk_dst_store(sk, dst, NULL, NULL); +int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 fl6; + struct dst_entry *dst; + int res; + + dst = inet6_csk_route_socket(sk, &fl6); + if (IS_ERR(dst)) { + sk->sk_err_soft = -PTR_ERR(dst); + sk->sk_route_caps = 0; + kfree_skb(skb); + return PTR_ERR(dst); } - skb_dst_set(skb, dst_clone(dst)); + rcu_read_lock(); + skb_dst_set_noref(skb, dst); /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + fl6.daddr = sk->sk_v6_daddr; - return ip6_xmit(sk, skb, &fl, np->opt, 0); + res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_unlock(); + return res; } - EXPORT_SYMBOL_GPL(inet6_csk_xmit); + +struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) +{ + struct flowi6 fl6; + struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6); + + if (IS_ERR(dst)) + return NULL; + dst->ops->update_pmtu(dst, sk, NULL, mtu); + + dst = inet6_csk_route_socket(sk, &fl6); + return IS_ERR(dst) ? NULL : dst; +} +EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 633a6c26613..262e13c02ec 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -20,8 +20,42 @@ #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> +#include <net/secure_seq.h> #include <net/ip.h> +static unsigned int inet6_ehashfn(struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) +{ + static u32 inet6_ehash_secret __read_mostly; + static u32 ipv6_hash_secret __read_mostly; + + u32 lhash, fhash; + + net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); + net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); + + lhash = (__force u32)laddr->s6_addr32[3]; + fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + + return __inet6_ehashfn(lhash, lport, fhash, fport, + inet6_ehash_secret + net_hash_mix(net)); +} + +static int inet6_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct in6_addr *laddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *faddr = &sk->sk_v6_daddr; + const __u16 lport = inet->inet_num; + const __be16 fport = inet->inet_dport; + struct net *net = sock_net(sk); + + return inet6_ehashfn(net, laddr, lport, faddr, fport); +} + int __inet6_hash(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; @@ -86,45 +120,30 @@ struct sock *__inet6_lookup_established(struct net *net, rcu_read_lock(); begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { - /* For IPV6 do the cheaper port and family tests first. */ - if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { - if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) - goto begintw; - if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { - sock_put(sk); - goto begin; - } - goto out; - } - } - if (get_nulls_value(node) != slot) - goto begin; - -begintw: - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_nulls_for_each_rcu(sk, node, &head->twchain) { - if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { - if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { - sk = NULL; - goto out; - } - if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { - sock_put(sk); - goto begintw; - } + if (sk->sk_hash != hash) + continue; + if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif)) + continue; + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) goto out; + + if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) { + sock_gen_put(sk); + goto begin; } + goto found; } if (get_nulls_value(node) != slot) - goto begintw; - sk = NULL; + goto begin; out: + sk = NULL; +found: rcu_read_unlock(); return sk; } EXPORT_SYMBOL(__inet6_lookup_established); -static int inline compute_score(struct sock *sk, struct net *net, +static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif) @@ -133,11 +152,10 @@ static int inline compute_score(struct sock *sk, struct net *net, if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { - const struct ipv6_pinfo *np = inet6_sk(sk); score = 1; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; score++; } @@ -151,25 +169,38 @@ static int inline compute_score(struct sock *sk, struct net *net, } struct sock *inet6_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, + struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, + const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { struct sock *sk; const struct hlist_nulls_node *node; struct sock *result; - int score, hiscore; + int score, hiscore, matches = 0, reuseport = 0; + u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; rcu_read_lock(); begin: result = NULL; - hiscore = -1; + hiscore = 0; sk_nulls_for_each(sk, node, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { hiscore = score; result = sk; + reuseport = sk->sk_reuseport; + if (reuseport) { + phash = inet6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == hiscore && reuseport) { + matches++; + if (((u64)phash * matches) >> 32 == 0) + result = sk; + phash = next_pseudo_random32(phash); } } /* @@ -216,9 +247,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; + const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); @@ -228,33 +258,28 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; - struct inet_timewait_sock *tw; + struct inet_timewait_sock *tw = NULL; int twrefcnt = 0; spin_lock(lock); - /* Check TIME-WAIT sockets first. */ - sk_nulls_for_each(sk2, node, &head->twchain) { - tw = inet_twsk(sk2); - - if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ sk_nulls_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) + if (sk2->sk_hash != hash) + continue; + + if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) { + if (sk2->sk_state == TCP_TIME_WAIT) { + tw = inet_twsk(sk2); + if (twsk_unique(sk, sk2, twp)) + break; + } goto not_unique; + } } -unique: /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ + * in hash table socket with a funny identity. + */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; @@ -287,9 +312,9 @@ not_unique: static inline u32 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, + + return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, inet->inet_dport); } diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c new file mode 100644 index 00000000000..9a4d7322fb2 --- /dev/null +++ b/net/ipv6/ip6_checksum.c @@ -0,0 +1,124 @@ +#include <net/ip.h> +#include <net/udp.h> +#include <net/udplite.h> +#include <asm/checksum.h> + +#ifndef _HAVE_ARCH_IPV6_CSUM +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, + __wsum csum) +{ + + int carry; + __u32 ulen; + __u32 uproto; + __u32 sum = (__force u32)csum; + + sum += (__force u32)saddr->s6_addr32[0]; + carry = (sum < (__force u32)saddr->s6_addr32[0]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[1]; + carry = (sum < (__force u32)saddr->s6_addr32[1]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[2]; + carry = (sum < (__force u32)saddr->s6_addr32[2]); + sum += carry; + + sum += (__force u32)saddr->s6_addr32[3]; + carry = (sum < (__force u32)saddr->s6_addr32[3]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[0]; + carry = (sum < (__force u32)daddr->s6_addr32[0]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[1]; + carry = (sum < (__force u32)daddr->s6_addr32[1]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[2]; + carry = (sum < (__force u32)daddr->s6_addr32[2]); + sum += carry; + + sum += (__force u32)daddr->s6_addr32[3]; + carry = (sum < (__force u32)daddr->s6_addr32[3]); + sum += carry; + + ulen = (__force u32)htonl((__u32) len); + sum += ulen; + carry = (sum < ulen); + sum += carry; + + uproto = (__force u32)htonl(proto); + sum += uproto; + carry = (sum < uproto); + sum += carry; + + return csum_fold((__force __wsum)sum); +} +EXPORT_SYMBOL(csum_ipv6_magic); +#endif + +int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) +{ + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) + * we accept a checksum of zero here. When we find the socket + * for the UDP packet we'll check if that socket allows zero checksum + * for IPv6 (set by socket option). + */ + return skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); +} +EXPORT_SYMBOL(udp6_csum_init); + +/* Function to set UDP checksum for an IPv6 UDP packet. This is intended + * for the simple case like when setting the checksum for a UDP tunnel. + */ +void udp6_set_csum(bool nocheck, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + struct udphdr *uh = udp_hdr(skb); + + if (nocheck) + uh->check = 0; + else if (skb_is_gso(skb)) + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + else if (skb_dst(skb) && skb_dst(skb)->dev && + (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) { + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~udp_v6_check(len, saddr, daddr, 0); + } else { + __wsum csum; + + BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); + + uh->check = 0; + csum = skb_checksum(skb, 0, len, 0); + uh->check = udp_v6_check(len, saddr, daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } +} +EXPORT_SYMBOL(udp6_set_csum); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0e93ca56eb6..cb4459bd1d2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -9,15 +9,16 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + * Ville Nuorvala: Fixed routing subtrees. */ -/* - * Changes: - * Yuji SEKIYA @USAGI: Support default route on router node; - * remove ip6_null_entry from the top of - * routing table. - * Ville Nuorvala: Fixed routing subtrees. - */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> @@ -26,10 +27,7 @@ #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> - -#ifdef CONFIG_PROC_FS -#include <linux/proc_fs.h> -#endif +#include <linux/slab.h> #include <net/ipv6.h> #include <net/ndisc.h> @@ -41,15 +39,14 @@ #define RT6_DEBUG 2 #if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#define RT6_TRACE(x...) pr_debug(x) #else #define RT6_TRACE(x...) do { ; } while (0) #endif -static struct kmem_cache * fib6_node_kmem __read_mostly; +static struct kmem_cache *fib6_node_kmem __read_mostly; -enum fib_walk_state_t -{ +enum fib_walk_state_t { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif @@ -59,8 +56,7 @@ enum fib_walk_state_t FWS_U }; -struct fib6_cleaner_t -{ +struct fib6_cleaner_t { struct fib6_walker_t w; struct net *net; int (*func)(struct rt6_info *, void *arg); @@ -75,8 +71,7 @@ static DEFINE_RWLOCK(fib6_walker_lock); #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn, - struct rt6_info *rt); +static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); static int fib6_walk(struct fib6_walker_t *w); @@ -93,29 +88,20 @@ static __u32 rt_sernum; static void fib6_gc_timer_cb(unsigned long arg); -static struct fib6_walker_t fib6_walker_list = { - .prev = &fib6_walker_list, - .next = &fib6_walker_list, -}; - -#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next) +static LIST_HEAD(fib6_walkers); +#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) static inline void fib6_walker_link(struct fib6_walker_t *w) { write_lock_bh(&fib6_walker_lock); - w->next = fib6_walker_list.next; - w->prev = &fib6_walker_list; - w->next->prev = w; - w->prev->next = w; + list_add(&w->lh, &fib6_walkers); write_unlock_bh(&fib6_walker_lock); } static inline void fib6_walker_unlink(struct fib6_walker_t *w) { write_lock_bh(&fib6_walker_lock); - w->next->prev = w->prev; - w->prev->next = w->next; - w->prev = w->next = w; + list_del(&w->lh); write_unlock_bh(&fib6_walker_lock); } static __inline__ u32 fib6_new_sernum(void) @@ -136,15 +122,27 @@ static __inline__ u32 fib6_new_sernum(void) /* * test bit */ +#if defined(__LITTLE_ENDIAN) +# define BITOP_BE32_SWIZZLE (0x1F & ~7) +#else +# define BITOP_BE32_SWIZZLE 0 +#endif -static __inline__ __be32 addr_bit_set(void *token, int fn_bit) +static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) { - __be32 *addr = token; - - return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; + const __be32 *addr = token; + /* + * Here, + * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) + * is optimized version of + * htonl(1 << ((~fn_bit)&0x1F)) + * See include/asm-generic/bitops/le.h. + */ + return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & + addr[fn_bit >> 5]; } -static __inline__ struct fib6_node * node_alloc(void) +static __inline__ struct fib6_node *node_alloc(void) { struct fib6_node *fn; @@ -153,7 +151,7 @@ static __inline__ struct fib6_node * node_alloc(void) return fn; } -static __inline__ void node_free(struct fib6_node * fn) +static __inline__ void node_free(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } @@ -161,7 +159,7 @@ static __inline__ void node_free(struct fib6_node * fn) static __inline__ void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) - dst_free(&rt->u.dst); + dst_free(&rt->dst); } static void fib6_link_table(struct net *net, struct fib6_table *tb) @@ -190,10 +188,11 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); - if (table != NULL) { + if (table) { table->tb6_id = id; table->tb6_root.leaf = net->ipv6.ip6_null_entry; table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + inet_peer_base_init(&table->tb6_peers); } return table; @@ -210,7 +209,7 @@ struct fib6_table *fib6_new_table(struct net *net, u32 id) return tb; tb = fib6_alloc_table(net, id); - if (tb != NULL) + if (tb) fib6_link_table(net, tb); return tb; @@ -220,7 +219,6 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; - struct hlist_node *node; unsigned int h; if (id == 0) @@ -228,7 +226,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; @@ -239,7 +237,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) return NULL; } -static void fib6_tables_init(struct net *net) +static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); @@ -256,13 +254,13 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) return net->ipv6.fib6_main_tbl; } -struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl, +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { - return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl, flags); + return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); } -static void fib6_tables_init(struct net *net) +static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } @@ -274,7 +272,7 @@ static int fib6_dump_node(struct fib6_walker_t *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -289,7 +287,7 @@ static int fib6_dump_node(struct fib6_walker_t *w) static void fib6_dump_end(struct netlink_callback *cb) { - struct fib6_walker_t *w = (void*)cb->args[2]; + struct fib6_walker_t *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { @@ -299,7 +297,7 @@ static void fib6_dump_end(struct netlink_callback *cb) cb->args[2] = 0; kfree(w); } - cb->done = (void*)cb->args[3]; + cb->done = (void *)cb->args[3]; cb->args[1] = 3; } @@ -319,12 +317,26 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->root = &table->tb6_root; if (cb->args[4] == 0) { + w->count = 0; + w->skip = 0; + read_lock_bh(&table->tb6_lock); res = fib6_walk(w); read_unlock_bh(&table->tb6_lock); - if (res > 0) + if (res > 0) { cb->args[4] = 1; + cb->args[5] = w->root->fn_sernum; + } } else { + if (cb->args[5] != w->root->fn_sernum) { + /* Begin at the root if the tree changed */ + cb->args[5] = w->root->fn_sernum; + w->state = FWS_INIT; + w->node = w->root; + w->skip = w->count; + } else + w->skip = 0; + read_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); read_unlock_bh(&table->tb6_lock); @@ -345,7 +357,6 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct rt6_rtnl_dump_arg arg; struct fib6_walker_t *w; struct fib6_table *tb; - struct hlist_node *node; struct hlist_head *head; int res = 0; @@ -353,7 +364,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) s_e = cb->args[1]; w = (void *)cb->args[2]; - if (w == NULL) { + if (!w) { /* New dump: * * 1. hook callback destructor. @@ -365,7 +376,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (w == NULL) + if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; @@ -376,10 +387,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) arg.net = net; w->args = &arg; + rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry(tb, node, head, tb6_hlist) { + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); @@ -390,6 +402,7 @@ next: } } out: + rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; @@ -407,9 +420,10 @@ out: * node. */ -static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, - int addrlen, int plen, - int offset) +static struct fib6_node *fib6_add_1(struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -431,8 +445,16 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, * Prefix match */ if (plen < fn->fn_bit || - !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { + if (!allow_create) { + if (replace_required) { + pr_warn("Can't replace route, no match found\n"); + return ERR_PTR(-ENOENT); + } + pr_warn("NLM_F_CREATE should be set when creating new route\n"); + } goto insert_above; + } /* * Exact match ? @@ -440,7 +462,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, if (plen == fn->fn_bit) { /* clean up an intermediate node */ - if ((fn->fn_flags & RTN_RTINFO) == 0) { + if (!(fn->fn_flags & RTN_RTINFO)) { rt6_release(fn->leaf); fn->leaf = NULL; } @@ -458,9 +480,25 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right: fn->left; + fn = dir ? fn->right : fn->left; } while (fn); + if (!allow_create) { + /* We should not create new node because + * NLM_F_REPLACE was specified without NLM_F_CREATE + * I assume it is safe to require NLM_F_CREATE when + * REPLACE flag is used! Later we may want to remove the + * check for replace_required, because according + * to netlink specification, NLM_F_CREATE + * MUST be specified if new route is created. + * That would keep IPv6 consistent with IPv4 + */ + if (replace_required) { + pr_warn("Can't replace route, no match found\n"); + return ERR_PTR(-ENOENT); + } + pr_warn("NLM_F_CREATE should be set when creating new route\n"); + } /* * We walked to the bottom of tree. * Create new leaf node without children. @@ -468,8 +506,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, ln = node_alloc(); - if (ln == NULL) - return NULL; + if (!ln) + return ERR_PTR(-ENOMEM); ln->fn_bit = plen; ln->parent = pn; @@ -500,7 +538,7 @@ insert_above: but if it is >= plen, the value is ignored in any case. */ - bit = __ipv6_addr_diff(addr, &key->addr, addrlen); + bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] @@ -511,12 +549,12 @@ insert_above: in = node_alloc(); ln = node_alloc(); - if (in == NULL || ln == NULL) { + if (!in || !ln) { if (in) node_free(in); if (ln) node_free(ln); - return NULL; + return ERR_PTR(-ENOMEM); } /* @@ -565,8 +603,8 @@ insert_above: ln = node_alloc(); - if (ln == NULL) - return NULL; + if (!ln) + return ERR_PTR(-ENOMEM); ln->fn_bit = plen; @@ -589,19 +627,61 @@ insert_above: return ln; } +static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + +static int fib6_commit_metrics(struct dst_entry *dst, + struct nlattr *mx, int mx_len) +{ + struct nlattr *nla; + int remaining; + u32 *mp; + + if (dst->flags & DST_HOST) { + mp = dst_metrics_write_ptr(dst); + } else { + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!mp) + return -ENOMEM; + dst_init_metrics(dst, mp, 0); + } + + nla_for_each_attr(nla, mx, mx_len, remaining) { + int type = nla_type(nla); + + if (type) { + if (type > RTAX_MAX) + return -EINVAL; + + mp[type - 1] = nla_get_u32(nla); + } + } + return 0; +} + /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info) + struct nl_info *info, struct nlattr *mx, int mx_len) { struct rt6_info *iter = NULL; struct rt6_info **ins; + int replace = (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_REPLACE)); + int add = (!info->nlh || + (info->nlh->nlmsg_flags & NLM_F_CREATE)); + int found = 0; + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + int err; ins = &fn->leaf; - for (iter = fn->leaf; iter; iter=iter->u.dst.rt6_next) { + for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { /* * Search for duplicates */ @@ -610,46 +690,131 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, /* * Same priority level */ + if (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_EXCL)) + return -EEXIST; + if (replace) { + found++; + break; + } - if (iter->rt6i_dev == rt->rt6i_dev && + if (iter->dst.dev == rt->dst.dev && iter->rt6i_idev == rt->rt6i_idev && ipv6_addr_equal(&iter->rt6i_gateway, &rt->rt6i_gateway)) { - if (!(iter->rt6i_flags&RTF_EXPIRES)) + if (rt->rt6i_nsiblings) + rt->rt6i_nsiblings = 0; + if (!(iter->rt6i_flags & RTF_EXPIRES)) return -EEXIST; - iter->rt6i_expires = rt->rt6i_expires; - if (!(rt->rt6i_flags&RTF_EXPIRES)) { - iter->rt6i_flags &= ~RTF_EXPIRES; - iter->rt6i_expires = 0; - } + if (!(rt->rt6i_flags & RTF_EXPIRES)) + rt6_clean_expires(iter); + else + rt6_set_expires(iter, rt->dst.expires); return -EEXIST; } + /* If we have the same destination and the same metric, + * but not the same gateway, then the route we try to + * add is sibling to this route, increment our counter + * of siblings, and later we will add our route to the + * list. + * Only static routes (which don't have flag + * RTF_EXPIRES) are used for ECMPv6. + * + * To avoid long list, we only had siblings if the + * route have a gateway. + */ + if (rt_can_ecmp && + rt6_qualify_for_ecmp(iter)) + rt->rt6i_nsiblings++; } if (iter->rt6i_metric > rt->rt6i_metric) break; - ins = &iter->u.dst.rt6_next; + ins = &iter->dst.rt6_next; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; + /* Link this route to others same route. */ + if (rt->rt6i_nsiblings) { + unsigned int rt6i_nsiblings; + struct rt6_info *sibling, *temp_sibling; + + /* Find the first route that have the same metric */ + sibling = fn->leaf; + while (sibling) { + if (sibling->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(sibling)) { + list_add_tail(&rt->rt6i_siblings, + &sibling->rt6i_siblings); + break; + } + sibling = sibling->dst.rt6_next; + } + /* For each sibling in the list, increment the counter of + * siblings. BUG() if counters does not match, list of siblings + * is broken! + */ + rt6i_nsiblings = 0; + list_for_each_entry_safe(sibling, temp_sibling, + &rt->rt6i_siblings, rt6i_siblings) { + sibling->rt6i_nsiblings++; + BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); + rt6i_nsiblings++; + } + BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + } + /* * insert node */ + if (!replace) { + if (!add) + pr_warn("NLM_F_CREATE should be set when creating new route\n"); + +add: + if (mx) { + err = fib6_commit_metrics(&rt->dst, mx, mx_len); + if (err) + return err; + } + rt->dst.rt6_next = iter; + *ins = rt; + rt->rt6i_node = fn; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + info->nl_net->ipv6.rt6_stats->fib_rt_entries++; + + if (!(fn->fn_flags & RTN_RTINFO)) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } - rt->u.dst.rt6_next = iter; - *ins = rt; - rt->rt6i_node = fn; - atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); - info->nl_net->ipv6.rt6_stats->fib_rt_entries++; - - if ((fn->fn_flags & RTN_RTINFO) == 0) { - info->nl_net->ipv6.rt6_stats->fib_route_nodes++; - fn->fn_flags |= RTN_RTINFO; + } else { + if (!found) { + if (add) + goto add; + pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); + return -ENOENT; + } + if (mx) { + err = fib6_commit_metrics(&rt->dst, mx, mx_len); + if (err) + return err; + } + *ins = rt; + rt->rt6i_node = fn; + rt->dst.rt6_next = iter->dst.rt6_next; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + rt6_release(iter); + if (!(fn->fn_flags & RTN_RTINFO)) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } } return 0; @@ -658,7 +823,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) + (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -676,16 +841,31 @@ void fib6_force_start_gc(struct net *net) * with source addr info in sub-trees */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, + struct nlattr *mx, int mx_len) { struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; - - fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); - - if (fn == NULL) + int allow_create = 1; + int replace_required = 0; + + if (info->nlh) { + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) + allow_create = 0; + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) + replace_required = 1; + } + if (!allow_create && !replace_required) + pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); + + fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + offsetof(struct rt6_info, rt6i_dst), allow_create, + replace_required); + if (IS_ERR(fn)) { + err = PTR_ERR(fn); + fn = NULL; goto out; + } pn = fn; @@ -693,7 +873,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (fn->subtree == NULL) { + if (!fn->subtree) { struct fib6_node *sfn; /* @@ -708,7 +888,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) /* Create subtree root node */ sfn = node_alloc(); - if (sfn == NULL) + if (!sfn) goto st_failure; sfn->leaf = info->nl_net->ipv6.ip6_null_entry; @@ -719,15 +899,17 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) /* Now add the first leaf node to new subtree */ sn = fib6_add_1(sfn, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src)); + rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); - if (sn == NULL) { + if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in st_failure) stale node in main tree. */ node_free(sfn); + err = PTR_ERR(sn); goto st_failure; } @@ -736,14 +918,17 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) fn->subtree = sfn; } else { sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src)); + rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); - if (sn == NULL) + if (IS_ERR(sn)) { + err = PTR_ERR(sn); goto st_failure; + } } - if (fn->leaf == NULL) { + if (!fn->leaf) { fn->leaf = rt; atomic_inc(&rt->rt6i_ref); } @@ -751,12 +936,11 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) } #endif - err = fib6_add_rt2node(fn, rt, info); - - if (err == 0) { + err = fib6_add_rt2node(fn, rt, info, mx, mx_len); + if (!err) { fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags&RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn, rt); + if (!(rt->rt6i_flags & RTF_CACHE)) + fib6_prune_clones(info->nl_net, pn); } out: @@ -781,7 +965,7 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->u.dst); + dst_free(&rt->dst); } return err; @@ -792,7 +976,7 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->u.dst); + dst_free(&rt->dst); return err; #endif } @@ -803,12 +987,12 @@ st_failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ - struct in6_addr *addr; /* search key */ + int offset; /* key offset on rt6_info */ + const struct in6_addr *addr; /* search key */ }; -static struct fib6_node * fib6_lookup_1(struct fib6_node *root, - struct lookup_args *args) +static struct fib6_node *fib6_lookup_1(struct fib6_node *root, + struct lookup_args *args) { struct fib6_node *fn; __be32 dir; @@ -833,11 +1017,10 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, fn = next; continue; } - break; } - while(fn) { + while (fn) { if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { struct rt6key *key; @@ -846,14 +1029,22 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) - fn = fib6_lookup_1(fn->subtree, args + 1); + if (fn->subtree) { + struct fib6_node *sfn; + sfn = fib6_lookup_1(fn->subtree, + args + 1); + if (!sfn) + goto backtrack; + fn = sfn; + } #endif - if (!fn || fn->fn_flags & RTN_RTINFO) + if (fn->fn_flags & RTN_RTINFO) return fn; } } - +#ifdef CONFIG_IPV6_SUBTREES +backtrack: +#endif if (fn->fn_flags & RTN_ROOT) break; @@ -863,8 +1054,8 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, return NULL; } -struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, - struct in6_addr *saddr) +struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { @@ -884,8 +1075,7 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, }; fn = fib6_lookup_1(root, daddr ? args : args + 1); - - if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) + if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; @@ -897,9 +1087,9 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, */ -static struct fib6_node * fib6_locate_1(struct fib6_node *root, - struct in6_addr *addr, - int plen, int offset) +static struct fib6_node *fib6_locate_1(struct fib6_node *root, + const struct in6_addr *addr, + int plen, int offset) { struct fib6_node *fn; @@ -927,9 +1117,9 @@ static struct fib6_node * fib6_locate_1(struct fib6_node *root, return NULL; } -struct fib6_node * fib6_locate(struct fib6_node *root, - struct in6_addr *daddr, int dst_len, - struct in6_addr *saddr, int src_len) +struct fib6_node *fib6_locate(struct fib6_node *root, + const struct in6_addr *daddr, int dst_len, + const struct in6_addr *saddr, int src_len) { struct fib6_node *fn; @@ -945,7 +1135,7 @@ struct fib6_node * fib6_locate(struct fib6_node *root, } #endif - if (fn && fn->fn_flags&RTN_RTINFO) + if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; @@ -959,14 +1149,13 @@ struct fib6_node * fib6_locate(struct fib6_node *root, static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) { - if (fn->fn_flags&RTN_ROOT) + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; - while(fn) { - if(fn->left) + while (fn) { + if (fn->left) return fn->left->leaf; - - if(fn->right) + if (fn->right) return fn->right->leaf; fn = FIB6_SUBTREE(fn); @@ -998,18 +1187,20 @@ static struct fib6_node *fib6_repair_tree(struct net *net, children = 0; child = NULL; - if (fn->right) child = fn->right, children |= 1; - if (fn->left) child = fn->left, children |= 2; + if (fn->right) + child = fn->right, children |= 1; + if (fn->left) + child = fn->left, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ - || (children && fn->fn_flags&RTN_ROOT) + || (children && fn->fn_flags & RTN_ROOT) #endif ) { fn->leaf = fib6_find_prefix(net, fn); #if RT6_DEBUG >= 2 - if (fn->leaf==NULL) { + if (!fn->leaf) { WARN_ON(!fn->leaf); fn->leaf = net->ipv6.ip6_null_entry; } @@ -1027,8 +1218,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) pn->right = child; - else if (pn->left == fn) pn->left = child; + if (pn->right == fn) + pn->right = child; + else if (pn->left == fn) + pn->left = child; #if RT6_DEBUG >= 2 else WARN_ON(1); @@ -1042,7 +1235,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&fib6_walker_lock); FOR_WALKERS(w) { - if (child == NULL) { + if (!child) { if (w->root == fn) { w->root = w->node = NULL; RT6_TRACE("W %p adjusted by delroot 1\n", w); @@ -1060,10 +1253,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } @@ -1071,7 +1264,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_unlock(&fib6_walker_lock); node_free(fn); - if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn)) + if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; rt6_release(pn->leaf); @@ -1090,7 +1283,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, RT6_TRACE("fib6_del_route\n"); /* Unlink it */ - *rtp = rt->u.dst.rt6_next; + *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1099,22 +1292,33 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, if (fn->rr_ptr == rt) fn->rr_ptr = NULL; + /* Remove this entry from other siblings */ + if (rt->rt6i_nsiblings) { + struct rt6_info *sibling, *next_sibling; + + list_for_each_entry_safe(sibling, next_sibling, + &rt->rt6i_siblings, rt6i_siblings) + sibling->rt6i_nsiblings--; + rt->rt6i_nsiblings = 0; + list_del_init(&rt->rt6i_siblings); + } + /* Adjust walkers */ read_lock(&fib6_walker_lock); FOR_WALKERS(w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->u.dst.rt6_next; - if (w->leaf == NULL) + w->leaf = rt->dst.rt6_next; + if (!w->leaf) w->state = FWS_U; } } read_unlock(&fib6_walker_lock); - rt->u.dst.rt6_next = NULL; + rt->dst.rt6_next = NULL; /* If it was last route, expunge its radix tree node */ - if (fn->leaf == NULL) { + if (!fn->leaf) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; fn = fib6_repair_tree(net, fn); @@ -1128,7 +1332,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, * to still alive ones. */ while (fn) { - if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { + if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { fn->leaf = fib6_find_prefix(net, fn); atomic_inc(&fn->leaf->rt6i_ref); rt6_release(rt); @@ -1150,34 +1354,34 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) struct rt6_info **rtp; #if RT6_DEBUG >= 2 - if (rt->u.dst.obsolete>0) { + if (rt->dst.obsolete > 0) { WARN_ON(fn != NULL); return -ENOENT; } #endif - if (fn == NULL || rt == net->ipv6.ip6_null_entry) + if (!fn || rt == net->ipv6.ip6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags&RTF_CACHE)) { + if (!(rt->rt6i_flags & RTF_CACHE)) { struct fib6_node *pn = fn; #ifdef CONFIG_IPV6_SUBTREES /* clones of this route might be in another subtree */ if (rt->rt6i_src.plen) { - while (!(pn->fn_flags&RTN_ROOT)) + while (!(pn->fn_flags & RTN_ROOT)) pn = pn->parent; pn = pn->parent; } #endif - fib6_prune_clones(info->nl_net, pn, rt); + fib6_prune_clones(info->nl_net, pn); } /* * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.dst.rt6_next) { + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { if (*rtp == rt) { fib6_del_route(fn, rtp, info); return 0; @@ -1216,11 +1420,11 @@ static int fib6_walk_continue(struct fib6_walker_t *w) for (;;) { fn = w->node; - if (fn == NULL) + if (!fn) return 0; if (w->prune && fn != w->root && - fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { w->state = FWS_C; w->leaf = fn->leaf; } @@ -1249,12 +1453,22 @@ static int fib6_walk_continue(struct fib6_walker_t *w) w->state = FWS_C; w->leaf = fn->leaf; case FWS_C: - if (w->leaf && fn->fn_flags&RTN_RTINFO) { - int err = w->func(w); + if (w->leaf && fn->fn_flags & RTN_RTINFO) { + int err; + + if (w->skip) { + w->skip--; + goto skip; + } + + err = w->func(w); if (err) return err; + + w->count++; continue; } +skip: w->state = FWS_U; case FWS_U: if (fn == w->root) @@ -1307,14 +1521,15 @@ static int fib6_clean_node(struct fib6_walker_t *w) .nl_net = c->net, }; - for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 - printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); + pr_debug("%s: del failed: rt=%p@%p err=%d\n", + __func__, rt, rt->rt6i_node, res); #endif continue; } @@ -1346,6 +1561,8 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.root = root; c.w.func = fib6_clean_node; c.w.prune = prune; + c.w.count = 0; + c.w.skip = 0; c.func = func; c.arg = arg; c.net = net; @@ -1354,20 +1571,19 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, } void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) + void *arg) { struct fib6_table *table; - struct hlist_node *node; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + hlist_for_each_entry_rcu(table, head, tb6_hlist) { write_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, prune, arg); + func, 0, arg); write_unlock_bh(&table->tb6_lock); } } @@ -1384,10 +1600,9 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg) return 0; } -static void fib6_prune_clones(struct net *net, struct fib6_node *fn, - struct rt6_info *rt) +static void fib6_prune_clones(struct net *net, struct fib6_node *fn) { - fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt); + fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL); } /* @@ -1412,22 +1627,31 @@ static int fib6_age(struct rt6_info *rt, void *arg) * only if they are not in use now. */ - if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { - if (time_after(now, rt->rt6i_expires)) { + if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { + if (time_after(now, rt->dst.expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args.more++; } else if (rt->rt6i_flags & RTF_CACHE) { - if (atomic_read(&rt->u.dst.__refcnt) == 0 && - time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) { + if (atomic_read(&rt->dst.__refcnt) == 0 && + time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; - } else if ((rt->rt6i_flags & RTF_GATEWAY) && - (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + return -1; + } } gc_args.more++; } @@ -1437,27 +1661,28 @@ static int fib6_age(struct rt6_info *rt, void *arg) static DEFINE_SPINLOCK(fib6_gc_lock); -void fib6_run_gc(unsigned long expires, struct net *net) +void fib6_run_gc(unsigned long expires, struct net *net, bool force) { - if (expires != ~0UL) { + unsigned long now; + + if (force) { spin_lock_bh(&fib6_gc_lock); - gc_args.timeout = expires ? (int)expires : - net->ipv6.sysctl.ip6_rt_gc_interval; - } else { - if (!spin_trylock_bh(&fib6_gc_lock)) { - mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); - return; - } - gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval; + } else if (!spin_trylock_bh(&fib6_gc_lock)) { + mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); + return; } + gc_args.timeout = expires ? (int)expires : + net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = icmp6_dst_gc(); - fib6_clean_all(net, fib6_age, 0, NULL); + fib6_clean_all(net, fib6_age, NULL); + now = jiffies; + net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, - round_jiffies(jiffies + round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); @@ -1466,20 +1691,23 @@ void fib6_run_gc(unsigned long expires, struct net *net) static void fib6_gc_timer_cb(unsigned long arg) { - fib6_run_gc(0, (struct net *)arg); + fib6_run_gc(0, (struct net *)arg, true); } -static int fib6_net_init(struct net *net) +static int __net_init fib6_net_init(struct net *net) { + size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_timer; - net->ipv6.fib_table_hash = kcalloc(FIB6_TABLE_HASHSZ, - sizeof(*net->ipv6.fib_table_hash), - GFP_KERNEL); + /* Avoid false sharing : Use at least a full cache line */ + size = max_t(size_t, size, L1_CACHE_BYTES); + + net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; @@ -1492,6 +1720,7 @@ static int fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), @@ -1502,6 +1731,7 @@ static int fib6_net_init(struct net *net) net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif fib6_tables_init(net); @@ -1517,7 +1747,7 @@ out_rt6_stats: kfree(net->ipv6.rt6_stats); out_timer: return -ENOMEM; - } +} static void fib6_net_exit(struct net *net) { @@ -1525,8 +1755,10 @@ static void fib6_net_exit(struct net *net) del_timer_sync(&net->ipv6.ip6_fib_timer); #ifdef CONFIG_IPV6_MULTIPLE_TABLES + inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); kfree(net->ipv6.fib6_local_tbl); #endif + inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); kfree(net->ipv6.fib6_main_tbl); kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); @@ -1552,7 +1784,8 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); + ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, + NULL); if (ret) goto out_unregister_subsys; out: @@ -1570,3 +1803,189 @@ void fib6_gc_cleanup(void) unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } + +#ifdef CONFIG_PROC_FS + +struct ipv6_route_iter { + struct seq_net_private p; + struct fib6_walker_t w; + loff_t skip; + struct fib6_table *tbl; + __u32 sernum; +}; + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct rt6_info *rt = v; + struct ipv6_route_iter *iter = seq->private; + + seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); +#else + seq_puts(seq, "00000000000000000000000000000000 00 "); +#endif + if (rt->rt6i_flags & RTF_GATEWAY) + seq_printf(seq, "%pi6", &rt->rt6i_gateway); + else + seq_puts(seq, "00000000000000000000000000000000"); + + seq_printf(seq, " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), + rt->dst.__use, rt->rt6i_flags, + rt->dst.dev ? rt->dst.dev->name : ""); + iter->w.leaf = NULL; + return 0; +} + +static int ipv6_route_yield(struct fib6_walker_t *w) +{ + struct ipv6_route_iter *iter = w->args; + + if (!iter->skip) + return 1; + + do { + iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->skip--; + if (!iter->skip && iter->w.leaf) + return 1; + } while (iter->w.leaf); + + return 0; +} + +static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) +{ + memset(&iter->w, 0, sizeof(iter->w)); + iter->w.func = ipv6_route_yield; + iter->w.root = &iter->tbl->tb6_root; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + iter->w.args = iter; + iter->sernum = iter->w.root->fn_sernum; + INIT_LIST_HEAD(&iter->w.lh); + fib6_walker_link(&iter->w); +} + +static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, + struct net *net) +{ + unsigned int h; + struct hlist_node *node; + + if (tbl) { + h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; + node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist)); + } else { + h = 0; + node = NULL; + } + + while (!node && h < FIB6_TABLE_HASHSZ) { + node = rcu_dereference_bh( + hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); + } + return hlist_entry_safe(node, struct fib6_table, tb6_hlist); +} + +static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) +{ + if (iter->sernum != iter->w.root->fn_sernum) { + iter->sernum = iter->w.root->fn_sernum; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + WARN_ON(iter->w.skip); + iter->w.skip = iter->w.count; + } +} + +static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int r; + struct rt6_info *n; + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + if (!v) + goto iter_table; + + n = ((struct rt6_info *)v)->dst.rt6_next; + if (n) { + ++*pos; + return n; + } + +iter_table: + ipv6_route_check_sernum(iter); + read_lock(&iter->tbl->tb6_lock); + r = fib6_walk_continue(&iter->w); + read_unlock(&iter->tbl->tb6_lock); + if (r > 0) { + if (v) + ++*pos; + return iter->w.leaf; + } else if (r < 0) { + fib6_walker_unlink(&iter->w); + return NULL; + } + fib6_walker_unlink(&iter->w); + + iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); + if (!iter->tbl) + return NULL; + + ipv6_route_seq_setup_walk(iter); + goto iter_table; +} + +static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU_BH) +{ + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + rcu_read_lock_bh(); + iter->tbl = ipv6_route_seq_next_table(NULL, net); + iter->skip = *pos; + + if (iter->tbl) { + ipv6_route_seq_setup_walk(iter); + return ipv6_route_seq_next(seq, NULL, pos); + } else { + return NULL; + } +} + +static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) +{ + struct fib6_walker_t *w = &iter->w; + return w->node && !(w->state == FWS_U && w->node == w->root); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) + __releases(RCU_BH) +{ + struct ipv6_route_iter *iter = seq->private; + + if (ipv6_route_iter_active(iter)) + fib6_walker_unlink(&iter->w); + + rcu_read_unlock_bh(); +} + +static const struct seq_operations ipv6_route_seq_ops = { + .start = ipv6_route_seq_start, + .next = ipv6_route_seq_next, + .stop = ipv6_route_seq_stop, + .show = ipv6_route_seq_show +}; + +int ipv6_route_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipv6_route_seq_ops, + sizeof(struct ipv6_route_iter)); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 6e7bffa2205..4052694c6f2 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -15,22 +15,18 @@ #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> -#include <linux/if_arp.h> #include <linux/in6.h> -#include <linux/route.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> -#include <net/ndisc.h> -#include <net/protocol.h> -#include <net/ip6_route.h> -#include <net/addrconf.h> #include <net/rawv6.h> -#include <net/icmp.h> #include <net/transp_v6.h> #include <asm/uaccess.h> @@ -38,7 +34,7 @@ #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ -#define FL_MAX_LINGER 60 /* Maximal linger timeout */ +#define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ @@ -48,25 +44,38 @@ #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); -static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; +static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(unsigned long dummy); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); /* FL hash table lock: it protects only of GC */ -static DEFINE_RWLOCK(ip6_fl_lock); +static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ -static DEFINE_RWLOCK(ip6_sk_fl_lock); +static DEFINE_SPINLOCK(ip6_sk_fl_lock); +#define for_each_fl_rcu(hash, fl) \ + for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ + fl != NULL; \ + fl = rcu_dereference_bh(fl->next)) +#define for_each_fl_continue_rcu(fl) \ + for (fl = rcu_dereference_bh(fl->next); \ + fl != NULL; \ + fl = rcu_dereference_bh(fl->next)) + +#define for_each_sk_fl_rcu(np, sfl) \ + for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \ + sfl != NULL; \ + sfl = rcu_dereference_bh(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { + for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } @@ -77,11 +86,11 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; - read_lock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); fl = __fl_lookup(net, label); - if (fl) - atomic_inc(&fl->users); - read_unlock_bh(&ip6_fl_lock); + if (fl && !atomic_inc_not_zero(&fl->users)) + fl = NULL; + rcu_read_unlock_bh(); return fl; } @@ -89,15 +98,17 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) static void fl_free(struct ip6_flowlabel *fl) { if (fl) { + if (fl->share == IPV6_FL_S_PROCESS) + put_pid(fl->owner.pid); release_net(fl->fl_net); kfree(fl->opt); + kfree_rcu(fl, rcu); } - kfree(fl); } static void fl_release(struct ip6_flowlabel *fl) { - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { @@ -114,7 +125,7 @@ static void fl_release(struct ip6_flowlabel *fl) time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(unsigned long dummy) @@ -123,12 +134,15 @@ static void ip6_fl_gc(unsigned long dummy) unsigned long now = jiffies; unsigned long sched = 0; - write_lock(&ip6_fl_lock); + spin_lock(&ip6_fl_lock); for (i=0; i<=FL_HASH_MASK; i++) { - struct ip6_flowlabel *fl, **flp; + struct ip6_flowlabel *fl; + struct ip6_flowlabel __rcu **flp; + flp = &fl_ht[i]; - while ((fl=*flp) != NULL) { + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) @@ -151,18 +165,21 @@ static void ip6_fl_gc(unsigned long dummy) if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } - write_unlock(&ip6_fl_lock); + spin_unlock(&ip6_fl_lock); } -static void ip6_fl_purge(struct net *net) +static void __net_exit ip6_fl_purge(struct net *net) { int i; - write_lock(&ip6_fl_lock); + spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { - struct ip6_flowlabel *fl, **flp; + struct ip6_flowlabel *fl; + struct ip6_flowlabel __rcu **flp; + flp = &fl_ht[i]; - while ((fl = *flp) != NULL) { + while ((fl = rcu_dereference_protected(*flp, + lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; @@ -173,7 +190,7 @@ static void ip6_fl_purge(struct net *net) flp = &fl->next; } } - write_unlock(&ip6_fl_lock); + spin_unlock(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, @@ -183,10 +200,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->label = label & IPV6_FLOWLABEL_MASK; - write_lock_bh(&ip6_fl_lock); + spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { - fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; + fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (lfl == NULL) @@ -205,16 +222,16 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl != NULL) { atomic_inc(&lfl->users); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; - fl_ht[FL_HASH(fl->label)] = fl; + rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); - write_unlock_bh(&ip6_fl_lock); + spin_unlock_bh(&ip6_fl_lock); return NULL; } @@ -229,17 +246,17 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) label &= IPV6_FLOWLABEL_MASK; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label) { fl->lastuse = jiffies; atomic_inc(&fl->users); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return fl; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return NULL; } @@ -250,11 +267,21 @@ void fl6_free_socklist(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; - while ((sfl = np->ipv6_fl_list) != NULL) { + if (!rcu_access_pointer(np->ipv6_fl_list)) + return; + + spin_lock_bh(&ip6_sk_fl_lock); + while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; + spin_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); - kfree(sfl); + kfree_rcu(sfl, rcu); + + spin_lock_bh(&ip6_sk_fl_lock); } + spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ @@ -292,6 +319,7 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, opt_space->opt_flen = fopt->opt_flen; return opt_space; } +EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { @@ -310,6 +338,8 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo expires = check_linger(expires); if (!expires) return -EPERM; + + spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; @@ -317,12 +347,14 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; + spin_unlock_bh(&ip6_fl_lock); + return 0; } static struct ip6_flowlabel * -fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, - int optlen, int *err_p) +fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, + char __user *optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; @@ -341,7 +373,7 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, if (olen > 0) { struct msghdr msg; - struct flowi flowi; + struct flowi6 flowi6; int junk; err = -ENOMEM; @@ -357,9 +389,10 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, msg.msg_controllen = olen; msg.msg_control = (void*)(fl->opt+1); - flowi.oif = 0; + memset(&flowi6, 0, sizeof(flowi6)); - err = datagram_send_ctl(net, &msg, &flowi, fl->opt, &junk, &junk); + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, + &junk, &junk, &junk); if (err) goto done; err = -EINVAL; @@ -383,17 +416,17 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, err = -EINVAL; goto done; } - ipv6_addr_copy(&fl->dst, &freq->flr_dst); + fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: - fl->owner = current->pid; + fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: - fl->owner = current_euid(); + fl->owner.uid = current_euid(); break; default: err = -EINVAL; @@ -417,8 +450,10 @@ static int mem_check(struct sock *sk) if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) count++; + rcu_read_unlock_bh(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || @@ -429,42 +464,51 @@ static int mem_check(struct sock *sk) return 0; } -static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2) +static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) { - if (h1 == h2) - return 0; - if (h1 == NULL || h2 == NULL) - return 1; - if (h1->hdrlen != h2->hdrlen) - return 1; - return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - sizeof(*h1)); + spin_lock_bh(&ip6_sk_fl_lock); + sfl->fl = fl; + sfl->next = np->ipv6_fl_list; + rcu_assign_pointer(np->ipv6_fl_list, sfl); + spin_unlock_bh(&ip6_sk_fl_lock); } -static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) +int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, + int flags) { - if (o1 == o2) + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + + if (flags & IPV6_FL_F_REMOTE) { + freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; - if (o1 == NULL || o2 == NULL) - return 1; - if (o1->opt_nflen != o2->opt_nflen) - return 1; - if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt)) - return 1; - if (ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt)) - return 1; - if (ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt)) - return 1; - return 0; -} + } -static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, - struct ip6_flowlabel *fl) -{ - write_lock_bh(&ip6_sk_fl_lock); - sfl->fl = fl; - sfl->next = np->ipv6_fl_list; - np->ipv6_fl_list = sfl; - write_unlock_bh(&ip6_sk_fl_lock); + if (np->repflow) { + freq->flr_label = np->flow_label; + return 0; + } + + rcu_read_lock_bh(); + + for_each_sk_fl_rcu(np, sfl) { + if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { + spin_lock_bh(&ip6_fl_lock); + freq->flr_label = sfl->fl->label; + freq->flr_dst = sfl->fl->dst; + freq->flr_share = sfl->fl->share; + freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; + freq->flr_linger = sfl->fl->linger / HZ; + + spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock_bh(); + return 0; + } + } + rcu_read_unlock_bh(); + + return -ENOENT; } int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) @@ -474,7 +518,8 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) struct ipv6_pinfo *np = inet6_sk(sk); struct in6_flowlabel_req freq; struct ipv6_fl_socklist *sfl1=NULL; - struct ipv6_fl_socklist *sfl, **sflp; + struct ipv6_fl_socklist *sfl; + struct ipv6_fl_socklist __rcu **sflp; struct ip6_flowlabel *fl, *fl1 = NULL; @@ -486,33 +531,45 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) switch (freq.flr_action) { case IPV6_FL_A_PUT: - write_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + if (freq.flr_flags & IPV6_FL_F_REFLECT) { + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + if (!np->repflow) + return -ESRCH; + np->flow_label = 0; + np->repflow = 0; + return 0; + } + spin_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; + (sfl = rcu_dereference(*sflp))!=NULL; + sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = sfl->next; - write_unlock_bh(&ip6_sk_fl_lock); + *sflp = rcu_dereference(sfl->next); + spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); - kfree(sfl); + kfree_rcu(sfl, rcu); return 0; } } - write_unlock_bh(&ip6_sk_fl_lock); + spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; case IPV6_FL_A_RENEW: - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); return err; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); - if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) { + if (freq.flr_share == IPV6_FL_S_NONE && + ns_capable(net->user_ns, CAP_NET_ADMIN)) { fl = fl_lookup(net, freq.flr_label); if (fl) { err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); @@ -523,21 +580,35 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) return -ESRCH; case IPV6_FL_A_GET: + if (freq.flr_flags & IPV6_FL_F_REFLECT) { + struct net *net = sock_net(sk); + if (net->ipv6.sysctl.flowlabel_consistency) { + net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); + return -EPERM; + } + + if (sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + + np->repflow = 1; + return 0; + } + if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; - fl = fl_create(net, &freq, optval, optlen, &err); + fl = fl_create(net, sk, &freq, optval, optlen, &err); if (fl == NULL) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); if (freq.flr_label) { err = -EEXIST; - read_lock_bh(&ip6_sk_fl_lock); - for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + rcu_read_lock_bh(); + for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_flags&IPV6_FL_F_EXCL) { - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); goto done; } fl1 = sfl->fl; @@ -545,7 +616,7 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) break; } } - read_unlock_bh(&ip6_sk_fl_lock); + rcu_read_unlock_bh(); if (fl1 == NULL) fl1 = fl_lookup(net, freq.flr_label); @@ -557,12 +628,10 @@ recheck: err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || - fl1->owner != fl->owner) - goto release; - - err = -EINVAL; - if (!ipv6_addr_equal(&fl1->dst, &fl->dst) || - ipv6_opt_cmp(fl1->opt, fl->opt)) + ((fl1->share == IPV6_FL_S_PROCESS) && + (fl1->owner.pid == fl->owner.pid)) || + ((fl1->share == IPV6_FL_S_USER) && + uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; @@ -617,6 +686,7 @@ done: struct ip6fl_iter_state { struct seq_net_private p; + struct pid_namespace *pid_ns; int bucket; }; @@ -629,13 +699,13 @@ static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { - fl = fl_ht[state->bucket]; - - while (fl && !net_eq(fl->fl_net, net)) - fl = fl->next; - if (fl) - break; + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } } + fl = NULL; +out: return fl; } @@ -644,18 +714,22 @@ static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flo struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); - fl = fl->next; + for_each_fl_continue_rcu(fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } + try_again: - while (fl && !net_eq(fl->fl_net, net)) - fl = fl->next; - - while (!fl) { - if (++state->bucket <= FL_HASH_MASK) { - fl = fl_ht[state->bucket]; - goto try_again; - } else - break; + if (++state->bucket <= FL_HASH_MASK) { + for_each_fl_rcu(state->bucket, fl) { + if (net_eq(fl->fl_net, net)) + goto out; + } + goto try_again; } + fl = NULL; + +out: return fl; } @@ -669,9 +743,9 @@ static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(ip6_fl_lock) + __acquires(RCU) { - read_lock_bh(&ip6_fl_lock); + rcu_read_lock_bh(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -688,13 +762,14 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ip6fl_seq_stop(struct seq_file *seq, void *v) - __releases(ip6_fl_lock) + __releases(RCU) { - read_unlock_bh(&ip6_fl_lock); + rcu_read_unlock_bh(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n", "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt"); @@ -702,9 +777,13 @@ static int ip6fl_seq_show(struct seq_file *seq, void *v) struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", - (unsigned)ntohl(fl->label), + (unsigned int)ntohl(fl->label), fl->share, - (unsigned)fl->owner, + ((fl->share == IPV6_FL_S_PROCESS) ? + pid_nr_ns(fl->owner.pid, state->pid_ns) : + ((fl->share == IPV6_FL_S_USER) ? + from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : + 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, @@ -723,8 +802,29 @@ static const struct seq_operations ip6fl_seq_ops = { static int ip6fl_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &ip6fl_seq_ops, - sizeof(struct ip6fl_iter_state)); + struct seq_file *seq; + struct ip6fl_iter_state *state; + int err; + + err = seq_open_net(inode, file, &ip6fl_seq_ops, + sizeof(struct ip6fl_iter_state)); + + if (!err) { + seq = file->private_data; + state = ip6fl_seq_private(seq); + rcu_read_lock(); + state->pid_ns = get_pid_ns(task_active_pid_ns(current)); + rcu_read_unlock(); + } + return err; +} + +static int ip6fl_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + put_pid_ns(state->pid_ns); + return seq_release_net(inode, file); } static const struct file_operations ip6fl_seq_fops = { @@ -732,20 +832,20 @@ static const struct file_operations ip6fl_seq_fops = { .open = ip6fl_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_net, + .release = ip6fl_seq_release, }; -static int ip6_flowlabel_proc_init(struct net *net) +static int __net_init ip6_flowlabel_proc_init(struct net *net) { - if (!proc_net_fops_create(net, "ip6_flowlabel", - S_IRUGO, &ip6fl_seq_fops)) + if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net, + &ip6fl_seq_fops)) return -ENOMEM; return 0; } -static void ip6_flowlabel_proc_fini(struct net *net) +static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { - proc_net_remove(net, "ip6_flowlabel"); + remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) @@ -754,11 +854,10 @@ static inline int ip6_flowlabel_proc_init(struct net *net) } static inline void ip6_flowlabel_proc_fini(struct net *net) { - return ; } #endif -static inline void ip6_flowlabel_net_exit(struct net *net) +static void __net_exit ip6_flowlabel_net_exit(struct net *net) { ip6_fl_purge(net); ip6_flowlabel_proc_fini(net); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c new file mode 100644 index 00000000000..3873181ed85 --- /dev/null +++ b/net/ipv6/ip6_gre.c @@ -0,0 +1,1722 @@ +/* + * GRE over IPv6 protocol decoder. + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/mroute.h> +#include <linux/init.h> +#include <linux/in6.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/hash.h> +#include <linux/if_tunnel.h> +#include <linux/ip6_tunnel.h> + +#include <net/sock.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <net/addrconf.h> +#include <net/arp.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/rtnetlink.h> + +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/ip6_tunnel.h> + + +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); + +#define HASH_SIZE_SHIFT 5 +#define HASH_SIZE (1 << HASH_SIZE_SHIFT) + +static int ip6gre_net_id __read_mostly; +struct ip6gre_net { + struct ip6_tnl __rcu *tunnels[4][HASH_SIZE]; + + struct net_device *fb_tunnel_dev; +}; + +static struct rtnl_link_ops ip6gre_link_ops __read_mostly; +static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; +static int ip6gre_tunnel_init(struct net_device *dev); +static void ip6gre_tunnel_setup(struct net_device *dev); +static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); +static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); + +/* Tunnel hash table */ + +/* + 4 hash tables: + + 3: (remote,local) + 2: (remote,*) + 1: (*,local) + 0: (*,*) + + We require exact key match i.e. if a key is present in packet + it will match only tunnel with the same key; if it is not present, + it will match only keyless tunnel. + + All keysless packets, if not matched configured keyless tunnels + will match fallback tunnel. + */ + +#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1)) +static u32 HASH_ADDR(const struct in6_addr *addr) +{ + u32 hash = ipv6_addr_hash(addr); + + return hash_32(hash, HASH_SIZE_SHIFT); +} + +#define tunnels_r_l tunnels[3] +#define tunnels_r tunnels[2] +#define tunnels_l tunnels[1] +#define tunnels_wc tunnels[0] + +/* Given src, dst and key, find appropriate for input tunnel. */ + +static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, + const struct in6_addr *remote, const struct in6_addr *local, + __be32 key, __be16 gre_proto) +{ + struct net *net = dev_net(dev); + int link = dev->ifindex; + unsigned int h0 = HASH_ADDR(remote); + unsigned int h1 = HASH_KEY(key); + struct ip6_tnl *t, *cand = NULL; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + int dev_type = (gre_proto == htons(ETH_P_TEB)) ? + ARPHRD_ETHER : ARPHRD_IP6GRE; + int score, cand_score = 4; + + for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_equal(remote, &t->parms.raddr) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { + if (!ipv6_addr_equal(remote, &t->parms.raddr) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { + if ((!ipv6_addr_equal(local, &t->parms.laddr) && + (!ipv6_addr_equal(local, &t->parms.raddr) || + !ipv6_addr_is_multicast(local))) || + key != t->parms.i_key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { + if (t->parms.i_key != key || + !(t->dev->flags & IFF_UP)) + continue; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + continue; + + score = 0; + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) + return t; + + if (score < cand_score) { + cand = t; + cand_score = score; + } + } + + if (cand != NULL) + return cand; + + dev = ign->fb_tunnel_dev; + if (dev->flags & IFF_UP) + return netdev_priv(dev); + + return NULL; +} + +static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign, + const struct __ip6_tnl_parm *p) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned int h = HASH_KEY(p->i_key); + int prio = 0; + + if (!ipv6_addr_any(local)) + prio |= 1; + if (!ipv6_addr_any(remote) && !ipv6_addr_is_multicast(remote)) { + prio |= 2; + h ^= HASH_ADDR(remote); + } + + return &ign->tunnels[prio][h]; +} + +static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign, + const struct ip6_tnl *t) +{ + return __ip6gre_bucket(ign, &t->parms); +} + +static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = ip6gre_bucket(ign, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static struct ip6_tnl *ip6gre_tunnel_find(struct net *net, + const struct __ip6_tnl_parm *parms, + int type) +{ + const struct in6_addr *remote = &parms->raddr; + const struct in6_addr *local = &parms->laddr; + __be32 key = parms->i_key; + int link = parms->link; + struct ip6_tnl *t; + struct ip6_tnl __rcu **tp; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + for (tp = __ip6gre_bucket(ign, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + key == t->parms.i_key && + link == t->parms.link && + type == t->dev->type) + break; + + return t; +} + +static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, + const struct __ip6_tnl_parm *parms, int create) +{ + struct ip6_tnl *t, *nt; + struct net_device *dev; + char name[IFNAMSIZ]; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); + if (t || !create) + return t; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else + strcpy(name, "ip6gre%d"); + + dev = alloc_netdev(sizeof(*t), name, ip6gre_tunnel_setup); + if (!dev) + return NULL; + + dev_net_set(dev, net); + + nt = netdev_priv(dev); + nt->parms = *parms; + dev->rtnl_link_ops = &ip6gre_link_ops; + + nt->dev = dev; + nt->net = dev_net(dev); + ip6gre_tnl_link_config(nt, 1); + + if (register_netdevice(dev) < 0) + goto failed_free; + + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + + dev_hold(dev); + ip6gre_tunnel_link(ign, nt); + return nt; + +failed_free: + free_netdev(dev); + return NULL; +} + +static void ip6gre_tunnel_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); + + ip6gre_tunnel_unlink(ign, t); + dev_put(dev); +} + + +static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; + __be16 *p = (__be16 *)(skb->data + offset); + int grehlen = offset + 4; + struct ip6_tnl *t; + __be16 flags; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_KEY) { + grehlen += 4; + if (flags&GRE_CSUM) + grehlen += 4; + } + } + + /* If only 8 bytes returned, keyed message will be dropped here */ + if (!pskb_may_pull(skb, grehlen)) + return; + ipv6h = (const struct ipv6hdr *)skb->data; + p = (__be16 *)(skb->data + offset); + + t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, + flags & GRE_KEY ? + *(((__be32 *)p) + (grehlen / 4) - 1) : 0, + p[1]); + if (t == NULL) + return; + + switch (type) { + __u32 teli; + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 mtu; + case ICMPV6_DEST_UNREACH: + net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); + break; + case ICMPV6_TIME_EXCEED: + if (code == ICMPV6_EXC_HOPLIMIT) { + net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); + } + break; + case ICMPV6_PARAMPROB: + teli = 0; + if (code == ICMPV6_HDR_FIELD) + teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); + + if (teli && teli == info - 2) { + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; + if (tel->encap_limit == 0) { + net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); + } + } else { + net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); + } + break; + case ICMPV6_PKT_TOOBIG: + mtu = info - offset; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; + break; + } + + if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +} + +static int ip6gre_rcv(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + u8 *h; + __be16 flags; + __sum16 csum = 0; + __be32 key = 0; + u32 seqno = 0; + struct ip6_tnl *tunnel; + int offset = 4; + __be16 gre_proto; + int err; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + goto drop; + + ipv6h = ipv6_hdr(skb); + h = skb->data; + flags = *(__be16 *)h; + + if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { + /* - Version must be 0. + - We do not support routing headers. + */ + if (flags&(GRE_VERSION|GRE_ROUTING)) + goto drop; + + if (flags&GRE_CSUM) { + csum = skb_checksum_simple_validate(skb); + offset += 4; + } + if (flags&GRE_KEY) { + key = *(__be32 *)(h + offset); + offset += 4; + } + if (flags&GRE_SEQ) { + seqno = ntohl(*(__be32 *)(h + offset)); + offset += 4; + } + } + + gre_proto = *(__be16 *)(h + 2); + + tunnel = ip6gre_tunnel_lookup(skb->dev, + &ipv6h->saddr, &ipv6h->daddr, key, + gre_proto); + if (tunnel) { + struct pcpu_sw_netstats *tstats; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + if (!ip6_tnl_rcv_ctl(tunnel, &ipv6h->daddr, &ipv6h->saddr)) { + tunnel->dev->stats.rx_dropped++; + goto drop; + } + + skb->protocol = gre_proto; + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { + skb->protocol = htons(ETH_P_IP); + if ((*(h + offset) & 0xF0) != 0x40) + offset += 4; + } + + skb->mac_header = skb->network_header; + __pskb_pull(skb, offset); + skb_postpull_rcsum(skb, skb_transport_header(skb), offset); + + if (((flags&GRE_CSUM) && csum) || + (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + if (tunnel->parms.i_flags&GRE_SEQ) { + if (!(flags&GRE_SEQ) || + (tunnel->i_seqno && + (s32)(seqno - tunnel->i_seqno) < 0)) { + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + tunnel->i_seqno = seqno + 1; + } + + /* Warning: All skb pointers will be invalidated! */ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } + + ipv6h = ipv6_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } + + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + + skb_reset_network_header(skb); + + err = IP6_ECN_decapsulate(ipv6h, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", + &ipv6h->saddr, + ipv6_get_dsfield(ipv6h)); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto drop; + } + } + + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + netif_rx(skb); + + return 0; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + +struct ipv6_tel_txoption { + struct ipv6_txoptions ops; + __u8 dst_opt[8]; +}; + +static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) +{ + memset(opt, 0, sizeof(struct ipv6_tel_txoption)); + + opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; + opt->dst_opt[3] = 1; + opt->dst_opt[4] = encap_limit; + opt->dst_opt[5] = IPV6_TLV_PADN; + opt->dst_opt[6] = 1; + + opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt; + opt->ops.opt_nflen = 8; +} + +static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, + struct net_device *dev, + __u8 dsfield, + struct flowi6 *fl6, + int encap_limit, + __u32 *pmtu) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + struct net *net = tunnel->net; + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *ipv6h; /* Our new IP header */ + unsigned int max_headroom = 0; /* The extra header space needed */ + int gre_hlen; + struct ipv6_tel_txoption opt; + int mtu; + struct dst_entry *dst = NULL, *ndst = NULL; + struct net_device_stats *stats = &tunnel->dev->stats; + int err = -1; + u8 proto; + struct sk_buff *new_skb; + + if (dev->type == ARPHRD_ETHER) + IPCB(skb)->flags = 0; + + if (dev->header_ops && dev->type == ARPHRD_IP6GRE) { + gre_hlen = 0; + ipv6h = (struct ipv6hdr *)skb->data; + fl6->daddr = ipv6h->daddr; + } else { + gre_hlen = tunnel->hlen; + fl6->daddr = tunnel->parms.raddr; + } + + if (!fl6->flowi6_mark) + dst = ip6_tnl_dst_check(tunnel); + + if (!dst) { + ndst = ip6_route_output(net, NULL, fl6); + + if (ndst->error) + goto tx_err_link_failure; + ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); + ndst = NULL; + goto tx_err_link_failure; + } + dst = ndst; + } + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + tunnel->parms.name); + goto tx_err_dst_release; + } + + mtu = dst_mtu(dst) - sizeof(*ipv6h); + if (encap_limit >= 0) { + max_headroom += 8; + mtu -= 8; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (skb->len > mtu) { + *pmtu = mtu; + err = -EMSGSIZE; + goto tx_err_dst_release; + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + IP6TUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); + + max_headroom += LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len; + + if (skb_headroom(skb) < max_headroom || skb_shared(skb) || + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + new_skb = skb_realloc_headroom(skb, max_headroom); + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; + if (!new_skb) + goto tx_err_dst_release; + + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + consume_skb(skb); + skb = new_skb; + } + + if (fl6->flowi6_mark) { + skb_dst_set(skb, dst); + ndst = NULL; + } else { + skb_dst_set_noref(skb, dst); + } + + proto = NEXTHDR_GRE; + if (encap_limit >= 0) { + init_tel_txopt(&opt, encap_limit); + ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); + } + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + skb_push(skb, gre_hlen); + skb_reset_network_header(skb); + skb_set_transport_header(skb, sizeof(*ipv6h)); + + /* + * Push down and install the IP header. + */ + ipv6h = ipv6_hdr(skb); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); + ipv6h->hop_limit = tunnel->parms.hop_limit; + ipv6h->nexthdr = proto; + ipv6h->saddr = fl6->saddr; + ipv6h->daddr = fl6->daddr; + + ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags; + ((__be16 *)(ipv6h + 1))[1] = (dev->type == ARPHRD_ETHER) ? + htons(ETH_P_TEB) : skb->protocol; + + if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4); + + if (tunnel->parms.o_flags&GRE_SEQ) { + ++tunnel->o_seqno; + *ptr = htonl(tunnel->o_seqno); + ptr--; + } + if (tunnel->parms.o_flags&GRE_KEY) { + *ptr = tunnel->parms.o_key; + ptr--; + } + if (tunnel->parms.o_flags&GRE_CSUM) { + *ptr = 0; + *(__sum16 *)ptr = ip_compute_csum((void *)(ipv6h+1), + skb->len - sizeof(struct ipv6hdr)); + } + } + + ip6tunnel_xmit(skb, dev); + if (ndst) + ip6_tnl_dst_store(tunnel, ndst); + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(ndst); + return err; +} + +static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + const struct iphdr *iph = ip_hdr(skb); + int encap_limit = -1; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + dsfield = ipv4_get_dsfield(iph); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + return -1; + } + + return 0; +} + +static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int encap_limit = -1; + __u16 offset; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + return -1; + + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + + dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + if (err == -EMSGSIZE) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + return -1; + } + + return 0; +} + +/** + * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ + +static inline bool ip6gre_tnl_addr_conflict(const struct ip6_tnl *t, + const struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + int encap_limit = -1; + struct flowi6 fl6; + __u32 mtu; + int err; + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = skb->protocol; + + err = ip6gre_xmit2(skb, dev, 0, &fl6, encap_limit, &mtu); + + return err; +} + +static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + int ret; + + if (!ip6_tnl_xmit_ctl(t)) + goto tx_err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ret = ip6gre_xmit_ipv4(skb, dev); + break; + case htons(ETH_P_IPV6): + ret = ip6gre_xmit_ipv6(skb, dev); + break; + default: + ret = ip6gre_xmit_other(skb, dev); + break; + } + + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) +{ + struct net_device *dev = t->dev; + struct __ip6_tnl_parm *p = &t->parms; + struct flowi6 *fl6 = &t->fl.u.ip6; + int addend = sizeof(struct ipv6hdr) + 4; + + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + } + + /* Set up flowi template */ + fl6->saddr = p->laddr; + fl6->daddr = p->raddr; + fl6->flowi6_oif = p->link; + fl6->flowlabel = 0; + + if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) + fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); + p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); + + if (p->flags&IP6_TNL_F_CAP_XMIT && + p->flags&IP6_TNL_F_CAP_RCV && dev->type != ARPHRD_ETHER) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + dev->iflink = p->link; + + /* Precalculate GRE options length */ + if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (t->parms.o_flags&GRE_CSUM) + addend += 4; + if (t->parms.o_flags&GRE_KEY) + addend += 4; + if (t->parms.o_flags&GRE_SEQ) + addend += 4; + } + t->hlen = addend; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + int strict = (ipv6_addr_type(&p->raddr) & + (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); + + struct rt6_info *rt = rt6_lookup(t->net, + &p->raddr, &p->laddr, + p->link, strict); + + if (rt == NULL) + return; + + if (rt->dst.dev) { + dev->hard_header_len = rt->dst.dev->hard_header_len + addend; + + if (set_mtu) { + dev->mtu = rt->dst.dev->mtu - addend; + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + } + ip6_rt_put(rt); + } +} + +static int ip6gre_tnl_change(struct ip6_tnl *t, + const struct __ip6_tnl_parm *p, int set_mtu) +{ + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; + t->parms.flags = p->flags; + t->parms.hop_limit = p->hop_limit; + t->parms.encap_limit = p->encap_limit; + t->parms.flowinfo = p->flowinfo; + t->parms.link = p->link; + t->parms.proto = p->proto; + t->parms.i_key = p->i_key; + t->parms.o_key = p->o_key; + t->parms.i_flags = p->i_flags; + t->parms.o_flags = p->o_flags; + ip6_tnl_dst_reset(t); + ip6gre_tnl_link_config(t, set_mtu); + return 0; +} + +static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, + const struct ip6_tnl_parm2 *u) +{ + p->laddr = u->laddr; + p->raddr = u->raddr; + p->flags = u->flags; + p->hop_limit = u->hop_limit; + p->encap_limit = u->encap_limit; + p->flowinfo = u->flowinfo; + p->link = u->link; + p->i_key = u->i_key; + p->o_key = u->o_key; + p->i_flags = u->i_flags; + p->o_flags = u->o_flags; + memcpy(p->name, u->name, sizeof(u->name)); +} + +static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, + const struct __ip6_tnl_parm *p) +{ + u->proto = IPPROTO_GRE; + u->laddr = p->laddr; + u->raddr = p->raddr; + u->flags = p->flags; + u->hop_limit = p->hop_limit; + u->encap_limit = p->encap_limit; + u->flowinfo = p->flowinfo; + u->link = p->link; + u->i_key = p->i_key; + u->o_key = p->o_key; + u->i_flags = p->i_flags; + u->o_flags = p->o_flags; + memcpy(u->name, p->name, sizeof(u->name)); +} + +static int ip6gre_tunnel_ioctl(struct net_device *dev, + struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip6_tnl_parm2 p; + struct __ip6_tnl_parm p1; + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ign->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + ip6gre_tnl_parm_from_user(&p1, &p); + t = ip6gre_tunnel_locate(net, &p1, 0); + if (t == NULL) + t = netdev_priv(dev); + } + memset(&p, 0, sizeof(p)); + ip6gre_tnl_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)) + goto done; + + if (!(p.i_flags&GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags&GRE_KEY)) + p.o_key = 0; + + ip6gre_tnl_parm_from_user(&p1, &p); + t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL); + + if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + t = netdev_priv(dev); + + ip6gre_tunnel_unlink(ign, t); + synchronize_net(); + ip6gre_tnl_change(t, &p1, 1); + ip6gre_tunnel_link(ign, t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + + memset(&p, 0, sizeof(p)); + ip6gre_tnl_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto done; + + if (dev == ign->fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + ip6gre_tnl_parm_from_user(&p1, &p); + t = ip6gre_tunnel_locate(net, &p1, 0); + if (t == NULL) + goto done; + err = -EPERM; + if (t == netdev_priv(ign->fb_tunnel_dev)) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static int ip6gre_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < 68 || + new_mtu > 0xFFF8 - dev->hard_header_len) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned int len) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen); + __be16 *p = (__be16 *)(ipv6h+1); + + ip6_flow_hdr(ipv6h, 0, t->fl.u.ip6.flowlabel); + ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->nexthdr = NEXTHDR_GRE; + ipv6h->saddr = t->parms.laddr; + ipv6h->daddr = t->parms.raddr; + + p[0] = t->parms.o_flags; + p[1] = htons(type); + + /* + * Set the source hardware address. + */ + + if (saddr) + memcpy(&ipv6h->saddr, saddr, sizeof(struct in6_addr)); + if (daddr) + memcpy(&ipv6h->daddr, daddr, sizeof(struct in6_addr)); + if (!ipv6_addr_any(&ipv6h->daddr)) + return t->hlen; + + return -t->hlen; +} + +static const struct header_ops ip6gre_header_ops = { + .create = ip6gre_header, +}; + +static const struct net_device_ops ip6gre_netdev_ops = { + .ndo_init = ip6gre_tunnel_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6gre_tunnel_xmit, + .ndo_do_ioctl = ip6gre_tunnel_ioctl, + .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +static void ip6gre_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static void ip6gre_tunnel_setup(struct net_device *dev) +{ + struct ip6_tnl *t; + + dev->netdev_ops = &ip6gre_netdev_ops; + dev->destructor = ip6gre_dev_free; + + dev->type = ARPHRD_IP6GRE; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr) + 4; + dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr) - 4; + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + dev->flags |= IFF_NOARP; + dev->iflink = 0; + dev->addr_len = sizeof(struct in6_addr); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int i; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); + + if (ipv6_addr_any(&tunnel->parms.raddr)) + dev->header_ops = &ip6gre_header_ops; + + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct pcpu_sw_netstats *ip6gre_tunnel_stats; + ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); + u64_stats_init(&ip6gre_tunnel_stats->syncp); + } + + + return 0; +} + +static void ip6gre_fb_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + tunnel->hlen = sizeof(struct ipv6hdr) + 4; + + dev_hold(dev); +} + + +static struct inet6_protocol ip6gre_protocol __read_mostly = { + .handler = ip6gre_rcv, + .err_handler = ip6gre_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +{ + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct net_device *dev, *aux; + int prio; + + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &ip6gre_link_ops || + dev->rtnl_link_ops == &ip6gre_tap_ops) + unregister_netdevice_queue(dev, head); + + for (prio = 0; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip6_tnl *t; + + t = rtnl_dereference(ign->tunnels[prio][h]); + + while (t != NULL) { + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, + head); + t = rtnl_dereference(t->next); + } + } + } +} + +static int __net_init ip6gre_init_net(struct net *net) +{ + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + int err; + + ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", + ip6gre_tunnel_setup); + if (!ign->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(ign->fb_tunnel_dev, net); + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + + + ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); + ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; + + err = register_netdev(ign->fb_tunnel_dev); + if (err) + goto err_reg_dev; + + rcu_assign_pointer(ign->tunnels_wc[0], + netdev_priv(ign->fb_tunnel_dev)); + return 0; + +err_reg_dev: + ip6gre_dev_free(ign->fb_tunnel_dev); +err_alloc_dev: + return err; +} + +static void __net_exit ip6gre_exit_net(struct net *net) +{ + LIST_HEAD(list); + + rtnl_lock(); + ip6gre_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations ip6gre_net_ops = { + .init = ip6gre_init_net, + .exit = ip6gre_exit_net, + .id = &ip6gre_net_id, + .size = sizeof(struct ip6gre_net), +}; + +static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + __be16 flags; + + if (!data) + return 0; + + flags = 0; + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (flags & (GRE_VERSION|GRE_ROUTING)) + return -EINVAL; + + return 0; +} + +static int ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + struct in6_addr daddr; + + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + if (!data) + goto out; + + if (data[IFLA_GRE_REMOTE]) { + nla_memcpy(&daddr, data[IFLA_GRE_REMOTE], sizeof(struct in6_addr)); + if (ipv6_addr_any(&daddr)) + return -EINVAL; + } + +out: + return ip6gre_tunnel_validate(tb, data); +} + + +static void ip6gre_netlink_parms(struct nlattr *data[], + struct __ip6_tnl_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_GRE_LINK]) + parms->link = nla_get_u32(data[IFLA_GRE_LINK]); + + if (data[IFLA_GRE_IFLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + + if (data[IFLA_GRE_OFLAGS]) + parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + + if (data[IFLA_GRE_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); + + if (data[IFLA_GRE_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); + + if (data[IFLA_GRE_LOCAL]) + nla_memcpy(&parms->laddr, data[IFLA_GRE_LOCAL], sizeof(struct in6_addr)); + + if (data[IFLA_GRE_REMOTE]) + nla_memcpy(&parms->raddr, data[IFLA_GRE_REMOTE], sizeof(struct in6_addr)); + + if (data[IFLA_GRE_TTL]) + parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]); + + if (data[IFLA_GRE_ENCAP_LIMIT]) + parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); + + if (data[IFLA_GRE_FLOWINFO]) + parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]); + + if (data[IFLA_GRE_FLAGS]) + parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); +} + +static int ip6gre_tap_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + + tunnel = netdev_priv(dev); + + tunnel->dev = dev; + tunnel->net = dev_net(dev); + strcpy(tunnel->parms.name, dev->name); + + ip6gre_tnl_link_config(tunnel, 1); + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static const struct net_device_ops ip6gre_tap_netdev_ops = { + .ndo_init = ip6gre_tap_init, + .ndo_uninit = ip6gre_tunnel_uninit, + .ndo_start_xmit = ip6gre_tunnel_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +static void ip6gre_tap_setup(struct net_device *dev) +{ + + ether_setup(dev); + + dev->netdev_ops = &ip6gre_tap_netdev_ops; + dev->destructor = ip6gre_dev_free; + + dev->iflink = 0; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static int ip6gre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ip6_tnl *nt; + struct net *net = dev_net(dev); + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + int err; + + nt = netdev_priv(dev); + ip6gre_netlink_parms(data, &nt->parms); + + if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) + return -EEXIST; + + if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) + eth_hw_addr_random(dev); + + nt->dev = dev; + nt->net = dev_net(dev); + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + + err = register_netdevice(dev); + if (err) + goto out; + + dev_hold(dev); + ip6gre_tunnel_link(ign, nt); + +out: + return err; +} + +static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip6_tnl *t, *nt = netdev_priv(dev); + struct net *net = nt->net; + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct __ip6_tnl_parm p; + + if (dev == ign->fb_tunnel_dev) + return -EINVAL; + + ip6gre_netlink_parms(data, &p); + + t = ip6gre_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else { + t = nt; + + ip6gre_tunnel_unlink(ign, t); + ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); + ip6gre_tunnel_link(ign, t); + netdev_state_change(dev); + } + + return 0; +} + +static void ip6gre_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + if (dev != ign->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + +static size_t ip6gre_get_size(const struct net_device *dev) +{ + return + /* IFLA_GRE_LINK */ + nla_total_size(4) + + /* IFLA_GRE_IFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_OFLAGS */ + nla_total_size(2) + + /* IFLA_GRE_IKEY */ + nla_total_size(4) + + /* IFLA_GRE_OKEY */ + nla_total_size(4) + + /* IFLA_GRE_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_GRE_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_GRE_TTL */ + nla_total_size(1) + + /* IFLA_GRE_TOS */ + nla_total_size(1) + + /* IFLA_GRE_ENCAP_LIMIT */ + nla_total_size(1) + + /* IFLA_GRE_FLOWINFO */ + nla_total_size(4) + + /* IFLA_GRE_FLAGS */ + nla_total_size(4) + + 0; +} + +static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct __ip6_tnl_parm *p = &t->parms; + + if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || + nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || + nla_put(skb, IFLA_GRE_LOCAL, sizeof(struct in6_addr), &p->laddr) || + nla_put(skb, IFLA_GRE_REMOTE, sizeof(struct in6_addr), &p->raddr) || + nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || + /*nla_put_u8(skb, IFLA_GRE_TOS, t->priority) ||*/ + nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || + nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || + nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { + [IFLA_GRE_LINK] = { .type = NLA_U32 }, + [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_IKEY] = { .type = NLA_U32 }, + [IFLA_GRE_OKEY] = { .type = NLA_U32 }, + [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct ipv6hdr, saddr) }, + [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct ipv6hdr, daddr) }, + [IFLA_GRE_TTL] = { .type = NLA_U8 }, + [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, + [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 }, + [IFLA_GRE_FLAGS] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { + .kind = "ip6gre", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6gre_tunnel_setup, + .validate = ip6gre_tunnel_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .dellink = ip6gre_dellink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, +}; + +static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { + .kind = "ip6gretap", + .maxtype = IFLA_GRE_MAX, + .policy = ip6gre_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6gre_tap_setup, + .validate = ip6gre_tap_validate, + .newlink = ip6gre_newlink, + .changelink = ip6gre_changelink, + .get_size = ip6gre_get_size, + .fill_info = ip6gre_fill_info, +}; + +/* + * And now the modules code and kernel interface. + */ + +static int __init ip6gre_init(void) +{ + int err; + + pr_info("GRE over IPv6 tunneling driver\n"); + + err = register_pernet_device(&ip6gre_net_ops); + if (err < 0) + return err; + + err = inet6_add_protocol(&ip6gre_protocol, IPPROTO_GRE); + if (err < 0) { + pr_info("%s: can't add protocol\n", __func__); + goto add_proto_failed; + } + + err = rtnl_link_register(&ip6gre_link_ops); + if (err < 0) + goto rtnl_link_failed; + + err = rtnl_link_register(&ip6gre_tap_ops); + if (err < 0) + goto tap_ops_failed; + +out: + return err; + +tap_ops_failed: + rtnl_link_unregister(&ip6gre_link_ops); +rtnl_link_failed: + inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); +add_proto_failed: + unregister_pernet_device(&ip6gre_net_ops); + goto out; +} + +static void __exit ip6gre_fini(void) +{ + rtnl_link_unregister(&ip6gre_tap_ops); + rtnl_link_unregister(&ip6gre_link_ops); + inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); + unregister_pernet_device(&ip6gre_net_ops); +} + +module_init(ip6gre_init); +module_exit(ip6gre_fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); +MODULE_ALIAS_RTNL_LINK("ip6gre"); +MODULE_ALIAS_NETDEV("ip6gre0"); diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c new file mode 100644 index 00000000000..4578e23834f --- /dev/null +++ b/net/ipv6/ip6_icmp.c @@ -0,0 +1,47 @@ +#include <linux/export.h> +#include <linux/icmpv6.h> +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/spinlock.h> + +#include <net/ipv6.h> + +#if IS_ENABLED(CONFIG_IPV6) + +static ip6_icmp_send_t __rcu *ip6_icmp_send; + +int inet6_register_icmp_sender(ip6_icmp_send_t *fn) +{ + return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? + 0 : -EBUSY; +} +EXPORT_SYMBOL(inet6_register_icmp_sender); + +int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) +{ + int ret; + + ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? + 0 : -EINVAL; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(inet6_unregister_icmp_sender); + +void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +{ + ip6_icmp_send_t *send; + + rcu_read_lock(); + send = rcu_dereference(ip6_icmp_send); + + if (!send) + goto out; + send(skb, type, code, info); +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(icmpv6_send); +#endif diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 237e2dba6e9..51d54dc376f 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -28,6 +28,7 @@ #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> +#include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -43,12 +44,19 @@ #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> +#include <net/inet_ecn.h> - -inline int ip6_rcv_finish( struct sk_buff *skb) +int ip6_rcv_finish(struct sk_buff *skb) { - if (skb_dst(skb) == NULL) + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { + const struct inet6_protocol *ipprot; + + ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); + if (ipprot && ipprot->early_demux) + ipprot->early_demux(skb); + } + if (!skb_dst(skb)) ip6_route_input(skb); return dst_input(skb); @@ -56,7 +64,7 @@ inline int ip6_rcv_finish( struct sk_buff *skb) int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct ipv6hdr *hdr; + const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; struct net *net = dev_net(skb->dev); @@ -101,6 +109,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + IP6_ADD_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_NOECTPKTS + + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * A packet received on an interface with a destination address @@ -110,6 +122,35 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt ipv6_addr_loopback(&hdr->daddr)) goto err; + /* RFC4291 Errata ID: 3480 + * Interface-Local scope spans only a single interface on a + * node and is useful only for loopback transmission of + * multicast. Packets with interface-local scope received + * from another node must be discarded. + */ + if (!(skb->pkt_type == PACKET_LOOPBACK || + dev->flags & IFF_LOOPBACK) && + ipv6_addr_is_multicast(&hdr->daddr) && + IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) + goto err; + + /* RFC4291 2.7 + * Nodes must not originate a packet to a multicast address whose scope + * field contains the reserved value 0; if such a packet is received, it + * must be silently dropped. + */ + if (ipv6_addr_is_multicast(&hdr->daddr) && + IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0) + goto err; + + /* + * RFC4291 2.7 + * Multicast addresses must not be used as source addresses in IPv6 + * packets or appear in any Routing header. + */ + if (ipv6_addr_is_multicast(&hdr->saddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); @@ -142,7 +183,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL, ip6_rcv_finish); err: IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); @@ -159,12 +200,12 @@ drop: static int ip6_input_finish(struct sk_buff *skb) { + struct net *net = dev_net(skb_dst(skb)->dev); const struct inet6_protocol *ipprot; - unsigned int nhoff; - int nexthdr, raw; - u8 hash; struct inet6_dev *idev; - struct net *net = dev_net(skb_dst(skb)->dev); + unsigned int nhoff; + int nexthdr; + bool raw; /* * Parse extension headers @@ -179,13 +220,11 @@ resubmit: nexthdr = skb_network_header(skb)[nhoff]; raw = raw6_local_deliver(skb, nexthdr); - - hash = nexthdr & (MAX_INET_PROTOS - 1); - if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { + if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) { int ret; if (ipprot->flags & INET6_PROTO_FINAL) { - struct ipv6hdr *hdr; + const struct ipv6hdr *hdr; /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded @@ -198,7 +237,7 @@ resubmit: if (ipv6_addr_is_multicast(&hdr->daddr) && !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, &hdr->saddr) && - !ipv6_is_mld(skb, nexthdr)) + !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && @@ -216,12 +255,13 @@ resubmit: IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INUNKNOWNPROTOS); icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_UNK_NEXTHDR, nhoff, - skb->dev); + ICMPV6_UNK_NEXTHDR, nhoff); } - } else + kfree_skb(skb); + } else { IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); + consume_skb(skb); + } } rcu_read_unlock(); return 0; @@ -236,14 +276,14 @@ discard: int ip6_input(struct sk_buff *skb) { - return NF_HOOK(PF_INET6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip6_input_finish); } int ip6_mc_input(struct sk_buff *skb) { - struct ipv6hdr *hdr; - int deliver; + const struct ipv6hdr *hdr; + bool deliver; IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, @@ -257,7 +297,8 @@ int ip6_mc_input(struct sk_buff *skb) * IPv6 multicast router mode is now supported ;) */ if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && - !(ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) && + !(ipv6_addr_type(&hdr->daddr) & + (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { /* * Okay, we try to forward - split and duplicate @@ -267,46 +308,31 @@ int ip6_mc_input(struct sk_buff *skb) struct inet6_skb_parm *opt = IP6CB(skb); /* Check for MLD */ - if (unlikely(opt->ra)) { + if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { /* Check if this is a mld message */ - u8 *ptr = skb_network_header(skb) + opt->ra; - struct icmp6hdr *icmp6; u8 nexthdr = hdr->nexthdr; + __be16 frag_off; int offset; /* Check if the value of Router Alert * is for MLD (0x0000). */ - if ((ptr[2] | ptr[3]) == 0) { - deliver = 0; + if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) { + deliver = false; if (!ipv6_ext_hdr(nexthdr)) { /* BUG */ goto out; } offset = ipv6_skip_exthdr(skb, sizeof(*hdr), - &nexthdr); + &nexthdr, &frag_off); if (offset < 0) goto out; - if (nexthdr != IPPROTO_ICMPV6) - goto out; - - if (!pskb_may_pull(skb, (skb_network_header(skb) + - offset + 1 - skb->data))) + if (!ipv6_is_mld(skb, nexthdr, offset)) goto out; - icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); - - switch (icmp6->icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - deliver = 1; - break; - } - goto out; + deliver = true; } /* unknown RA - process it normally */ } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c new file mode 100644 index 00000000000..65eda2a8af4 --- /dev/null +++ b/net/ipv6/ip6_offload.c @@ -0,0 +1,337 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/socket.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/printk.h> + +#include <net/protocol.h> +#include <net/ipv6.h> + +#include "ip6_offload.h" + +static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) +{ + const struct net_offload *ops = NULL; + + for (;;) { + struct ipv6_opt_hdr *opth; + int len; + + if (proto != NEXTHDR_HOP) { + ops = rcu_dereference(inet6_offloads[proto]); + + if (unlikely(!ops)) + break; + + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + + if (unlikely(!pskb_may_pull(skb, 8))) + break; + + opth = (void *)skb->data; + len = ipv6_optlen(opth); + + if (unlikely(!pskb_may_pull(skb, len))) + break; + + proto = opth->nexthdr; + __skb_pull(skb, len); + } + + return proto; +} + +static int ipv6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + const struct net_offload *ops; + int err = -EINVAL; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + goto out; + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + err = -EPROTONOSUPPORT; + + ops = rcu_dereference(inet6_offloads[ + ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); + + if (likely(ops && ops->callbacks.gso_send_check)) { + skb_reset_transport_header(skb); + err = ops->callbacks.gso_send_check(skb); + } + +out: + return err; +} + +static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct ipv6hdr *ipv6h; + const struct net_offload *ops; + int proto; + struct frag_hdr *fptr; + unsigned int unfrag_ip6hlen; + u8 *prevhdr; + int offset = 0; + bool encap, udpfrag; + int nhoff; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPIP | + SKB_GSO_SIT | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_MPLS | + SKB_GSO_TCPV6 | + 0))) + goto out; + + skb_reset_network_header(skb); + nhoff = skb_network_header(skb) - skb_mac_header(skb); + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + goto out; + + encap = SKB_GSO_CB(skb)->encap_level > 0; + if (encap) + features = skb->dev->hw_enc_features & netif_skb_features(skb); + SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + segs = ERR_PTR(-EPROTONOSUPPORT); + + proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); + + if (skb->encapsulation && + skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + udpfrag = proto == IPPROTO_UDP && encap; + else + udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; + + ops = rcu_dereference(inet6_offloads[proto]); + if (likely(ops && ops->callbacks.gso_segment)) { + skb_reset_transport_header(skb); + segs = ops->callbacks.gso_segment(skb, features); + } + + if (IS_ERR(segs)) + goto out; + + for (skb = segs; skb; skb = skb->next) { + ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); + ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h)); + skb->network_header = (u8 *)ipv6h - skb->head; + + if (udpfrag) { + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); + fptr->frag_off = htons(offset); + if (skb->next != NULL) + fptr->frag_off |= htons(IP6_MF); + offset += (ntohs(ipv6h->payload_len) - + sizeof(struct frag_hdr)); + } + if (encap) + skb_reset_inner_headers(skb); + } + +out: + return segs; +} + +/* Return the total length of all the extension hdrs, following the same + * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs. + */ +static int ipv6_exthdrs_len(struct ipv6hdr *iph, + const struct net_offload **opps) +{ + struct ipv6_opt_hdr *opth = (void *)iph; + int len = 0, proto, optlen = sizeof(*iph); + + proto = iph->nexthdr; + for (;;) { + if (proto != NEXTHDR_HOP) { + *opps = rcu_dereference(inet6_offloads[proto]); + if (unlikely(!(*opps))) + break; + if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + opth = (void *)opth + optlen; + optlen = ipv6_optlen(opth); + len += optlen; + proto = opth->nexthdr; + } + return len; +} + +static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_offload *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct ipv6hdr *iph; + unsigned int nlen; + unsigned int hlen; + unsigned int off; + u16 flush = 1; + int proto; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } + + skb_set_network_header(skb, off); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); + + flush += ntohs(iph->payload_len) != skb_gro_len(skb); + + rcu_read_lock(); + proto = iph->nexthdr; + ops = rcu_dereference(inet6_offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + ops = rcu_dereference(inet6_offloads[proto]); + if (!ops || !ops->callbacks.gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + NAPI_GRO_CB(skb)->proto = proto; + + flush--; + nlen = skb_network_header_len(skb); + + for (p = *head; p; p = p->next) { + const struct ipv6hdr *iph2; + __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + iph2 = (struct ipv6hdr *)(p->data + off); + first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; + + /* All fields must match except length and Traffic Class. + * XXX skbs on the gro_list have all been parsed and pulled + * already so we don't need to compare nlen + * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) + * memcmp() alone below is suffcient, right? + */ + if ((first_word & htonl(0xF00FFFFF)) || + memcmp(&iph->nexthdr, &iph2->nexthdr, + nlen - offsetof(struct ipv6hdr, nexthdr))) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + /* flush if Traffic Class fields are different */ + NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); + NAPI_GRO_CB(p)->flush |= flush; + } + + NAPI_GRO_CB(skb)->flush |= flush; + + skb_gro_postpull_rcsum(skb, iph, nlen); + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct net_offload *ops; + struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); + int err = -ENOSYS; + + iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); + + rcu_read_lock(); + + nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct packet_offload ipv6_packet_offload __read_mostly = { + .type = cpu_to_be16(ETH_P_IPV6), + .callbacks = { + .gso_send_check = ipv6_gso_send_check, + .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, + }, +}; + +static const struct net_offload sit_offload = { + .callbacks = { + .gso_send_check = ipv6_gso_send_check, + .gso_segment = ipv6_gso_segment, + }, +}; + +static int __init ipv6_offload_init(void) +{ + + if (tcpv6_offload_init() < 0) + pr_crit("%s: Cannot add TCP protocol offload\n", __func__); + if (udp_offload_init() < 0) + pr_crit("%s: Cannot add UDP protocol offload\n", __func__); + if (ipv6_exthdrs_offload_init() < 0) + pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); + + dev_add_offload(&ipv6_packet_offload); + + inet_add_offload(&sit_offload, IPPROTO_IPV6); + + return 0; +} + +fs_initcall(ipv6_offload_init); diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h new file mode 100644 index 00000000000..2e155c651b3 --- /dev/null +++ b/net/ipv6/ip6_offload.h @@ -0,0 +1,18 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef __ip6_offload_h +#define __ip6_offload_h + +int ipv6_exthdrs_offload_init(void); +int udp_offload_init(void); +int tcpv6_offload_init(void); + +#endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cd48801a8d6..45702b8cd14 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -37,6 +37,7 @@ #include <linux/tcp.h> #include <linux/route.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -55,77 +56,22 @@ #include <net/checksum.h> #include <linux/mroute6.h> -static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); - -int __ip6_local_out(struct sk_buff *skb) -{ - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; - ipv6_hdr(skb)->payload_len = htons(len); - - return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, - dst_output); -} - -int ip6_local_out(struct sk_buff *skb) -{ - int err; - - err = __ip6_local_out(skb); - if (likely(err == 1)) - err = dst_output(skb); - - return err; -} -EXPORT_SYMBOL_GPL(ip6_local_out); - -static int ip6_output_finish(struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - - if (dst->hh) - return neigh_hh_output(dst->hh, skb); - else if (dst->neighbour) - return dst->neighbour->output(skb); - - IP6_INC_STATS_BH(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); - kfree_skb(skb); - return -EINVAL; - -} - -/* dev_loopback_xmit for use with netfilter. */ -static int ip6_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(newskb)); - - netif_rx(newskb); - return 0; -} - - -static int ip6_output2(struct sk_buff *skb) +static int ip6_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; + struct neighbour *neigh; + struct in6_addr *nexthop; + int ret; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { - struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) && - ((mroute6_socket(dev_net(dev)) && + if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) && + ((mroute6_socket(dev_net(dev), skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr))) { @@ -135,9 +81,9 @@ static int ip6_output2(struct sk_buff *skb) is not supported in any case. */ if (newskb) - NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, - ip6_dev_loopback_xmit); + NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, + dev_loopback_xmit); if (ipv6_hdr(skb)->hop_limit == 0) { IP6_INC_STATS(dev_net(dev), idev, @@ -149,53 +95,74 @@ static int ip6_output2(struct sk_buff *skb) IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, skb->len); + + if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= + IPV6_ADDR_SCOPE_NODELOCAL && + !(dev->flags & IFF_LOOPBACK)) { + kfree_skb(skb); + return 0; + } + } + + rcu_read_lock_bh(); + nexthop = rt6_nexthop((struct rt6_info *)dst); + neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); + if (!IS_ERR(neigh)) { + ret = dst_neigh_output(dst, neigh, skb); + rcu_read_unlock_bh(); + return ret; } + rcu_read_unlock_bh(); - return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev, - ip6_output_finish); + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EINVAL; } -static inline int ip6_skb_dst_mtu(struct sk_buff *skb) +static int ip6_finish_output(struct sk_buff *skb) { - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; - - return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb)) || + (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) + return ip6_fragment(skb, ip6_finish_output2); + else + return ip6_finish_output2(skb); } -int ip6_output(struct sk_buff *skb) +int ip6_output(struct sock *sk, struct sk_buff *skb) { + struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); if (unlikely(idev->cnf.disable_ipv6)) { - IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev, + IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } - if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || - dst_allfrag(skb_dst(skb))) - return ip6_fragment(skb, ip6_output2); - else - return ip6_output2(skb); + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* - * xmit an sk_buff (used by TCP) + * xmit an sk_buff (used by TCP, SCTP and DCCP) */ -int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct ipv6_txoptions *opt, int ipfragok) +int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, + struct ipv6_txoptions *opt, int tclass) { struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *first_hop = &fl->fl6_dst; + struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr; - u8 proto = fl->proto; + u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; - int tclass = 0; u32 mtu; if (opt) { @@ -216,10 +183,9 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, kfree_skb(skb); return -ENOBUFS; } - kfree_skb(skb); + consume_skb(skb); skb = skb2; - if (sk) - skb_set_owner_w(skb, sk); + skb_set_owner_w(skb, sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -231,44 +197,37 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - /* Allow local fragmentation. */ - if (ipfragok) - skb->local_df = 1; - /* * Fill in the IPv6 header */ - if (np) { - tclass = np->tclass; + if (np) hlimit = np->hop_limit; - } if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); - *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel; + ip6_flow_hdr(hdr, tclass, fl6->flowlabel); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; - ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); - ipv6_addr_copy(&hdr->daddr, first_hop); + hdr->saddr = fl6->saddr; + hdr->daddr = *first_hop; + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; mtu = dst_mtu(dst); - if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { + if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); - return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, - dst_output); + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + dst->dev, dst_output); } - if (net_ratelimit()) - printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + ipv6_local_error(sk, EMSGSIZE, fl6, mtu); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; @@ -276,42 +235,6 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, EXPORT_SYMBOL(ip6_xmit); -/* - * To avoid extra problems ND packets are send through this - * routine. It's code duplication but I really want to avoid - * extra checks since ipv6_build_header is used by TCP (which - * is for us performance critical) - */ - -int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, - const struct in6_addr *saddr, const struct in6_addr *daddr, - int proto, int len) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6hdr *hdr; - int totlen; - - skb->protocol = htons(ETH_P_IPV6); - skb->dev = dev; - - totlen = len + sizeof(struct ipv6hdr); - - skb_reset_network_header(skb); - skb_put(skb, sizeof(struct ipv6hdr)); - hdr = ipv6_hdr(skb); - - *(__be32*)hdr = htonl(0x60000000); - - hdr->payload_len = htons(len); - hdr->nexthdr = proto; - hdr->hop_limit = np->hop_limit; - - ipv6_addr_copy(&hdr->saddr, saddr); - ipv6_addr_copy(&hdr->daddr, daddr); - - return 0; -} - static int ip6_call_ra_chain(struct sk_buff *skb, int sel) { struct ip6_ra_chain *ra; @@ -345,10 +268,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) { struct ipv6hdr *hdr = ipv6_hdr(skb); u8 nexthdr = hdr->nexthdr; + __be16 frag_off; int offset; if (ipv6_ext_hdr(nexthdr)) { - offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr); + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); if (offset < 0) return 0; } else @@ -397,21 +321,65 @@ static inline int ip6_forward_finish(struct sk_buff *skb) return dst_output(skb); } +static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + unsigned int mtu; + struct inet6_dev *idev; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + +static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ + if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) + return true; + + if (skb->ignore_df) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); + u32 mtu; if (net->ipv6.devconf_all->forwarding == 0) goto error; + if (skb->pkt_type != PACKET_HOST) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } @@ -430,9 +398,8 @@ int ip6_forward(struct sk_buff *skb) * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK */ - if (opt->ra) { - u8 *ptr = skb_network_header(skb) + opt->ra; - if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { + if (ip6_call_ra_chain(skb, ntohs(opt->ra))) return 0; } @@ -442,10 +409,9 @@ int ip6_forward(struct sk_buff *skb) if (hdr->hop_limit <= 1) { /* Force OUTPUT device used as source address */ skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, - 0, skb->dev); - IP6_INC_STATS_BH(net, - ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; @@ -458,14 +424,15 @@ int ip6_forward(struct sk_buff *skb) if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); @@ -474,11 +441,10 @@ int ip6_forward(struct sk_buff *skb) send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ - if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 && - !skb_sec_path(skb)) { + if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; + struct inet_peer *peer; struct rt6_info *rt; - struct neighbour *n = dst->neighbour; /* * incoming and outgoing devices are the same @@ -486,16 +452,20 @@ int ip6_forward(struct sk_buff *skb) */ rt = (struct rt6_info *) dst; - if ((rt->rt6i_flags & RTF_GATEWAY)) - target = (struct in6_addr*)&n->primary_key; + if (rt->rt6i_flags & RTF_GATEWAY) + target = &rt->rt6i_gateway; else target = &hdr->daddr; + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ - if (xrlim_allow(dst, 1*HZ)) - ndisc_send_redirect(skb, n, target); + if (inet_peer_xrlim_allow(peer, 1*HZ)) + ndisc_send_redirect(skb, target); + if (peer) + inet_putpeer(peer); } else { int addrtype = ipv6_addr_type(&hdr->saddr); @@ -505,25 +475,30 @@ int ip6_forward(struct sk_buff *skb) goto error; if (addrtype & IPV6_ADDR_LINKLOCAL) { icmpv6_send(skb, ICMPV6_DEST_UNREACH, - ICMPV6_NOT_NEIGHBOUR, 0, skb->dev); + ICMPV6_NOT_NEIGHBOUR, 0); goto error; } } - if (skb->len > dst_mtu(dst)) { + mtu = ip6_dst_mtu_forward(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev); - IP6_INC_STATS_BH(net, - ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); - IP6_INC_STATS_BH(net, - ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_INTOOBIGERRORS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } if (skb_cow(skb, dst->dev->hard_header_len)) { - IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), + IPSTATS_MIB_OUTDISCARDS); goto drop; } @@ -534,7 +509,8 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); - return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); error: @@ -558,53 +534,24 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) - to->nf_trace = from->nf_trace; -#endif skb_copy_secmark(to, from); } -int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) { - u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); - unsigned int packet_len = skb->tail - skb->network_header; - int found_rhdr = 0; - *nexthdr = &ipv6_hdr(skb)->nexthdr; - - while (offset + 1 <= packet_len) { + static u32 ip6_idents_hashrnd __read_mostly; + u32 hash, id; - switch (**nexthdr) { + net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - case NEXTHDR_HOP: - break; - case NEXTHDR_ROUTING: - found_rhdr = 1; - break; - case NEXTHDR_DEST: -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) - if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) - break; -#endif - if (found_rhdr) - return offset; - break; - default : - return offset; - } + hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); + hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash); - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + - offset); - } - - return offset; + id = ip_idents_reserve(hash, 1); + fhdr->identification = htonl(id); } -static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); @@ -612,6 +559,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; + int hroom, troom; __be32 frag_id = 0; int ptr, offset = 0, err=0; u8 *prevhdr, nexthdr = 0; @@ -623,12 +571,16 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) mtu = ip6_skb_dst_mtu(skb); /* We must not fragment if the socket is set to force MTU discovery - * or if the skb it not generated by a local socket. (This last - * check should be redundant, but it's free.) + * or if the skb it not generated by a local socket. */ - if (!skb->local_df) { + if (unlikely(!skb->ignore_df && skb->len > mtu) || + (IP6CB(skb)->frag_max_size && + IP6CB(skb)->frag_max_size > mtu)) { + if (skb->sk && dst_allfrag(skb_dst(skb))) + sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); + skb->dev = skb_dst(skb)->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); @@ -641,9 +593,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } mtu -= hlen + sizeof(struct frag_hdr); - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); - int truesizes = 0; + struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || @@ -655,18 +607,18 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen) - goto slow_path; + goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) - goto slow_path; + goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; - truesizes += frag->truesize; } + skb->truesize -= frag->truesize; } err = 0; @@ -689,7 +641,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb_reset_network_header(skb); memcpy(skb_network_header(skb), tmp_hdr, hlen); - ipv6_select_ident(fh); + ipv6_select_ident(fh, rt); fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(IP6_MF); @@ -697,12 +649,11 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); - skb->truesize -= truesizes; skb->len = first_len; ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); for (;;) { /* Prepare header of the next frame, @@ -730,7 +681,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) err = output(skb); if(!err) - IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst), + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); if (err || !frag) @@ -744,9 +695,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) kfree(tmp_hdr); if (err == 0) { - IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst), + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); - dst_release(&rt->u.dst); + ip6_rt_put(rt); return 0; } @@ -756,13 +707,26 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) frag = skb; } - IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst), + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); - dst_release(&rt->u.dst); + ip6_rt_put(rt); return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } } slow_path: + if ((skb->ip_summed == CHECKSUM_PARTIAL) && + skb_checksum_help(skb)) + goto fail; + left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ @@ -771,6 +735,8 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; + hroom = LL_RESERVED_SPACE(rt->dst.dev); + troom = rt->dst.dev->needed_tailroom; /* * Keep copying data until we run out. @@ -780,7 +746,7 @@ slow_path: /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) len = mtu; - /* IF: we are not sending upto and including the packet end + /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < left) { len &= ~7; @@ -789,7 +755,8 @@ slow_path: * Allocate buffer. */ - if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { + if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + + hroom + troom, GFP_ATOMIC)) == NULL) { NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); @@ -802,7 +769,7 @@ slow_path: */ ip6_copy_metadata(frag, skb); - skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(frag, hroom); skb_put(frag, len + hlen + sizeof(struct frag_hdr)); skb_reset_network_header(frag); fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); @@ -827,7 +794,7 @@ slow_path: fh->nexthdr = nexthdr; fh->reserved = 0; if (!frag_id) { - ipv6_select_ident(fh); + ipv6_select_ident(fh, rt); frag_id = fh->identification; } else fh->identification = frag_id; @@ -860,7 +827,7 @@ slow_path: } IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGOKS); - kfree_skb(skb); + consume_skb(skb); return err; fail: @@ -870,24 +837,30 @@ fail: return err; } -static inline int ip6_rt_check(struct rt6key *rt_key, - struct in6_addr *fl_addr, - struct in6_addr *addr_cache) +static inline int ip6_rt_check(const struct rt6key *rt_key, + const struct in6_addr *fl_addr, + const struct in6_addr *addr_cache) { - return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && - (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache))); + return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && + (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); } static struct dst_entry *ip6_sk_dst_check(struct sock *sk, struct dst_entry *dst, - struct flowi *fl) + const struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt; if (!dst) goto out; + if (dst->ops->family != AF_INET6) { + dst_release(dst); + return NULL; + } + + rt = (struct rt6_info *)dst; /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, @@ -905,11 +878,11 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (fl->oif && fl->oif != dst->dev->ifindex)) { + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { dst_release(dst); dst = NULL; } @@ -919,22 +892,26 @@ out: } static int ip6_dst_lookup_tail(struct sock *sk, - struct dst_entry **dst, struct flowi *fl) + struct dst_entry **dst, struct flowi6 *fl6) { - int err; struct net *net = sock_net(sk); +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + struct neighbour *n; + struct rt6_info *rt; +#endif + int err; if (*dst == NULL) - *dst = ip6_route_output(net, sk, fl); + *dst = ip6_route_output(net, sk, fl6); if ((err = (*dst)->error)) goto out_err_release; - if (ipv6_addr_any(&fl->fl6_src)) { - err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev, - &fl->fl6_dst, - sk ? inet6_sk(sk)->srcprefs : 0, - &fl->fl6_src); + if (ipv6_addr_any(&fl6->saddr)) { + struct rt6_info *rt = (struct rt6_info *) *dst; + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? inet6_sk(sk)->srcprefs : 0, + &fl6->saddr); if (err) goto out_err_release; } @@ -948,12 +925,18 @@ static int ip6_dst_lookup_tail(struct sock *sk, * dst entry and replace it instead with the * dst entry of the nexthop router */ - if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) { + rt = (struct rt6_info *) *dst; + rcu_read_lock_bh(); + n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt)); + err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; + rcu_read_unlock_bh(); + + if (err) { struct inet6_ifaddr *ifp; - struct flowi fl_gw; + struct flowi6 fl_gw6; int redirect; - ifp = ipv6_get_ifaddr(net, &fl->fl6_src, + ifp = ipv6_get_ifaddr(net, &fl6->saddr, (*dst)->dev, 1); redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); @@ -966,9 +949,9 @@ static int ip6_dst_lookup_tail(struct sock *sk, * default router instead */ dst_release(*dst); - memcpy(&fl_gw, fl, sizeof(struct flowi)); - memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr)); - *dst = ip6_route_output(net, sk, &fl_gw); + memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); + memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); + *dst = ip6_route_output(net, sk, &fl_gw6); if ((err = (*dst)->error)) goto out_err_release; } @@ -979,7 +962,7 @@ static int ip6_dst_lookup_tail(struct sock *sk, out_err_release: if (err == -ENETUNREACH) - IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); dst_release(*dst); *dst = NULL; return err; @@ -989,52 +972,88 @@ out_err_release: * ip6_dst_lookup - perform route lookup on flow * @sk: socket which provides route info * @dst: pointer to dst_entry * for result - * @fl: flow to lookup + * @fl6: flow to lookup * * This function performs a route lookup on the given flow. * * It returns zero on success, or a standard errno code on error. */ -int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { *dst = NULL; - return ip6_dst_lookup_tail(sk, dst, fl); + return ip6_dst_lookup_tail(sk, dst, fl6); } EXPORT_SYMBOL_GPL(ip6_dst_lookup); /** - * ip6_sk_dst_lookup - perform socket cached route lookup on flow + * ip6_dst_lookup_flow - perform route lookup on flow with ipsec + * @sk: socket which provides route info + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup + * + * This function performs a route lookup on the given flow. + * + * It returns a valid dst pointer on success, or a pointer encoded + * error code. + */ +struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst) +{ + struct dst_entry *dst = NULL; + int err; + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + fl6->daddr = *final_dst; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); +} +EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); + +/** + * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow * @sk: socket which provides the dst cache and route info - * @dst: pointer to dst_entry * for result - * @fl: flow to lookup + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. * It will take the socket dst lock when operating on the dst cache. * As a result, this function can only be used in process context. * - * It returns zero on success, or a standard errno code on error. + * It returns a valid dst pointer on success, or a pointer encoded + * error code. */ -int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst) { - *dst = NULL; - if (sk) { - *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); - *dst = ip6_sk_dst_check(sk, *dst, fl); - } + struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); + int err; - return ip6_dst_lookup_tail(sk, dst, fl); + dst = ip6_sk_dst_check(sk, dst, fl6); + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + fl6->daddr = *final_dst; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } -EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup); +EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, - int transhdrlen, int mtu,unsigned int flags) + int transhdrlen, int mtu,unsigned int flags, + struct rt6_info *rt) { struct sk_buff *skb; + struct frag_hdr fhdr; int err; /* There is support for UDP large send offload by network @@ -1046,7 +1065,7 @@ static inline int ip6_ufo_append_data(struct sock *sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); if (skb == NULL) - return -ENOMEM; + return err; /* reserve space for Hardware header */ skb_reserve(skb, hh_len); @@ -1060,34 +1079,27 @@ static inline int ip6_ufo_append_data(struct sock *sk, /* initialize protocol header pointer */ skb->transport_header = skb->network_header + fragheaderlen; - skb->ip_summed = CHECKSUM_PARTIAL; + skb->protocol = htons(ETH_P_IPV6); skb->csum = 0; - sk->sk_sndmsg_off = 0; - } - - err = skb_append_datato_frags(sk,skb, getfrag, from, - (length - transhdrlen)); - if (!err) { - struct frag_hdr fhdr; - /* Specify the length of each IPv6 datagram fragment. - * It has to be a multiple of 8. - */ - skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - - sizeof(struct frag_hdr)) & ~7; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - ipv6_select_ident(&fhdr); - skb_shinfo(skb)->ip6_frag_id = fhdr.identification; __skb_queue_tail(&sk->sk_write_queue, skb); - - return 0; + } else if (skb_is_gso(skb)) { + goto append; } - /* There is not enough support do UPD LSO, - * so follow normal path - */ - kfree_skb(skb); - return err; + skb->ip_summed = CHECKSUM_PARTIAL; + /* Specify the length of each IPv6 datagram fragment. + * It has to be a multiple of 8. + */ + skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - + sizeof(struct frag_hdr)) & ~7; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + ipv6_select_ident(&fhdr, rt); + skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + +append: + return skb_append_datato_frags(sk, skb, getfrag, from, + (length - transhdrlen)); } static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, @@ -1102,26 +1114,52 @@ static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; } +static void ip6_append_data_mtu(unsigned int *mtu, + int *maxfraglen, + unsigned int fragheaderlen, + struct sk_buff *skb, + struct rt6_info *rt, + unsigned int orig_mtu) +{ + if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { + if (skb == NULL) { + /* first fragment, reserve header_len */ + *mtu = orig_mtu - rt->dst.header_len; + + } else { + /* + * this fragment is not first, the headers + * space is regarded as data space. + */ + *mtu = orig_mtu; + } + *maxfraglen = ((*mtu - fragheaderlen) & ~7) + + fragheaderlen - sizeof(struct frag_hdr); + } +} + int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl, - struct rt6_info *rt, unsigned int flags) + int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, + struct rt6_info *rt, unsigned int flags, int dontfrag) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *skb; - unsigned int maxfraglen, fragheaderlen; + struct inet_cork *cork; + struct sk_buff *skb, *skb_prev = NULL; + unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; int exthdrlen; + int dst_exthdrlen; int hh_len; - int mtu; int copy; int err; int offset = 0; - int csummode = CHECKSUM_NONE; + __u8 tx_flags = 0; if (flags&MSG_PROBE) return 0; + cork = &inet->cork.base; if (skb_queue_empty(&sk->sk_write_queue)) { /* * setup for corking @@ -1130,7 +1168,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, if (WARN_ON(np->cork.opt)) return -EINVAL; - np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation); + np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); if (unlikely(np->cork.opt == NULL)) return -ENOBUFS; @@ -1160,49 +1198,83 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, /* need source address above miyazawa*/ } - dst_hold(&rt->u.dst); - inet->cork.dst = &rt->u.dst; - inet->cork.fl = *fl; + dst_hold(&rt->dst); + cork->dst = &rt->dst; + inet->cork.fl.u.ip6 = *fl6; np->cork.hop_limit = hlimit; np->cork.tclass = tclass; - mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? - rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); + if (rt->dst.flags & DST_XFRM_TUNNEL) + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); + else + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } - inet->cork.fragsize = mtu; - if (dst_allfrag(rt->u.dst.path)) - inet->cork.flags |= IPCORK_ALLFRAG; - inet->cork.length = 0; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; - exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) - - rt->rt6i_nfheader_len; + cork->fragsize = mtu; + if (dst_allfrag(rt->dst.path)) + cork->flags |= IPCORK_ALLFRAG; + cork->length = 0; + exthdrlen = (opt ? opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; + dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } else { - rt = (struct rt6_info *)inet->cork.dst; - fl = &inet->cork.fl; + rt = (struct rt6_info *)cork->dst; + fl6 = &inet->cork.fl.u.ip6; opt = np->cork.opt; transhdrlen = 0; exthdrlen = 0; - mtu = inet->cork.fragsize; + dst_exthdrlen = 0; + mtu = cork->fragsize; } + orig_mtu = mtu; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { - if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { - ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen); + unsigned int maxnonfragsize, headersize; + + headersize = sizeof(struct ipv6hdr) + + (opt ? opt->opt_flen + opt->opt_nflen : 0) + + (dst_allfrag(&rt->dst) ? + sizeof(struct frag_hdr) : 0) + + rt->rt6i_nfheader_len; + + if (ip6_sk_ignore_df(sk)) + maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; + else + maxnonfragsize = mtu; + + /* dontfrag active */ + if ((cork->length + length > mtu - headersize) && dontfrag && + (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_RAW)) { + ipv6_local_rxpmtu(sk, fl6, mtu - headersize + + sizeof(struct ipv6hdr)); + goto emsgsize; + } + + if (cork->length + length > maxnonfragsize - headersize) { +emsgsize: + ipv6_local_error(sk, EMSGSIZE, fl6, + mtu - headersize + + sizeof(struct ipv6hdr)); return -EMSGSIZE; } } + /* For UDP, check if TX timestamp is enabled */ + if (sk->sk_type == SOCK_DGRAM) + sock_tx_timestamp(sk, &tx_flags); + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1219,24 +1291,26 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, * --yoshfuji */ - inet->cork.length += length; - if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { - - err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len, - fragheaderlen, transhdrlen, mtu, - flags); + skb = skb_peek_tail(&sk->sk_write_queue); + cork->length += length; + if (((length > mtu) || + (skb && skb_is_gso(skb))) && + (sk->sk_protocol == IPPROTO_UDP) && + (rt->dst.dev->features & NETIF_F_UFO)) { + err = ip6_ufo_append_data(sk, getfrag, from, length, + hh_len, fragheaderlen, + transhdrlen, mtu, flags, rt); if (err) goto error; return 0; } - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ - copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; @@ -1246,38 +1320,46 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - struct sk_buff *skb_prev; alloc_new_skb: - skb_prev = skb; - /* There's no room in the current skb */ - if (skb_prev) - fraggap = skb_prev->len - maxfraglen; + if (skb) + fraggap = skb->len - maxfraglen; else fraggap = 0; + /* update mtu and maxfraglen if necessary */ + if (skb == NULL || skb_prev == NULL) + ip6_append_data_mtu(&mtu, &maxfraglen, + fragheaderlen, skb, rt, + orig_mtu); + + skb_prev = skb; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; - if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) - datalen = maxfraglen - fragheaderlen; - fraglen = datalen + fragheaderlen; + if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; if ((flags & MSG_MORE) && - !(rt->u.dst.dev->features&NETIF_F_SG)) + !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; - /* - * The last fragment gets additional space at tail. - * Note: we overallocate on fragments with MSG_MODE - * because we have no idea if we're the last one. - */ - if (datalen == length + fraggap) - alloclen += rt->u.dst.trailer_len; + alloclen += dst_exthdrlen; + + if (datalen != length + fraggap) { + /* + * this is not the last fragment, the trailer + * space is regarded as data space. + */ + datalen += rt->dst.trailer_len; + } + + alloclen += rt->dst.trailer_len; + fraglen = datalen + fragheaderlen; /* * We just reserve space for fragment header. @@ -1299,16 +1381,27 @@ alloc_new_skb: sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; + else { + /* Only the initial fragment + * is time stamped. + */ + tx_flags = 0; + } } if (skb == NULL) goto error; /* * Fill in the control structures */ - skb->ip_summed = csummode; + skb->protocol = htons(ETH_P_IPV6); + skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; - /* reserve for fragmentation */ - skb_reserve(skb, hh_len+sizeof(struct frag_hdr)); + /* reserve for fragmentation and ipsec header */ + skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + + dst_exthdrlen); + + if (sk->sk_type == SOCK_DGRAM) + skb_shinfo(skb)->tx_flags = tx_flags; /* * Find where to start putting bytes @@ -1328,6 +1421,7 @@ alloc_new_skb: pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; + if (copy < 0) { err = -EINVAL; kfree_skb(skb); @@ -1342,7 +1436,7 @@ alloc_new_skb: length -= datalen - fraggap; transhdrlen = 0; exthdrlen = 0; - csummode = CHECKSUM_NONE; + dst_exthdrlen = 0; /* * Put the packet on the pending queue @@ -1354,7 +1448,7 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; @@ -1366,46 +1460,31 @@ alloc_new_skb: } } else { int i = skb_shinfo(skb)->nr_frags; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = sk->sk_sndmsg_page; - int off = sk->sk_sndmsg_off; - unsigned int left; - - if (page && (left = PAGE_SIZE - off) > 0) { - if (copy >= left) - copy = left; - if (page != frag->page) { - if (i == MAX_SKB_FRAGS) { - err = -EMSGSIZE; - goto error; - } - get_page(page); - skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); - frag = &skb_shinfo(skb)->frags[i]; - } - } else if(i < MAX_SKB_FRAGS) { - if (copy > PAGE_SIZE) - copy = PAGE_SIZE; - page = alloc_pages(sk->sk_allocation, 0); - if (page == NULL) { - err = -ENOMEM; - goto error; - } - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; + struct page_frag *pfrag = sk_page_frag(sk); - skb_fill_page_desc(skb, i, page, 0, 0); - frag = &skb_shinfo(skb)->frags[i]; - } else { - err = -EMSGSIZE; - goto error; - } - if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { - err = -EFAULT; + err = -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) goto error; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + err = -EMSGSIZE; + if (i == MAX_SKB_FRAGS) + goto error; + + __skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, 0); + skb_shinfo(skb)->nr_frags = ++i; + get_page(pfrag->page); } - sk->sk_sndmsg_off += copy; - frag->size += copy; + copy = min_t(int, copy, pfrag->size - pfrag->offset); + if (getfrag(from, + page_address(pfrag->page) + pfrag->offset, + offset, copy, skb->len, skb) < 0) + goto error_efault; + + pfrag->offset += copy; + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; @@ -1414,12 +1493,17 @@ alloc_new_skb: offset += copy; length -= copy; } + return 0; + +error_efault: + err = -EFAULT; error: - inet->cork.length -= length; + cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); return err; } +EXPORT_SYMBOL_GPL(ip6_append_data); static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) { @@ -1432,10 +1516,10 @@ static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) np->cork.opt = NULL; } - if (inet->cork.dst) { - dst_release(inet->cork.dst); - inet->cork.dst = NULL; - inet->cork.flags &= ~IPCORK_ALLFRAG; + if (inet->cork.base.dst) { + dst_release(inet->cork.base.dst); + inet->cork.base.dst = NULL; + inet->cork.base.flags &= ~IPCORK_ALLFRAG; } memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); } @@ -1450,9 +1534,9 @@ int ip6_push_pending_frames(struct sock *sk) struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = np->cork.opt; - struct rt6_info *rt = (struct rt6_info *)inet->cork.dst; - struct flowi *fl = &inet->cork.fl; - unsigned char proto = fl->proto; + struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst; + struct flowi6 *fl6 = &inet->cork.fl.u.ip6; + unsigned char proto = fl6->flowi6_proto; int err = 0; if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) @@ -1474,10 +1558,9 @@ int ip6_push_pending_frames(struct sock *sk) } /* Allow local fragmentation. */ - if (np->pmtudisc < IPV6_PMTUDISC_DO) - skb->local_df = 1; + skb->ignore_df = ip6_sk_ignore_df(sk); - ipv6_addr_copy(final_dst, &fl->fl6_dst); + *final_dst = fl6->daddr; __skb_pull(skb, skb_network_header_len(skb)); if (opt && opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); @@ -1488,24 +1571,22 @@ int ip6_push_pending_frames(struct sock *sk) skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - *(__be32*)hdr = fl->fl6_flowlabel | - htonl(0x60000000 | ((int)np->cork.tclass << 20)); - + ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); hdr->hop_limit = np->cork.hop_limit; hdr->nexthdr = proto; - ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); - ipv6_addr_copy(&hdr->daddr, final_dst); + hdr->saddr = fl6->saddr; + hdr->daddr = *final_dst; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set(skb, dst_clone(&rt->dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type); - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); + ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } err = ip6_local_out(skb); @@ -1523,6 +1604,7 @@ error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); goto out; } +EXPORT_SYMBOL_GPL(ip6_push_pending_frames); void ip6_flush_pending_frames(struct sock *sk) { @@ -1537,3 +1619,4 @@ void ip6_flush_pending_frames(struct sock *sk) ip6_cork_release(inet_sk(sk), inet6_sk(sk)); } +EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d453d07b0df..afa08245836 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -18,6 +18,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> @@ -27,7 +29,6 @@ #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> -#include <linux/if_tunnel.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> @@ -37,12 +38,16 @@ #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/etherdevice.h> #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> +#include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> @@ -56,44 +61,78 @@ MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); - -#define IPV6_TLV_TEL_DST_SIZE 8 +MODULE_ALIAS_RTNL_LINK("ip6tnl"); +MODULE_ALIAS_NETDEV("ip6tnl0"); #ifdef IP6_TNL_DEBUG -#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__) +#define IP6_TNL_TRACE(x...) pr_debug("%s:" x "\n", __func__) #else #define IP6_TNL_TRACE(x...) do {;} while(0) #endif -#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) -#define IPV6_TCLASS_SHIFT 20 +#define HASH_SIZE_SHIFT 5 +#define HASH_SIZE (1 << HASH_SIZE_SHIFT) -#define HASH_SIZE 32 +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ - (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ - (HASH_SIZE - 1)) +static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) +{ + u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); -static void ip6_fb_tnl_dev_init(struct net_device *dev); -static void ip6_tnl_dev_init(struct net_device *dev); + return hash_32(hash, HASH_SIZE_SHIFT); +} + +static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); +static struct rtnl_link_ops ip6_link_ops __read_mostly; static int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ - struct ip6_tnl *tnls_r_l[HASH_SIZE]; - struct ip6_tnl *tnls_wc[1]; - struct ip6_tnl **tnls[2]; + struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_wc[1]; + struct ip6_tnl __rcu **tnls[2]; }; +static struct net_device_stats *ip6_get_stats(struct net_device *dev) +{ + struct pcpu_sw_netstats tmp, sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + unsigned int start; + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&tstats->syncp); + tmp.rx_packets = tstats->rx_packets; + tmp.rx_bytes = tstats->rx_bytes; + tmp.tx_packets = tstats->tx_packets; + tmp.tx_bytes = tstats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); + + sum.rx_packets += tmp.rx_packets; + sum.rx_bytes += tmp.rx_bytes; + sum.tx_packets += tmp.tx_packets; + sum.tx_bytes += tmp.tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ip6_tnl_lock); -static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) { struct dst_entry *dst = t->dst_cache; @@ -106,20 +145,23 @@ static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) return dst; } +EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); -static inline void ip6_tnl_dst_reset(struct ip6_tnl *t) +void ip6_tnl_dst_reset(struct ip6_tnl *t) { dst_release(t->dst_cache); t->dst_cache = NULL; } +EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; dst_release(t->dst_cache); t->dst_cache = dst; } +EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -136,14 +178,13 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) static struct ip6_tnl * -ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local) +ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int hash = HASH(remote, local); struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) { + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) @@ -167,17 +208,17 @@ ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local) * Return: head of IPv6 tunnel list **/ -static struct ip6_tnl ** -ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p) +static struct ip6_tnl __rcu ** +ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { - struct in6_addr *remote = &p->raddr; - struct in6_addr *local = &p->laddr; - unsigned h = 0; + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; - h = HASH(remote) ^ HASH(local); + h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } @@ -190,12 +231,10 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p) static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { - struct ip6_tnl **tp = ip6_tnl_bucket(ip6n, &t->parms); + struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); - spin_lock_bh(&ip6_tnl_lock); - t->next = *tp; + rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ip6_tnl_lock); } /** @@ -206,20 +245,54 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { - struct ip6_tnl **tp; - - for (tp = ip6_tnl_bucket(ip6n, &t->parms); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ip6_tnl_lock); - *tp = t->next; - spin_unlock_bh(&ip6_tnl_lock); + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = ip6_tnl_bucket(ip6n, &t->parms); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } } +static void ip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static int ip6_tnl_create2(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err; + + t = netdev_priv(dev); + err = ip6_tnl_dev_init(dev); + if (err < 0) + goto out; + + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(t->parms.name, dev->name); + dev->rtnl_link_ops = &ip6_link_ops; + + dev_hold(dev); + ip6_tnl_link(ip6n, t); + return 0; + +out: + return err; +} + /** - * ip6_tnl_create() - create a new tunnel + * ip6_tnl_create - create a new tunnel * @p: tunnel parameters * @pt: pointer to new tunnel * @@ -230,13 +303,12 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) * created tunnel or NULL **/ -static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) +static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err; - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (p->name[0]) strlcpy(name, p->name, IFNAMSIZ); @@ -249,24 +321,17 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) dev_net_set(dev, net); - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } - t = netdev_priv(dev); t->parms = *p; - ip6_tnl_dev_init(dev); - - if ((err = register_netdevice(dev)) < 0) + t->net = dev_net(dev); + err = ip6_tnl_create2(dev); + if (err < 0) goto failed_free; - dev_hold(dev); - ip6_tnl_link(ip6n, t); return t; failed_free: - free_netdev(dev); + ip6_dev_free(dev); failed: return NULL; } @@ -286,14 +351,17 @@ failed: **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, - struct ip6_tnl_parm *p, int create) + struct __ip6_tnl_parm *p, int create) { - struct in6_addr *remote = &p->raddr; - struct in6_addr *local = &p->laddr; + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - for (t = *ip6_tnl_bucket(ip6n, p); t; t = t->next) { + for (tp = ip6_tnl_bucket(ip6n, p); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) return t; @@ -315,16 +383,13 @@ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - if (dev == ip6n->fb_tnl_dev) { - spin_lock_bh(&ip6_tnl_lock); - ip6n->tnls_wc[0] = NULL; - spin_unlock_bh(&ip6_tnl_lock); - } else { + if (dev == ip6n->fb_tnl_dev) + RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); + else ip6_tnl_unlink(ip6n, t); - } ip6_tnl_dst_reset(t); dev_put(dev); } @@ -338,10 +403,9 @@ ip6_tnl_dev_uninit(struct net_device *dev) * else index to encapsulation limit **/ -static __u16 -parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) +__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { - struct ipv6hdr *ipv6h = (struct ipv6hdr *) raw; + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; __u8 nexthdr = ipv6h->nexthdr; __u16 off = sizeof (*ipv6h); @@ -389,6 +453,7 @@ parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) } return 0; } +EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /** * ip6_tnl_err - tunnel error handler @@ -402,7 +467,7 @@ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { - struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data; + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data; struct ip6_tnl *t; int rel_msg = 0; u8 rel_type = ICMPV6_DEST_UNREACH; @@ -430,41 +495,32 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Path to destination invalid " - "or inactive!\n", t->parms.name); + net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Too small hop limit or " - "routing loop in tunnel!\n", - t->parms.name); + net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: teli = 0; if ((*code) == ICMPV6_HDR_FIELD) - teli = parse_tlv_tnl_enc_lim(skb, skb->data); + teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Too small encapsulation " - "limit or routing loop in " - "tunnel!\n", t->parms.name); + net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } - } else if (net_ratelimit()) { - printk(KERN_WARNING - "%s: Recipient unable to parse tunneled " - "packet!\n ", t->parms.name); + } else { + net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -502,9 +558,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, __u32 rel_info = ntohl(info); int err; struct sk_buff *skb2; - struct iphdr *eiph; - struct flowi fl; + const struct iphdr *eiph; struct rtable *rt; + struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); @@ -527,6 +583,9 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; + case NDISC_REDIRECT: + rel_type = ICMP_REDIRECT; + rel_code = ICMP_REDIR_HOST; default: return 0; } @@ -545,28 +604,31 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, eiph = ip_hdr(skb2); /* Try to guess incoming interface */ - memset(&fl, 0, sizeof(fl)); - fl.fl4_dst = eiph->saddr; - fl.fl4_tos = RT_TOS(eiph->tos); - fl.proto = IPPROTO_IPIP; - if (ip_route_output_key(dev_net(skb->dev), &rt, &fl)) + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->saddr, 0, + 0, 0, + IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + if (IS_ERR(rt)) goto out; - skb2->dev = rt->u.dst.dev; + skb2->dev = rt->dst.dev; /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); rt = NULL; - fl.fl4_dst = eiph->daddr; - fl.fl4_src = eiph->saddr; - fl.fl4_tos = eiph->tos; - if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) || - rt->u.dst.dev->type != ARPHRD_TUNNEL) { - ip_rt_put(rt); + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->daddr, eiph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(eiph->tos), 0); + if (IS_ERR(rt) || + rt->dst.dev->type != ARPHRD_TUNNEL) { + if (!IS_ERR(rt)) + ip_rt_put(rt); goto out; } - skb_dst_set(skb2, (struct dst_entry *)rt); + skb_dst_set(skb2, &rt->dst); } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, @@ -580,8 +642,10 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info); + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info); } + if (rel_type == ICMP_REDIRECT) + skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2); icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -620,13 +684,12 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); - if (rt && rt->rt6i_dev) - skb2->dev = rt->rt6i_dev; + if (rt && rt->dst.dev) + skb2->dev = rt->dst.dev; - icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); + icmpv6_send(skb2, rel_type, rel_code, rel_info); - if (rt) - dst_release(&rt->u.dst); + ip6_rt_put(rt); kfree_skb(skb2); } @@ -634,51 +697,77 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } -static void ip4ip6_dscp_ecn_decapsulate(struct ip6_tnl *t, - struct ipv6hdr *ipv6h, - struct sk_buff *skb) +static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); - if (INET_ECN_is_ce(dsfield)) - IP_ECN_set_ce(ip_hdr(skb)); + return IP6_ECN_decapsulate(ipv6h, skb); } -static void ip6ip6_dscp_ecn_decapsulate(struct ip6_tnl *t, - struct ipv6hdr *ipv6h, - struct sk_buff *skb) +static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); - if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h))) - IP6_ECN_set_ce(ipv6_hdr(skb)); + return IP6_ECN_decapsulate(ipv6h, skb); +} + +__u32 ip6_tnl_get_cap(struct ip6_tnl *t, + const struct in6_addr *laddr, + const struct in6_addr *raddr) +{ + struct __ip6_tnl_parm *p = &t->parms; + int ltype = ipv6_addr_type(laddr); + int rtype = ipv6_addr_type(raddr); + __u32 flags = 0; + + if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { + flags = IP6_TNL_F_CAP_PER_PACKET; + } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && + (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { + if (ltype&IPV6_ADDR_UNICAST) + flags |= IP6_TNL_F_CAP_XMIT; + if (rtype&IPV6_ADDR_UNICAST) + flags |= IP6_TNL_F_CAP_RCV; + } + return flags; } +EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ -static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) +int ip6_tnl_rcv_ctl(struct ip6_tnl *t, + const struct in6_addr *laddr, + const struct in6_addr *raddr) { - struct ip6_tnl_parm *p = &t->parms; + struct __ip6_tnl_parm *p = &t->parms; int ret = 0; - struct net *net = dev_net(t->dev); + struct net *net = t->net; - if (p->flags & IP6_TNL_F_CAP_RCV) { + if ((p->flags & IP6_TNL_F_CAP_RCV) || + ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && + (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); - if ((ipv6_addr_is_multicast(&p->laddr) || - likely(ipv6_chk_addr(net, &p->laddr, ldev, 0))) && - likely(!ipv6_chk_addr(net, &p->raddr, NULL, 0))) + if ((ipv6_addr_is_multicast(laddr) || + likely(ipv6_chk_addr(net, laddr, ldev, 0))) && + likely(!ipv6_chk_addr(net, raddr, NULL, 0))) ret = 1; - } return ret; } +EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); /** * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally @@ -691,17 +780,20 @@ static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, __u8 ipproto, - void (*dscp_ecn_decapsulate)(struct ip6_tnl *t, - struct ipv6hdr *ipv6h, - struct sk_buff *skb)) + int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb)) { struct ip6_tnl *t; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int err; rcu_read_lock(); if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr)) != NULL) { + struct pcpu_sw_netstats *tstats; + if (t->parms.proto != ipproto && t->parms.proto != 0) { rcu_read_unlock(); goto discard; @@ -712,26 +804,40 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, goto discard; } - if (!ip6_tnl_rcv_ctl(t)) { + if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { t->dev->stats.rx_dropped++; rcu_read_unlock(); goto discard; } - secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->protocol = htons(protocol); - skb->pkt_type = PACKET_HOST; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - skb->dev = t->dev; - skb_dst_drop(skb); - nf_reset(skb); - dscp_ecn_decapsulate(t, ipv6h, skb); + __skb_tunnel_rx(skb, t->dev, t->net); + + err = dscp_ecn_decapsulate(t, ipv6h, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", + &ipv6h->saddr, + ipv6_get_dsfield(ipv6h)); + if (err > 1) { + ++t->dev->stats.rx_frame_errors; + ++t->dev->stats.rx_errors; + rcu_read_unlock(); + goto discard; + } + } + + tstats = this_cpu_ptr(t->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); - t->dev->stats.rx_packets++; - t->dev->stats.rx_bytes += skb->len; netif_rx(skb); + rcu_read_unlock(); return 0; } @@ -788,17 +894,17 @@ static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) * 0 else **/ -static inline int -ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) +static inline bool +ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } -static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) +int ip6_tnl_xmit_ctl(struct ip6_tnl *t) { - struct ip6_tnl_parm *p = &t->parms; + struct __ip6_tnl_parm *p = &t->parms; int ret = 0; - struct net *net = dev_net(t->dev); + struct net *net = t->net; if (p->flags & IP6_TNL_F_CAP_XMIT) { struct net_device *ldev = NULL; @@ -808,21 +914,20 @@ static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0))) - printk(KERN_WARNING - "%s xmit: Local address not yet configured!\n", - p->name); + pr_warn("%s xmit: Local address not yet configured!\n", + p->name); else if (!ipv6_addr_is_multicast(&p->raddr) && unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0))) - printk(KERN_WARNING - "%s xmit: Routing loop! " - "Remote address found on this node!\n", - p->name); + pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", + p->name); else ret = 1; rcu_read_unlock(); } return ret; } +EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); + /** * ip6_tnl_xmit2 - encapsulate packet and send * @skb: the outgoing socket buffer @@ -845,40 +950,44 @@ static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) static int ip6_tnl_xmit2(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, - struct flowi *fl, + struct flowi6 *fl6, int encap_limit, __u32 *pmtu) { - struct net *net = dev_net(dev); struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct net_device_stats *stats = &t->dev->stats; struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct ipv6_tel_txoption opt; - struct dst_entry *dst; + struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int max_headroom = sizeof(struct ipv6hdr); u8 proto; int err = -1; - int pkt_len; - if ((dst = ip6_tnl_dst_check(t)) != NULL) - dst_hold(dst); - else { - dst = ip6_route_output(net, NULL, fl); + if (!fl6->flowi6_mark) + dst = ip6_tnl_dst_check(t); + if (!dst) { + ndst = ip6_route_output(net, NULL, fl6); - if (dst->error || xfrm_lookup(net, &dst, fl, NULL, 0) < 0) + if (ndst->error) goto tx_err_link_failure; + ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); + ndst = NULL; + goto tx_err_link_failure; + } + dst = ndst; } tdev = dst->dev; if (tdev == dev) { stats->collisions++; - if (net_ratelimit()) - printk(KERN_WARNING - "%s: Local routing loop detected!\n", - t->parms.name); + net_warn_ratelimited("%s: Local routing loop detected!\n", + t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - sizeof (*ipv6h); @@ -889,13 +998,15 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + /* * Okay, now see if we can stuff it in the buffer as-is. */ @@ -910,47 +1021,45 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, if (skb->sk) skb_set_owner_w(new_skb, skb->sk); - kfree_skb(skb); + consume_skb(skb); skb = new_skb; } - skb_dst_drop(skb); - skb_dst_set(skb, dst_clone(dst)); - + if (fl6->flowi6_mark) { + skb_dst_set(skb, dst); + ndst = NULL; + } else { + skb_dst_set_noref(skb, dst); + } skb->transport_header = skb->network_header; - proto = fl->proto; + proto = fl6->flowi6_proto; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); - *(__be32*)ipv6h = fl->fl6_flowlabel | htonl(0x60000000); - dsfield = INET_ECN_encapsulate(0, dsfield); - ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), fl6->flowlabel); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; - ipv6_addr_copy(&ipv6h->saddr, &fl->fl6_src); - ipv6_addr_copy(&ipv6h->daddr, &fl->fl6_dst); - nf_reset(skb); - pkt_len = skb->len; - err = ip6_local_out(skb); - - if (net_xmit_eval(err) == 0) { - stats->tx_bytes += pkt_len; - stats->tx_packets++; - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } - ip6_tnl_dst_store(t, dst); + ipv6h->saddr = fl6->saddr; + ipv6h->daddr = fl6->daddr; + ip6tunnel_xmit(skb, dev); + if (ndst) + ip6_tnl_dst_store(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(dst); + dst_release(ndst); return err; } @@ -958,9 +1067,9 @@ static inline int ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); int encap_limit = -1; - struct flowi fl; + struct flowi6 fl6; __u8 dsfield; __u32 mtu; int err; @@ -972,16 +1081,18 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl, &t->fl, sizeof (fl)); - fl.proto = IPPROTO_IPIP; + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) - fl.fl6_flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu); + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) @@ -1000,7 +1111,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; __u16 offset; - struct flowi fl; + struct flowi6 fl6; __u8 dsfield; __u32 mtu; int err; @@ -1009,32 +1120,34 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h)) return -1; - offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb)); + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2, skb->dev); + ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; - memcpy(&fl, &t->fl, sizeof (fl)); - fl.proto = IPPROTO_IPV6; + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) - fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); - if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) - fl.fl6_flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); - - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl, encap_limit, &mtu); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); if (err != 0) { if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -1; } @@ -1071,46 +1184,28 @@ tx_err: return NETDEV_TX_OK; } -static void ip6_tnl_set_cap(struct ip6_tnl *t) -{ - struct ip6_tnl_parm *p = &t->parms; - int ltype = ipv6_addr_type(&p->laddr); - int rtype = ipv6_addr_type(&p->raddr); - - p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV); - - if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && - rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && - !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && - (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { - if (ltype&IPV6_ADDR_UNICAST) - p->flags |= IP6_TNL_F_CAP_XMIT; - if (rtype&IPV6_ADDR_UNICAST) - p->flags |= IP6_TNL_F_CAP_RCV; - } -} - static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; - struct ip6_tnl_parm *p = &t->parms; - struct flowi *fl = &t->fl; + struct __ip6_tnl_parm *p = &t->parms; + struct flowi6 *fl6 = &t->fl.u.ip6; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ - ipv6_addr_copy(&fl->fl6_src, &p->laddr); - ipv6_addr_copy(&fl->fl6_dst, &p->raddr); - fl->oif = p->link; - fl->fl6_flowlabel = 0; + fl6->saddr = p->laddr; + fl6->daddr = p->raddr; + fl6->flowi6_oif = p->link; + fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) - fl->fl6_flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) - fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; - ip6_tnl_set_cap(t); + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); + p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; @@ -1123,23 +1218,25 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); - struct rt6_info *rt = rt6_lookup(dev_net(dev), + struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, strict); if (rt == NULL) return; - if (rt->rt6i_dev) { - dev->hard_header_len = rt->rt6i_dev->hard_header_len + + if (rt->dst.dev) { + dev->hard_header_len = rt->dst.dev->hard_header_len + sizeof (struct ipv6hdr); - dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr); + dev->mtu = rt->dst.dev->mtu - sizeof (struct ipv6hdr); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } - dst_release(&rt->u.dst); + ip6_rt_put(rt); } } @@ -1153,10 +1250,10 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) **/ static int -ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { - ipv6_addr_copy(&t->parms.laddr, &p->laddr); - ipv6_addr_copy(&t->parms.raddr, &p->raddr); + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; @@ -1168,6 +1265,48 @@ ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) return 0; } +static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +{ + struct net *net = t->net; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err; + + ip6_tnl_unlink(ip6n, t); + synchronize_net(); + err = ip6_tnl_change(t, p); + ip6_tnl_link(ip6n, t); + netdev_state_change(t->dev); + return err; +} + +static void +ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) +{ + p->laddr = u->laddr; + p->raddr = u->raddr; + p->flags = u->flags; + p->hop_limit = u->hop_limit; + p->encap_limit = u->encap_limit; + p->flowinfo = u->flowinfo; + p->link = u->link; + p->proto = u->proto; + memcpy(p->name, u->name, sizeof(u->name)); +} + +static void +ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) +{ + u->laddr = p->laddr; + u->raddr = p->raddr; + u->flags = p->flags; + u->hop_limit = p->hop_limit; + u->encap_limit = p->encap_limit; + u->flowinfo = p->flowinfo; + u->link = p->link; + u->proto = p->proto; + memcpy(u->name, p->name, sizeof(u->name)); +} + /** * ip6_tnl_ioctl - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel @@ -1201,8 +1340,9 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip6_tnl_parm p; - struct ip6_tnl *t = NULL; - struct net *net = dev_net(dev); + struct __ip6_tnl_parm p1; + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); switch (cmd) { @@ -1212,11 +1352,14 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = -EFAULT; break; } - t = ip6_tnl_locate(net, &p, 0); + ip6_tnl_parm_from_user(&p1, &p); + t = ip6_tnl_locate(net, &p1, 0); + if (t == NULL) + t = netdev_priv(dev); + } else { + memset(&p, 0, sizeof(p)); } - if (t == NULL) - t = netdev_priv(dev); - memcpy(&p, &t->parms, sizeof (p)); + ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { err = -EFAULT; } @@ -1224,7 +1367,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) @@ -1233,7 +1376,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; - t = ip6_tnl_locate(net, &p, cmd == SIOCADDTUNNEL); + ip6_tnl_parm_from_user(&p1, &p); + t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { if (t != NULL) { if (t->dev != dev) { @@ -1243,14 +1387,12 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } else t = netdev_priv(dev); - ip6_tnl_unlink(ip6n, t); - err = ip6_tnl_change(t, &p); - ip6_tnl_link(ip6n, t); - netdev_state_change(dev); + err = ip6_tnl_update(t, &p1); } if (t) { err = 0; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p))) + ip6_tnl_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; } else @@ -1258,7 +1400,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { @@ -1266,7 +1408,9 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) break; err = -ENOENT; - if ((t = ip6_tnl_locate(net, &p, 0)) == NULL) + ip6_tnl_parm_from_user(&p1, &p); + t = ip6_tnl_locate(net, &p1, 0); + if (t == NULL) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) @@ -1295,21 +1439,31 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) static int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { - if (new_mtu < IPV6_MIN_MTU) { - return -EINVAL; + struct ip6_tnl *tnl = netdev_priv(dev); + + if (tnl->parms.proto == IPPROTO_IPIP) { + if (new_mtu < 68) + return -EINVAL; + } else { + if (new_mtu < IPV6_MIN_MTU) + return -EINVAL; } + if (new_mtu > 0xFFF8 - dev->hard_header_len) + return -EINVAL; dev->mtu = new_mtu; return 0; } static const struct net_device_ops ip6_tnl_netdev_ops = { - .ndo_uninit = ip6_tnl_dev_uninit, + .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_xmit, - .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_do_ioctl = ip6_tnl_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats = ip6_get_stats, }; + /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel @@ -1320,15 +1474,23 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { static void ip6_tnl_dev_setup(struct net_device *dev) { + struct ip6_tnl *t; + dev->netdev_ops = &ip6_tnl_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + /* This perm addr will be used as interface identifier by IPv6 */ + dev->addr_assign_type = NET_ADDR_RANDOM; + eth_random_addr(dev->perm_addr); } @@ -1337,12 +1499,17 @@ static void ip6_tnl_dev_setup(struct net_device *dev) * @dev: virtual device associated with tunnel **/ -static inline void +static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + t->dev = dev; - strcpy(t->parms.name, dev->name); + t->net = dev_net(dev); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } /** @@ -1350,11 +1517,15 @@ ip6_tnl_dev_init_gen(struct net_device *dev) * @dev: virtual device associated with tunnel **/ -static void ip6_tnl_dev_init(struct net_device *dev) +static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dev_init_gen(dev); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; ip6_tnl_link_config(t); + return 0; } /** @@ -1364,52 +1535,236 @@ static void ip6_tnl_dev_init(struct net_device *dev) * Return: 0 **/ -static void ip6_fb_tnl_dev_init(struct net_device *dev) +static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; - ip6_tnl_dev_init_gen(dev); t->parms.proto = IPPROTO_IPV6; dev_hold(dev); - ip6n->tnls_wc[0] = t; + + ip6_tnl_link_config(t); + + rcu_assign_pointer(ip6n->tnls_wc[0], t); + return 0; +} + +static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + u8 proto; + + if (!data || !data[IFLA_IPTUN_PROTO]) + return 0; + + proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (proto != IPPROTO_IPV6 && + proto != IPPROTO_IPIP && + proto != 0) + return -EINVAL; + + return 0; +} + +static void ip6_tnl_netlink_parms(struct nlattr *data[], + struct __ip6_tnl_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_IPTUN_LINK]) + parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); + + if (data[IFLA_IPTUN_LOCAL]) + nla_memcpy(&parms->laddr, data[IFLA_IPTUN_LOCAL], + sizeof(struct in6_addr)); + + if (data[IFLA_IPTUN_REMOTE]) + nla_memcpy(&parms->raddr, data[IFLA_IPTUN_REMOTE], + sizeof(struct in6_addr)); + + if (data[IFLA_IPTUN_TTL]) + parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); + + if (data[IFLA_IPTUN_ENCAP_LIMIT]) + parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); + + if (data[IFLA_IPTUN_FLOWINFO]) + parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); + + if (data[IFLA_IPTUN_FLAGS]) + parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); + + if (data[IFLA_IPTUN_PROTO]) + parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); +} + +static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net *net = dev_net(dev); + struct ip6_tnl *nt; + + nt = netdev_priv(dev); + ip6_tnl_netlink_parms(data, &nt->parms); + + if (ip6_tnl_locate(net, &nt->parms, 0)) + return -EEXIST; + + return ip6_tnl_create2(dev); +} + +static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct __ip6_tnl_parm p; + struct net *net = t->net; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (dev == ip6n->fb_tnl_dev) + return -EINVAL; + + ip6_tnl_netlink_parms(data, &p); + + t = ip6_tnl_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else + t = netdev_priv(dev); + + return ip6_tnl_update(t, &p); +} + +static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (dev != ip6n->fb_tnl_dev) + unregister_netdevice_queue(dev, head); +} + +static size_t ip6_tnl_get_size(const struct net_device *dev) +{ + return + /* IFLA_IPTUN_LINK */ + nla_total_size(4) + + /* IFLA_IPTUN_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_IPTUN_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_IPTUN_TTL */ + nla_total_size(1) + + /* IFLA_IPTUN_ENCAP_LIMIT */ + nla_total_size(1) + + /* IFLA_IPTUN_FLOWINFO */ + nla_total_size(4) + + /* IFLA_IPTUN_FLAGS */ + nla_total_size(4) + + /* IFLA_IPTUN_PROTO */ + nla_total_size(1) + + 0; +} + +static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + struct __ip6_tnl_parm *parm = &tunnel->parms; + + if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || + nla_put(skb, IFLA_IPTUN_LOCAL, sizeof(struct in6_addr), + &parm->laddr) || + nla_put(skb, IFLA_IPTUN_REMOTE, sizeof(struct in6_addr), + &parm->raddr) || + nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || + nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || + nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || + nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; } -static struct xfrm6_tunnel ip4ip6_handler = { +static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { + [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, + [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, + [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, + [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, + [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, + [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, + [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, + [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, +}; + +static struct rtnl_link_ops ip6_link_ops __read_mostly = { + .kind = "ip6tnl", + .maxtype = IFLA_IPTUN_MAX, + .policy = ip6_tnl_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = ip6_tnl_dev_setup, + .validate = ip6_tnl_validate, + .newlink = ip6_tnl_newlink, + .changelink = ip6_tnl_changelink, + .dellink = ip6_tnl_dellink, + .get_size = ip6_tnl_get_size, + .fill_info = ip6_tnl_fill_info, +}; + +static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; -static struct xfrm6_tunnel ip6ip6_handler = { +static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; -static void ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n) +static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) { + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct net_device *dev, *aux; int h; struct ip6_tnl *t; LIST_HEAD(list); + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &ip6_link_ops) + unregister_netdevice_queue(dev, &list); + for (h = 0; h < HASH_SIZE; h++) { - t = ip6n->tnls_r_l[h]; + t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t != NULL) { - unregister_netdevice_queue(t->dev, &list); - t = t->next; + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, &list); + t = rtnl_dereference(t->next); } } - t = ip6n->tnls_wc[0]; - unregister_netdevice_queue(t->dev, &list); unregister_netdevice_many(&list); } -static int ip6_tnl_init_net(struct net *net) +static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; @@ -1422,26 +1777,35 @@ static int ip6_tnl_init_net(struct net *net) if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); + ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; - ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; + + t = netdev_priv(ip6n->fb_tnl_dev); + + strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: - free_netdev(ip6n->fb_tnl_dev); + ip6_dev_free(ip6n->fb_tnl_dev); err_alloc_dev: return err; } -static void ip6_tnl_exit_net(struct net *net) +static void __net_exit ip6_tnl_exit_net(struct net *net) { - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - rtnl_lock(); - ip6_tnl_destroy_tunnels(ip6n); + ip6_tnl_destroy_tunnels(net); rtnl_unlock(); } @@ -1462,27 +1826,34 @@ static int __init ip6_tunnel_init(void) { int err; - if (xfrm6_tunnel_register(&ip4ip6_handler, AF_INET)) { - printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n"); - err = -EAGAIN; - goto out; - } + err = register_pernet_device(&ip6_tnl_net_ops); + if (err < 0) + goto out_pernet; - if (xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6)) { - printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n"); - err = -EAGAIN; - goto unreg_ip4ip6; + err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); + if (err < 0) { + pr_err("%s: can't register ip4ip6\n", __func__); + goto out_ip4ip6; } - err = register_pernet_device(&ip6_tnl_net_ops); + err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); + if (err < 0) { + pr_err("%s: can't register ip6ip6\n", __func__); + goto out_ip6ip6; + } + err = rtnl_link_register(&ip6_link_ops); if (err < 0) - goto err_pernet; + goto rtnl_link_failed; + return 0; -err_pernet: + +rtnl_link_failed: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); -unreg_ip4ip6: +out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); -out: +out_ip4ip6: + unregister_pernet_device(&ip6_tnl_net_ops); +out_pernet: return err; } @@ -1492,11 +1863,12 @@ out: static void __exit ip6_tunnel_cleanup(void) { + rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) - printk(KERN_INFO "ip6_tunnel close: can't deregister ip4ip6\n"); + pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) - printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n"); + pr_info("%s: can't deregister ip6ip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c new file mode 100644 index 00000000000..9aaa6bb229e --- /dev/null +++ b/net/ipv6/ip6_vti.c @@ -0,0 +1,1160 @@ +/* + * IPv6 virtual tunneling interface + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv6/ip6_tunnel.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/sockios.h> +#include <linux/icmp.h> +#include <linux/if.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmpv6.h> +#include <linux/init.h> +#include <linux/route.h> +#include <linux/rtnetlink.h> +#include <linux/netfilter_ipv6.h> +#include <linux/slab.h> +#include <linux/hash.h> + +#include <linux/uaccess.h> +#include <linux/atomic.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/ip6_tunnel.h> +#include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#define HASH_SIZE_SHIFT 5 +#define HASH_SIZE (1 << HASH_SIZE_SHIFT) + +static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) +{ + u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); + + return hash_32(hash, HASH_SIZE_SHIFT); +} + +static int vti6_dev_init(struct net_device *dev); +static void vti6_dev_setup(struct net_device *dev); +static struct rtnl_link_ops vti6_link_ops __read_mostly; + +static int vti6_net_id __read_mostly; +struct vti6_net { + /* the vti6 tunnel fallback device */ + struct net_device *fb_tnl_dev; + /* lists for storing tunnels in use */ + struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_wc[1]; + struct ip6_tnl __rcu **tnls[2]; +}; + +#define for_each_vti6_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +/** + * vti6_tnl_lookup - fetch tunnel matching the end-point addresses + * @net: network namespace + * @remote: the address of the tunnel exit-point + * @local: the address of the tunnel entry-point + * + * Return: + * tunnel matching given end-points if found, + * else fallback tunnel if its device is up, + * else %NULL + **/ +static struct ip6_tnl * +vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, + const struct in6_addr *local) +{ + unsigned int hash = HASH(remote, local); + struct ip6_tnl *t; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(ip6n->tnls_wc[0]); + if (t && (t->dev->flags & IFF_UP)) + return t; + + return NULL; +} + +/** + * vti6_tnl_bucket - get head of list matching given tunnel parameters + * @p: parameters containing tunnel end-points + * + * Description: + * vti6_tnl_bucket() returns the head of the list matching the + * &struct in6_addr entries laddr and raddr in @p. + * + * Return: head of IPv6 tunnel list + **/ +static struct ip6_tnl __rcu ** +vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned int h = 0; + int prio = 0; + + if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { + prio = 1; + h = HASH(remote, local); + } + return &ip6n->tnls[prio][h]; +} + +static void +vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms); + + rcu_assign_pointer(t->next , rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static void +vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = vti6_tnl_bucket(ip6n, &t->parms); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void vti6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static int vti6_tnl_create2(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + int err; + + err = vti6_dev_init(dev); + if (err < 0) + goto out; + + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(t->parms.name, dev->name); + dev->rtnl_link_ops = &vti6_link_ops; + + dev_hold(dev); + vti6_tnl_link(ip6n, t); + + return 0; + +out: + return err; +} + +static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) +{ + struct net_device *dev; + struct ip6_tnl *t; + char name[IFNAMSIZ]; + int err; + + if (p->name[0]) + strlcpy(name, p->name, IFNAMSIZ); + else + sprintf(name, "ip6_vti%%d"); + + dev = alloc_netdev(sizeof(*t), name, vti6_dev_setup); + if (dev == NULL) + goto failed; + + dev_net_set(dev, net); + + t = netdev_priv(dev); + t->parms = *p; + t->net = dev_net(dev); + + err = vti6_tnl_create2(dev); + if (err < 0) + goto failed_free; + + return t; + +failed_free: + vti6_dev_free(dev); +failed: + return NULL; +} + +/** + * vti6_locate - find or create tunnel matching given parameters + * @net: network namespace + * @p: tunnel parameters + * @create: != 0 if allowed to create new tunnel if no match found + * + * Description: + * vti6_locate() first tries to locate an existing tunnel + * based on @parms. If this is unsuccessful, but @create is set a new + * tunnel device is created and registered for use. + * + * Return: + * matching tunnel or NULL + **/ +static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, + int create) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + struct ip6_tnl __rcu **tp; + struct ip6_tnl *t; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + for (tp = vti6_tnl_bucket(ip6n, p); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr)) + return t; + } + if (!create) + return NULL; + return vti6_tnl_create(net, p); +} + +/** + * vti6_dev_uninit - tunnel device uninitializer + * @dev: the device to be destroyed + * + * Description: + * vti6_dev_uninit() removes tunnel from its list + **/ +static void vti6_dev_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + if (dev == ip6n->fb_tnl_dev) + RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); + else + vti6_tnl_unlink(ip6n, t); + dev_put(dev); +} + +static int vti6_rcv(struct sk_buff *skb) +{ + struct ip6_tnl *t; + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + + rcu_read_lock(); + if ((t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, + &ipv6h->daddr)) != NULL) { + if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { + rcu_read_unlock(); + goto discard; + } + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + rcu_read_unlock(); + return 0; + } + + if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { + t->dev->stats.rx_dropped++; + rcu_read_unlock(); + goto discard; + } + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; + skb->mark = be32_to_cpu(t->parms.i_key); + + rcu_read_unlock(); + + return xfrm6_rcv(skb); + } + rcu_read_unlock(); + return -EINVAL; +discard: + kfree_skb(skb); + return 0; +} + +static int vti6_rcv_cb(struct sk_buff *skb, int err) +{ + unsigned short family; + struct net_device *dev; + struct pcpu_sw_netstats *tstats; + struct xfrm_state *x; + struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; + + if (!t) + return 1; + + dev = t->dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + x = xfrm_input_state(skb); + family = x->inner_mode->afinfo->family; + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + return -EPERM; + + skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); + skb->dev = dev; + + tstats = this_cpu_ptr(dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; +} + +/** + * vti6_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ +static inline bool +vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +static bool vti6_state_check(const struct xfrm_state *x, + const struct in6_addr *dst, + const struct in6_addr *src) +{ + xfrm_address_t *daddr = (xfrm_address_t *)dst; + xfrm_address_t *saddr = (xfrm_address_t *)src; + + /* if there is no transform then this tunnel is not functional. + * Or if the xfrm is not mode tunnel. + */ + if (!x || x->props.mode != XFRM_MODE_TUNNEL || + x->props.family != AF_INET6) + return false; + + if (ipv6_addr_any(dst)) + return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6); + + if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6)) + return false; + + return true; +} + +/** + * vti6_xmit - send a packet + * @skb: the outgoing socket buffer + * @dev: the outgoing tunnel device + * @fl: the flow informations for the xfrm_lookup + **/ +static int +vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + struct dst_entry *dst = skb_dst(skb); + struct net_device *tdev; + int err = -1; + + if (!dst) + goto tx_err_link_failure; + + dst_hold(dst); + dst = xfrm_lookup(t->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + + if (!vti6_state_check(dst->xfrm, &t->parms.raddr, &t->parms.laddr)) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + t->parms.name); + goto tx_err_dst_release; + } + + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = skb_dst(skb)->dev; + + err = dst_output(skb); + if (net_xmit_eval(err) == 0) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += skb->len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static netdev_tx_t +vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + struct ipv6hdr *ipv6h; + struct flowi fl; + int ret; + + memset(&fl, 0, sizeof(fl)); + skb->mark = be32_to_cpu(t->parms.o_key); + + switch (skb->protocol) { + case htons(ETH_P_IPV6): + ipv6h = ipv6_hdr(skb); + + if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t) || vti6_addr_conflict(t, ipv6h)) + goto tx_err; + + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + default: + goto tx_err; + } + + ret = vti6_xmit(skb, dev, &fl); + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __be32 spi; + __u32 mark; + struct xfrm_state *x; + struct ip6_tnl *t; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct ip_comp_hdr *ipch; + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + int protocol = iph->nexthdr; + + t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr); + if (!t) + return -1; + + mark = be32_to_cpu(t->parms.o_key); + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); + xfrm_state_put(x); + + return 0; +} + +static void vti6_link_config(struct ip6_tnl *t) +{ + struct net_device *dev = t->dev; + struct __ip6_tnl_parm *p = &t->parms; + + memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + + p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | + IP6_TNL_F_CAP_PER_PACKET); + p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); + + if (p->flags & IP6_TNL_F_CAP_XMIT && p->flags & IP6_TNL_F_CAP_RCV) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + dev->iflink = p->link; +} + +/** + * vti6_tnl_change - update the tunnel parameters + * @t: tunnel to be changed + * @p: tunnel configuration parameters + * + * Description: + * vti6_tnl_change() updates the tunnel parameters + **/ +static int +vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) +{ + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; + t->parms.link = p->link; + t->parms.i_key = p->i_key; + t->parms.o_key = p->o_key; + t->parms.proto = p->proto; + ip6_tnl_dst_reset(t); + vti6_link_config(t); + return 0; +} + +static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +{ + struct net *net = dev_net(t->dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + int err; + + vti6_tnl_unlink(ip6n, t); + synchronize_net(); + err = vti6_tnl_change(t, p); + vti6_tnl_link(ip6n, t); + netdev_state_change(t->dev); + return err; +} + +static void +vti6_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u) +{ + p->laddr = u->laddr; + p->raddr = u->raddr; + p->link = u->link; + p->i_key = u->i_key; + p->o_key = u->o_key; + p->proto = u->proto; + + memcpy(p->name, u->name, sizeof(u->name)); +} + +static void +vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) +{ + u->laddr = p->laddr; + u->raddr = p->raddr; + u->link = p->link; + u->i_key = p->i_key; + u->o_key = p->o_key; + u->proto = p->proto; + + memcpy(u->name, p->name, sizeof(u->name)); +} + +/** + * vti6_tnl_ioctl - configure vti6 tunnels from userspace + * @dev: virtual device associated with tunnel + * @ifr: parameters passed from userspace + * @cmd: command to be performed + * + * Description: + * vti6_ioctl() is used for managing vti6 tunnels + * from userspace. + * + * The possible commands are the following: + * %SIOCGETTUNNEL: get tunnel parameters for device + * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters + * %SIOCCHGTUNNEL: change tunnel parameters to those given + * %SIOCDELTUNNEL: delete tunnel + * + * The fallback device "ip6_vti0", created during module + * initialization, can be used for creating other tunnel devices. + * + * Return: + * 0 on success, + * %-EFAULT if unable to copy data to or from userspace, + * %-EPERM if current process hasn't %CAP_NET_ADMIN set + * %-EINVAL if passed tunnel parameters are invalid, + * %-EEXIST if changing a tunnel's parameters would cause a conflict + * %-ENODEV if attempting to change or delete a nonexisting device + **/ +static int +vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip6_tnl_parm2 p; + struct __ip6_tnl_parm p1; + struct ip6_tnl *t = NULL; + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ip6n->fb_tnl_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + vti6_parm_from_user(&p1, &p); + t = vti6_locate(net, &p1, 0); + } else { + memset(&p, 0, sizeof(p)); + } + if (t == NULL) + t = netdev_priv(dev); + vti6_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + break; + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + break; + err = -EINVAL; + if (p.proto != IPPROTO_IPV6 && p.proto != 0) + break; + vti6_parm_from_user(&p1, &p); + t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); + if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else + t = netdev_priv(dev); + + err = vti6_update(t, &p1); + } + if (t) { + err = 0; + vti6_parm_to_user(&p, &t->parms); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + case SIOCDELTUNNEL: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + break; + + if (dev == ip6n->fb_tnl_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + break; + err = -ENOENT; + vti6_parm_from_user(&p1, &p); + t = vti6_locate(net, &p1, 0); + if (t == NULL) + break; + err = -EPERM; + if (t->dev == ip6n->fb_tnl_dev) + break; + dev = t->dev; + } + err = 0; + unregister_netdevice(dev); + break; + default: + err = -EINVAL; + } + return err; +} + +/** + * vti6_tnl_change_mtu - change mtu manually for tunnel device + * @dev: virtual device associated with tunnel + * @new_mtu: the new mtu + * + * Return: + * 0 on success, + * %-EINVAL if mtu too small + **/ +static int vti6_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops vti6_netdev_ops = { + .ndo_uninit = vti6_dev_uninit, + .ndo_start_xmit = vti6_tnl_xmit, + .ndo_do_ioctl = vti6_ioctl, + .ndo_change_mtu = vti6_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, +}; + +/** + * vti6_dev_setup - setup virtual tunnel device + * @dev: virtual device associated with tunnel + * + * Description: + * Initialize function pointers and device parameters + **/ +static void vti6_dev_setup(struct net_device *dev) +{ + dev->netdev_ops = &vti6_netdev_ops; + dev->destructor = vti6_dev_free; + + dev->type = ARPHRD_TUNNEL6; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); + dev->mtu = ETH_DATA_LEN; + dev->flags |= IFF_NOARP; + dev->addr_len = sizeof(struct in6_addr); + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + +/** + * vti6_dev_init_gen - general initializer for all tunnel devices + * @dev: virtual device associated with tunnel + **/ +static inline int vti6_dev_init_gen(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + + t->dev = dev; + t->net = dev_net(dev); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + return 0; +} + +/** + * vti6_dev_init - initializer for all non fallback tunnel devices + * @dev: virtual device associated with tunnel + **/ +static int vti6_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + int err = vti6_dev_init_gen(dev); + + if (err) + return err; + vti6_link_config(t); + return 0; +} + +/** + * vti6_fb_tnl_dev_init - initializer for fallback tunnel device + * @dev: fallback device + * + * Return: 0 + **/ +static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + int err = vti6_dev_init_gen(dev); + + if (err) + return err; + + t->parms.proto = IPPROTO_IPV6; + dev_hold(dev); + + vti6_link_config(t); + + rcu_assign_pointer(ip6n->tnls_wc[0], t); + return 0; +} + +static int vti6_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + return 0; +} + +static void vti6_netlink_parms(struct nlattr *data[], + struct __ip6_tnl_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_VTI_LINK]) + parms->link = nla_get_u32(data[IFLA_VTI_LINK]); + + if (data[IFLA_VTI_LOCAL]) + nla_memcpy(&parms->laddr, data[IFLA_VTI_LOCAL], + sizeof(struct in6_addr)); + + if (data[IFLA_VTI_REMOTE]) + nla_memcpy(&parms->raddr, data[IFLA_VTI_REMOTE], + sizeof(struct in6_addr)); + + if (data[IFLA_VTI_IKEY]) + parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); + + if (data[IFLA_VTI_OKEY]) + parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); +} + +static int vti6_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net *net = dev_net(dev); + struct ip6_tnl *nt; + + nt = netdev_priv(dev); + vti6_netlink_parms(data, &nt->parms); + + nt->parms.proto = IPPROTO_IPV6; + + if (vti6_locate(net, &nt->parms, 0)) + return -EEXIST; + + return vti6_tnl_create2(dev); +} + +static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip6_tnl *t; + struct __ip6_tnl_parm p; + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + if (dev == ip6n->fb_tnl_dev) + return -EINVAL; + + vti6_netlink_parms(data, &p); + + t = vti6_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else + t = netdev_priv(dev); + + return vti6_update(t, &p); +} + +static size_t vti6_get_size(const struct net_device *dev) +{ + return + /* IFLA_VTI_LINK */ + nla_total_size(4) + + /* IFLA_VTI_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_VTI_REMOTE */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_VTI_IKEY */ + nla_total_size(4) + + /* IFLA_VTI_OKEY */ + nla_total_size(4) + + 0; +} + +static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip6_tnl *tunnel = netdev_priv(dev); + struct __ip6_tnl_parm *parm = &tunnel->parms; + + if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) || + nla_put(skb, IFLA_VTI_LOCAL, sizeof(struct in6_addr), + &parm->laddr) || + nla_put(skb, IFLA_VTI_REMOTE, sizeof(struct in6_addr), + &parm->raddr) || + nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || + nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = { + [IFLA_VTI_LINK] = { .type = NLA_U32 }, + [IFLA_VTI_LOCAL] = { .len = sizeof(struct in6_addr) }, + [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) }, + [IFLA_VTI_IKEY] = { .type = NLA_U32 }, + [IFLA_VTI_OKEY] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops vti6_link_ops __read_mostly = { + .kind = "vti6", + .maxtype = IFLA_VTI_MAX, + .policy = vti6_policy, + .priv_size = sizeof(struct ip6_tnl), + .setup = vti6_dev_setup, + .validate = vti6_validate, + .newlink = vti6_newlink, + .changelink = vti6_changelink, + .get_size = vti6_get_size, + .fill_info = vti6_fill_info, +}; + +static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) +{ + int h; + struct ip6_tnl *t; + LIST_HEAD(list); + + for (h = 0; h < HASH_SIZE; h++) { + t = rtnl_dereference(ip6n->tnls_r_l[h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, &list); + t = rtnl_dereference(t->next); + } + } + + t = rtnl_dereference(ip6n->tnls_wc[0]); + unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_many(&list); +} + +static int __net_init vti6_init_net(struct net *net) +{ + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct ip6_tnl *t = NULL; + int err; + + ip6n->tnls[0] = ip6n->tnls_wc; + ip6n->tnls[1] = ip6n->tnls_r_l; + + err = -ENOMEM; + ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", + vti6_dev_setup); + + if (!ip6n->fb_tnl_dev) + goto err_alloc_dev; + dev_net_set(ip6n->fb_tnl_dev, net); + + err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + err = register_netdev(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + t = netdev_priv(ip6n->fb_tnl_dev); + + strcpy(t->parms.name, ip6n->fb_tnl_dev->name); + return 0; + +err_register: + vti6_dev_free(ip6n->fb_tnl_dev); +err_alloc_dev: + return err; +} + +static void __net_exit vti6_exit_net(struct net *net) +{ + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + rtnl_lock(); + vti6_destroy_tunnels(ip6n); + rtnl_unlock(); +} + +static struct pernet_operations vti6_net_ops = { + .init = vti6_init_net, + .exit = vti6_exit_net, + .id = &vti6_net_id, + .size = sizeof(struct vti6_net), +}; + +static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { + .handler = vti6_rcv, + .cb_handler = vti6_rcv_cb, + .err_handler = vti6_err, + .priority = 100, +}; + +/** + * vti6_tunnel_init - register protocol and reserve needed resources + * + * Return: 0 on success + **/ +static int __init vti6_tunnel_init(void) +{ + int err; + + err = register_pernet_device(&vti6_net_ops); + if (err < 0) + goto out_pernet; + + err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); + if (err < 0) { + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + + err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); + if (err < 0) { + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + + err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) { + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); + pr_err("%s: can't register vti6 protocol\n", __func__); + + goto out; + } + + err = rtnl_link_register(&vti6_link_ops); + if (err < 0) + goto rtnl_link_failed; + + return 0; + +rtnl_link_failed: + xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); +out: + unregister_pernet_device(&vti6_net_ops); +out_pernet: + return err; +} + +/** + * vti6_tunnel_cleanup - free resources and unregister protocol + **/ +static void __exit vti6_tunnel_cleanup(void) +{ + rtnl_link_unregister(&vti6_link_ops); + if (xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP)) + pr_info("%s: can't deregister protocol\n", __func__); + if (xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH)) + pr_info("%s: can't deregister protocol\n", __func__); + if (xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP)) + pr_info("%s: can't deregister protocol\n", __func__); + + unregister_pernet_device(&vti6_net_ops); +} + +module_init(vti6_tunnel_init); +module_exit(vti6_tunnel_cleanup); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("vti6"); +MODULE_ALIAS_NETDEV("ip6_vti0"); +MODULE_AUTHOR("Steffen Klassert"); +MODULE_DESCRIPTION("IPv6 virtual tunnel interface"); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 52e0f74fdfe..8250474ab7d 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -16,7 +16,6 @@ * */ -#include <asm/system.h> #include <asm/uaccess.h> #include <linux/types.h> #include <linux/sched.h> @@ -33,6 +32,8 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> +#include <linux/slab.h> +#include <linux/compat.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -41,6 +42,7 @@ #include <linux/if_arp.h> #include <net/checksum.h> #include <net/netlink.h> +#include <net/fib_rules.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -48,7 +50,37 @@ #include <linux/pim.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6.h> +#include <linux/export.h> #include <net/ip6_checksum.h> +#include <linux/netconf.h> + +struct mr6_table { + struct list_head list; +#ifdef CONFIG_NET_NS + struct net *net; +#endif + u32 id; + struct sock *mroute6_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc6_unres_queue; + struct list_head mfc6_cache_array[MFC6_LINES]; + struct mif_device vif6_table[MAXMIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; +#ifdef CONFIG_IPV6_PIMSM_V2 + int mroute_reg_vif_num; +#endif +}; + +struct ip6mr_rule { + struct fib_rule common; +}; + +struct ip6mr_result { + struct mr6_table *mrt; +}; /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. @@ -60,9 +92,7 @@ static DEFINE_RWLOCK(mrt_lock); * Multicast router control variables */ -#define MIF_EXISTS(_net, _idx) ((_net)->ipv6.vif6_table[_idx].dev != NULL) - -static struct mfc6_cache *mfc_unres_queue; /* Queue of unresolved entries */ +#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL) /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); @@ -77,20 +107,246 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; -static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache); -static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id); +static void ip6mr_free_table(struct mr6_table *mrt); + +static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache); +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); -static int ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm); -static void mroute_clean_tables(struct net *net); +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm); +static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, + int cmd); +static int ip6mr_rtm_dumproute(struct sk_buff *skb, + struct netlink_callback *cb); +static void mroute_clean_tables(struct mr6_table *mrt); +static void ipmr_expire_process(unsigned long arg); + +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES +#define ip6mr_for_each_table(mrt, net) \ + list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + + ip6mr_for_each_table(mrt, net) { + if (mrt->id == id) + return mrt; + } + return NULL; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + int err; + struct ip6mr_result res; + struct fib_lookup_arg arg = { + .result = &res, + .flags = FIB_LOOKUP_NOREF, + }; + + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, + flowi6_to_flowi(flp6), 0, &arg); + if (err < 0) + return err; + *mrt = res.mrt; + return 0; +} + +static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct ip6mr_result *res = arg->result; + struct mr6_table *mrt; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + mrt = ip6mr_get_table(rule->fr_net, rule->table); + if (mrt == NULL) + return -EAGAIN; + res->mrt = mrt; + return 0; +} + +static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) +{ + return 1; +} + +static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { + FRA_GENERIC_POLICY, +}; + +static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, struct nlattr **tb) +{ + return 0; +} + +static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + return 1; +} + +static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + frh->dst_len = 0; + frh->src_len = 0; + frh->tos = 0; + return 0; +} + +static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { + .family = RTNL_FAMILY_IP6MR, + .rule_size = sizeof(struct ip6mr_rule), + .addr_size = sizeof(struct in6_addr), + .action = ip6mr_rule_action, + .match = ip6mr_rule_match, + .configure = ip6mr_rule_configure, + .compare = ip6mr_rule_compare, + .default_pref = fib_default_rule_pref, + .fill = ip6mr_rule_fill, + .nlgroup = RTNLGRP_IPV6_RULE, + .policy = ip6mr_rule_policy, + .owner = THIS_MODULE, +}; -static struct timer_list ipmr_expire_timer; +static int __net_init ip6mr_rules_init(struct net *net) +{ + struct fib_rules_ops *ops; + struct mr6_table *mrt; + int err; + + ops = fib_rules_register(&ip6mr_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + INIT_LIST_HEAD(&net->ipv6.mr6_tables); + + mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) { + err = -ENOMEM; + goto err1; + } + + err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); + if (err < 0) + goto err2; + + net->ipv6.mr6_rules_ops = ops; + return 0; + +err2: + kfree(mrt); +err1: + fib_rules_unregister(ops); + return err; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + struct mr6_table *mrt, *next; + + rtnl_lock(); + list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { + list_del(&mrt->list); + ip6mr_free_table(mrt); + } + rtnl_unlock(); + fib_rules_unregister(net->ipv6.mr6_rules_ops); +} +#else +#define ip6mr_for_each_table(mrt, net) \ + for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + return net->ipv6.mrt6; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + *mrt = net->ipv6.mrt6; + return 0; +} + +static int __net_init ip6mr_rules_init(struct net *net) +{ + net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT); + return net->ipv6.mrt6 ? 0 : -ENOMEM; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + rtnl_lock(); + ip6mr_free_table(net->ipv6.mrt6); + net->ipv6.mrt6 = NULL; + rtnl_unlock(); +} +#endif + +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + unsigned int i; + + mrt = ip6mr_get_table(net, id); + if (mrt != NULL) + return mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (mrt == NULL) + return NULL; + mrt->id = id; + write_pnet(&mrt->net, net); + + /* Forwarding cache */ + for (i = 0; i < MFC6_LINES; i++) + INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]); + + INIT_LIST_HEAD(&mrt->mfc6_unres_queue); + + setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, + (unsigned long)mrt); + +#ifdef CONFIG_IPV6_PIMSM_V2 + mrt->mroute_reg_vif_num = -1; +#endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); +#endif + return mrt; +} + +static void ip6mr_free_table(struct mr6_table *mrt) +{ + del_timer(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} #ifdef CONFIG_PROC_FS struct ipmr_mfc_iter { struct seq_net_private p; - struct mfc6_cache **cache; + struct mr6_table *mrt; + struct list_head *cache; int ct; }; @@ -98,22 +354,22 @@ struct ipmr_mfc_iter { static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_mfc_iter *it, loff_t pos) { + struct mr6_table *mrt = it->mrt; struct mfc6_cache *mfc; - it->cache = net->ipv6.mfc6_cache_array; read_lock(&mrt_lock); - for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) - for (mfc = net->ipv6.mfc6_cache_array[it->ct]; - mfc; mfc = mfc->next) + for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) { + it->cache = &mrt->mfc6_cache_array[it->ct]; + list_for_each_entry(mfc, it->cache, list) if (pos-- == 0) return mfc; + } read_unlock(&mrt_lock); - it->cache = &mfc_unres_queue; spin_lock_bh(&mfc_unres_lock); - for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) - if (net_eq(mfc6_net(mfc), net) && - pos-- == 0) + it->cache = &mrt->mfc6_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) return mfc; spin_unlock_bh(&mfc_unres_lock); @@ -121,15 +377,13 @@ static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, return NULL; } - - - /* * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif */ struct ipmr_vif_iter { struct seq_net_private p; + struct mr6_table *mrt; int ct; }; @@ -137,11 +391,13 @@ static struct mif_device *ip6mr_vif_seq_idx(struct net *net, struct ipmr_vif_iter *iter, loff_t pos) { - for (iter->ct = 0; iter->ct < net->ipv6.maxvif; ++iter->ct) { - if (!MIF_EXISTS(net, iter->ct)) + struct mr6_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!MIF_EXISTS(mrt, iter->ct)) continue; if (pos-- == 0) - return &net->ipv6.vif6_table[iter->ct]; + return &mrt->vif6_table[iter->ct]; } return NULL; } @@ -149,7 +405,15 @@ static struct mif_device *ip6mr_vif_seq_idx(struct net *net, static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { + struct ipmr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + iter->mrt = mrt; read_lock(&mrt_lock); return *pos ? ip6mr_vif_seq_idx(net, seq->private, *pos - 1) @@ -160,15 +424,16 @@ static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ipmr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); + struct mr6_table *mrt = iter->mrt; ++*pos; if (v == SEQ_START_TOKEN) return ip6mr_vif_seq_idx(net, iter, 0); - while (++iter->ct < net->ipv6.maxvif) { - if (!MIF_EXISTS(net, iter->ct)) + while (++iter->ct < mrt->maxvif) { + if (!MIF_EXISTS(mrt, iter->ct)) continue; - return &net->ipv6.vif6_table[iter->ct]; + return &mrt->vif6_table[iter->ct]; } return NULL; } @@ -181,7 +446,8 @@ static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { - struct net *net = seq_file_net(seq); + struct ipmr_vif_iter *iter = seq->private; + struct mr6_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -192,7 +458,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", - vif - net->ipv6.vif6_table, + vif - mrt->vif6_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); @@ -223,8 +489,15 @@ static const struct file_operations ip6mr_vif_fops = { static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { + struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + it->mrt = mrt; return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } @@ -234,35 +507,36 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct mfc6_cache *mfc = v; struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); + struct mr6_table *mrt = it->mrt; ++*pos; if (v == SEQ_START_TOKEN) return ipmr_mfc_seq_idx(net, seq->private, 0); - if (mfc->next) - return mfc->next; + if (mfc->list.next != it->cache) + return list_entry(mfc->list.next, struct mfc6_cache, list); - if (it->cache == &mfc_unres_queue) + if (it->cache == &mrt->mfc6_unres_queue) goto end_of_list; - BUG_ON(it->cache != net->ipv6.mfc6_cache_array); + BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]); while (++it->ct < MFC6_LINES) { - mfc = net->ipv6.mfc6_cache_array[it->ct]; - if (mfc) - return mfc; + it->cache = &mrt->mfc6_cache_array[it->ct]; + if (list_empty(it->cache)) + continue; + return list_first_entry(it->cache, struct mfc6_cache, list); } /* exhausted cache_array, show unresolved */ read_unlock(&mrt_lock); - it->cache = &mfc_unres_queue; + it->cache = &mrt->mfc6_unres_queue; it->ct = 0; spin_lock_bh(&mfc_unres_lock); - mfc = mfc_unres_queue; - if (mfc) - return mfc; + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mfc6_cache, list); end_of_list: spin_unlock_bh(&mfc_unres_lock); @@ -274,18 +548,17 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) { struct ipmr_mfc_iter *it = seq->private; - struct net *net = seq_file_net(seq); + struct mr6_table *mrt = it->mrt; - if (it->cache == &mfc_unres_queue) + if (it->cache == &mrt->mfc6_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == net->ipv6.mfc6_cache_array) + else if (it->cache == mrt->mfc6_cache_array) read_unlock(&mrt_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; - struct net *net = seq_file_net(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -295,19 +568,20 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) } else { const struct mfc6_cache *mfc = v; const struct ipmr_mfc_iter *it = seq->private; + struct mr6_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->mf6c_parent); - if (it->cache != &mfc_unres_queue) { + if (it->cache != &mrt->mfc6_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->mfc_un.res.pkt, mfc->mfc_un.res.bytes, mfc->mfc_un.res.wrong_if); for (n = mfc->mfc_un.res.minvif; n < mfc->mfc_un.res.maxvif; n++) { - if (MIF_EXISTS(net, n) && + if (MIF_EXISTS(mrt, n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", @@ -354,7 +628,12 @@ static int pim6_rcv(struct sk_buff *skb) struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); - int reg_vif_num = net->ipv6.mroute_reg_vif_num; + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int reg_vif_num; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; @@ -377,9 +656,13 @@ static int pim6_rcv(struct sk_buff *skb) ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto drop; + reg_vif_num = mrt->mroute_reg_vif_num; + read_lock(&mrt_lock); if (reg_vif_num >= 0) - reg_dev = net->ipv6.vif6_table[reg_vif_num].dev; + reg_dev = mrt->vif6_table[reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -390,15 +673,13 @@ static int pim6_rcv(struct sk_buff *skb) skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); - skb->dev = reg_dev; skb->protocol = htons(ETH_P_IPV6); - skb->ip_summed = 0; - skb->pkt_type = PACKET_HOST; - skb_dst_drop(skb); - reg_dev->stats.rx_bytes += skb->len; - reg_dev->stats.rx_packets++; - nf_reset(skb); + skb->ip_summed = CHECKSUM_NONE; + + skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); + netif_rx(skb); + dev_put(reg_dev); return 0; drop: @@ -416,12 +697,24 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_oif = dev->ifindex, + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ip6mr_cache_report(net, skb, net->ipv6.mroute_reg_vif_num, - MRT6MSG_WHOLEPKT); + ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); return NETDEV_TX_OK; @@ -441,11 +734,17 @@ static void reg_vif_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } -static struct net_device *ip6mr_reg_vif(struct net *net) +static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) { struct net_device *dev; + char name[IFNAMSIZ]; + + if (mrt->id == RT6_TABLE_DFLT) + sprintf(name, "pim6reg"); + else + sprintf(name, "pim6reg%u", mrt->id); - dev = alloc_netdev(0, "pim6reg", reg_vif_setup); + dev = alloc_netdev(0, name, reg_vif_setup); if (dev == NULL) return NULL; @@ -477,15 +776,16 @@ failure: * Delete a VIF entry */ -static int mif6_delete(struct net *net, int vifi, struct list_head *head) +static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) { struct mif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; - if (vifi < 0 || vifi >= net->ipv6.maxvif) + + if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; - v = &net->ipv6.vif6_table[vifi]; + v = &mrt->vif6_table[vifi]; write_lock_bh(&mrt_lock); dev = v->dev; @@ -497,17 +797,17 @@ static int mif6_delete(struct net *net, int vifi, struct list_head *head) } #ifdef CONFIG_IPV6_PIMSM_V2 - if (vifi == net->ipv6.mroute_reg_vif_num) - net->ipv6.mroute_reg_vif_num = -1; + if (vifi == mrt->mroute_reg_vif_num) + mrt->mroute_reg_vif_num = -1; #endif - if (vifi + 1 == net->ipv6.maxvif) { + if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { - if (MIF_EXISTS(net, tmp)) + if (MIF_EXISTS(mrt, tmp)) break; } - net->ipv6.maxvif = tmp + 1; + mrt->maxvif = tmp + 1; } write_unlock_bh(&mrt_lock); @@ -515,8 +815,12 @@ static int mif6_delete(struct net *net, int vifi, struct list_head *head) dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); - if (in6_dev) + if (in6_dev) { in6_dev->cnf.mc_forwarding--; + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_MC_FORWARDING, + dev->ifindex, &in6_dev->cnf); + } if (v->flags & MIFF_REGISTER) unregister_netdevice_queue(dev, head); @@ -527,7 +831,6 @@ static int mif6_delete(struct net *net, int vifi, struct list_head *head) static inline void ip6mr_cache_free(struct mfc6_cache *c) { - release_net(mfc6_net(c)); kmem_cache_free(mrt_cachep, c); } @@ -535,21 +838,21 @@ static inline void ip6mr_cache_free(struct mfc6_cache *c) and reporting error to netlink readers. */ -static void ip6mr_destroy_unres(struct mfc6_cache *c) +static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) { + struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; - struct net *net = mfc6_net(c); - atomic_dec(&net->ipv6.cache_resolve_queue_len); + atomic_dec(&mrt->cache_resolve_queue_len); while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT; - rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; + rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } @@ -558,60 +861,60 @@ static void ip6mr_destroy_unres(struct mfc6_cache *c) } -/* Single timer process for all the unresolved queue. */ +/* Timer process for all the unresolved queue. */ -static void ipmr_do_expire_process(unsigned long dummy) +static void ipmr_do_expire_process(struct mr6_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; - struct mfc6_cache *c, **cp; + struct mfc6_cache *c, *next; - cp = &mfc_unres_queue; - - while ((c = *cp) != NULL) { + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; - cp = &c->next; continue; } - *cp = c->next; - ip6mr_destroy_unres(c); + list_del(&c->list); + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_destroy_unres(mrt, c); } - if (mfc_unres_queue != NULL) - mod_timer(&ipmr_expire_timer, jiffies + expires); + if (!list_empty(&mrt->mfc6_unres_queue)) + mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } -static void ipmr_expire_process(unsigned long dummy) +static void ipmr_expire_process(unsigned long arg) { + struct mr6_table *mrt = (struct mr6_table *)arg; + if (!spin_trylock(&mfc_unres_lock)) { - mod_timer(&ipmr_expire_timer, jiffies + 1); + mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } - if (mfc_unres_queue != NULL) - ipmr_do_expire_process(dummy); + if (!list_empty(&mrt->mfc6_unres_queue)) + ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under write locked mrt_lock. */ -static void ip6mr_update_thresholds(struct mfc6_cache *cache, unsigned char *ttls) +static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache, + unsigned char *ttls) { int vifi; - struct net *net = mfc6_net(cache); cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); - for (vifi = 0; vifi < net->ipv6.maxvif; vifi++) { - if (MIF_EXISTS(net, vifi) && + for (vifi = 0; vifi < mrt->maxvif; vifi++) { + if (MIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) @@ -622,16 +925,17 @@ static void ip6mr_update_thresholds(struct mfc6_cache *cache, unsigned char *ttl } } -static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock) +static int mif6_add(struct net *net, struct mr6_table *mrt, + struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; - struct mif_device *v = &net->ipv6.vif6_table[vifi]; + struct mif_device *v = &mrt->vif6_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ - if (MIF_EXISTS(net, vifi)) + if (MIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { @@ -641,9 +945,9 @@ static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock) * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ - if (net->ipv6.mroute_reg_vif_num >= 0) + if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; - dev = ip6mr_reg_vif(net); + dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); @@ -669,8 +973,12 @@ static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock) } in6_dev = __in6_dev_get(dev); - if (in6_dev) + if (in6_dev) { in6_dev->cnf.mc_forwarding++; + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_MC_FORWARDING, + dev->ifindex, &in6_dev->cnf); + } /* * Fill in the VIF structures @@ -693,50 +1001,92 @@ static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock) v->dev = dev; #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) - net->ipv6.mroute_reg_vif_num = vifi; + mrt->mroute_reg_vif_num = vifi; #endif - if (vifi + 1 > net->ipv6.maxvif) - net->ipv6.maxvif = vifi + 1; + if (vifi + 1 > mrt->maxvif) + mrt->maxvif = vifi + 1; write_unlock_bh(&mrt_lock); return 0; } -static struct mfc6_cache *ip6mr_cache_find(struct net *net, - struct in6_addr *origin, - struct in6_addr *mcastgrp) +static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, + const struct in6_addr *origin, + const struct in6_addr *mcastgrp) { int line = MFC6_HASH(mcastgrp, origin); struct mfc6_cache *c; - for (c = net->ipv6.mfc6_cache_array[line]; c; c = c->next) { + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { if (ipv6_addr_equal(&c->mf6c_origin, origin) && ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) - break; + return c; } - return c; + return NULL; +} + +/* Look for a (*,*,oif) entry */ +static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, + mifi_t mifi) +{ + int line = MFC6_HASH(&in6addr_any, &in6addr_any); + struct mfc6_cache *c; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp) && + (c->mfc_un.res.ttls[mifi] < 255)) + return c; + + return NULL; +} + +/* Look for a (*,G) entry */ +static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt, + struct in6_addr *mcastgrp, + mifi_t mifi) +{ + int line = MFC6_HASH(mcastgrp, &in6addr_any); + struct mfc6_cache *c, *proxy; + + if (ipv6_addr_any(mcastgrp)) + goto skip; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) { + if (c->mfc_un.res.ttls[mifi] < 255) + return c; + + /* It's ok if the mifi is part of the static tree */ + proxy = ip6mr_cache_find_any_parent(mrt, + c->mf6c_parent); + if (proxy && proxy->mfc_un.res.ttls[mifi] < 255) + return c; + } + +skip: + return ip6mr_cache_find_any_parent(mrt, mifi); } /* * Allocate a multicast cache entry */ -static struct mfc6_cache *ip6mr_cache_alloc(struct net *net) +static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c == NULL) return NULL; c->mfc_un.res.minvif = MAXMIFS; - mfc6_net_set(c, net); return c; } -static struct mfc6_cache *ip6mr_cache_alloc_unres(struct net *net) +static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c == NULL) return NULL; skb_queue_head_init(&c->mfc_un.unres.unresolved); c->mfc_un.unres.expires = jiffies + 10 * HZ; - mfc6_net_set(c, net); return c; } @@ -744,7 +1094,8 @@ static struct mfc6_cache *ip6mr_cache_alloc_unres(struct net *net) * A cache entry has gone into a resolved state from queued */ -static void ip6mr_cache_resolve(struct mfc6_cache *uc, struct mfc6_cache *c) +static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, + struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; @@ -754,20 +1105,19 @@ static void ip6mr_cache_resolve(struct mfc6_cache *uc, struct mfc6_cache *c) while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { - int err; struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); - if (ip6mr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { + if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); - ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE; + ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } - err = rtnl_unicast(skb, mfc6_net(uc), NETLINK_CB(skb).pid); + rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else - ip6_mr_forward(skb, c); + ip6_mr_forward(net, mrt, skb, c); } } @@ -778,8 +1128,8 @@ static void ip6mr_cache_resolve(struct mfc6_cache *uc, struct mfc6_cache *c) * Called under mrt_lock. */ -static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi, - int assert) +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, + mifi_t mifi, int assert) { struct sk_buff *skb; struct mrt6msg *msg; @@ -815,10 +1165,10 @@ static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi, msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = MRT6MSG_WHOLEPKT; - msg->im6_mif = net->ipv6.mroute_reg_vif_num; + msg->im6_mif = mrt->mroute_reg_vif_num; msg->im6_pad = 0; - ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr); - ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr); + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else @@ -843,14 +1193,14 @@ static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi, msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; - ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr); - ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr); + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } - if (net->ipv6.mroute6_sk == NULL) { + if (mrt->mroute6_sk == NULL) { kfree_skb(skb); return -EINVAL; } @@ -858,10 +1208,9 @@ static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi, /* * Deliver to user space multicast routing algorithms */ - ret = sock_queue_rcv_skb(net->ipv6.mroute6_sk, skb); + ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb); if (ret < 0) { - if (net_ratelimit()) - printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n"); + net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } @@ -873,26 +1222,28 @@ static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi, */ static int -ip6mr_cache_unresolved(struct net *net, mifi_t mifi, struct sk_buff *skb) +ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) { + bool found = false; int err; struct mfc6_cache *c; spin_lock_bh(&mfc_unres_lock); - for (c = mfc_unres_queue; c; c = c->next) { - if (net_eq(mfc6_net(c), net) && - ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && - ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) + list_for_each_entry(c, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && + ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { + found = true; break; + } } - if (c == NULL) { + if (!found) { /* * Create a new entry if allowable */ - if (atomic_read(&net->ipv6.cache_resolve_queue_len) >= 10 || - (c = ip6mr_cache_alloc_unres(net)) == NULL) { + if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || + (c = ip6mr_cache_alloc_unres()) == NULL) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); @@ -909,7 +1260,7 @@ ip6mr_cache_unresolved(struct net *net, mifi_t mifi, struct sk_buff *skb) /* * Reflect first query at pim6sd */ - err = ip6mr_cache_report(net, skb, mifi, MRT6MSG_NOCACHE); + err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker @@ -921,11 +1272,11 @@ ip6mr_cache_unresolved(struct net *net, mifi_t mifi, struct sk_buff *skb) return err; } - atomic_inc(&net->ipv6.cache_resolve_queue_len); - c->next = mfc_unres_queue; - mfc_unres_queue = c; + atomic_inc(&mrt->cache_resolve_queue_len); + list_add(&c->list, &mrt->mfc6_unres_queue); + mr6_netlink_event(mrt, c, RTM_NEWROUTE); - ipmr_do_expire_process(1); + ipmr_do_expire_process(mrt); } /* @@ -947,21 +1298,24 @@ ip6mr_cache_unresolved(struct net *net, mifi_t mifi, struct sk_buff *skb) * MFC6 cache manipulation by user space */ -static int ip6mr_mfc_delete(struct net *net, struct mf6cctl *mfc) +static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, + int parent) { int line; - struct mfc6_cache *c, **cp; + struct mfc6_cache *c, *next; line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); - for (cp = &net->ipv6.mfc6_cache_array[line]; - (c = *cp) != NULL; cp = &c->next) { + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) { if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) { + ipv6_addr_equal(&c->mf6c_mcastgrp, + &mfc->mf6cc_mcastgrp.sin6_addr) && + (parent == -1 || parent == c->mf6c_parent)) { write_lock_bh(&mrt_lock); - *cp = c->next; + list_del(&c->list); write_unlock_bh(&mrt_lock); + mr6_netlink_event(mrt, c, RTM_DELROUTE); ip6mr_cache_free(c); return 0; } @@ -972,8 +1326,9 @@ static int ip6mr_mfc_delete(struct net *net, struct mf6cctl *mfc) static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct mr6_table *mrt; struct mif_device *v; int ct; LIST_HEAD(list); @@ -981,10 +1336,12 @@ static int ip6mr_device_event(struct notifier_block *this, if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - v = &net->ipv6.vif6_table[0]; - for (ct = 0; ct < net->ipv6.maxvif; ct++, v++) { - if (v->dev == dev) - mif6_delete(net, ct, &list); + ip6mr_for_each_table(mrt, net) { + v = &mrt->vif6_table[0]; + for (ct = 0; ct < mrt->maxvif; ct++, v++) { + if (v->dev == dev) + mif6_delete(mrt, ct, &list); + } } unregister_netdevice_many(&list); @@ -1001,44 +1358,28 @@ static struct notifier_block ip6_mr_notifier = { static int __net_init ip6mr_net_init(struct net *net) { - int err = 0; - net->ipv6.vif6_table = kcalloc(MAXMIFS, sizeof(struct mif_device), - GFP_KERNEL); - if (!net->ipv6.vif6_table) { - err = -ENOMEM; - goto fail; - } - - /* Forwarding cache */ - net->ipv6.mfc6_cache_array = kcalloc(MFC6_LINES, - sizeof(struct mfc6_cache *), - GFP_KERNEL); - if (!net->ipv6.mfc6_cache_array) { - err = -ENOMEM; - goto fail_mfc6_cache; - } + int err; -#ifdef CONFIG_IPV6_PIMSM_V2 - net->ipv6.mroute_reg_vif_num = -1; -#endif + err = ip6mr_rules_init(net); + if (err < 0) + goto fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; - if (!proc_net_fops_create(net, "ip6_mr_vif", 0, &ip6mr_vif_fops)) + if (!proc_create("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_fops)) goto proc_vif_fail; - if (!proc_net_fops_create(net, "ip6_mr_cache", 0, &ip6mr_mfc_fops)) + if (!proc_create("ip6_mr_cache", 0, net->proc_net, &ip6mr_mfc_fops)) goto proc_cache_fail; #endif + return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: - proc_net_remove(net, "ip6_mr_vif"); + remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: - kfree(net->ipv6.mfc6_cache_array); + ip6mr_rules_exit(net); #endif -fail_mfc6_cache: - kfree(net->ipv6.vif6_table); fail: return err; } @@ -1046,12 +1387,10 @@ fail: static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(net, "ip6_mr_cache"); - proc_net_remove(net, "ip6_mr_vif"); + remove_proc_entry("ip6_mr_cache", net->proc_net); + remove_proc_entry("ip6_mr_vif", net->proc_net); #endif - mroute_clean_tables(net); - kfree(net->ipv6.mfc6_cache_array); - kfree(net->ipv6.vif6_table); + ip6mr_rules_exit(net); } static struct pernet_operations ip6mr_net_ops = { @@ -1074,24 +1413,24 @@ int __init ip6_mr_init(void) if (err) goto reg_pernet_fail; - setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { - printk(KERN_ERR "ip6_mr_init: can't add PIM protocol\n"); + pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif + rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, + ip6mr_rtm_dumproute, NULL); return 0; #ifdef CONFIG_IPV6_PIMSM_V2 add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: - del_timer(&ipmr_expire_timer); unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); @@ -1101,18 +1440,22 @@ reg_pernet_fail: void ip6_mr_cleanup(void) { unregister_netdevice_notifier(&ip6_mr_notifier); - del_timer(&ipmr_expire_timer); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } -static int ip6mr_mfc_add(struct net *net, struct mf6cctl *mfc, int mrtsock) +static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, + struct mf6cctl *mfc, int mrtsock, int parent) { + bool found = false; int line; - struct mfc6_cache *uc, *c, **cp; + struct mfc6_cache *uc, *c; unsigned char ttls[MAXMIFS]; int i; + if (mfc->mf6cc_parent >= MAXMIFS) + return -ENFILE; + memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) @@ -1122,65 +1465,70 @@ static int ip6mr_mfc_add(struct net *net, struct mf6cctl *mfc, int mrtsock) line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); - for (cp = &net->ipv6.mfc6_cache_array[line]; - (c = *cp) != NULL; cp = &c->next) { + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) + ipv6_addr_equal(&c->mf6c_mcastgrp, + &mfc->mf6cc_mcastgrp.sin6_addr) && + (parent == -1 || parent == mfc->mf6cc_parent)) { + found = true; break; + } } - if (c != NULL) { + if (found) { write_lock_bh(&mrt_lock); c->mf6c_parent = mfc->mf6cc_parent; - ip6mr_update_thresholds(c, ttls); + ip6mr_update_thresholds(mrt, c, ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } - if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) + if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && + !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; - c = ip6mr_cache_alloc(net); + c = ip6mr_cache_alloc(); if (c == NULL) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->mf6c_parent = mfc->mf6cc_parent; - ip6mr_update_thresholds(c, ttls); + ip6mr_update_thresholds(mrt, c, ttls); if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_lock_bh(&mrt_lock); - c->next = net->ipv6.mfc6_cache_array[line]; - net->ipv6.mfc6_cache_array[line] = c; + list_add(&c->list, &mrt->mfc6_cache_array[line]); write_unlock_bh(&mrt_lock); /* * Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ + found = false; spin_lock_bh(&mfc_unres_lock); - for (cp = &mfc_unres_queue; (uc = *cp) != NULL; - cp = &uc->next) { - if (net_eq(mfc6_net(uc), net) && - ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && + list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { - *cp = uc->next; - atomic_dec(&net->ipv6.cache_resolve_queue_len); + list_del(&uc->list); + atomic_dec(&mrt->cache_resolve_queue_len); + found = true; break; } } - if (mfc_unres_queue == NULL) - del_timer(&ipmr_expire_timer); + if (list_empty(&mrt->mfc6_unres_queue)) + del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); - if (uc) { - ip6mr_cache_resolve(uc, c); + if (found) { + ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } + mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1188,17 +1536,18 @@ static int ip6mr_mfc_add(struct net *net, struct mf6cctl *mfc, int mrtsock) * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct net *net) +static void mroute_clean_tables(struct mr6_table *mrt) { int i; LIST_HEAD(list); + struct mfc6_cache *c, *next; /* * Shut down all active vif entries */ - for (i = 0; i < net->ipv6.maxvif; i++) { - if (!(net->ipv6.vif6_table[i].flags & VIFF_STATIC)) - mif6_delete(net, i, &list); + for (i = 0; i < mrt->maxvif; i++) { + if (!(mrt->vif6_table[i].flags & VIFF_STATIC)) + mif6_delete(mrt, i, &list); } unregister_netdevice_many(&list); @@ -1206,49 +1555,42 @@ static void mroute_clean_tables(struct net *net) * Wipe the cache */ for (i = 0; i < MFC6_LINES; i++) { - struct mfc6_cache *c, **cp; - - cp = &net->ipv6.mfc6_cache_array[i]; - while ((c = *cp) != NULL) { - if (c->mfc_flags & MFC_STATIC) { - cp = &c->next; + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { + if (c->mfc_flags & MFC_STATIC) continue; - } write_lock_bh(&mrt_lock); - *cp = c->next; + list_del(&c->list); write_unlock_bh(&mrt_lock); + mr6_netlink_event(mrt, c, RTM_DELROUTE); ip6mr_cache_free(c); } } - if (atomic_read(&net->ipv6.cache_resolve_queue_len) != 0) { - struct mfc6_cache *c, **cp; - + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); - cp = &mfc_unres_queue; - while ((c = *cp) != NULL) { - if (!net_eq(mfc6_net(c), net)) { - cp = &c->next; - continue; - } - *cp = c->next; - ip6mr_destroy_unres(c); + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_del(&c->list); + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_destroy_unres(mrt, c); } spin_unlock_bh(&mfc_unres_lock); } } -static int ip6mr_sk_init(struct sock *sk) +static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); write_lock_bh(&mrt_lock); - if (likely(net->ipv6.mroute6_sk == NULL)) { - net->ipv6.mroute6_sk = sk; + if (likely(mrt->mroute6_sk == NULL)) { + mrt->mroute6_sk = sk; net->ipv6.devconf_all->mc_forwarding++; + inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); } else err = -EADDRINUSE; @@ -1261,24 +1603,47 @@ static int ip6mr_sk_init(struct sock *sk) int ip6mr_sk_done(struct sock *sk) { - int err = 0; + int err = -EACCES; struct net *net = sock_net(sk); + struct mr6_table *mrt; rtnl_lock(); - if (sk == net->ipv6.mroute6_sk) { - write_lock_bh(&mrt_lock); - net->ipv6.mroute6_sk = NULL; - net->ipv6.devconf_all->mc_forwarding--; - write_unlock_bh(&mrt_lock); + ip6mr_for_each_table(mrt, net) { + if (sk == mrt->mroute6_sk) { + write_lock_bh(&mrt_lock); + mrt->mroute6_sk = NULL; + net->ipv6.devconf_all->mc_forwarding--; + inet6_netconf_notify_devconf(net, + NETCONFA_MC_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + write_unlock_bh(&mrt_lock); - mroute_clean_tables(net); - } else - err = -EACCES; + mroute_clean_tables(mrt); + err = 0; + break; + } + } rtnl_unlock(); return err; } +struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +{ + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, + .flowi6_oif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + return NULL; + + return mrt->mroute6_sk; +} + /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately @@ -1288,14 +1653,19 @@ int ip6mr_sk_done(struct sock *sk) int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) { - int ret; + int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; if (optname != MRT6_INIT) { - if (sk != net->ipv6.mroute6_sk && !capable(CAP_NET_ADMIN)) + if (sk != mrt->mroute6_sk && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } @@ -1307,7 +1677,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (optlen < sizeof(int)) return -EINVAL; - return ip6mr_sk_init(sk); + return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); @@ -1320,7 +1690,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); - ret = mif6_add(net, &vif, sk == net->ipv6.mroute6_sk); + ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk); rtnl_unlock(); return ret; @@ -1330,7 +1700,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (copy_from_user(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); - ret = mif6_delete(net, mifi, NULL); + ret = mif6_delete(mrt, mifi, NULL); rtnl_unlock(); return ret; @@ -1340,16 +1710,21 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns */ case MRT6_ADD_MFC: case MRT6_DEL_MFC: + parent = -1; + case MRT6_ADD_MFC_PROXY: + case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_user(&mfc, optval, sizeof(mfc))) return -EFAULT; + if (parent == 0) + parent = mfc.mf6cc_parent; rtnl_lock(); - if (optname == MRT6_DEL_MFC) - ret = ip6mr_mfc_delete(net, &mfc); + if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) + ret = ip6mr_mfc_delete(mrt, &mfc, parent); else - ret = ip6mr_mfc_add(net, &mfc, - sk == net->ipv6.mroute6_sk); + ret = ip6mr_mfc_add(net, mrt, &mfc, + sk == mrt->mroute6_sk, parent); rtnl_unlock(); return ret; @@ -1359,9 +1734,12 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_ASSERT: { int v; + + if (optlen != sizeof(v)) + return -EINVAL; if (get_user(v, (int __user *)optval)) return -EFAULT; - net->ipv6.mroute_do_assert = !!v; + mrt->mroute_do_assert = v; return 0; } @@ -1369,20 +1747,47 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_PIM: { int v; + + if (optlen != sizeof(v)) + return -EINVAL; if (get_user(v, (int __user *)optval)) return -EFAULT; v = !!v; rtnl_lock(); ret = 0; - if (v != net->ipv6.mroute_do_pim) { - net->ipv6.mroute_do_pim = v; - net->ipv6.mroute_do_assert = v; + if (v != mrt->mroute_do_pim) { + mrt->mroute_do_pim = v; + mrt->mroute_do_assert = v; } rtnl_unlock(); return ret; } #endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + case MRT6_TABLE: + { + u32 v; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(v, (u32 __user *)optval)) + return -EFAULT; + /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ + if (v != RT_TABLE_DEFAULT && v >= 100000000) + return -EINVAL; + if (sk == mrt->mroute6_sk) + return -EBUSY; + + rtnl_lock(); + ret = 0; + if (!ip6mr_new_table(net, v)) + ret = -ENOMEM; + raw6_sk(sk)->ip6mr_table = v; + rtnl_unlock(); + return ret; + } +#endif /* * Spurious command, or MRT6_VERSION which you cannot * set. @@ -1402,6 +1807,11 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int olr; int val; struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; switch (optname) { case MRT6_VERSION: @@ -1409,11 +1819,11 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: - val = net->ipv6.mroute_do_pim; + val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: - val = net->ipv6.mroute_do_assert; + val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; @@ -1444,16 +1854,21 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) struct mif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; - if (vr.mifi >= net->ipv6.maxvif) + if (vr.mifi >= mrt->maxvif) return -EINVAL; read_lock(&mrt_lock); - vif = &net->ipv6.vif6_table[vr.mifi]; - if (MIF_EXISTS(net, vr.mifi)) { + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -1471,7 +1886,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) return -EFAULT; read_lock(&mrt_lock); - c = ip6mr_cache_find(net, &sr.src.sin6_addr, &sr.grp.sin6_addr); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; @@ -1489,11 +1904,87 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) } } +#ifdef CONFIG_COMPAT +struct compat_sioc_sg_req6 { + struct sockaddr_in6 src; + struct sockaddr_in6 grp; + compat_ulong_t pktcnt; + compat_ulong_t bytecnt; + compat_ulong_t wrong_if; +}; + +struct compat_sioc_mif_req6 { + mifi_t mifi; + compat_ulong_t icount; + compat_ulong_t ocount; + compat_ulong_t ibytes; + compat_ulong_t obytes; +}; + +int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + struct compat_sioc_sg_req6 sr; + struct compat_sioc_mif_req6 vr; + struct mif_device *vif; + struct mfc6_cache *c; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETMIFCNT_IN6: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.mifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT_IN6: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} +#endif static inline int ip6mr_forward2_finish(struct sk_buff *skb) { IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP6_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTOCTETS, skb->len); return dst_output(skb); } @@ -1501,14 +1992,14 @@ static inline int ip6mr_forward2_finish(struct sk_buff *skb) * Processing handlers for ip6mr_forward */ -static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi) +static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *c, int vifi) { struct ipv6hdr *ipv6h; - struct net *net = mfc6_net(c); - struct mif_device *vif = &net->ipv6.vif6_table[vifi]; + struct mif_device *vif = &mrt->vif6_table[vifi]; struct net_device *dev; struct dst_entry *dst; - struct flowi fl; + struct flowi6 fl6; if (vif->dev == NULL) goto out_free; @@ -1519,23 +2010,23 @@ static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi) vif->bytes_out += skb->len; vif->dev->stats.tx_bytes += skb->len; vif->dev->stats.tx_packets++; - ip6mr_cache_report(net, skb, vifi, MRT6MSG_WHOLEPKT); + ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } #endif ipv6h = ipv6_hdr(skb); - fl = (struct flowi) { - .oif = vif->link, - .nl_u = { .ip6_u = - { .daddr = ipv6h->daddr, } - } + fl6 = (struct flowi6) { + .flowi6_oif = vif->link, + .daddr = ipv6h->daddr, }; - dst = ip6_route_output(net, NULL, &fl); - if (!dst) + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); goto out_free; + } skb_dst_drop(skb); skb_dst_set(skb, dst); @@ -1566,7 +2057,7 @@ static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi) IP6CB(skb)->flags |= IP6SKB_FORWARDED; - return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dev, + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dev, ip6mr_forward2_finish); out_free: @@ -1574,76 +2065,104 @@ out_free: return 0; } -static int ip6mr_find_vif(struct net_device *dev) +static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev) { - struct net *net = dev_net(dev); int ct; - for (ct = net->ipv6.maxvif - 1; ct >= 0; ct--) { - if (net->ipv6.vif6_table[ct].dev == dev) + + for (ct = mrt->maxvif - 1; ct >= 0; ct--) { + if (mrt->vif6_table[ct].dev == dev) break; } return ct; } -static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache) +static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache) { int psend = -1; int vif, ct; - struct net *net = mfc6_net(cache); + int true_vifi = ip6mr_find_vif(mrt, skb->dev); vif = cache->mf6c_parent; cache->mfc_un.res.pkt++; cache->mfc_un.res.bytes += skb->len; + if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { + struct mfc6_cache *cache_proxy; + + /* For an (*,G) entry, we only check that the incomming + * interface is part of the static tree. + */ + cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); + if (cache_proxy && + cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + goto forward; + } + /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (net->ipv6.vif6_table[vif].dev != skb->dev) { - int true_vifi; - + if (mrt->vif6_table[vif].dev != skb->dev) { cache->mfc_un.res.wrong_if++; - true_vifi = ip6mr_find_vif(skb->dev); - if (true_vifi >= 0 && net->ipv6.mroute_do_assert && + if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ - (net->ipv6.mroute_do_pim || + (mrt->mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { cache->mfc_un.res.last_assert = jiffies; - ip6mr_cache_report(net, skb, true_vifi, MRT6MSG_WRONGMIF); + ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); } goto dont_forward; } - net->ipv6.vif6_table[vif].pkt_in++; - net->ipv6.vif6_table[vif].bytes_in += skb->len; +forward: + mrt->vif6_table[vif].pkt_in++; + mrt->vif6_table[vif].bytes_in += skb->len; /* * Forward the frame */ + if (ipv6_addr_any(&cache->mf6c_origin) && + ipv6_addr_any(&cache->mf6c_mcastgrp)) { + if (true_vifi >= 0 && + true_vifi != cache->mf6c_parent && + ipv6_hdr(skb)->hop_limit > + cache->mfc_un.res.ttls[cache->mf6c_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = cache->mf6c_parent; + goto last_forward; + } + goto dont_forward; + } for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { - if (ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { + /* For (*,G) entry, don't forward to the incoming interface */ + if ((!ipv6_addr_any(&cache->mf6c_origin) || ct != true_vifi) && + ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ip6mr_forward2(skb2, cache, psend); + ip6mr_forward2(net, mrt, skb2, cache, psend); } psend = ct; } } +last_forward: if (psend != -1) { - ip6mr_forward2(skb, cache, psend); - return 0; + ip6mr_forward2(net, mrt, skb, cache, psend); + return; } dont_forward: kfree_skb(skb); - return 0; } @@ -1655,10 +2174,30 @@ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } read_lock(&mrt_lock); - cache = ip6mr_cache_find(net, + cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + if (cache == NULL) { + int vif = ip6mr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, + &ipv6_hdr(skb)->daddr, + vif); + } /* * No usable cache entry @@ -1666,9 +2205,9 @@ int ip6_mr_input(struct sk_buff *skb) if (cache == NULL) { int vif; - vif = ip6mr_find_vif(skb->dev); + vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) { - int err = ip6mr_cache_unresolved(net, vif, skb); + int err = ip6mr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); return err; @@ -1678,7 +2217,7 @@ int ip6_mr_input(struct sk_buff *skb) return -ENODEV; } - ip6_mr_forward(skb, cache); + ip6_mr_forward(net, mrt, skb, cache); read_unlock(&mrt_lock); @@ -1686,51 +2225,73 @@ int ip6_mr_input(struct sk_buff *skb) } -static int -ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm) +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm) { int ct; struct rtnexthop *nhp; - struct net *net = mfc6_net(c); - struct net_device *dev = net->ipv6.vif6_table[c->mf6c_parent].dev; - u8 *b = skb_tail_pointer(skb); - struct rtattr *mp_head; + struct nlattr *mp_attr; + struct rta_mfc_stats mfcs; - if (dev) - RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mf6c_parent >= MAXMIFS) + return -ENOENT; - mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); + if (MIF_EXISTS(mrt, c->mf6c_parent) && + nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) + return -EMSGSIZE; + mp_attr = nla_nest_start(skb, RTA_MULTIPATH); + if (mp_attr == NULL) + return -EMSGSIZE; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (c->mfc_un.res.ttls[ct] < 255) { - if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) - goto rtattr_failure; - nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); + if (nhp == NULL) { + nla_nest_cancel(skb, mp_attr); + return -EMSGSIZE; + } + nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = net->ipv6.vif6_table[ct].dev->ifindex; + nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } - mp_head->rta_type = RTA_MULTIPATH; - mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + + nla_nest_end(skb, mp_attr); + + mfcs.mfcs_packets = c->mfc_un.res.pkt; + mfcs.mfcs_bytes = c->mfc_un.res.bytes; + mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0) + return -EMSGSIZE; + rtm->rtm_type = RTN_MULTICAST; return 1; - -rtattr_failure: - nlmsg_trim(skb, b); - return -EMSGSIZE; } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, int nowait) { int err; + struct mr6_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + read_lock(&mrt_lock); - cache = ip6mr_cache_find(net, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); + cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); + if (!cache && skb->dev) { + int vif = ip6mr_find_vif(mrt, skb->dev); + + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, + vif); + } if (!cache) { struct sk_buff *skb2; @@ -1744,7 +2305,7 @@ int ip6mr_get_route(struct net *net, } dev = skb->dev; - if (dev == NULL || (vif = ip6mr_find_vif(dev)) < 0) { + if (dev == NULL || (vif = ip6mr_find_vif(mrt, dev)) < 0) { read_unlock(&mrt_lock); return -ENODEV; } @@ -1770,10 +2331,10 @@ int ip6mr_get_route(struct net *net, iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; - ipv6_addr_copy(&iph->saddr, &rt->rt6i_src.addr); - ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr); + iph->saddr = rt->rt6i_src.addr; + iph->daddr = rt->rt6i_dst.addr; - err = ip6mr_cache_unresolved(net, vif, skb2); + err = ip6mr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); return err; @@ -1782,8 +2343,161 @@ int ip6mr_get_route(struct net *net, if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; - err = ip6mr_fill_mroute(skb, cache, rtm); + err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); return err; } +static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mfc6_cache *c, int cmd, + int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + int err; + + nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = RTNL_FAMILY_IP6MR; + rtm->rtm_dst_len = 128; + rtm->rtm_src_len = 128; + rtm->rtm_tos = 0; + rtm->rtm_table = mrt->id; + if (nla_put_u32(skb, RTA_TABLE, mrt->id)) + goto nla_put_failure; + rtm->rtm_type = RTN_MULTICAST; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + if (c->mfc_flags & MFC_STATIC) + rtm->rtm_protocol = RTPROT_STATIC; + else + rtm->rtm_protocol = RTPROT_MROUTED; + rtm->rtm_flags = 0; + + if (nla_put(skb, RTA_SRC, 16, &c->mf6c_origin) || + nla_put(skb, RTA_DST, 16, &c->mf6c_mcastgrp)) + goto nla_put_failure; + err = __ip6mr_fill_mroute(mrt, skb, c, rtm); + /* do not break the dump if cache is unresolved */ + if (err < 0 && err != -ENOENT) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mr6_msgsize(bool unresolved, int maxvif) +{ + size_t len = + NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(4) /* RTA_TABLE */ + + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ + ; + + if (!unresolved) + len = len + + nla_total_size(4) /* RTA_IIF */ + + nla_total_size(0) /* RTA_MULTIPATH */ + + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) + /* RTA_MFC_STATS */ + + nla_total_size(sizeof(struct rta_mfc_stats)) + ; + + return len; +} + +static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, + int cmd) +{ + struct net *net = read_pnet(&mrt->net); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(mr6_msgsize(mfc->mf6c_parent >= MAXMIFS, mrt->maxvif), + GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); + if (err < 0) + goto errout; + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); + return; + +errout: + kfree_skb(skb); + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); +} + +static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mr6_table *mrt; + struct mfc6_cache *mfc; + unsigned int t = 0, s_t; + unsigned int h = 0, s_h; + unsigned int e = 0, s_e; + + s_t = cb->args[0]; + s_h = cb->args[1]; + s_e = cb->args[2]; + + read_lock(&mrt_lock); + ip6mr_for_each_table(mrt, net) { + if (t < s_t) + goto next_table; + if (t > s_t) + s_h = 0; + for (h = s_h; h < MFC6_LINES; h++) { + list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) { + if (e < s_e) + goto next_entry; + if (ip6mr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) + goto done; +next_entry: + e++; + } + e = s_e = 0; + } + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) { + if (e < s_e) + goto next_entry2; + if (ip6mr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) { + spin_unlock_bh(&mfc_unres_lock); + goto done; + } +next_entry2: + e++; + } + spin_unlock_bh(&mfc_unres_lock); + e = s_e = 0; + s_h = 0; +next_table: + t++; + } +done: + read_unlock(&mrt_lock); + + cb->args[2] = e; + cb->args[1] = h; + cb->args[0] = t; + + return skb->len; +} diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 2f2a5ca2c87..d1c793cffcb 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -16,8 +16,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ /* * [Memo] @@ -30,6 +29,9 @@ * The decompression of IP datagram MUST be done after the reassembly, * AH/ESP processing. */ + +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/module.h> #include <net/ip.h> #include <net/xfrm.h> @@ -43,6 +45,7 @@ #include <linux/list.h> #include <linux/vmalloc.h> #include <linux/rtnetlink.h> +#include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> @@ -50,38 +53,46 @@ #include <linux/icmpv6.h> #include <linux/mutex.h> -static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { + struct net *net = dev_net(skb->dev); __be32 spi; - struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct ip_comp_hdr *ipcomph = (struct ip_comp_hdr *)(skb->data + offset); struct xfrm_state *x; - if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) - return; + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; spi = htonl(ntohs(ipcomph->cpi)); - x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET6); if (!x) - return; + return 0; - printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI6\n", - spi, &iph->daddr); + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0); + else + ip6_update_pmtu(skb, net, info, 0, 0); xfrm_state_put(x); + + return 0; } static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { + struct net *net = xs_net(x); struct xfrm_state *t = NULL; - t = xfrm_state_alloc(&init_net); + t = xfrm_state_alloc(net); if (!t) goto out; t->id.proto = IPPROTO_IPV6; - t->id.spi = xfrm6_tunnel_alloc_spi((xfrm_address_t *)&x->props.saddr); + t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); if (!t->id.spi) goto error; @@ -90,6 +101,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t->props.family = AF_INET6; t->props.mode = x->props.mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); + memcpy(&t->mark, &x->mark, sizeof(t->mark)); if (xfrm_init_state(t)) goto error; @@ -108,13 +120,15 @@ error: static int ipcomp6_tunnel_attach(struct xfrm_state *x) { + struct net *net = xs_net(x); int err = 0; struct xfrm_state *t = NULL; __be32 spi; + u32 mark = x->mark.m & x->mark.v; - spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&x->props.saddr); + spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr); if (spi) - t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr, + t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr, spi, IPPROTO_IPV6, AF_INET6); if (!t) { t = ipcomp6_tunnel_create(x); @@ -154,16 +168,17 @@ static int ipcomp6_init_state(struct xfrm_state *x) if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp6_tunnel_attach(x); if (err) - goto error_tunnel; + goto out; } err = 0; out: return err; -error_tunnel: - ipcomp_destroy(x); +} - goto out; +static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) +{ + return 0; } static const struct xfrm_type ipcomp6_type = @@ -178,21 +193,22 @@ static const struct xfrm_type ipcomp6_type = .hdr_offset = xfrm6_find_1stfragopt, }; -static const struct inet6_protocol ipcomp6_protocol = +static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, + .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, - .flags = INET6_PROTO_NOPOLICY, + .priority = 0, }; static int __init ipcomp6_init(void) { if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { - printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n"); + pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } - if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { - printk(KERN_INFO "ipcomp6 init: can't add protocol\n"); + if (xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp6_type, AF_INET6); return -EAGAIN; } @@ -201,10 +217,10 @@ static int __init ipcomp6_init(void) static void __exit ipcomp6_fini(void) { - if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) - printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n"); + if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) + pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) - printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n"); + pr_info("%s: can't remove xfrm type\n", __func__); } module_init(ipcomp6_init); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 430454ee5ea..edb58aff4ae 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -36,6 +36,7 @@ #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> @@ -54,8 +55,6 @@ #include <asm/uaccess.h> -DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly; - struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); @@ -113,9 +112,9 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, } opt = xchg(&inet6_sk(sk)->opt, opt); } else { - write_lock(&sk->sk_dst_lock); + spin_lock(&sk->sk_dst_lock); opt = xchg(&inet6_sk(sk)->opt, opt); - write_unlock(&sk->sk_dst_lock); + spin_unlock(&sk->sk_dst_lock); } sk_dst_reset(sk); @@ -175,7 +174,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, } if (ipv6_only_sock(sk) || - !ipv6_addr_v4mapped(&np->daddr)) { + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } @@ -336,6 +335,33 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_RECVPATHMTU: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxpmtu = valbool; + retv = 0; + break; + + case IPV6_TRANSPARENT: + if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) && + !ns_capable(net->user_ns, CAP_NET_RAW)) { + retv = -EPERM; + break; + } + if (optlen < sizeof(int)) + goto e_inval; + /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ + inet_sk(sk)->transparent = valbool; + retv = 0; + break; + + case IPV6_RECVORIGDSTADDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxorigdstaddr = valbool; + retv = 0; + break; + case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: @@ -356,7 +382,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, /* hop-by-hop / destination options are privileged option */ retv = -EPERM; - if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW)) + if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; opt = ipv6_renew_options(sk, np->opt, optname, @@ -372,7 +398,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) @@ -410,7 +436,7 @@ sticky_done: goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; - ipv6_addr_copy(&np->sticky_pktinfo.ipi6_addr, &pkt.ipi6_addr); + np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } @@ -419,12 +445,12 @@ sticky_done: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; - struct flowi fl; + struct flowi6 fl6; int junk; - fl.fl6_flowlabel = 0; - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; @@ -450,7 +476,8 @@ sticky_done: msg.msg_controllen = optlen; msg.msg_control = (void*)(opt+1); - retv = datagram_send_ctl(net, &msg, &fl, opt, &junk, &junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, + &junk, &junk); if (retv) goto done; update: @@ -477,7 +504,7 @@ done: goto e_inval; if (val > 255 || val < -1) goto e_inval; - np->mcast_hops = val; + np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); retv = 0; break; @@ -490,6 +517,36 @@ done: retv = 0; break; + case IPV6_UNICAST_IF: + { + struct net_device *dev = NULL; + int ifindex; + + if (optlen != sizeof(int)) + goto e_inval; + + ifindex = (__force int)ntohl((__force __be32)val); + if (ifindex == 0) { + np->ucast_oif = 0; + retv = 0; + break; + } + + dev = dev_get_by_index(net, ifindex); + retv = -EADDRNOTAVAIL; + if (!dev) + break; + dev_put(dev); + + retv = -EINVAL; + if (sk->sk_bound_dev_if) + break; + + np->ucast_oif = ifindex; + retv = 0; + break; + } + case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) break; @@ -622,7 +679,6 @@ done: } case MCAST_MSFILTER: { - extern int sysctl_mld_max_msf; struct group_filter *gsf; if (optlen < GROUP_FILTER_SIZE(0)) @@ -666,7 +722,7 @@ done: case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) goto e_inval; - if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) goto e_inval; np->pmtudisc = val; retv = 0; @@ -699,7 +755,7 @@ done: case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; @@ -766,6 +822,18 @@ pref_skip_coa: break; } + case IPV6_MINHOPCOUNT: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + np->min_hopcount = val; + retv = 0; + break; + case IPV6_DONTFRAG: + np->dontfrag = valbool; + retv = 0; + break; } release_sock(sk); @@ -876,7 +944,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, } static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen, unsigned int flags) { struct ipv6_pinfo *np = inet6_sk(sk); int len; @@ -925,7 +993,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, msg.msg_control = optval; msg.msg_controllen = len; - msg.msg_flags = 0; + msg.msg_flags = flags; lock_sock(sk); skb = np->pktoptions; @@ -934,35 +1002,42 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, release_sock(sk); if (skb) { - int err = datagram_recv_ctl(sk, &msg, skb); + ip6_datagram_recv_ctl(sk, &msg, skb); kfree_skb(skb); - if (err) - return err; } else { if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : np->sticky_pktinfo.ipi6_ifindex; - np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) : - ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr)); + src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = np->mcast_hops; put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } + if (np->rxopt.bits.rxtclass) { + int tclass = (int)ip6_tclass(np->rcv_flowinfo); + + put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); + } if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : np->sticky_pktinfo.ipi6_ifindex; - np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) : - ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr)); + src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : + np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = np->mcast_hops; put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } + if (np->rxopt.bits.rxflow) { + __be32 flowinfo = np->rcv_flowinfo; + + put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + } } len -= msg.msg_controllen; return put_user(len, optlen); @@ -970,14 +1045,13 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_MTU: { struct dst_entry *dst; + val = 0; - lock_sock(sk); - dst = sk_dst_get(sk); - if (dst) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) val = dst_mtu(dst); - dst_release(dst); - } - release_sock(sk); + rcu_read_unlock(); if (!val) return -ENOTCONN; break; @@ -1055,6 +1129,46 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rxopt.bits.rxflow; break; + case IPV6_RECVPATHMTU: + val = np->rxopt.bits.rxpmtu; + break; + + case IPV6_PATHMTU: + { + struct dst_entry *dst; + struct ip6_mtuinfo mtuinfo; + + if (len < sizeof(mtuinfo)) + return -EINVAL; + + len = sizeof(mtuinfo); + memset(&mtuinfo, 0, sizeof(mtuinfo)); + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + mtuinfo.ip6m_mtu = dst_mtu(dst); + rcu_read_unlock(); + if (!mtuinfo.ip6m_mtu) + return -ENOTCONN; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &mtuinfo, len)) + return -EFAULT; + + return 0; + break; + } + + case IPV6_TRANSPARENT: + val = inet_sk(sk)->transparent; + break; + + case IPV6_RECVORIGDSTADDR: + val = np->rxopt.bits.rxorigdstaddr; + break; + case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { @@ -1065,12 +1179,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, else val = np->mcast_hops; - dst = sk_dst_get(sk); - if (dst) { - if (val < 0) + if (val < 0) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) val = ip6_dst_hoplimit(dst); - dst_release(dst); + rcu_read_unlock(); } + if (val < 0) val = sock_net(sk)->ipv6.devconf_all->hop_limit; break; @@ -1084,6 +1200,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->mcast_oif; break; + case IPV6_UNICAST_IF: + val = (__force int)htonl((__u32) np->ucast_oif); + break; + case IPV6_MTU_DISCOVER: val = np->pmtudisc; break; @@ -1096,6 +1216,37 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->sndflow; break; + case IPV6_FLOWLABEL_MGR: + { + struct in6_flowlabel_req freq; + int flags; + + if (len < sizeof(freq)) + return -EINVAL; + + if (copy_from_user(&freq, optval, sizeof(freq))) + return -EFAULT; + + if (freq.flr_action != IPV6_FL_A_GET) + return -EINVAL; + + len = sizeof(freq); + flags = freq.flr_flags; + + memset(&freq, 0, sizeof(freq)); + + val = ipv6_flowlabel_opt_get(sk, &freq, flags); + if (val < 0) + return val; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &freq, len)) + return -EFAULT; + + return 0; + } + case IPV6_ADDR_PREFERENCES: val = 0; @@ -1114,6 +1265,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val |= IPV6_PREFER_SRC_HOME; break; + case IPV6_MINHOPCOUNT: + val = np->min_hopcount; + break; + + case IPV6_DONTFRAG: + val = np->dontfrag; + break; + default: return -ENOPROTOOPT; } @@ -1136,7 +1295,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if(level != SOL_IPV6) return -ENOPROTOOPT; - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen); + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { @@ -1178,7 +1337,8 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, return compat_mc_getsockopt(sk, level, optname, optval, optlen, ipv6_getsockopt); - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen); + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, + MSG_CMSG_COMPAT); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 1f9c44442e6..617f0958e16 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -43,6 +43,9 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/pkt_sched.h> +#include <net/mld.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -70,60 +73,17 @@ #define MDBG(x) #endif -/* - * These header formats should be in a separate include file, but icmpv6.h - * doesn't have in6_addr defined in all cases, there is no __u128, and no - * other files reference these. - * - * +-DLS 4/14/03 - */ - -/* Multicast Listener Discovery version 2 headers */ - -struct mld2_grec { - __u8 grec_type; - __u8 grec_auxwords; - __be16 grec_nsrcs; - struct in6_addr grec_mca; - struct in6_addr grec_src[0]; -}; - -struct mld2_report { - __u8 type; - __u8 resv1; - __sum16 csum; - __be16 resv2; - __be16 ngrec; - struct mld2_grec grec[0]; -}; - -struct mld2_query { - __u8 type; - __u8 code; - __sum16 csum; - __be16 mrc; - __be16 resv1; - struct in6_addr mca; -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 qrv:3, - suppress:1, - resv2:4; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 resv2:4, - suppress:1, - qrv:3; -#else -#error "Please fix <asm/byteorder.h>" -#endif - __u8 qqic; - __be16 nsrcs; - struct in6_addr srcs[0]; +/* Ensure that we have struct in6_addr aligned on 32bit word. */ +static void *__mld2_query_bugs[] __attribute__((__unused__)) = { + BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4) }; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; /* Big mc list lock for all the sockets */ -static DEFINE_RWLOCK(ipv6_sk_mc_lock); +static DEFINE_SPINLOCK(ipv6_sk_mc_lock); static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); @@ -133,36 +93,30 @@ static void mld_gq_timer_expire(unsigned long data); static void mld_ifc_timer_expire(unsigned long data); static void mld_ifc_event(struct inet6_dev *idev); static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); -static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *addr); +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); static void mld_clear_delrec(struct inet6_dev *idev); +static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); -static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); -static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); - -#define IGMP6_UNSOLICITED_IVAL (10*HZ) #define MLD_QRV_DEFAULT 2 +/* RFC3810, 9.2. Query Interval */ +#define MLD_QI_DEFAULT (125 * HZ) +/* RFC3810, 9.3. Query Response Interval */ +#define MLD_QRI_DEFAULT (10 * HZ) -#define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \ - (idev)->cnf.force_mld_version == 1 || \ - ((idev)->mc_v1_seen && \ - time_before(jiffies, (idev)->mc_v1_seen))) - -#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) -#define MLDV2_EXP(thresh, nbmant, nbexp, value) \ - ((value) < (thresh) ? (value) : \ - ((MLDV2_MASK(value, nbmant) | (1<<(nbmant))) << \ - (MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp)))) - -#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) +/* RFC3810, 8.1 Query Version Distinctions */ +#define MLD_V1_QUERY_LEN 24 +#define MLD_V2_QUERY_LEN_MIN 28 #define IPV6_MLD_MAX_MSF 64 @@ -172,6 +126,23 @@ int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; * socket join on multicast group */ +#define for_each_pmc_rcu(np, pmc) \ + for (pmc = rcu_dereference(np->ipv6_mc_list); \ + pmc != NULL; \ + pmc = rcu_dereference(pmc->next)) + +static int unsolicited_report_interval(struct inet6_dev *idev) +{ + int iv; + + if (mld_in_v1_mode(idev)) + iv = idev->cnf.mldv1_unsolicited_report_interval; + else + iv = idev->cnf.mldv2_unsolicited_report_interval; + + return iv > 0 ? iv : 1; +} + int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct net_device *dev = NULL; @@ -183,15 +154,15 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (!ipv6_addr_is_multicast(addr)) return -EINVAL; - read_lock_bh(&ipv6_sk_mc_lock); - for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) { + rcu_read_lock(); + for_each_pmc_rcu(np, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { - read_unlock_bh(&ipv6_sk_mc_lock); + rcu_read_unlock(); return -EADDRINUSE; } } - read_unlock_bh(&ipv6_sk_mc_lock); + rcu_read_unlock(); mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); @@ -199,20 +170,21 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return -ENOMEM; mc_lst->next = NULL; - ipv6_addr_copy(&mc_lst->addr, addr); + mc_lst->addr = *addr; + rcu_read_lock(); if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(net, addr, NULL, 0, 0); if (rt) { - dev = rt->rt6i_dev; - dev_hold(dev); - dst_release(&rt->u.dst); + dev = rt->dst.dev; + ip6_rt_put(rt); } } else - dev = dev_get_by_index(net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (dev == NULL) { + rcu_read_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -229,17 +201,17 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = ipv6_dev_mc_inc(dev, addr); if (err) { + rcu_read_unlock(); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); - dev_put(dev); return err; } - write_lock_bh(&ipv6_sk_mc_lock); + spin_lock(&ipv6_sk_mc_lock); mc_lst->next = np->ipv6_mc_list; - np->ipv6_mc_list = mc_lst; - write_unlock_bh(&ipv6_sk_mc_lock); + rcu_assign_pointer(np->ipv6_mc_list, mc_lst); + spin_unlock(&ipv6_sk_mc_lock); - dev_put(dev); + rcu_read_unlock(); return 0; } @@ -250,76 +222,75 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_mc_socklist *mc_lst, **lnk; + struct ipv6_mc_socklist *mc_lst; + struct ipv6_mc_socklist __rcu **lnk; struct net *net = sock_net(sk); - write_lock_bh(&ipv6_sk_mc_lock); - for (lnk = &np->ipv6_mc_list; (mc_lst = *lnk) !=NULL ; lnk = &mc_lst->next) { + if (!ipv6_addr_is_multicast(addr)) + return -EINVAL; + + spin_lock(&ipv6_sk_mc_lock); + for (lnk = &np->ipv6_mc_list; + (mc_lst = rcu_dereference_protected(*lnk, + lockdep_is_held(&ipv6_sk_mc_lock))) !=NULL ; + lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { struct net_device *dev; *lnk = mc_lst->next; - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); - dev = dev_get_by_index(net, mc_lst->ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); if (dev != NULL) { - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get(dev); (void) ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) { + if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - in6_dev_put(idev); - } - dev_put(dev); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); - sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + rcu_read_unlock(); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); return 0; } } - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); return -EADDRNOTAVAIL; } -static struct inet6_dev *ip6_mc_find_dev(struct net *net, - struct in6_addr *group, - int ifindex) +/* called with rcu_read_lock() */ +static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, + const struct in6_addr *group, + int ifindex) { struct net_device *dev = NULL; struct inet6_dev *idev = NULL; if (ifindex == 0) { - struct rt6_info *rt; + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); - rt = rt6_lookup(net, group, NULL, 0, 0); if (rt) { - dev = rt->rt6i_dev; - dev_hold(dev); - dst_release(&rt->u.dst); + dev = rt->dst.dev; + ip6_rt_put(rt); } } else - dev = dev_get_by_index(net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (!dev) - goto nodev; - idev = in6_dev_get(dev); + return NULL; + idev = __in6_dev_get(dev); if (!idev) - goto release; + return NULL; read_lock_bh(&idev->lock); - if (idev->dead) - goto unlock_release; - + if (idev->dead) { + read_unlock_bh(&idev->lock); + return NULL; + } return idev; - -unlock_release: - read_unlock_bh(&idev->lock); - in6_dev_put(idev); -release: - dev_put(dev); -nodev: - return NULL; } void ipv6_sock_mc_close(struct sock *sk) @@ -328,31 +299,35 @@ void ipv6_sock_mc_close(struct sock *sk) struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); - write_lock_bh(&ipv6_sk_mc_lock); - while ((mc_lst = np->ipv6_mc_list) != NULL) { + if (!rcu_access_pointer(np->ipv6_mc_list)) + return; + + spin_lock(&ipv6_sk_mc_lock); + while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, + lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); - dev = dev_get_by_index(net, mc_lst->ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); if (dev) { - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get(dev); (void) ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) { + if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - in6_dev_put(idev); - } - dev_put(dev); } else (void) ip6_mc_leave_src(sk, mc_lst, NULL); + rcu_read_unlock(); - sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); - write_lock_bh(&ipv6_sk_mc_lock); + spin_lock(&ipv6_sk_mc_lock); } - write_unlock_bh(&ipv6_sk_mc_lock); + spin_unlock(&ipv6_sk_mc_lock); } int ip6_mc_source(int add, int omode, struct sock *sk, @@ -360,7 +335,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk, { struct in6_addr *source, *group; struct ipv6_mc_socklist *pmc; - struct net_device *dev; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; @@ -376,15 +350,16 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface); - if (!idev) + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface); + if (!idev) { + rcu_read_unlock(); return -ENODEV; - dev = idev->dev; + } err = -EADDRNOTAVAIL; - read_lock_bh(&ipv6_sk_mc_lock); - for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + for_each_pmc_rcu(inet6, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -407,7 +382,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, pmc->sfmode = omode; } - write_lock_bh(&pmc->sflock); + write_lock(&pmc->sflock); pmclocked = 1; psl = pmc->sflist; @@ -416,8 +391,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i=0; i<psl->sl_count; i++) { - rv = memcmp(&psl->sl_addr[i], source, - sizeof(struct in6_addr)); + rv = !ipv6_addr_equal(&psl->sl_addr[i], source); if (rv == 0) break; } @@ -467,12 +441,10 @@ int ip6_mc_source(int add, int omode, struct sock *sk, } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; i<psl->sl_count; i++) { - rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr)); - if (rv == 0) - break; + rv = !ipv6_addr_equal(&psl->sl_addr[i], source); + if (rv == 0) /* There is an error in the address. */ + goto done; } - if (rv == 0) /* address already there is an error */ - goto done; for (j=psl->sl_count-1; j>=i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = *source; @@ -482,11 +454,9 @@ int ip6_mc_source(int add, int omode, struct sock *sk, ip6_mc_add_src(idev, group, omode, 1, source, 1); done: if (pmclocked) - write_unlock_bh(&pmc->sflock); - read_unlock_bh(&ipv6_sk_mc_lock); + write_unlock(&pmc->sflock); read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); if (leavegroup) return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; @@ -494,9 +464,8 @@ done: int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) { - struct in6_addr *group; + const struct in6_addr *group; struct ipv6_mc_socklist *pmc; - struct net_device *dev; struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; @@ -512,21 +481,22 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - idev = ip6_mc_find_dev(net, group, gsf->gf_interface); + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - if (!idev) + if (!idev) { + rcu_read_unlock(); return -ENODEV; - dev = idev->dev; + } err = 0; - read_lock_bh(&ipv6_sk_mc_lock); if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { leavegroup = 1; goto done; } - for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + for_each_pmc_rcu(inet6, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -561,7 +531,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); } - write_lock_bh(&pmc->sflock); + write_lock(&pmc->sflock); psl = pmc->sflist; if (psl) { (void) ip6_mc_del_src(idev, group, pmc->sfmode, @@ -571,13 +541,11 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = gsf->gf_fmode; - write_unlock_bh(&pmc->sflock); + write_unlock(&pmc->sflock); err = 0; done: - read_unlock_bh(&ipv6_sk_mc_lock); read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; @@ -587,10 +555,9 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, struct group_filter __user *optval, int __user *optlen) { int err, i, count, copycount; - struct in6_addr *group; + const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct inet6_dev *idev; - struct net_device *dev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; struct net *net = sock_net(sk); @@ -600,12 +567,13 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev(net, group, gsf->gf_interface); + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - if (!idev) + if (!idev) { + rcu_read_unlock(); return -ENODEV; - - dev = idev->dev; + } err = -EADDRNOTAVAIL; /* @@ -614,7 +582,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, * so reading the list is safe. */ - for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + for_each_pmc_rcu(inet6, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) @@ -626,8 +594,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, psl = pmc->sflist; count = psl ? psl->sl_count : 0; read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; @@ -653,27 +620,26 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, return 0; done: read_unlock_bh(&idev->lock); - in6_dev_put(idev); - dev_put(dev); + rcu_read_unlock(); return err; } -int inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, - const struct in6_addr *src_addr) +bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, + const struct in6_addr *src_addr) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc; struct ip6_sf_socklist *psl; - int rv = 1; + bool rv = true; - read_lock(&ipv6_sk_mc_lock); - for (mc = np->ipv6_mc_list; mc; mc = mc->next) { + rcu_read_lock(); + for_each_pmc_rcu(np, mc) { if (ipv6_addr_equal(&mc->addr, mc_addr)) break; } if (!mc) { - read_unlock(&ipv6_sk_mc_lock); - return 1; + rcu_read_unlock(); + return true; } read_lock(&mc->sflock); psl = mc->sflist; @@ -687,12 +653,12 @@ int inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, break; } if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) - rv = 0; + rv = false; if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) - rv = 0; + rv = false; } read_unlock(&mc->sflock); - read_unlock(&ipv6_sk_mc_lock); + rcu_read_unlock(); return rv; } @@ -710,18 +676,22 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + return; + spin_lock_bh(&mc->mca_lock); if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) - dev_mc_add(dev, buf, dev->addr_len, 0); + dev_mc_add(dev, buf); } spin_unlock_bh(&mc->mca_lock); if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; - if (MLD_V1_SEEN(mc->idev)) { + if (mld_in_v1_mode(mc->idev)) { igmp6_join_group(mc); return; } @@ -736,11 +706,15 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < + IPV6_ADDR_SCOPE_LINKLOCAL) + return; + spin_lock_bh(&mc->mca_lock); if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) - dev_mc_delete(dev, buf, dev->addr_len, 0); + dev_mc_del(dev, buf); } if (mc->mca_flags & MAF_NOREPORT) @@ -793,18 +767,18 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) } spin_unlock_bh(&im->mca_lock); - write_lock_bh(&idev->mc_lock); + spin_lock_bh(&idev->mc_lock); pmc->next = idev->mc_tomb; idev->mc_tomb = pmc; - write_unlock_bh(&idev->mc_lock); + spin_unlock_bh(&idev->mc_lock); } -static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *pmca) +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) { struct ifmcaddr6 *pmc, *pmc_prev; struct ip6_sf_list *psf, *psf_next; - write_lock_bh(&idev->mc_lock); + spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) @@ -817,7 +791,8 @@ static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *pmca) else idev->mc_tomb = pmc->next; } - write_unlock_bh(&idev->mc_lock); + spin_unlock_bh(&idev->mc_lock); + if (pmc) { for (psf=pmc->mca_tomb; psf; psf=psf_next) { psf_next = psf->sf_next; @@ -832,10 +807,10 @@ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; - write_lock_bh(&idev->mc_lock); + spin_lock_bh(&idev->mc_lock); pmc = idev->mc_tomb; idev->mc_tomb = NULL; - write_unlock_bh(&idev->mc_lock); + spin_unlock_bh(&idev->mc_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; @@ -870,6 +845,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) struct ifmcaddr6 *mc; struct inet6_dev *idev; + /* we need to take a reference on idev */ idev = in6_dev_get(dev); if (idev == NULL) @@ -907,8 +883,8 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); - ipv6_addr_copy(&mc->mca_addr, addr); - mc->idev = idev; + mc->mca_addr = *addr; + mc->idev = idev; /* (reference taken) */ mc->mca_users = 1; /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; @@ -963,57 +939,33 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) { - struct inet6_dev *idev = in6_dev_get(dev); + struct inet6_dev *idev; int err; - if (!idev) - return -ENODEV; - - err = __ipv6_dev_mc_dec(idev, addr); + rcu_read_lock(); - in6_dev_put(idev); + idev = __in6_dev_get(dev); + if (!idev) + err = -ENODEV; + else + err = __ipv6_dev_mc_dec(idev, addr); + rcu_read_unlock(); return err; } /* - * identify MLD packets for MLD filter exceptions - */ -int ipv6_is_mld(struct sk_buff *skb, int nexthdr) -{ - struct icmp6hdr *pic; - - if (nexthdr != IPPROTO_ICMPV6) - return 0; - - if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) - return 0; - - pic = icmp6_hdr(skb); - - switch (pic->icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - return 1; - default: - break; - } - return 0; -} - -/* * check if the interface/address pair is valid */ -int ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, - const struct in6_addr *src_addr) +bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, + const struct in6_addr *src_addr) { struct inet6_dev *idev; struct ifmcaddr6 *mc; - int rv = 0; + bool rv = false; - idev = in6_dev_get(dev); + rcu_read_lock(); + idev = __in6_dev_get(dev); if (idev) { read_lock_bh(&idev->lock); for (mc = idev->mc_list; mc; mc=mc->next) { @@ -1037,31 +989,59 @@ int ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; spin_unlock_bh(&mc->mca_lock); } else - rv = 1; /* don't filter unspecified source */ + rv = true; /* don't filter unspecified source */ } read_unlock_bh(&idev->lock); - in6_dev_put(idev); } + rcu_read_unlock(); return rv; } static void mld_gq_start_timer(struct inet6_dev *idev) { - int tv = net_random() % idev->mc_maxdelay; + unsigned long tv = prandom_u32() % idev->mc_maxdelay; idev->mc_gq_running = 1; if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) in6_dev_hold(idev); } -static void mld_ifc_start_timer(struct inet6_dev *idev, int delay) +static void mld_gq_stop_timer(struct inet6_dev *idev) +{ + idev->mc_gq_running = 0; + if (del_timer(&idev->mc_gq_timer)) + __in6_dev_put(idev); +} + +static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay) { - int tv = net_random() % delay; + unsigned long tv = prandom_u32() % delay; if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) in6_dev_hold(idev); } +static void mld_ifc_stop_timer(struct inet6_dev *idev) +{ + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); +} + +static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay) +{ + unsigned long tv = prandom_u32() % delay; + + if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +static void mld_dad_stop_timer(struct inet6_dev *idev) +{ + if (del_timer(&idev->mc_dad_timer)) + __in6_dev_put(idev); +} + /* * IGMP handling (alias multicast ICMPv6 messages) */ @@ -1080,12 +1060,9 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) delay = ma->mca_timer.expires - jiffies; } - if (delay >= resptime) { - if (resptime) - delay = net_random() % resptime; - else - delay = 1; - } + if (delay >= resptime) + delay = prandom_u32() % resptime; + ma->mca_timer.expires = jiffies + delay; if (!mod_timer(&ma->mca_timer, jiffies + delay)) atomic_inc(&ma->mca_refcnt); @@ -1093,8 +1070,8 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) } /* mark EXCLUDE-mode sources */ -static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, - struct in6_addr *srcs) +static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; @@ -1105,10 +1082,10 @@ static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, break; for (i=0; i<nsrcs; i++) { /* skip inactive filters */ - if (pmc->mca_sfcount[MCAST_INCLUDE] || + if (psf->sf_count[MCAST_INCLUDE] || pmc->mca_sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) - continue; + break; if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { scount++; break; @@ -1117,12 +1094,12 @@ static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, } pmc->mca_flags &= ~MAF_GSQUERY; if (scount == nsrcs) /* all sources excluded */ - return 0; - return 1; + return false; + return true; } -static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, - struct in6_addr *srcs) +static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; @@ -1146,23 +1123,176 @@ static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, } if (!scount) { pmc->mca_flags &= ~MAF_GSQUERY; - return 0; + return false; } pmc->mca_flags |= MAF_GSQUERY; - return 1; + return true; +} + +static int mld_force_mld_version(const struct inet6_dev *idev) +{ + /* Normally, both are 0 here. If enforcement to a particular is + * being used, individual device enforcement will have a lower + * precedence over 'all' device (.../conf/all/force_mld_version). + */ + + if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0) + return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version; + else + return idev->cnf.force_mld_version; +} + +static bool mld_in_v2_mode_only(const struct inet6_dev *idev) +{ + return mld_force_mld_version(idev) == 2; +} + +static bool mld_in_v1_mode_only(const struct inet6_dev *idev) +{ + return mld_force_mld_version(idev) == 1; +} + +static bool mld_in_v1_mode(const struct inet6_dev *idev) +{ + if (mld_in_v2_mode_only(idev)) + return false; + if (mld_in_v1_mode_only(idev)) + return true; + if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen)) + return true; + + return false; } +static void mld_set_v1_mode(struct inet6_dev *idev) +{ + /* RFC3810, relevant sections: + * - 9.1. Robustness Variable + * - 9.2. Query Interval + * - 9.3. Query Response Interval + * - 9.12. Older Version Querier Present Timeout + */ + unsigned long switchback; + + switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri; + + idev->mc_v1_seen = jiffies + switchback; +} + +static void mld_update_qrv(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.8. QRV (Querier's Robustness Variable) + * - 9.1. Robustness Variable + */ + + /* The value of the Robustness Variable MUST NOT be zero, + * and SHOULD NOT be one. Catch this here if we ever run + * into such a case in future. + */ + WARN_ON(idev->mc_qrv == 0); + + if (mlh2->mld2q_qrv > 0) + idev->mc_qrv = mlh2->mld2q_qrv; + + if (unlikely(idev->mc_qrv < 2)) { + net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n", + idev->mc_qrv, MLD_QRV_DEFAULT); + idev->mc_qrv = MLD_QRV_DEFAULT; + } +} + +static void mld_update_qi(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.9. QQIC (Querier's Query Interval Code) + * - 9.2. Query Interval + * - 9.12. Older Version Querier Present Timeout + * (the [Query Interval] in the last Query received) + */ + unsigned long mc_qqi; + + if (mlh2->mld2q_qqic < 128) { + mc_qqi = mlh2->mld2q_qqic; + } else { + unsigned long mc_man, mc_exp; + + mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic); + mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic); + + mc_qqi = (mc_man | 0x10) << (mc_exp + 3); + } + + idev->mc_qi = mc_qqi * HZ; +} + +static void mld_update_qri(struct inet6_dev *idev, + const struct mld2_query *mlh2) +{ + /* RFC3810, relevant sections: + * - 5.1.3. Maximum Response Code + * - 9.3. Query Response Interval + */ + idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2)); +} + +static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, + unsigned long *max_delay) +{ + unsigned long mldv1_md; + + /* Ignore v1 queries */ + if (mld_in_v2_mode_only(idev)) + return -EINVAL; + + /* MLDv1 router present */ + mldv1_md = ntohs(mld->mld_maxdelay); + *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL); + + mld_set_v1_mode(idev); + + /* cancel MLDv2 report timer */ + mld_gq_stop_timer(idev); + /* cancel the interface change timer */ + mld_ifc_stop_timer(idev); + /* clear deleted report items */ + mld_clear_delrec(idev); + + return 0; +} + +static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, + unsigned long *max_delay) +{ + /* hosts need to stay in MLDv1 mode, discard MLDv2 queries */ + if (mld_in_v1_mode(idev)) + return -EINVAL; + + *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); + + mld_update_qrv(idev, mld); + mld_update_qi(idev, mld); + mld_update_qri(idev, mld); + + idev->mc_maxdelay = *max_delay; + + return 0; +} + +/* called with rcu_read_lock() */ int igmp6_event_query(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; struct ifmcaddr6 *ma; - struct in6_addr *group; + const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; - struct icmp6hdr *hdr; + struct mld_msg *mld; int group_type; int mark = 0; - int len; + int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) return -EINVAL; @@ -1171,78 +1301,66 @@ int igmp6_event_query(struct sk_buff *skb) len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); len -= skb_network_header_len(skb); - /* Drop queries with not link local source */ - if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + /* RFC3810 6.2 + * Upon reception of an MLD message that contains a Query, the node + * checks if the source address of the message is a valid link-local + * address, if the Hop Limit is set to 1, and if the Router Alert + * option is present in the Hop-By-Hop Options header of the IPv6 + * packet. If any of these checks fails, the packet is dropped. + */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) || + ipv6_hdr(skb)->hop_limit != 1 || + !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || + IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) return -EINVAL; - idev = in6_dev_get(skb->dev); - + idev = __in6_dev_get(skb->dev); if (idev == NULL) return 0; - hdr = icmp6_hdr(skb); - group = (struct in6_addr *) (hdr + 1); + mld = (struct mld_msg *)icmp6_hdr(skb); + group = &mld->mld_mca; group_type = ipv6_addr_type(group); if (group_type != IPV6_ADDR_ANY && - !(group_type&IPV6_ADDR_MULTICAST)) { - in6_dev_put(idev); + !(group_type&IPV6_ADDR_MULTICAST)) return -EINVAL; - } - - if (len == 24) { - int switchback; - /* MLDv1 router present */ - - /* Translate milliseconds to jiffies */ - max_delay = (ntohs(hdr->icmp6_maxdelay)*HZ)/1000; - switchback = (idev->mc_qrv + 1) * max_delay; - idev->mc_v1_seen = jiffies + switchback; - - /* cancel the interface change timer */ - idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) - __in6_dev_put(idev); - /* clear deleted report items */ - mld_clear_delrec(idev); - } else if (len >= 28) { + if (len == MLD_V1_QUERY_LEN) { + err = mld_process_v1(idev, mld, &max_delay); + if (err < 0) + return err; + } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); - if (!pskb_may_pull(skb, srcs_offset)) { - in6_dev_put(idev); + + if (!pskb_may_pull(skb, srcs_offset)) return -EINVAL; - } + mlh2 = (struct mld2_query *)skb_transport_header(skb); - max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000; - if (!max_delay) - max_delay = 1; - idev->mc_maxdelay = max_delay; - if (mlh2->qrv) - idev->mc_qrv = mlh2->qrv; + + err = mld_process_v2(idev, mlh2, &max_delay); + if (err < 0) + return err; + if (group_type == IPV6_ADDR_ANY) { /* general query */ - if (mlh2->nsrcs) { - in6_dev_put(idev); + if (mlh2->mld2q_nsrcs) return -EINVAL; /* no sources allowed */ - } + mld_gq_start_timer(idev); - in6_dev_put(idev); return 0; } /* mark sources to include, if group & source-specific */ - if (mlh2->nsrcs != 0) { + if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + - ntohs(mlh2->nsrcs) * sizeof(struct in6_addr))) { - in6_dev_put(idev); + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) return -EINVAL; - } + mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } - } else { - in6_dev_put(idev); + } else return -EINVAL; - } read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { @@ -1268,25 +1386,23 @@ int igmp6_event_query(struct sk_buff *skb) ma->mca_flags &= ~MAF_GSQUERY; } if (!(ma->mca_flags & MAF_GSQUERY) || - mld_marksources(ma, ntohs(mlh2->nsrcs), mlh2->srcs)) + mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); spin_unlock_bh(&ma->mca_lock); break; } } read_unlock_bh(&idev->lock); - in6_dev_put(idev); return 0; } - +/* called with rcu_read_lock() */ int igmp6_event_report(struct sk_buff *skb) { struct ifmcaddr6 *ma; - struct in6_addr *addrp; struct inet6_dev *idev; - struct icmp6hdr *hdr; + struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. */ @@ -1298,10 +1414,10 @@ int igmp6_event_report(struct sk_buff *skb) skb->pkt_type != PACKET_BROADCAST) return 0; - if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) return -EINVAL; - hdr = icmp6_hdr(skb); + mld = (struct mld_msg *)icmp6_hdr(skb); /* Drop reports with not link local source */ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); @@ -1309,9 +1425,7 @@ int igmp6_event_report(struct sk_buff *skb) !(addr_type&IPV6_ADDR_LINKLOCAL)) return -EINVAL; - addrp = (struct in6_addr *) (hdr + 1); - - idev = in6_dev_get(skb->dev); + idev = __in6_dev_get(skb->dev); if (idev == NULL) return -ENODEV; @@ -1321,7 +1435,7 @@ int igmp6_event_report(struct sk_buff *skb) read_lock_bh(&idev->lock); for (ma = idev->mc_list; ma; ma=ma->next) { - if (ipv6_addr_equal(&ma->mca_addr, addrp)) { + if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { spin_lock(&ma->mca_lock); if (del_timer(&ma->mca_timer)) atomic_dec(&ma->mca_refcnt); @@ -1331,21 +1445,20 @@ int igmp6_event_report(struct sk_buff *skb) } } read_unlock_bh(&idev->lock); - in6_dev_put(idev); return 0; } -static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, - int gdeleted, int sdeleted) +static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, + int gdeleted, int sdeleted) { switch (type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) - return 0; + return false; if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { if (pmc->mca_sfmode == MCAST_INCLUDE) - return 1; + return true; /* don't include if this source is excluded * in all filters */ @@ -1354,29 +1467,29 @@ static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } - return 0; + return false; case MLD2_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) - return 0; + return false; return psf->sf_count[MCAST_INCLUDE] != 0; case MLD2_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) - return 0; + return false; if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) - return 0; + return false; return pmc->mca_sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case MLD2_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) - return 0; + return false; return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; case MLD2_BLOCK_OLD_SOURCES: if (pmc->mca_sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } - return 0; + return false; } static int @@ -1393,28 +1506,60 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } -static struct sk_buff *mld_newpack(struct net_device *dev, int size) +static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6hdr *hdr; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); + + ip6_flow_hdr(hdr, 0, 0); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = inet6_sk(sk)->hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; +} + +static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) { + struct net_device *dev = idev->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.igmp_sk; struct sk_buff *skb; struct mld2_report *pmr; struct in6_addr addr_buf; const struct in6_addr *saddr; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; int err; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - skb = sock_alloc_send_skb(sk, size + LL_ALLOCATED_SPACE(dev), 1, &err); + size += hlen + tlen; + /* limit our allocations to order-0 page */ + size = min_t(int, size, SKB_MAX_ORDER(0, 0)); + skb = sock_alloc_send_skb(sk, size, 1, &err); if (!skb) return NULL; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->priority = TC_PRIO_CONTROL; + skb_reserve(skb, hlen); - if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { + if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address * when a valid link-local address is not available. @@ -1423,18 +1568,18 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) } else saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); skb_put(skb, sizeof(*pmr)); pmr = (struct mld2_report *)skb_transport_header(skb); - pmr->type = ICMPV6_MLD2_REPORT; - pmr->resv1 = 0; - pmr->csum = 0; - pmr->resv2 = 0; - pmr->ngrec = 0; + pmr->mld2r_type = ICMPV6_MLD2_REPORT; + pmr->mld2r_resv1 = 0; + pmr->mld2r_cksum = 0; + pmr->mld2r_resv2 = 0; + pmr->mld2r_ngrec = 0; return skb; } @@ -1444,52 +1589,54 @@ static void mld_sendpack(struct sk_buff *skb) struct mld2_report *pmr = (struct mld2_report *)skb_transport_header(skb); int payload_len, mldlen; - struct inet6_dev *idev = in6_dev_get(skb->dev); + struct inet6_dev *idev; struct net *net = dev_net(skb->dev); int err; - struct flowi fl; + struct flowi6 fl6; struct dst_entry *dst; + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - payload_len = (skb->tail - skb->network_header) - sizeof(*pip6); - mldlen = skb->tail - skb->transport_header; + payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) - + sizeof(*pip6); + mldlen = skb_tail_pointer(skb) - skb_transport_header(skb); pip6->payload_len = htons(payload_len); - pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, - IPPROTO_ICMPV6, csum_partial(skb_transport_header(skb), - mldlen, 0)); - - dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); - - if (!dst) { - err = -ENOMEM; - goto err_out; - } + pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, + IPPROTO_ICMPV6, + csum_partial(skb_transport_header(skb), + mldlen, 0)); - icmpv6_flow_init(net->ipv6.igmp_sk, &fl, ICMPV6_MLD2_REPORT, + icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, &fl6); - err = xfrm_lookup(net, &dst, &fl, NULL, 0); + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + } skb_dst_set(skb, dst); if (err) goto err_out; payload_len = skb->len; - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, dst_output); out: if (!err) { - ICMP6MSGOUT_INC_STATS_BH(net, idev, ICMPV6_MLD2_REPORT); - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); - } else - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS); + ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); + } else { + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + } - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); return; err_out: @@ -1510,7 +1657,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_grec *pgr; if (!skb) - skb = mld_newpack(dev, dev->mtu); + skb = mld_newpack(pmc->idev, dev->mtu); if (!skb) return NULL; pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec)); @@ -1519,7 +1666,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->mca_addr; /* structure copy */ pmr = (struct mld2_report *)skb_transport_header(skb); - pmr->ngrec = htons(ntohs(pmr->ngrec)+1); + pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); *ppgr = pgr; return skb; } @@ -1528,9 +1675,10 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, skb_tailroom(skb)) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, int gdeleted, int sdeleted) + int type, int gdeleted, int sdeleted, int crsend) { - struct net_device *dev = pmc->idev->dev; + struct inet6_dev *idev = pmc->idev; + struct net_device *dev = idev->dev; struct mld2_report *pmr; struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; @@ -1555,11 +1703,11 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { - if (pmr && pmr->ngrec && + if (pmr && pmr->mld2r_ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); - skb = mld_newpack(dev, dev->mtu); + skb = mld_newpack(idev, dev->mtu); } } first = 1; @@ -1586,7 +1734,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); - skb = mld_newpack(dev, dev->mtu); + skb = mld_newpack(idev, dev->mtu); first = 1; scount = 0; } @@ -1619,7 +1767,7 @@ empty_source: if (type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) return skb; - if (pmc->mca_crcount || isquery) { + if (pmc->mca_crcount || isquery || crsend) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { mld_sendpack(skb); @@ -1641,8 +1789,8 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) struct sk_buff *skb = NULL; int type; + read_lock_bh(&idev->lock); if (!pmc) { - read_lock_bh(&idev->lock); for (pmc=idev->mc_list; pmc; pmc=pmc->next) { if (pmc->mca_flags & MAF_NOREPORT) continue; @@ -1651,19 +1799,19 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); } else { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); spin_unlock_bh(&pmc->mca_lock); } + read_unlock_bh(&idev->lock); if (skb) mld_sendpack(skb); } @@ -1696,7 +1844,7 @@ static void mld_send_cr(struct inet6_dev *idev) int type, dtype; read_lock_bh(&idev->lock); - write_lock_bh(&idev->mc_lock); + spin_lock(&idev->mc_lock); /* deleted MCA's */ pmc_prev = NULL; @@ -1705,13 +1853,13 @@ static void mld_send_cr(struct inet6_dev *idev) if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; - skb = add_grec(skb, pmc, type, 1, 0); - skb = add_grec(skb, pmc, dtype, 1, 1); + skb = add_grec(skb, pmc, type, 1, 0, 0); + skb = add_grec(skb, pmc, dtype, 1, 1, 0); } if (pmc->mca_crcount) { if (pmc->mca_sfmode == MCAST_EXCLUDE) { type = MLD2_CHANGE_TO_INCLUDE; - skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, type, 1, 0, 0); } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { @@ -1730,7 +1878,7 @@ static void mld_send_cr(struct inet6_dev *idev) } else pmc_prev = pmc; } - write_unlock_bh(&idev->mc_lock); + spin_unlock(&idev->mc_lock); /* change recs */ for (pmc=idev->mc_list; pmc; pmc=pmc->next) { @@ -1742,8 +1890,8 @@ static void mld_send_cr(struct inet6_dev *idev) type = MLD2_ALLOW_NEW_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; } - skb = add_grec(skb, pmc, type, 0, 0); - skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + skb = add_grec(skb, pmc, type, 0, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */ /* filter mode changes */ if (pmc->mca_crcount) { @@ -1751,7 +1899,7 @@ static void mld_send_cr(struct inet6_dev *idev) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_CHANGE_TO_INCLUDE; - skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } spin_unlock_bh(&pmc->mca_lock); @@ -1768,15 +1916,16 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) struct sock *sk = net->ipv6.igmp_sk; struct inet6_dev *idev; struct sk_buff *skb; - struct icmp6hdr *hdr; + struct mld_msg *hdr; const struct in6_addr *snd_addr, *saddr; - struct in6_addr *addrp; struct in6_addr addr_buf; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi fl; + struct flowi6 fl6; struct dst_entry *dst; if (type == ICMPV6_MGM_REDUCTION) @@ -1793,7 +1942,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPSTATS_MIB_OUT, full_len); rcu_read_unlock(); - skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + full_len, 1, &err); + skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); if (skb == NULL) { rcu_read_lock(); @@ -1802,8 +1951,8 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) rcu_read_unlock(); return; } - - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->priority = TC_PRIO_CONTROL; + skb_reserve(skb, hlen); if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: @@ -1814,39 +1963,33 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) } else saddr = &addr_buf; - ip6_nd_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); + ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); - hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr)); - memset(hdr, 0, sizeof(struct icmp6hdr)); - hdr->icmp6_type = type; - - addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr)); - ipv6_addr_copy(addrp, addr); + hdr = (struct mld_msg *) skb_put(skb, sizeof(struct mld_msg)); + memset(hdr, 0, sizeof(struct mld_msg)); + hdr->mld_type = type; + hdr->mld_mca = *addr; - hdr->icmp6_cksum = csum_ipv6_magic(saddr, snd_addr, len, - IPPROTO_ICMPV6, - csum_partial(hdr, len, 0)); - - idev = in6_dev_get(skb->dev); + hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, + IPPROTO_ICMPV6, + csum_partial(hdr, len, 0)); - dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr); - if (!dst) { - err = -ENOMEM; - goto err_out; - } + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl, type, + icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); - - err = xfrm_lookup(net, &dst, &fl, NULL, 0); - if (err) + dst = icmp6_dst_alloc(skb->dev, &fl6); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); goto err_out; + } skb_dst_set(skb, dst); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, dst_output); out: if (!err) { @@ -1856,8 +1999,7 @@ out: } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); return; err_out: @@ -1865,8 +2007,57 @@ err_out: goto out; } +static void mld_send_initial_cr(struct inet6_dev *idev) +{ + struct sk_buff *skb; + struct ifmcaddr6 *pmc; + int type; + + if (mld_in_v1_mode(idev)) + return; + + skb = NULL; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_CHANGE_TO_EXCLUDE; + else + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0, 1); + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + if (skb) + mld_sendpack(skb); +} + +void ipv6_mc_dad_complete(struct inet6_dev *idev) +{ + idev->mc_dad_count = idev->mc_qrv; + if (idev->mc_dad_count) { + mld_send_initial_cr(idev); + idev->mc_dad_count--; + if (idev->mc_dad_count) + mld_dad_start_timer(idev, idev->mc_maxdelay); + } +} + +static void mld_dad_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + mld_send_initial_cr(idev); + if (idev->mc_dad_count) { + idev->mc_dad_count--; + if (idev->mc_dad_count) + mld_dad_start_timer(idev, idev->mc_maxdelay); + } + in6_dev_put(idev); +} + static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, - struct in6_addr *psfsrc) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; @@ -1891,7 +2082,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, else pmc->mca_sources = psf->sf_next; if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && - !MLD_V1_SEEN(idev)) { + !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; psf->sf_next = pmc->mca_tomb; pmc->mca_tomb = psf; @@ -1902,8 +2093,8 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, return rv; } -static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; @@ -1963,7 +2154,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, * Add multicast single-source filter to the interface list */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, - struct in6_addr *psfsrc, int delta) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; @@ -2048,8 +2239,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) &psf->sf_addr)) break; if (!dpsf) { - dpsf = (struct ip6_sf_list *) - kmalloc(sizeof(*dpsf), GFP_ATOMIC); + dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; @@ -2067,8 +2257,8 @@ static int sf_setstate(struct ifmcaddr6 *pmc) /* * Add multicast source filter list to the interface list */ -static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, - int sfmode, int sfcount, struct in6_addr *psfsrc, +static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) { struct ifmcaddr6 *pmc; @@ -2095,7 +2285,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, pmc->mca_sfcount[sfmode]++; err = 0; for (i=0; i<sfcount; i++) { - err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } @@ -2105,7 +2295,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, if (!delta) pmc->mca_sfcount[sfmode]--; for (j=0; j<i; j++) - (void) ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { struct ip6_sf_list *psf; @@ -2157,7 +2347,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = net_random() % IGMP6_UNSOLICITED_IVAL; + delay = prandom_u32() % unsolicited_report_interval(ma->idev); spin_lock_bh(&ma->mca_lock); if (del_timer(&ma->mca_timer)) { @@ -2192,7 +2382,7 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, static void igmp6_leave_group(struct ifmcaddr6 *ma) { - if (MLD_V1_SEEN(ma->idev)) { + if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); @@ -2208,7 +2398,7 @@ static void mld_gq_timer_expire(unsigned long data) idev->mc_gq_running = 0; mld_send_report(idev, NULL); - __in6_dev_put(idev); + in6_dev_put(idev); } static void mld_ifc_timer_expire(unsigned long data) @@ -2221,12 +2411,12 @@ static void mld_ifc_timer_expire(unsigned long data) if (idev->mc_ifc_count) mld_ifc_start_timer(idev, idev->mc_maxdelay); } - __in6_dev_put(idev); + in6_dev_put(idev); } static void mld_ifc_event(struct inet6_dev *idev) { - if (MLD_V1_SEEN(idev)) + if (mld_in_v1_mode(idev)) return; idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_timer(idev, 1); @@ -2237,7 +2427,7 @@ static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; - if (MLD_V1_SEEN(ma->idev)) + if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); @@ -2277,12 +2467,9 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Withdraw multicast list */ read_lock_bh(&idev->lock); - idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) - __in6_dev_put(idev); - idev->mc_gq_running = 0; - if (del_timer(&idev->mc_gq_timer)) - __in6_dev_put(idev); + mld_ifc_stop_timer(idev); + mld_gq_stop_timer(idev); + mld_dad_stop_timer(idev); for (i = idev->mc_list; i; i=i->next) igmp6_group_dropped(i); @@ -2311,7 +2498,7 @@ void ipv6_mc_up(struct inet6_dev *idev) void ipv6_mc_init_dev(struct inet6_dev *idev) { write_lock_bh(&idev->lock); - rwlock_init(&idev->mc_lock); + spin_lock_init(&idev->mc_lock); idev->mc_gq_running = 0; setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire, (unsigned long)idev); @@ -2319,8 +2506,14 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) idev->mc_ifc_count = 0; setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire, (unsigned long)idev); + setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, + (unsigned long)idev); + idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_qi = MLD_QI_DEFAULT; + idev->mc_qri = MLD_QRI_DEFAULT; + + idev->mc_maxdelay = unsolicited_report_interval(idev); idev->mc_v1_seen = 0; write_unlock_bh(&idev->lock); } @@ -2646,15 +2839,15 @@ static const struct file_operations igmp6_mcf_seq_fops = { .release = seq_release_net, }; -static int igmp6_proc_init(struct net *net) +static int __net_init igmp6_proc_init(struct net *net) { int err; err = -ENOMEM; - if (!proc_net_fops_create(net, "igmp6", S_IRUGO, &igmp6_mc_seq_fops)) + if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops)) goto out; - if (!proc_net_fops_create(net, "mcfilter6", S_IRUGO, - &igmp6_mcf_seq_fops)) + if (!proc_create("mcfilter6", S_IRUGO, net->proc_net, + &igmp6_mcf_seq_fops)) goto out_proc_net_igmp6; err = 0; @@ -2662,35 +2855,33 @@ out: return err; out_proc_net_igmp6: - proc_net_remove(net, "igmp6"); + remove_proc_entry("igmp6", net->proc_net); goto out; } -static void igmp6_proc_exit(struct net *net) +static void __net_exit igmp6_proc_exit(struct net *net) { - proc_net_remove(net, "mcfilter6"); - proc_net_remove(net, "igmp6"); + remove_proc_entry("mcfilter6", net->proc_net); + remove_proc_entry("igmp6", net->proc_net); } #else -static int igmp6_proc_init(struct net *net) +static inline int igmp6_proc_init(struct net *net) { return 0; } -static void igmp6_proc_exit(struct net *net) +static inline void igmp6_proc_exit(struct net *net) { - ; } #endif -static int igmp6_net_init(struct net *net) +static int __net_init igmp6_net_init(struct net *net) { int err; err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - printk(KERN_ERR - "Failed to initialize the IGMP6 control socket (err %d).\n", + pr_err("Failed to initialize the IGMP6 control socket (err %d)\n", err); goto out; } @@ -2708,7 +2899,7 @@ out_sock_create: goto out; } -static void igmp6_net_exit(struct net *net) +static void __net_exit igmp6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.igmp_sk); igmp6_proc_exit(net); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index f797e8c6f3b..db9b6cbc9db 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -13,8 +13,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ /* * Authors: @@ -22,6 +21,8 @@ * Masahide NAKAMURA @USAGI */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/skbuff.h> #include <linux/time.h> @@ -44,7 +45,7 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen) if (!data) return NULL; if (padlen == 1) { - data[0] = IPV6_TLV_PAD0; + data[0] = IPV6_TLV_PAD1; } else if (padlen > 1) { data[0] = IPV6_TLV_PADN; data[1] = padlen - 2; @@ -56,7 +57,7 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen) static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos) { - icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); + icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); } static int mip6_mh_len(int type) @@ -84,28 +85,30 @@ static int mip6_mh_len(int type) static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) { - struct ip6_mh *mh; + struct ip6_mh _hdr; + const struct ip6_mh *mh; - if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) || - !pskb_may_pull(skb, (skb_transport_offset(skb) + - ((skb_transport_header(skb)[1] + 1) << 3)))) + mh = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_hdr), &_hdr); + if (!mh) return -1; - mh = (struct ip6_mh *)skb_transport_header(skb); + if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len) + return -1; if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n", mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); - mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) - - skb_network_header(skb))); + mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + + skb_network_header_len(skb)); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n", mh->ip6mh_proto); - mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) - - skb_network_header(skb))); + mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + + skb_network_header_len(skb)); return -1; } @@ -126,7 +129,7 @@ static struct mip6_report_rate_limiter mip6_report_rl = { static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; int err = destopt->nexthdr; @@ -181,8 +184,8 @@ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) } static inline int mip6_report_rl_allow(struct timeval *stamp, - struct in6_addr *dst, - struct in6_addr *src, int iif) + const struct in6_addr *dst, + const struct in6_addr *src, int iif) { int allow = 0; @@ -195,26 +198,28 @@ static inline int mip6_report_rl_allow(struct timeval *stamp, mip6_report_rl.stamp.tv_sec = stamp->tv_sec; mip6_report_rl.stamp.tv_usec = stamp->tv_usec; mip6_report_rl.iif = iif; - ipv6_addr_copy(&mip6_report_rl.src, src); - ipv6_addr_copy(&mip6_report_rl.dst, dst); + mip6_report_rl.src = *src; + mip6_report_rl.dst = *dst; allow = 1; } spin_unlock_bh(&mip6_report_rl.lock); return allow; } -static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct flowi *fl) +static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, + const struct flowi *fl) { struct net *net = xs_net(x); struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + const struct flowi6 *fl6 = &fl->u.ip6; struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; struct timeval stamp; int err = 0; - if (unlikely(fl->proto == IPPROTO_MH && - fl->fl_mh_type <= IP6_MH_TYPE_MAX)) + if (unlikely(fl6->flowi6_proto == IPPROTO_MH && + fl6->fl6_mh_type <= IP6_MH_TYPE_MAX)) goto out; if (likely(opt->dsthao)) { @@ -239,14 +244,14 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, struct sizeof(sel.saddr)); sel.prefixlen_s = 128; sel.family = AF_INET6; - sel.proto = fl->proto; - sel.dport = xfrm_flowi_dport(fl); + sel.proto = fl6->flowi6_proto; + sel.dport = xfrm_flowi_dport(fl, &fl6->uli); if (sel.dport) sel.dport_mask = htons(~0); - sel.sport = xfrm_flowi_sport(fl); + sel.sport = xfrm_flowi_sport(fl, &fl6->uli); if (sel.sport) sel.sport_mask = htons(~0); - sel.ifindex = fl->oif; + sel.ifindex = fl6->flowi6_oif; err = km_report(net, IPPROTO_DSTOPTS, &sel, (hao ? (xfrm_address_t *)&hao->addr : NULL)); @@ -262,7 +267,8 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb->tail - skb->network_header; + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; @@ -305,13 +311,12 @@ static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb, static int mip6_destopt_init_state(struct xfrm_state *x) { if (x->id.spi) { - printk(KERN_INFO "%s: spi is not 0: %u\n", __func__, - x->id.spi); + pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { - printk(KERN_INFO "%s: state's mode is not %u: %u\n", - __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + pr_info("%s: state's mode is not %u: %u\n", + __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); return -EINVAL; } @@ -347,11 +352,12 @@ static const struct xfrm_type mip6_destopt_type = static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) { + const struct ipv6hdr *iph = ipv6_hdr(skb); struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data; int err = rt2->rt_hdr.nexthdr; spin_lock(&x->lock); - if (!ipv6_addr_equal(&rt2->addr, (struct in6_addr *)x->coaddr) && + if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); @@ -398,7 +404,8 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); const unsigned char *nh = skb_network_header(skb); - unsigned int packet_len = skb->tail - skb->network_header; + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; @@ -440,13 +447,12 @@ static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb, static int mip6_rthdr_init_state(struct xfrm_state *x) { if (x->id.spi) { - printk(KERN_INFO "%s: spi is not 0: %u\n", __func__, - x->id.spi); + pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { - printk(KERN_INFO "%s: state's mode is not %u: %u\n", - __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + pr_info("%s: state's mode is not %u: %u\n", + __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); return -EINVAL; } @@ -478,18 +484,18 @@ static const struct xfrm_type mip6_rthdr_type = static int __init mip6_init(void) { - printk(KERN_INFO "Mobile IPv6\n"); + pr_info("Mobile IPv6\n"); if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) { - printk(KERN_INFO "%s: can't add xfrm type(destopt)\n", __func__); + pr_info("%s: can't add xfrm type(destopt)\n", __func__); goto mip6_destopt_xfrm_fail; } if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) { - printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __func__); + pr_info("%s: can't add xfrm type(rthdr)\n", __func__); goto mip6_rthdr_xfrm_fail; } if (rawv6_mh_filter_register(mip6_mh_filter) < 0) { - printk(KERN_INFO "%s: can't add rawv6 mh filter\n", __func__); + pr_info("%s: can't add rawv6 mh filter\n", __func__); goto mip6_rawv6_mh_fail; } @@ -507,11 +513,11 @@ static int __init mip6_init(void) static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) - printk(KERN_INFO "%s: can't remove rawv6 mh filter\n", __func__); + pr_info("%s: can't remove rawv6 mh filter\n", __func__); if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0) - printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __func__); + pr_info("%s: can't remove xfrm type(rthdr)\n", __func__); if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0) - printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __func__); + pr_info("%s: can't remove xfrm type(destopt)\n", __func__); } module_init(mip6_init); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c4585279809..ca8d4ea48a5 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -15,6 +15,7 @@ /* * Changes: * + * Alexey I. Froloff : RFC6106 (DNSSL) support * Pierre Ynard : export userland ND options * through netlink (RDNSS support) * Lars Fenneberg : fixed MTU setting on receipt @@ -26,27 +27,7 @@ * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0) -#define ND_NOPRINTK(x...) do { ; } while(0) -#define ND_PRINTK0 ND_PRINTK -#define ND_PRINTK1 ND_NOPRINTK -#define ND_PRINTK2 ND_NOPRINTK -#define ND_PRINTK3 ND_NOPRINTK -#if ND_DEBUG >= 1 -#undef ND_PRINTK1 -#define ND_PRINTK1 ND_PRINTK -#endif -#if ND_DEBUG >= 2 -#undef ND_PRINTK2 -#define ND_PRINTK2 ND_PRINTK -#endif -#if ND_DEBUG >= 3 -#undef ND_PRINTK3 -#define ND_PRINTK3 ND_PRINTK -#endif +#define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> @@ -59,6 +40,7 @@ #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -90,7 +72,18 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> -static u32 ndisc_hash(const void *pkey, const struct net_device *dev); +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 + +#define ND_PRINTK(val, level, fmt, ...) \ +do { \ + if (val <= ND_DEBUG) \ + net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ +} while (0) + +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 *hash_rnd); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -104,8 +97,6 @@ static const struct neigh_ops ndisc_generic_ops = { .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, }; static const struct neigh_ops ndisc_hh_ops = { @@ -114,22 +105,17 @@ static const struct neigh_ops ndisc_hh_ops = { .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, }; static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, - .output = dev_queue_xmit, - .connected_output = dev_queue_xmit, - .hh_output = dev_queue_xmit, - .queue_xmit = dev_queue_xmit, + .output = neigh_direct_output, + .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, - .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), .key_len = sizeof(struct in6_addr), .hash = ndisc_hash, .constructor = ndisc_constructor, @@ -138,18 +124,20 @@ struct neigh_table nd_tbl = { .proxy_redo = pndisc_redo, .id = "ndisc_cache", .parms = { - .tbl = &nd_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, - .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len = 3, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, + .tbl = &nd_tbl, + .reachable_time = ND_REACHABLE_TIME, + .data = { + [NEIGH_VAR_MCAST_PROBES] = 3, + [NEIGH_VAR_UCAST_PROBES] = 3, + [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, + [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, + [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_GC_STALETIME] = 60 * HZ, + [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_PROXY_QLEN] = 64, + [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, + [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, + }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, @@ -157,50 +145,12 @@ struct neigh_table nd_tbl = { .gc_thresh3 = 1024, }; -/* ND options */ -struct ndisc_options { - struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; -#ifdef CONFIG_IPV6_ROUTE_INFO - struct nd_opt_hdr *nd_opts_ri; - struct nd_opt_hdr *nd_opts_ri_end; -#endif - struct nd_opt_hdr *nd_useropts; - struct nd_opt_hdr *nd_useropts_end; -}; - -#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] -#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] -#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] -#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] -#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] -#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] - -#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) - -/* - * Return the padding between the option length and the start of the - * link addr. Currently only IP-over-InfiniBand needs this, although - * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may - * also need a pad of 2. - */ -static int ndisc_addr_option_pad(unsigned short type) -{ - switch (type) { - case ARPHRD_INFINIBAND: return 2; - default: return 0; - } -} - -static inline int ndisc_opt_addr_space(struct net_device *dev) +static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) { - return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); -} - -static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, - unsigned short addr_type) -{ - int space = NDISC_OPT_SPACE(data_len); - int pad = ndisc_addr_option_pad(addr_type); + int pad = ndisc_addr_option_pad(skb->dev->type); + int data_len = skb->dev->addr_len; + int space = ndisc_opt_addr_space(skb->dev); + u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; @@ -214,7 +164,6 @@ static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, opt += data_len; if ((space -= data_len) > 0) memset(opt, 0, space); - return opt + space; } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, @@ -227,12 +176,13 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while(cur < end && cur->nd_opt_type != type); - return (cur <= end && cur->nd_opt_type == type ? cur : NULL); + return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) { - return (opt->nd_opt_type == ND_OPT_RDNSS); + return opt->nd_opt_type == ND_OPT_RDNSS || + opt->nd_opt_type == ND_OPT_DNSSL; } static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, @@ -243,11 +193,11 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while(cur < end && !ndisc_is_useropt(cur)); - return (cur <= end && ndisc_is_useropt(cur) ? cur : NULL); + return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; } -static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, - struct ndisc_options *ndopts) +struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -267,10 +217,9 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, case ND_OPT_MTU: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { - ND_PRINTK2(KERN_WARNING - "%s(): duplicated ND6 option found: type=%d\n", - __func__, - nd_opt->nd_opt_type); + ND_PRINTK(2, warn, + "%s: duplicated ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } @@ -298,10 +247,11 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, * to accommodate future extension to the * protocol. */ - ND_PRINTK2(KERN_NOTICE - "%s(): ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, nd_opt->nd_opt_len); + ND_PRINTK(2, notice, + "%s: ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, + nd_opt->nd_opt_len); } } opt_len -= l; @@ -310,18 +260,7 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, return ndopts; } -static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, - struct net_device *dev) -{ - u8 *lladdr = (u8 *)(p + 1); - int lladdrlen = p->nd_opt_len << 3; - int prepad = ndisc_addr_option_pad(dev->type); - if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) - return NULL; - return (lladdr + prepad); -} - -int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir) +int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: @@ -329,15 +268,14 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; - case ARPHRD_IEEE802_TR: - ipv6_tr_mc_map(addr,buf); - return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; + case ARPHRD_IPGRE: + return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); @@ -349,16 +287,11 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d EXPORT_SYMBOL(ndisc_mc_map); -static u32 ndisc_hash(const void *pkey, const struct net_device *dev) +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 *hash_rnd) { - const u32 *p32 = pkey; - u32 addr_hash, i; - - addr_hash = 0; - for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) - addr_hash ^= *p32++; - - return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd); + return ndisc_hashfn(pkey, dev, hash_rnd); } static int ndisc_constructor(struct neighbour *neigh) @@ -367,25 +300,22 @@ static int ndisc_constructor(struct neighbour *neigh) struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; - int is_multicast = ipv6_addr_is_multicast(addr); + bool is_multicast = ipv6_addr_is_multicast(addr); - rcu_read_lock(); in6_dev = in6_dev_get(dev); if (in6_dev == NULL) { - rcu_read_unlock(); return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; - neigh->output = neigh->ops->queue_xmit; + neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; @@ -437,154 +367,123 @@ static void pndisc_destructor(struct pneigh_entry *n) ipv6_dev_mc_dec(dev, &maddr); } -struct sk_buff *ndisc_build_skb(struct net_device *dev, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - struct icmp6hdr *icmp6h, - const struct in6_addr *target, - int llinfo) +static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, + int len) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.ndisc_sk; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; - struct icmp6hdr *hdr; - int len; - int err; - u8 *opt; - if (!dev->addr_len) - llinfo = 0; - - len = sizeof(struct icmp6hdr) + (target ? sizeof(*target) : 0); - if (llinfo) - len += ndisc_opt_addr_space(dev); - - skb = sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_ALLOCATED_SPACE(dev)), - 1, &err); + skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { - ND_PRINTK0(KERN_ERR - "ICMPv6 ND: %s() failed to allocate an skb, err=%d.\n", - __func__, err); + ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", + __func__); return NULL; } - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; - skb->transport_header = skb->tail; - skb_put(skb, len); + skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); + skb_reset_transport_header(skb); - hdr = (struct icmp6hdr *)skb_transport_header(skb); - memcpy(hdr, icmp6h, sizeof(*hdr)); + /* Manually assign socket ownership as we avoid calling + * sock_alloc_send_pskb() to bypass wmem buffer limits + */ + skb_set_owner_w(skb, sk); - opt = skb_transport_header(skb) + sizeof(struct icmp6hdr); - if (target) { - ipv6_addr_copy((struct in6_addr *)opt, target); - opt += sizeof(*target); - } + return skb; +} - if (llinfo) - ndisc_fill_addr_option(opt, llinfo, dev->dev_addr, - dev->addr_len, dev->type); +static void ip6_nd_hdr(struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int hop_limit, int len) +{ + struct ipv6hdr *hdr; - hdr->icmp6_cksum = csum_ipv6_magic(saddr, daddr, len, - IPPROTO_ICMPV6, - csum_partial(hdr, - len, 0)); + skb_push(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); - return skb; -} + ip6_flow_hdr(hdr, 0, 0); -EXPORT_SYMBOL(ndisc_build_skb); + hdr->payload_len = htons(len); + hdr->nexthdr = IPPROTO_ICMPV6; + hdr->hop_limit = hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; +} -void ndisc_send_skb(struct sk_buff *skb, - struct net_device *dev, - struct neighbour *neigh, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - struct icmp6hdr *icmp6h) +static void ndisc_send_skb(struct sk_buff *skb, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { - struct flowi fl; - struct dst_entry *dst; - struct net *net = dev_net(dev); + struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(skb->dev); struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; int err; + struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; - icmpv6_flow_init(sk, &fl, type, saddr, daddr, dev->ifindex); - - dst = icmp6_dst_alloc(dev, neigh, daddr); if (!dst) { - kfree_skb(skb); - return; - } + struct flowi6 fl6; - err = xfrm_lookup(net, &dst, &fl, NULL, 0); - if (err < 0) { - kfree_skb(skb); - return; + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, &fl6); + if (IS_ERR(dst)) { + kfree_skb(skb); + return; + } + + skb_dst_set(skb, dst); } - skb_dst_set(skb, dst); + icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, + csum_partial(icmp6h, + skb->len, 0)); - idev = in6_dev_get(dst->dev); + ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len); + + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } - if (likely(idev != NULL)) - in6_dev_put(idev); + rcu_read_unlock(); } -EXPORT_SYMBOL(ndisc_send_skb); - -/* - * Send a Neighbour Discover packet - */ -static void __ndisc_send(struct net_device *dev, - struct neighbour *neigh, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - struct icmp6hdr *icmp6h, const struct in6_addr *target, - int llinfo) +void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; - - skb = ndisc_build_skb(dev, daddr, saddr, icmp6h, target, llinfo); - if (!skb) - return; - - ndisc_send_skb(skb, dev, neigh, daddr, saddr, icmp6h); -} - -static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, - const struct in6_addr *daddr, - const struct in6_addr *solicited_addr, - int router, int solicited, int override, int inc_opt) -{ struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; const struct in6_addr *src_addr; - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, - }; + struct nd_msg *msg; + int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) - override = 0; + override = false; + inc_opt |= ifp->idev->cnf.force_tllao; in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, @@ -594,24 +493,64 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, src_addr = &tmpaddr; } - icmp6h.icmp6_router = router; - icmp6h.icmp6_solicited = solicited; - icmp6h.icmp6_override = override; + if (!dev->addr_len) + inc_opt = 0; + if (inc_opt) + optlen += ndisc_opt_addr_space(dev); + + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct nd_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct nd_msg) { + .icmph = { + .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, + .icmp6_router = router, + .icmp6_solicited = solicited, + .icmp6_override = override, + }, + .target = *solicited_addr, + }; + + if (inc_opt) + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, + dev->dev_addr); + + + ndisc_send_skb(skb, daddr, src_addr); +} + +static void ndisc_send_unsol_na(struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + + idev = in6_dev_get(dev); + if (!idev) + return; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &ifa->addr, + /*router=*/ !!idev->cnf.forwarding, + /*solicited=*/ false, /*override=*/ true, + /*inc_opt=*/ true); + } + read_unlock_bh(&idev->lock); - inc_opt |= ifp->idev->cnf.force_tllao; - __ndisc_send(dev, neigh, daddr, src_addr, - &icmp6h, solicited_addr, - inc_opt ? ND_OPT_TARGET_LL_ADDR : 0); + in6_dev_put(idev); } void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct sk_buff *skb; struct in6_addr addr_buf; - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, - }; + int inc_opt = dev->addr_len; + int optlen = 0; + struct nd_msg *msg; if (saddr == NULL) { if (ipv6_get_lladdr(dev, &addr_buf, @@ -620,18 +559,37 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, saddr = &addr_buf; } - __ndisc_send(dev, neigh, daddr, saddr, - &icmp6h, solicit, - !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0); + if (ipv6_addr_any(saddr)) + inc_opt = false; + if (inc_opt) + optlen += ndisc_opt_addr_space(dev); + + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct nd_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct nd_msg) { + .icmph = { + .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, + }, + .target = *solicit, + }; + + if (inc_opt) + ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, + dev->dev_addr); + + ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr) { - struct icmp6hdr icmp6h = { - .icmp6_type = NDISC_ROUTER_SOLICITATION, - }; + struct sk_buff *skb; + struct rs_msg *msg; int send_sllao = dev->addr_len; + int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* @@ -655,9 +613,25 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, } } #endif - __ndisc_send(dev, NULL, daddr, saddr, - &icmp6h, NULL, - send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0); + if (send_sllao) + optlen += ndisc_opt_addr_space(dev); + + skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!skb) + return; + + msg = (struct rs_msg *)skb_put(skb, sizeof(*msg)); + *msg = (struct rs_msg) { + .icmph = { + .icmp6_type = NDISC_ROUTER_SOLICITATION, + }, + }; + + if (send_sllao) + ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, + dev->dev_addr); + + ndisc_send_skb(skb, daddr, saddr); } @@ -684,16 +658,15 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1)) saddr = &ipv6_hdr(skb)->saddr; - if ((probes -= neigh->parms->ucast_probes) < 0) { + if ((probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES)) < 0) { if (!(neigh->nud_state & NUD_VALID)) { - ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: %pI6\n", - __func__, target); + ND_PRINTK(1, dbg, + "%s: trying to ucast probe in NUD_INVALID: %pI6\n", + __func__, target); } ndisc_send_ns(dev, neigh, target, target, saddr); - } else if ((probes -= neigh->parms->app_probes) < 0) { -#ifdef CONFIG_ARPD + } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); -#endif } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); @@ -718,10 +691,10 @@ static int pndisc_is_router(const void *pkey, static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); - struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; - struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - (skb->transport_header + + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; @@ -729,12 +702,16 @@ static void ndisc_recv_ns(struct sk_buff *skb) struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); - int inc; + bool inc; int is_router = -1; + if (skb->len < sizeof(struct nd_msg)) { + ND_PRINTK(2, warn, "NS: packet too short\n"); + return; + } + if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: multicast target address"); + ND_PRINTK(2, warn, "NS: multicast target address\n"); return; } @@ -742,27 +719,21 @@ static void ndisc_recv_ns(struct sk_buff *skb) * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ - if (dad && - !(daddr->s6_addr32[0] == htonl(0xff020000) && - daddr->s6_addr32[1] == htonl(0x00000000) && - daddr->s6_addr32[2] == htonl(0x00000001) && - daddr->s6_addr [12] == 0xff )) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: bad DAD packet (wrong destination)\n"); + if (dad && !ipv6_addr_is_solict_mult(daddr)) { + ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return; } if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: invalid ND options\n"); + ND_PRINTK(2, warn, "NS: invalid ND options\n"); return; } if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: invalid link-layer address length\n"); + ND_PRINTK(2, warn, + "NS: invalid link-layer address length\n"); return; } @@ -772,8 +743,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) * in the message. */ if (dad) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: bad DAD packet (link-layer address option)\n"); + ND_PRINTK(2, warn, + "NS: bad DAD packet (link-layer address option)\n"); return; } } @@ -785,20 +756,6 @@ static void ndisc_recv_ns(struct sk_buff *skb) if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { - if (dev->type == ARPHRD_IEEE802_TR) { - const unsigned char *sadr; - sadr = skb_mac_header(skb); - if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && - sadr[9] == dev->dev_addr[1] && - sadr[10] == dev->dev_addr[2] && - sadr[11] == dev->dev_addr[3] && - sadr[12] == dev->dev_addr[4] && - sadr[13] == dev->dev_addr[5]) { - /* looped-back to us */ - goto out; - } - } - /* * We are colliding with another node * who is doing DAD @@ -834,8 +791,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && - inc != 0 && - idev->nd_parms->proxy_delay != 0) { + inc && + NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) { /* * for anycast or proxy, * sender should delay its response @@ -853,11 +810,11 @@ static void ndisc_recv_ns(struct sk_buff *skb) } if (is_router < 0) - is_router = !!idev->cnf.forwarding; + is_router = idev->cnf.forwarding; if (dad) { ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, - is_router, 0, (ifp != NULL), 1); + !!is_router, false, (ifp != NULL), true); goto out; } @@ -878,8 +835,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) NEIGH_UPDATE_F_OVERRIDE); if (neigh || !dev->header_ops) { ndisc_send_na(dev, neigh, saddr, &msg->target, - is_router, - 1, (ifp != NULL && inc), inc); + !!is_router, + true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); } @@ -889,17 +846,15 @@ out: in6_ifa_put(ifp); else in6_dev_put(idev); - - return; } static void ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; - struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; - u32 ndoptlen = skb->tail - (skb->transport_header + + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; @@ -907,42 +862,39 @@ static void ndisc_recv_na(struct sk_buff *skb) struct neighbour *neigh; if (skb->len < sizeof(struct nd_msg)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: packet too short\n"); + ND_PRINTK(2, warn, "NA: packet too short\n"); return; } if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: target address is multicast.\n"); + ND_PRINTK(2, warn, "NA: target address is multicast\n"); return; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: solicited NA is multicasted.\n"); + ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); return; } if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NS: invalid ND option\n"); + ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NA: invalid link-layer address length\n"); + ND_PRINTK(2, warn, + "NA: invalid link-layer address length\n"); return; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { - if (ifp->flags & IFA_F_TENTATIVE) { - addrconf_dad_failure(ifp); - return; + if (skb->pkt_type != PACKET_LOOPBACK + && (ifp->flags & IFA_F_TENTATIVE)) { + addrconf_dad_failure(ifp); + return; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing @@ -954,9 +906,9 @@ static void ndisc_recv_na(struct sk_buff *skb) unsolicited advertisement. */ if (skb->pkt_type != PACKET_LOOPBACK) - ND_PRINTK1(KERN_WARNING - "ICMPv6 NA: someone advertises our address %pI6 on %s!\n", - &ifp->addr, ifp->idev->dev->name); + ND_PRINTK(1, warn, + "NA: someone advertises our address %pI6 on %s!\n", + &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return; } @@ -977,7 +929,7 @@ static void ndisc_recv_na(struct sk_buff *skb) if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { - /* XXX: idev->cnf.prixy_ndp */ + /* XXX: idev->cnf.proxy_ndp */ goto out; } @@ -992,10 +944,7 @@ static void ndisc_recv_na(struct sk_buff *skb) /* * Change: router to host */ - struct rt6_info *rt; - rt = rt6_get_dflt_router(saddr, dev); - if (rt) - ip6_del_rt(rt); + rt6_clean_tohost(dev_net(dev), saddr); } out: @@ -1009,17 +958,16 @@ static void ndisc_recv_rs(struct sk_buff *skb) unsigned long ndoptlen = skb->len - sizeof(*rs_msg); struct neighbour *neigh; struct inet6_dev *idev; - struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; if (skb->len < sizeof(*rs_msg)) return; - idev = in6_dev_get(skb->dev); + idev = __in6_dev_get(skb->dev); if (!idev) { - if (net_ratelimit()) - ND_PRINTK1("ICMP6 RS: can't find in6 device\n"); + ND_PRINTK(1, err, "RS: can't find in6 device\n"); return; } @@ -1036,8 +984,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) /* Parse ND options */ if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { - if (net_ratelimit()) - ND_PRINTK2("ICMP6 NS: invalid ND option, ignored\n"); + ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); goto out; } @@ -1057,7 +1004,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) neigh_release(neigh); } out: - in6_dev_put(idev); + return; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) @@ -1092,8 +1039,9 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); - NLA_PUT(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr), - &ipv6_hdr(ra)->saddr); + if (nla_put(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr), + &ipv6_hdr(ra)->saddr)) + goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); @@ -1119,23 +1067,21 @@ static void ndisc_router_discovery(struct sk_buff *skb) __u8 * opt = (__u8 *)(ra_msg + 1); - optlen = (skb->tail - skb->transport_header) - sizeof(struct ra_msg); + optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - + sizeof(struct ra_msg); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: source address is not link-local.\n"); + ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return; } if (optlen < 0) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: packet too short\n"); + ND_PRINTK(2, warn, "RA: packet too short\n"); return; } #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: from host or unauthorized router\n"); + ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); return; } #endif @@ -1144,23 +1090,19 @@ static void ndisc_router_discovery(struct sk_buff *skb) * set the RA_RECV flag in the interface */ - in6_dev = in6_dev_get(skb->dev); + in6_dev = __in6_dev_get(skb->dev); if (in6_dev == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 RA: can't find inet6 device for %s.\n", - skb->dev->name); + ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", + skb->dev->name); return; } if (!ndisc_parse_options(opt, optlen, &ndopts)) { - in6_dev_put(in6_dev); - ND_PRINTK2(KERN_WARNING - "ICMP6 RA: invalid ND options\n"); + ND_PRINTK(2, warn, "RA: invalid ND options\n"); return; } - /* skip route and link configuration on routers */ - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) + if (!ipv6_accept_ra(in6_dev)) goto skip_linkparms; #ifdef CONFIG_IPV6_NDISC_NODETYPE @@ -1191,6 +1133,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!in6_dev->cnf.accept_ra_defrtr) goto skip_defrtr; + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + goto skip_defrtr; + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); #ifdef CONFIG_IPV6_ROUTER_PREF @@ -1203,35 +1148,38 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); - if (rt) - neigh = rt->rt6i_nexthop; - + if (rt) { + neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + if (!neigh) { + ND_PRINTK(0, err, + "RA: %s got default router without neighbour\n", + __func__); + ip6_rt_put(rt); + return; + } + } if (rt && lifetime == 0) { - neigh_clone(neigh); ip6_del_rt(rt); rt = NULL; } if (rt == NULL && lifetime) { - ND_PRINTK3(KERN_DEBUG - "ICMPv6 RA: adding default router.\n"); + ND_PRINTK(3, dbg, "RA: adding default router\n"); rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); if (rt == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 RA: %s() failed to add default route.\n", - __func__); - in6_dev_put(in6_dev); + ND_PRINTK(0, err, + "RA: %s failed to add default route\n", + __func__); return; } - neigh = rt->rt6i_nexthop; + neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); if (neigh == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 RA: %s() got default router without neighbour.\n", - __func__); - dst_release(&rt->u.dst); - in6_dev_put(in6_dev); + ND_PRINTK(0, err, + "RA: %s got default router without neighbour\n", + __func__); + ip6_rt_put(rt); return; } neigh->flags |= NTF_ROUTER; @@ -1240,12 +1188,12 @@ static void ndisc_router_discovery(struct sk_buff *skb) } if (rt) - rt->rt6i_expires = jiffies + (HZ * lifetime); - + rt6_set_expires(rt, jiffies + (HZ * lifetime)); if (ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; if (rt) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit; + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } skip_defrtr: @@ -1261,7 +1209,7 @@ skip_defrtr: rtime = (rtime*HZ)/1000; if (rtime < HZ/10) rtime = HZ/10; - in6_dev->nd_parms->retrans_time = rtime; + NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); } @@ -1273,9 +1221,11 @@ skip_defrtr: if (rtime < HZ/10) rtime = HZ/10; - if (rtime != in6_dev->nd_parms->base_reachable_time) { - in6_dev->nd_parms->base_reachable_time = rtime; - in6_dev->nd_parms->gc_staletime = 3 * rtime; + if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) { + NEIGH_VAR_SET(in6_dev->nd_parms, + BASE_REACHABLE_TIME, rtime); + NEIGH_VAR_SET(in6_dev->nd_parms, + GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); @@ -1298,8 +1248,8 @@ skip_linkparms: lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: invalid link-layer address length\n"); + ND_PRINTK(2, warn, + "RA: invalid link-layer address length\n"); goto out; } } @@ -1310,11 +1260,13 @@ skip_linkparms: NEIGH_UPDATE_F_ISROUTER); } - /* skip route and link configuration on routers */ - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) + if (!ipv6_accept_ra(in6_dev)) goto out; #ifdef CONFIG_IPV6_ROUTE_INFO + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + goto skip_routeinfo; + if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; @@ -1326,12 +1278,17 @@ skip_linkparms: ri->prefix_len == 0) continue; #endif + if (ri->prefix_len == 0 && + !in6_dev->cnf.accept_ra_defrtr) + continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } + +skip_routeinfo: #endif #ifdef CONFIG_IPV6_NDISC_NODETYPE @@ -1345,7 +1302,9 @@ skip_linkparms: for (p = ndopts.nd_opts_pi; p; p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { - addrconf_prefix_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3); + addrconf_prefix_rcv(skb->dev, (u8 *)p, + (p->nd_opt_len) << 3, + ndopts.nd_opts_src_lladdr != NULL); } } @@ -1357,14 +1316,12 @@ skip_linkparms: mtu = ntohl(n); if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: invalid mtu: %d\n", - mtu); + ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; if (rt) - rt->u.dst.metrics[RTAX_MTU-1] = mtu; + dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } @@ -1380,252 +1337,180 @@ skip_linkparms: } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 RA: invalid RA options"); + ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: - if (rt) - dst_release(&rt->u.dst); - else if (neigh) + ip6_rt_put(rt); + if (neigh) neigh_release(neigh); - in6_dev_put(in6_dev); } static void ndisc_redirect_rcv(struct sk_buff *skb) { - struct inet6_dev *in6_dev; - struct icmp6hdr *icmph; - struct in6_addr *dest; - struct in6_addr *target; /* new first hop to destination */ - struct neighbour *neigh; - int on_link = 0; + u8 *hdr; struct ndisc_options ndopts; - int optlen; - u8 *lladdr = NULL; + struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); + u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + + offsetof(struct rd_msg, opt)); #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: from host or unauthorized router\n"); + ND_PRINTK(2, warn, + "Redirect: from host or unauthorized router\n"); return; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: source address is not link-local.\n"); + ND_PRINTK(2, warn, + "Redirect: source address is not link-local\n"); return; } - optlen = skb->tail - skb->transport_header; - optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); - - if (optlen < 0) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: packet too short\n"); + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) return; - } - - icmph = icmp6_hdr(skb); - target = (struct in6_addr *) (icmph + 1); - dest = target + 1; - if (ipv6_addr_is_multicast(dest)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: destination address is multicast.\n"); + if (!ndopts.nd_opts_rh) { + ip6_redirect_no_header(skb, dev_net(skb->dev), + skb->dev->ifindex, 0); return; } - if (ipv6_addr_equal(dest, target)) { - on_link = 1; - } else if (ipv6_addr_type(target) != - (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: target address is not link-local unicast.\n"); + hdr = (u8 *)ndopts.nd_opts_rh; + hdr += 8; + if (!pskb_pull(skb, hdr - skb_transport_header(skb))) return; - } - in6_dev = in6_dev_get(skb->dev); - if (!in6_dev) - return; - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) { - in6_dev_put(in6_dev); - return; - } + icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); +} - /* RFC2461 8.1: - * The IP source address of the Redirect MUST be the same as the current - * first-hop router for the specified ICMP Destination Address. - */ +static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, + struct sk_buff *orig_skb, + int rd_len) +{ + u8 *opt = skb_put(skb, rd_len); - if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: invalid ND options\n"); - in6_dev_put(in6_dev); - return; - } - if (ndopts.nd_opts_tgt_lladdr) { - lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, - skb->dev); - if (!lladdr) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: invalid link-layer address length\n"); - in6_dev_put(in6_dev); - return; - } - } + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; - neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); - if (neigh) { - rt6_redirect(dest, &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr, neigh, lladdr, - on_link); - neigh_release(neigh); - } - in6_dev_put(in6_dev); + memcpy(opt, ipv6_hdr(orig_skb), rd_len - 8); } -void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, - const struct in6_addr *target) +void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; - int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + int optlen = 0; + struct inet_peer *peer; struct sk_buff *buff; - struct icmp6hdr *icmph; + struct rd_msg *msg; struct in6_addr saddr_buf; - struct in6_addr *addrp; struct rt6_info *rt; struct dst_entry *dst; - struct inet6_dev *idev; - struct flowi fl; - u8 *opt; + struct flowi6 fl6; int rd_len; - int err; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + bool ret; if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: no link-local address on %s\n", - dev->name); + ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", + dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: target address is not link-local unicast.\n"); + ND_PRINTK(2, warn, + "Redirect: target address is not link-local unicast\n"); return; } - icmpv6_flow_init(sk, &fl, NDISC_REDIRECT, + icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); - dst = ip6_route_output(net, NULL, &fl); - if (dst == NULL) + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); return; - - err = xfrm_lookup(net, &dst, &fl, NULL, 0); - if (err) + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) return; rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 Redirect: destination is not a neighbour.\n"); + ND_PRINTK(2, warn, + "Redirect: destination is not a neighbour\n"); goto release; } - if (!xrlim_allow(dst, 1*HZ)) + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); + ret = inet_peer_xrlim_allow(peer, 1*HZ); + if (peer) + inet_putpeer(peer); + if (!ret) goto release; if (dev->addr_len) { + struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); + if (!neigh) { + ND_PRINTK(2, warn, + "Redirect: no neigh for target address\n"); + goto release; + } + read_lock_bh(&neigh->lock); if (neigh->nud_state & NUD_VALID) { memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; - len += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev); } else read_unlock_bh(&neigh->lock); + + neigh_release(neigh); } rd_len = min_t(unsigned int, - IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8); + IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, + skb->len + 8); rd_len &= ~0x7; - len += rd_len; - - buff = sock_alloc_send_skb(sk, - (MAX_HEADER + sizeof(struct ipv6hdr) + - len + LL_ALLOCATED_SPACE(dev)), - 1, &err); - if (buff == NULL) { - ND_PRINTK0(KERN_ERR - "ICMPv6 Redirect: %s() failed to allocate an skb, err=%d.\n", - __func__, err); - goto release; - } - - skb_reserve(buff, LL_RESERVED_SPACE(dev)); - ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, - IPPROTO_ICMPV6, len); + optlen += rd_len; - skb_set_transport_header(buff, skb_tail_pointer(buff) - buff->data); - skb_put(buff, len); - icmph = icmp6_hdr(buff); - - memset(icmph, 0, sizeof(struct icmp6hdr)); - icmph->icmp6_type = NDISC_REDIRECT; - - /* - * copy target and destination addresses - */ - - addrp = (struct in6_addr *)(icmph + 1); - ipv6_addr_copy(addrp, target); - addrp++; - ipv6_addr_copy(addrp, &ipv6_hdr(skb)->daddr); + buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); + if (!buff) + goto release; - opt = (u8*) (addrp + 1); + msg = (struct rd_msg *)skb_put(buff, sizeof(*msg)); + *msg = (struct rd_msg) { + .icmph = { + .icmp6_type = NDISC_REDIRECT, + }, + .target = *target, + .dest = ipv6_hdr(skb)->daddr, + }; /* * include target_address option */ if (ha) - opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha, - dev->addr_len, dev->type); + ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha); /* * build redirect option and copy skb over to the new packet. */ - memset(opt, 0, 8); - *(opt++) = ND_OPT_REDIRECT_HDR; - *(opt++) = (rd_len >> 3); - opt += 6; - - memcpy(opt, ipv6_hdr(skb), rd_len - 8); - - icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &ipv6_hdr(skb)->saddr, - len, IPPROTO_ICMPV6, - csum_partial(icmph, len, 0)); + if (rd_len) + ndisc_fill_redirect_hdr_option(buff, skb, rd_len); skb_dst_set(buff, dst); - idev = in6_dev_get(dst->dev); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, - dst_output); - if (!err) { - ICMP6MSGOUT_INC_STATS(net, idev, NDISC_REDIRECT); - ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - } - - if (likely(idev != NULL)) - in6_dev_put(idev); + ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); return; release: @@ -1638,11 +1523,28 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) +{ + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return true; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && + idev->cnf.suppress_frag_ndisc) { + net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); + return true; + } + return false; +} + int ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; - if (!pskb_may_pull(skb, skb->len)) + if (ndisc_suppress_frag_ndisc(skb)) + return 0; + + if (skb_linearize(skb)) return 0; msg = (struct nd_msg *)skb_transport_header(skb); @@ -1650,16 +1552,14 @@ int ndisc_rcv(struct sk_buff *skb) __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NDISC: invalid hop-limit: %d\n", - ipv6_hdr(skb)->hop_limit); + ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", + ipv6_hdr(skb)->hop_limit); return 0; } if (msg->icmph.icmp6_code != 0) { - ND_PRINTK2(KERN_WARNING - "ICMPv6 NDISC: invalid ICMPv6 code: %d\n", - msg->icmph.icmp6_code); + ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", + msg->icmph.icmp6_code); return 0; } @@ -1692,17 +1592,27 @@ int ndisc_rcv(struct sk_buff *skb) static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct inet6_dev *idev; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); - fib6_run_gc(~0UL, net); + fib6_run_gc(0, net, false); + idev = in6_dev_get(dev); + if (!idev) + break; + if (idev->cnf.ndisc_notify) + ndisc_send_unsol_na(dev); + in6_dev_put(idev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); - fib6_run_gc(~0UL, net); + fib6_run_gc(0, net, false); + break; + case NETDEV_NOTIFY_PEERS: + ndisc_send_unsol_na(dev); break; default: break; @@ -1723,11 +1633,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); - printk(KERN_WARNING - "process `%s' is using deprecated sysctl (%s) " - "net.ipv6.neigh.%s.%s; " - "Use net.ipv6.neigh.%s.%s_ms " - "instead.\n", + pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, dev_name, ctl->procname); @@ -1746,22 +1652,23 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *bu ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) - ret = proc_dointvec_jiffies(ctl, write, - buffer, lenp, ppos); + ret = neigh_proc_dointvec_jiffies(ctl, write, + buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) - ret = proc_dointvec_ms_jiffies(ctl, write, - buffer, lenp, ppos); + ret = neigh_proc_dointvec_ms_jiffies(ctl, write, + buffer, lenp, ppos); else ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { - if (ctl->data == &idev->nd_parms->base_reachable_time) - idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) + idev->nd_parms->reachable_time = + neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); idev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); @@ -1772,7 +1679,7 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *bu #endif -static int ndisc_net_init(struct net *net) +static int __net_init ndisc_net_init(struct net *net) { struct ipv6_pinfo *np; struct sock *sk; @@ -1781,9 +1688,9 @@ static int ndisc_net_init(struct net *net) err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - ND_PRINTK0(KERN_ERR - "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n", - err); + ND_PRINTK(0, err, + "NDISC: Failed to initialize the control socket (err %d)\n", + err); return err; } @@ -1797,7 +1704,7 @@ static int ndisc_net_init(struct net *net) return 0; } -static void ndisc_net_exit(struct net *net) +static void __net_exit ndisc_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.ndisc_sk); } @@ -1820,30 +1727,33 @@ int __init ndisc_init(void) neigh_table_init(&nd_tbl); #ifdef CONFIG_SYSCTL - err = neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, - NET_IPV6_NEIGH, "ipv6", + err = neigh_sysctl_register(NULL, &nd_tbl.parms, &ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; -#endif - err = register_netdevice_notifier(&ndisc_netdev_notifier); - if (err) - goto out_unregister_sysctl; out: +#endif return err; -out_unregister_sysctl: #ifdef CONFIG_SYSCTL - neigh_sysctl_unregister(&nd_tbl.parms); out_unregister_pernet: -#endif unregister_pernet_subsys(&ndisc_net_ops); goto out; +#endif } -void ndisc_cleanup(void) +int __init ndisc_late_init(void) +{ + return register_netdevice_notifier(&ndisc_netdev_notifier); +} + +void ndisc_late_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); +} + +void ndisc_cleanup(void) +{ #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index d5ed92b1434..d38e6a8d8b9 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -1,8 +1,16 @@ +/* + * IPv6 specific functions of netfilter core + * + * Rusty Russell (C) 2000 -- This code is GPL. + * Patrick McHardy (C) 2006-2012 + */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> +#include <linux/export.h> +#include <net/addrconf.h> #include <net/dst.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -13,43 +21,49 @@ int ip6_route_me_harder(struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int hh_len; struct dst_entry *dst; - struct flowi fl = { - .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, - .mark = skb->mark, - .nl_u = - { .ip6_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, } }, + struct flowi6 fl6 = { + .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .flowi6_mark = skb->mark, + .daddr = iph->daddr, + .saddr = iph->saddr, }; + int err; - dst = ip6_route_output(net, skb->sk, &fl); - -#ifdef CONFIG_XFRM - if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, &fl, AF_INET6) == 0) { - struct dst_entry *dst2 = skb_dst(skb); - - if (xfrm_lookup(net, &dst2, &fl, skb->sk, 0)) { - skb_dst_set(skb, NULL); - return -1; - } - skb_dst_set(skb, dst2); - } -#endif - - if (dst->error) { + dst = ip6_route_output(net, skb->sk, &fl6); + err = dst->error; + if (err) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); dst_release(dst); - return -EINVAL; + return err; } /* Drop old route. */ skb_dst_drop(skb); skb_dst_set(skb, dst); + +#ifdef CONFIG_XFRM + if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { + skb_dst_set(skb, NULL); + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + skb_dst_set(skb, dst); + } +#endif + + /* Change in oif may mean change in hh_len. */ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), + 0, GFP_ATOMIC)) + return -ENOMEM; + return 0; } EXPORT_SYMBOL(ip6_route_me_harder); @@ -71,7 +85,7 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->hook == NF_INET_LOCAL_OUT) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); rt_info->daddr = iph->daddr; rt_info->saddr = iph->saddr; @@ -85,7 +99,7 @@ static int nf_ip6_reroute(struct sk_buff *skb, struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->hook == NF_INET_LOCAL_OUT) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) @@ -94,16 +108,32 @@ static int nf_ip6_reroute(struct sk_buff *skb, return 0; } -static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip6_route(struct net *net, struct dst_entry **dst, + struct flowi *fl, bool strict) { - *dst = ip6_route_output(&init_net, NULL, fl); - return (*dst)->error; + static const struct ipv6_pinfo fake_pinfo; + static const struct inet_sock fake_sk = { + /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */ + .sk.sk_bound_dev_if = 1, + .pinet6 = (struct ipv6_pinfo *) &fake_pinfo, + }; + const void *sk = strict ? &fake_sk : NULL; + struct dst_entry *result; + int err; + + result = ip6_route_output(net, sk, &fl->u.ip6); + err = result->error; + if (err) + dst_release(result); + else + *dst = result; + return err; } __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) { - struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { @@ -137,7 +167,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u_int8_t protocol) { - struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); __wsum hsum; __sum16 csum = 0; @@ -154,13 +184,15 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, protocol, csum_sub(0, hsum))); skb->ip_summed = CHECKSUM_NONE; - csum = __skb_checksum_complete_head(skb, dataoff + len); - if (!csum) - skb->ip_summed = CHECKSUM_UNNECESSARY; + return __skb_checksum_complete_head(skb, dataoff + len); } return csum; }; +static const struct nf_ipv6_ops ipv6ops = { + .chk_addr = ipv6_chk_addr, +}; + static const struct nf_afinfo nf_ip6_afinfo = { .family = AF_INET6, .checksum = nf_ip6_checksum, @@ -173,6 +205,7 @@ static const struct nf_afinfo nf_ip6_afinfo = { int __init ipv6_netfilter_init(void) { + RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); return nf_register_afinfo(&nf_ip6_afinfo); } @@ -181,5 +214,6 @@ int __init ipv6_netfilter_init(void) */ void ipv6_netfilter_fini(void) { + RCU_INIT_POINTER(nf_ipv6_ops, NULL); nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 29d643bcafa..4bff1f297e3 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,15 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER +config NF_DEFRAG_IPV6 + tristate + default n + config NF_CONNTRACK_IPV6 tristate "IPv6 connection tracking support" depends on INET && IPV6 && NF_CONNTRACK default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV6 ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -20,27 +25,35 @@ config NF_CONNTRACK_IPV6 To compile it as a module, choose M here. If unsure, say N. -config IP6_NF_QUEUE - tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" - depends on INET && IPV6 && NETFILTER - depends on NETFILTER_ADVANCED - ---help--- - - This option adds a queue handler to the kernel for IPv6 - packets which enables users to receive the filtered packets - with QUEUE target using libipq. - - This option enables the old IPv6-only "ip6_queue" implementation - which has been obsoleted by the new "nfnetlink_queue" code (see - CONFIG_NETFILTER_NETLINK_QUEUE). +config NF_TABLES_IPV6 + depends on NF_TABLES + tristate "IPv6 nf_tables support" + help + This option enables the IPv6 support for nf_tables. - (C) Fernando Anton 2001 - IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. - Universidad Carlos III de Madrid - Universidad Politecnica de Alcala de Henares - email: <fanton@it.uc3m.es>. +config NFT_CHAIN_ROUTE_IPV6 + depends on NF_TABLES_IPV6 + tristate "IPv6 nf_tables route chain support" + help + This option enables the "route" chain for IPv6 in nf_tables. This + chain type is used to force packet re-routing after mangling header + fields such as the source, destination, flowlabel, hop-limit and + the packet mark. + +config NFT_CHAIN_NAT_IPV6 + depends on NF_TABLES_IPV6 + depends on NF_NAT_IPV6 && NFT_NAT + tristate "IPv6 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv6 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. - To compile it as a module, choose M here. If unsure, say N. +config NFT_REJECT_IPV6 + depends on NF_TABLES_IPV6 + default NFT_REJECT + tristate config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" @@ -120,6 +133,16 @@ config IP6_NF_MATCH_MH To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_MATCH_RPFILTER + tristate '"rpfilter" reverse path filter match support' + depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW) + ---help--- + This option allows you to match packets whose replies would + go out via the interface the packet came in. + + To compile it as a module, choose M here. If unsure, say N. + The module will be called ip6t_rpfilter. + config IP6_NF_MATCH_RT tristate '"rt" Routing header match support' depends on NETFILTER_ADVANCED @@ -132,22 +155,13 @@ config IP6_NF_MATCH_RT # The targets config IP6_NF_TARGET_HL tristate '"HL" hoplimit target support' - depends on NETFILTER_ADVANCED + depends on NETFILTER_ADVANCED && IP6_NF_MANGLE select NETFILTER_XT_TARGET_HL ---help--- - This is a backwards-compat option for the user's convenience + This is a backwards-compatible option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_HL. -config IP6_NF_TARGET_LOG - tristate "LOG target support" - default m if NETFILTER_ADVANCED=n - help - This option adds a `LOG' target, which allows you to create rules in - any iptables table which records the packet header to the syslog. - - To compile it as a module, choose M here. If unsure, say N. - config IP6_NF_FILTER tristate "Packet filtering" default m if NETFILTER_ADVANCED=n @@ -169,6 +183,19 @@ config IP6_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_MANGLE tristate "Packet mangling" default m if NETFILTER_ADVANCED=n @@ -181,7 +208,6 @@ config IP6_NF_MANGLE config IP6_NF_RAW tristate 'raw table support (required for TRACE)' - depends on NETFILTER_ADVANCED help This option adds a `raw' table to ip6tables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -198,9 +224,44 @@ config IP6_NF_SECURITY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. - + If unsure, say N. +config NF_NAT_IPV6 + tristate "IPv6 NAT" + depends on NF_CONNTRACK_IPV6 + depends on NETFILTER_ADVANCED + select NF_NAT + help + The IPv6 NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. It is controlled by + the `nat' table in ip6tables, see the man page for ip6tables(8). + + To compile it as a module, choose M here. If unsure, say N. + +if NF_NAT_IPV6 + +config IP6_NF_TARGET_MASQUERADE + tristate "MASQUERADE target support" + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_TARGET_NPT + tristate "NPT (Network Prefix translation) target support" + help + This option adds the `SNPT' and `DNPT' target, which perform + stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296. + + To compile it as a module, choose M here. If unsure, say N. + +endif # NF_NAT_IPV6 + endif # IP6_NF_IPTABLES endmenu diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index aafbba30c89..70d3dd66f2c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -6,16 +6,29 @@ obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o -obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o +obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o # objects for l3 independent conntrack -nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o +nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o # l3 independent conntrack obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o +nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o +obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o + +# defrag +nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o +obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o + +# nf_tables +obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o +obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o +obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o +obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o + # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o @@ -23,8 +36,11 @@ obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o +obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o # targets -obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o +obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o +obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o +obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c deleted file mode 100644 index 7854052be60..00000000000 --- a/net/ipv6/netfilter/ip6_queue.c +++ /dev/null @@ -1,646 +0,0 @@ -/* - * This is a module which is used for queueing IPv6 packets and - * communicating with userspace via netlink. - * - * (C) 2001 Fernando Anton, this code is GPL. - * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. - * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain - * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain - * email: fanton@it.uc3m.es - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/ipv6.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/netfilter.h> -#include <linux/netlink.h> -#include <linux/spinlock.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> -#include <net/net_namespace.h> -#include <net/sock.h> -#include <net/ipv6.h> -#include <net/ip6_route.h> -#include <net/netfilter/nf_queue.h> -#include <linux/netfilter_ipv4/ip_queue.h> -#include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv6/ip6_tables.h> - -#define IPQ_QMAX_DEFAULT 1024 -#define IPQ_PROC_FS_NAME "ip6_queue" -#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" - -typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); - -static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; -static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; -static DEFINE_RWLOCK(queue_lock); -static int peer_pid __read_mostly; -static unsigned int copy_range __read_mostly; -static unsigned int queue_total; -static unsigned int queue_dropped = 0; -static unsigned int queue_user_dropped = 0; -static struct sock *ipqnl __read_mostly; -static LIST_HEAD(queue_list); -static DEFINE_MUTEX(ipqnl_mutex); - -static inline void -__ipq_enqueue_entry(struct nf_queue_entry *entry) -{ - list_add_tail(&entry->list, &queue_list); - queue_total++; -} - -static inline int -__ipq_set_mode(unsigned char mode, unsigned int range) -{ - int status = 0; - - switch(mode) { - case IPQ_COPY_NONE: - case IPQ_COPY_META: - copy_mode = mode; - copy_range = 0; - break; - - case IPQ_COPY_PACKET: - copy_mode = mode; - copy_range = range; - if (copy_range > 0xFFFF) - copy_range = 0xFFFF; - break; - - default: - status = -EINVAL; - - } - return status; -} - -static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); - -static inline void -__ipq_reset(void) -{ - peer_pid = 0; - net_disable_timestamp(); - __ipq_set_mode(IPQ_COPY_NONE, 0); - __ipq_flush(NULL, 0); -} - -static struct nf_queue_entry * -ipq_find_dequeue_entry(unsigned long id) -{ - struct nf_queue_entry *entry = NULL, *i; - - write_lock_bh(&queue_lock); - - list_for_each_entry(i, &queue_list, list) { - if ((unsigned long)i == id) { - entry = i; - break; - } - } - - if (entry) { - list_del(&entry->list); - queue_total--; - } - - write_unlock_bh(&queue_lock); - return entry; -} - -static void -__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ - struct nf_queue_entry *entry, *next; - - list_for_each_entry_safe(entry, next, &queue_list, list) { - if (!cmpfn || cmpfn(entry, data)) { - list_del(&entry->list); - queue_total--; - nf_reinject(entry, NF_DROP); - } - } -} - -static void -ipq_flush(ipq_cmpfn cmpfn, unsigned long data) -{ - write_lock_bh(&queue_lock); - __ipq_flush(cmpfn, data); - write_unlock_bh(&queue_lock); -} - -static struct sk_buff * -ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) -{ - sk_buff_data_t old_tail; - size_t size = 0; - size_t data_len = 0; - struct sk_buff *skb; - struct ipq_packet_msg *pmsg; - struct nlmsghdr *nlh; - struct timeval tv; - - read_lock_bh(&queue_lock); - - switch (copy_mode) { - case IPQ_COPY_META: - case IPQ_COPY_NONE: - size = NLMSG_SPACE(sizeof(*pmsg)); - break; - - case IPQ_COPY_PACKET: - if ((entry->skb->ip_summed == CHECKSUM_PARTIAL || - entry->skb->ip_summed == CHECKSUM_COMPLETE) && - (*errp = skb_checksum_help(entry->skb))) { - read_unlock_bh(&queue_lock); - return NULL; - } - if (copy_range == 0 || copy_range > entry->skb->len) - data_len = entry->skb->len; - else - data_len = copy_range; - - size = NLMSG_SPACE(sizeof(*pmsg) + data_len); - break; - - default: - *errp = -EINVAL; - read_unlock_bh(&queue_lock); - return NULL; - } - - read_unlock_bh(&queue_lock); - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) - goto nlmsg_failure; - - old_tail = skb->tail; - nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); - pmsg = NLMSG_DATA(nlh); - memset(pmsg, 0, sizeof(*pmsg)); - - pmsg->packet_id = (unsigned long )entry; - pmsg->data_len = data_len; - tv = ktime_to_timeval(entry->skb->tstamp); - pmsg->timestamp_sec = tv.tv_sec; - pmsg->timestamp_usec = tv.tv_usec; - pmsg->mark = entry->skb->mark; - pmsg->hook = entry->hook; - pmsg->hw_protocol = entry->skb->protocol; - - if (entry->indev) - strcpy(pmsg->indev_name, entry->indev->name); - else - pmsg->indev_name[0] = '\0'; - - if (entry->outdev) - strcpy(pmsg->outdev_name, entry->outdev->name); - else - pmsg->outdev_name[0] = '\0'; - - if (entry->indev && entry->skb->dev) { - pmsg->hw_type = entry->skb->dev->type; - pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); - } - - if (data_len) - if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) - BUG(); - - nlh->nlmsg_len = skb->tail - old_tail; - return skb; - -nlmsg_failure: - *errp = -EINVAL; - printk(KERN_ERR "ip6_queue: error creating packet message\n"); - return NULL; -} - -static int -ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) -{ - int status = -EINVAL; - struct sk_buff *nskb; - - if (copy_mode == IPQ_COPY_NONE) - return -EAGAIN; - - nskb = ipq_build_packet_message(entry, &status); - if (nskb == NULL) - return status; - - write_lock_bh(&queue_lock); - - if (!peer_pid) - goto err_out_free_nskb; - - if (queue_total >= queue_maxlen) { - queue_dropped++; - status = -ENOSPC; - if (net_ratelimit()) - printk (KERN_WARNING "ip6_queue: fill at %d entries, " - "dropping packet(s). Dropped: %d\n", queue_total, - queue_dropped); - goto err_out_free_nskb; - } - - /* netlink_unicast will either free the nskb or attach it to a socket */ - status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); - if (status < 0) { - queue_user_dropped++; - goto err_out_unlock; - } - - __ipq_enqueue_entry(entry); - - write_unlock_bh(&queue_lock); - return status; - -err_out_free_nskb: - kfree_skb(nskb); - -err_out_unlock: - write_unlock_bh(&queue_lock); - return status; -} - -static int -ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e) -{ - int diff; - struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload; - struct sk_buff *nskb; - - if (v->data_len < sizeof(*user_iph)) - return 0; - diff = v->data_len - e->skb->len; - if (diff < 0) { - if (pskb_trim(e->skb, v->data_len)) - return -ENOMEM; - } else if (diff > 0) { - if (v->data_len > 0xFFFF) - return -EINVAL; - if (diff > skb_tailroom(e->skb)) { - nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), - diff, GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "ip6_queue: OOM " - "in mangle, dropping packet\n"); - return -ENOMEM; - } - kfree_skb(e->skb); - e->skb = nskb; - } - skb_put(e->skb, diff); - } - if (!skb_make_writable(e->skb, v->data_len)) - return -ENOMEM; - skb_copy_to_linear_data(e->skb, v->payload, v->data_len); - e->skb->ip_summed = CHECKSUM_NONE; - - return 0; -} - -static int -ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) -{ - struct nf_queue_entry *entry; - - if (vmsg->value > NF_MAX_VERDICT) - return -EINVAL; - - entry = ipq_find_dequeue_entry(vmsg->id); - if (entry == NULL) - return -ENOENT; - else { - int verdict = vmsg->value; - - if (vmsg->data_len && vmsg->data_len == len) - if (ipq_mangle_ipv6(vmsg, entry) < 0) - verdict = NF_DROP; - - nf_reinject(entry, verdict); - return 0; - } -} - -static int -ipq_set_mode(unsigned char mode, unsigned int range) -{ - int status; - - write_lock_bh(&queue_lock); - status = __ipq_set_mode(mode, range); - write_unlock_bh(&queue_lock); - return status; -} - -static int -ipq_receive_peer(struct ipq_peer_msg *pmsg, - unsigned char type, unsigned int len) -{ - int status = 0; - - if (len < sizeof(*pmsg)) - return -EINVAL; - - switch (type) { - case IPQM_MODE: - status = ipq_set_mode(pmsg->msg.mode.value, - pmsg->msg.mode.range); - break; - - case IPQM_VERDICT: - if (pmsg->msg.verdict.value > NF_MAX_VERDICT) - status = -EINVAL; - else - status = ipq_set_verdict(&pmsg->msg.verdict, - len - sizeof(*pmsg)); - break; - default: - status = -EINVAL; - } - return status; -} - -static int -dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) -{ - if (entry->indev) - if (entry->indev->ifindex == ifindex) - return 1; - - if (entry->outdev) - if (entry->outdev->ifindex == ifindex) - return 1; -#ifdef CONFIG_BRIDGE_NETFILTER - if (entry->skb->nf_bridge) { - if (entry->skb->nf_bridge->physindev && - entry->skb->nf_bridge->physindev->ifindex == ifindex) - return 1; - if (entry->skb->nf_bridge->physoutdev && - entry->skb->nf_bridge->physoutdev->ifindex == ifindex) - return 1; - } -#endif - return 0; -} - -static void -ipq_dev_drop(int ifindex) -{ - ipq_flush(dev_cmp, ifindex); -} - -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) - -static inline void -__ipq_rcv_skb(struct sk_buff *skb) -{ - int status, type, pid, flags, nlmsglen, skblen; - struct nlmsghdr *nlh; - - skblen = skb->len; - if (skblen < sizeof(*nlh)) - return; - - nlh = nlmsg_hdr(skb); - nlmsglen = nlh->nlmsg_len; - if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) - return; - - pid = nlh->nlmsg_pid; - flags = nlh->nlmsg_flags; - - if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) - RCV_SKB_FAIL(-EINVAL); - - if (flags & MSG_TRUNC) - RCV_SKB_FAIL(-ECOMM); - - type = nlh->nlmsg_type; - if (type < NLMSG_NOOP || type >= IPQM_MAX) - RCV_SKB_FAIL(-EINVAL); - - if (type <= IPQM_BASE) - return; - - if (security_netlink_recv(skb, CAP_NET_ADMIN)) - RCV_SKB_FAIL(-EPERM); - - write_lock_bh(&queue_lock); - - if (peer_pid) { - if (peer_pid != pid) { - write_unlock_bh(&queue_lock); - RCV_SKB_FAIL(-EBUSY); - } - } else { - net_enable_timestamp(); - peer_pid = pid; - } - - write_unlock_bh(&queue_lock); - - status = ipq_receive_peer(NLMSG_DATA(nlh), type, - nlmsglen - NLMSG_LENGTH(0)); - if (status < 0) - RCV_SKB_FAIL(status); - - if (flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - return; -} - -static void -ipq_rcv_skb(struct sk_buff *skb) -{ - mutex_lock(&ipqnl_mutex); - __ipq_rcv_skb(skb); - mutex_unlock(&ipqnl_mutex); -} - -static int -ipq_rcv_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = ptr; - - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - - /* Drop any packets associated with the downed device */ - if (event == NETDEV_DOWN) - ipq_dev_drop(dev->ifindex); - return NOTIFY_DONE; -} - -static struct notifier_block ipq_dev_notifier = { - .notifier_call = ipq_rcv_dev_event, -}; - -static int -ipq_rcv_nl_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct netlink_notify *n = ptr; - - if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) { - write_lock_bh(&queue_lock); - if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) - __ipq_reset(); - write_unlock_bh(&queue_lock); - } - return NOTIFY_DONE; -} - -static struct notifier_block ipq_nl_notifier = { - .notifier_call = ipq_rcv_nl_event, -}; - -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *ipq_sysctl_header; - -static ctl_table ipq_table[] = { - { - .procname = NET_IPQ_QMAX_NAME, - .data = &queue_maxlen, - .maxlen = sizeof(queue_maxlen), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { } -}; -#endif - -#ifdef CONFIG_PROC_FS -static int ip6_queue_show(struct seq_file *m, void *v) -{ - read_lock_bh(&queue_lock); - - seq_printf(m, - "Peer PID : %d\n" - "Copy mode : %hu\n" - "Copy range : %u\n" - "Queue length : %u\n" - "Queue max. length : %u\n" - "Queue dropped : %u\n" - "Netfilter dropped : %u\n", - peer_pid, - copy_mode, - copy_range, - queue_total, - queue_maxlen, - queue_dropped, - queue_user_dropped); - - read_unlock_bh(&queue_lock); - return 0; -} - -static int ip6_queue_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip6_queue_show, NULL); -} - -static const struct file_operations ip6_queue_proc_fops = { - .open = ip6_queue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; -#endif - -static const struct nf_queue_handler nfqh = { - .name = "ip6_queue", - .outfn = &ipq_enqueue_packet, -}; - -static int __init ip6_queue_init(void) -{ - int status = -ENOMEM; - struct proc_dir_entry *proc __maybe_unused; - - netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, - ipq_rcv_skb, NULL, THIS_MODULE); - if (ipqnl == NULL) { - printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } - -#ifdef CONFIG_PROC_FS - proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, - &ip6_queue_proc_fops); - if (!proc) { - printk(KERN_ERR "ip6_queue: failed to create proc entry\n"); - goto cleanup_ipqnl; - } -#endif - register_netdevice_notifier(&ipq_dev_notifier); -#ifdef CONFIG_SYSCTL - ipq_sysctl_header = register_sysctl_paths(net_ipv6_ctl_path, ipq_table); -#endif - status = nf_register_queue_handler(NFPROTO_IPV6, &nfqh); - if (status < 0) { - printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); - goto cleanup_sysctl; - } - return status; - -cleanup_sysctl: -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ipq_sysctl_header); -#endif - unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - -cleanup_ipqnl: __maybe_unused - netlink_kernel_release(ipqnl); - mutex_lock(&ipqnl_mutex); - mutex_unlock(&ipqnl_mutex); - -cleanup_netlink_notifier: - netlink_unregister_notifier(&ipq_nl_notifier); - return status; -} - -static void __exit ip6_queue_fini(void) -{ - nf_unregister_queue_handlers(&nfqh); - - ipq_flush(NULL, 0); - -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(ipq_sysctl_header); -#endif - unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - - netlink_kernel_release(ipqnl); - mutex_lock(&ipqnl_mutex); - mutex_unlock(&ipqnl_mutex); - - netlink_unregister_notifier(&ipq_nl_notifier); -} - -MODULE_DESCRIPTION("IPv6 packet queue handler"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_IP6_FW); - -module_init(ip6_queue_init); -module_exit(ip6_queue_fini); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 480d7f8c980..e080fbbbc0e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -3,6 +3,7 @@ * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -29,6 +30,7 @@ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> +#include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -39,24 +41,19 @@ MODULE_DESCRIPTION("IPv6 packet filter"); /*#define DEBUG_IP_FIREWALL_USER*/ #ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) printk(format , ## args) +#define dprintf(format, args...) pr_info(format , ## args) #else #define dprintf(format, args...) #endif #ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) printk(format , ## args) +#define duprintf(format, args...) pr_info(format , ## args) #else #define duprintf(format, args...) #endif #ifdef CONFIG_NETFILTER_DEBUG -#define IP_NF_ASSERT(x) \ -do { \ - if (!(x)) \ - printk("IP_NF_ASSERT: %s:%s:%u\n", \ - __func__, __FILE__, __LINE__); \ -} while(0) +#define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif @@ -67,6 +64,12 @@ do { \ #define inline #endif +void *ip6t_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(ip6t, IP6T); +} +EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); + /* We keep a set of rules for each CPU, so we can avoid write-locking them in the softirq when updating the counters and therefore @@ -76,19 +79,6 @@ do { \ Hence the start of any table is given by get_table() below. */ -/* Check for an extension */ -int -ip6t_ext_hdr(u8 nexthdr) -{ - return ( (nexthdr == IPPROTO_HOPOPTS) || - (nexthdr == IPPROTO_ROUTING) || - (nexthdr == IPPROTO_FRAGMENT) || - (nexthdr == IPPROTO_ESP) || - (nexthdr == IPPROTO_AH) || - (nexthdr == IPPROTO_NONE) || - (nexthdr == IPPROTO_DSTOPTS) ); -} - /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -144,7 +134,7 @@ ip6_packet_match(const struct sk_buff *skb, int protohdr; unsigned short _frag_off; - protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off); + protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL); if (protohdr < 0) { if (_frag_off == 0) *hotdrop = true; @@ -190,32 +180,15 @@ ip6_checkentry(const struct ip6t_ip6 *ipv6) } static unsigned int -ip6t_error(struct sk_buff *skb, const struct xt_target_param *par) +ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) { - if (net_ratelimit()) - printk("ip6_tables: error: `%s'\n", - (const char *)par->targinfo); + net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } -/* Performance critical - called for every packet */ -static inline bool -do_match(struct ip6t_entry_match *m, const struct sk_buff *skb, - struct xt_match_param *par) -{ - par->match = m->u.kernel.match; - par->matchinfo = m->data; - - /* Stop iteration if it doesn't match */ - if (!m->u.kernel.match->match(skb, par)) - return true; - else - return false; -} - static inline struct ip6t_entry * -get_entry(void *base, unsigned int offset) +get_entry(const void *base, unsigned int offset) { return (struct ip6t_entry *)(base + offset); } @@ -229,8 +202,13 @@ static inline bool unconditional(const struct ip6t_ip6 *ipv6) return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; } -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +static inline const struct xt_entry_target * +ip6t_get_target_c(const struct ip6t_entry *e) +{ + return ip6t_get_target((struct ip6t_entry *)e); +} + +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* This cries for unification! */ static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", @@ -264,13 +242,13 @@ static struct nf_loginfo trace_loginfo = { /* Mildly perf critical (only if packet tracing is on) */ static inline int -get_chainname_rulenum(struct ip6t_entry *s, struct ip6t_entry *e, +get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { - struct ip6t_standard_target *t = (void *)ip6t_get_target(s); + const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); - if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) { + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; @@ -279,7 +257,7 @@ get_chainname_rulenum(struct ip6t_entry *s, struct ip6t_entry *e, if (s->target_offset == sizeof(struct ip6t_entry) && strcmp(t->target.u.kernel.target->name, - IP6T_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && t->verdict < 0 && unconditional(&s->ipv6)) { /* Tail of chains: STANDARD target (return/policy) */ @@ -294,18 +272,20 @@ get_chainname_rulenum(struct ip6t_entry *s, struct ip6t_entry *e, return 0; } -static void trace_packet(struct sk_buff *skb, +static void trace_packet(const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, - struct xt_table_info *private, - struct ip6t_entry *e) + const struct xt_table_info *private, + const struct ip6t_entry *e) { - void *table_base; + const void *table_base; const struct ip6t_entry *root; const char *hookname, *chainname, *comment; + const struct ip6t_entry *iter; unsigned int rulenum = 0; + struct net *net = dev_net(in ? in : out); table_base = private->entries[smp_processor_id()]; root = get_entry(table_base, private->hook_entry[hook]); @@ -313,12 +293,12 @@ static void trace_packet(struct sk_buff *skb, hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; - IP6T_ENTRY_ITERATE(root, - private->size - private->hook_entry[hook], - get_chainname_rulenum, - e, hookname, &chainname, &comment, &rulenum); + xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) + if (get_chainname_rulenum(iter, e, hookname, + &chainname, &comment, &rulenum) != 0) + break; - nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo, + nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } @@ -338,18 +318,16 @@ ip6t_do_table(struct sk_buff *skb, const struct net_device *out, struct xt_table *table) { -#define tb_comefrom ((struct ip6t_entry *)table_base)->comefrom - static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); - bool hotdrop = false; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; - void *table_base; - struct ip6t_entry *e, *back; - struct xt_table_info *private; - struct xt_match_param mtpar; - struct xt_target_param tgpar; + const void *table_base; + struct ip6t_entry *e, **jumpstack; + unsigned int *stackptr, origptr, cpu; + const struct xt_table_info *private; + struct xt_action_param acpar; + unsigned int addend; /* Initialization */ indev = in ? in->name : nulldevname; @@ -360,44 +338,56 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ - mtpar.hotdrop = &hotdrop; - mtpar.in = tgpar.in = in; - mtpar.out = tgpar.out = out; - mtpar.family = tgpar.family = NFPROTO_IPV6; - mtpar.hooknum = tgpar.hooknum = hook; + acpar.hotdrop = false; + acpar.in = in; + acpar.out = out; + acpar.family = NFPROTO_IPV6; + acpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - xt_info_rdlock_bh(); + local_bh_disable(); + addend = xt_write_recseq_begin(); private = table->private; - table_base = private->entries[smp_processor_id()]; + /* + * Ensure we load private-> members after we've fetched the base + * pointer. + */ + smp_read_barrier_depends(); + cpu = smp_processor_id(); + table_base = private->entries[cpu]; + jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); + origptr = *stackptr; e = get_entry(table_base, private->hook_entry[hook]); - /* For return from builtin chain */ - back = get_entry(table_base, private->underflow[hook]); - do { - struct ip6t_entry_target *t; + const struct xt_entry_target *t; + const struct xt_entry_match *ematch; IP_NF_ASSERT(e); - IP_NF_ASSERT(back); + acpar.thoff = 0; if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, - &mtpar.thoff, &mtpar.fragoff, &hotdrop) || - IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) { + &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { + no_match: e = ip6t_next_entry(e); continue; } - ADD_COUNTER(e->counters, - ntohs(ipv6_hdr(skb)->payload_len) + - sizeof(struct ipv6hdr), 1); + xt_ematch_foreach(ematch, e) { + acpar.match = ematch->u.kernel.match; + acpar.matchinfo = ematch->data; + if (!acpar.match->match(skb, &acpar)) + goto no_match; + } + + ADD_COUNTER(e->counters, skb->len, 1); - t = ip6t_get_target(e); + t = ip6t_get_target_c(e); IP_NF_ASSERT(t->u.kernel.target); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(skb, hook, in, out, @@ -407,75 +397,62 @@ ip6t_do_table(struct sk_buff *skb, if (!t->u.kernel.target->target) { int v; - v = ((struct ip6t_standard_target *)t)->verdict; + v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ - if (v != IP6T_RETURN) { - verdict = (unsigned)(-v) - 1; + if (v != XT_RETURN) { + verdict = (unsigned int)(-v) - 1; break; } - e = back; - back = get_entry(table_base, back->comefrom); + if (*stackptr <= origptr) + e = get_entry(table_base, + private->underflow[hook]); + else + e = ip6t_next_entry(jumpstack[--*stackptr]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { - /* Save old back ptr in next entry */ - struct ip6t_entry *next = ip6t_next_entry(e); - next->comefrom = (void *)back - table_base; - /* set back pointer to next entry */ - back = next; + if (*stackptr >= private->stacksize) { + verdict = NF_DROP; + break; + } + jumpstack[(*stackptr)++] = e; } e = get_entry(table_base, v); continue; } - /* Targets which reenter must return - abs. verdicts */ - tgpar.target = t->u.kernel.target; - tgpar.targinfo = t->data; + acpar.target = t->u.kernel.target; + acpar.targinfo = t->data; -#ifdef CONFIG_NETFILTER_DEBUG - tb_comefrom = 0xeeeeeeec; -#endif - verdict = t->u.kernel.target->target(skb, &tgpar); - -#ifdef CONFIG_NETFILTER_DEBUG - if (tb_comefrom != 0xeeeeeeec && verdict == IP6T_CONTINUE) { - printk("Target %s reentered!\n", - t->u.kernel.target->name); - verdict = NF_DROP; - } - tb_comefrom = 0x57acc001; -#endif - if (verdict == IP6T_CONTINUE) + verdict = t->u.kernel.target->target(skb, &acpar); + if (verdict == XT_CONTINUE) e = ip6t_next_entry(e); else /* Verdict */ break; - } while (!hotdrop); + } while (!acpar.hotdrop); -#ifdef CONFIG_NETFILTER_DEBUG - tb_comefrom = NETFILTER_LINK_POISON; -#endif - xt_info_rdunlock_bh(); + *stackptr = origptr; + + xt_write_recseq_end(addend); + local_bh_enable(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else - if (hotdrop) + if (acpar.hotdrop) return NF_DROP; else return verdict; #endif - -#undef tb_comefrom } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -493,12 +470,12 @@ mark_source_chains(struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - struct ip6t_standard_target *t - = (void *)ip6t_get_target(e); + const struct xt_standard_target *t + = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - printk("iptables: loop hook %u pos %u %08X.\n", + pr_err("iptables: loop hook %u pos %u %08X.\n", hook, pos, e->comefrom); return 0; } @@ -507,13 +484,13 @@ mark_source_chains(struct xt_table_info *newinfo, /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct ip6t_entry) && (strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < 0 && unconditional(&e->ipv6)) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -556,7 +533,7 @@ mark_source_chains(struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && newpos >= 0) { if (newpos > newinfo->size - sizeof(struct ip6t_entry)) { @@ -584,46 +561,41 @@ mark_source_chains(struct xt_table_info *newinfo, return 1; } -static int -cleanup_match(struct ip6t_entry_match *m, unsigned int *i) +static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; - if (i && (*i)-- == 0) - return 1; - + par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV6; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); - return 0; } static int -check_entry(struct ip6t_entry *e, const char *name) +check_entry(const struct ip6t_entry *e, const char *name) { - struct ip6t_entry_target *t; + const struct xt_entry_target *t; if (!ip6_checkentry(&e->ipv6)) { duprintf("ip_tables: ip check failed %p %s.\n", e, name); return -EINVAL; } - if (e->target_offset + sizeof(struct ip6t_entry_target) > + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; - t = ip6t_get_target(e); + t = ip6t_get_target_c(e); if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; return 0; } -static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par, - unsigned int *i) +static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; int ret; @@ -638,27 +610,24 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par, par.match->name); return ret; } - ++*i; return 0; } static int -find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par, - unsigned int *i) +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; - match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name, - m->u.user.revision), - "ip6t_%s", m->u.user.name); - if (IS_ERR(match) || !match) { + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { duprintf("find_check_match: `%s' not found\n", m->u.user.name); - return match ? PTR_ERR(match) : -ENOENT; + return PTR_ERR(match); } m->u.kernel.match = match; - ret = check_match(m, par, i); + ret = check_match(m, par); if (ret) goto err; @@ -668,10 +637,11 @@ err: return ret; } -static int check_target(struct ip6t_entry *e, const char *name) +static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { - struct ip6t_entry_target *t = ip6t_get_target(e); + struct xt_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -693,64 +663,69 @@ static int check_target(struct ip6t_entry *e, const char *name) } static int -find_check_entry(struct ip6t_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, + unsigned int size) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; ret = check_entry(e, name); if (ret) return ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV6; - ret = IP6T_MATCH_ITERATE(e, find_check_match, &mtpar, &j); - if (ret != 0) - goto cleanup_matches; + xt_ematch_foreach(ematch, e) { + ret = find_check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } t = ip6t_get_target(e); - target = try_then_request_module(xt_find_target(AF_INET6, - t->u.user.name, - t->u.user.revision), - "ip6t_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("find_check_entry: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; - - (*i)++; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } -static bool check_underflow(struct ip6t_entry *e) +static bool check_underflow(const struct ip6t_entry *e) { - const struct ip6t_entry_target *t; + const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(&e->ipv6)) return false; - t = ip6t_get_target(e); + t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; - verdict = ((struct ip6t_standard_target *)t)->verdict; + verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } @@ -758,12 +733,11 @@ static bool check_underflow(struct ip6t_entry *e) static int check_entry_size_and_hooks(struct ip6t_entry *e, struct xt_table_info *newinfo, - unsigned char *base, - unsigned char *limit, + const unsigned char *base, + const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, - unsigned int valid_hooks, - unsigned int *i) + unsigned int valid_hooks) { unsigned int h; @@ -774,7 +748,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, } if (e->next_offset - < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) { + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; @@ -800,50 +774,41 @@ check_entry_size_and_hooks(struct ip6t_entry *e, /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; - - (*i)++; return 0; } -static int -cleanup_entry(struct ip6t_entry *e, unsigned int *i) +static void cleanup_entry(struct ip6t_entry *e, struct net *net) { struct xt_tgdtor_param par; - struct ip6t_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_entry_target *t; + struct xt_entry_match *ematch; /* Cleanup all matches */ - IP6T_MATCH_ITERATE(e, cleanup_match, NULL); + xt_ematch_foreach(ematch, e) + cleanup_match(ematch, net); t = ip6t_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV6; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - return 0; } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info *newinfo, - void *entry0, - unsigned int size, - unsigned int number, - const unsigned int *hook_entries, - const unsigned int *underflows) +translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, + const struct ip6t_replace *repl) { + struct ip6t_entry *iter; unsigned int i; - int ret; + int ret = 0; - newinfo->size = size; - newinfo->number = number; + newinfo->size = repl->size; + newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -854,49 +819,61 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, - check_entry_size_and_hooks, - newinfo, - entry0, - entry0 + size, - hook_entries, underflows, valid_hooks, &i); - if (ret != 0) - return ret; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + return ret; + ++i; + if (strcmp(ip6t_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } - if (i != number) { + if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", - i, number); + i, repl->num_entries); return -EINVAL; } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + i, repl->hook_entry[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", - i, underflows[i]); + i, repl->underflow[i]); return -EINVAL; } } - if (!mark_source_chains(newinfo, valid_hooks, entry0)) + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, net, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } if (ret != 0) { - IP6T_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter, net); + } return ret; } @@ -909,80 +886,45 @@ translate_table(const char *name, return ret; } -/* Gets counters. */ -static inline int -add_entry_to_counter(const struct ip6t_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - -static inline int -set_entry_to_counter(const struct ip6t_entry *e, - struct ip6t_counters total[], - unsigned int *i) -{ - SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { + struct ip6t_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu; - - /* Instead of clearing (by a previous call to memset()) - * the counters and using adds, we set the counters - * with data used by 'current' CPU - * - * Bottom half has to be disabled to prevent deadlock - * if new softirq were to run and call ipt_do_table - */ - local_bh_disable(); - curcpu = smp_processor_id(); - - i = 0; - IP6T_ENTRY_ITERATE(t->entries[curcpu], - t->size, - set_entry_to_counter, - counters, - &i); for_each_possible_cpu(cpu) { - if (cpu == curcpu) - continue; + seqcount_t *s = &per_cpu(xt_recseq, cpu); + i = 0; - xt_info_wrlock(cpu); - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_entry_to_counter, - counters, - &i); - xt_info_wrunlock(cpu); + xt_entry_foreach(iter, t->entries[cpu], t->size) { + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqcount_begin(s); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqcount_retry(s, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); + ++i; + } } - local_bh_enable(); } -static struct xt_counters *alloc_counters(struct xt_table *table) +static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -994,11 +936,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) static int copy_entries_to_user(unsigned int total_size, - struct xt_table *table, + const struct xt_table *table, void __user *userptr) { unsigned int off, num; - struct ip6t_entry *e; + const struct ip6t_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; @@ -1022,8 +964,8 @@ copy_entries_to_user(unsigned int total_size, /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; - const struct ip6t_entry_match *m; - const struct ip6t_entry_target *t; + const struct xt_entry_match *m; + const struct xt_entry_target *t; e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -1040,7 +982,7 @@ copy_entries_to_user(unsigned int total_size, m = (void *)e + i; if (copy_to_user(userptr + off + i - + offsetof(struct ip6t_entry_match, + + offsetof(struct xt_entry_match, u.user.name), m->u.kernel.match->name, strlen(m->u.kernel.match->name)+1) @@ -1050,9 +992,9 @@ copy_entries_to_user(unsigned int total_size, } } - t = ip6t_get_target(e); + t = ip6t_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct ip6t_entry_target, + + offsetof(struct xt_entry_target, u.user.name), t->u.kernel.target->name, strlen(t->u.kernel.target->name)+1) != 0) { @@ -1067,7 +1009,7 @@ copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -static void compat_standard_from_user(void *dst, void *src) +static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -1076,7 +1018,7 @@ static void compat_standard_from_user(void *dst, void *src) memcpy(dst, &v, sizeof(v)); } -static int compat_standard_to_user(void __user *dst, void *src) +static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; @@ -1085,25 +1027,20 @@ static int compat_standard_to_user(void __user *dst, void *src) return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } -static inline int -compat_calc_match(struct ip6t_entry_match *m, int *size) -{ - *size += xt_compat_match_offset(m->u.kernel.match); - return 0; -} - -static int compat_calc_entry(struct ip6t_entry *e, +static int compat_calc_entry(const struct ip6t_entry *e, const struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) + const void *base, struct xt_table_info *newinfo) { - struct ip6t_entry_target *t; + const struct xt_entry_match *ematch; + const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - base; - IP6T_MATCH_ITERATE(e, compat_calc_match, &off); - t = ip6t_get_target(e); + xt_ematch_foreach(ematch, e) + off += xt_compat_match_offset(ematch->u.kernel.match); + t = ip6t_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); @@ -1124,7 +1061,9 @@ static int compat_calc_entry(struct ip6t_entry *e, static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { + struct ip6t_entry *iter; void *loc_cpu_entry; + int ret; if (!newinfo || !info) return -EINVAL; @@ -1133,15 +1072,20 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; - return IP6T_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, - newinfo); + xt_compat_init_offsets(AF_INET6, info->number); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; } #endif -static int get_info(struct net *net, void __user *user, int *len, int compat) +static int get_info(struct net *net, void __user *user, + const int *len, int compat) { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; @@ -1154,25 +1098,26 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; - name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; + name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(AF_INET6); #endif t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), "ip6table_%s", name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); @@ -1199,7 +1144,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) } static int -get_entries(struct net *net, struct ip6t_get_entries __user *uptr, int *len) +get_entries(struct net *net, struct ip6t_get_entries __user *uptr, + const int *len) { int ret; struct ip6t_get_entries get; @@ -1218,7 +1164,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, int *len) } t = xt_find_table_lock(net, AF_INET6, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) @@ -1247,10 +1193,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *oldinfo; struct xt_counters *counters; const void *loc_cpu_old_entry; + struct ip6t_entry *iter; ret = 0; - counters = vmalloc_node(num_counters * sizeof(struct xt_counters), - numa_node_id()); + counters = vzalloc(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1258,7 +1204,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), "ip6table_%s", name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } @@ -1290,12 +1236,15 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter, net); + xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; @@ -1310,12 +1259,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, void __user *user, unsigned int len) +do_replace(struct net *net, const void __user *user, unsigned int len) { int ret; struct ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ip6t_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1323,6 +1273,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1336,9 +1287,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1351,27 +1300,15 @@ do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ static int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -static int -do_add_counters(struct net *net, void __user *user, unsigned int len, +do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { unsigned int i, curcpu; @@ -1385,6 +1322,8 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, const struct xt_table_info *private; int ret = 0; const void *loc_cpu_entry; + struct ip6t_entry *iter; + unsigned int addend; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1415,7 +1354,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = vmalloc(len - size); if (!paddc) return -ENOMEM; @@ -1425,7 +1364,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, } t = xt_find_table_lock(net, AF_INET6, name); - if (!t || IS_ERR(t)) { + if (IS_ERR_OR_NULL(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; } @@ -1441,14 +1380,13 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, i = 0; /* Choose the copy that is on our node */ curcpu = smp_processor_id(); - xt_info_wrlock(curcpu); + addend = xt_write_recseq_begin(); loc_cpu_entry = private->entries[curcpu]; - IP6T_ENTRY_ITERATE(loc_cpu_entry, - private->size, - add_counter_to_entry, - paddc, - &i); - xt_info_wrunlock(curcpu); + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } + xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); @@ -1462,122 +1400,101 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, #ifdef CONFIG_COMPAT struct compat_ip6t_replace { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; - compat_uptr_t counters; /* struct ip6t_counters * */ + compat_uptr_t counters; /* struct xt_counters * */ struct compat_ip6t_entry entries[0]; }; static int compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, - unsigned int *i) + unsigned int i) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct compat_ip6t_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; - int ret; + const struct xt_entry_match *ematch; + int ret = 0; - ret = -EFAULT; origsize = *size; ce = (struct compat_ip6t_entry __user *)*dstptr; - if (copy_to_user(ce, e, sizeof(struct ip6t_entry))) - goto out; - - if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) - goto out; + if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; *dstptr += sizeof(struct compat_ip6t_entry); *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); - ret = IP6T_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_to_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } target_offset = e->target_offset - (origsize - *size); - if (ret) - goto out; t = ip6t_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) - goto out; - ret = -EFAULT; + return ret; next_offset = e->next_offset - (origsize - *size); - if (put_user(target_offset, &ce->target_offset)) - goto out; - if (put_user(next_offset, &ce->next_offset)) - goto out; - - (*i)++; + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; return 0; -out: - return ret; } static int -compat_find_calc_match(struct ip6t_entry_match *m, +compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, unsigned int hookmask, - int *size, unsigned int *i) + int *size) { struct xt_match *match; - match = try_then_request_module(xt_find_match(AF_INET6, m->u.user.name, - m->u.user.revision), - "ip6t_%s", m->u.user.name); - if (IS_ERR(match) || !match) { + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { duprintf("compat_check_calc_match: `%s' not found\n", m->u.user.name); - return match ? PTR_ERR(match) : -ENOENT; + return PTR_ERR(match); } m->u.kernel.match = match; *size += xt_compat_match_offset(match); - - (*i)++; return 0; } -static int -compat_release_match(struct ip6t_entry_match *m, unsigned int *i) +static void compat_release_entry(struct compat_ip6t_entry *e) { - if (i && (*i)-- == 0) - return 1; - - module_put(m->u.kernel.match->me); - return 0; -} - -static int -compat_release_entry(struct compat_ip6t_entry *e, unsigned int *i) -{ - struct ip6t_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_entry_target *t; + struct xt_entry_match *ematch; /* Cleanup all matches */ - COMPAT_IP6T_MATCH_ITERATE(e, compat_release_match, NULL); + xt_ematch_foreach(ematch, e) + module_put(ematch->u.kernel.match->me); t = compat_ip6t_get_target(e); module_put(t->u.kernel.target->me); - return 0; } static int check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, struct xt_table_info *newinfo, unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, const char *name) { - struct ip6t_entry_target *t; + struct xt_entry_match *ematch; + struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; @@ -1605,20 +1522,21 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - (void *)base; j = 0; - ret = COMPAT_IP6T_MATCH_ITERATE(e, compat_find_calc_match, name, - &e->ipv6, e->comefrom, &off, &j); - if (ret != 0) - goto release_matches; + xt_ematch_foreach(ematch, e) { + ret = compat_find_calc_match(ematch, name, + &e->ipv6, e->comefrom, &off); + if (ret != 0) + goto release_matches; + ++j; + } t = compat_ip6t_get_target(e); - target = try_then_request_module(xt_find_target(AF_INET6, - t->u.user.name, - t->u.user.revision), - "ip6t_%s", t->u.user.name); - if (IS_ERR(target) || !target) { + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", t->u.user.name); - ret = target ? PTR_ERR(target) : -ENOENT; + ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; @@ -1640,14 +1558,16 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, /* Clear counters and comefrom */ memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; - - (*i)++; return 0; out: module_put(t->u.kernel.target->me); release_matches: - IP6T_MATCH_ITERATE(e, compat_release_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + module_put(ematch->u.kernel.match->me); + } return ret; } @@ -1656,11 +1576,11 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, unsigned int *size, const char *name, struct xt_table_info *newinfo, unsigned char *base) { - struct ip6t_entry_target *t; - struct xt_target *target; + struct xt_entry_target *t; struct ip6t_entry *de; unsigned int origsize; int ret, h; + struct xt_entry_match *ematch; ret = 0; origsize = *size; @@ -1671,13 +1591,13 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, *dstptr += sizeof(struct ip6t_entry); *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); - ret = COMPAT_IP6T_MATCH_ITERATE(e, xt_compat_match_from_user, - dstptr, size); - if (ret) - return ret; + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_from_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } de->target_offset = e->target_offset - (origsize - *size); t = compat_ip6t_get_target(e); - target = t->u.kernel.target; xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); @@ -1690,36 +1610,44 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, return ret; } -static int compat_check_entry(struct ip6t_entry *e, const char *name, - unsigned int *i) +static int compat_check_entry(struct ip6t_entry *e, struct net *net, + const char *name) { unsigned int j; - int ret; + int ret = 0; struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV6; - ret = IP6T_MATCH_ITERATE(e, check_match, &mtpar, &j); - if (ret) - goto cleanup_matches; + xt_ematch_foreach(ematch, e) { + ret = check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; - - (*i)++; return 0; cleanup_matches: - IP6T_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1731,8 +1659,10 @@ translate_compat_table(const char *name, unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; + struct compat_ip6t_entry *iter0; + struct ip6t_entry *iter1; unsigned int size; - int ret; + int ret = 0; info = *pinfo; entry0 = *pentry0; @@ -1748,14 +1678,19 @@ translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); + xt_compat_init_offsets(AF_INET6, number); /* Walk through entries, checking offsets. */ - ret = COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); - if (ret != 0) - goto out_unlock; + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } ret = -EINVAL; if (j != number) { @@ -1794,9 +1729,12 @@ translate_compat_table(const char *name, entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; size = total_size; - ret = COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, - &pos, &size, name, newinfo, entry1); + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); if (ret) @@ -1807,13 +1745,35 @@ translate_compat_table(const char *name, goto free_newinfo; i = 0; - ret = IP6T_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = compat_check_entry(iter1, net, name); + if (ret != 0) + break; + ++i; + if (strcmp(ip6t_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. + */ + int skip = i; j -= i; - COMPAT_IP6T_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, - compat_release_entry, &j); - IP6T_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1, net); + } xt_free_table_info(newinfo); return ret; } @@ -1831,7 +1791,11 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - COMPAT_IP6T_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } return ret; out_unlock: xt_compat_flush_offsets(AF_INET6); @@ -1846,6 +1810,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) struct compat_ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ip6t_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1855,6 +1820,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1868,7 +1834,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -1884,7 +1850,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1896,7 +1863,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1917,7 +1884,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, } struct compat_ip6t_get_entries { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ip6t_entry entrytable[0]; }; @@ -1933,6 +1900,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, int ret = 0; const void *loc_cpu_entry; unsigned int i = 0; + struct ip6t_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1945,9 +1913,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - ret = IP6T_ENTRY_ITERATE(loc_cpu_entry, total_size, - compat_copy_entry_to_user, - &pos, &size, counters, &i); + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } vfree(counters); return ret; @@ -1977,7 +1948,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); - if (t && !IS_ERR(t)) { + if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; duprintf("t->private->number = %u\n", private->number); @@ -2007,7 +1978,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2029,7 +2000,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2054,7 +2025,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2068,7 +2039,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { - struct ip6t_get_revision rev; + struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { @@ -2079,6 +2050,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; if (cmd == IP6T_SO_GET_REVISION_TARGET) target = 1; @@ -2106,8 +2078,7 @@ struct xt_table *ip6t_register_table(struct net *net, { int ret; struct xt_table_info *newinfo; - struct xt_table_info bootstrap - = { 0, 0, 0, { 0 }, { 0 }, { } }; + struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; @@ -2121,11 +2092,7 @@ struct xt_table *ip6t_register_table(struct net *net, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, - newinfo, loc_cpu_entry, repl->size, - repl->num_entries, - repl->hook_entry, - repl->underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) goto out_free; @@ -2142,17 +2109,19 @@ out: return ERR_PTR(ret); } -void ip6t_unregister_table(struct xt_table *table) +void ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; + struct ip6t_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); @@ -2169,7 +2138,7 @@ icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, } static bool -icmp6_match(const struct sk_buff *skb, const struct xt_match_param *par) +icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmp6hdr *ic; struct icmp6hdr _icmph; @@ -2185,7 +2154,7 @@ icmp6_match(const struct sk_buff *skb, const struct xt_match_param *par) * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ICMP tinygram.\n"); - *par->hotdrop = true; + par->hotdrop = true; return false; } @@ -2197,31 +2166,32 @@ icmp6_match(const struct sk_buff *skb, const struct xt_match_param *par) } /* Called when user tries to insert an entry of this type. */ -static bool icmp6_checkentry(const struct xt_mtchk_param *par) +static int icmp6_checkentry(const struct xt_mtchk_param *par) { const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must specify no unknown invflags */ - return !(icmpinfo->invflags & ~IP6T_ICMP_INV); + return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; } /* The built-in targets: standard (NULL) and error. */ -static struct xt_target ip6t_standard_target __read_mostly = { - .name = IP6T_STANDARD_TARGET, - .targetsize = sizeof(int), - .family = NFPROTO_IPV6, +static struct xt_target ip6t_builtin_tg[] __read_mostly = { + { + .name = XT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_IPV6, #ifdef CONFIG_COMPAT - .compatsize = sizeof(compat_int_t), - .compat_from_user = compat_standard_from_user, - .compat_to_user = compat_standard_to_user, + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, #endif -}; - -static struct xt_target ip6t_error_target __read_mostly = { - .name = IP6T_ERROR_TARGET, - .target = ip6t_error, - .targetsize = IP6T_FUNCTION_MAXNAMELEN, - .family = NFPROTO_IPV6, + }, + { + .name = XT_ERROR_TARGET, + .target = ip6t_error, + .targetsize = XT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_IPV6, + }, }; static struct nf_sockopt_ops ip6t_sockopts = { @@ -2241,13 +2211,15 @@ static struct nf_sockopt_ops ip6t_sockopts = { .owner = THIS_MODULE, }; -static struct xt_match icmp6_matchstruct __read_mostly = { - .name = "icmp6", - .match = icmp6_match, - .matchsize = sizeof(struct ip6t_icmp), - .checkentry = icmp6_checkentry, - .proto = IPPROTO_ICMPV6, - .family = NFPROTO_IPV6, +static struct xt_match ip6t_builtin_mt[] __read_mostly = { + { + .name = "icmp6", + .match = icmp6_match, + .matchsize = sizeof(struct ip6t_icmp), + .checkentry = icmp6_checkentry, + .proto = IPPROTO_ICMPV6, + .family = NFPROTO_IPV6, + }, }; static int __net_init ip6_tables_net_init(struct net *net) @@ -2273,14 +2245,11 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err1; - /* Noone else will be downing sem now, so we won't sleep */ - ret = xt_register_target(&ip6t_standard_target); + /* No one else will be downing sem now, so we won't sleep */ + ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); if (ret < 0) goto err2; - ret = xt_register_target(&ip6t_error_target); - if (ret < 0) - goto err3; - ret = xt_register_match(&icmp6_matchstruct); + ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); if (ret < 0) goto err4; @@ -2289,15 +2258,13 @@ static int __init ip6_tables_init(void) if (ret < 0) goto err5; - printk(KERN_INFO "ip6_tables: (C) 2000-2006 Netfilter Core Team\n"); + pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: - xt_unregister_match(&icmp6_matchstruct); + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); err4: - xt_unregister_target(&ip6t_error_target); -err3: - xt_unregister_target(&ip6t_standard_target); + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); err2: unregister_pernet_subsys(&ip6_tables_net_ops); err1: @@ -2308,95 +2275,14 @@ static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); - xt_unregister_match(&icmp6_matchstruct); - xt_unregister_target(&ip6t_error_target); - xt_unregister_target(&ip6t_standard_target); - + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); unregister_pernet_subsys(&ip6_tables_net_ops); } -/* - * find the offset to specified header or the protocol number of last header - * if target < 0. "last header" is transport protocol header, ESP, or - * "No next header". - * - * If target header is found, its offset is set in *offset and return protocol - * number. Otherwise, return -1. - * - * If the first fragment doesn't contain the final protocol header or - * NEXTHDR_NONE it is considered invalid. - * - * Note that non-1st fragment is special case that "the protocol number - * of last header" is "next header" field in Fragment header. In this case, - * *offset is meaningless and fragment offset is stored in *fragoff if fragoff - * isn't NULL. - * - */ -int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, - int target, unsigned short *fragoff) -{ - unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len = skb->len - start; - - if (fragoff) - *fragoff = 0; - - while (nexthdr != target) { - struct ipv6_opt_hdr _hdr, *hp; - unsigned int hdrlen; - - if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { - if (target < 0) - break; - return -ENOENT; - } - - hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); - if (hp == NULL) - return -EBADMSG; - if (nexthdr == NEXTHDR_FRAGMENT) { - unsigned short _frag_off; - __be16 *fp; - fp = skb_header_pointer(skb, - start+offsetof(struct frag_hdr, - frag_off), - sizeof(_frag_off), - &_frag_off); - if (fp == NULL) - return -EBADMSG; - - _frag_off = ntohs(*fp) & ~0x7; - if (_frag_off) { - if (target < 0 && - ((!ipv6_ext_hdr(hp->nexthdr)) || - hp->nexthdr == NEXTHDR_NONE)) { - if (fragoff) - *fragoff = _frag_off; - return hp->nexthdr; - } - return -ENOENT; - } - hdrlen = 8; - } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hp->hdrlen + 2) << 2; - else - hdrlen = ipv6_optlen(hp); - - nexthdr = hp->nexthdr; - len -= hdrlen; - start += hdrlen; - } - - *offset = start; - return nexthdr; -} - EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table); EXPORT_SYMBOL(ip6t_do_table); -EXPORT_SYMBOL(ip6t_ext_hdr); -EXPORT_SYMBOL(ipv6_find_hdr); module_init(ip6_tables_init); module_exit(ip6_tables_fini); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c deleted file mode 100644 index b285fdf1905..00000000000 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ /dev/null @@ -1,504 +0,0 @@ -/* - * This is a module which is used for logging packets. - */ - -/* (C) 2001 Jan Rekorajski <baggins@pld.org.pl> - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <linux/spinlock.h> -#include <linux/icmpv6.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/ipv6.h> -#include <linux/netfilter.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv6/ip6_tables.h> -#include <net/netfilter/nf_log.h> - -MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); -MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog"); -MODULE_LICENSE("GPL"); - -struct in_device; -#include <net/route.h> -#include <linux/netfilter_ipv6/ip6t_LOG.h> - -/* Use lock to serialize, so printks don't overlap */ -static DEFINE_SPINLOCK(log_lock); - -/* One level of recursion won't kill us */ -static void dump_packet(const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int ip6hoff, - int recurse) -{ - u_int8_t currenthdr; - int fragment; - struct ipv6hdr _ip6h; - const struct ipv6hdr *ih; - unsigned int ptr; - unsigned int hdrlen = 0; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_MASK; - - ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); - if (ih == NULL) { - printk("TRUNCATED"); - return; - } - - /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ - printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); - - /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", - ntohs(ih->payload_len) + sizeof(struct ipv6hdr), - (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, - ih->hop_limit, - (ntohl(*(__be32 *)ih) & 0x000fffff)); - - fragment = 0; - ptr = ip6hoff + sizeof(struct ipv6hdr); - currenthdr = ih->nexthdr; - while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { - struct ipv6_opt_hdr _hdr; - const struct ipv6_opt_hdr *hp; - - hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - if (hp == NULL) { - printk("TRUNCATED"); - return; - } - - /* Max length: 48 "OPT (...) " */ - if (logflags & IP6T_LOG_IPOPT) - printk("OPT ( "); - - switch (currenthdr) { - case IPPROTO_FRAGMENT: { - struct frag_hdr _fhdr; - const struct frag_hdr *fh; - - printk("FRAG:"); - fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), - &_fhdr); - if (fh == NULL) { - printk("TRUNCATED "); - return; - } - - /* Max length: 6 "65535 " */ - printk("%u ", ntohs(fh->frag_off) & 0xFFF8); - - /* Max length: 11 "INCOMPLETE " */ - if (fh->frag_off & htons(0x0001)) - printk("INCOMPLETE "); - - printk("ID:%08x ", ntohl(fh->identification)); - - if (ntohs(fh->frag_off) & 0xFFF8) - fragment = 1; - - hdrlen = 8; - - break; - } - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_HOPOPTS: - if (fragment) { - if (logflags & IP6T_LOG_IPOPT) - printk(")"); - return; - } - hdrlen = ipv6_optlen(hp); - break; - /* Max Length */ - case IPPROTO_AH: - if (logflags & IP6T_LOG_IPOPT) { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - /* Max length: 3 "AH " */ - printk("AH "); - - if (fragment) { - printk(")"); - return; - } - - ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), - &_ahdr); - if (ah == NULL) { - /* - * Max length: 26 "INCOMPLETE [65535 - * bytes] )" - */ - printk("INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 15 "SPI=0xF1234567 */ - printk("SPI=0x%x ", ntohl(ah->spi)); - - } - - hdrlen = (hp->hdrlen+2)<<2; - break; - case IPPROTO_ESP: - if (logflags & IP6T_LOG_IPOPT) { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 4 "ESP " */ - printk("ESP "); - - if (fragment) { - printk(")"); - return; - } - - /* - * Max length: 26 "INCOMPLETE [65535 bytes] )" - */ - eh = skb_header_pointer(skb, ptr, sizeof(_esph), - &_esph); - if (eh == NULL) { - printk("INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 16 "SPI=0xF1234567 )" */ - printk("SPI=0x%x )", ntohl(eh->spi) ); - - } - return; - default: - /* Max length: 20 "Unknown Ext Hdr 255" */ - printk("Unknown Ext Hdr %u", currenthdr); - return; - } - if (logflags & IP6T_LOG_IPOPT) - printk(") "); - - currenthdr = hp->nexthdr; - ptr += hdrlen; - } - - switch (currenthdr) { - case IPPROTO_TCP: { - struct tcphdr _tcph; - const struct tcphdr *th; - - /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); - if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", - ntohs(th->source), ntohs(th->dest)); - /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & IP6T_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", - ntohl(th->seq), ntohl(th->ack_seq)); - /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); - /* Max length: 9 "RES=0x3C " */ - printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (th->cwr) - printk("CWR "); - if (th->ece) - printk("ECE "); - if (th->urg) - printk("URG "); - if (th->ack) - printk("ACK "); - if (th->psh) - printk("PSH "); - if (th->rst) - printk("RST "); - if (th->syn) - printk("SYN "); - if (th->fin) - printk("FIN "); - /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); - - if ((logflags & IP6T_LOG_TCPOPT) && - th->doff * 4 > sizeof(struct tcphdr)) { - u_int8_t _opt[60 - sizeof(struct tcphdr)]; - const u_int8_t *op; - unsigned int i; - unsigned int optsize = th->doff * 4 - - sizeof(struct tcphdr); - - op = skb_header_pointer(skb, - ptr + sizeof(struct tcphdr), - optsize, _opt); - if (op == NULL) { - printk("OPT (TRUNCATED)"); - return; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); - for (i =0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); - } - break; - } - case IPPROTO_UDP: - case IPPROTO_UDPLITE: { - struct udphdr _udph; - const struct udphdr *uh; - - if (currenthdr == IPPROTO_UDP) - /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); - else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); - if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", - ntohs(uh->source), ntohs(uh->dest), - ntohs(uh->len)); - break; - } - case IPPROTO_ICMPV6: { - struct icmp6hdr _icmp6h; - const struct icmp6hdr *ic; - - /* Max length: 13 "PROTO=ICMPv6 " */ - printk("PROTO=ICMPv6 "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); - if (ic == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); - return; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); - - switch (ic->icmp6_type) { - case ICMPV6_ECHO_REQUEST: - case ICMPV6_ECHO_REPLY: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", - ntohs(ic->icmp6_identifier), - ntohs(ic->icmp6_sequence)); - break; - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - break; - - case ICMPV6_PARAMPROB: - /* Max length: 17 "POINTER=ffffffff " */ - printk("POINTER=%08x ", ntohl(ic->icmp6_pointer)); - /* Fall through */ - case ICMPV6_DEST_UNREACH: - case ICMPV6_PKT_TOOBIG: - case ICMPV6_TIME_EXCEED: - /* Max length: 3+maxlen */ - if (recurse) { - printk("["); - dump_packet(info, skb, ptr + sizeof(_icmp6h), - 0); - printk("] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) - printk("MTU=%u ", ntohl(ic->icmp6_mtu)); - } - break; - } - /* Max length: 10 "PROTO=255 " */ - default: - printk("PROTO=%u ", currenthdr); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u ", - skb->sk->sk_socket->file->f_cred->fsuid, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); - } - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (!recurse && skb->mark) - printk("MARK=0x%x ", skb->mark); -} - -static struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = 0, - .logflags = NF_LOG_MASK, - }, - }, -}; - -static void -ip6t_log_packet(u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - if (!loginfo) - loginfo = &default_loginfo; - - spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, - prefix, - in ? in->name : "", - out ? out->name : ""); - if (in && !out) { - unsigned int len; - /* MAC logging for input chain only. */ - printk("MAC="); - if (skb->dev && (len = skb->dev->hard_header_len) && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - int i; - - if (skb->dev->type == ARPHRD_SIT && - (p -= ETH_HLEN) < skb->head) - p = NULL; - - if (p != NULL) { - for (i = 0; i < len; i++) - printk("%02x%s", p[i], - i == len - 1 ? "" : ":"); - } - printk(" "); - - if (skb->dev->type == ARPHRD_SIT) { - const struct iphdr *iph = - (struct iphdr *)skb_mac_header(skb); - printk("TUNNEL=%pI4->%pI4 ", - &iph->saddr, &iph->daddr); - } - } else - printk(" "); - } - - dump_packet(loginfo, skb, skb_network_offset(skb), 1); - printk("\n"); - spin_unlock_bh(&log_lock); -} - -static unsigned int -log_tg6(struct sk_buff *skb, const struct xt_target_param *par) -{ - const struct ip6t_log_info *loginfo = par->targinfo; - struct nf_loginfo li; - - li.type = NF_LOG_TYPE_LOG; - li.u.log.level = loginfo->level; - li.u.log.logflags = loginfo->logflags; - - ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, par->out, - &li, loginfo->prefix); - return XT_CONTINUE; -} - - -static bool log_tg6_check(const struct xt_tgchk_param *par) -{ - const struct ip6t_log_info *loginfo = par->targinfo; - - if (loginfo->level >= 8) { - pr_debug("LOG: level %u >= 8\n", loginfo->level); - return false; - } - if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { - pr_debug("LOG: prefix term %i\n", - loginfo->prefix[sizeof(loginfo->prefix)-1]); - return false; - } - return true; -} - -static struct xt_target log_tg6_reg __read_mostly = { - .name = "LOG", - .family = NFPROTO_IPV6, - .target = log_tg6, - .targetsize = sizeof(struct ip6t_log_info), - .checkentry = log_tg6_check, - .me = THIS_MODULE, -}; - -static struct nf_logger ip6t_logger __read_mostly = { - .name = "ip6t_LOG", - .logfn = &ip6t_log_packet, - .me = THIS_MODULE, -}; - -static int __init log_tg6_init(void) -{ - int ret; - - ret = xt_register_target(&log_tg6_reg); - if (ret < 0) - return ret; - nf_log_register(NFPROTO_IPV6, &ip6t_logger); - return 0; -} - -static void __exit log_tg6_exit(void) -{ - nf_log_unregister(&ip6t_logger); - xt_unregister_target(&log_tg6_reg); -} - -module_init(log_tg6_init); -module_exit(log_tg6_exit); diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c new file mode 100644 index 00000000000..3e4e92d5e15 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6 + * NAT funded by Astaro. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> +#include <net/addrconf.h> +#include <net/ipv6.h> + +static unsigned int +masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct in6_addr src; + struct nf_conn *ct; + struct nf_nat_range newrange; + + ct = nf_ct_get(skb, &ctinfo); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY)); + + if (ipv6_dev_get_saddr(dev_net(par->out), par->out, + &ipv6_hdr(skb)->daddr, 0, &src) < 0) + return NF_DROP; + + nfct_nat(ct)->masq_index = par->out->ifindex; + + newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; + newrange.min_addr.in6 = src; + newrange.max_addr.in6 = src; + newrange.min_proto = range->min_proto; + newrange.max_proto = range->max_proto; + + return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); +} + +static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) + return -EINVAL; + return 0; +} + +static int device_cmp(struct nf_conn *ct, void *ifindex) +{ + const struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + return 0; + return nat->masq_index == (int)(long)ifindex; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, ifa->idev->dev); + return masq_device_event(this, event, &info); +} + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static struct xt_target masquerade_tg6_reg __read_mostly = { + .name = "MASQUERADE", + .family = NFPROTO_IPV6, + .checkentry = masquerade_tg6_checkentry, + .target = masquerade_tg6, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .me = THIS_MODULE, +}; + +static int __init masquerade_tg6_init(void) +{ + int err; + + err = xt_register_target(&masquerade_tg6_reg); + if (err == 0) { + register_netdevice_notifier(&masq_dev_notifier); + register_inet6addr_notifier(&masq_inet_notifier); + } + + return err; +} +static void __exit masquerade_tg6_exit(void) +{ + unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_netdevice_notifier(&masq_dev_notifier); + xt_unregister_target(&masquerade_tg6_reg); +} + +module_init(masquerade_tg6_init); +module_exit(masquerade_tg6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: automatic address SNAT"); diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c new file mode 100644 index 00000000000..590f767db5d --- /dev/null +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6t_NPT.h> +#include <linux/netfilter/x_tables.h> + +static int ip6t_npt_checkentry(const struct xt_tgchk_param *par) +{ + struct ip6t_npt_tginfo *npt = par->targinfo; + struct in6_addr pfx; + __wsum src_sum, dst_sum; + + if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64) + return -EINVAL; + + /* Ensure that LSB of prefix is zero */ + ipv6_addr_prefix(&pfx, &npt->src_pfx.in6, npt->src_pfx_len); + if (!ipv6_addr_equal(&pfx, &npt->src_pfx.in6)) + return -EINVAL; + ipv6_addr_prefix(&pfx, &npt->dst_pfx.in6, npt->dst_pfx_len); + if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6)) + return -EINVAL; + + src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0); + dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0); + + npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum)); + return 0; +} + +static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt, + struct in6_addr *addr) +{ + unsigned int pfx_len; + unsigned int i, idx; + __be32 mask; + __sum16 sum; + + pfx_len = max(npt->src_pfx_len, npt->dst_pfx_len); + for (i = 0; i < pfx_len; i += 32) { + if (pfx_len - i >= 32) + mask = 0; + else + mask = htonl((1 << (i - pfx_len + 32)) - 1); + + idx = i / 32; + addr->s6_addr32[idx] &= mask; + addr->s6_addr32[idx] |= ~mask & npt->dst_pfx.in6.s6_addr32[idx]; + } + + if (pfx_len <= 48) + idx = 3; + else { + for (idx = 4; idx < ARRAY_SIZE(addr->s6_addr16); idx++) { + if ((__force __sum16)addr->s6_addr16[idx] != + CSUM_MANGLED_0) + break; + } + if (idx == ARRAY_SIZE(addr->s6_addr16)) + return false; + } + + sum = ~csum_fold(csum_add(csum_unfold((__force __sum16)addr->s6_addr16[idx]), + csum_unfold(npt->adjustment))); + if (sum == CSUM_MANGLED_0) + sum = 0; + *(__force __sum16 *)&addr->s6_addr16[idx] = sum; + + return true; +} + +static unsigned int +ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ip6t_npt_tginfo *npt = par->targinfo; + + if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) { + icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, saddr)); + return NF_DROP; + } + return XT_CONTINUE; +} + +static unsigned int +ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ip6t_npt_tginfo *npt = par->targinfo; + + if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) { + icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, daddr)); + return NF_DROP; + } + return XT_CONTINUE; +} + +static struct xt_target ip6t_npt_target_reg[] __read_mostly = { + { + .name = "SNPT", + .table = "mangle", + .target = ip6t_snpt_tg, + .targetsize = sizeof(struct ip6t_npt_tginfo), + .checkentry = ip6t_npt_checkentry, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "DNPT", + .table = "mangle", + .target = ip6t_dnpt_tg, + .targetsize = sizeof(struct ip6t_npt_tginfo), + .checkentry = ip6t_npt_checkentry, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, +}; + +static int __init ip6t_npt_init(void) +{ + return xt_register_targets(ip6t_npt_target_reg, + ARRAY_SIZE(ip6t_npt_target_reg)); +} + +static void __exit ip6t_npt_exit(void) +{ + xt_unregister_targets(ip6t_npt_target_reg, + ARRAY_SIZE(ip6t_npt_target_reg)); +} + +module_init(ip6t_npt_init); +module_exit(ip6t_npt_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6-to-IPv6 Network Prefix Translation (RFC 6296)"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS("ip6t_SNPT"); +MODULE_ALIAS("ip6t_DNPT"); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 8311ca31816..544b0a9da1b 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -7,6 +7,8 @@ * Authors: * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> * + * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net> + * * Based on net/ipv4/netfilter/ipt_REJECT.c * * This program is free software; you can redistribute it and/or @@ -14,222 +16,80 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/gfp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <linux/netdevice.h> -#include <net/ipv6.h> -#include <net/tcp.h> #include <net/icmp.h> -#include <net/ip6_checksum.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> #include <net/flow.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_REJECT.h> +#include <net/netfilter/ipv6/nf_reject.h> + MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); -/* Send RST reply */ -static void send_reset(struct net *net, struct sk_buff *oldskb) -{ - struct sk_buff *nskb; - struct tcphdr otcph, *tcph; - unsigned int otcplen, hh_len; - int tcphoff, needs_ack; - const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); - struct ipv6hdr *ip6h; - struct dst_entry *dst = NULL; - u8 proto; - struct flowi fl; - - if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || - (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { - pr_debug("ip6t_REJECT: addr is not unicast.\n"); - return; - } - - proto = oip6h->nexthdr; - tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto); - - if ((tcphoff < 0) || (tcphoff > oldskb->len)) { - pr_debug("ip6t_REJECT: Can't get TCP header.\n"); - return; - } - - otcplen = oldskb->len - tcphoff; - - /* IP header checks: fragment, too short. */ - if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { - pr_debug("ip6t_REJECT: proto(%d) != IPPROTO_TCP, " - "or too short. otcplen = %d\n", - proto, otcplen); - return; - } - - if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) - BUG(); - - /* No RST for RST. */ - if (otcph.rst) { - pr_debug("ip6t_REJECT: RST is set\n"); - return; - } - - /* Check checksum. */ - if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP, - skb_checksum(oldskb, tcphoff, otcplen, 0))) { - pr_debug("ip6t_REJECT: TCP checksum is invalid\n"); - return; - } - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr); - ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr); - fl.fl_ip_sport = otcph.dest; - fl.fl_ip_dport = otcph.source; - security_skb_classify_flow(oldskb, &fl); - dst = ip6_route_output(net, NULL, &fl); - if (dst == NULL) - return; - if (dst->error || xfrm_lookup(net, &dst, &fl, NULL, 0)) - return; - - hh_len = (dst->dev->hard_header_len + 15)&~15; - nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) + dst->trailer_len, - GFP_ATOMIC); - - if (!nskb) { - if (net_ratelimit()) - printk("ip6t_REJECT: Can't alloc skb\n"); - dst_release(dst); - return; - } - - skb_dst_set(nskb, dst); - - skb_reserve(nskb, hh_len + dst->header_len); - - skb_put(nskb, sizeof(struct ipv6hdr)); - skb_reset_network_header(nskb); - ip6h = ipv6_hdr(nskb); - ip6h->version = 6; - ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT); - ip6h->nexthdr = IPPROTO_TCP; - ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); - ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); - - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - /* Truncate to length (no data) */ - tcph->doff = sizeof(struct tcphdr)/4; - tcph->source = otcph.dest; - tcph->dest = otcph.source; - - if (otcph.ack) { - needs_ack = 0; - tcph->seq = otcph.ack_seq; - tcph->ack_seq = 0; - } else { - needs_ack = 1; - tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin - + otcplen - (otcph.doff<<2)); - tcph->seq = 0; - } - - /* Reset flags */ - ((u_int8_t *)tcph)[13] = 0; - tcph->rst = 1; - tcph->ack = needs_ack; - tcph->window = 0; - tcph->urg_ptr = 0; - tcph->check = 0; - - /* Adjust TCP checksum */ - tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, - &ipv6_hdr(nskb)->daddr, - sizeof(struct tcphdr), IPPROTO_TCP, - csum_partial(tcph, - sizeof(struct tcphdr), 0)); - - nf_ct_attach(nskb, oldskb); - - ip6_local_out(nskb); -} - -static inline void -send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code, - unsigned int hooknum) -{ - if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) - skb_in->dev = net->loopback_dev; - - icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL); -} static unsigned int -reject_tg6(struct sk_buff *skb, const struct xt_target_param *par) +reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; struct net *net = dev_net((par->in != NULL) ? par->in : par->out); pr_debug("%s: medium point\n", __func__); - /* WARNING: This code causes reentry within ip6tables. - This means that the ip6tables jump stack is now crap. We - must return an absolute verdict. --RR */ switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: - send_unreach(net, skb, ICMPV6_NOROUTE, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum); break; case IP6T_ICMP6_ADM_PROHIBITED: - send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); break; case IP6T_ICMP6_NOT_NEIGHBOUR: - send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); break; case IP6T_ICMP6_ADDR_UNREACH: - send_unreach(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); break; case IP6T_ICMP6_PORT_UNREACH: - send_unreach(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); + nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); break; case IP6T_ICMP6_ECHOREPLY: /* Do nothing */ break; case IP6T_TCP_RESET: - send_reset(net, skb); + nf_send_reset6(net, skb, par->hooknum); break; default: - if (net_ratelimit()) - printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with); + net_info_ratelimited("case %u not handled yet\n", reject->with); break; } return NF_DROP; } -static bool reject_tg6_check(const struct xt_tgchk_param *par) +static int reject_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_reject_info *rejinfo = par->targinfo; const struct ip6t_entry *e = par->entryinfo; if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { - printk("ip6t_REJECT: ECHOREPLY is not supported.\n"); - return false; + pr_info("ECHOREPLY is not supported.\n"); + return -EINVAL; } else if (rejinfo->with == IP6T_TCP_RESET) { /* Must specify that it's a TCP packet */ if (e->ipv6.proto != IPPROTO_TCP || (e->ipv6.invflags & XT_INV_PROTO)) { - printk("ip6t_REJECT: TCP_RESET illegal for non-tcp\n"); - return false; + pr_info("TCP_RESET illegal for non-tcp\n"); + return -EINVAL; } } - return true; + return 0; } static struct xt_target reject_tg6_reg __read_mostly = { diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c new file mode 100644 index 00000000000..a0d17270117 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip6_checksum.h> +#include <net/ip6_route.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SYNPROXY.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> + +static struct ipv6hdr * +synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = 64; //XXX + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct net *net = nf_ct_net((struct nf_conn *)nfct); + struct dst_entry *dst; + struct flowi6 fl6; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + goto free_nskb; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip6_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= XT_SYNPROXY_OPT_MSS; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + if (!synproxy_parse_options(skb, par->thoff, th, &opts)) + return NF_DROP; + + if (th->syn && !(th->ack || th->fin || th->rst)) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + return NF_DROP; + + } else if (th->ack && !(th->fin || th->rst || th->syn)) { + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + return NF_DROP; + } + + return XT_CONTINUE; +} + +static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_entry *e = par->entryinfo; + + if (!(e->ipv6.flags & IP6T_F_PROTO) || + e->ipv6.proto != IPPROTO_TCP || + e->ipv6.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg6_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), + .target = synproxy_tg6, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg6_check, + .destroy = synproxy_tg6_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg6_init(void) +{ + int err; + + err = nf_register_hooks(ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg6_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg6_exit(void) +{ + xt_unregister_target(&synproxy_tg6_reg); + nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); +} + +module_init(synproxy_tg6_init); +module_exit(synproxy_tg6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index ac0b7c629d7..04099ab7d2e 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> @@ -29,32 +29,32 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { bool r; - pr_debug("ah spi_match:%c 0x%x <= 0x%x <= 0x%x", + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } -static bool ah_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ip_auth_hdr _ah; const struct ip_auth_hdr *ah; const struct ip6t_ah *ahinfo = par->matchinfo; - unsigned int ptr; + unsigned int ptr = 0; unsigned int hdrlen = 0; int err; - err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL); + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *par->hotdrop = true; + par->hotdrop = true; return false; } ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); if (ah == NULL) { - *par->hotdrop = true; + par->hotdrop = true; return false; } @@ -87,15 +87,15 @@ static bool ah_mt6(const struct sk_buff *skb, const struct xt_match_param *par) !(ahinfo->hdrres && ah->reserved); } -static bool ah_mt6_check(const struct xt_mtchk_param *par) +static int ah_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_ah *ahinfo = par->matchinfo; if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { - pr_debug("ip6t_ah: unknown flags %X\n", ahinfo->invflags); - return false; + pr_debug("unknown flags %X\n", ahinfo->invflags); + return -EINVAL; } - return true; + return 0; } static struct xt_match ah_mt6_reg __read_mostly = { diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index ca287f6d2bc..aab0706908c 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -20,14 +20,14 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool -eui64_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par) { unsigned char eui64[8]; if (!(skb_mac_header(skb) >= skb->head && skb_mac_header(skb) + ETH_HLEN <= skb->data) && par->fragoff != 0) { - *par->hotdrop = true; + par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c index 7b91c2598ed..3b5735e56bf 100644 --- a/net/ipv6/netfilter/ip6t_frag.c +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> @@ -27,7 +27,7 @@ static inline bool id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { bool r; - pr_debug("frag id_match:%c 0x%x <= 0x%x <= 0x%x", invert ? '!' : ' ', + pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, id, max); r = (id >= min && id <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); @@ -35,24 +35,24 @@ id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) } static bool -frag_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct frag_hdr _frag; const struct frag_hdr *fh; const struct ip6t_frag *fraginfo = par->matchinfo; - unsigned int ptr; + unsigned int ptr = 0; int err; - err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL); + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *par->hotdrop = true; + par->hotdrop = true; return false; } fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); if (fh == NULL) { - *par->hotdrop = true; + par->hotdrop = true; return false; } @@ -102,15 +102,15 @@ frag_mt6(const struct sk_buff *skb, const struct xt_match_param *par) (ntohs(fh->frag_off) & IP6_MF)); } -static bool frag_mt6_check(const struct xt_mtchk_param *par) +static int frag_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_frag *fraginfo = par->matchinfo; if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { - pr_debug("ip6t_frag: unknown flags %X\n", fraginfo->invflags); - return false; + pr_debug("unknown flags %X\n", fraginfo->invflags); + return -EINVAL; } - return true; + return 0; } static struct xt_match frag_mt6_reg __read_mostly = { diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c index cbe8dec9744..01df142bb02 100644 --- a/net/ipv6/netfilter/ip6t_hbh.c +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> @@ -41,14 +41,16 @@ MODULE_ALIAS("ip6t_dst"); * 5 -> RTALERT 2 x x */ +static struct xt_match hbh_mt6_reg[] __read_mostly; + static bool -hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_opt_hdr _optsh; const struct ipv6_opt_hdr *oh; const struct ip6t_opts *optinfo = par->matchinfo; unsigned int temp; - unsigned int ptr; + unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; u8 _opttype; @@ -58,16 +60,18 @@ hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) unsigned int optlen; int err; - err = ipv6_find_hdr(skb, &ptr, par->match->data, NULL); + err = ipv6_find_hdr(skb, &ptr, + (par->match == &hbh_mt6_reg[0]) ? + NEXTHDR_HOP : NEXTHDR_DEST, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *par->hotdrop = true; + par->hotdrop = true; return false; } oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); if (oh == NULL) { - *par->hotdrop = true; + par->hotdrop = true; return false; } @@ -141,11 +145,11 @@ hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) } /* Step to the next */ - pr_debug("len%04X \n", optlen); + pr_debug("len%04X\n", optlen); if ((ptr > skb->len - optlen || hdrlen < optlen) && temp < optinfo->optsnr - 1) { - pr_debug("new pointer is too large! \n"); + pr_debug("new pointer is too large!\n"); break; } ptr += optlen; @@ -160,32 +164,32 @@ hbh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -static bool hbh_mt6_check(const struct xt_mtchk_param *par) +static int hbh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_opts *optsinfo = par->matchinfo; if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { - pr_debug("ip6t_opts: unknown flags %X\n", optsinfo->invflags); - return false; + pr_debug("unknown flags %X\n", optsinfo->invflags); + return -EINVAL; } if (optsinfo->flags & IP6T_OPTS_NSTRICT) { - pr_debug("ip6t_opts: Not strict - not implemented"); - return false; + pr_debug("Not strict - not implemented"); + return -EINVAL; } - return true; + return 0; } static struct xt_match hbh_mt6_reg[] __read_mostly = { { + /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */ .name = "hbh", .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, - .data = NEXTHDR_HOP, }, { .name = "dst", @@ -194,7 +198,6 @@ static struct xt_match hbh_mt6_reg[] __read_mostly = { .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, - .data = NEXTHDR_DEST, }, }; diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 91490ad9302..54bd9790603 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -27,7 +27,7 @@ MODULE_DESCRIPTION("Xtables: IPv6 header types match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool -ipv6header_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; unsigned int temp; @@ -118,16 +118,16 @@ ipv6header_mt6(const struct sk_buff *skb, const struct xt_match_param *par) } } -static bool ipv6header_mt6_check(const struct xt_mtchk_param *par) +static int ipv6header_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; /* invflags is 0 or 0xff in hard mode */ if ((!info->modeflag) && info->invflags != 0x00 && info->invflags != 0xFF) - return false; + return -EINVAL; - return true; + return 0; } static struct xt_match ipv6header_mt6_reg __read_mostly = { diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c index aafe4e66577..0c90c66b199 100644 --- a/net/ipv6/netfilter/ip6t_mh.c +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -11,6 +11,7 @@ * Based on net/netfilter/xt_tcpudp.c * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> @@ -24,12 +25,6 @@ MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match"); MODULE_LICENSE("GPL"); -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) printk(format , ## args) -#else -#define duprintf(format, args...) -#endif - /* Returns 1 if the type is matched by the range, 0 otherwise */ static inline bool type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) @@ -37,7 +32,7 @@ type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) return (type >= min && type <= max) ^ invert; } -static bool mh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ip6_mh _mh; const struct ip6_mh *mh; @@ -51,15 +46,15 @@ static bool mh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) if (mh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ - duprintf("Dropping evil MH tinygram.\n"); - *par->hotdrop = true; + pr_debug("Dropping evil MH tinygram.\n"); + par->hotdrop = true; return false; } if (mh->ip6mh_proto != IPPROTO_NONE) { - duprintf("Dropping invalid MH Payload Proto: %u\n", + pr_debug("Dropping invalid MH Payload Proto: %u\n", mh->ip6mh_proto); - *par->hotdrop = true; + par->hotdrop = true; return false; } @@ -67,12 +62,12 @@ static bool mh_mt6(const struct sk_buff *skb, const struct xt_match_param *par) !!(mhinfo->invflags & IP6T_MH_INV_TYPE)); } -static bool mh_mt6_check(const struct xt_mtchk_param *par) +static int mh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_mh *mhinfo = par->matchinfo; /* Must specify no unknown invflags */ - return !(mhinfo->invflags & ~IP6T_MH_INV_MASK); + return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0; } static struct xt_match mh_mt6_reg __read_mostly = { diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c new file mode 100644 index 00000000000..790e0c6b19e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2011 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/route.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> + +#include <linux/netfilter/xt_rpfilter.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("Xtables: IPv6 reverse path filter match"); + +static bool rpfilter_addr_unicast(const struct in6_addr *addr) +{ + int addr_type = ipv6_addr_type(addr); + return addr_type & IPV6_ADDR_UNICAST; +} + +static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, + const struct net_device *dev, u8 flags) +{ + struct rt6_info *rt; + struct ipv6hdr *iph = ipv6_hdr(skb); + bool ret = false; + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, + .flowi6_proto = iph->nexthdr, + .daddr = iph->saddr, + }; + int lookup_flags; + + if (rpfilter_addr_unicast(&iph->daddr)) { + memcpy(&fl6.saddr, &iph->daddr, sizeof(struct in6_addr)); + lookup_flags = RT6_LOOKUP_F_HAS_SADDR; + } else { + lookup_flags = 0; + } + + fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; + if ((flags & XT_RPFILTER_LOOSE) == 0) { + fl6.flowi6_oif = dev->ifindex; + lookup_flags |= RT6_LOOKUP_F_IFACE; + } + + rt = (void *) ip6_route_lookup(dev_net(dev), &fl6, lookup_flags); + if (rt->dst.error) + goto out; + + if (rt->rt6i_flags & (RTF_REJECT|RTF_ANYCAST)) + goto out; + + if (rt->rt6i_flags & RTF_LOCAL) { + ret = flags & XT_RPFILTER_ACCEPT_LOCAL; + goto out; + } + + if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE)) + ret = true; + out: + ip6_rt_put(rt); + return ret; +} + +static bool rpfilter_is_local(const struct sk_buff *skb) +{ + const struct rt6_info *rt = (const void *) skb_dst(skb); + return rt && (rt->rt6i_flags & RTF_LOCAL); +} + +static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rpfilter_info *info = par->matchinfo; + int saddrtype; + struct ipv6hdr *iph; + bool invert = info->flags & XT_RPFILTER_INVERT; + + if (rpfilter_is_local(skb)) + return true ^ invert; + + iph = ipv6_hdr(skb); + saddrtype = ipv6_addr_type(&iph->saddr); + if (unlikely(saddrtype == IPV6_ADDR_ANY)) + return true ^ invert; /* not routable: forward path will drop it */ + + return rpfilter_lookup_reverse6(skb, par->in, info->flags) ^ invert; +} + +static int rpfilter_check(const struct xt_mtchk_param *par) +{ + const struct xt_rpfilter_info *info = par->matchinfo; + unsigned int options = ~XT_RPFILTER_OPTION_MASK; + + if (info->flags & options) { + pr_info("unknown options encountered"); + return -EINVAL; + } + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "raw") != 0) { + pr_info("match only valid in the \'raw\' " + "or \'mangle\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } + + return 0; +} + +static struct xt_match rpfilter_mt_reg __read_mostly = { + .name = "rpfilter", + .family = NFPROTO_IPV6, + .checkentry = rpfilter_check, + .match = rpfilter_mt, + .matchsize = sizeof(struct xt_rpfilter_info), + .hooks = (1 << NF_INET_PRE_ROUTING), + .me = THIS_MODULE +}; + +static int __init rpfilter_mt_init(void) +{ + return xt_register_match(&rpfilter_mt_reg); +} + +static void __exit rpfilter_mt_exit(void) +{ + xt_unregister_match(&rpfilter_mt_reg); +} + +module_init(rpfilter_mt_init); +module_exit(rpfilter_mt_exit); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index b77307fc874..2c99b94eeca 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -6,7 +6,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> @@ -29,36 +29,36 @@ static inline bool segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { bool r; - pr_debug("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x", + pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, id, max); r = (id >= min && id <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } -static bool rt_mt6(const struct sk_buff *skb, const struct xt_match_param *par) +static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_rt_hdr _route; const struct ipv6_rt_hdr *rh; const struct ip6t_rt *rtinfo = par->matchinfo; unsigned int temp; - unsigned int ptr; + unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; struct in6_addr _addr; const struct in6_addr *ap; int err; - err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL); + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL, NULL); if (err < 0) { if (err != -ENOENT) - *par->hotdrop = true; + par->hotdrop = true; return false; } rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); if (rh == NULL) { - *par->hotdrop = true; + par->hotdrop = true; return false; } @@ -183,23 +183,23 @@ static bool rt_mt6(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -static bool rt_mt6_check(const struct xt_mtchk_param *par) +static int rt_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_rt *rtinfo = par->matchinfo; if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { - pr_debug("ip6t_rt: unknown flags %X\n", rtinfo->invflags); - return false; + pr_debug("unknown flags %X\n", rtinfo->invflags); + return -EINVAL; } if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) && (!(rtinfo->flags & IP6T_RT_TYP) || (rtinfo->rt_type != 0) || (rtinfo->invflags & IP6T_RT_INV_TYP))) { pr_debug("`--rt-type 0' required before `--rt-0-*'"); - return false; + return -EINVAL; } - return true; + return 0; } static struct xt_match rt_mt6_reg __read_mostly = { diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index ad378efd0eb..ca7f6c12808 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -21,117 +22,52 @@ MODULE_DESCRIPTION("ip6tables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static struct -{ - struct ip6t_replace repl; - struct ip6t_standard entries[3]; - struct ip6t_error term; -} initial_table __net_initdata = { - .repl = { - .name = "filter", - .valid_hooks = FILTER_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), - .hook_entry = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ip6t_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 - }, - .underflow = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ip6t_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 - }, - }, - .entries = { - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IP6T_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_FILTER, }; /* The work comes in here from netfilter.c. */ static unsigned int -ip6t_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(skb, hook, in, out, - dev_net(in)->ipv6.ip6table_filter); -} + const struct net *net = dev_net((in != NULL) ? in : out); -static unsigned int -ip6t_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ -#if 0 - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) { - if (net_ratelimit()) - printk("ip6t_hook: happy cracking.\n"); - return NF_ACCEPT; - } -#endif - - return ip6t_do_table(skb, hook, in, out, - dev_net(out)->ipv6.ip6table_filter); + return ip6t_do_table(skb, ops->hooknum, in, out, + net->ipv6.ip6table_filter); } -static struct nf_hook_ops ip6t_ops[] __read_mostly = { - { - .hook = ip6t_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_FILTER, - }, - { - .hook = ip6t_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP6_PRI_FILTER, - }, - { - .hook = ip6t_local_out_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_FILTER, - }, -}; +static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static int forward = NF_ACCEPT; +static bool forward = true; module_param(forward, bool, 0000); static int __net_init ip6table_filter_net_init(struct net *net) { - /* Register table */ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + /* Entry 1 is the FORWARD hook */ + ((struct ip6t_standard *)repl->entries)[1].target.verdict = + forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + net->ipv6.ip6table_filter = - ip6t_register_table(net, &packet_filter, &initial_table.repl); - if (IS_ERR(net->ipv6.ip6table_filter)) - return PTR_ERR(net->ipv6.ip6table_filter); - return 0; + ip6t_register_table(net, &packet_filter, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter); } static void __net_exit ip6table_filter_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_filter); + ip6t_unregister_table(net, net->ipv6.ip6table_filter); } static struct pernet_operations ip6table_filter_net_ops = { @@ -143,22 +79,16 @@ static int __init ip6table_filter_init(void) { int ret; - if (forward < 0 || forward > NF_MAX_VERDICT) { - printk("iptables forward must be 0 or 1\n"); - return -EINVAL; - } - - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; - ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) return ret; /* Register hooks */ - ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - if (ret < 0) + filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook); + if (IS_ERR(filter_ops)) { + ret = PTR_ERR(filter_ops); goto cleanup_table; + } return ret; @@ -169,7 +99,7 @@ static int __init ip6table_filter_init(void) static void __exit ip6table_filter_fini(void) { - nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); + xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&ip6table_filter_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index a929c19d30e..307bbb782d1 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -10,6 +10,8 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> +#include <net/ipv6.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -21,91 +23,27 @@ MODULE_DESCRIPTION("ip6tables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -static const struct -{ - struct ip6t_replace repl; - struct ip6t_standard entries[5]; - struct ip6t_error term; -} initial_table __net_initdata = { - .repl = { - .name = "mangle", - .valid_hooks = MANGLE_VALID_HOOKS, - .num_entries = 6, - .size = sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ip6t_standard), - [NF_INET_FORWARD] = sizeof(struct ip6t_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ip6t_standard), - [NF_INET_FORWARD] = sizeof(struct ip6t_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ip6t_standard) * 4, - }, - }, - .entries = { - IP6T_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */ - }, - .term = IP6T_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_MANGLE, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ip6t_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ip6t_do_table(skb, hook, in, out, - dev_net(in)->ipv6.ip6table_mangle); -} - -static unsigned int -ip6t_post_routing_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ip6t_do_table(skb, hook, in, out, - dev_net(out)->ipv6.ip6table_mangle); -} - static unsigned int -ip6t_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) { - unsigned int ret; struct in6_addr saddr, daddr; u_int8_t hop_limit; u_int32_t flowlabel, mark; - + int err; #if 0 /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) { - if (net_ratelimit()) - printk("ip6t_hook: happy cracking.\n"); + net_warn_ratelimited("ip6t_hook: happy cracking\n"); return NF_ACCEPT; } #endif @@ -119,70 +57,56 @@ ip6t_local_out_hook(unsigned int hook, /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, hook, in, out, + ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv6.ip6table_mangle); if (ret != NF_DROP && ret != NF_STOLEN && - (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || - memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || + !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) || skb->mark != mark || - ipv6_hdr(skb)->hop_limit != hop_limit)) - return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } return ret; } -static struct nf_hook_ops ip6t_ops[] __read_mostly = { - { - .hook = ip6t_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_local_out_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_MANGLE, - }, - { - .hook = ip6t_post_routing_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_MANGLE, - }, -}; +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (ops->hooknum == NF_INET_LOCAL_OUT) + return ip6t_mangle_out(skb, out); + if (ops->hooknum == NF_INET_POST_ROUTING) + return ip6t_do_table(skb, ops->hooknum, in, out, + dev_net(out)->ipv6.ip6table_mangle); + /* INPUT/FORWARD */ + return ip6t_do_table(skb, ops->hooknum, in, out, + dev_net(in)->ipv6.ip6table_mangle); +} +static struct nf_hook_ops *mangle_ops __read_mostly; static int __net_init ip6table_mangle_net_init(struct net *net) { - /* Register table */ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_mangler); + if (repl == NULL) + return -ENOMEM; net->ipv6.ip6table_mangle = - ip6t_register_table(net, &packet_mangler, &initial_table.repl); - if (IS_ERR(net->ipv6.ip6table_mangle)) - return PTR_ERR(net->ipv6.ip6table_mangle); - return 0; + ip6t_register_table(net, &packet_mangler, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_mangle); + ip6t_unregister_table(net, net->ipv6.ip6table_mangle); } static struct pernet_operations ip6table_mangle_net_ops = { @@ -199,9 +123,11 @@ static int __init ip6table_mangle_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - if (ret < 0) + mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); goto cleanup_table; + } return ret; @@ -212,7 +138,7 @@ static int __init ip6table_mangle_init(void) static void __exit ip6table_mangle_fini(void) { - nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); + xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&ip6table_mangle_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c new file mode 100644 index 00000000000..387d8b8fc18 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 NAT code. Development of IPv6 NAT + * funded by Astaro. + */ + +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> + +static const struct xt_table nf_nat_ipv6_table = { + .name = "nat", + .valid_hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + .af = NFPROTO_IPV6, +}; + +static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + */ + struct nf_nat_range range; + + range.flags = 0; + pr_debug("Allocating NULL binding for %p (%pI6)\n", ct, + HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6 : + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6); + + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); +} + +static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + unsigned int ret; + + ret = ip6t_do_table(skb, hooknum, in, out, net->ipv6.ip6table_nat); + if (ret == NF_ACCEPT) { + if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) + ret = alloc_null_binding(ct, hooknum); + } + return ret; +} + +static unsigned int +nf_nat_ipv6_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + __be16 frag_off; + int hdrlen; + u8 nexthdr; + + ct = nf_ct_get(skb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + * have dropped it. Hence it's the user's responsibilty to + * packet filter it out, or implement conntrack/NAT for that + * protocol. 8) --RR + */ + if (!ct) + return NF_ACCEPT; + + /* Don't try to NAT if this packet is not conntracked */ + if (nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + nexthdr = ipv6_hdr(skb)->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, + ops->hooknum, + hdrlen)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + unsigned int ret; + + ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct); + if (ret != NF_ACCEPT) + return ret; + } else { + pr_debug("Already setup manip %s for ct %p\n", + maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + break; + + default: + /* ESTABLISHED */ + NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || + ctinfo == IP_CT_ESTABLISHED_REPLY); + if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out)) + goto oif_changed; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); + +oif_changed: + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_DROP; +} + +static unsigned int +nf_nat_ipv6_in(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + struct in6_addr daddr = ipv6_hdr(skb)->daddr; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) + skb_dst_drop(skb); + + return ret; +} + +static unsigned int +nf_nat_ipv6_out(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#ifdef CONFIG_XFRM + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int err; +#endif + unsigned int ret; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3) || + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } +#endif + return ret; +} + +static unsigned int +nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int ret; + int err; + + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) + return NF_ACCEPT; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, + &ct->tuplehash[!dir].tuple.src.u3)) { + err = ip6_route_me_harder(skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#ifdef CONFIG_XFRM + else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) { + err = nf_xfrm_me_harder(skb, AF_INET6); + if (err < 0) + ret = NF_DROP_ERR(err); + } +#endif + } + return ret; +} + +static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv6_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv6_out, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC, + }, + /* Before packet filtering, change destination */ + { + .hook = nf_nat_ipv6_local_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST, + }, + /* After packet filtering, change source */ + { + .hook = nf_nat_ipv6_fn, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC, + }, +}; + +static int __net_init ip6table_nat_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat); +} + +static void __net_exit ip6table_nat_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_nat); +} + +static struct pernet_operations ip6table_nat_net_ops = { + .init = ip6table_nat_net_init, + .exit = ip6table_nat_net_exit, +}; + +static int __init ip6table_nat_init(void) +{ + int err; + + err = register_pernet_subsys(&ip6table_nat_net_ops); + if (err < 0) + goto err1; + + err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); + if (err < 0) + goto err2; + return 0; + +err2: + unregister_pernet_subsys(&ip6table_nat_net_ops); +err1: + return err; +} + +static void __exit ip6table_nat_exit(void) +{ + nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); + unregister_pernet_subsys(&ip6table_nat_net_ops); +} + +module_init(ip6table_nat_init); +module_exit(ip6table_nat_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index ed1a1180f3b..5274740acec 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -5,96 +5,48 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static const struct -{ - struct ip6t_replace repl; - struct ip6t_standard entries[2]; - struct ip6t_error term; -} initial_table __net_initdata = { - .repl = { - .name = "raw", - .valid_hooks = RAW_VALID_HOOKS, - .num_entries = 3, - .size = sizeof(struct ip6t_standard) * 2 + sizeof(struct ip6t_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) - }, - }, - .entries = { - IP6T_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IP6T_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_RAW, }; /* The work comes in here from netfilter.c. */ static unsigned int -ip6t_pre_routing_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(skb, hook, in, out, - dev_net(in)->ipv6.ip6table_raw); -} + const struct net *net = dev_net((in != NULL) ? in : out); -static unsigned int -ip6t_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ip6t_do_table(skb, hook, in, out, - dev_net(out)->ipv6.ip6table_raw); + return ip6t_do_table(skb, ops->hooknum, in, out, + net->ipv6.ip6table_raw); } -static struct nf_hook_ops ip6t_ops[] __read_mostly = { - { - .hook = ip6t_pre_routing_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_FIRST, - .owner = THIS_MODULE, - }, - { - .hook = ip6t_local_out_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_FIRST, - .owner = THIS_MODULE, - }, -}; +static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init ip6table_raw_net_init(struct net *net) { - /* Register table */ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_raw); + if (repl == NULL) + return -ENOMEM; net->ipv6.ip6table_raw = - ip6t_register_table(net, &packet_raw, &initial_table.repl); - if (IS_ERR(net->ipv6.ip6table_raw)) - return PTR_ERR(net->ipv6.ip6table_raw); - return 0; + ip6t_register_table(net, &packet_raw, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw); } static void __net_exit ip6table_raw_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_raw); + ip6t_unregister_table(net, net->ipv6.ip6table_raw); } static struct pernet_operations ip6table_raw_net_ops = { @@ -111,9 +63,11 @@ static int __init ip6table_raw_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - if (ret < 0) + rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook); + if (IS_ERR(rawtable_ops)) { + ret = PTR_ERR(rawtable_ops); goto cleanup_table; + } return ret; @@ -124,7 +78,7 @@ static int __init ip6table_raw_init(void) static void __exit ip6table_raw_fini(void) { - nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); + xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&ip6table_raw_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 41b444c6093..ab3b0219ecf 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -17,6 +17,7 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>"); @@ -26,115 +27,44 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static const struct -{ - struct ip6t_replace repl; - struct ip6t_standard entries[3]; - struct ip6t_error term; -} initial_table __net_initdata = { - .repl = { - .name = "security", - .valid_hooks = SECURITY_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), - .hook_entry = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ip6t_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2, - }, - .underflow = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ip6t_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2, - }, - }, - .entries = { - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IP6T_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IP6T_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_SECURITY, }; static unsigned int -ip6t_local_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ip6t_do_table(skb, hook, in, out, - dev_net(in)->ipv6.ip6table_security); -} + const struct net *net = dev_net((in != NULL) ? in : out); -static unsigned int -ip6t_forward_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ip6t_do_table(skb, hook, in, out, - dev_net(in)->ipv6.ip6table_security); + return ip6t_do_table(skb, ops->hooknum, in, out, + net->ipv6.ip6table_security); } -static unsigned int -ip6t_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* TBD: handle short packets via raw socket */ - return ip6t_do_table(skb, hook, in, out, - dev_net(out)->ipv6.ip6table_security); -} - -static struct nf_hook_ops ip6t_ops[] __read_mostly = { - { - .hook = ip6t_local_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_SECURITY, - }, - { - .hook = ip6t_forward_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP6_PRI_SECURITY, - }, - { - .hook = ip6t_local_out_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_SECURITY, - }, -}; +static struct nf_hook_ops *sectbl_ops __read_mostly; static int __net_init ip6table_security_net_init(struct net *net) { - net->ipv6.ip6table_security = - ip6t_register_table(net, &security_table, &initial_table.repl); + struct ip6t_replace *repl; - if (IS_ERR(net->ipv6.ip6table_security)) - return PTR_ERR(net->ipv6.ip6table_security); - - return 0; + repl = ip6t_alloc_initial_table(&security_table); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_security = + ip6t_register_table(net, &security_table, repl); + kfree(repl); + return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security); } static void __net_exit ip6table_security_net_exit(struct net *net) { - ip6t_unregister_table(net->ipv6.ip6table_security); + ip6t_unregister_table(net, net->ipv6.ip6table_security); } static struct pernet_operations ip6table_security_net_ops = { @@ -150,9 +80,11 @@ static int __init ip6table_security_init(void) if (ret < 0) return ret; - ret = nf_register_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); - if (ret < 0) + sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook); + if (IS_ERR(sectbl_ops)) { + ret = PTR_ERR(sectbl_ops); goto cleanup_table; + } return ret; @@ -163,7 +95,7 @@ cleanup_table: static void __exit ip6table_security_fini(void) { - nf_unregister_hooks(ip6t_ops, ARRAY_SIZE(ip6t_ops)); + xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&ip6table_security_net_ops); } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 0956ebabbff..4cbc6b290dd 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -16,18 +16,22 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> -#include <linux/sysctl.h> #include <net/ipv6.h> #include <net/inet_frag.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> +#include <net/netfilter/nf_nat_helper.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netfilter/nf_log.h> static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, @@ -63,209 +67,118 @@ static int ipv6_print_tuple(struct seq_file *s, tuple->src.u3.ip6, tuple->dst.u3.ip6); } -/* - * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c - * - * This function parses (probably truncated) exthdr set "hdr" - * of length "len". "nexthdrp" initially points to some place, - * where type of the first header can be found. - * - * It skips all well-known exthdrs, and returns pointer to the start - * of unparsable area i.e. the first header with unknown type. - * if success, *nexthdr is updated by type/protocol of this header. - * - * NOTES: - it may return pointer pointing beyond end of packet, - * if the last recognized header is truncated in the middle. - * - if packet is truncated, so that all parsed headers are skipped, - * it returns -1. - * - if packet is fragmented, return pointer of the fragment header. - * - ESP is unparsable for now and considered like - * normal payload protocol. - * - Note also special handling of AUTH header. Thanks to IPsec wizards. - */ - -static int nf_ct_ipv6_skip_exthdr(const struct sk_buff *skb, int start, - u8 *nexthdrp, int len) -{ - u8 nexthdr = *nexthdrp; - - while (ipv6_ext_hdr(nexthdr)) { - struct ipv6_opt_hdr hdr; - int hdrlen; - - if (len < (int)sizeof(struct ipv6_opt_hdr)) - return -1; - if (nexthdr == NEXTHDR_NONE) - break; - if (nexthdr == NEXTHDR_FRAGMENT) - break; - if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) - BUG(); - if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hdr.hdrlen+2)<<2; - else - hdrlen = ipv6_optlen(&hdr); - - nexthdr = hdr.nexthdr; - len -= hdrlen; - start += hdrlen; - } - - *nexthdrp = nexthdr; - return start; -} - static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { unsigned int extoff = nhoff + sizeof(struct ipv6hdr); - unsigned char pnum; + __be16 frag_off; int protoff; + u8 nexthdr; if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), - &pnum, sizeof(pnum)) != 0) { + &nexthdr, sizeof(nexthdr)) != 0) { pr_debug("ip6_conntrack_core: can't get nexthdr\n"); return -NF_ACCEPT; } - protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, skb->len - extoff); + protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); /* - * (protoff == skb->len) mean that the packet doesn't have no data - * except of IPv6 & ext headers. but it's tracked anyway. - YK + * (protoff == skb->len) means the packet has not data, just + * IPv6 and possibly extensions headers, but it is tracked anyway */ - if ((protoff < 0) || (protoff > skb->len)) { + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("ip6_conntrack_core: can't find proto in pkt\n"); return -NF_ACCEPT; } *dataoff = protoff; - *protonum = pnum; + *protonum = nexthdr; return NF_ACCEPT; } -static unsigned int ipv6_confirm(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ipv6_helper(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { struct nf_conn *ct; const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; enum ip_conntrack_info ctinfo; - unsigned int ret, protoff; - unsigned int extoff = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - + __be16 frag_off; + int protoff; + u8 nexthdr; /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) - goto out; + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; help = nfct_help(ct); if (!help) - goto out; + return NF_ACCEPT; /* rcu_read_lock()ed by nf_hook_slow */ helper = rcu_dereference(help->helper); if (!helper) - goto out; + return NF_ACCEPT; - protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, - skb->len - extoff); - if (protoff > skb->len || pnum == NEXTHDR_FRAGMENT) { + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } - ret = helper->help(skb, protoff, ct, ctinfo); - if (ret != NF_ACCEPT) { - nf_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, NULL, - "nf_ct_%s: dropping packet", helper->name); - return ret; - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, - struct sk_buff *skb) -{ -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) - return IP6_DEFRAG_CONNTRACK_BRIDGE_IN; -#endif - if (hooknum == NF_INET_PRE_ROUTING) - return IP6_DEFRAG_CONNTRACK_IN; - else - return IP6_DEFRAG_CONNTRACK_OUT; - + return helper->help(skb, protoff, ct, ctinfo); } -static unsigned int ipv6_defrag(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int ipv6_confirm(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - struct sk_buff *reasm; - - /* Previously seen (loopback)? */ - if (skb->nfct) - return NF_ACCEPT; - - reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); - /* queued */ - if (reasm == NULL) - return NF_STOLEN; - - /* error occured or not fragmented */ - if (reasm == skb) - return NF_ACCEPT; - - nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, - (struct net_device *)out, okfn); - - return NF_STOLEN; -} + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + int protoff; + __be16 frag_off; -static unsigned int __ipv6_conntrack_in(struct net *net, - unsigned int hooknum, - struct sk_buff *skb, - int (*okfn)(struct sk_buff *)) -{ - struct sk_buff *reasm = skb->nfct_reasm; + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; - /* This packet is fragmented and has reassembled packet. */ - if (reasm) { - /* Reassembled packet isn't parsed yet ? */ - if (!reasm->nfct) { - unsigned int ret; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + goto out; + } - ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm); - if (ret != NF_ACCEPT) - return ret; + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; } - nf_conntrack_get(reasm->nfct); - skb->nfct = reasm->nfct; - skb->nfctinfo = reasm->nfctinfo; - return NF_ACCEPT; } - - return nf_conntrack_in(net, PF_INET6, hooknum, skb); +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); } -static unsigned int ipv6_conntrack_in(unsigned int hooknum, +static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn); + return nf_conntrack_in(dev_net(in), PF_INET6, ops->hooknum, skb); } -static unsigned int ipv6_conntrack_local(unsigned int hooknum, +static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -273,22 +186,14 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum, { /* root is playing with raw sockets. */ if (skb->len < sizeof(struct ipv6hdr)) { - if (net_ratelimit()) - printk("ipv6_conntrack_local: packet too short\n"); + net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); return NF_ACCEPT; } - return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn); + return nf_conntrack_in(dev_net(out), PF_INET6, ops->hooknum, skb); } static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { { - .hook = ipv6_defrag, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, - }, - { .hook = ipv6_conntrack_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, @@ -303,11 +208,11 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_defrag, + .hook = ipv6_helper, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, }, { .hook = ipv6_confirm, @@ -317,6 +222,13 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { .priority = NF_IP6_PRI_LAST, }, { + .hook = ipv6_helper, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { .hook = ipv6_confirm, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, @@ -325,7 +237,52 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { }, }; -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +static int +ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *inet6 = inet6_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct sockaddr_in6 sin6; + struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; + struct nf_conn *ct; + + tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.in6 = sk->sk_v6_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.dst.protonum = sk->sk_protocol; + + if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) + return -ENOPROTOOPT; + + if (*len < 0 || (unsigned int) *len < sizeof(sin6)) + return -EINVAL; + + h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + if (!h) { + pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", + &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + + sin6.sin6_family = AF_INET6; + sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + sin6.sin6_flowinfo = inet6->flow_label & IPV6_FLOWINFO_MASK; + memcpy(&sin6.sin6_addr, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, + sizeof(sin6.sin6_addr)); + + nf_ct_put(ct); + sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, + sk->sk_bound_dev_if); + return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> @@ -333,10 +290,11 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { static int ipv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NLA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4, - &tuple->src.u3.ip6); - NLA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, - &tuple->dst.u3.ip6); + if (nla_put(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4, + &tuple->src.u3.ip6) || + nla_put(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, + &tuple->dst.u3.ip6)) + goto nla_put_failure; return 0; nla_put_failure: @@ -375,16 +333,12 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .invert_tuple = ipv6_invert_tuple, .print_tuple = ipv6_print_tuple, .get_l4proto = ipv6_get_l4proto, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv6_tuple_to_nlattr, .nlattr_tuple_size = ipv6_nlattr_tuple_size, .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_path = nf_net_netfilter_sysctl_path, - .ctl_table = nf_ct_ipv6_sysctl_table, -#endif .me = THIS_MODULE, }; @@ -392,72 +346,137 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); +static struct nf_sockopt_ops so_getorigdst6 = { + .pf = NFPROTO_IPV6, + .get_optmin = IP6T_SO_ORIGINAL_DST, + .get_optmax = IP6T_SO_ORIGINAL_DST + 1, + .get = ipv6_getorigdst, + .owner = THIS_MODULE, +}; + +static int ipv6_net_init(struct net *net) +{ + int ret = 0; + + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp6); + if (ret < 0) { + pr_err("nf_conntrack_tcp6: pernet registration failed\n"); + goto out; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp6); + if (ret < 0) { + pr_err("nf_conntrack_udp6: pernet registration failed\n"); + goto cleanup_tcp6; + } + ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmpv6); + if (ret < 0) { + pr_err("nf_conntrack_icmp6: pernet registration failed\n"); + goto cleanup_udp6; + } + ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); + goto cleanup_icmpv6; + } + return 0; + cleanup_icmpv6: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6); + cleanup_udp6: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6); + cleanup_tcp6: + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6); + out: + return ret; +} + +static void ipv6_net_exit(struct net *net) +{ + nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6); + nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6); +} + +static struct pernet_operations ipv6_net_ops = { + .init = ipv6_net_init, + .exit = ipv6_net_exit, +}; + static int __init nf_conntrack_l3proto_ipv6_init(void) { int ret = 0; need_conntrack(); + nf_defrag_ipv6_enable(); - ret = nf_ct_frag6_init(); + ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't initialize frag6.\n"); + pr_err("Unable to register netfilter socket option\n"); return ret; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); + + ret = register_pernet_subsys(&ipv6_net_ops); + if (ret < 0) + goto cleanup_sockopt; + + ret = nf_register_hooks(ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register tcp.\n"); - goto cleanup_frag6; + pr_err("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); + goto cleanup_pernet; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register udp.\n"); - goto cleanup_tcp; + pr_err("nf_conntrack_ipv6: can't register tcp6 proto.\n"); + goto cleanup_hooks; } - ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register icmpv6.\n"); - goto cleanup_udp; + pr_err("nf_conntrack_ipv6: can't register udp6 proto.\n"); + goto cleanup_tcp6; } - ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmpv6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register ipv6\n"); - goto cleanup_icmpv6; + pr_err("nf_conntrack_ipv6: can't register icmpv6 proto.\n"); + goto cleanup_udp6; } - ret = nf_register_hooks(ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); + ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); if (ret < 0) { - printk("nf_conntrack_ipv6: can't register pre-routing defrag " - "hook.\n"); - goto cleanup_ipv6; + pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n"); + goto cleanup_icmpv6; } return ret; - cleanup_ipv6: - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); cleanup_icmpv6: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - cleanup_udp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - cleanup_tcp: - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - cleanup_frag6: - nf_ct_frag6_cleanup(); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); + cleanup_udp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6); + cleanup_tcp6: + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + cleanup_hooks: + nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); + cleanup_pernet: + unregister_pernet_subsys(&ipv6_net_ops); + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst6); return ret; } static void __exit nf_conntrack_l3proto_ipv6_fini(void) { synchronize_net(); + nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6); + nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); - nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); - nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - nf_ct_frag6_cleanup(); + unregister_pernet_subsys(&ipv6_net_ops); + nf_unregister_sockopt(&so_getorigdst6); } module_init(nf_conntrack_l3proto_ipv6_init); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index c7b8bd1d798..b3807c5cb88 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -23,11 +23,17 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ; +static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6; +} + static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) @@ -87,25 +93,31 @@ static int icmpv6_print_tuple(struct seq_file *s, ntohs(tuple->src.u.icmp.id)); } +static unsigned int *icmpv6_get_timeouts(struct net *net) +{ + return &icmpv6_pernet(net)->timeout; +} + /* Returns verdict for packet, or -1 for invalid. */ static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum) + unsigned int hooknum, + unsigned int *timeout) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ - nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, unsigned int *timeouts) { static const u_int8_t valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, @@ -119,7 +131,8 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, type + 128); nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL, + NULL, NULL, "nf_ct_icmpv6: invalid new with type %d ", type + 128); return false; @@ -128,7 +141,7 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, } static int -icmpv6_error_message(struct net *net, +icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int icmp6off, enum ip_conntrack_info *ctinfo, @@ -137,6 +150,7 @@ icmpv6_error_message(struct net *net, struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; NF_CT_ASSERT(skb->nfct == NULL); @@ -163,7 +177,7 @@ icmpv6_error_message(struct net *net, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, &intuple); + h = nf_conntrack_find_get(net, zone, &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; @@ -175,11 +189,12 @@ icmpv6_error_message(struct net *net, /* Update skb to refer to this connection */ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; skb->nfctinfo = *ctinfo; - return -NF_ACCEPT; + return NF_ACCEPT; } static int -icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, +icmpv6_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmp6hdr *icmp6h; @@ -189,7 +204,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: short packet "); return -NF_ACCEPT; } @@ -197,7 +212,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { if (LOG_INVALID(net, IPPROTO_ICMPV6)) - nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: ICMPv6 checksum failed "); return -NF_ACCEPT; } @@ -205,7 +220,7 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { - skb->nfct = &nf_conntrack_untracked.ct_general; + skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); return NF_ACCEPT; @@ -215,20 +230,20 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(net, skb, dataoff, ctinfo, hooknum); + return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum); } -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NLA_PUT_BE16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id); - NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type); - NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code); - + if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) + goto nla_put_failure; return 0; nla_put_failure: @@ -267,12 +282,50 @@ static int icmpv6_nlattr_tuple_size(void) } #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_icmp_net *in = icmpv6_pernet(net); + + if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; + } else { + /* Set default ICMPv6 timeout. */ + *timeout = in->timeout; + } + return 0; +} + +static int +icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { + [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + #ifdef CONFIG_SYSCTL -static struct ctl_table_header *icmpv6_sysctl_header; static struct ctl_table icmpv6_sysctl_table[] = { { .procname = "nf_conntrack_icmpv6_timeout", - .data = &nf_ct_icmpv6_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -281,6 +334,36 @@ static struct ctl_table icmpv6_sysctl_table[] = { }; #endif /* CONFIG_SYSCTL */ +static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmpv6_sysctl_table, + sizeof(icmpv6_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmpv6_init_net(struct net *net, u_int16_t proto) +{ + struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmpv6_timeout; + + return icmpv6_kmemdup_sysctl_table(pn, in); +} + +static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = { .l3proto = PF_INET6, @@ -290,16 +373,24 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = .invert_tuple = icmpv6_invert_tuple, .print_tuple = icmpv6_print_tuple, .packet = icmpv6_packet, + .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, .error = icmpv6_error, -#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, .nlattr_to_tuple = icmpv6_nlattr_to_tuple, .nla_policy = icmpv6_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_header = &icmpv6_sysctl_header, - .ctl_table = icmpv6_sysctl_table, -#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, + .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_ICMP_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = icmpv6_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = icmpv6_init_net, + .get_net_proto = icmpv6_get_net_proto, }; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 312c20adc83..0d5279fd852 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -14,6 +14,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv6-nf: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> @@ -27,6 +29,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> @@ -38,16 +41,15 @@ #include <net/rawv6.h> #include <net/ndisc.h> #include <net/addrconf.h> +#include <net/inet_ecn.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/kernel.h> #include <linux/module.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> -#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */ -#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ -#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT struct nf_ct_frag6_skb_cb { @@ -58,117 +60,131 @@ struct nf_ct_frag6_skb_cb #define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) -struct nf_ct_frag6_queue -{ - struct inet_frag_queue q; - - __be32 id; /* fragment id */ - struct in6_addr saddr; - struct in6_addr daddr; - - unsigned int csum; - __u16 nhoffset; -}; - static struct inet_frags nf_frags; -static struct netns_frags nf_init_frags; #ifdef CONFIG_SYSCTL -struct ctl_table nf_ct_ipv6_sysctl_table[] = { +static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &nf_init_frags.timeout, + .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", - .data = &nf_init_frags.low_thresh, + .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nf_conntrack_frag6_high_thresh", - .data = &nf_init_frags.high_thresh, + .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; -#endif -static unsigned int nf_hashfn(struct inet_frag_queue *q) +static int nf_ct_frag6_sysctl_register(struct net *net) { - const struct nf_ct_frag6_queue *nq; + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = nf_ct_frag6_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table), + GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->nf_frag.frags.timeout; + table[1].data = &net->nf_frag.frags.low_thresh; + table[2].data = &net->nf_frag.frags.high_thresh; + } + + hdr = register_net_sysctl(net, "net/netfilter", table); + if (hdr == NULL) + goto err_reg; + + net->nf_frag.sysctl.frags_hdr = hdr; + return 0; - nq = container_of(q, struct nf_ct_frag6_queue, q); - return inet6_hash_frag(nq->id, &nq->saddr, &nq->daddr, nf_frags.rnd); +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; } -static void nf_skb_free(struct sk_buff *skb) +static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { - if (NFCT_FRAG6_CB(skb)->orig) - kfree_skb(NFCT_FRAG6_CB(skb)->orig); + struct ctl_table *table; + + table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr); + if (!net_eq(net, &init_net)) + kfree(table); } -/* Memory Tracking Functions. */ -static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) +#else +static int nf_ct_frag6_sysctl_register(struct net *net) { - if (work) - *work -= skb->truesize; - atomic_sub(skb->truesize, &nf_init_frags.mem); - nf_skb_free(skb); - kfree_skb(skb); + return 0; } - -/* Destruction primitives. */ - -static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) +static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { - inet_frag_put(&fq->q, &nf_frags); } +#endif -/* Kill fq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) +static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { - inet_frag_kill(&fq->q, &nf_frags); + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -static void nf_ct_frag6_evictor(void) +static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, + const struct in6_addr *daddr) { - local_bh_disable(); - inet_frag_evictor(&nf_init_frags, &nf_frags); - local_bh_enable(); + u32 c; + + net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); + c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, nf_frags.rnd); + return c & (INETFRAGS_HASHSZ - 1); } -static void nf_ct_frag6_expire(unsigned long data) + +static unsigned int nf_hashfn(struct inet_frag_queue *q) { - struct nf_ct_frag6_queue *fq; + const struct frag_queue *nq; - fq = container_of((struct inet_frag_queue *)data, - struct nf_ct_frag6_queue, q); + nq = container_of(q, struct frag_queue, q); + return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); +} - spin_lock(&fq->q.lock); +static void nf_skb_free(struct sk_buff *skb) +{ + if (NFCT_FRAG6_CB(skb)->orig) + kfree_skb(NFCT_FRAG6_CB(skb)->orig); +} - if (fq->q.last_in & INET_FRAG_COMPLETE) - goto out; +static void nf_ct_frag6_expire(unsigned long data) +{ + struct frag_queue *fq; + struct net *net; - fq_kill(fq); + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + net = container_of(fq->q.net, struct net, nf_frag.frags); -out: - spin_unlock(&fq->q.lock); - fq_put(fq); + ip6_expire_frag_queue(net, fq, &nf_frags); } /* Creation primitives. */ - -static __inline__ struct nf_ct_frag6_queue * -fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst) +static inline struct frag_queue *fq_find(struct net *net, __be32 id, + u32 user, struct in6_addr *src, + struct in6_addr *dst, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -178,36 +194,38 @@ fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst) arg.user = user; arg.src = src; arg.dst = dst; + arg.ecn = ecn; read_lock_bh(&nf_frags.lock); - hash = inet6_hash_frag(id, src, dst, nf_frags.rnd); + hash = nf_hash_frag(id, src, dst); - q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); + q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); local_bh_enable(); - if (q == NULL) - goto oom; - - return container_of(q, struct nf_ct_frag6_queue, q); - -oom: - pr_debug("Can't alloc new queue\n"); - return NULL; + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } + return container_of(q, struct frag_queue, q); } -static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, +static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff) { struct sk_buff *prev, *next; + unsigned int payload_len; int offset, end; + u8 ecn; if (fq->q.last_in & INET_FRAG_COMPLETE) { - pr_debug("Allready completed\n"); + pr_debug("Already completed\n"); goto err; } + payload_len = ntohs(ipv6_hdr(skb)->payload_len); + offset = ntohs(fhdr->frag_off) & ~0x7; - end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + end = offset + (payload_len - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { @@ -215,6 +233,8 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, return -1; } + ecn = ip6_frag_ecn(ipv6_hdr(skb)); + if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, @@ -272,6 +292,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, * in the chain of fragments so far. We must know where to put * this fragment, right? */ + prev = fq->q.fragments_tail; + if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { if (NFCT_FRAG6_CB(next)->offset >= offset) @@ -279,80 +304,45 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, prev = next; } - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. +found: + /* RFC5722, Section 4: + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments, including those not yet received) MUST be silently + * discarded. */ - if (prev) { - int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; - if (i > 0) { - offset += i; - if (end <= offset) { - pr_debug("overlap\n"); - goto err; - } - if (!pskb_pull(skb, i)) { - pr_debug("Can't pull\n"); - goto err; - } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } - - /* Look for overlap with succeeding segments. - * If we can merge fragments, do it. - */ - while (next && NFCT_FRAG6_CB(next)->offset < end) { - /* overlap is 'i' bytes */ - int i = end - NFCT_FRAG6_CB(next)->offset; - - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - pr_debug("Eat head of the overlapped parts.: %d", i); - if (!pskb_pull(next, i)) - goto err; + /* Check for overlap with preceding fragment. */ + if (prev && + (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; - /* next fragment */ - NFCT_FRAG6_CB(next)->offset += i; - fq->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragmnet is completely overridden with - * new one drop it. - */ - next = next->next; - - if (prev) - prev->next = next; - else - fq->q.fragments = next; - - fq->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); - } - } + /* Look for overlap with succeeding segment. */ + if (next && NFCT_FRAG6_CB(next)->offset < end) + goto discard_fq; NFCT_FRAG6_CB(skb)->offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; + if (!next) + fq->q.fragments_tail = skb; if (prev) prev->next = skb; else fq->q.fragments = skb; - skb->dev = NULL; + if (skb->dev) { + fq->iif = skb->dev->ifindex; + skb->dev = NULL; + } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &nf_init_frags.mem); + fq->ecn |= ecn; + if (payload_len > fq->q.max_size) + fq->q.max_size = payload_len; + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -361,11 +351,12 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, fq->nhoffset = nhoff; fq->q.last_in |= INET_FRAG_FIRST_IN; } - write_lock(&nf_frags.lock); - list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); - write_unlock(&nf_frags.lock); + + inet_frag_lru_move(&fq->q); return 0; +discard_fq: + inet_frag_kill(&fq->q, &nf_frags); err: return -1; } @@ -380,16 +371,21 @@ err: * the last and the first frames arrived and all the bits are here. */ static struct sk_buff * -nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) { struct sk_buff *fp, *op, *head = fq->q.fragments; int payload_len; + u8 ecn; - fq_kill(fq); + inet_frag_kill(&fq->q, &nf_frags); WARN_ON(head == NULL); WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); + ecn = ip_frag_ecn_table[fq->ecn]; + if (unlikely(ecn == 0xff)) + goto out_fail; + /* Unfragmented part is taken from the first segment. */ payload_len = ((head->data - skb_network_header(head)) - sizeof(struct ipv6hdr) + fq->q.len - @@ -400,7 +396,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) } /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + if (skb_unclone(head, GFP_ATOMIC)) { pr_debug("skb is cloned but can't expand head"); goto out_oom; } @@ -408,20 +404,20 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; - if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) { - pr_debug("Can't alloc skb\n"); + clone = alloc_skb(0, GFP_ATOMIC); + if (clone == NULL) goto out_oom; - } + clone->next = head->next; head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); - for (i=0; i<skb_shinfo(head)->nr_frags; i++) - plen += skb_shinfo(head)->frags[i].size; + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; @@ -429,7 +425,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) clone->ip_summed = head->ip_summed; NFCT_FRAG6_CB(clone)->orig = NULL; - atomic_add(clone->truesize, &nf_init_frags.mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -443,7 +439,6 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &nf_init_frags.mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -453,13 +448,16 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &nf_init_frags.mem); } + sub_frag_mem_limit(&fq->q, head->truesize); + head->ignore_df = 1; head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); + IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) @@ -468,10 +466,11 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) head->csum); fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ fp = skb_shinfo(head)->frag_list; - if (NFCT_FRAG6_CB(fp)->orig == NULL) + if (fp && NFCT_FRAG6_CB(fp)->orig == NULL) /* at above code, head skb is divided into two skbs. */ fp = fp->next; @@ -487,12 +486,11 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) return head; out_oversize: - if (net_ratelimit()) - printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); + net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", + payload_len); goto out_fail; out_oom: - if (net_ratelimit()) - printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); + net_dbg_ratelimited("nf_ct_frag6_reasm: no memory for reassembly\n"); out_fail: return NULL; } @@ -564,8 +562,10 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) { struct sk_buff *clone; struct net_device *dev = skb->dev; + struct net *net = skb_dst(skb) ? dev_net(skb_dst(skb)->dev) + : dev_net(skb->dev); struct frag_hdr *fhdr; - struct nf_ct_frag6_queue *fq; + struct frag_queue *fq; struct ipv6hdr *hdr; int fhoff, nhoff; u8 prevhdr; @@ -597,16 +597,12 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) hdr = ipv6_hdr(clone); fhdr = (struct frag_hdr *)skb_transport_header(clone); - if (!(fhdr->frag_off & htons(0xFFF9))) { - pr_debug("Invalid fragment offset\n"); - /* It is not a fragmented frame */ - goto ret_orig; - } - - if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh) - nf_ct_frag6_evictor(); + local_bh_disable(); + inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false); + local_bh_enable(); - fq = fq_find(fhdr->identification, user, &hdr->saddr, &hdr->daddr); + fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, + ip6_frag_ecn(hdr)); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); goto ret_orig; @@ -617,7 +613,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { spin_unlock_bh(&fq->q.lock); pr_debug("Can't insert skb to queue\n"); - fq_put(fq); + inet_frag_put(&fq->q, &nf_frags); goto ret_orig; } @@ -629,7 +625,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user) } spin_unlock_bh(&fq->q.lock); - fq_put(fq); + inet_frag_put(&fq->q, &nf_frags); return ret_skb; ret_orig: @@ -637,50 +633,62 @@ ret_orig: return skb; } -void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, - struct net_device *in, struct net_device *out, - int (*okfn)(struct sk_buff *)) +void nf_ct_frag6_consume_orig(struct sk_buff *skb) { struct sk_buff *s, *s2; for (s = NFCT_FRAG6_CB(skb)->orig; s;) { - nf_conntrack_put_reasm(s->nfct_reasm); - nf_conntrack_get_reasm(skb); - s->nfct_reasm = skb; - s2 = s->next; s->next = NULL; - - NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn, - NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + consume_skb(s); s = s2; } - nf_conntrack_put_reasm(skb); } +static int nf_ct_net_init(struct net *net) +{ + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; + inet_frags_init_net(&net->nf_frag.frags); + + return nf_ct_frag6_sysctl_register(net); +} + +static void nf_ct_net_exit(struct net *net) +{ + nf_ct_frags6_sysctl_unregister(net); + inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); +} + +static struct pernet_operations nf_ct_net_ops = { + .init = nf_ct_net_init, + .exit = nf_ct_net_exit, +}; + int nf_ct_frag6_init(void) { + int ret = 0; + nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; nf_frags.skb_free = nf_skb_free; - nf_frags.qsize = sizeof(struct nf_ct_frag6_queue); + nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.secret_interval = 10 * 60 * HZ; - nf_init_frags.timeout = IPV6_FRAG_TIMEOUT; - nf_init_frags.high_thresh = 256 * 1024; - nf_init_frags.low_thresh = 192 * 1024; - inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); - return 0; + ret = register_pernet_subsys(&nf_ct_net_ops); + if (ret) + inet_frags_fini(&nf_frags); + + return ret; } void nf_ct_frag6_cleanup(void) { + unregister_pernet_subsys(&nf_ct_net_ops); inet_frags_fini(&nf_frags); - - nf_init_frags.low_thresh = 0; - nf_ct_frag6_evictor(); } diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c new file mode 100644 index 00000000000..7b9a748c6ba --- /dev/null +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -0,0 +1,140 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ipv6.h> +#include <net/inet_frag.h> + +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter_bridge.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> +#endif +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> + +static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); +#endif + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP6_DEFRAG_CONNTRACK_IN + zone; + else + return IP6_DEFRAG_CONNTRACK_OUT + zone; + +} + +static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Previously seen (loopback)? */ + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + return NF_ACCEPT; +#endif + + reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb)); + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occurred or not fragmented */ + if (reasm == skb) + return NF_ACCEPT; + + nf_ct_frag6_consume_orig(reasm); + + NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, reasm, + (struct net_device *) in, (struct net_device *) out, + okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + + return NF_STOLEN; +} + +static struct nf_hook_ops ipv6_defrag_ops[] = { + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + int ret = 0; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); + return ret; + } + ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't register hooks\n"); + goto cleanup_frag6; + } + return ret; + +cleanup_frag6: + nf_ct_frag6_cleanup(); + return ret; + +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + nf_ct_frag6_cleanup(); +} + +void nf_defrag_ipv6_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c new file mode 100644 index 00000000000..abfe75a2e31 --- /dev/null +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of IPv6 NAT funded by Astaro. + */ +#include <linux/types.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <net/secure_seq.h> +#include <net/checksum.h> +#include <net/ip6_checksum.h> +#include <net/ip6_route.h> +#include <net/ipv6.h> + +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv6; + +#ifdef CONFIG_XFRM +static void nf_nat_ipv6_decode_session(struct sk_buff *skb, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + unsigned long statusbit, + struct flowi *fl) +{ + const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; + struct flowi6 *fl6 = &fl->u.ip6; + + if (ct->status & statusbit) { + fl6->daddr = t->dst.u3.in6; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl6->fl6_dport = t->dst.u.all; + } + + statusbit ^= IPS_NAT_MASK; + + if (ct->status & statusbit) { + fl6->saddr = t->src.u3.in6; + if (t->dst.protonum == IPPROTO_TCP || + t->dst.protonum == IPPROTO_UDP || + t->dst.protonum == IPPROTO_UDPLITE || + t->dst.protonum == IPPROTO_DCCP || + t->dst.protonum == IPPROTO_SCTP) + fl6->fl6_sport = t->src.u.all; + } +} +#endif + +static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, + const struct nf_nat_range *range) +{ + return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && + ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; +} + +static u32 nf_nat_ipv6_secure_port(const struct nf_conntrack_tuple *t, + __be16 dport) +{ + return secure_ipv6_port_ephemeral(t->src.u3.ip6, t->dst.u3.ip6, dport); +} + +static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, + unsigned int iphdroff, + const struct nf_nat_l4proto *l4proto, + const struct nf_conntrack_tuple *target, + enum nf_nat_manip_type maniptype) +{ + struct ipv6hdr *ipv6h; + __be16 frag_off; + int hdroff; + u8 nexthdr; + + if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h))) + return false; + + ipv6h = (void *)skb->data + iphdroff; + nexthdr = ipv6h->nexthdr; + hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), + &nexthdr, &frag_off); + if (hdroff < 0) + goto manip_addr; + + if ((frag_off & htons(~0x7)) == 0 && + !l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff, + target, maniptype)) + return false; +manip_addr: + if (maniptype == NF_NAT_MANIP_SRC) + ipv6h->saddr = target->src.u3.in6; + else + ipv6h->daddr = target->dst.u3.in6; + + return true; +} + +static void nf_nat_ipv6_csum_update(struct sk_buff *skb, + unsigned int iphdroff, __sum16 *check, + const struct nf_conntrack_tuple *t, + enum nf_nat_manip_type maniptype) +{ + const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); + const struct in6_addr *oldip, *newip; + + if (maniptype == NF_NAT_MANIP_SRC) { + oldip = &ipv6h->saddr; + newip = &t->src.u3.in6; + } else { + oldip = &ipv6h->daddr; + newip = &t->dst.u3.in6; + } + inet_proto_csum_replace16(check, skb, oldip->s6_addr32, + newip->s6_addr32, 1); +} + +static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, + u8 proto, void *data, __sum16 *check, + int datalen, int oldlen) +{ + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + if (!(rt->rt6i_flags & RTF_LOCAL) && + (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + + skb_network_offset(skb) + + (data - (void *)skb->data); + skb->csum_offset = (void *)check - data; + *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, proto, 0); + } else { + *check = 0; + *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, proto, + csum_partial(data, datalen, + 0)); + if (proto == IPPROTO_UDP && !*check) + *check = CSUM_MANGLED_0; + } + } else + inet_proto_csum_replace2(check, skb, + htons(oldlen), htons(datalen), 1); +} + +static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], + struct nf_nat_range *range) +{ + if (tb[CTA_NAT_V6_MINIP]) { + nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], + sizeof(struct in6_addr)); + range->flags |= NF_NAT_RANGE_MAP_IPS; + } + + if (tb[CTA_NAT_V6_MAXIP]) + nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], + sizeof(struct in6_addr)); + else + range->max_addr = range->min_addr; + + return 0; +} + +static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = { + .l3proto = NFPROTO_IPV6, + .secure_port = nf_nat_ipv6_secure_port, + .in_range = nf_nat_ipv6_in_range, + .manip_pkt = nf_nat_ipv6_manip_pkt, + .csum_update = nf_nat_ipv6_csum_update, + .csum_recalc = nf_nat_ipv6_csum_recalc, + .nlattr_to_range = nf_nat_ipv6_nlattr_to_range, +#ifdef CONFIG_XFRM + .decode_session = nf_nat_ipv6_decode_session, +#endif +}; + +int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + unsigned int hdrlen) +{ + struct { + struct icmp6hdr icmp6; + struct ipv6hdr ip6; + } *inside; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); + const struct nf_nat_l4proto *l4proto; + struct nf_conntrack_tuple target; + unsigned long statusbit; + + NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); + + if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + return 0; + if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) + return 0; + + inside = (void *)skb->data + hdrlen; + if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { + if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) + return 0; + if (ct->status & IPS_NAT_MASK) + return 0; + } + + if (manip == NF_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply direction */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + + l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, inside->ip6.nexthdr); + if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), + l4proto, &ct->tuplehash[!dir].tuple, !manip)) + return 0; + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + inside = (void *)skb->data + hdrlen; + inside->icmp6.icmp6_cksum = 0; + inside->icmp6.icmp6_cksum = + csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + skb->len - hdrlen, IPPROTO_ICMPV6, + csum_partial(&inside->icmp6, + skb->len - hdrlen, 0)); + } + + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, IPPROTO_ICMPV6); + if (!nf_nat_ipv6_manip_pkt(skb, 0, l4proto, &target, manip)) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); + +static int __init nf_nat_l3proto_ipv6_init(void) +{ + int err; + + err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); + if (err < 0) + goto err1; + err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv6); + if (err < 0) + goto err2; + return err; + +err2: + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); +err1: + return err; +} + +static void __exit nf_nat_l3proto_ipv6_exit(void) +{ + nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv6); + nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6); +} + +MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf-nat-" __stringify(AF_INET6)); + +module_init(nf_nat_l3proto_ipv6_init); +module_exit(nf_nat_l3proto_ipv6_exit); diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c new file mode 100644 index 00000000000..2205e8eeeac --- /dev/null +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2011 Patrick Mchardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on Rusty Russell's IPv4 ICMP NAT code. Development of IPv6 + * NAT funded by Astaro. + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/icmpv6.h> + +#include <linux/netfilter.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/netfilter/nf_nat_l4proto.h> + +static bool +icmpv6_in_range(const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype, + const union nf_conntrack_man_proto *min, + const union nf_conntrack_man_proto *max) +{ + return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && + ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); +} + +static void +icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, + struct nf_conntrack_tuple *tuple, + const struct nf_nat_range *range, + enum nf_nat_manip_type maniptype, + const struct nf_conn *ct) +{ + static u16 id; + unsigned int range_size; + unsigned int i; + + range_size = ntohs(range->max_proto.icmp.id) - + ntohs(range->min_proto.icmp.id) + 1; + + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) + range_size = 0xffff; + + for (i = 0; ; ++id) { + tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + + (id % range_size)); + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; + } +} + +static bool +icmpv6_manip_pkt(struct sk_buff *skb, + const struct nf_nat_l3proto *l3proto, + unsigned int iphdroff, unsigned int hdroff, + const struct nf_conntrack_tuple *tuple, + enum nf_nat_manip_type maniptype) +{ + struct icmp6hdr *hdr; + + if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + return false; + + hdr = (struct icmp6hdr *)(skb->data + hdroff); + l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum, + tuple, maniptype); + if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || + hdr->icmp6_type == ICMPV6_ECHO_REPLY) { + inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, + hdr->icmp6_identifier, + tuple->src.u.icmp.id, 0); + hdr->icmp6_identifier = tuple->src.u.icmp.id; + } + return true; +} + +const struct nf_nat_l4proto nf_nat_l4proto_icmpv6 = { + .l4proto = IPPROTO_ICMPV6, + .manip_pkt = icmpv6_manip_pkt, + .in_range = icmpv6_in_range, + .unique_tuple = icmpv6_unique_tuple, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, +#endif +}; diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c new file mode 100644 index 00000000000..0d812b31277 --- /dev/null +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/ipv6.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv6.h> + +static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + /* malformed packet, drop it */ + if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) + return NF_DROP; + + return nft_do_chain(&pkt, ops); +} + +static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (unlikely(skb->len < sizeof(struct ipv6hdr))) { + if (net_ratelimit()) + pr_info("nf_tables_ipv6: ignoring short SOCK_RAW " + "packet\n"); + return NF_ACCEPT; + } + + return nft_do_chain_ipv6(ops, skb, in, out, okfn); +} + +struct nft_af_info nft_af_ipv6 __read_mostly = { + .family = NFPROTO_IPV6, + .nhooks = NF_INET_NUMHOOKS, + .owner = THIS_MODULE, + .nops = 1, + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, + [NF_INET_LOCAL_OUT] = nft_ipv6_output, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, + }, +}; +EXPORT_SYMBOL_GPL(nft_af_ipv6); + +static int nf_tables_ipv6_init_net(struct net *net) +{ + net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); + if (net->nft.ipv6 == NULL) + return -ENOMEM; + + memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6)); + + if (nft_register_afinfo(net, net->nft.ipv6) < 0) + goto err; + + return 0; +err: + kfree(net->nft.ipv6); + return -ENOMEM; +} + +static void nf_tables_ipv6_exit_net(struct net *net) +{ + nft_unregister_afinfo(net->nft.ipv6); + kfree(net->nft.ipv6); +} + +static struct pernet_operations nf_tables_ipv6_net_ops = { + .init = nf_tables_ipv6_init_net, + .exit = nf_tables_ipv6_exit_net, +}; + +static const struct nf_chain_type filter_ipv6 = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), +}; + +static int __init nf_tables_ipv6_init(void) +{ + int ret; + + nft_register_chain_type(&filter_ipv6); + ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_ipv6); + + return ret; +} + +static void __exit nf_tables_ipv6_exit(void) +{ + unregister_pernet_subsys(&nf_tables_ipv6_net_ops); + nft_unregister_chain_type(&filter_ipv6); +} + +module_init(nf_tables_ipv6_init); +module_exit(nf_tables_ipv6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_FAMILY(AF_INET6); diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c new file mode 100644 index 00000000000..d189fcb437f --- /dev/null +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/netfilter/nf_nat_l3proto.h> +#include <net/ipv6.h> + +/* + * IPv6 NAT chains + */ + +static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_nat *nat; + enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum); + __be16 frag_off; + int hdrlen; + u8 nexthdr; + struct nft_pktinfo pkt; + unsigned int ret; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return NF_ACCEPT; + + nat = nf_ct_nat_ext_add(ct); + if (nat == NULL) + return NF_ACCEPT; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED + IP_CT_IS_REPLY: + nexthdr = ipv6_hdr(skb)->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, + ops->hooknum, + hdrlen)) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall through */ + case IP_CT_NEW: + if (nf_nat_initialized(ct, maniptype)) + break; + + nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out); + + ret = nft_do_chain(&pkt, ops); + if (ret != NF_ACCEPT) + return ret; + if (!nf_nat_initialized(ct, maniptype)) { + ret = nf_nat_alloc_null_binding(ct, ops->hooknum); + if (ret != NF_ACCEPT) + return ret; + } + default: + break; + } + + return nf_nat_packet(ct, ctinfo, ops->hooknum, skb); +} + +static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct in6_addr daddr = ipv6_hdr(skb)->daddr; + unsigned int ret; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) + skb_dst_drop(skb); + + return ret; +} + +static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo __maybe_unused; + const struct nf_conn *ct __maybe_unused; + unsigned int ret; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); +#ifdef CONFIG_XFRM + if (ret != NF_DROP && ret != NF_STOLEN && + !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3) || + (ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all)) + if (nf_xfrm_me_harder(skb, AF_INET6) < 0) + ret = NF_DROP; + } +#endif + return ret; +} + +static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int ret; + + ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN && + (ct = nf_ct_get(skb, &ctinfo)) != NULL) { + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, + &ct->tuplehash[!dir].tuple.src.u3)) { + if (ip6_route_me_harder(skb)) + ret = NF_DROP; + } +#ifdef CONFIG_XFRM + else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.u.all != + ct->tuplehash[!dir].tuple.src.u.all) + if (nf_xfrm_me_harder(skb, AF_INET6)) + ret = NF_DROP; +#endif + } + return ret; +} + +static const struct nf_chain_type nft_chain_nat_ipv6 = { + .name = "nat", + .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), + .hooks = { + [NF_INET_PRE_ROUTING] = nf_nat_ipv6_prerouting, + [NF_INET_POST_ROUTING] = nf_nat_ipv6_postrouting, + [NF_INET_LOCAL_OUT] = nf_nat_ipv6_output, + [NF_INET_LOCAL_IN] = nf_nat_ipv6_fn, + }, +}; + +static int __init nft_chain_nat_ipv6_init(void) +{ + int err; + + err = nft_register_chain_type(&nft_chain_nat_ipv6); + if (err < 0) + return err; + + return 0; +} + +static void __exit nft_chain_nat_ipv6_exit(void) +{ + nft_unregister_chain_type(&nft_chain_nat_ipv6); +} + +module_init(nft_chain_nat_ipv6_init); +module_exit(nft_chain_nat_ipv6_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat"); diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c new file mode 100644 index 00000000000..42031299585 --- /dev/null +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/route.h> + +static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + struct nft_pktinfo pkt; + struct in6_addr saddr, daddr; + u_int8_t hop_limit; + u32 mark, flowlabel; + + /* malformed packet, drop it */ + if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0) + return NF_DROP; + + /* save source/dest address, mark, hoplimit, flowlabel, priority */ + memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); + memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); + mark = skb->mark; + hop_limit = ipv6_hdr(skb)->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either */ + flowlabel = *((u32 *)ipv6_hdr(skb)); + + ret = nft_do_chain(&pkt, ops); + if (ret != NF_DROP && ret != NF_QUEUE && + (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || + memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + skb->mark != mark || + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) + return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + + return ret; +} + +static const struct nf_chain_type nft_chain_route_ipv6 = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook, + }, +}; + +static int __init nft_chain_route_init(void) +{ + return nft_register_chain_type(&nft_chain_route_ipv6); +} + +static void __exit nft_chain_route_exit(void) +{ + nft_unregister_chain_type(&nft_chain_route_ipv6); +} + +module_init(nft_chain_route_init); +module_exit(nft_chain_route_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route"); diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c new file mode 100644 index 00000000000..0bc19fa8782 --- /dev/null +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_reject.h> +#include <net/netfilter/ipv6/nf_reject.h> + +void nft_reject_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} +EXPORT_SYMBOL_GPL(nft_reject_ipv6_eval); + +static struct nft_expr_type nft_reject_ipv6_type; +static const struct nft_expr_ops nft_reject_ipv6_ops = { + .type = &nft_reject_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv6_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "reject", + .ops = &nft_reject_ipv6_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv6_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv6_type); +} + +static void __exit nft_reject_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv6_type); +} + +module_init(nft_reject_ipv6_module_init); +module_exit(nft_reject_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject"); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c new file mode 100644 index 00000000000..5ec867e4a8b --- /dev/null +++ b/net/ipv6/output_core.c @@ -0,0 +1,98 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. These functions are needed by GSO/GRO implementation. + */ +#include <linux/export.h> +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/addrconf.h> +#include <net/secure_seq.h> + +int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + unsigned int packet_len = skb_tail_pointer(skb) - + skb_network_header(skb); + int found_rhdr = 0; + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + found_rhdr = 1; + break; + case NEXTHDR_DEST: +#if IS_ENABLED(CONFIG_IPV6_MIP6) + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) + break; +#endif + if (found_rhdr) + return offset; + break; + default : + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); + } + + return offset; +} +EXPORT_SYMBOL(ip6_find_1stfragopt); + +#if IS_ENABLED(CONFIG_IPV6) +int ip6_dst_hoplimit(struct dst_entry *dst) +{ + int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + if (hoplimit == 0) { + struct net_device *dev = dst->dev; + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) + hoplimit = idev->cnf.hop_limit; + else + hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + rcu_read_unlock(); + } + return hoplimit; +} +EXPORT_SYMBOL(ip6_dst_hoplimit); +#endif + +int __ip6_local_out(struct sk_buff *skb) +{ + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + ipv6_hdr(skb)->payload_len = htons(len); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); +} +EXPORT_SYMBOL_GPL(__ip6_local_out); + +int ip6_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip6_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip6_local_out); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c new file mode 100644 index 00000000000..5b7a1ed2aba --- /dev/null +++ b/net/ipv6/ping.c @@ -0,0 +1,275 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * "Ping" sockets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on ipv4/ping.c code. + * + * Authors: Lorenzo Colitti (IPv6 support) + * Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6), + * Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32) + * + */ + +#include <net/addrconf.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/protocol.h> +#include <net/udp.h> +#include <net/transp_v6.h> +#include <net/ping.h> + +struct proto pingv6_prot = { + .name = "PINGv6", + .owner = THIS_MODULE, + .init = ping_init_sock, + .close = ping_close, + .connect = ip6_datagram_connect_v6_only, + .disconnect = udp_disconnect, + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .sendmsg = ping_v6_sendmsg, + .recvmsg = ping_recvmsg, + .bind = ping_bind, + .backlog_rcv = ping_queue_rcv_skb, + .hash = ping_hash, + .unhash = ping_unhash, + .get_port = ping_get_port, + .obj_size = sizeof(struct raw6_sock), +}; +EXPORT_SYMBOL_GPL(pingv6_prot); + +static struct inet_protosw pingv6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_ICMPV6, + .prot = &pingv6_prot, + .ops = &inet6_dgram_ops, + .flags = INET_PROTOSW_REUSE, +}; + + +/* Compatibility glue so we can support IPv6 when it's compiled as a module */ +static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, + int *addr_len) +{ + return -EAFNOSUPPORT; +} +static void dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ +} +static int dummy_icmpv6_err_convert(u8 type, u8 code, int *err) +{ + return -EAFNOSUPPORT; +} +static void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) {} +static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict) +{ + return 0; +} + +int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct icmp6hdr user_icmph; + int addr_type; + struct in6_addr *daddr; + int iif = 0; + struct flowi6 fl6; + int err; + int hlimit; + struct dst_entry *dst; + struct rt6_info *rt; + struct pingfakehdr pfh; + + pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph, + sizeof(user_icmph)); + if (err) + return err; + + if (msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); + if (msg->msg_namelen < sizeof(struct sockaddr_in6) || + u->sin6_family != AF_INET6) { + return -EINVAL; + } + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != u->sin6_scope_id) { + return -EINVAL; + } + daddr = &(u->sin6_addr); + iif = u->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &sk->sk_v6_daddr; + } + + if (!iif) + iif = sk->sk_bound_dev_if; + + addr_type = ipv6_addr_type(daddr); + if (__ipv6_addr_needs_scope_id(addr_type) && !iif) + return -EINVAL; + if (addr_type & IPV6_ADDR_MAPPED) + return -EINVAL; + + /* TODO: use ip6_datagram_send_ctl to get options from cmsg */ + + memset(&fl6, 0, sizeof(fl6)); + + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.saddr = np->saddr; + fl6.daddr = *daddr; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_icmp_type = user_icmph.icmp6_type; + fl6.fl6_icmp_code = user_icmph.icmp6_code; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr); + if (IS_ERR(dst)) + return PTR_ERR(dst); + rt = (struct rt6_info *) dst; + + np = inet6_sk(sk); + if (!np) + return -EBADF; + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + pfh.icmph.type = user_icmph.icmp6_type; + pfh.icmph.code = user_icmph.icmp6_code; + pfh.icmph.checksum = 0; + pfh.icmph.un.echo.id = inet->inet_sport; + pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence; + pfh.iov = msg->msg_iov; + pfh.wcheck = 0; + pfh.family = AF_INET6; + + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + + lock_sock(sk); + err = ip6_append_data(sk, ping_getfrag, &pfh, len, + 0, hlimit, + np->tclass, NULL, &fl6, rt, + MSG_DONTWAIT, np->dontfrag); + + if (err) { + ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, + ICMP6_MIB_OUTERRORS); + ip6_flush_pending_frames(sk); + } else { + err = icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *) &pfh.icmph, + len); + } + release_sock(sk); + + if (err) + return err; + + return len; +} + +#ifdef CONFIG_PROC_FS +static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos) +{ + return ping_seq_start(seq, pos, AF_INET6); +} + +static int ping_v6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + int bucket = ((struct ping_iter_state *) seq->private)->bucket; + struct inet_sock *inet = inet_sk(v); + __u16 srcp = ntohs(inet->inet_sport); + __u16 destp = ntohs(inet->inet_dport); + ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + } + return 0; +} + +static struct ping_seq_afinfo ping_v6_seq_afinfo = { + .name = "icmp6", + .family = AF_INET6, + .seq_fops = &ping_seq_fops, + .seq_ops = { + .start = ping_v6_seq_start, + .show = ping_v6_seq_show, + .next = ping_seq_next, + .stop = ping_seq_stop, + }, +}; + +static int __net_init ping_v6_proc_init_net(struct net *net) +{ + return ping_proc_register(net, &ping_v6_seq_afinfo); +} + +static void __net_init ping_v6_proc_exit_net(struct net *net) +{ + return ping_proc_unregister(net, &ping_v6_seq_afinfo); +} + +static struct pernet_operations ping_v6_net_ops = { + .init = ping_v6_proc_init_net, + .exit = ping_v6_proc_exit_net, +}; +#endif + +int __init pingv6_init(void) +{ +#ifdef CONFIG_PROC_FS + int ret = register_pernet_subsys(&ping_v6_net_ops); + if (ret) + return ret; +#endif + pingv6_ops.ipv6_recv_error = ipv6_recv_error; + pingv6_ops.ip6_datagram_recv_common_ctl = ip6_datagram_recv_common_ctl; + pingv6_ops.ip6_datagram_recv_specific_ctl = + ip6_datagram_recv_specific_ctl; + pingv6_ops.icmpv6_err_convert = icmpv6_err_convert; + pingv6_ops.ipv6_icmp_error = ipv6_icmp_error; + pingv6_ops.ipv6_chk_addr = ipv6_chk_addr; + return inet6_register_protosw(&pingv6_protosw); +} + +/* This never gets called because it's not possible to unload the ipv6 module, + * but just in case. + */ +void pingv6_exit(void) +{ + pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error; + pingv6_ops.ip6_datagram_recv_common_ctl = dummy_ip6_datagram_recv_ctl; + pingv6_ops.ip6_datagram_recv_specific_ctl = dummy_ip6_datagram_recv_ctl; + pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert; + pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error; + pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr; +#ifdef CONFIG_PROC_FS + unregister_pernet_subsys(&ping_v6_net_ops); +#endif + inet6_unregister_protosw(&pingv6_protosw); +} diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index c9605c3ad91..3317440ea34 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -21,6 +21,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> +#include <linux/export.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> @@ -59,7 +60,7 @@ static const struct file_operations sockstat6_seq_fops = { .release = single_release_net, }; -static struct snmp_mib snmp6_ipstats_list[] = { +static const struct snmp_mib snmp6_ipstats_list[] = { /* ipv6 mib according to RFC 2465 */ SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS), SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), @@ -89,14 +90,21 @@ static struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ + SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp6_icmp6_list[] = { +static const struct snmp_mib snmp6_icmp6_list[] = { /* icmpv6 mib according to RFC 2466 */ SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), + SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), + SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; @@ -120,23 +128,29 @@ static const char *const icmp6type2name[256] = { }; -static struct snmp_mib snmp6_udp6_list[] = { +static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp6_udplite6_list[] = { +static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS), SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS), + SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), + SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_SENTINEL }; -static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void **mib) +static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) { char name[32]; int i; @@ -153,45 +167,63 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void **mib) snprintf(name, sizeof(name), "Icmp6%s%s", i & 0x100 ? "Out" : "In", p); seq_printf(seq, "%-32s\t%lu\n", name, - snmp_fold_field(mib, i)); + atomic_long_read(smib + i)); } /* print by number (nonzero only) - ICMPMsgStat format */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { unsigned long val; - val = snmp_fold_field(mib, i); + val = atomic_long_read(smib + i); if (!val) continue; snprintf(name, sizeof(name), "Icmp6%sType%u", i & 0x100 ? "Out" : "In", i & 0xff); seq_printf(seq, "%-32s\t%lu\n", name, val); } - return; } -static inline void -snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist) +/* can be called either with percpu mib (pcpumib != NULL), + * or shared one (smib != NULL) + */ +static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, + atomic_long_t *smib, + const struct snmp_mib *itemlist) +{ + int i; + unsigned long val; + + for (i = 0; itemlist[i].name; i++) { + val = pcpumib ? + snmp_fold_field(pcpumib, itemlist[i].entry) : + atomic_long_read(smib + itemlist[i].entry); + seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val); + } +} + +static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, + const struct snmp_mib *itemlist, size_t syncpoff) { int i; - for (i=0; itemlist[i].name; i++) - seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, - snmp_fold_field(mib, itemlist[i].entry)); + + for (i = 0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, + snmp_fold_field64(mib, itemlist[i].entry, syncpoff)); } static int snmp6_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; - snmp6_seq_show_item(seq, (void **)net->mib.ipv6_statistics, - snmp6_ipstats_list); - snmp6_seq_show_item(seq, (void **)net->mib.icmpv6_statistics, - snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, (void **)net->mib.icmpv6msg_statistics); - snmp6_seq_show_item(seq, (void **)net->mib.udp_stats_in6, - snmp6_udp6_list); - snmp6_seq_show_item(seq, (void **)net->mib.udplite_stats_in6, - snmp6_udplite6_list); + snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, + snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, + NULL, snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); + snmp6_seq_show_item(seq, net->mib.udp_stats_in6, + NULL, snmp6_udp6_list); + snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, + NULL, snmp6_udplite6_list); return 0; } @@ -213,15 +245,17 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) struct inet6_dev *idev = (struct inet6_dev *)seq->private; seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); - snmp6_seq_show_item(seq, (void **)idev->stats.ipv6, snmp6_ipstats_list); - snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); - snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg); + snmp6_seq_show_item64(seq, idev->stats.ipv6, + snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, + snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } static int snmp6_dev_seq_open(struct inode *inode, struct file *file) { - return single_open(file, snmp6_dev_seq_show, PDE(inode)->data); + return single_open(file, snmp6_dev_seq_show, PDE_DATA(inode)); } static const struct file_operations snmp6_dev_seq_fops = { @@ -259,21 +293,20 @@ int snmp6_unregister_dev(struct inet6_dev *idev) struct net *net = dev_net(idev->dev); if (!net->mib.proc_net_devsnmp6) return -ENOENT; - if (!idev || !idev->stats.proc_dir_entry) + if (!idev->stats.proc_dir_entry) return -EINVAL; - remove_proc_entry(idev->stats.proc_dir_entry->name, - net->mib.proc_net_devsnmp6); + proc_remove(idev->stats.proc_dir_entry); idev->stats.proc_dir_entry = NULL; return 0; } -static int ipv6_proc_init_net(struct net *net) +static int __net_init ipv6_proc_init_net(struct net *net) { - if (!proc_net_fops_create(net, "sockstat6", S_IRUGO, - &sockstat6_seq_fops)) + if (!proc_create("sockstat6", S_IRUGO, net->proc_net, + &sockstat6_seq_fops)) return -ENOMEM; - if (!proc_net_fops_create(net, "snmp6", S_IRUGO, &snmp6_seq_fops)) + if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops)) goto proc_snmp6_fail; net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); @@ -281,18 +314,18 @@ static int ipv6_proc_init_net(struct net *net) goto proc_dev_snmp6_fail; return 0; -proc_snmp6_fail: - proc_net_remove(net, "sockstat6"); proc_dev_snmp6_fail: - proc_net_remove(net, "dev_snmp6"); + remove_proc_entry("snmp6", net->proc_net); +proc_snmp6_fail: + remove_proc_entry("sockstat6", net->proc_net); return -ENOMEM; } -static void ipv6_proc_exit_net(struct net *net) +static void __net_exit ipv6_proc_exit_net(struct net *net) { - proc_net_remove(net, "sockstat6"); - proc_net_remove(net, "dev_snmp6"); - proc_net_remove(net, "snmp6"); + remove_proc_entry("sockstat6", net->proc_net); + remove_proc_entry("dev_snmp6", net->proc_net); + remove_proc_entry("snmp6", net->proc_net); } static struct pernet_operations ipv6_proc_ops = { diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index 1fa3468f0f3..e048cf1bb6a 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -25,52 +25,49 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; -static DEFINE_SPINLOCK(inet6_proto_lock); - +#if IS_ENABLED(CONFIG_IPV6) +const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet6_protos); int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); + return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet6_add_protocol); - spin_lock_bh(&inet6_proto_lock); +int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol) +{ + int ret; - if (inet6_protos[hash]) { - ret = -1; - } else { - inet6_protos[hash] = prot; - ret = 0; - } + ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol], + prot, NULL) == prot) ? 0 : -1; - spin_unlock_bh(&inet6_proto_lock); + synchronize_net(); return ret; } +EXPORT_SYMBOL(inet6_del_protocol); +#endif -EXPORT_SYMBOL(inet6_add_protocol); +const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly; -/* - * Remove a protocol from the hash tables. - */ - -int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol) +int inet6_add_offload(const struct net_offload *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet6_proto_lock); + return !cmpxchg((const struct net_offload **)&inet6_offloads[protocol], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(inet6_add_offload); - if (inet6_protos[hash] != prot) { - ret = -1; - } else { - inet6_protos[hash] = NULL; - ret = 0; - } +int inet6_del_offload(const struct net_offload *prot, unsigned char protocol) +{ + int ret; - spin_unlock_bh(&inet6_proto_lock); + ret = (cmpxchg((const struct net_offload **)&inet6_offloads[protocol], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); return ret; } - -EXPORT_SYMBOL(inet6_del_protocol); +EXPORT_SYMBOL(inet6_del_offload); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 926ce8eeffa..b2dc60b0c76 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -21,6 +21,7 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> +#include <linux/slab.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> @@ -30,6 +31,7 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/skbuff.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -48,7 +50,7 @@ #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #include <linux/mroute6.h> @@ -59,34 +61,35 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/export.h> + +#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ static struct raw_hashinfo raw_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock), }; static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, - unsigned short num, struct in6_addr *loc_addr, - struct in6_addr *rmt_addr, int dif) + unsigned short num, const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif) { - struct hlist_node *node; - int is_multicast = ipv6_addr_is_multicast(loc_addr); + bool is_multicast = ipv6_addr_is_multicast(loc_addr); - sk_for_each_from(sk, node) + sk_for_each_from(sk) if (inet_sk(sk)->inet_num == num) { - struct ipv6_pinfo *np = inet6_sk(sk); if (!net_eq(sock_net(sk), net)) continue; - if (!ipv6_addr_any(&np->daddr) && - !ipv6_addr_equal(&np->daddr, rmt_addr)) + if (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) goto found; if (is_multicast && inet6_mc_check(sk, loc_addr, rmt_addr)) @@ -104,38 +107,40 @@ found: * 0 - deliver * 1 - block */ -static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) +static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) { - struct icmp6hdr *icmph; - struct raw6_sock *rp = raw6_sk(sk); - - if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) { - __u32 *data = &rp->filter.data[0]; - int bit_nr; + struct icmp6hdr _hdr; + const struct icmp6hdr *hdr; - icmph = (struct icmp6hdr *) skb->data; - bit_nr = icmph->icmp6_type; + /* We require only the four bytes of the ICMPv6 header, not any + * additional bytes of message body in "struct icmp6hdr". + */ + hdr = skb_header_pointer(skb, skb_transport_offset(skb), + ICMPV6_HDRLEN, &_hdr); + if (hdr) { + const __u32 *data = &raw6_sk(sk)->filter.data[0]; + unsigned int type = hdr->icmp6_type; - return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0; + return (data[type >> 5] & (1U << (type & 31))) != 0; } - return 0; + return 1; } -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) -static int (*mh_filter)(struct sock *sock, struct sk_buff *skb); +#if IS_ENABLED(CONFIG_IPV6_MIP6) +typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); + +static mh_filter_t __rcu *mh_filter __read_mostly; -int rawv6_mh_filter_register(int (*filter)(struct sock *sock, - struct sk_buff *skb)) +int rawv6_mh_filter_register(mh_filter_t filter) { rcu_assign_pointer(mh_filter, filter); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_register); -int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock, - struct sk_buff *skb)) +int rawv6_mh_filter_unregister(mh_filter_t filter) { - rcu_assign_pointer(mh_filter, NULL); + RCU_INIT_POINTER(mh_filter, NULL); synchronize_rcu(); return 0; } @@ -150,19 +155,19 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); * * Caller owns SKB so we must make clones. */ -static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { - struct in6_addr *saddr; - struct in6_addr *daddr; + const struct in6_addr *saddr; + const struct in6_addr *daddr; struct sock *sk; - int delivered = 0; + bool delivered = false; __u8 hash; struct net *net; saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; - hash = nexthdr & (MAX_INET_PROTOS - 1); + hash = nexthdr & (RAW_HTABLE_SIZE - 1); read_lock(&raw_v6_hashinfo.lock); sk = sk_head(&raw_v6_hashinfo.ht[hash]); @@ -176,13 +181,13 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) while (sk) { int filtered; - delivered = 1; + delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); break; -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: { /* XXX: To validate MH only once for each packet, @@ -191,10 +196,10 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) * policy is placed in rawv6_rcv() because it is * required for each socket. */ - int (*filter)(struct sock *sock, struct sk_buff *skb); + mh_filter_t *filter; filter = rcu_dereference(mh_filter); - filtered = filter ? filter(sk, skb) : 0; + filtered = filter ? (*filter)(sk, skb) : 0; break; } #endif @@ -222,11 +227,11 @@ out: return delivered; } -int raw6_local_deliver(struct sk_buff *skb, int nexthdr) +bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { struct sock *raw_sk; - raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]); + raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]); if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) raw_sk = NULL; @@ -245,6 +250,10 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); /* Raw sockets are IPv6 only */ @@ -262,7 +271,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; - if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another @@ -296,9 +305,9 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) } inet->inet_rcv_saddr = inet->inet_saddr = v4addr; - ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) - ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + np->saddr = addr->sin6_addr; err = 0; out_unlock: rcu_read_unlock(); @@ -325,9 +334,14 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, return; harderr = icmpv6_err_convert(type, code, &err); - if (type == ICMPV6_PKT_TOOBIG) + if (type == ICMPV6_PKT_TOOBIG) { + ip6_sk_update_pmtu(skb, sk, info); harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); - + } + if (type == NDISC_REDIRECT) { + ip6_sk_redirect(skb, sk); + return; + } if (np->recverr) { u8 *payload = skb->data; if (!inet->hdrincl) @@ -346,7 +360,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, { struct sock *sk; int hash; - struct in6_addr *saddr, *daddr; + const struct in6_addr *saddr, *daddr; struct net *net; hash = nexthdr & (RAW_HTABLE_SIZE - 1); @@ -355,7 +369,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, sk = sk_head(&raw_v6_hashinfo.ht[hash]); if (sk != NULL) { /* Note: ipv6_hdr(skb) != skb->data */ - struct ipv6hdr *ip6h = (struct ipv6hdr *)skb->data; + const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; saddr = &ip6h->saddr; daddr = &ip6h->daddr; net = dev_net(skb->dev); @@ -370,9 +384,9 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, read_unlock(&raw_v6_hashinfo.lock); } -static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) +static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { - if ((raw6_sk(sk)->checksum || sk->sk_filter) && + if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb(skb); @@ -380,6 +394,7 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) } /* Charge it to the socket. */ + skb_dst_drop(skb); if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; @@ -446,7 +461,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, int noblock, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct sk_buff *skb; size_t copied; int err; @@ -454,11 +469,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (flags & MSG_OOB) return -EOPNOTSUPP; - if (addr_len) - *addr_len=sizeof(*sin6); - if (flags & MSG_ERRQUEUE) - return ipv6_recv_error(sk, msg, len); + return ipv6_recv_error(sk, msg, len, addr_len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) @@ -488,17 +503,17 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; - ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr); + sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = IP6CB(skb)->iif; + sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); + *addr_len = sizeof(*sin6); } sock_recv_ts_and_drops(msg, sk, skb); if (np->rxopt.all) - datagram_recv_ctl(sk, msg, skb); + ip6_datagram_recv_ctl(sk, msg, skb); err = copied; if (flags & MSG_TRUNC) @@ -519,7 +534,7 @@ csum_copy_err: goto out; } -static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, +static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { struct sk_buff *skb; @@ -537,8 +552,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, goto out; offset = rp->offset; - total_len = inet_sk(sk)->cork.length - (skb_network_header(skb) - - skb->data); + total_len = inet_sk(sk)->cork.base.length; if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); @@ -581,11 +595,10 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, if (unlikely(csum)) tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); - csum = csum_ipv6_magic(&fl->fl6_src, - &fl->fl6_dst, - total_len, fl->proto, tmp_csum); + csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + total_len, fl6->flowi6_proto, tmp_csum); - if (csum == 0 && fl->proto == IPPROTO_UDP) + if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; if (skb_store_bits(skb, offset, &csum, 2)) @@ -598,31 +611,36 @@ out: } static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, - struct flowi *fl, struct rt6_info *rt, + struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; + struct rt6_info *rt = (struct rt6_info *)*dstp; + int hlen = LL_RESERVED_SPACE(rt->dst.dev); + int tlen = rt->dst.dev->needed_tailroom; - if (length > rt->u.dst.dev->mtu) { - ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu); + if (length > rt->dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, - length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, + length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(skb, hlen); + skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set(skb, &rt->dst); + *dstp = NULL; skb_put(skb, length); skb_reset_network_header(skb); @@ -636,8 +654,8 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, goto error_fault; IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); - err = NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -655,7 +673,7 @@ error: return err; } -static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg) { struct iovec *iov; u8 __user *type = NULL; @@ -672,7 +690,7 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) if (!iov) continue; - switch (fl->proto) { + switch (fl6->flowi6_proto) { case IPPROTO_ICMPV6: /* check if one-byte field is readable or not. */ if (iov->iov_base && iov->iov_len < 1) @@ -687,8 +705,8 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) code = iov->iov_base; if (type && code) { - if (get_user(fl->fl_icmp_type, type) || - get_user(fl->fl_icmp_code, code)) + if (get_user(fl6->fl6_icmp_type, type) || + get_user(fl6->fl6_icmp_code, code)) return -EFAULT; probed = 1; } @@ -699,7 +717,7 @@ static int rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) /* check if type field is readable or not. */ if (iov->iov_len > 2 - len) { u8 __user *p = iov->iov_base; - if (get_user(fl->fl_mh_type, &p[2 - len])) + if (get_user(fl6->fl6_mh_type, &p[2 - len])) return -EFAULT; probed = 1; } else @@ -720,18 +738,19 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions opt_space; - struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct in6_addr *daddr, *final_p = NULL, final; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); + struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct raw6_sock *rp = raw6_sk(sk); struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; - struct flowi fl; + struct flowi6 fl6; int addr_len = msg->msg_namelen; int hlimit = -1; int tclass = -1; + int dontfrag = -1; u16 proto; int err; @@ -748,16 +767,16 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, /* * Get and verify the address. */ - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); - fl.mark = sk->sk_mark; + fl6.flowi6_mark = sk->sk_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (sin6->sin6_family && sin6->sin6_family != AF_INET6) - return(-EAFNOSUPPORT); + return -EAFNOSUPPORT; /* port is the proto value [0..255] carried in nexthdr */ proto = ntohs(sin6->sin6_port); @@ -765,19 +784,18 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (!proto) proto = inet->inet_num; else if (proto != inet->inet_num) - return(-EINVAL); + return -EINVAL; if (proto > 255) - return(-EINVAL); + return -EINVAL; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } @@ -786,37 +804,38 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && - ipv6_addr_equal(daddr, &np->daddr)) - daddr = &np->daddr; + ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) + daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && - ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl.oif = sin6->sin6_scope_id; + __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) + fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; proto = inet->inet_num; - daddr = &np->daddr; - fl.fl6_flowlabel = np->flow_label; + daddr = &sk->sk_v6_daddr; + fl6.flowlabel = np->flow_label; } - if (fl.oif == 0) - fl.oif = sk->sk_bound_dev_if; + if (fl6.flowi6_oif == 0) + fl6.flowi6_oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); - err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, + &hlimit, &tclass, &dontfrag); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -829,72 +848,56 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); - fl.proto = proto; - err = rawv6_probe_proto_opt(&fl, msg); + fl6.flowi6_proto = proto; + err = rawv6_probe_proto_opt(&fl6, msg); if (err) goto out; if (!ipv6_addr_any(daddr)) - ipv6_addr_copy(&fl.fl6_dst, daddr); + fl6.daddr = *daddr; else - fl.fl6_dst.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ - if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - - /* merge ip6_build_xmit from ip6_output */ - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + fl6.saddr = np->saddr; - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) - fl.oif = np->mcast_oif; - security_sk_classify_flow(sk, &fl); + final_p = fl6_update_dst(&fl6, opt, &final); - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) - goto out; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); - if (err < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto out; - } + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; } + if (hlimit < 0) + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (tclass < 0) tclass = np->tclass; + if (dontfrag < 0) + dontfrag = np->dontfrag; + if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: - if (inet->hdrincl) { - err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags); - } else { + if (inet->hdrincl) + err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags); + else { lock_sock(sk); err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, - len, 0, hlimit, tclass, opt, &fl, (struct rt6_info*)dst, - msg->msg_flags); + len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, + msg->msg_flags, dontfrag); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) - err = rawv6_push_pending_frames(sk, &fl, rp); + err = rawv6_push_pending_frames(sk, &fl6, rp); release_sock(sk); } done: @@ -963,57 +966,54 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; switch (optname) { - case IPV6_CHECKSUM: - if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && - level == IPPROTO_IPV6) { - /* - * RFC3542 tells that IPV6_CHECKSUM socket - * option in the IPPROTO_IPV6 level is not - * allowed on ICMPv6 sockets. - * If you want to set it, use IPPROTO_RAW - * level IPV6_CHECKSUM socket option - * (Linux extension). - */ - return -EINVAL; - } + case IPV6_CHECKSUM: + if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && + level == IPPROTO_IPV6) { + /* + * RFC3542 tells that IPV6_CHECKSUM socket + * option in the IPPROTO_IPV6 level is not + * allowed on ICMPv6 sockets. + * If you want to set it, use IPPROTO_RAW + * level IPV6_CHECKSUM socket option + * (Linux extension). + */ + return -EINVAL; + } - /* You may get strange result with a positive odd offset; - RFC2292bis agrees with me. */ - if (val > 0 && (val&1)) - return(-EINVAL); - if (val < 0) { - rp->checksum = 0; - } else { - rp->checksum = 1; - rp->offset = val; - } + /* You may get strange result with a positive odd offset; + RFC2292bis agrees with me. */ + if (val > 0 && (val&1)) + return -EINVAL; + if (val < 0) { + rp->checksum = 0; + } else { + rp->checksum = 1; + rp->offset = val; + } - return 0; - break; + return 0; - default: - return(-ENOPROTOOPT); + default: + return -ENOPROTOOPT; } } static int rawv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { - switch(level) { - case SOL_RAW: - break; + switch (level) { + case SOL_RAW: + break; - case SOL_ICMPV6: - if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_seticmpfilter(sk, level, optname, optval, - optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM) - break; - default: - return ipv6_setsockopt(sk, level, optname, optval, - optlen); + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_setsockopt(sk, level, optname, optval, optlen); } return do_rawv6_setsockopt(sk, level, optname, optval, optlen); @@ -1079,21 +1079,19 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, static int rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - switch(level) { - case SOL_RAW: - break; + switch (level) { + case SOL_RAW: + break; - case SOL_ICMPV6: - if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) - return -EOPNOTSUPP; - return rawv6_geticmpfilter(sk, level, optname, optval, - optlen); - case SOL_IPV6: - if (optname == IPV6_CHECKSUM) - break; - default: - return ipv6_getsockopt(sk, level, optname, optval, - optlen); + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, optlen); } return do_rawv6_getsockopt(sk, level, optname, optval, optlen); @@ -1123,35 +1121,51 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) { - switch(cmd) { - case SIOCOUTQ: - { - int amount = sk_wmem_alloc_get(sk); + switch (cmd) { + case SIOCOUTQ: { + int amount = sk_wmem_alloc_get(sk); - return put_user(amount, (int __user *)arg); - } - case SIOCINQ: - { - struct sk_buff *skb; - int amount = 0; - - spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) - amount = skb->tail - skb->transport_header; - spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); - } + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb_tail_pointer(skb) - + skb_transport_header(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } - default: + default: #ifdef CONFIG_IPV6_MROUTE - return ip6mr_ioctl(sk, cmd, (void __user *)arg); + return ip6mr_ioctl(sk, cmd, (void __user *)arg); #else - return -ENOIOCTLCMD; + return -ENOIOCTLCMD; #endif } } +#ifdef CONFIG_COMPAT +static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: + case SIOCINQ: + return -ENOIOCTLCMD; + default: +#ifdef CONFIG_IPV6_MROUTE + return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); +#else + return -ENOIOCTLCMD; +#endif + } +} +#endif + static void rawv6_close(struct sock *sk, long timeout) { if (inet_sk(sk)->inet_num == IPPROTO_RAW) @@ -1185,7 +1199,7 @@ static int rawv6_init_sk(struct sock *sk) default: break; } - return(0); + return 0; } struct proto rawv6_prot = { @@ -1193,7 +1207,7 @@ struct proto rawv6_prot = { .owner = THIS_MODULE, .close = rawv6_close, .destroy = raw6_destroy, - .connect = ip6_datagram_connect, + .connect = ip6_datagram_connect_v6_only, .disconnect = udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, @@ -1210,48 +1224,21 @@ struct proto rawv6_prot = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_rawv6_setsockopt, .compat_getsockopt = compat_rawv6_getsockopt, + .compat_ioctl = compat_rawv6_ioctl, #endif }; #ifdef CONFIG_PROC_FS -static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) -{ - struct ipv6_pinfo *np = inet6_sk(sp); - struct in6_addr *dest, *src; - __u16 destp, srcp; - - dest = &np->daddr; - src = &np->rcv_saddr; - destp = 0; - srcp = inet_sk(sp)->inet_num; - seq_printf(seq, - "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", - i, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], srcp, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), - 0, 0L, 0, - sock_i_uid(sp), 0, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); -} - static int raw6_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) - seq_printf(seq, - " sl " - "local_address " - "remote_address " - "st tx_queue rx_queue tr tm->when retrnsmt" - " uid timeout inode ref pointer drops\n"); - else - raw6_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + struct sock *sp = v; + __u16 srcp = inet_sk(sp)->inet_num; + ip6_dgram_sock_seq_show(seq, v, srcp, 0, + raw_seq_private(seq)->bucket); + } return 0; } @@ -1275,17 +1262,17 @@ static const struct file_operations raw6_seq_fops = { .release = seq_release_net, }; -static int raw6_init_net(struct net *net) +static int __net_init raw6_init_net(struct net *net) { - if (!proc_net_fops_create(net, "raw6", S_IRUGO, &raw6_seq_fops)) + if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops)) return -ENOMEM; return 0; } -static void raw6_exit_net(struct net *net) +static void __net_exit raw6_exit_net(struct net *net) { - proc_net_remove(net, "raw6"); + remove_proc_entry("raw6", net->proc_net); } static struct pernet_operations raw6_net_ops = { @@ -1335,7 +1322,6 @@ static struct inet_protosw rawv6_protosw = { .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, }; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2cddea3bd6b..cc85a9ba501 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -26,6 +26,9 @@ * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ + +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> @@ -41,6 +44,8 @@ #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/sock.h> #include <net/snmp.h> @@ -53,6 +58,7 @@ #include <net/ndisc.h> #include <net/addrconf.h> #include <net/inet_frag.h> +#include <net/inet_ecn.h> struct ip6frag_skb_cb { @@ -62,36 +68,12 @@ struct ip6frag_skb_cb #define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) - -/* - * Equivalent of ipv4 struct ipq - */ - -struct frag_queue -{ - struct inet_frag_queue q; - - __be32 id; /* fragment id */ - u32 user; - struct in6_addr saddr; - struct in6_addr daddr; - - int iif; - unsigned int csum; - __u16 nhoffset; -}; - -static struct inet_frags ip6_frags; - -int ip6_frag_nqueues(struct net *net) +static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { - return net->ipv6.frags.nqueues; + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -int ip6_frag_mem(struct net *net) -{ - return atomic_read(&net->ipv6.frags.mem); -} +static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); @@ -100,64 +82,39 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, * callers should be careful not to use the hash value outside the ipfrag_lock * as doing so could race with ipfrag_hash_rnd being recalculated. */ -unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr, u32 rnd) +static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, + const struct in6_addr *daddr) { - u32 a, b, c; - - a = (__force u32)saddr->s6_addr32[0]; - b = (__force u32)saddr->s6_addr32[1]; - c = (__force u32)saddr->s6_addr32[2]; + u32 c; - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); - - a += (__force u32)saddr->s6_addr32[3]; - b += (__force u32)daddr->s6_addr32[0]; - c += (__force u32)daddr->s6_addr32[1]; - __jhash_mix(a, b, c); - - a += (__force u32)daddr->s6_addr32[2]; - b += (__force u32)daddr->s6_addr32[3]; - c += (__force u32)id; - __jhash_mix(a, b, c); + net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); + c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), + (__force u32)id, ip6_frags.rnd); return c & (INETFRAGS_HASHSZ - 1); } -EXPORT_SYMBOL_GPL(inet6_hash_frag); static unsigned int ip6_hashfn(struct inet_frag_queue *q) { struct frag_queue *fq; fq = container_of(q, struct frag_queue, q); - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr, ip6_frags.rnd); + return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); } -int ip6_frag_match(struct inet_frag_queue *q, void *a) +bool ip6_frag_match(struct inet_frag_queue *q, void *a) { struct frag_queue *fq; struct ip6_create_arg *arg = a; fq = container_of(q, struct frag_queue, q); - return (fq->id == arg->id && fq->user == arg->user && - ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst)); + return fq->id == arg->id && + fq->user == arg->user && + ipv6_addr_equal(&fq->saddr, arg->src) && + ipv6_addr_equal(&fq->daddr, arg->dst); } EXPORT_SYMBOL(ip6_frag_match); -/* Memory Tracking Functions. */ -static inline void frag_kfree_skb(struct netns_frags *nf, - struct sk_buff *skb, int *work) -{ - if (work) - *work -= skb->truesize; - atomic_sub(skb->truesize, &nf->mem); - kfree_skb(skb); -} - void ip6_frag_init(struct inet_frag_queue *q, void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); @@ -165,51 +122,24 @@ void ip6_frag_init(struct inet_frag_queue *q, void *a) fq->id = arg->id; fq->user = arg->user; - ipv6_addr_copy(&fq->saddr, arg->src); - ipv6_addr_copy(&fq->daddr, arg->dst); + fq->saddr = *arg->src; + fq->daddr = *arg->dst; + fq->ecn = arg->ecn; } EXPORT_SYMBOL(ip6_frag_init); -/* Destruction primitives. */ - -static __inline__ void fq_put(struct frag_queue *fq) -{ - inet_frag_put(&fq->q, &ip6_frags); -} - -/* Kill fq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static __inline__ void fq_kill(struct frag_queue *fq) -{ - inet_frag_kill(&fq->q, &ip6_frags); -} - -static void ip6_evictor(struct net *net, struct inet6_dev *idev) -{ - int evicted; - - evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags); - if (evicted) - IP6_ADD_STATS_BH(net, idev, IPSTATS_MIB_REASMFAILS, evicted); -} - -static void ip6_frag_expire(unsigned long data) +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, + struct inet_frags *frags) { - struct frag_queue *fq; struct net_device *dev = NULL; - struct net *net; - - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); spin_lock(&fq->q.lock); if (fq->q.last_in & INET_FRAG_COMPLETE) goto out; - fq_kill(fq); + inet_frag_kill(&fq->q, frags); - net = container_of(fq->q.net, struct net, ipv6.frags); rcu_read_lock(); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) @@ -228,17 +158,29 @@ static void ip6_frag_expire(unsigned long data) pointer directly, device might already disappeared. */ fq->q.fragments->dev = dev; - icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, dev); + icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); out_rcu_unlock: rcu_read_unlock(); out: spin_unlock(&fq->q.lock); - fq_put(fq); + inet_frag_put(&fq->q, frags); +} +EXPORT_SYMBOL(ip6_expire_frag_queue); + +static void ip6_frag_expire(unsigned long data) +{ + struct frag_queue *fq; + struct net *net; + + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + net = container_of(fq->q.net, struct net, ipv6.frags); + + ip6_expire_frag_queue(net, fq, &ip6_frags); } static __inline__ struct frag_queue * -fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst, - struct inet6_dev *idev) +fq_find(struct net *net, __be32 id, const struct in6_addr *src, + const struct in6_addr *dst, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -248,19 +190,17 @@ fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst, arg.user = IP6_DEFRAG_LOCAL_DELIVER; arg.src = src; arg.dst = dst; + arg.ecn = ecn; read_lock(&ip6_frags.lock); - hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd); + hash = inet6_hash_frag(id, src, dst); q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); - if (q == NULL) - goto oom; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct frag_queue, q); - -oom: - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_REASMFAILS); - return NULL; } static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, @@ -270,6 +210,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct net_device *dev; int offset, end; struct net *net = dev_net(skb_dst(skb)->dev); + u8 ecn; if (fq->q.last_in & INET_FRAG_COMPLETE) goto err; @@ -287,6 +228,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, return -1; } + ecn = ip6_frag_ecn(ipv6_hdr(skb)); + if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, @@ -340,6 +283,11 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * in the chain of fragments so far. We must know where to put * this fragment, right? */ + prev = fq->q.fragments_tail; + if (!prev || FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } prev = NULL; for(next = fq->q.fragments; next != NULL; next = next->next) { if (FRAG6_CB(next)->offset >= offset) @@ -347,63 +295,29 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, prev = next; } - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. - */ - if (prev) { - int i = (FRAG6_CB(prev)->offset + prev->len) - offset; - - if (i > 0) { - offset += i; - if (end <= offset) - goto err; - if (!pskb_pull(skb, i)) - goto err; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } - - /* Look for overlap with succeeding segments. - * If we can merge fragments, do it. +found: + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. */ - while (next && FRAG6_CB(next)->offset < end) { - int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */ - - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - if (!pskb_pull(next, i)) - goto err; - FRAG6_CB(next)->offset += i; /* next fragment */ - fq->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - /* Old fragment is completely overridden with - * new one drop it. - */ - next = next->next; + /* Check for overlap with preceding fragment. */ + if (prev && + (FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; - if (prev) - prev->next = next; - else - fq->q.fragments = next; - - fq->q.meat -= free_it->len; - frag_kfree_skb(fq->q.net, free_it, NULL); - } - } + /* Look for overlap with succeeding segment. */ + if (next && FRAG6_CB(next)->offset < end) + goto discard_fq; FRAG6_CB(skb)->offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; + if (!next) + fq->q.fragments_tail = skb; if (prev) prev->next = skb; else @@ -416,7 +330,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; - atomic_add(skb->truesize, &fq->q.net->mem); + fq->ecn |= ecn; + add_frag_mem_limit(&fq->q, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -427,14 +342,22 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, } if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len) - return ip6_frag_reasm(fq, prev, dev); + fq->q.meat == fq->q.len) { + int res; + unsigned long orefdst = skb->_skb_refdst; + + skb->_skb_refdst = 0UL; + res = ip6_frag_reasm(fq, prev, dev); + skb->_skb_refdst = orefdst; + return res; + } - write_lock(&ip6_frags.lock); - list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); - write_unlock(&ip6_frags.lock); + skb_dst_drop(skb); + inet_frag_lru_move(&fq->q); return -1; +discard_fq: + inet_frag_kill(&fq->q, &ip6_frags); err: IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); @@ -458,8 +381,14 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct sk_buff *fp, *head = fq->q.fragments; int payload_len; unsigned int nhoff; + int sum_truesize; + u8 ecn; - fq_kill(fq); + inet_frag_kill(&fq->q, &ip6_frags); + + ecn = ip_frag_ecn_table[fq->ecn]; + if (unlikely(ecn == 0xff)) + goto out_fail; /* Make the one we just received the head. */ if (prev) { @@ -470,12 +399,14 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, goto out_oom; fp->next = head->next; + if (!fp->next) + fq->q.fragments_tail = fp; prev->next = fp; skb_morph(head, fq->q.fragments); head->next = fq->q.fragments->next; - kfree_skb(fq->q.fragments); + consume_skb(fq->q.fragments); fq->q.fragments = head; } @@ -490,13 +421,13 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + if (skb_unclone(head, GFP_ATOMIC)) goto out_oom; /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; @@ -506,14 +437,14 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); - for (i=0; i<skb_shinfo(head)->nr_frags; i++) - plen += skb_shinfo(head)->frags[i].size; + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; head->data_len -= clone->len; head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &fq->q.net->mem); + add_frag_mem_limit(&fq->q, clone->truesize); } /* We have to remove fragment header from datagram and to relocate @@ -525,27 +456,41 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, head->mac_header += sizeof(struct frag_hdr); head->network_header += sizeof(struct frag_hdr); - skb_shinfo(head)->frag_list = head->next; skb_reset_transport_header(head); skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &fq->q.net->mem); - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; + sum_truesize = head->truesize; + for (fp = head->next; fp;) { + bool headstolen; + int delta; + struct sk_buff *next = fp->next; + + sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - atomic_sub(fp->truesize, &fq->q.net->mem); + + if (skb_try_coalesce(head, fp, &headstolen, &delta)) { + kfree_skb_partial(fp, headstolen); + } else { + if (!skb_shinfo(head)->frag_list) + skb_shinfo(head)->frag_list = fp; + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + } + fp = next; } + sub_frag_mem_limit(&fq->q, sum_truesize); head->next = NULL; head->dev = dev; head->tstamp = fq->q.stamp; ipv6_hdr(head)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); IP6CB(head)->nhoff = nhoff; + IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (head->ip_summed == CHECKSUM_COMPLETE) @@ -557,15 +502,14 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; return 1; out_oversize: - if (net_ratelimit()) - printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len); + net_dbg_ratelimited("ip6_frag_reasm: payload len = %d\n", payload_len); goto out_fail; out_oom: - if (net_ratelimit()) - printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); + net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); @@ -577,8 +521,12 @@ static int ipv6_frag_rcv(struct sk_buff *skb) { struct frag_hdr *fhdr; struct frag_queue *fq; - struct ipv6hdr *hdr = ipv6_hdr(skb); + const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); + int evicted; + + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) + goto fail_hdr; IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); @@ -600,14 +548,18 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; return 1; } - if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh) - ip6_evictor(net, ip6_dst_idev(skb_dst(skb))); + evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false); + if (evicted) + IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS, evicted); - if ((fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - ip6_dst_idev(skb_dst(skb)))) != NULL) { + fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, + ip6_frag_ecn(hdr)); + if (fq != NULL) { int ret; spin_lock(&fq->q.lock); @@ -615,7 +567,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); - fq_put(fq); + inet_frag_put(&fq->q, &ip6_frags); return ret; } @@ -672,7 +624,7 @@ static struct ctl_table ip6_frags_ctl_table[] = { { } }; -static int ip6_frags_ns_sysctl_register(struct net *net) +static int __net_init ip6_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -686,9 +638,13 @@ static int ip6_frags_ns_sysctl_register(struct net *net) table[0].data = &net->ipv6.frags.high_thresh; table[1].data = &net->ipv6.frags.low_thresh; table[2].data = &net->ipv6.frags.timeout; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; } - hdr = register_net_sysctl_table(net, net_ipv6_ctl_path, table); + hdr = register_net_sysctl(net, "net/ipv6", table); if (hdr == NULL) goto err_reg; @@ -702,7 +658,7 @@ err_alloc: return -ENOMEM; } -static void ip6_frags_ns_sysctl_unregister(struct net *net) +static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { struct ctl_table *table; @@ -716,7 +672,7 @@ static struct ctl_table_header *ip6_ctl_header; static int ip6_frags_sysctl_register(void) { - ip6_ctl_header = register_net_sysctl_rotable(net_ipv6_ctl_path, + ip6_ctl_header = register_net_sysctl(&init_net, "net/ipv6", ip6_frags_ctl_table); return ip6_ctl_header == NULL ? -ENOMEM : 0; } @@ -745,10 +701,10 @@ static inline void ip6_frags_sysctl_unregister(void) } #endif -static int ipv6_frags_init_net(struct net *net) +static int __net_init ipv6_frags_init_net(struct net *net) { - net->ipv6.frags.high_thresh = 256 * 1024; - net->ipv6.frags.low_thresh = 192 * 1024; + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; inet_frags_init_net(&net->ipv6.frags); @@ -756,7 +712,7 @@ static int ipv6_frags_init_net(struct net *net) return ip6_frags_ns_sysctl_register(net); } -static void ipv6_frags_exit_net(struct net *net) +static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c2bd74c5f8d..f23fbd28a50 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -24,8 +24,11 @@ * Fixed routing subtrees. */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/capability.h> #include <linux/errno.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/times.h> #include <linux/socket.h> @@ -40,6 +43,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> @@ -53,6 +57,7 @@ #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> +#include <net/nexthop.h> #include <asm/uaccess.h> @@ -60,21 +65,18 @@ #include <linux/sysctl.h> #endif -/* Set to 3 to get tracing. */ -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RDBG(x) printk x -#define RT6_TRACE(x...) printk(KERN_DEBUG x) -#else -#define RDBG(x) -#define RT6_TRACE(x...) do { ; } while (0) -#endif - -#define CLONE_OFFLINK_ROUTE 0 +enum rt6_nud_state { + RT6_NUD_FAIL_HARD = -3, + RT6_NUD_FAIL_PROBE = -2, + RT6_NUD_FAIL_DO_RR = -1, + RT6_NUD_SUCCEED = 1 +}; -static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, + const struct in6_addr *dest); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +static unsigned int ip6_default_advmss(const struct dst_entry *dst); +static unsigned int ip6_mtu(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, @@ -82,37 +84,154 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); +static int ip6_pkt_prohibit(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); -static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, - struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex, - unsigned pref); + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned int pref); static struct rt6_info *rt6_get_route_info(struct net *net, - struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex); + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex); #endif +static void rt6_bind_peer(struct rt6_info *rt, int create) +{ + struct inet_peer_base *base; + struct inet_peer *peer; + + base = inetpeer_base_ptr(rt->_rt6i_peer); + if (!base) + return; + + peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); + if (peer) { + if (!rt6_set_peer(rt, peer)) + inet_putpeer(peer); + } +} + +static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) +{ + if (rt6_has_peer(rt)) + return rt6_peer_ptr(rt); + + rt6_bind_peer(rt, create); + return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL); +} + +static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) +{ + return __rt6_get_peer(rt, 1); +} + +static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!(rt->dst.flags & DST_HOST)) + return NULL; + + peer = rt6_get_peer_create(rt); + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer) || + (old & DST_METRICS_FORCE_OVERWRITE)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } + } + return p; +} + +static inline const void *choose_neigh_daddr(struct rt6_info *rt, + struct sk_buff *skb, + const void *daddr) +{ + struct in6_addr *p = &rt->rt6i_gateway; + + if (!ipv6_addr_any(p)) + return (const void *) p; + else if (skb) + return &ipv6_hdr(skb)->daddr; + return daddr; +} + +static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + struct neighbour *n; + + daddr = choose_neigh_daddr(rt, skb, daddr); + n = __ipv6_neigh_lookup(dst->dev, daddr); + if (n) + return n; + return neigh_create(&nd_tbl, daddr, dst->dev); +} + static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .protocol = cpu_to_be16(ETH_P_IPV6), .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, + .default_advmss = ip6_default_advmss, + .mtu = ip6_mtu, + .cow_metrics = ipv6_cow_metrics, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, + .redirect = rt6_do_redirect, .local_out = __ip6_local_out, - .entries = ATOMIC_INIT(0), + .neigh_lookup = ip6_neigh_lookup, }; -static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst->dev->mtu; +} + +static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} + +static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} + +static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) { + return NULL; } static struct dst_ops ip6_dst_blackhole_ops = { @@ -120,21 +239,26 @@ static struct dst_ops ip6_dst_blackhole_ops = { .protocol = cpu_to_be16(ETH_P_IPV6), .destroy = ip6_dst_destroy, .check = ip6_dst_check, + .mtu = ip6_blackhole_mtu, + .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, - .entries = ATOMIC_INIT(0), + .redirect = ip6_rt_blackhole_redirect, + .cow_metrics = ip6_rt_blackhole_cow_metrics, + .neigh_lookup = ip6_neigh_lookup, }; -static struct rt6_info ip6_null_entry_template = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -ENETUNREACH, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = ip6_pkt_discard, - .output = ip6_pkt_discard_out, - } +static const u32 ip6_template_metrics[RTAX_MAX] = { + [RTAX_HOPLIMIT - 1] = 0, +}; + +static const struct rt6_info ip6_null_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -ENETUNREACH, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -144,20 +268,14 @@ static struct rt6_info ip6_null_entry_template = { #ifdef CONFIG_IPV6_MULTIPLE_TABLES -static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sk_buff *skb); - -static struct rt6_info ip6_prohibit_entry_template = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -EACCES, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = ip6_pkt_prohibit, - .output = ip6_pkt_prohibit_out, - } +static const struct rt6_info ip6_prohibit_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EACCES, + .input = ip6_pkt_prohibit, + .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -165,17 +283,14 @@ static struct rt6_info ip6_prohibit_entry_template = { .rt6i_ref = ATOMIC_INIT(1), }; -static struct rt6_info ip6_blk_hole_entry_template = { - .u = { - .dst = { - .__refcnt = ATOMIC_INIT(1), - .__use = 1, - .obsolete = -1, - .error = -EINVAL, - .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, - .input = dst_discard, - .output = dst_discard, - } +static const struct rt6_info ip6_blk_hole_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EINVAL, + .input = dst_discard, + .output = dst_discard_sk, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -186,20 +301,46 @@ static struct rt6_info ip6_blk_hole_entry_template = { #endif /* allocate dst with ip6_dst_ops */ -static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops) +static inline struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) { - return (struct rt6_info *)dst_alloc(ops); + struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, + 0, DST_OBSOLETE_FORCE_CHK, flags); + + if (rt) { + struct dst_entry *dst = &rt->dst; + + memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); + rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); + rt->rt6i_genid = rt_genid_ipv6(net); + INIT_LIST_HEAD(&rt->rt6i_siblings); + } + return rt; } static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; + struct dst_entry *from = dst->from; - if (idev != NULL) { + if (!(rt->dst.flags & DST_HOST)) + dst_destroy_metrics_generic(dst); + + if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); } + + dst->from = NULL; + dst_release(from); + + if (rt6_has_peer(rt)) { + struct inet_peer *peer = rt6_peer_ptr(rt); + inet_putpeer(peer); + } } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -210,26 +351,86 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, struct net_device *loopback_dev = dev_net(dev)->loopback_dev; - if (dev != loopback_dev && idev != NULL && idev->dev == dev) { - struct inet6_dev *loopback_idev = - in6_dev_get(loopback_dev); - if (loopback_idev != NULL) { - rt->rt6i_idev = loopback_idev; - in6_dev_put(idev); + if (dev != loopback_dev) { + if (idev && idev->dev == dev) { + struct inet6_dev *loopback_idev = + in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); + } } } } -static __inline__ int rt6_check_expired(const struct rt6_info *rt) +static bool rt6_check_expired(const struct rt6_info *rt) { - return (rt->rt6i_flags & RTF_EXPIRES && - time_after(jiffies, rt->rt6i_expires)); + if (rt->rt6i_flags & RTF_EXPIRES) { + if (time_after(jiffies, rt->dst.expires)) + return true; + } else if (rt->dst.from) { + return rt6_check_expired((struct rt6_info *) rt->dst.from); + } + return false; +} + +/* Multipath route selection: + * Hash based function using packet header and flowlabel. + * Adapted from fib_info_hashfn() + */ +static int rt6_info_hash_nhsfn(unsigned int candidate_count, + const struct flowi6 *fl6) +{ + unsigned int val = fl6->flowi6_proto; + + val ^= ipv6_addr_hash(&fl6->daddr); + val ^= ipv6_addr_hash(&fl6->saddr); + + /* Work only if this not encapsulated */ + switch (fl6->flowi6_proto) { + case IPPROTO_UDP: + case IPPROTO_TCP: + case IPPROTO_SCTP: + val ^= (__force u16)fl6->fl6_sport; + val ^= (__force u16)fl6->fl6_dport; + break; + + case IPPROTO_ICMPV6: + val ^= (__force u16)fl6->fl6_icmp_type; + val ^= (__force u16)fl6->fl6_icmp_code; + break; + } + /* RFC6438 recommands to use flowlabel */ + val ^= (__force u32)fl6->flowlabel; + + /* Perhaps, we need to tune, this function? */ + val = val ^ (val >> 7) ^ (val >> 12); + return val % candidate_count; } -static inline int rt6_need_strict(struct in6_addr *daddr) +static struct rt6_info *rt6_multipath_select(struct rt6_info *match, + struct flowi6 *fl6, int oif, + int strict) { - return (ipv6_addr_type(daddr) & - (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)); + struct rt6_info *sibling, *next_sibling; + int route_choosen; + + route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6); + /* Don't change the route, if route_choosen == 0 + * (siblings does not include ourself) + */ + if (route_choosen) + list_for_each_entry_safe(sibling, next_sibling, + &match->rt6i_siblings, rt6i_siblings) { + route_choosen--; + if (route_choosen == 0) { + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } + } + return match; } /* @@ -238,7 +439,7 @@ static inline int rt6_need_strict(struct in6_addr *daddr) static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *rt, - struct in6_addr *saddr, + const struct in6_addr *saddr, int oif, int flags) { @@ -248,14 +449,14 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) { - struct net_device *dev = sprt->rt6i_dev; + for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + struct net_device *dev = sprt->dst.dev; if (oif) { if (dev->ifindex == oif) return sprt; if (dev->flags & IFF_LOOPBACK) { - if (sprt->rt6i_idev == NULL || + if (!sprt->rt6i_idev || sprt->rt6i_idev->dev->ifindex != oif) { if (flags & RT6_LOOKUP_F_IFACE && oif) continue; @@ -284,9 +485,27 @@ out: } #ifdef CONFIG_IPV6_ROUTER_PREF +struct __rt6_probe_work { + struct work_struct work; + struct in6_addr target; + struct net_device *dev; +}; + +static void rt6_probe_deferred(struct work_struct *w) +{ + struct in6_addr mcaddr; + struct __rt6_probe_work *work = + container_of(w, struct __rt6_probe_work, work); + + addrconf_addr_solict_mult(&work->target, &mcaddr); + ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL); + dev_put(work->dev); + kfree(w); +} + static void rt6_probe(struct rt6_info *rt) { - struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL; + struct neighbour *neigh; /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it @@ -295,27 +514,44 @@ static void rt6_probe(struct rt6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ - if (!neigh || (neigh->nud_state & NUD_VALID)) + if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) return; - read_lock_bh(&neigh->lock); - if (!(neigh->nud_state & NUD_VALID) && + rcu_read_lock_bh(); + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) { + write_lock(&neigh->lock); + if (neigh->nud_state & NUD_VALID) + goto out; + } + + if (!neigh || time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { - struct in6_addr mcaddr; - struct in6_addr *target; + struct __rt6_probe_work *work; - neigh->updated = jiffies; - read_unlock_bh(&neigh->lock); + work = kmalloc(sizeof(*work), GFP_ATOMIC); - target = (struct in6_addr *)&neigh->primary_key; - addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL); - } else - read_unlock_bh(&neigh->lock); + if (neigh && work) + __neigh_set_probe_once(neigh); + + if (neigh) + write_unlock(&neigh->lock); + + if (work) { + INIT_WORK(&work->work, rt6_probe_deferred); + work->target = rt->rt6i_gateway; + dev_hold(rt->dst.dev); + work->dev = rt->dst.dev; + schedule_work(&work->work); + } + } else { +out: + write_unlock(&neigh->lock); + } + rcu_read_unlock_bh(); } #else static inline void rt6_probe(struct rt6_info *rt) { - return; } #endif @@ -324,7 +560,7 @@ static inline void rt6_probe(struct rt6_info *rt) */ static inline int rt6_check_dev(struct rt6_info *rt, int oif) { - struct net_device *dev = rt->rt6i_dev; + struct net_device *dev = rt->dst.dev; if (!oif || dev->ifindex == oif) return 2; if ((dev->flags & IFF_LOOPBACK) && @@ -333,85 +569,102 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) return 0; } -static inline int rt6_check_neigh(struct rt6_info *rt) +static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) { - struct neighbour *neigh = rt->rt6i_nexthop; - int m; + struct neighbour *neigh; + enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; + if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) - m = 1; - else if (neigh) { - read_lock_bh(&neigh->lock); + return RT6_NUD_SUCCEED; + + rcu_read_lock_bh(); + neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + if (neigh) { + read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) - m = 2; + ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF - else if (neigh->nud_state & NUD_FAILED) - m = 0; -#endif + else if (!(neigh->nud_state & NUD_FAILED)) + ret = RT6_NUD_SUCCEED; else - m = 1; - read_unlock_bh(&neigh->lock); - } else - m = 0; - return m; + ret = RT6_NUD_FAIL_PROBE; +#endif + read_unlock(&neigh->lock); + } else { + ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? + RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; + } + rcu_read_unlock_bh(); + + return ret; } static int rt6_score_route(struct rt6_info *rt, int oif, int strict) { - int m, n; + int m; m = rt6_check_dev(rt, oif); if (!m && (strict & RT6_LOOKUP_F_IFACE)) - return -1; + return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; #endif - n = rt6_check_neigh(rt); - if (!n && (strict & RT6_LOOKUP_F_REACHABLE)) - return -1; + if (strict & RT6_LOOKUP_F_REACHABLE) { + int n = rt6_check_neigh(rt); + if (n < 0) + return n; + } return m; } static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, - int *mpri, struct rt6_info *match) + int *mpri, struct rt6_info *match, + bool *do_rr) { int m; + bool match_do_rr = false; if (rt6_check_expired(rt)) goto out; m = rt6_score_route(rt, oif, strict); - if (m < 0) + if (m == RT6_NUD_FAIL_DO_RR) { + match_do_rr = true; + m = 0; /* lowest valid score */ + } else if (m == RT6_NUD_FAIL_HARD) { goto out; + } + + if (strict & RT6_LOOKUP_F_REACHABLE) + rt6_probe(rt); + /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { - if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(match); + *do_rr = match_do_rr; *mpri = m; match = rt; - } else if (strict & RT6_LOOKUP_F_REACHABLE) { - rt6_probe(rt); } - out: return match; } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, struct rt6_info *rr_head, - u32 metric, int oif, int strict) + u32 metric, int oif, int strict, + bool *do_rr) { struct rt6_info *rt, *match; int mpri = -1; match = NULL; for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->u.dst.rt6_next) - match = find_match(rt, oif, strict, &mpri, match); + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match, do_rr); for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->u.dst.rt6_next) - match = find_match(rt, oif, strict, &mpri, match); + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; } @@ -420,19 +673,17 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) { struct rt6_info *match, *rt0; struct net *net; - - RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n", - __func__, fn->leaf, oif); + bool do_rr = false; rt0 = fn->rr_ptr; if (!rt0) fn->rr_ptr = rt0 = fn->leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); + match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + &do_rr); - if (!match && - (strict & RT6_LOOKUP_F_REACHABLE)) { - struct rt6_info *next = rt0->u.dst.rt6_next; + if (do_rr) { + struct rt6_info *next = rt0->dst.rt6_next; /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) @@ -442,16 +693,13 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) fn->rr_ptr = next; } - RT6_TRACE("%s() => %p\n", - __func__, match); - - net = dev_net(rt0->rt6i_dev); - return (match ? match : net->ipv6.ip6_null_entry); + net = dev_net(rt0->dst.dev); + return match ? match : net->ipv6.ip6_null_entry; } #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, - struct in6_addr *gwaddr) + const struct in6_addr *gwaddr) { struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; @@ -495,8 +743,11 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, prefix = &prefix_buf; } - rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, - dev->ifindex); + if (rinfo->prefix_len == 0) + rt = rt6_get_dflt_router(gwaddr, dev); + else + rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, + gwaddr, dev->ifindex); if (rt && !lifetime) { ip6_del_rt(rt); @@ -511,13 +762,12 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (!addrconf_finite_timeout(lifetime)) { - rt->rt6i_flags &= ~RTF_EXPIRES; - } else { - rt->rt6i_expires = jiffies + HZ * lifetime; - rt->rt6i_flags |= RTF_EXPIRES; - } - dst_release(&rt->u.dst); + if (!addrconf_finite_timeout(lifetime)) + rt6_clean_expires(rt); + else + rt6_set_expires(rt, jiffies + HZ * lifetime); + + ip6_rt_put(rt); } return 0; } @@ -539,48 +789,53 @@ do { \ goto restart; \ } \ } \ -} while(0) +} while (0) static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, - struct flowi *fl, int flags) + struct flowi6 *fl6, int flags) { struct fib6_node *fn; struct rt6_info *rt; read_lock_bh(&table->tb6_lock); - fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: rt = fn->leaf; - rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags); - BACKTRACK(net, &fl->fl6_src); + rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + BACKTRACK(net, &fl6->saddr); out: - dst_use(&rt->u.dst, jiffies); + dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); return rt; } +struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6, + int flags) +{ + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); +} +EXPORT_SYMBOL_GPL(ip6_route_lookup); + struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int strict) { - struct flowi fl = { - .oif = oif, - .nl_u = { - .ip6_u = { - .daddr = *daddr, - }, - }, + struct flowi6 fl6 = { + .flowi6_oif = oif, + .daddr = *daddr, }; struct dst_entry *dst; int flags = strict ? RT6_LOOKUP_F_IFACE : 0; if (saddr) { - memcpy(&fl.fl6_src, saddr, sizeof(*saddr)); + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); flags |= RT6_LOOKUP_F_HAS_SADDR; } - dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup); + dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); if (dst->error == 0) return (struct rt6_info *) dst; @@ -597,14 +852,15 @@ EXPORT_SYMBOL(rt6_lookup); be destroyed. */ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, + struct nlattr *mx, int mx_len) { int err; struct fib6_table *table; table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info); + err = fib6_add(&table->tb6_root, rt, info, mx, mx_len); write_unlock_bh(&table->tb6_lock); return err; @@ -613,13 +869,14 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) int ip6_ins_rt(struct rt6_info *rt) { struct nl_info info = { - .nl_net = dev_net(rt->rt6i_dev), + .nl_net = dev_net(rt->dst.dev), }; - return __ip6_ins_rt(rt, &info); + return __ip6_ins_rt(rt, &info, NULL, 0); } -static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr, - struct in6_addr *saddr) +static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_info *rt; @@ -627,81 +884,38 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad * Clone the route. */ - rt = ip6_rt_copy(ort); + rt = ip6_rt_copy(ort, daddr); if (rt) { - struct neighbour *neigh; - int attempts = !in_softirq(); - - if (!(rt->rt6i_flags&RTF_GATEWAY)) { - if (rt->rt6i_dst.plen != 128 && - ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)) - rt->rt6i_flags |= RTF_ANYCAST; - ipv6_addr_copy(&rt->rt6i_gateway, daddr); - } + if (ort->rt6i_dst.plen != 128 && + ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; - ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); - rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; - rt->u.dst.flags |= DST_HOST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { - ipv6_addr_copy(&rt->rt6i_src.addr, saddr); + rt->rt6i_src.addr = *saddr; rt->rt6i_src.plen = 128; } #endif - - retry: - neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - if (IS_ERR(neigh)) { - struct net *net = dev_net(rt->rt6i_dev); - int saved_rt_min_interval = - net->ipv6.sysctl.ip6_rt_gc_min_interval; - int saved_rt_elasticity = - net->ipv6.sysctl.ip6_rt_gc_elasticity; - - if (attempts-- > 0) { - net->ipv6.sysctl.ip6_rt_gc_elasticity = 1; - net->ipv6.sysctl.ip6_rt_gc_min_interval = 0; - - ip6_dst_gc(&net->ipv6.ip6_dst_ops); - - net->ipv6.sysctl.ip6_rt_gc_elasticity = - saved_rt_elasticity; - net->ipv6.sysctl.ip6_rt_gc_min_interval = - saved_rt_min_interval; - goto retry; - } - - if (net_ratelimit()) - printk(KERN_WARNING - "Neighbour table overflow.\n"); - dst_free(&rt->u.dst); - return NULL; - } - rt->rt6i_nexthop = neigh; - } return rt; } -static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr) +static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, + const struct in6_addr *daddr) { - struct rt6_info *rt = ip6_rt_copy(ort); - if (rt) { - ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); - rt->rt6i_dst.plen = 128; + struct rt6_info *rt = ip6_rt_copy(ort, daddr); + + if (rt) rt->rt6i_flags |= RTF_CACHE; - rt->u.dst.flags |= DST_HOST; - rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop); - } return rt; } static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, - struct flowi *fl, int flags) + struct flowi6 *fl6, int flags) { struct fib6_node *fn; struct rt6_info *rt, *nrt; @@ -716,33 +930,31 @@ relookup: read_lock_bh(&table->tb6_lock); restart_2: - fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: rt = rt6_select(fn, oif, strict | reachable); - - BACKTRACK(net, &fl->fl6_src); + if (rt->rt6i_nsiblings) + rt = rt6_multipath_select(rt, fl6, oif, strict | reachable); + BACKTRACK(net, &fl6->saddr); if (rt == net->ipv6.ip6_null_entry || rt->rt6i_flags & RTF_CACHE) goto out; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); - if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) - nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src); - else { -#if CLONE_OFFLINK_ROUTE - nrt = rt6_alloc_clone(rt, &fl->fl6_dst); -#else + if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) + nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); + else if (!(rt->dst.flags & DST_HOST)) + nrt = rt6_alloc_clone(rt, &fl6->daddr); + else goto out2; -#endif - } - dst_release(&rt->u.dst); + ip6_rt_put(rt); rt = nrt ? : net->ipv6.ip6_null_entry; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); if (nrt) { err = ip6_ins_rt(nrt); if (!err) @@ -756,7 +968,7 @@ restart: * Race condition! In the gap, when table->tb6_lock was * released someone could insert this route. Relookup. */ - dst_release(&rt->u.dst); + ip6_rt_put(rt); goto relookup; out: @@ -764,102 +976,100 @@ out: reachable = 0; goto restart_2; } - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); out2: - rt->u.dst.lastuse = jiffies; - rt->u.dst.__use++; + rt->dst.lastuse = jiffies; + rt->dst.__use++; return rt; } static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, - struct flowi *fl, int flags) + struct flowi6 *fl6, int flags) { - return ip6_pol_route(net, table, fl->iif, fl, flags); + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); +} + +static struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) +{ + if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) + flags |= RT6_LOOKUP_F_IFACE; + + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } void ip6_route_input(struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR; - struct flowi fl = { - .iif = skb->dev->ifindex, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK, - }, - }, - .mark = skb->mark, - .proto = iph->nexthdr, + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_mark = skb->mark, + .flowi6_proto = iph->nexthdr, }; - if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG) - flags |= RT6_LOOKUP_F_IFACE; - - skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input)); + skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, - struct flowi *fl, int flags) + struct flowi6 *fl6, int flags) { - return ip6_pol_route(net, table, fl->oif, fl, flags); + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct net *net, struct sock *sk, - struct flowi *fl) +struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, + struct flowi6 *fl6) { int flags = 0; - if (rt6_need_strict(&fl->fl6_dst)) + fl6->flowi6_iif = LOOPBACK_IFINDEX; + + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) flags |= RT6_LOOKUP_F_IFACE; - if (!ipv6_addr_any(&fl->fl6_src)) + if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - else if (sk) { - unsigned int prefs = inet6_sk(sk)->srcprefs; - if (prefs & IPV6_PREFER_SRC_TMP) - flags |= RT6_LOOKUP_F_SRCPREF_TMP; - if (prefs & IPV6_PREFER_SRC_PUBLIC) - flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC; - if (prefs & IPV6_PREFER_SRC_COA) - flags |= RT6_LOOKUP_F_SRCPREF_COA; - } + else if (sk) + flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); - return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output); + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); } EXPORT_SYMBOL(ip6_route_output); -int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl) +struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rt6_info *ort = (struct rt6_info *) *dstp; - struct rt6_info *rt = (struct rt6_info *) - dst_alloc(&ip6_dst_blackhole_ops); + struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; struct dst_entry *new = NULL; + rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); if (rt) { - new = &rt->u.dst; + new = &rt->dst; + + memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); + rt6_init_peer(rt, net->ipv6.peers); - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); - new->dev = ort->u.dst.dev; - if (new->dev) - dev_hold(new->dev); + if (dst_metrics_read_only(&ort->dst)) + new->_metrics = ort->dst._metrics; + else + dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->rt6i_expires = 0; - ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); - rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -870,11 +1080,9 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl dst_free(new); } - dst_release(*dstp); - *dstp = new; - return (new ? 0 : -ENOMEM); + dst_release(dst_orig); + return new ? new : ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(ip6_dst_blackhole); /* * Destination cache support functions @@ -886,10 +1094,20 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt = (struct rt6_info *) dst; - if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) - return dst; + /* All IPV6 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + */ + if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev))) + return NULL; - return NULL; + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return dst; } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -897,49 +1115,209 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) struct rt6_info *rt = (struct rt6_info *) dst; if (rt) { - if (rt->rt6i_flags & RTF_CACHE) - ip6_del_rt(rt); - else + if (rt->rt6i_flags & RTF_CACHE) { + if (rt6_check_expired(rt)) { + ip6_del_rt(rt); + dst = NULL; + } + } else { dst_release(dst); + dst = NULL; + } } - return NULL; + return dst; } static void ip6_link_failure(struct sk_buff *skb) { struct rt6_info *rt; - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); rt = (struct rt6_info *) skb_dst(skb); if (rt) { - if (rt->rt6i_flags&RTF_CACHE) { - dst_set_expires(&rt->u.dst, 0); - rt->rt6i_flags |= RTF_EXPIRES; - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + if (rt->rt6i_flags & RTF_CACHE) { + dst_hold(&rt->dst); + if (ip6_del_rt(rt)) + dst_free(&rt->dst); + } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; + } } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct rt6_info *rt6 = (struct rt6_info*)dst; + dst_confirm(dst); if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { + struct net *net = dev_net(dst->dev); + rt6->rt6i_flags |= RTF_MODIFIED; if (mtu < IPV6_MIN_MTU) { + u32 features = dst_metric(dst, RTAX_FEATURES); mtu = IPV6_MIN_MTU; - dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(dst, RTAX_FEATURES, features); } - dst->metrics[RTAX_MTU-1] = mtu; - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + dst_metric_set(dst, RTAX_MTU, mtu); + rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); } } -static int ipv6_get_mtu(struct net_device *dev); +void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, + int oif, u32 mark) +{ + const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); + fl6.daddr = iph->daddr; + fl6.saddr = iph->saddr; + fl6.flowlabel = ip6_flowinfo(iph); + + dst = ip6_route_output(net, NULL, &fl6); + if (!dst->error) + ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); + dst_release(dst); +} +EXPORT_SYMBOL_GPL(ip6_update_pmtu); + +void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) +{ + ip6_update_pmtu(skb, sock_net(sk), mtu, + sk->sk_bound_dev_if, sk->sk_mark); +} +EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); + +/* Handle redirects */ +struct ip6rd_flowi { + struct flowi6 fl6; + struct in6_addr gateway; +}; + +static struct rt6_info *__ip6_route_redirect(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + int flags) +{ + struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; + struct rt6_info *rt; + struct fib6_node *fn; + + /* Get the "current" route for this destination and + * check if the redirect has come from approriate router. + * + * RFC 4861 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way the routes are chosen, this notion + * is a bit fuzzy and one might need to check all possible + * routes. + */ -static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu) + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt6_check_expired(rt)) + continue; + if (rt->dst.error) + break; + if (!(rt->rt6i_flags & RTF_GATEWAY)) + continue; + if (fl6->flowi6_oif != rt->dst.dev->ifindex) + continue; + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + continue; + break; + } + + if (!rt) + rt = net->ipv6.ip6_null_entry; + else if (rt->dst.error) { + rt = net->ipv6.ip6_null_entry; + goto out; + } + BACKTRACK(net, &fl6->saddr); +out: + dst_hold(&rt->dst); + + read_unlock_bh(&table->tb6_lock); + + return rt; +}; + +static struct dst_entry *ip6_route_redirect(struct net *net, + const struct flowi6 *fl6, + const struct in6_addr *gateway) { + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip6rd_flowi rdfl; + + rdfl.fl6 = *fl6; + rdfl.gateway = *gateway; + + return fib6_rule_lookup(net, &rdfl.fl6, + flags, __ip6_route_redirect); +} + +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) +{ + const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.daddr = iph->daddr; + fl6.saddr = iph->saddr; + fl6.flowlabel = ip6_flowinfo(iph); + + dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + rt6_do_redirect(dst, NULL, skb); + dst_release(dst); +} +EXPORT_SYMBOL_GPL(ip6_redirect); + +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, + u32 mark) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_iif = LOOPBACK_IFINDEX; + fl6.flowi6_oif = oif; + fl6.flowi6_mark = mark; + fl6.daddr = msg->dest; + fl6.saddr = iph->daddr; + + dst = ip6_route_redirect(net, &fl6, &iph->saddr); + rt6_do_redirect(dst, NULL, skb); + dst_release(dst); +} + +void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark); +} +EXPORT_SYMBOL_GPL(ip6_sk_redirect); + +static unsigned int ip6_default_advmss(const struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + unsigned int mtu = dst_mtu(dst); + struct net *net = dev_net(dev); + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) @@ -956,70 +1334,74 @@ static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu) return mtu; } +static unsigned int ip6_mtu(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + if (mtu) + goto out; + + mtu = IPV6_MIN_MTU; + + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + +out: + return min_t(unsigned int, mtu, IP6_MAX_MTU); +} + static struct dst_entry *icmp6_dst_gc_list; static DEFINE_SPINLOCK(icmp6_dst_lock); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, - struct neighbour *neigh, - const struct in6_addr *addr) + struct flowi6 *fl6) { + struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); - if (unlikely(idev == NULL)) - return NULL; + if (unlikely(!idev)) + return ERR_PTR(-ENODEV); - rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops); - if (unlikely(rt == NULL)) { + rt = ip6_dst_alloc(net, dev, 0, NULL); + if (unlikely(!rt)) { in6_dev_put(idev); + dst = ERR_PTR(-ENOMEM); goto out; } - dev_hold(dev); - if (neigh) - neigh_hold(neigh); - else { - neigh = ndisc_get_neigh(dev, addr); - if (IS_ERR(neigh)) - neigh = NULL; - } - - rt->rt6i_dev = dev; - rt->rt6i_idev = idev; - rt->rt6i_nexthop = neigh; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst)); - rt->u.dst.output = ip6_output; - -#if 0 /* there's no chance to use these for ndisc */ - rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST - ? DST_HOST - : 0; - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->dst.flags |= DST_HOST; + rt->dst.output = ip6_output; + atomic_set(&rt->dst.__refcnt, 1); + rt->rt6i_gateway = fl6->daddr; + rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; -#endif + rt->rt6i_idev = idev; + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); spin_lock_bh(&icmp6_dst_lock); - rt->u.dst.next = icmp6_dst_gc_list; - icmp6_dst_gc_list = &rt->u.dst; + rt->dst.next = icmp6_dst_gc_list; + icmp6_dst_gc_list = &rt->dst; spin_unlock_bh(&icmp6_dst_lock); fib6_force_start_gc(net); + dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); + out: - return &rt->u.dst; + return dst; } int icmp6_dst_gc(void) { - struct dst_entry *dst, *next, **pprev; + struct dst_entry *dst, **pprev; int more = 0; - next = NULL; - spin_lock_bh(&icmp6_dst_lock); pprev = &icmp6_dst_gc_list; @@ -1059,60 +1441,27 @@ static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), static int ip6_dst_gc(struct dst_ops *ops) { - unsigned long now = jiffies; struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + int entries; - if (time_after(rt_last_gc + rt_min_interval, now) && - atomic_read(&ops->entries) <= rt_max_size) + entries = dst_entries_get_fast(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && + entries <= rt_max_size) goto out; net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net); - net->ipv6.ip6_rt_last_gc = now; - if (atomic_read(&ops->entries) < ops->gc_thresh) + fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + entries = dst_entries_get_slow(ops); + if (entries < ops->gc_thresh) net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; out: net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; - return (atomic_read(&ops->entries) > rt_max_size); -} - -/* Clean host part of a prefix. Not necessary in radix tree, - but results in cleaner routing tables. - - Remove it only when all the things will work! - */ - -static int ipv6_get_mtu(struct net_device *dev) -{ - int mtu = IPV6_MIN_MTU; - struct inet6_dev *idev; - - idev = in6_dev_get(dev); - if (idev) { - mtu = idev->cnf.mtu6; - in6_dev_put(idev); - } - return mtu; -} - -int ip6_dst_hoplimit(struct dst_entry *dst) -{ - int hoplimit = dst_metric(dst, RTAX_HOPLIMIT); - if (hoplimit < 0) { - struct net_device *dev = dst->dev; - struct inet6_dev *idev = in6_dev_get(dev); - if (idev) { - hoplimit = idev->cnf.hop_limit; - in6_dev_put(idev); - } else - hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; - } - return hoplimit; + return entries > rt_max_size; } /* @@ -1148,23 +1497,33 @@ int ip6_route_add(struct fib6_config *cfg) if (cfg->fc_metric == 0) cfg->fc_metric = IP6_RT_PRIO_USER; - table = fib6_new_table(net, cfg->fc_table); - if (table == NULL) { - err = -ENOBUFS; - goto out; + err = -ENOBUFS; + if (cfg->fc_nlinfo.nlh && + !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { + table = fib6_get_table(net, cfg->fc_table); + if (!table) { + pr_warn("NLM_F_CREATE should be specified when creating new route\n"); + table = fib6_new_table(net, cfg->fc_table); + } + } else { + table = fib6_new_table(net, cfg->fc_table); } - rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops); + if (!table) + goto out; + + rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table); - if (rt == NULL) { + if (!rt) { err = -ENOMEM; goto out; } - rt->u.dst.obsolete = -1; - rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ? - jiffies + clock_t_to_jiffies(cfg->fc_expires) : - 0; + if (cfg->fc_flags & RTF_EXPIRES) + rt6_set_expires(rt, jiffies + + clock_t_to_jiffies(cfg->fc_expires)); + else + rt6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -1173,16 +1532,20 @@ int ip6_route_add(struct fib6_config *cfg) addr_type = ipv6_addr_type(&cfg->fc_dst); if (addr_type & IPV6_ADDR_MULTICAST) - rt->u.dst.input = ip6_mc_input; + rt->dst.input = ip6_mc_input; + else if (cfg->fc_flags & RTF_LOCAL) + rt->dst.input = ip6_input; else - rt->u.dst.input = ip6_forward; + rt->dst.input = ip6_forward; - rt->u.dst.output = ip6_output; + rt->dst.output = ip6_output; ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) - rt->u.dst.flags = DST_HOST; + if (rt->rt6i_dst.plen == 128) { + rt->dst.flags |= DST_HOST; + dst_metrics_set_force_overwrite(&rt->dst); + } #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -1195,7 +1558,9 @@ int ip6_route_add(struct fib6_config *cfg) they would result in kernel looping; promote them to reject routes */ if ((cfg->fc_flags & RTF_REJECT) || - (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + (dev && (dev->flags & IFF_LOOPBACK) && + !(addr_type & IPV6_ADDR_LOOPBACK) && + !(cfg->fc_flags & RTF_LOCAL))) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { @@ -1210,19 +1575,35 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } } - rt->u.dst.output = ip6_pkt_discard_out; - rt->u.dst.input = ip6_pkt_discard; - rt->u.dst.error = -ENETUNREACH; rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + switch (cfg->fc_type) { + case RTN_BLACKHOLE: + rt->dst.error = -EINVAL; + rt->dst.output = dst_discard_sk; + rt->dst.input = dst_discard; + break; + case RTN_PROHIBIT: + rt->dst.error = -EACCES; + rt->dst.output = ip6_pkt_prohibit_out; + rt->dst.input = ip6_pkt_prohibit; + break; + case RTN_THROW: + default: + rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN + : -ENETUNREACH; + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + break; + } goto install_route; } if (cfg->fc_flags & RTF_GATEWAY) { - struct in6_addr *gw_addr; + const struct in6_addr *gw_addr; int gwa_type; gw_addr = &cfg->fc_gateway; - ipv6_addr_copy(&rt->rt6i_gateway, gw_addr); + rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { @@ -1236,84 +1617,61 @@ int ip6_route_add(struct fib6_config *cfg) some exceptions. --ANK */ err = -EINVAL; - if (!(gwa_type&IPV6_ADDR_UNICAST)) + if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); err = -EHOSTUNREACH; - if (grt == NULL) + if (!grt) goto out; if (dev) { - if (dev != grt->rt6i_dev) { - dst_release(&grt->u.dst); + if (dev != grt->dst.dev) { + ip6_rt_put(grt); goto out; } } else { - dev = grt->rt6i_dev; + dev = grt->dst.dev; idev = grt->rt6i_idev; dev_hold(dev); in6_dev_hold(grt->rt6i_idev); } - if (!(grt->rt6i_flags&RTF_GATEWAY)) + if (!(grt->rt6i_flags & RTF_GATEWAY)) err = 0; - dst_release(&grt->u.dst); + ip6_rt_put(grt); if (err) goto out; } err = -EINVAL; - if (dev == NULL || (dev->flags&IFF_LOOPBACK)) + if (!dev || (dev->flags & IFF_LOOPBACK)) goto out; } err = -ENODEV; - if (dev == NULL) + if (!dev) goto out; - if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { - rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); - if (IS_ERR(rt->rt6i_nexthop)) { - err = PTR_ERR(rt->rt6i_nexthop); - rt->rt6i_nexthop = NULL; + if (!ipv6_addr_any(&cfg->fc_prefsrc)) { + if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { + err = -EINVAL; goto out; } - } + rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; + rt->rt6i_prefsrc.plen = 128; + } else + rt->rt6i_prefsrc.plen = 0; rt->rt6i_flags = cfg->fc_flags; install_route: - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - if (type > RTAX_MAX) { - err = -EINVAL; - goto out; - } - - rt->u.dst.metrics[type - 1] = nla_get_u32(nla); - } - } - } - - if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; - if (!dst_mtu(&rt->u.dst)) - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); - if (!dst_metric(&rt->u.dst, RTAX_ADVMSS)) - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst)); - rt->u.dst.dev = dev; + rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); - return __ip6_ins_rt(rt, &cfg->fc_nlinfo); + return __ip6_ins_rt(rt, &cfg->fc_nlinfo, cfg->fc_mx, cfg->fc_mx_len); out: if (dev) @@ -1321,7 +1679,7 @@ out: if (idev) in6_dev_put(idev); if (rt) - dst_free(&rt->u.dst); + dst_free(&rt->dst); return err; } @@ -1329,26 +1687,27 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) { int err; struct fib6_table *table; - struct net *net = dev_net(rt->rt6i_dev); + struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) - return -ENOENT; + if (rt == net->ipv6.ip6_null_entry) { + err = -ENOENT; + goto out; + } table = rt->rt6i_table; write_lock_bh(&table->tb6_lock); - err = fib6_del(rt, info); - dst_release(&rt->u.dst); - write_unlock_bh(&table->tb6_lock); +out: + ip6_rt_put(rt); return err; } int ip6_del_rt(struct rt6_info *rt) { struct nl_info info = { - .nl_net = dev_net(rt->rt6i_dev), + .nl_net = dev_net(rt->dst.dev), }; return __ip6_del_rt(rt, &info); } @@ -1361,7 +1720,7 @@ static int ip6_route_del(struct fib6_config *cfg) int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); - if (table == NULL) + if (!table) return err; read_lock_bh(&table->tb6_lock); @@ -1371,17 +1730,17 @@ static int ip6_route_del(struct fib6_config *cfg) &cfg->fc_src, cfg->fc_src_len); if (fn) { - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { if (cfg->fc_ifindex && - (rt->rt6i_dev == NULL || - rt->rt6i_dev->ifindex != cfg->fc_ifindex)) + (!rt->dst.dev || + rt->dst.dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) continue; if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) continue; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); return __ip6_del_rt(rt, &cfg->fc_nlinfo); @@ -1392,113 +1751,84 @@ static int ip6_route_del(struct fib6_config *cfg) return err; } -/* - * Handle redirects - */ -struct ip6rd_flowi { - struct flowi fl; - struct in6_addr gateway; -}; - -static struct rt6_info *__ip6_route_redirect(struct net *net, - struct fib6_table *table, - struct flowi *fl, - int flags) +static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { - struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl; - struct rt6_info *rt; - struct fib6_node *fn; + struct net *net = dev_net(skb->dev); + struct netevent_redirect netevent; + struct rt6_info *rt, *nrt = NULL; + struct ndisc_options ndopts; + struct inet6_dev *in6_dev; + struct neighbour *neigh; + struct rd_msg *msg; + int optlen, on_link; + u8 *lladdr; - /* - * Get the "current" route for this destination and - * check if the redirect has come from approriate router. - * - * RFC 2461 specifies that redirects should only be - * accepted if they come from the nexthop to the target. - * Due to the way the routes are chosen, this notion - * is a bit fuzzy and one might need to check all possible - * routes. - */ + optlen = skb_tail_pointer(skb) - skb_transport_header(skb); + optlen -= sizeof(*msg); - read_lock_bh(&table->tb6_lock); - fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src); -restart: - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { - /* - * Current route is on-link; redirect is always invalid. - * - * Seems, previous statement is not true. It could - * be node, which looks for us as on-link (f.e. proxy ndisc) - * But then router serving it might decide, that we should - * know truth 8)8) --ANK (980726). - */ - if (rt6_check_expired(rt)) - continue; - if (!(rt->rt6i_flags & RTF_GATEWAY)) - continue; - if (fl->oif != rt->rt6i_dev->ifindex) - continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) - continue; - break; + if (optlen < 0) { + net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); + return; } - if (!rt) - rt = net->ipv6.ip6_null_entry; - BACKTRACK(net, &fl->fl6_src); -out: - dst_hold(&rt->u.dst); - - read_unlock_bh(&table->tb6_lock); + msg = (struct rd_msg *)icmp6_hdr(skb); - return rt; -}; - -static struct rt6_info *ip6_route_redirect(struct in6_addr *dest, - struct in6_addr *src, - struct in6_addr *gateway, - struct net_device *dev) -{ - int flags = RT6_LOOKUP_F_HAS_SADDR; - struct net *net = dev_net(dev); - struct ip6rd_flowi rdfl = { - .fl = { - .oif = dev->ifindex, - .nl_u = { - .ip6_u = { - .daddr = *dest, - .saddr = *src, - }, - }, - }, - }; + if (ipv6_addr_is_multicast(&msg->dest)) { + net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); + return; + } - ipv6_addr_copy(&rdfl.gateway, gateway); + on_link = 0; + if (ipv6_addr_equal(&msg->dest, &msg->target)) { + on_link = 1; + } else if (ipv6_addr_type(&msg->target) != + (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { + net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); + return; + } - if (rt6_need_strict(dest)) - flags |= RT6_LOOKUP_F_IFACE; + in6_dev = __in6_dev_get(skb->dev); + if (!in6_dev) + return; + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) + return; - return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl, - flags, __ip6_route_redirect); -} + /* RFC2461 8.1: + * The IP source address of the Redirect MUST be the same as the current + * first-hop router for the specified ICMP Destination Address. + */ -void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, - struct in6_addr *saddr, - struct neighbour *neigh, u8 *lladdr, int on_link) -{ - struct rt6_info *rt, *nrt = NULL; - struct netevent_redirect netevent; - struct net *net = dev_net(neigh->dev); + if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) { + net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); + return; + } - rt = ip6_route_redirect(dest, src, saddr, neigh->dev); + lladdr = NULL; + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, + skb->dev); + if (!lladdr) { + net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); + return; + } + } + rt = (struct rt6_info *) dst; if (rt == net->ipv6.ip6_null_entry) { - if (net_ratelimit()) - printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " - "for redirect target\n"); - goto out; + net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); + return; } + /* Redirect received -> path was valid. + * Look, redirects are sent only in response to data packets, + * so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->dst); + + neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); + if (!neigh) + return; + /* * We have finally decided to accept it. */ @@ -1510,165 +1840,71 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, NEIGH_UPDATE_F_ISROUTER)) ); - /* - * Redirect received -> path was valid. - * Look, redirects are sent only in response to data packets, - * so that this nexthop apparently is reachable. --ANK - */ - dst_confirm(&rt->u.dst); - - /* Duplicate redirect: silently ignore. */ - if (neigh == rt->u.dst.neighbour) - goto out; - - nrt = ip6_rt_copy(rt); - if (nrt == NULL) + nrt = ip6_rt_copy(rt, &msg->dest); + if (!nrt) goto out; nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; - ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); - nrt->rt6i_dst.plen = 128; - nrt->u.dst.flags |= DST_HOST; - - ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); - nrt->rt6i_nexthop = neigh_clone(neigh); - /* Reset pmtu, it may be better */ - nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); - nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev), - dst_mtu(&nrt->u.dst)); + nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; if (ip6_ins_rt(nrt)) goto out; - netevent.old = &rt->u.dst; - netevent.new = &nrt->u.dst; + netevent.old = &rt->dst; + netevent.new = &nrt->dst; + netevent.daddr = &msg->dest; + netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags&RTF_CACHE) { - ip6_del_rt(rt); - return; - } - -out: - dst_release(&rt->u.dst); - return; -} - -/* - * Handle ICMP "packet too big" messages - * i.e. Path MTU discovery - */ - -void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, - struct net_device *dev, u32 pmtu) -{ - struct rt6_info *rt, *nrt; - struct net *net = dev_net(dev); - int allfrag = 0; - - rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0); - if (rt == NULL) - return; - - if (pmtu >= dst_mtu(&rt->u.dst)) - goto out; - - if (pmtu < IPV6_MIN_MTU) { - /* - * According to RFC2460, PMTU is set to the IPv6 Minimum Link - * MTU (1280) and a fragment header should always be included - * after a node receiving Too Big message reporting PMTU is - * less than the IPv6 Minimum Link MTU. - */ - pmtu = IPV6_MIN_MTU; - allfrag = 1; - } - - /* New mtu received -> path was valid. - They are sent only in response to data packets, - so that this nexthop apparently is reachable. --ANK - */ - dst_confirm(&rt->u.dst); - - /* Host route. If it is static, it would be better - not to override it, but add new one, so that - when cache entry will expire old pmtu - would return automatically. - */ if (rt->rt6i_flags & RTF_CACHE) { - rt->u.dst.metrics[RTAX_MTU-1] = pmtu; - if (allfrag) - rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; - dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires); - rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; - goto out; + rt = (struct rt6_info *) dst_clone(&rt->dst); + ip6_del_rt(rt); } - /* Network route. - Two cases are possible: - 1. It is connected route. Action: COW - 2. It is gatewayed route or NONEXTHOP route. Action: clone it. - */ - if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) - nrt = rt6_alloc_cow(rt, daddr, saddr); - else - nrt = rt6_alloc_clone(rt, daddr); - - if (nrt) { - nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; - if (allfrag) - nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; - - /* According to RFC 1981, detecting PMTU increase shouldn't be - * happened within 5 mins, the recommended timer is 10 mins. - * Here this route expiration time is set to ip6_rt_mtu_expires - * which is 10 mins. After 10 mins the decreased pmtu is expired - * and detecting PMTU increase will be automatically happened. - */ - dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires); - nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; - - ip6_ins_rt(nrt); - } out: - dst_release(&rt->u.dst); + neigh_release(neigh); } /* * Misc support functions */ -static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, + const struct in6_addr *dest) { - struct net *net = dev_net(ort->rt6i_dev); - struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops); + struct net *net = dev_net(ort->dst.dev); + struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0, + ort->rt6i_table); if (rt) { - rt->u.dst.input = ort->u.dst.input; - rt->u.dst.output = ort->u.dst.output; - - memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); - rt->u.dst.error = ort->u.dst.error; - rt->u.dst.dev = ort->u.dst.dev; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->dst.flags |= DST_HOST; + + rt->rt6i_dst.addr = *dest; + rt->rt6i_dst.plen = 128; + dst_copy_metrics(&rt->dst, &ort->dst); + rt->dst.error = ort->dst.error; rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); - rt->u.dst.lastuse = jiffies; - rt->rt6i_expires = 0; + rt->dst.lastuse = jiffies; - ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); - rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + if (ort->rt6i_flags & RTF_GATEWAY) + rt->rt6i_gateway = ort->rt6i_gateway; + else + rt->rt6i_gateway = *dest; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); rt->rt6i_metric = 0; - memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif + memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); rt->rt6i_table = ort->rt6i_table; } return rt; @@ -1676,41 +1912,41 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_get_route_info(struct net *net, - struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex) + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex) { struct fib6_node *fn; struct rt6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, RT6_TABLE_INFO); - if (table == NULL) + if (!table) return NULL; - write_lock_bh(&table->tb6_lock); + read_lock_bh(&table->tb6_lock); fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) { - if (rt->rt6i_dev->ifindex != ifindex) + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt->dst.dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->u.dst); + dst_hold(&rt->dst); break; } out: - write_unlock_bh(&table->tb6_lock); + read_unlock_bh(&table->tb6_lock); return rt; } static struct rt6_info *rt6_add_route_info(struct net *net, - struct in6_addr *prefix, int prefixlen, - struct in6_addr *gwaddr, int ifindex, - unsigned pref) + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned int pref) { struct fib6_config cfg = { .fc_table = RT6_TABLE_INFO, @@ -1719,13 +1955,13 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), - .fc_nlinfo.pid = 0, + .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, }; - ipv6_addr_copy(&cfg.fc_dst, prefix); - ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + cfg.fc_dst = *prefix; + cfg.fc_gateway = *gwaddr; /* We should treat it as a default route if prefix length is 0. */ if (!prefixlen) @@ -1737,29 +1973,29 @@ static struct rt6_info *rt6_add_route_info(struct net *net, } #endif -struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev) +struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) { struct rt6_info *rt; struct fib6_table *table; table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT); - if (table == NULL) + if (!table) return NULL; - write_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) { - if (dev == rt->rt6i_dev && + read_lock_bh(&table->tb6_lock); + for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { + if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) break; } if (rt) - dst_hold(&rt->u.dst); - write_unlock_bh(&table->tb6_lock); + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); return rt; } -struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { @@ -1769,12 +2005,12 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), - .fc_nlinfo.pid = 0, + .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = dev_net(dev), }; - ipv6_addr_copy(&cfg.fc_gateway, gwaddr); + cfg.fc_gateway = *gwaddr; ip6_route_add(&cfg); @@ -1788,14 +2024,15 @@ void rt6_purge_dflt_routers(struct net *net) /* NOTE: Keep consistent with rt6_get_dflt_router */ table = fib6_get_table(net, RT6_TABLE_DFLT); - if (table == NULL) + if (!table) return; restart: read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { - dst_hold(&rt->u.dst); + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && + (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { + dst_hold(&rt->dst); read_unlock_bh(&table->tb6_lock); ip6_del_rt(rt); goto restart; @@ -1820,9 +2057,9 @@ static void rtmsg_to_fib6_config(struct net *net, cfg->fc_nlinfo.nl_net = net; - ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst); - ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src); - ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway); + cfg->fc_dst = rtmsg->rtmsg_dst; + cfg->fc_src = rtmsg->rtmsg_src; + cfg->fc_gateway = rtmsg->rtmsg_gateway; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) @@ -1834,7 +2071,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch(cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = copy_from_user(&rtmsg, arg, sizeof(struct in6_rtmsg)); @@ -1873,7 +2110,7 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); - if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) { + if (type == IPV6_ADDR_ANY) { IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); break; @@ -1884,7 +2121,7 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) ipstats_mib_noroutes); break; } - icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb(skb); return 0; } @@ -1894,82 +2131,130 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - static int ip6_pkt_prohibit(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } -#endif - /* * Allocate a dst for local (unicast / anycast) address. */ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, const struct in6_addr *addr, - int anycast) + bool anycast) { struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops); - struct neighbour *neigh; - - if (rt == NULL) + struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, + DST_NOCOUNT, NULL); + if (!rt) return ERR_PTR(-ENOMEM); - dev_hold(net->loopback_dev); in6_dev_hold(idev); - rt->u.dst.flags = DST_HOST; - rt->u.dst.input = ip6_input; - rt->u.dst.output = ip6_output; - rt->rt6i_dev = net->loopback_dev; + rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; + rt->dst.output = ip6_output; rt->rt6i_idev = idev; - rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst)); - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; - rt->u.dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) rt->rt6i_flags |= RTF_ANYCAST; else rt->rt6i_flags |= RTF_LOCAL; - neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - if (IS_ERR(neigh)) { - dst_free(&rt->u.dst); - - /* We are casting this because that is the return - * value type. But an errno encoded pointer is the - * same regardless of the underlying pointer type, - * and that's what we are returning. So this is OK. - */ - return (struct rt6_info *) neigh; - } - rt->rt6i_nexthop = neigh; - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_gateway = *addr; + rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); - atomic_set(&rt->u.dst.__refcnt, 1); + atomic_set(&rt->dst.__refcnt, 1); return rt; } +int ip6_route_get_saddr(struct net *net, + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr) +{ + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + int err = 0; + if (rt->rt6i_prefsrc.plen) + *saddr = rt->rt6i_prefsrc.addr; + else + err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, + daddr, prefs, saddr); + return err; +} + +/* remove deleted ip from prefsrc entries */ +struct arg_dev_net_ip { + struct net_device *dev; + struct net *net; + struct in6_addr *addr; +}; + +static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +{ + struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; + struct net *net = ((struct arg_dev_net_ip *)arg)->net; + struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; + + if (((void *)rt->dst.dev == dev || !dev) && + rt != net->ipv6.ip6_null_entry && + ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + /* remove prefsrc entry */ + rt->rt6i_prefsrc.plen = 0; + } + return 0; +} + +void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) +{ + struct net *net = dev_net(ifp->idev->dev); + struct arg_dev_net_ip adni = { + .dev = ifp->idev->dev, + .net = net, + .addr = &ifp->addr, + }; + fib6_clean_all(net, fib6_remove_prefsrc, &adni); +} + +#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +/* Remove routers and update dst entries when gateway turn into host. */ +static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +{ + struct in6_addr *gateway = (struct in6_addr *)arg; + + if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || + ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + return -1; + } + return 0; +} + +void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) +{ + fib6_clean_all(net, fib6_clean_tohost, gateway); +} + struct arg_dev_net { struct net_device *dev; struct net *net; @@ -1977,14 +2262,13 @@ struct arg_dev_net { static int fib6_ifdown(struct rt6_info *rt, void *arg) { - struct net_device *dev = ((struct arg_dev_net *)arg)->dev; - struct net *net = ((struct arg_dev_net *)arg)->net; + const struct arg_dev_net *adn = arg; + const struct net_device *dev = adn->dev; - if (((void *)rt->rt6i_dev == dev || dev == NULL) && - rt != net->ipv6.ip6_null_entry) { - RT6_TRACE("deleted by ifdown %p\n", rt); + if ((rt->dst.dev == dev || !dev) && + rt != adn->net->ipv6.ip6_null_entry) return -1; - } + return 0; } @@ -1995,21 +2279,19 @@ void rt6_ifdown(struct net *net, struct net_device *dev) .net = net, }; - fib6_clean_all(net, fib6_ifdown, 0, &adn); + fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); } -struct rt6_mtu_change_arg -{ +struct rt6_mtu_change_arg { struct net_device *dev; - unsigned mtu; + unsigned int mtu; }; static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; - struct net *net = dev_net(arg->dev); /* In IPv6 pmtu discovery is not optional, so that RTAX_MTU lock cannot disable it. @@ -2018,7 +2300,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) */ idev = __in6_dev_get(arg->dev); - if (idev == NULL) + if (!idev) return 0; /* For administrative MTU increase, there is no way to discover @@ -2035,25 +2317,24 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) also have the lowest MTU, TOO BIG MESSAGE will be lead to PMTU discouvery. */ - if (rt->rt6i_dev == arg->dev && - !dst_metric_locked(&rt->u.dst, RTAX_MTU) && - (dst_mtu(&rt->u.dst) >= arg->mtu || - (dst_mtu(&rt->u.dst) < arg->mtu && - dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) { - rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu; - rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu); + if (rt->dst.dev == arg->dev && + !dst_metric_locked(&rt->dst, RTAX_MTU) && + (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6))) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } return 0; } -void rt6_mtu_change(struct net_device *dev, unsigned mtu) +void rt6_mtu_change(struct net_device *dev, unsigned int mtu) { struct rt6_mtu_change_arg arg = { .dev = dev, .mtu = mtu, }; - fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg); + fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { @@ -2062,6 +2343,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_IIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2084,11 +2366,18 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_src_len = rtm->rtm_src_len; cfg->fc_flags = RTF_UP; cfg->fc_protocol = rtm->rtm_protocol; + cfg->fc_type = rtm->rtm_type; - if (rtm->rtm_type == RTN_UNREACHABLE) + if (rtm->rtm_type == RTN_UNREACHABLE || + rtm->rtm_type == RTN_BLACKHOLE || + rtm->rtm_type == RTN_PROHIBIT || + rtm->rtm_type == RTN_THROW) cfg->fc_flags |= RTF_REJECT; - cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; + if (rtm->rtm_type == RTN_LOCAL) + cfg->fc_flags |= RTF_LOCAL; + + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -2115,6 +2404,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); } + if (tb[RTA_PREFSRC]) + nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16); + if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); @@ -2129,12 +2421,72 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_TABLE]) cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); + if (tb[RTA_MULTIPATH]) { + cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); + cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); + } + err = 0; errout: return err; } -static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int ip6_route_multipath(struct fib6_config *cfg, int add) +{ + struct fib6_config r_cfg; + struct rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 0, last_err = 0; + +beginning: + rtnh = (struct rtnexthop *)cfg->fc_mp; + remaining = cfg->fc_mp_len; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + if (err) { + last_err = err; + /* If we are trying to remove a route, do not stop the + * loop when ip6_route_del() fails (because next hop is + * already gone), we should try to remove all next hops. + */ + if (add) { + /* If add fails, we should try to delete all + * next hops that have been already added. + */ + add = 0; + goto beginning; + } + } + /* Because each route is added like a single route we remove + * this flag after the first nexthop (if there is a collision, + * we have already fail to add the first nexthop: + * fib6_add_rt2node() has reject it). + */ + cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL; + rtnh = rtnh_next(rtnh, &remaining); + } + + return last_err; +} + +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh) { struct fib6_config cfg; int err; @@ -2143,10 +2495,13 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a if (err < 0) return err; - return ip6_route_del(&cfg); + if (cfg.fc_mp) + return ip6_route_multipath(&cfg, 0); + else + return ip6_route_del(&cfg); } -static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh) { struct fib6_config cfg; int err; @@ -2155,7 +2510,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a if (err < 0) return err; - return ip6_route_add(&cfg); + if (cfg.fc_mp) + return ip6_route_multipath(&cfg, 1); + else + return ip6_route_add(&cfg); } static inline size_t rt6_nlmsg_size(void) @@ -2176,7 +2534,7 @@ static inline size_t rt6_nlmsg_size(void) static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt, struct in6_addr *dst, struct in6_addr *src, - int iif, int type, u32 pid, u32 seq, + int iif, int type, u32 portid, u32 seq, int prefix, int nowait, unsigned int flags) { struct rtmsg *rtm; @@ -2191,8 +2549,8 @@ static int rt6_fill_node(struct net *net, } } - nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags); - if (nlh == NULL) + nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); + if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); @@ -2205,37 +2563,60 @@ static int rt6_fill_node(struct net *net, else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table; - NLA_PUT_U32(skb, RTA_TABLE, table); - if (rt->rt6i_flags&RTF_REJECT) - rtm->rtm_type = RTN_UNREACHABLE; - else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + if (nla_put_u32(skb, RTA_TABLE, table)) + goto nla_put_failure; + if (rt->rt6i_flags & RTF_REJECT) { + switch (rt->dst.error) { + case -EINVAL: + rtm->rtm_type = RTN_BLACKHOLE; + break; + case -EACCES: + rtm->rtm_type = RTN_PROHIBIT; + break; + case -EAGAIN: + rtm->rtm_type = RTN_THROW; + break; + default: + rtm->rtm_type = RTN_UNREACHABLE; + break; + } + } + else if (rt->rt6i_flags & RTF_LOCAL) + rtm->rtm_type = RTN_LOCAL; + else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) rtm->rtm_type = RTN_LOCAL; else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; - if (rt->rt6i_flags&RTF_DYNAMIC) + if (rt->rt6i_flags & RTF_DYNAMIC) rtm->rtm_protocol = RTPROT_REDIRECT; - else if (rt->rt6i_flags & RTF_ADDRCONF) - rtm->rtm_protocol = RTPROT_KERNEL; - else if (rt->rt6i_flags&RTF_DEFAULT) - rtm->rtm_protocol = RTPROT_RA; + else if (rt->rt6i_flags & RTF_ADDRCONF) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO)) + rtm->rtm_protocol = RTPROT_RA; + else + rtm->rtm_protocol = RTPROT_KERNEL; + } - if (rt->rt6i_flags&RTF_CACHE) + if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dst) { - NLA_PUT(skb, RTA_DST, 16, dst); + if (nla_put(skb, RTA_DST, 16, dst)) + goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); + if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr)) + goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { - NLA_PUT(skb, RTA_SRC, 16, src); + if (nla_put(skb, RTA_SRC, 16, src)) + goto nla_put_failure; rtm->rtm_src_len = 128; - } else if (rtm->rtm_src_len) - NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); + } else if (rtm->rtm_src_len && + nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr)) + goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE @@ -2253,35 +2634,39 @@ static int rt6_fill_node(struct net *net, } } else #endif - NLA_PUT_U32(skb, RTA_IIF, iif); + if (nla_put_u32(skb, RTA_IIF, iif)) + goto nla_put_failure; } else if (dst) { - struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst); struct in6_addr saddr_buf; - if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - dst, 0, &saddr_buf) == 0) - NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && + nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + goto nla_put_failure; } - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) - goto nla_put_failure; + if (rt->rt6i_prefsrc.plen) { + struct in6_addr saddr_buf; + saddr_buf = rt->rt6i_prefsrc.addr; + if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf)) + goto nla_put_failure; + } - if (rt->u.dst.neighbour) - NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); + if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + goto nla_put_failure; - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex); + if (rt->rt6i_flags & RTF_GATEWAY) { + if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0) + goto nla_put_failure; + } - NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; + if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) + goto nla_put_failure; - if (!(rt->rt6i_flags & RTF_EXPIRES)) - expires = 0; - else if (rt->rt6i_expires - jiffies < INT_MAX) - expires = rt->rt6i_expires - jiffies; - else - expires = INT_MAX; + expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; - if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0, - expires, rt->u.dst.error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -2304,58 +2689,76 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) return rt6_fill_node(arg->net, arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, + NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, prefix, 0, NLM_F_MULTI); } -static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi fl; - int err, iif = 0; + struct flowi6 fl6; + int err, iif = 0, oif = 0; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); if (err < 0) goto errout; err = -EINVAL; - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) goto errout; - ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC])); + fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); } if (tb[RTA_DST]) { if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) goto errout; - ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST])); + fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); } if (tb[RTA_IIF]) iif = nla_get_u32(tb[RTA_IIF]); if (tb[RTA_OIF]) - fl.oif = nla_get_u32(tb[RTA_OIF]); + oif = nla_get_u32(tb[RTA_OIF]); + + if (tb[RTA_MARK]) + fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); if (iif) { struct net_device *dev; + int flags = 0; + dev = __dev_get_by_index(net, iif); if (!dev) { err = -ENODEV; goto errout; } + + fl6.flowi6_iif = iif; + + if (!ipv6_addr_any(&fl6.saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6, + flags); + } else { + fl6.flowi6_oif = oif; + + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (skb == NULL) { + if (!skb) { + ip6_rt_put(rt); err = -ENOBUFS; goto errout; } @@ -2366,18 +2769,17 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb_reset_mac_header(skb); skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); - rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); - err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).pid, + err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, 0, 0); if (err < 0) { kfree_skb(skb); goto errout; } - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; } @@ -2390,21 +2792,21 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) int err; err = -ENOBUFS; - seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0; + seq = info->nlh ? info->nlh->nlmsg_seq : 0; skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); - if (skb == NULL) + if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->pid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); return; errout: @@ -2413,18 +2815,18 @@ errout: } static int ip6_route_dev_notify(struct notifier_block *this, - unsigned long event, void *data) + unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { - net->ipv6.ip6_null_entry->u.dst.dev = dev; + net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - net->ipv6.ip6_prohibit_entry->u.dst.dev = dev; + net->ipv6.ip6_prohibit_entry->dst.dev = dev; net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); - net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev; + net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif } @@ -2438,59 +2840,12 @@ static int ip6_route_dev_notify(struct notifier_block *this, #ifdef CONFIG_PROC_FS -#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1) - -struct rt6_proc_arg -{ - char *buffer; - int offset; - int length; - int skip; - int len; -}; - -static int rt6_info_route(struct rt6_info *rt, void *p_arg) -{ - struct seq_file *m = p_arg; - - seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); - -#ifdef CONFIG_IPV6_SUBTREES - seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); -#else - seq_puts(m, "00000000000000000000000000000000 00 "); -#endif - - if (rt->rt6i_nexthop) { - seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key); - } else { - seq_puts(m, "00000000000000000000000000000000"); - } - seq_printf(m, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt), - rt->u.dst.__use, rt->rt6i_flags, - rt->rt6i_dev ? rt->rt6i_dev->name : ""); - return 0; -} - -static int ipv6_route_show(struct seq_file *m, void *v) -{ - struct net *net = (struct net *)m->private; - fib6_clean_all(net, rt6_info_route, 0, m); - return 0; -} - -static int ipv6_route_open(struct inode *inode, struct file *file) -{ - return single_open_net(inode, file, ipv6_route_show); -} - static const struct file_operations ipv6_route_proc_fops = { .owner = THIS_MODULE, .open = ipv6_route_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release_net, + .release = seq_release_net, }; static int rt6_stats_seq_show(struct seq_file *seq, void *v) @@ -2502,7 +2857,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) net->ipv6.rt6_stats->fib_rt_alloc, net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, - atomic_read(&net->ipv6.ip6_dst_ops.entries), + dst_entries_get_slow(&net->ipv6.ip6_dst_ops), net->ipv6.rt6_stats->fib_discarded_routes); return 0; @@ -2525,20 +2880,22 @@ static const struct file_operations rt6_stats_seq_fops = { #ifdef CONFIG_SYSCTL static -int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, +int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; - int delay = net->ipv6.sysctl.flush_delay; - if (write) { - proc_dointvec(ctl, write, buffer, lenp, ppos); - fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); - return 0; - } else + struct net *net; + int delay; + if (!write) return -EINVAL; + + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; + proc_dointvec(ctl, write, buffer, lenp, ppos); + fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); + return 0; } -ctl_table ipv6_route_table_template[] = { +struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, @@ -2586,7 +2943,7 @@ ctl_table ipv6_route_table_template[] = { .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { .procname = "mtu_expires", @@ -2600,7 +2957,7 @@ ctl_table ipv6_route_table_template[] = { .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, { .procname = "gc_min_interval_ms", @@ -2612,7 +2969,7 @@ ctl_table ipv6_route_table_template[] = { { } }; -struct ctl_table *ipv6_route_sysctl_init(struct net *net) +struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) { struct ctl_table *table; @@ -2622,6 +2979,7 @@ struct ctl_table *ipv6_route_sysctl_init(struct net *net) if (table) { table[0].data = &net->ipv6.sysctl.flush_delay; + table[0].extra1 = net; table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; @@ -2631,27 +2989,36 @@ struct ctl_table *ipv6_route_sysctl_init(struct net *net) table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + table[0].procname = NULL; } return table; } #endif -static int ip6_route_net_init(struct net *net) +static int __net_init ip6_route_net_init(struct net *net) { int ret = -ENOMEM; memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, sizeof(net->ipv6.ip6_dst_ops)); + if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) + goto out_ip6_dst_ops; + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) - goto out_ip6_dst_ops; - net->ipv6.ip6_null_entry->u.dst.path = + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry->dst.path = (struct dst_entry *)net->ipv6.ip6_null_entry; - net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops; + net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_null_entry->dst, + ip6_template_metrics, true); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, @@ -2659,18 +3026,22 @@ static int ip6_route_net_init(struct net *net) GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; - net->ipv6.ip6_prohibit_entry->u.dst.path = + net->ipv6.ip6_prohibit_entry->dst.path = (struct dst_entry *)net->ipv6.ip6_prohibit_entry; - net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops; + net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, + ip6_template_metrics, true); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; - net->ipv6.ip6_blk_hole_entry->u.dst.path = + net->ipv6.ip6_blk_hole_entry->dst.path = (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; - net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops; + net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, + ip6_template_metrics, true); #endif net->ipv6.sysctl.flush_delay = 0; @@ -2682,10 +3053,6 @@ static int ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; -#ifdef CONFIG_PROC_FS - proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops); - proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); -#endif net->ipv6.ip6_rt_gc_expire = 30*HZ; ret = 0; @@ -2698,21 +3065,37 @@ out_ip6_prohibit_entry: out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif +out_ip6_dst_entries: + dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: goto out; } -static void ip6_route_net_exit(struct net *net) +static void __net_exit ip6_route_net_exit(struct net *net) { -#ifdef CONFIG_PROC_FS - proc_net_remove(net, "ipv6_route"); - proc_net_remove(net, "rt6_stats"); -#endif kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); #endif + dst_entries_destroy(&net->ipv6.ip6_dst_ops); +} + +static int __net_init ip6_route_net_init_late(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); + proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); +#endif + return 0; +} + +static void __net_exit ip6_route_net_exit_late(struct net *net) +{ +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipv6_route", net->proc_net); + remove_proc_entry("rt6_stats", net->proc_net); +#endif } static struct pernet_operations ip6_route_net_ops = { @@ -2720,6 +3103,36 @@ static struct pernet_operations ip6_route_net_ops = { .exit = ip6_route_net_exit, }; +static int __net_init ipv6_inetpeer_init(struct net *net) +{ + struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); + + if (!bp) + return -ENOMEM; + inet_peer_base_init(bp); + net->ipv6.peers = bp; + return 0; +} + +static void __net_exit ipv6_inetpeer_exit(struct net *net) +{ + struct inet_peer_base *bp = net->ipv6.peers; + + net->ipv6.peers = NULL; + inetpeer_invalidate_tree(bp); + kfree(bp); +} + +static struct pernet_operations ipv6_inetpeer_ops = { + .init = ipv6_inetpeer_init, + .exit = ipv6_inetpeer_exit, +}; + +static struct pernet_operations ip6_route_net_late_ops = { + .init = ip6_route_net_init_late, + .exit = ip6_route_net_exit_late, +}; + static struct notifier_block ip6_route_dev_notifier = { .notifier_call = ip6_route_dev_notify, .priority = 0, @@ -2736,21 +3149,29 @@ int __init ip6_route_init(void) if (!ip6_dst_ops_template.kmem_cachep) goto out; - ret = register_pernet_subsys(&ip6_route_net_ops); + ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; + ret = register_pernet_subsys(&ipv6_inetpeer_ops); + if (ret) + goto out_dst_entries; + + ret = register_pernet_subsys(&ip6_route_net_ops); + if (ret) + goto out_register_inetpeer; + ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #endif ret = fib6_init(); @@ -2765,19 +3186,25 @@ int __init ip6_route_init(void) if (ret) goto xfrm6_init; - ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL)) + ret = register_pernet_subsys(&ip6_route_net_late_ops); + if (ret) goto fib6_rules_init; + ret = -ENOBUFS; + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || + __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) + goto out_register_late_subsys; + ret = register_netdevice_notifier(&ip6_route_dev_notifier); if (ret) - goto fib6_rules_init; + goto out_register_late_subsys; out: return ret; +out_register_late_subsys: + unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); xfrm6_init: @@ -2786,6 +3213,10 @@ out_fib6_init: fib6_gc_cleanup(); out_register_subsys: unregister_pernet_subsys(&ip6_route_net_ops); +out_register_inetpeer: + unregister_pernet_subsys(&ipv6_inetpeer_ops); +out_dst_entries: + dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; @@ -2794,9 +3225,12 @@ out_kmem_cache: void ip6_route_cleanup(void) { unregister_netdevice_notifier(&ip6_route_dev_notifier); + unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); xfrm6_fini(); fib6_gc_cleanup(); + unregister_pernet_subsys(&ipv6_inetpeer_ops); unregister_pernet_subsys(&ip6_route_net_ops); + dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 976e68244b9..4f408176dc6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -17,6 +17,8 @@ * Fred Templin <fred.l.templin@boeing.com>: isatap support */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> @@ -28,6 +30,7 @@ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmp.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> @@ -46,7 +49,7 @@ #include <net/ip.h> #include <net/udp.h> #include <net/icmp.h> -#include <net/ipip.h> +#include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/dsfield.h> @@ -62,54 +65,53 @@ #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) -static void ipip6_fb_tunnel_init(struct net_device *dev); -static void ipip6_tunnel_init(struct net_device *dev); +static bool log_ecn_error = true; +module_param(log_ecn_error, bool, 0644); +MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); + +static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); +static void ipip6_dev_free(struct net_device *dev); +static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, + __be32 *v4dst); +static struct rtnl_link_ops sit_link_ops __read_mostly; static int sit_net_id __read_mostly; struct sit_net { - struct ip_tunnel *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel *tunnels_r[HASH_SIZE]; - struct ip_tunnel *tunnels_l[HASH_SIZE]; - struct ip_tunnel *tunnels_wc[1]; - struct ip_tunnel **tunnels[4]; + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; /* - * Locking : hash tables are protected by RCU and a spinlock - */ -static DEFINE_SPINLOCK(ipip6_lock); - -#define for_each_ip_tunnel_rcu(start) \ - for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - -/* * Must be invoked with rcu_read_lock */ -static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, +static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, struct net_device *dev, __be32 remote, __be32 local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); - for_each_ip_tunnel_rcu(sitn->tunnels_r_l[h0 ^ h1]) { + for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (!dev || !t->parms.link || dev->iflink == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } - for_each_ip_tunnel_rcu(sitn->tunnels_r[h0]) { + for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && (!dev || !t->parms.link || dev->iflink == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } - for_each_ip_tunnel_rcu(sitn->tunnels_l[h1]) { + for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && (!dev || !t->parms.link || dev->iflink == t->parms.link) && (t->dev->flags & IFF_UP)) @@ -121,12 +123,12 @@ static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn, +static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - unsigned h = 0; + unsigned int h = 0; int prio = 0; if (remote) { @@ -140,7 +142,7 @@ static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn, return &sitn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn, +static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, struct ip_tunnel *t) { return __ipip6_bucket(sitn, &t->parms); @@ -148,13 +150,14 @@ static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn, static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipip6_bucket(sitn, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipip6_lock); - *tp = t->next; - spin_unlock_bh(&ipip6_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip6_bucket(sitn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -162,12 +165,10 @@ static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip6_bucket(sitn, t); + struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); - spin_lock_bh(&ipip6_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipip6_lock); } static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) @@ -187,17 +188,51 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) #endif } -static struct ip_tunnel * ipip6_tunnel_locate(struct net *net, +static int ipip6_tunnel_create(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + int err; + + err = ipip6_tunnel_init(dev); + if (err < 0) + goto out; + ipip6_tunnel_clone_6rd(dev, sitn); + + if ((__force u16)t->parms.i_flags & SIT_ISATAP) + dev->priv_flags |= IFF_ISATAP; + + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(t->parms.name, dev->name); + dev->rtnl_link_ops = &sit_link_ops; + + dev_hold(dev); + + ipip6_tunnel_link(sitn, t); + return 0; + +out: + return err; +} + +static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct sit_net *sitn = net_generic(net, sit_net_id); - for (tp = __ipip6_bucket(sitn, parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip6_bucket(sitn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && parms->link == t->parms.link) { @@ -213,7 +248,7 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net, if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else - sprintf(name, "sit%%d"); + strcpy(name, "sit%d"); dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); if (dev == NULL) @@ -221,36 +256,20 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net, dev_net_set(dev, net); - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } - nt = netdev_priv(dev); nt->parms = *parms; - ipip6_tunnel_init(dev); - ipip6_tunnel_clone_6rd(dev, sitn); - - if (parms->i_flags & SIT_ISATAP) - dev->priv_flags |= IFF_ISATAP; - - if (register_netdevice(dev) < 0) + if (ipip6_tunnel_create(dev) < 0) goto failed_free; - dev_hold(dev); - - ipip6_tunnel_link(sitn, nt); return nt; failed_free: - free_netdev(dev); + ipip6_dev_free(dev); failed: return NULL; } -static DEFINE_SPINLOCK(ipip6_prl_lock); - #define for_each_prl_rcu(start) \ for (prl = rcu_dereference(start); \ prl; \ @@ -340,9 +359,9 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) if (a->addr == htonl(INADDR_ANY)) return -EINVAL; - spin_lock(&ipip6_prl_lock); + ASSERT_RTNL(); - for (p = t->prl; p; p = p->next) { + for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { if (p->addr == a->addr) { if (chg) { p->flags = a->flags; @@ -364,29 +383,22 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) goto out; } - INIT_RCU_HEAD(&p->rcu_head); p->next = t->prl; p->addr = a->addr; p->flags = a->flags; t->prl_count++; rcu_assign_pointer(t->prl, p); out: - spin_unlock(&ipip6_prl_lock); return err; } -static void prl_entry_destroy_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct ip_tunnel_prl_entry, rcu_head)); -} - static void prl_list_destroy_rcu(struct rcu_head *head) { struct ip_tunnel_prl_entry *p, *n; p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); do { - n = p->next; + n = rcu_dereference_protected(p->next, 1); kfree(p); p = n; } while (p); @@ -395,37 +407,38 @@ static void prl_list_destroy_rcu(struct rcu_head *head) static int ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) { - struct ip_tunnel_prl_entry *x, **p; + struct ip_tunnel_prl_entry *x; + struct ip_tunnel_prl_entry __rcu **p; int err = 0; - spin_lock(&ipip6_prl_lock); + ASSERT_RTNL(); if (a && a->addr != htonl(INADDR_ANY)) { - for (p = &t->prl; *p; p = &(*p)->next) { - if ((*p)->addr == a->addr) { - x = *p; + for (p = &t->prl; + (x = rtnl_dereference(*p)) != NULL; + p = &x->next) { + if (x->addr == a->addr) { *p = x->next; - call_rcu(&x->rcu_head, prl_entry_destroy_rcu); + kfree_rcu(x, rcu_head); t->prl_count--; goto out; } } err = -ENXIO; } else { - if (t->prl) { + x = rtnl_dereference(t->prl); + if (x) { t->prl_count = 0; - x = t->prl; call_rcu(&x->rcu_head, prl_list_destroy_rcu); t->prl = NULL; } } out: - spin_unlock(&ipip6_prl_lock); return err; } static int -isatap_chksrc(struct sk_buff *skb, struct iphdr *iph, struct ip_tunnel *t) +isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { struct ip_tunnel_prl_entry *p; int ok = 1; @@ -438,7 +451,8 @@ isatap_chksrc(struct sk_buff *skb, struct iphdr *iph, struct ip_tunnel *t) else skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT; } else { - struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; + const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; + if (ipv6_addr_is_isatap(addr6) && (addr6->s6_addr32[3] == iph->saddr) && ipv6_chk_prefix(addr6, t->dev)) @@ -452,30 +466,58 @@ isatap_chksrc(struct sk_buff *skb, struct iphdr *iph, struct ip_tunnel *t) static void ipip6_tunnel_uninit(struct net_device *dev) { - struct net *net = dev_net(dev); - struct sit_net *sitn = net_generic(net, sit_net_id); + struct ip_tunnel *tunnel = netdev_priv(dev); + struct sit_net *sitn = net_generic(tunnel->net, sit_net_id); if (dev == sitn->fb_tunnel_dev) { - spin_lock_bh(&ipip6_lock); - sitn->tunnels_wc[0] = NULL; - spin_unlock_bh(&ipip6_lock); - dev_put(dev); + RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL); } else { - ipip6_tunnel_unlink(sitn, netdev_priv(dev)); - ipip6_tunnel_del_prl(netdev_priv(dev), NULL); - dev_put(dev); + ipip6_tunnel_unlink(sitn, tunnel); + ipip6_tunnel_del_prl(tunnel, NULL); } + ip_tunnel_dst_reset_all(tunnel); + dev_put(dev); } +/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH + * if sufficient data bytes are available + */ +static int ipip6_err_gen_icmpv6_unreach(struct sk_buff *skb) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct rt6_info *rt; + struct sk_buff *skb2; + + if (!pskb_may_pull(skb, iph->ihl * 4 + sizeof(struct ipv6hdr) + 8)) + return 1; + + skb2 = skb_clone(skb, GFP_ATOMIC); + + if (!skb2) + return 1; + + skb_dst_drop(skb2); + skb_pull(skb2, iph->ihl * 4); + skb_reset_network_header(skb2); + + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + + if (rt && rt->dst.dev) + skb2->dev = rt->dst.dev; + + icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); + + if (rt) + ip6_rt_put(rt); + + kfree_skb(skb2); + + return 0; +} static int ipip6_err(struct sk_buff *skb, u32 info) { - -/* All the routers (except for Linux) return only - 8 bytes of packet payload. It means, that precise relaying of - ICMP in the real Internet is absolutely infeasible. - */ - struct iphdr *iph = (struct iphdr*)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; @@ -489,12 +531,8 @@ static int ipip6_err(struct sk_buff *skb, u32 info) case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: /* Impossible event. */ return 0; - case ICMP_FRAG_NEEDED: - /* Soft state for pmtu is maintained by IP core. */ - return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -507,19 +545,39 @@ static int ipip6_err(struct sk_buff *skb, u32 info) if (code != ICMP_EXC_TTL) return 0; break; + case ICMP_REDIRECT: + break; } err = -ENOENT; - rcu_read_lock(); t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->daddr, iph->saddr); - if (t == NULL || t->parms.iph.daddr == 0) + if (t == NULL) + goto out; + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + t->parms.link, 0, IPPROTO_IPV6, 0); + err = 0; + goto out; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0, + IPPROTO_IPV6, 0); + err = 0; + goto out; + } + + if (t->parms.iph.daddr == 0) goto out; err = 0; + if (!ipip6_err_gen_icmpv6_unreach(skb)) + goto out; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; @@ -529,75 +587,182 @@ static int ipip6_err(struct sk_buff *skb, u32 info) t->err_count = 1; t->err_time = jiffies; out: - rcu_read_unlock(); return err; } -static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, + const struct in6_addr *v6addr) { - if (INET_ECN_is_ce(iph->tos)) - IP6_ECN_set_ce(ipv6_hdr(skb)); + __be32 v4embed = 0; + if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed) + return true; + return false; } -static int ipip6_rcv(struct sk_buff *skb) +/* Checks if an address matches an address on the tunnel interface. + * Used to detect the NAT of proto 41 packets and let them pass spoofing test. + * Long story: + * This function is called after we considered the packet as spoofed + * in is_spoofed_6rd. + * We may have a router that is doing NAT for proto 41 packets + * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb + * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd + * function will return true, dropping the packet. + * But, we can still check if is spoofed against the IP + * addresses associated with the interface. + */ +static bool only_dnatted(const struct ip_tunnel *tunnel, + const struct in6_addr *v6dst) { - struct iphdr *iph; - struct ip_tunnel *tunnel; + int prefix_len; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto out; +#ifdef CONFIG_IPV6_SIT_6RD + prefix_len = tunnel->ip6rd.prefixlen + 32 + - tunnel->ip6rd.relay_prefixlen; +#else + prefix_len = 48; +#endif + return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev); +} - iph = ip_hdr(skb); +/* Returns true if a packet is spoofed */ +static bool packet_is_spoofed(struct sk_buff *skb, + const struct iphdr *iph, + struct ip_tunnel *tunnel) +{ + const struct ipv6hdr *ipv6h; + + if (tunnel->dev->priv_flags & IFF_ISATAP) { + if (!isatap_chksrc(skb, iph, tunnel)) + return true; + + return false; + } + + if (tunnel->dev->flags & IFF_POINTOPOINT) + return false; + + ipv6h = ipv6_hdr(skb); + + if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) { + net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; + } + + if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr))) + return false; + + if (only_dnatted(tunnel, &ipv6h->daddr)) + return false; + + net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; +} + +static int ipip6_rcv(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + struct ip_tunnel *tunnel; + int err; - rcu_read_lock(); tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { - secpath_reset(skb); + struct pcpu_sw_netstats *tstats; + + if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && + tunnel->parms.iph.protocol != 0) + goto out; + skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); - skb->pkt_type = PACKET_HOST; - if ((tunnel->dev->priv_flags & IFF_ISATAP) && - !isatap_chksrc(skb, iph, tunnel)) { + if (packet_is_spoofed(skb, iph, tunnel)) { tunnel->dev->stats.rx_errors++; - rcu_read_unlock(); - kfree_skb(skb); - return 0; + goto out; + } + + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + + err = IP_ECN_decapsulate(iph, skb); + if (unlikely(err)) { + if (log_ecn_error) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &iph->saddr, iph->tos); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto out; + } } - tunnel->dev->stats.rx_packets++; - tunnel->dev->stats.rx_bytes += skb->len; - skb->dev = tunnel->dev; - skb_dst_drop(skb); - nf_reset(skb); - ipip6_ecn_decapsulate(iph, skb); + + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + netif_rx(skb); - rcu_read_unlock(); + return 0; } - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - rcu_read_unlock(); + /* no tunnel matched, let upstream know, ipsec may handle it */ + return 1; out: kfree_skb(skb); return 0; } +static const struct tnl_ptk_info tpi = { + /* no tunnel info required for ipip. */ + .proto = htons(ETH_P_IP), +}; + +static int ipip_rcv(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct ip_tunnel *tunnel; + + iph = ip_hdr(skb); + tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->saddr, iph->daddr); + if (tunnel != NULL) { + if (tunnel->parms.iph.protocol != IPPROTO_IPIP && + tunnel->parms.iph.protocol != 0) + goto drop; + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + if (iptunnel_pull_header(skb, 0, tpi.proto)) + goto drop; + return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + } + + return 1; + +drop: + kfree_skb(skb); + return 0; +} + /* - * Returns the embedded IPv4 address if the IPv6 address - * comes from 6rd / 6to4 (RFC 3056) addr space. + * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function + * stores the embedded IPv4 address in v4dst and returns true. */ -static inline -__be32 try_6rd(struct in6_addr *v6dst, struct ip_tunnel *tunnel) +static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, + __be32 *v4dst) { - __be32 dst = 0; - #ifdef CONFIG_IPV6_SIT_6RD if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, tunnel->ip6rd.prefixlen)) { - unsigned pbw0, pbi0; + unsigned int pbw0, pbi0; int pbi1; u32 d; @@ -612,14 +777,24 @@ __be32 try_6rd(struct in6_addr *v6dst, struct ip_tunnel *tunnel) d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> (32 - pbi1); - dst = tunnel->ip6rd.relay_prefix | htonl(d); + *v4dst = tunnel->ip6rd.relay_prefix | htonl(d); + return true; } #else if (v6dst->s6_addr16[0] == htons(0x2002)) { /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ - memcpy(&dst, &v6dst->s6_addr16[1], 4); + memcpy(v4dst, &v6dst->s6_addr16[1], 4); + return true; } #endif + return false; +} + +static inline __be32 try_6rd(struct ip_tunnel *tunnel, + const struct in6_addr *v6dst) +{ + __be32 dst = 0; + check_6rd(tunnel, v6dst, &dst); return dst; } @@ -632,63 +807,70 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); - struct iphdr *tiph = &tunnel->parms.iph; - struct ipv6hdr *iph6 = ipv6_hdr(skb); + const struct iphdr *tiph = &tunnel->parms.iph; + const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *iph; /* Our new IP header */ + struct net_device *tdev; /* Device to other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; + struct flowi4 fl4; int mtu; - struct in6_addr *addr6; + const struct in6_addr *addr6; int addr_type; + u8 ttl; + int err; if (skb->protocol != htons(ETH_P_IPV6)) goto tx_error; + if (tos == 1) + tos = ipv6_get_dsfield(iph6); + /* ISATAP (RFC4214) - must come before 6to4 */ if (dev->priv_flags & IFF_ISATAP) { struct neighbour *neigh = NULL; + bool do_tx_error = false; if (skb_dst(skb)) - neigh = skb_dst(skb)->neighbour; + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (neigh == NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "sit: nexthop == NULL\n"); + net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } - addr6 = (struct in6_addr*)&neigh->primary_key; + addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if ((addr_type & IPV6_ADDR_UNICAST) && ipv6_addr_is_isatap(addr6)) dst = addr6->s6_addr32[3]; else + do_tx_error = true; + + neigh_release(neigh); + if (do_tx_error) goto tx_error; } if (!dst) - dst = try_6rd(&iph6->daddr, tunnel); + dst = try_6rd(tunnel, &iph6->daddr); if (!dst) { struct neighbour *neigh = NULL; + bool do_tx_error = false; if (skb_dst(skb)) - neigh = skb_dst(skb)->neighbour; + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (neigh == NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "sit: nexthop == NULL\n"); + net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } - addr6 = (struct in6_addr*)&neigh->primary_key; + addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { @@ -696,42 +878,43 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, addr_type = ipv6_addr_type(addr6); } - if ((addr_type & IPV6_ADDR_COMPATv4) == 0) - goto tx_error_icmp; + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) + dst = addr6->s6_addr32[3]; + else + do_tx_error = true; - dst = addr6->s6_addr32[3]; + neigh_release(neigh); + if (do_tx_error) + goto tx_error; } - { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .oif = tunnel->parms.link, - .proto = IPPROTO_IPV6 }; - if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; - goto tx_error_icmp; - } + rt = ip_route_output_ports(tunnel->net, &fl4, NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPV6, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; } if (rt->rt_type != RTN_UNICAST) { ip_rt_put(rt); - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } if (df) { - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - stats->collisions++; + dev->stats.collisions++; ip_rt_put(rt); goto tx_error; } @@ -742,10 +925,10 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (tunnel->parms.iph.daddr && skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + if (skb->len > mtu && !skb_is_gso(skb)) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } @@ -770,8 +953,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; - dev_kfree_skb(skb); + dev->stats.tx_dropped++; + kfree_skb(skb); return NETDEV_TX_OK; } if (skb->sk) @@ -780,70 +963,99 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb = new_skb; iph6 = ipv6_hdr(skb); } + ttl = tiph->ttl; + if (ttl == 0) + ttl = iph6->hop_limit; + tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - skb->transport_header = skb->network_header; - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags = 0; - skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT); + if (IS_ERR(skb)) { + ip_rt_put(rt); + goto out; + } - /* - * Push down and install the IPIP header. - */ + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, + IPPROTO_IPV6, tos, ttl, df, + !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return NETDEV_TX_OK; - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPV6; - iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); +out: + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} - if ((iph->ttl = tiph->ttl) == 0) - iph->ttl = iph6->hop_limit; +static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + const struct iphdr *tiph = &tunnel->parms.iph; - nf_reset(skb); + skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + if (IS_ERR(skb)) + goto out; - IPTUNNEL_XMIT(); + ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); return NETDEV_TX_OK; +out: + dev->stats.tx_errors++; + return NETDEV_TX_OK; +} + +static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + ipip_tunnel_xmit(skb, dev); + break; + case htons(ETH_P_IPV6): + ipip6_tunnel_xmit(skb, dev); + break; + default: + goto tx_err; + } -tx_error_icmp: - dst_link_failure(skb); -tx_error: - stats->tx_errors++; - dev_kfree_skb(skb); return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; + } static void ipip6_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; + const struct iphdr *iph; + struct flowi4 fl4; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .oif = tunnel->parms.link, - .proto = IPPROTO_IPV6 }; - struct rtable *rt; - if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4, + NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPV6, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); @@ -854,14 +1066,69 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) dev->iflink = tunnel->parms.link; } +static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) +{ + struct net *net = t->net; + struct sit_net *sitn = net_generic(net, sit_net_id); + + ipip6_tunnel_unlink(sitn, t); + synchronize_net(); + t->parms.iph.saddr = p->iph.saddr; + t->parms.iph.daddr = p->iph.daddr; + memcpy(t->dev->dev_addr, &p->iph.saddr, 4); + memcpy(t->dev->broadcast, &p->iph.daddr, 4); + ipip6_tunnel_link(sitn, t); + t->parms.iph.ttl = p->iph.ttl; + t->parms.iph.tos = p->iph.tos; + if (t->parms.link != p->link) { + t->parms.link = p->link; + ipip6_tunnel_bind_dev(t->dev); + } + ip_tunnel_dst_reset_all(t); + netdev_state_change(t->dev); +} + +#ifdef CONFIG_IPV6_SIT_6RD +static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, + struct ip_tunnel_6rd *ip6rd) +{ + struct in6_addr prefix; + __be32 relay_prefix; + + if (ip6rd->relay_prefixlen > 32 || + ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64) + return -EINVAL; + + ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen); + if (!ipv6_addr_equal(&prefix, &ip6rd->prefix)) + return -EINVAL; + if (ip6rd->relay_prefixlen) + relay_prefix = ip6rd->relay_prefix & + htonl(0xffffffffUL << + (32 - ip6rd->relay_prefixlen)); + else + relay_prefix = 0; + if (relay_prefix != ip6rd->relay_prefix) + return -EINVAL; + + t->ip6rd.prefix = prefix; + t->ip6rd.relay_prefix = relay_prefix; + t->ip6rd.prefixlen = ip6rd->prefixlen; + t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; + ip_tunnel_dst_reset_all(t); + netdev_state_change(t->dev); + return 0; +} +#endif + static int ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { int err = 0; struct ip_tunnel_parm p; struct ip_tunnel_prl prl; - struct ip_tunnel *t; - struct net *net = dev_net(dev); + struct ip_tunnel *t = netdev_priv(dev); + struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; @@ -872,16 +1139,15 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: #endif - t = NULL; if (dev == sitn->fb_tunnel_dev) { if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { err = -EFAULT; break; } t = ipip6_tunnel_locate(net, &p, 0); + if (t == NULL) + t = netdev_priv(dev); } - if (t == NULL) - t = netdev_priv(dev); err = -EFAULT; if (cmd == SIOCGETTUNNEL) { @@ -891,7 +1157,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) goto done; #ifdef CONFIG_IPV6_SIT_6RD } else { - ipv6_addr_copy(&ip6rd.prefix, &t->ip6rd.prefix); + ip6rd.prefix = t->ip6rd.prefix; ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; @@ -906,7 +1172,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; @@ -914,7 +1180,11 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) goto done; err = -EINVAL; - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + if (p.iph.protocol != IPPROTO_IPV6 && + p.iph.protocol != IPPROTO_IPIP && + p.iph.protocol != 0) + goto done; + if (p.iph.version != 4 || p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) goto done; if (p.iph.ttl) @@ -935,27 +1205,13 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } t = netdev_priv(dev); - ipip6_tunnel_unlink(sitn, t); - t->parms.iph.saddr = p.iph.saddr; - t->parms.iph.daddr = p.iph.daddr; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); - ipip6_tunnel_link(sitn, t); - netdev_state_change(dev); } + + ipip6_tunnel_update(t, &p); } if (t) { err = 0; - if (cmd == SIOCCHGTUNNEL) { - t->parms.iph.ttl = p.iph.ttl; - t->parms.iph.tos = p.iph.tos; - if (t->parms.link != p.link) { - t->parms.link = p.link; - ipip6_tunnel_bind_dev(dev); - netdev_state_change(dev); - } - } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; } else @@ -964,7 +1220,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == sitn->fb_tunnel_dev) { @@ -987,9 +1243,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EINVAL; if (dev == sitn->fb_tunnel_dev) goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); break; @@ -997,7 +1250,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCDELPRL: case SIOCCHGPRL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EINVAL; if (dev == sitn->fb_tunnel_dev) @@ -1005,9 +1258,6 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = -EFAULT; if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) goto done; - err = -ENOENT; - if (!(t = netdev_priv(dev))) - goto done; switch (cmd) { case SIOCDELPRL: @@ -1018,6 +1268,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } + ip_tunnel_dst_reset_all(t); netdev_state_change(dev); break; @@ -1026,7 +1277,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCCHG6RD: case SIOCDEL6RD: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; @@ -1034,34 +1285,10 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) sizeof(ip6rd))) goto done; - t = netdev_priv(dev); - if (cmd != SIOCDEL6RD) { - struct in6_addr prefix; - __be32 relay_prefix; - - err = -EINVAL; - if (ip6rd.relay_prefixlen > 32 || - ip6rd.prefixlen + (32 - ip6rd.relay_prefixlen) > 64) + err = ipip6_tunnel_update_6rd(t, &ip6rd); + if (err < 0) goto done; - - ipv6_addr_prefix(&prefix, &ip6rd.prefix, - ip6rd.prefixlen); - if (!ipv6_addr_equal(&prefix, &ip6rd.prefix)) - goto done; - if (ip6rd.relay_prefixlen) - relay_prefix = ip6rd.relay_prefix & - htonl(0xffffffffUL << - (32 - ip6rd.relay_prefixlen)); - else - relay_prefix = 0; - if (relay_prefix != ip6rd.relay_prefix) - goto done; - - ipv6_addr_copy(&t->ip6rd.prefix, &prefix); - t->ip6rd.relay_prefix = relay_prefix; - t->ip6rd.prefixlen = ip6rd.prefixlen; - t->ip6rd.relay_prefixlen = ip6rd.relay_prefixlen; } else ipip6_tunnel_clone_6rd(dev, sitn); @@ -1087,15 +1314,31 @@ static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) static const struct net_device_ops ipip6_netdev_ops = { .ndo_uninit = ipip6_tunnel_uninit, - .ndo_start_xmit = ipip6_tunnel_xmit, + .ndo_start_xmit = sit_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_change_mtu = ipip6_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, }; +static void ipip6_dev_free(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + free_percpu(tunnel->dst_cache); + free_percpu(dev->tstats); + free_netdev(dev); +} + +#define SIT_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) + static void ipip6_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip6_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); @@ -1104,23 +1347,36 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->iflink = 0; dev->addr_len = 4; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; + dev->features |= SIT_FEATURES; + dev->hw_features |= SIT_FEATURES; } -static void ipip6_tunnel_init(struct net_device *dev) +static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + tunnel->net = dev_net(dev); memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip6_tunnel_bind_dev(dev); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; + } + + return 0; } -static void ipip6_fb_tunnel_init(struct net_device *dev) +static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -1128,6 +1384,7 @@ static void ipip6_fb_tunnel_init(struct net_device *dev) struct sit_net *sitn = net_generic(net, sit_net_id); tunnel->dev = dev; + tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); iph->version = 4; @@ -1135,36 +1392,335 @@ static void ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; + } + dev_hold(dev); - sitn->tunnels_wc[0] = tunnel; + rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); + return 0; } -static struct xfrm_tunnel sit_handler = { +static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + u8 proto; + + if (!data || !data[IFLA_IPTUN_PROTO]) + return 0; + + proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (proto != IPPROTO_IPV6 && + proto != IPPROTO_IPIP && + proto != 0) + return -EINVAL; + + return 0; +} + +static void ipip6_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + memset(parms, 0, sizeof(*parms)); + + parms->iph.version = 4; + parms->iph.protocol = IPPROTO_IPV6; + parms->iph.ihl = 5; + parms->iph.ttl = 64; + + if (!data) + return; + + if (data[IFLA_IPTUN_LINK]) + parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); + + if (data[IFLA_IPTUN_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); + + if (data[IFLA_IPTUN_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); + + if (data[IFLA_IPTUN_TTL]) { + parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); + if (parms->iph.ttl) + parms->iph.frag_off = htons(IP_DF); + } + + if (data[IFLA_IPTUN_TOS]) + parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); + + if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) + parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_IPTUN_FLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + + if (data[IFLA_IPTUN_PROTO]) + parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + +} + +#ifdef CONFIG_IPV6_SIT_6RD +/* This function returns true when 6RD attributes are present in the nl msg */ +static bool ipip6_netlink_6rd_parms(struct nlattr *data[], + struct ip_tunnel_6rd *ip6rd) +{ + bool ret = false; + memset(ip6rd, 0, sizeof(*ip6rd)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_6RD_PREFIX]) { + ret = true; + nla_memcpy(&ip6rd->prefix, data[IFLA_IPTUN_6RD_PREFIX], + sizeof(struct in6_addr)); + } + + if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { + ret = true; + ip6rd->relay_prefix = + nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]); + } + + if (data[IFLA_IPTUN_6RD_PREFIXLEN]) { + ret = true; + ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]); + } + + if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) { + ret = true; + ip6rd->relay_prefixlen = + nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]); + } + + return ret; +} +#endif + +static int ipip6_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net *net = dev_net(dev); + struct ip_tunnel *nt; +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel_6rd ip6rd; +#endif + int err; + + nt = netdev_priv(dev); + ipip6_netlink_parms(data, &nt->parms); + + if (ipip6_tunnel_locate(net, &nt->parms, 0)) + return -EEXIST; + + err = ipip6_tunnel_create(dev); + if (err < 0) + return err; + +#ifdef CONFIG_IPV6_SIT_6RD + if (ipip6_netlink_6rd_parms(data, &ip6rd)) + err = ipip6_tunnel_update_6rd(nt, &ip6rd); +#endif + + return err; +} + +static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[]) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm p; + struct net *net = t->net; + struct sit_net *sitn = net_generic(net, sit_net_id); +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel_6rd ip6rd; +#endif + + if (dev == sitn->fb_tunnel_dev) + return -EINVAL; + + ipip6_netlink_parms(data, &p); + + if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) + return -EINVAL; + + t = ipip6_tunnel_locate(net, &p, 0); + + if (t) { + if (t->dev != dev) + return -EEXIST; + } else + t = netdev_priv(dev); + + ipip6_tunnel_update(t, &p); + +#ifdef CONFIG_IPV6_SIT_6RD + if (ipip6_netlink_6rd_parms(data, &ip6rd)) + return ipip6_tunnel_update_6rd(t, &ip6rd); +#endif + + return 0; +} + +static size_t ipip6_get_size(const struct net_device *dev) +{ + return + /* IFLA_IPTUN_LINK */ + nla_total_size(4) + + /* IFLA_IPTUN_LOCAL */ + nla_total_size(4) + + /* IFLA_IPTUN_REMOTE */ + nla_total_size(4) + + /* IFLA_IPTUN_TTL */ + nla_total_size(1) + + /* IFLA_IPTUN_TOS */ + nla_total_size(1) + + /* IFLA_IPTUN_PMTUDISC */ + nla_total_size(1) + + /* IFLA_IPTUN_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_PROTO */ + nla_total_size(1) + +#ifdef CONFIG_IPV6_SIT_6RD + /* IFLA_IPTUN_6RD_PREFIX */ + nla_total_size(sizeof(struct in6_addr)) + + /* IFLA_IPTUN_6RD_RELAY_PREFIX */ + nla_total_size(4) + + /* IFLA_IPTUN_6RD_PREFIXLEN */ + nla_total_size(2) + + /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ + nla_total_size(2) + +#endif + 0; +} + +static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_parm *parm = &tunnel->parms; + + if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || + nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || + nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || + nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || + nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || + nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, + !!(parm->iph.frag_off & htons(IP_DF))) || + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || + nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags)) + goto nla_put_failure; + +#ifdef CONFIG_IPV6_SIT_6RD + if (nla_put(skb, IFLA_IPTUN_6RD_PREFIX, sizeof(struct in6_addr), + &tunnel->ip6rd.prefix) || + nla_put_be32(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, + tunnel->ip6rd.relay_prefix) || + nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN, + tunnel->ip6rd.prefixlen) || + nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, + tunnel->ip6rd.relay_prefixlen)) + goto nla_put_failure; +#endif + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { + [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, + [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, + [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, + [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, + [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, + [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, +#ifdef CONFIG_IPV6_SIT_6RD + [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) }, + [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 }, + [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, + [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, +#endif +}; + +static void ipip6_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + + if (dev != sitn->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + +static struct rtnl_link_ops sit_link_ops __read_mostly = { + .kind = "sit", + .maxtype = IFLA_IPTUN_MAX, + .policy = ipip6_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = ipip6_tunnel_setup, + .validate = ipip6_validate, + .newlink = ipip6_newlink, + .changelink = ipip6_changelink, + .get_size = ipip6_get_size, + .fill_info = ipip6_fill_info, + .dellink = ipip6_dellink, +}; + +static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, }; -static void sit_destroy_tunnels(struct sit_net *sitn, struct list_head *head) +static struct xfrm_tunnel ipip_handler __read_mostly = { + .handler = ipip_rcv, + .err_handler = ipip6_err, + .priority = 2, +}; + +static void __net_exit sit_destroy_tunnels(struct net *net, + struct list_head *head) { + struct sit_net *sitn = net_generic(net, sit_net_id); + struct net_device *dev, *aux; int prio; + for_each_netdev_safe(net, dev, aux) + if (dev->rtnl_link_ops == &sit_link_ops) + unregister_netdevice_queue(dev, head); + for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = sitn->tunnels[prio][h]; + struct ip_tunnel *t; + t = rtnl_dereference(sitn->tunnels[prio][h]); while (t != NULL) { - unregister_netdevice_queue(t->dev, head); - t = t->next; + /* If dev is in the same netns, it has already + * been added to the list by the previous loop. + */ + if (!net_eq(dev_net(t->dev), net)) + unregister_netdevice_queue(t->dev, + head); + t = rtnl_dereference(t->next); } } } } -static int sit_init_net(struct net *net) +static int __net_init sit_init_net(struct net *net) { struct sit_net *sitn = net_generic(net, sit_net_id); + struct ip_tunnel *t; int err; sitn->tunnels[0] = sitn->tunnels_wc; @@ -1179,30 +1735,40 @@ static int sit_init_net(struct net *net) goto err_alloc_dev; } dev_net_set(sitn->fb_tunnel_dev, net); + sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops; + /* FB netdevice is special: we have one, and only one per netns. + * Allowing to move it to another netns is clearly unsafe. + */ + sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + + err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); + if (err) + goto err_dev_free; - ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); if ((err = register_netdev(sitn->fb_tunnel_dev))) goto err_reg_dev; + t = netdev_priv(sitn->fb_tunnel_dev); + + strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; err_reg_dev: dev_put(sitn->fb_tunnel_dev); - free_netdev(sitn->fb_tunnel_dev); +err_dev_free: + ipip6_dev_free(sitn->fb_tunnel_dev); err_alloc_dev: return err; } -static void sit_exit_net(struct net *net) +static void __net_exit sit_exit_net(struct net *net) { - struct sit_net *sitn = net_generic(net, sit_net_id); LIST_HEAD(list); rtnl_lock(); - sit_destroy_tunnels(sitn, &list); - unregister_netdevice_queue(sitn->fb_tunnel_dev, &list); + sit_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } @@ -1216,7 +1782,9 @@ static struct pernet_operations sit_net_ops = { static void __exit sit_cleanup(void) { + rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ @@ -1226,21 +1794,39 @@ static int __init sit_init(void) { int err; - printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); - - if (xfrm4_tunnel_register(&sit_handler, AF_INET6) < 0) { - printk(KERN_INFO "sit init: Can't add protocol\n"); - return -EAGAIN; - } + pr_info("IPv6 over IPv4 tunneling driver\n"); err = register_pernet_device(&sit_net_ops); if (err < 0) - xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + return err; + err = xfrm4_tunnel_register(&sit_handler, AF_INET6); + if (err < 0) { + pr_info("%s: can't register ip6ip4\n", __func__); + goto xfrm_tunnel_failed; + } + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + pr_info("%s: can't register ip4ip4\n", __func__); + goto xfrm_tunnel4_failed; + } + err = rtnl_link_register(&sit_link_ops); + if (err < 0) + goto rtnl_link_failed; +out: return err; + +rtnl_link_failed: + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +xfrm_tunnel4_failed: + xfrm4_tunnel_deregister(&sit_handler, AF_INET6); +xfrm_tunnel_failed: + unregister_pernet_device(&sit_net_ops); + goto out; } module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); -MODULE_ALIAS("sit0"); +MODULE_ALIAS_RTNL_LINK("sit"); +MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7208a06576c..a822b880689 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -21,42 +21,25 @@ #include <net/ipv6.h> #include <net/tcp.h> -extern int sysctl_tcp_syncookies; -extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; - #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -/* - * This table has to be sorted and terminated with (__u16)-1. - * XXX generate a better table. - * Unresolved Issues: HIPPI with a 64k MSS is not well supported. +static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS]; + +/* RFC 2460, Section 8.3: + * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] * - * Taken directly from ipv4 implementation. - * Should this list be modified for ipv6 use or is it close enough? - * rfc 2460 8.3 suggests mss values 20 bytes less than ipv4 counterpart + * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows + * using higher values than ipv4 tcp syncookies. + * The other values are chosen based on ethernet (1500 and 9k MTU), plus + * one that accounts for common encap (PPPoe) overhead. Table must be sorted. */ static __u16 const msstab[] = { - 64 - 1, - 256 - 1, - 512 - 1, - 536 - 1, - 1024 - 1, - 1440 - 1, - 1460 - 1, - 4312 - 1, - (__u16)-1 + 1280 - 60, /* IPV6_MIN_MTU - 60 */ + 1480 - 60, + 1500 - 60, + 9000 - 60, }; -/* The number doesn't include the -1 terminator */ -#define NUM_MSS (ARRAY_SIZE(msstab) - 1) - -/* - * This (misnamed) value is the age of syncookie which is permitted. - * Its ideal value should be dependent on TCP_TIMEOUT_INIT and - * sysctl_tcp_retries1. It's a rather complicated formula (exponential - * backoff) to compute at runtime so it's currently hardcoded here. - */ -#define COUNTER_TRIES 4 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -77,17 +60,21 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); -static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, +static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); + __u32 *tmp; + + net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); + + tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining - * 192 bits required for sha_transform, from the syncookie_secret + * 192 bits required for sha_transform, from the syncookie6_secret * and overwrite the digest with the secret */ - memcpy(tmp + 10, syncookie_secret[c], 44); + memcpy(tmp + 10, syncookie6_secret[c], 44); memcpy(tmp, saddr, 16); memcpy(tmp + 4, daddr, 16); tmp[8] = ((__force u32)sport << 16) + (__force u32)dport; @@ -97,27 +84,28 @@ static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, return tmp[17]; } -static __u32 secure_tcp_syn_cookie(struct in6_addr *saddr, struct in6_addr *daddr, +static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, + const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq, - __u32 count, __u32 data) + __u32 data) { + u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } -static __u32 check_tcp_syn_cookie(__u32 cookie, struct in6_addr *saddr, - struct in6_addr *daddr, __be16 sport, - __be16 dport, __u32 sseq, __u32 count, - __u32 maxdiff) +static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, + const struct in6_addr *daddr, __be16 sport, + __be16 dport, __u32 sseq) { - __u32 diff; + __u32 diff, count = tcp_cookie_time(); cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); - if (diff >= maxdiff) + if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - @@ -125,44 +113,49 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, struct in6_addr *saddr, & COOKIEMASK; } -__u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, + const struct tcphdr *th, __u16 *mssp) { - struct ipv6hdr *iph = ipv6_hdr(skb); - const struct tcphdr *th = tcp_hdr(skb); int mssind; const __u16 mss = *mssp; - tcp_synq_overflow(sk); - - for (mssind = 0; mss > msstab[mssind + 1]; mssind++) - ; - *mssp = msstab[mssind] + 1; + for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) + if (mss >= msstab[mssind]) + break; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + *mssp = msstab[mssind]; return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, - th->dest, ntohl(th->seq), - jiffies / (HZ * 60), mssind); + th->dest, ntohl(th->seq), mssind); } +EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); -static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) { - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); + + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return __cookie_v6_init_sequence(iph, th, mssp); +} + +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, + __u32 cookie) +{ __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, - th->source, th->dest, seq, - jiffies / (HZ * 60), COUNTER_TRIES); + th->source, th->dest, seq); - return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; + return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } +EXPORT_SYMBOL_GPL(__cookie_v6_check); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; - u8 *hash_location; struct inet_request_sock *ireq; - struct inet6_request_sock *ireq6; struct tcp_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -173,12 +166,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) int mss; struct dst_entry *dst; __u8 rcv_wscale; + bool ecn_ok = false; - if (!sysctl_tcp_syncookies || !th->ack) + if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk) || - (mss = cookie_check(skb, cookie)) == 0) { + (mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie)) == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } @@ -187,10 +181,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (tcp_opt.saw_tstamp) - cookie_check_timestamp(&tcp_opt); + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + goto out; ret = NULL; req = inet6_reqsk_alloc(&tcp6_request_sock_ops); @@ -198,39 +192,41 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ireq = inet_rsk(req); - ireq6 = inet6_rsk(req); treq = tcp_rsk(req); + treq->listener = NULL; if (security_inet_conn_request(sk, skb, req)) goto out_free; req->mss = mss; - ireq->rmt_port = th->source; - ireq->loc_port = th->dest; - ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); + ireq->ir_rmt_port = th->source; + ireq->ir_num = ntohs(th->dest); + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); - ireq6->pktopts = skb; + ireq->pktopts = skb; } - ireq6->iif = sk->sk_bound_dev_if; + ireq->ir_iif = sk->sk_bound_dev_if; /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && - ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq6->iif = inet6_iif(skb); + ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq->ir_iif = inet6_iif(skb); + + ireq->ir_mark = inet_request_mark(sk, skb); req->expires = 0UL; - req->retrans = 0; - ireq->ecn_ok = 0; + req->num_retrans = 0; + ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->rcv_wscale = tcp_opt.rcv_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; @@ -240,36 +236,29 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) * me if there is a preferred way. */ { - struct in6_addr *final_p = NULL, final; - struct flowi fl; - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; - fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_sk(sk)->inet_sport; - security_req_classify_flow(req, &fl); - if (ip6_dst_lookup(sk, &dst, &fl)) - goto out_free; - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) + struct in6_addr *final_p, final; + struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = ireq->ir_v6_rmt_addr; + final_p = fl6_update_dst(&fl6, np->opt, &final); + fl6.saddr = ireq->ir_v6_loc_addr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = ireq->ir_mark; + fl6.fl6_dport = ireq->ir_rmt_port; + fl6.fl6_sport = inet_sk(sk)->inet_sport; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) goto out_free; } req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - ireq->wscale_ok, &rcv_wscale); + ireq->wscale_ok, &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index c690736885b..058f3eca2e5 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -9,27 +9,38 @@ #include <linux/sysctl.h> #include <linux/in6.h> #include <linux/ipv6.h> +#include <linux/slab.h> +#include <linux/export.h> #include <net/ndisc.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/inet_frag.h> -static ctl_table ipv6_table_template[] = { +static struct ctl_table ipv6_table_template[] = { { - .procname = "route", - .maxlen = 0, - .mode = 0555, - .child = ipv6_route_table_template + .procname = "bindv6only", + .data = &init_net.ipv6.sysctl.bindv6only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, { - .procname = "icmp", - .maxlen = 0, - .mode = 0555, - .child = ipv6_icmp_table_template + .procname = "anycast_src_echo_reply", + .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, { - .procname = "bindv6only", - .data = &init_net.ipv6.sysctl.bindv6only, + .procname = "flowlabel_consistency", + .data = &init_net.ipv6.sysctl.flowlabel_consistency, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "fwmark_reflect", + .data = &init_net.ipv6.sysctl.fwmark_reflect, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -37,7 +48,7 @@ static ctl_table ipv6_table_template[] = { { } }; -static ctl_table ipv6_rotable[] = { +static struct ctl_table ipv6_rotable[] = { { .procname = "mld_max_msf", .data = &sysctl_mld_max_msf, @@ -48,14 +59,7 @@ static ctl_table ipv6_rotable[] = { { } }; -struct ctl_path net_ipv6_ctl_path[] = { - { .procname = "net", }, - { .procname = "ipv6", }, - { }, -}; -EXPORT_SYMBOL_GPL(net_ipv6_ctl_path); - -static int ipv6_sysctl_net_init(struct net *net) +static int __net_init ipv6_sysctl_net_init(struct net *net) { struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; @@ -67,28 +71,39 @@ static int ipv6_sysctl_net_init(struct net *net) GFP_KERNEL); if (!ipv6_table) goto out; + ipv6_table[0].data = &net->ipv6.sysctl.bindv6only; + ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply; + ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) goto out_ipv6_table; - ipv6_table[0].child = ipv6_route_table; ipv6_icmp_table = ipv6_icmp_sysctl_init(net); if (!ipv6_icmp_table) goto out_ipv6_route_table; - ipv6_table[1].child = ipv6_icmp_table; - - ipv6_table[2].data = &net->ipv6.sysctl.bindv6only; - net->ipv6.sysctl.table = register_net_sysctl_table(net, net_ipv6_ctl_path, - ipv6_table); - if (!net->ipv6.sysctl.table) + net->ipv6.sysctl.hdr = register_net_sysctl(net, "net/ipv6", ipv6_table); + if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; + net->ipv6.sysctl.route_hdr = + register_net_sysctl(net, "net/ipv6/route", ipv6_route_table); + if (!net->ipv6.sysctl.route_hdr) + goto out_unregister_ipv6_table; + + net->ipv6.sysctl.icmp_hdr = + register_net_sysctl(net, "net/ipv6/icmp", ipv6_icmp_table); + if (!net->ipv6.sysctl.icmp_hdr) + goto out_unregister_route_table; + err = 0; out: return err; - +out_unregister_route_table: + unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); +out_unregister_ipv6_table: + unregister_net_sysctl_table(net->ipv6.sysctl.hdr); out_ipv6_icmp_table: kfree(ipv6_icmp_table); out_ipv6_route_table: @@ -98,17 +113,19 @@ out_ipv6_table: goto out; } -static void ipv6_sysctl_net_exit(struct net *net) +static void __net_exit ipv6_sysctl_net_exit(struct net *net) { struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; - ipv6_table = net->ipv6.sysctl.table->ctl_table_arg; - ipv6_route_table = ipv6_table[0].child; - ipv6_icmp_table = ipv6_table[1].child; + ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; + ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; + ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->ipv6.sysctl.table); + unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr); + unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); + unregister_net_sysctl_table(net->ipv6.sysctl.hdr); kfree(ipv6_table); kfree(ipv6_route_table); @@ -126,7 +143,7 @@ int ipv6_sysctl_register(void) { int err = -ENOMEM; - ip6_header = register_net_sysctl_rotable(net_ipv6_ctl_path, ipv6_rotable); + ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable); if (ip6_header == NULL) goto out; @@ -146,19 +163,3 @@ void ipv6_sysctl_unregister(void) unregister_net_sysctl_table(ip6_header); unregister_pernet_subsys(&ipv6_sysctl_net_ops); } - -static struct ctl_table_header *ip6_base; - -int ipv6_static_sysctl_register(void) -{ - static struct ctl_table empty[1]; - ip6_base = register_sysctl_paths(net_ipv6_ctl_path, empty); - if (ip6_base == NULL) - return -ENOMEM; - return 0; -} - -void ipv6_static_sysctl_unregister(void) -{ - unregister_net_sysctl_table(ip6_base); -} diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index febfd595a40..229239ad96b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -38,7 +38,8 @@ #include <linux/jhash.h> #include <linux/ipsec.h> #include <linux/times.h> - +#include <linux/slab.h> +#include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> @@ -60,8 +61,9 @@ #include <net/timewait_sock.h> #include <net/netdma.h> #include <net/inet_common.h> - -#include <asm/uaccess.h> +#include <net/secure_seq.h> +#include <net/tcp_memcontrol.h> +#include <net/busy_poll.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -82,12 +84,24 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #else static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, - struct in6_addr *addr) + const struct in6_addr *addr) { return NULL; } #endif +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { @@ -101,15 +115,7 @@ static void tcp_v6_hash(struct sock *sk) } } -static __inline__ __sum16 tcp_v6_check(int len, - struct in6_addr *saddr, - struct in6_addr *daddr, - __wsum base) -{ - return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); -} - -static __u32 tcp_v6_init_sequence(struct sk_buff *skb) +static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) { return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, @@ -125,8 +131,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct in6_addr *saddr = NULL, *final_p = NULL, final; - struct flowi fl; + struct in6_addr *saddr = NULL, *final_p, final; + struct rt6_info *rt; + struct flowi6 fl6; struct dst_entry *dst; int addr_type; int err; @@ -135,19 +142,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, return -EINVAL; if (usin->sin6_family != AF_INET6) - return(-EAFNOSUPPORT); + return -EAFNOSUPPORT; - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (np->sndflow) { - fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl.fl6_flowlabel); - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl6.flowlabel); + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); fl6_sock_release(flowlabel); } } @@ -156,12 +162,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * connect() to INADDR_ANY means loopback (BSD'ism). */ - if(ipv6_addr_any(&usin->sin6_addr)) + if (ipv6_addr_any(&usin->sin6_addr)) usin->sin6_addr.s6_addr[15] = 0x1; addr_type = ipv6_addr_type(&usin->sin6_addr); - if(addr_type & IPV6_ADDR_MULTICAST) + if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { @@ -183,14 +189,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } if (tp->rx_opt.ts_recent_stamp && - !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) { + !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; tp->write_seq = 0; } - ipv6_addr_copy(&np->daddr, &usin->sin6_addr); - np->flow_label = fl.fl6_flowlabel; + sk->sk_v6_daddr = usin->sin6_addr; + np->flow_label = fl6.flowlabel; /* * TCP over IPv4 @@ -228,59 +234,51 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } else { ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, - &np->rcv_saddr); + &sk->sk_v6_rcv_saddr); } return err; } - if (!ipv6_addr_any(&np->rcv_saddr)) - saddr = &np->rcv_saddr; - - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, - (saddr ? saddr : &np->saddr)); - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; - fl.fl_ip_dport = usin->sin6_port; - fl.fl_ip_sport = inet->inet_sport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + saddr = &sk->sk_v6_rcv_saddr; - security_sk_classify_flow(sk, &fl); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = sk->sk_v6_daddr; + fl6.saddr = saddr ? *saddr : np->saddr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = usin->sin6_port; + fl6.fl6_sport = inet->inet_sport; - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) + final_p = fl6_update_dst(&fl6, np->opt, &final); + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); goto failure; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); - if (err < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto failure; } if (saddr == NULL) { - saddr = &fl.fl6_src; - ipv6_addr_copy(&np->rcv_saddr, saddr); + saddr = &fl6.saddr; + sk->sk_v6_rcv_saddr = *saddr; } /* set the source address */ - ipv6_addr_copy(&np->saddr, saddr); + np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(sk, dst, NULL, NULL); + rt = (struct rt6_info *) dst; + if (tcp_death_row.sysctl_tw_recycle && + !tp->rx_opt.ts_recent_stamp && + ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr)) + tcp_fetch_timewait_stamp(sk, dst); + icsk->icsk_ext_hdr_len = 0; if (np->opt) icsk->icsk_ext_hdr_len = (np->opt->opt_flen + @@ -295,9 +293,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) goto late_failure; - if (!tp->write_seq) + if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, - np->daddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport); @@ -316,16 +314,34 @@ failure: return err; } +static void tcp_v6_mtu_reduced(struct sock *sk) +{ + struct dst_entry *dst; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + return; + + dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); + if (!dst) + return; + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } +} + static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; struct sock *sk; int err; struct tcp_sock *tp; - __u32 seq; + struct request_sock *fastopen; + __u32 seq, snd_una; struct net *net = dev_net(skb->dev); sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, @@ -343,69 +359,55 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } bh_lock_sock(sk); - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; + if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + tp = tcp_sk(sk); seq = ntohl(th->seq); + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ + fastopen = tp->fastopen_rsk; + snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && - !between(seq, tp->snd_una, tp->snd_nxt)) { + !between(seq, snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = inet6_sk(sk); - if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst = NULL; + if (type == NDISC_REDIRECT) { + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) - goto out; - - /* icmp should have updated the destination cache entry */ - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct flowi fl; - - /* BUGGG_FUTURE: Again, it is not clear how - to handle rthdr case. Ignore this complexity - for now. - */ - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; - fl.fl_ip_dport = inet->inet_dport; - fl.fl_ip_sport = inet->inet_sport; - security_skb_classify_flow(skb, &fl); - - if ((err = ip6_dst_lookup(sk, &dst, &fl))) { - sk->sk_err_soft = -err; - goto out; - } + if (dst) + dst->ops->redirect(dst, sk, skb); + goto out; + } - if ((err = xfrm_lookup(net, &dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - goto out; - } + if (type == ICMPV6_PKT_TOOBIG) { + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; - } else - dst_hold(dst); + if (!ip6_sk_accept_pmtu(sk)) + goto out; - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { - tcp_sync_mss(sk, dst_mtu(dst)); - tcp_simple_retransmit(sk); - } /* else let the usual retransmit timer handle it */ - dst_release(dst); + tp->mtu_info = ntohl(info); + if (!sock_owned_by_user(sk)) + tcp_v6_mtu_reduced(sk); + else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, + &tp->tsq_flags)) + sock_hold(sk); goto out; } @@ -434,11 +436,17 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } inet_csk_reqsk_queue_drop(sk, req, prev); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); goto out; case TCP_SYN_SENT: - case TCP_SYN_RECV: /* Cannot happen. - It can, it SYNs are crossed. --ANK */ + case TCP_SYN_RECV: + /* Only in fast or simultaneous open. If a fast open socket is + * is already accepted it is treated as a connected one below. + */ + if (fastopen && fastopen->sk == NULL) + break; + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -461,244 +469,82 @@ out: } -static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, - struct request_values *rvp) +static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, + struct flowi6 *fl6, + struct request_sock *req, + u16 queue_mapping, + struct tcp_fastopen_cookie *foc) { - struct inet6_request_sock *treq = inet6_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff * skb; - struct ipv6_txoptions *opt = NULL; - struct in6_addr * final_p = NULL, final; - struct flowi fl; - struct dst_entry *dst; - int err = -1; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr); - ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr); - fl.fl6_flowlabel = 0; - fl.oif = treq->iif; - fl.mark = sk->sk_mark; - fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_rsk(req)->loc_port; - security_req_classify_flow(req, &fl); - - opt = np->opt; - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } + struct sk_buff *skb; + int err = -ENOMEM; - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) - goto done; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) + /* First, grab a route. */ + if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, rvp); + skb = tcp_make_synack(sk, dst, req, foc); + if (skb) { - struct tcphdr *th = tcp_hdr(skb); + __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, + &ireq->ir_v6_rmt_addr); - th->check = tcp_v6_check(skb->len, - &treq->loc_addr, &treq->rmt_addr, - csum_partial(th, skb->len, skb->csum)); + fl6->daddr = ireq->ir_v6_rmt_addr; + if (np->repflow && (ireq->pktopts != NULL)) + fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr); - err = ip6_xmit(sk, skb, &fl, opt, 0); + skb_set_queue_mapping(skb, queue_mapping); + err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); err = net_xmit_eval(err); } done: - if (opt && opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); - dst_release(dst); return err; } -static inline void syn_flood_warning(struct sk_buff *skb) +static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) { -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) - printk(KERN_INFO - "TCPv6: Possible SYN flooding on port %d. " - "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest)); - else -#endif - printk(KERN_INFO - "TCPv6: Possible SYN flooding on port %d. " - "Dropping request.\n", ntohs(tcp_hdr(skb)->dest)); + struct flowi6 fl6; + int res; + + res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL); + if (!res) { + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + } + return res; } static void tcp_v6_reqsk_destructor(struct request_sock *req) { - kfree_skb(inet6_rsk(req)->pktopts); + kfree_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, - struct in6_addr *addr) + const struct in6_addr *addr) { - struct tcp_sock *tp = tcp_sk(sk); - int i; - - BUG_ON(tp == NULL); - - if (!tp->md5sig_info || !tp->md5sig_info->entries6) - return NULL; - - for (i = 0; i < tp->md5sig_info->entries6; i++) { - if (ipv6_addr_equal(&tp->md5sig_info->keys6[i].addr, addr)) - return &tp->md5sig_info->keys6[i].base; - } - return NULL; + return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, struct sock *addr_sk) { - return tcp_v6_md5_do_lookup(sk, &inet6_sk(addr_sk)->daddr); + return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr); } static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk, struct request_sock *req) { - return tcp_v6_md5_do_lookup(sk, &inet6_rsk(req)->rmt_addr); + return tcp_v6_md5_do_lookup(sk, &inet_rsk(req)->ir_v6_rmt_addr); } -static int tcp_v6_md5_do_add(struct sock *sk, struct in6_addr *peer, - char *newkey, u8 newkeylen) -{ - /* Add key to the list */ - struct tcp_md5sig_key *key; - struct tcp_sock *tp = tcp_sk(sk); - struct tcp6_md5sig_key *keys; - - key = tcp_v6_md5_do_lookup(sk, peer); - if (key) { - /* modify existing entry - just update that one */ - kfree(key->key); - key->key = newkey; - key->keylen = newkeylen; - } else { - /* reallocate new list if current one is full. */ - if (!tp->md5sig_info) { - tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC); - if (!tp->md5sig_info) { - kfree(newkey); - return -ENOMEM; - } - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - } - if (tcp_alloc_md5sig_pool(sk) == NULL) { - kfree(newkey); - return -ENOMEM; - } - if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) { - keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) * - (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC); - - if (!keys) { - tcp_free_md5sig_pool(); - kfree(newkey); - return -ENOMEM; - } - - if (tp->md5sig_info->entries6) - memmove(keys, tp->md5sig_info->keys6, - (sizeof (tp->md5sig_info->keys6[0]) * - tp->md5sig_info->entries6)); - - kfree(tp->md5sig_info->keys6); - tp->md5sig_info->keys6 = keys; - tp->md5sig_info->alloced6++; - } - - ipv6_addr_copy(&tp->md5sig_info->keys6[tp->md5sig_info->entries6].addr, - peer); - tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.key = newkey; - tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.keylen = newkeylen; - - tp->md5sig_info->entries6++; - } - return 0; -} - -static int tcp_v6_md5_add_func(struct sock *sk, struct sock *addr_sk, - u8 *newkey, __u8 newkeylen) -{ - return tcp_v6_md5_do_add(sk, &inet6_sk(addr_sk)->daddr, - newkey, newkeylen); -} - -static int tcp_v6_md5_do_del(struct sock *sk, struct in6_addr *peer) -{ - struct tcp_sock *tp = tcp_sk(sk); - int i; - - for (i = 0; i < tp->md5sig_info->entries6; i++) { - if (ipv6_addr_equal(&tp->md5sig_info->keys6[i].addr, peer)) { - /* Free the key */ - kfree(tp->md5sig_info->keys6[i].base.key); - tp->md5sig_info->entries6--; - - if (tp->md5sig_info->entries6 == 0) { - kfree(tp->md5sig_info->keys6); - tp->md5sig_info->keys6 = NULL; - tp->md5sig_info->alloced6 = 0; - } else { - /* shrink the database */ - if (tp->md5sig_info->entries6 != i) - memmove(&tp->md5sig_info->keys6[i], - &tp->md5sig_info->keys6[i+1], - (tp->md5sig_info->entries6 - i) - * sizeof (tp->md5sig_info->keys6[0])); - } - tcp_free_md5sig_pool(); - return 0; - } - } - return -ENOENT; -} - -static void tcp_v6_clear_md5_list (struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - int i; - - if (tp->md5sig_info->entries6) { - for (i = 0; i < tp->md5sig_info->entries6; i++) - kfree(tp->md5sig_info->keys6[i].base.key); - tp->md5sig_info->entries6 = 0; - tcp_free_md5sig_pool(); - } - - kfree(tp->md5sig_info->keys6); - tp->md5sig_info->keys6 = NULL; - tp->md5sig_info->alloced6 = 0; - - if (tp->md5sig_info->entries4) { - for (i = 0; i < tp->md5sig_info->entries4; i++) - kfree(tp->md5sig_info->keys4[i].base.key); - tp->md5sig_info->entries4 = 0; - tcp_free_md5sig_pool(); - } - - kfree(tp->md5sig_info->keys4); - tp->md5sig_info->keys4 = NULL; - tp->md5sig_info->alloced4 = 0; -} - -static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, - int optlen) +static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval, + int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; - u8 *newkey; if (optlen < sizeof(cmd)) return -EINVAL; @@ -710,49 +556,35 @@ static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, return -EINVAL; if (!cmd.tcpm_keylen) { - if (!tcp_sk(sk)->md5sig_info) - return -ENOENT; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - return tcp_v4_md5_do_del(sk, sin6->sin6_addr.s6_addr32[3]); - return tcp_v6_md5_do_del(sk, &sin6->sin6_addr); + return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], + AF_INET); + return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, + AF_INET6); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - if (!tcp_sk(sk)->md5sig_info) { - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_md5sig_info *p; - - p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL); - if (!p) - return -ENOMEM; + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], + AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); - tp->md5sig_info = p; - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - } - - newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); - if (!newkey) - return -ENOMEM; - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { - return tcp_v4_md5_do_add(sk, sin6->sin6_addr.s6_addr32[3], - newkey, cmd.tcpm_keylen); - } - return tcp_v6_md5_do_add(sk, &sin6->sin6_addr, newkey, cmd.tcpm_keylen); + return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, + AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, - struct in6_addr *daddr, - struct in6_addr *saddr, int nbytes) + const struct in6_addr *daddr, + const struct in6_addr *saddr, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; bp = &hp->md5_blk.ip6; /* 1. TCP pseudo-header (RFC2460) */ - ipv6_addr_copy(&bp->saddr, saddr); - ipv6_addr_copy(&bp->daddr, daddr); + bp->saddr = *saddr; + bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); @@ -761,8 +593,8 @@ static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, } static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, - struct in6_addr *daddr, struct in6_addr *saddr, - struct tcphdr *th) + const struct in6_addr *daddr, struct in6_addr *saddr, + const struct tcphdr *th) { struct tcp_md5sig_pool *hp; struct hash_desc *desc; @@ -794,22 +626,23 @@ clear_hash_noput: } static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, - struct sock *sk, struct request_sock *req, - struct sk_buff *skb) + const struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - struct in6_addr *saddr, *daddr; + const struct in6_addr *saddr, *daddr; struct tcp_md5sig_pool *hp; struct hash_desc *desc; - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); if (sk) { saddr = &inet6_sk(sk)->saddr; - daddr = &inet6_sk(sk)->daddr; + daddr = &sk->sk_v6_daddr; } else if (req) { - saddr = &inet6_rsk(req)->loc_addr; - daddr = &inet6_rsk(req)->rmt_addr; + saddr = &inet_rsk(req)->ir_v6_loc_addr; + daddr = &inet_rsk(req)->ir_v6_rmt_addr; } else { - struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; daddr = &ip6h->daddr; } @@ -843,12 +676,12 @@ clear_hash_noput: return 1; } -static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb) +static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) { - __u8 *hash_location = NULL; + const __u8 *hash_location = NULL; struct tcp_md5sig_key *hash_expected; - struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct tcphdr *th = tcp_hdr(skb); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); int genhash; u8 newhash[16]; @@ -875,12 +708,10 @@ static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb) NULL, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { - if (net_ratelimit()) { - printk(KERN_INFO "MD5 Hash %s for (%pI6, %u)->(%pI6, %u)\n", - genhash ? "failed" : "mismatch", - &ip6h->saddr, ntohs(th->source), - &ip6h->daddr, ntohs(th->dest)); - } + net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", + genhash ? "failed" : "mismatch", + &ip6h->saddr, ntohs(th->source), + &ip6h->daddr, ntohs(th->dest)); return 1; } return 0; @@ -890,10 +721,11 @@ static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb) struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_v6_send_synack, + .rtx_syn_ack = tcp_v6_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, - .send_reset = tcp_v6_send_reset + .send_reset = tcp_v6_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, }; #ifdef CONFIG_TCP_MD5SIG @@ -903,95 +735,22 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { }; #endif -static struct timewait_sock_ops tcp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, - .twsk_destructor= tcp_twsk_destructor, -}; - -static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcphdr *th = tcp_hdr(skb); - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, - csum_partial(th, th->doff<<2, - skb->csum)); - } -} - -static int tcp_v6_gso_send_check(struct sk_buff *skb) -{ - struct ipv6hdr *ipv6h; - struct tcphdr *th; - - if (!pskb_may_pull(skb, sizeof(*th))) - return -EINVAL; - - ipv6h = ipv6_hdr(skb); - th = tcp_hdr(skb); - - th->check = 0; - th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, - IPPROTO_TCP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - return 0; -} - -static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - struct ipv6hdr *iph = skb_gro_network_header(skb); - - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, - skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - } - - /* fall through */ - case CHECKSUM_NONE: - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } - - return tcp_gro_receive(head, skb); -} - -static int tcp6_gro_complete(struct sk_buff *skb) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - struct tcphdr *th = tcp_hdr(skb); - - th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb), - &iph->saddr, &iph->daddr, 0); - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; - - return tcp_gro_complete(skb); -} - static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, - u32 ts, struct tcp_md5sig_key *key, int rst) + u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, int rst, u8 tclass, + u32 label) { - struct tcphdr *th = tcp_hdr(skb), *t1; + const struct tcphdr *th = tcp_hdr(skb); + struct tcphdr *t1; struct sk_buff *buff; - struct flowi fl; + struct flowi6 fl6; struct net *net = dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); struct dst_entry *dst; __be32 *topt; - if (ts) + if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG if (key) @@ -1006,7 +765,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); t1 = (struct tcphdr *) skb_push(buff, tot_len); - skb_reset_transport_header(skb); + skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); @@ -1021,11 +780,11 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, topt = (__be32 *)(t1 + 1); - if (ts) { + if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *topt++ = htonl(tcp_time_stamp); - *topt++ = htonl(ts); + *topt++ = htonl(tsval); + *topt++ = htonl(tsecr); } #ifdef CONFIG_TCP_MD5SIG @@ -1038,35 +797,38 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, } #endif - buff->csum = csum_partial(t1, tot_len, 0); + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = ipv6_hdr(skb)->saddr; + fl6.saddr = ipv6_hdr(skb)->daddr; + fl6.flowlabel = label; - memset(&fl, 0, sizeof(fl)); - ipv6_addr_copy(&fl.fl6_dst, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&fl.fl6_src, &ipv6_hdr(skb)->daddr); + buff->ip_summed = CHECKSUM_PARTIAL; + buff->csum = 0; - t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, - tot_len, IPPROTO_TCP, - buff->csum); + __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); - fl.proto = IPPROTO_TCP; - fl.oif = inet6_iif(skb); - fl.fl_ip_dport = t1->dest; - fl.fl_ip_sport = t1->source; - security_skb_classify_flow(skb, &fl); + fl6.flowi6_proto = IPPROTO_TCP; + if (rt6_need_strict(&fl6.daddr) && !oif) + fl6.flowi6_oif = inet6_iif(skb); + else + fl6.flowi6_oif = oif; + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); + fl6.fl6_dport = t1->dest; + fl6.fl6_sport = t1->source; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ - if (!ip6_dst_lookup(ctl_sk, &dst, &fl)) { - if (xfrm_lookup(net, &dst, &fl, NULL, 0) >= 0) { - skb_dst_set(buff, dst); - ip6_xmit(ctl_sk, buff, &fl, NULL, 0); - TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); - if (rst) - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); - return; - } + dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); + if (!IS_ERR(dst)) { + skb_dst_set(buff, dst); + ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + if (rst) + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + return; } kfree_skb(buff); @@ -1074,9 +836,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); u32 seq = 0, ack_seq = 0; struct tcp_md5sig_key *key = NULL; +#ifdef CONFIG_TCP_MD5SIG + const __u8 *hash_location = NULL; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + unsigned char newhash[16]; + int genhash; + struct sock *sk1 = NULL; +#endif + int oif; if (th->rst) return; @@ -1085,8 +855,33 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) return; #ifdef CONFIG_TCP_MD5SIG - if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); + hash_location = tcp_parse_md5sig_option(th); + if (!sk && hash_location) { + /* + * active side is lost. Try to find listening socket through + * source port, and then find md5 key through listening socket. + * we are not loose security here: + * Incoming packet is checked with md5 hash with finding key, + * no RST generated if md5 hash doesn't match. + */ + sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + &tcp_hashinfo, &ipv6h->saddr, + th->source, &ipv6h->daddr, + ntohs(th->source), inet6_iif(skb)); + if (!sk1) + return; + + rcu_read_lock(); + key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); + if (!key) + goto release_sk1; + + genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, NULL, skb); + if (genhash || memcmp(hash_location, newhash, 16) != 0) + goto release_sk1; + } else { + key = sk ? tcp_v6_md5_do_lookup(sk, &ipv6h->saddr) : NULL; + } #endif if (th->ack) @@ -1095,13 +890,25 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); - tcp_v6_send_response(skb, seq, ack_seq, 0, 0, key, 1); + oif = sk ? sk->sk_bound_dev_if : 0; + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + +#ifdef CONFIG_TCP_MD5SIG +release_sk1: + if (sk1) { + rcu_read_unlock(); + sock_put(sk1); + } +#endif } -static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, - struct tcp_md5sig_key *key) +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 tsval, u32 tsecr, int oif, + struct tcp_md5sig_key *key, u8 tclass, + u32 label) { - tcp_v6_send_response(skb, seq, ack, win, ts, key, 0); + tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass, + label); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1111,7 +918,9 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw)); + tcp_time_stamp + tcptw->tw_ts_offset, + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), + tw->tw_tclass, (tw->tw_flowlabel << 12)); inet_twsk_put(tw); } @@ -1119,12 +928,19 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { - tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr)); + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV + * sk->sk_state == TCP_SYN_RECV -> for Fast Open. + */ + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, + tcp_rsk(req)->rcv_nxt, + req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + 0, 0); } -static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) { struct request_sock *req, **prev; const struct tcphdr *th = tcp_hdr(skb); @@ -1135,7 +951,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, inet6_iif(skb)); if (req) - return tcp_check_req(sk, skb, req, prev); + return tcp_check_req(sk, skb, req, prev, false); nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, &ipv6_hdr(skb)->saddr, th->source, @@ -1151,7 +967,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) } #ifdef CONFIG_SYN_COOKIES - if (!th->rst && !th->syn && th->ack) + if (!th->syn) sk = cookie_v6_check(sk, skb); #endif return sk; @@ -1162,19 +978,17 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; - u8 *hash_location; struct request_sock *req; - struct inet6_request_sock *treq; + struct inet_request_sock *ireq; struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 isn = TCP_SKB_CB(skb)->when; -#ifdef CONFIG_SYN_COOKIES - int want_cookie = 0; -#else -#define want_cookie 0 -#endif + struct dst_entry *dst = NULL; + struct tcp_fastopen_cookie foc = { .len = -1 }; + bool want_cookie = false, fastopen; + struct flowi6 fl6; + int err; if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); @@ -1182,19 +996,17 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (!ipv6_unicast_destination(skb)) goto drop; - if (inet_csk_reqsk_queue_is_full(sk) && !isn) { - if (net_ratelimit()) - syn_flood_warning(skb); -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) - want_cookie = 1; - else -#endif - goto drop; + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { + want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); + if (!want_cookie) + goto drop; } - if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; + } req = inet6_reqsk_alloc(&tcp6_request_sock_ops); if (req == NULL) @@ -1207,52 +1019,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0); - - if (tmp_opt.cookie_plus > 0 && - tmp_opt.saw_tstamp && - !tp->rx_opt.cookie_out_never && - (sysctl_tcp_cookie_size > 0 || - (tp->cookie_values != NULL && - tp->cookie_values->cookie_desired > 0))) { - u8 *c; - u32 *d; - u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; - int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; - - if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) - goto drop_and_free; - - /* Secret recipe starts with IP addresses */ - d = &ipv6_hdr(skb)->daddr.s6_addr32[0]; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - d = &ipv6_hdr(skb)->saddr.s6_addr32[0]; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - - /* plus variable length Initiator Cookie */ - c = (u8 *)mess; - while (l-- > 0) - *c++ ^= *hash_location++; - -#ifdef CONFIG_SYN_COOKIES - want_cookie = 0; /* not our kind of cookie */ -#endif - tmp_ext.cookie_out_never = 0; /* false */ - tmp_ext.cookie_plus = tmp_opt.cookie_plus; - } else if (!tp->rx_opt.cookie_in_always) { - /* redundant indications, but ensure initialization. */ - tmp_ext.cookie_out_never = 1; /* true */ - tmp_ext.cookie_plus = 0; - } else { - goto drop_and_free; - } - tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1260,63 +1027,118 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); - treq = inet6_rsk(req); - ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr); - ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr); - if (!want_cookie) - TCP_ECN_create_request(req, tcp_hdr(skb)); + ireq = inet_rsk(req); + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, skb, sock_net(sk)); + + ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_mark = inet_request_mark(sk, skb); - if (want_cookie) { - isn = cookie_v6_init_sequence(sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq->ir_iif = inet6_iif(skb); + + if (!isn) { if (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || + np->repflow) { atomic_inc(&skb->users); - treq->pktopts = skb; + ireq->pktopts = skb; + } + + if (want_cookie) { + isn = cookie_v6_init_sequence(sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + goto have_isn; } - treq->iif = sk->sk_bound_dev_if; - /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && - ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) - treq->iif = inet6_iif(skb); + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tmp_opt.saw_tstamp && + tcp_death_row.sysctl_tw_recycle && + (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { + if (!tcp_peer_is_proven(req, dst, true)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", + &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source)); + goto drop_and_release; + } isn = tcp_v6_init_sequence(skb); } - tcp_rsk(req)->snt_isn = isn; +have_isn: - security_inet_conn_request(sk, skb, req); + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_release; - if (tcp_v6_send_synack(sk, req, - (struct request_values *)&tmp_ext) || - want_cookie) + if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL) goto drop_and_free; - inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = tcp_v6_send_synack(sk, dst, &fl6, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } return 0; +drop_and_release: + dst_release(dst); drop_and_free: reqsk_free(req); drop: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return 0; /* don't send reset */ } -static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { - struct inet6_request_sock *treq; + struct inet_request_sock *ireq; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; - struct ipv6_txoptions *opt; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* @@ -1337,11 +1159,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr); + ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr); ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); - ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + newsk->sk_v6_rcv_saddr = newnp->saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -1349,10 +1171,15 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); + if (np->repflow) + newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count @@ -1369,45 +1196,20 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; } - treq = inet6_rsk(req); - opt = np->opt; + ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto out_overflow; - if (dst == NULL) { - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr); - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr); - fl.oif = sk->sk_bound_dev_if; - fl.mark = sk->sk_mark; - fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_rsk(req)->loc_port; - security_req_classify_flow(req, &fl); - - if (ip6_dst_lookup(sk, &dst, &fl)) - goto out; - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) + if (!dst) { + dst = inet6_csk_route_req(sk, &fl6, req); + if (!dst) goto out; } newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto out; + goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -1417,6 +1219,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(newsk, dst, NULL, NULL); + inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; @@ -1427,16 +1230,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr); - ipv6_addr_copy(&newnp->saddr, &treq->loc_addr); - ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr); - newsk->sk_bound_dev_if = treq->iif; + newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; + newnp->saddr = ireq->ir_v6_loc_addr; + newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; + newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ - newinet->opt = NULL; + newinet->inet_opt = NULL; + newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; /* Clone RX bits */ @@ -1444,16 +1248,20 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Clone pktoptions received with SYN */ newnp->pktoptions = NULL; - if (treq->pktopts != NULL) { - newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); - kfree_skb(treq->pktopts); - treq->pktopts = NULL; + if (ireq->pktopts != NULL) { + newnp->pktoptions = skb_clone(ireq->pktopts, + sk_gfp_atomic(sk, GFP_ATOMIC)); + consume_skb(ireq->pktopts); + ireq->pktopts = NULL; if (newnp->pktoptions) skb_set_owner_r(newnp->pktoptions, newsk); } newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); + if (np->repflow) + newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Clone native IPv6 options from listening socket (if any) @@ -1461,20 +1269,20 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, but we make one more one thing there: reattach optmem to newsk. */ - if (opt) { - newnp->opt = ipv6_dup_options(newsk, opt); - if (opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); - } + if (np->opt) + newnp->opt = ipv6_dup_options(newsk, np->opt); inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + newnp->opt->opt_flen); - tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + tcp_initialize_rcv_mss(newsk); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; @@ -1482,54 +1290,37 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ - if ((key = tcp_v6_md5_do_lookup(sk, &newnp->daddr)) != NULL) { + key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr); + if (key != NULL) { /* We're using one, so create a matching key * on the newsk structure. If we fail to get * memory, then we end up not copying the key * across. Shucks. */ - char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); - if (newkey != NULL) - tcp_v6_md5_do_add(newsk, &newnp->daddr, - newkey, key->keylen); + tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, + AF_INET6, key->key, key->keylen, + sk_gfp_atomic(sk, GFP_ATOMIC)); } #endif + if (__inet_inherit_port(sk, newsk) < 0) { + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto out; + } __inet6_hash(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; out_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + dst_release(dst); out: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - if (opt && opt != np->opt) - sock_kfree_s(sk, opt, opt->tot_len); - dst_release(dst); return NULL; } -static __sum16 tcp_v6_checksum_init(struct sk_buff *skb) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) { - if (!tcp_v6_check(skb->len, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, skb->csum)) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - return 0; - } - } - - skb->csum = ~csum_unfold(tcp_v6_check(skb->len, - &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, 0)); - - if (skb->len <= 76) { - return __skb_checksum_complete(skb); - } - return 0; -} - /* The socket must have it's spinlock held when we get * here. * @@ -1556,7 +1347,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return tcp_v4_do_rcv(sk, skb); #ifdef CONFIG_TCP_MD5SIG - if (tcp_v6_inbound_md5_hash (sk, skb)) + if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard; #endif @@ -1582,13 +1373,21 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - TCP_CHECK_TIMER(sk); - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) - goto reset; - TCP_CHECK_TIMER(sk); + struct dst_entry *dst = sk->sk_rx_dst; + + sock_rps_save_rxhash(sk, skb); + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + + tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); if (opt_skb) goto ipv6_pktoptions; return 0; @@ -1607,19 +1406,19 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * otherwise we just shortcircuit this and continue with * the new socket.. */ - if(nsk != sk) { + if (nsk != sk) { + sock_rps_save_rxhash(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) __kfree_skb(opt_skb); return 0; } - } + } else + sock_rps_save_rxhash(sk, skb); - TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) goto reset; - TCP_CHECK_TIMER(sk); if (opt_skb) goto ipv6_pktoptions; return 0; @@ -1632,6 +1431,7 @@ discard: kfree_skb(skb); return 0; csum_err: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1651,6 +1451,10 @@ ipv6_pktoptions: np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) + np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); + if (np->repflow) + np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); @@ -1666,7 +1470,8 @@ ipv6_pktoptions: static int tcp_v6_rcv(struct sk_buff *skb) { - struct tcphdr *th; + const struct tcphdr *th; + const struct ipv6hdr *hdr; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); @@ -1689,16 +1494,17 @@ static int tcp_v6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; - if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb)) - goto bad_packet; + if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) + goto csum_error; th = tcp_hdr(skb); + hdr = ipv6_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; - TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(ipv6_hdr(skb)); + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); @@ -1709,12 +1515,18 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_napi_id(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); @@ -1723,7 +1535,7 @@ process: #ifdef CONFIG_NET_DMA struct tcp_sock *tp = tcp_sk(sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); + tp->ucopy.dma_chan = net_dma_find_channel(); if (tp->ucopy.dma_chan) ret = tcp_v6_do_rcv(sk, skb); else @@ -1732,8 +1544,12 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); } - } else - sk_add_backlog(sk, skb); + } else if (unlikely(sk_add_backlog(sk, skb, + sk->sk_rcvbuf + sk->sk_sndbuf))) { + bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + goto discard_and_relse; + } bh_unlock_sock(sk); sock_put(sk); @@ -1744,6 +1560,8 @@ no_tcp_socket: goto discard_it; if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +csum_error: + TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); bad_packet: TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { @@ -1751,11 +1569,6 @@ bad_packet: } discard_it: - - /* - * Discard frame - */ - kfree_skb(skb); return 0; @@ -1769,10 +1582,13 @@ do_time_wait: goto discard_it; } - if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + if (skb->len < (th->doff<<2)) { inet_twsk_put(inet_twsk(sk)); - goto discard_it; + goto bad_packet; + } + if (tcp_checksum_complete(skb)) { + inet_twsk_put(inet_twsk(sk)); + goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { @@ -1781,6 +1597,7 @@ do_time_wait: struct sock *sk2; sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); if (sk2 != NULL) { @@ -1797,25 +1614,64 @@ do_time_wait: break; case TCP_TW_RST: goto no_tcp_socket; - case TCP_TW_SUCCESS:; + case TCP_TW_SUCCESS: + ; } goto discard_it; } -static int tcp_v6_remember_stamp(struct sock *sk) +static void tcp_v6_early_demux(struct sk_buff *skb) { - /* Alas, not yet... */ - return 0; + const struct ipv6hdr *hdr; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) + return; + + hdr = ipv6_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + inet6_iif(skb)); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + + if (dst) + dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); + if (dst && + inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) + skb_dst_set_noref(skb, dst); + } + } } +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor = tcp_twsk_destructor, +}; + static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, + .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .remember_stamp = tcp_v6_remember_stamp, .net_header_len = sizeof(struct ipv6hdr), + .net_frag_header_len = sizeof(struct frag_hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, @@ -1831,7 +1687,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, - .md5_add = tcp_v6_md5_add_func, .md5_parse = tcp_v6_parse_md5_keys, }; #endif @@ -1839,14 +1694,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { /* * TCP over IPv4 via INET6 API */ - static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .remember_stamp = tcp_v4_remember_stamp, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, @@ -1863,7 +1717,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, - .md5_add = tcp_v6_md5_add_func, .md5_parse = tcp_v6_parse_md5_keys, }; #endif @@ -1874,73 +1727,20 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - skb_queue_head_init(&tp->out_of_order_queue); - tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); - - icsk->icsk_rto = TCP_TIMEOUT_INIT; - tp->mdev = TCP_TIMEOUT_INIT; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - tp->snd_cwnd = 2; - - /* See draft-stevens-tcpca-spec-01 for discussion of the - * initialization of these values. - */ - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tp->snd_cwnd_clamp = ~0; - tp->mss_cache = TCP_MSS_DEFAULT; - - tp->reordering = sysctl_tcp_reordering; - - sk->sk_state = TCP_CLOSE; + tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; - icsk->icsk_ca_ops = &tcp_init_congestion_ops; - icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_write_space = sk_stream_write_space; - sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); #ifdef CONFIG_TCP_MD5SIG - tp->af_specific = &tcp_sock_ipv6_specific; + tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif - /* TCP Cookie Transactions */ - if (sysctl_tcp_cookie_size > 0) { - /* Default, cookies without s_data_payload. */ - tp->cookie_values = - kzalloc(sizeof(*tp->cookie_values), - sk->sk_allocation); - if (tp->cookie_values != NULL) - kref_init(&tp->cookie_values->kref); - } - /* Presumed zeroed, in order of appearance: - * cookie_in_always, cookie_out_never, - * s_data_constant, s_data_in, s_data_out - */ - sk->sk_sndbuf = sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sysctl_tcp_rmem[1]; - - local_bh_disable(); - percpu_counter_inc(&tcp_sockets_allocated); - local_bh_enable(); - return 0; } static void tcp_v6_destroy_sock(struct sock *sk) { -#ifdef CONFIG_TCP_MD5SIG - /* Clean up the MD5 key list */ - if (tcp_sk(sk)->md5sig_info) - tcp_v6_clear_md5_list(sk); -#endif tcp_v4_destroy_sock(sk); inet6_destroy_sock(sk); } @@ -1948,31 +1748,31 @@ static void tcp_v6_destroy_sock(struct sock *sk) #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, - struct sock *sk, struct request_sock *req, int i, int uid) + const struct sock *sk, struct request_sock *req, int i, kuid_t uid) { int ttd = req->expires - jiffies; - struct in6_addr *src = &inet6_rsk(req)->loc_addr; - struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; + const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; + const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], - ntohs(inet_rsk(req)->loc_port), + inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], - ntohs(inet_rsk(req)->rmt_port), + ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, - 0,0, /* could print option size, but that is af dependent. */ + 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), - req->retrans, - uid, + req->num_timeout, + from_kuid_munged(seq_user_ns(seq), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -1980,17 +1780,17 @@ static void get_openreq6(struct seq_file *seq, static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { - struct in6_addr *dest, *src; + const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; - struct inet_sock *inet = inet_sk(sp); - struct tcp_sock *tp = tcp_sk(sp); + const struct inet_sock *inet = inet_sk(sp); + const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); - struct ipv6_pinfo *np = inet6_sk(sp); + struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; - dest = &np->daddr; - src = &np->rcv_saddr; + dest = &sp->sk_v6_daddr; + src = &sp->sk_v6_rcv_saddr; destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); @@ -2010,7 +1810,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %lu %lu %u %u %d\n", + "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, @@ -2020,52 +1820,51 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) tp->write_seq-tp->snd_una, (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), timer_active, - jiffies_to_clock_t(timer_expires - jiffies), + jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - sock_i_uid(sp), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), - (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh + sp->sk_state == TCP_LISTEN ? + (fastopenq ? fastopenq->max_qlen : 0) : + (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { - struct in6_addr *dest, *src; + const struct in6_addr *dest, *src; __u16 destp, srcp; - struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); - int ttd = tw->tw_ttd - jiffies; + s32 delta = tw->tw_ttd - inet_tw_time_stamp(); - if (ttd < 0) - ttd = 0; - - dest = &tw6->tw_v6_daddr; - src = &tw6->tw_v6_rcv_saddr; + dest = &tw->tw_v6_daddr; + src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, tw->tw_substate, 0, 0, - 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, atomic_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; + struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -2081,31 +1880,37 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_ESTABLISHED: - get_tcp6_sock(seq, v, st->num); + if (sk->sk_state == TCP_TIME_WAIT) + get_timewait6_sock(seq, v, st->num); + else + get_tcp6_sock(seq, v, st->num); break; case TCP_SEQ_STATE_OPENREQ: get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); break; - case TCP_SEQ_STATE_TIME_WAIT: - get_timewait6_sock(seq, v, st->num); - break; } out: return 0; } +static const struct file_operations tcp6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = tcp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct tcp_seq_afinfo tcp6_seq_afinfo = { .name = "tcp6", .family = AF_INET6, - .seq_fops = { - .owner = THIS_MODULE, - }, + .seq_fops = &tcp6_afinfo_seq_fops, .seq_ops = { .show = tcp6_seq_show, }, }; -int tcp6_proc_init(struct net *net) +int __net_init tcp6_proc_init(struct net *net) { return tcp_proc_register(net, &tcp6_seq_afinfo); } @@ -2116,6 +1921,17 @@ void tcp6_proc_exit(struct net *net) } #endif +static void tcp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); + + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} + struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, @@ -2130,11 +1946,16 @@ struct proto tcpv6_prot = { .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, + .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, @@ -2148,19 +1969,21 @@ struct proto tcpv6_prot = { .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif +#ifdef CONFIG_MEMCG_KMEM + .proto_cgroup = tcp_proto_cgroup, +#endif + .clear_sk = tcp_v6_clear_sk, }; static const struct inet6_protocol tcpv6_protocol = { + .early_demux = tcp_v6_early_demux, .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, - .gso_send_check = tcp_v6_gso_send_check, - .gso_segment = tcp_tso_segment, - .gro_receive = tcp6_gro_receive, - .gro_complete = tcp6_gro_complete, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; @@ -2169,23 +1992,22 @@ static struct inet_protosw tcpv6_protosw = { .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; -static int tcpv6_net_init(struct net *net) +static int __net_init tcpv6_net_init(struct net *net) { return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, SOCK_RAW, IPPROTO_TCP, net); } -static void tcpv6_net_exit(struct net *net) +static void __net_exit tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void tcpv6_net_exit_batch(struct list_head *net_exit_list) +static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) { inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); } @@ -2215,10 +2037,10 @@ int __init tcpv6_init(void) out: return ret; -out_tcpv6_protocol: - inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); +out_tcpv6_protocol: + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); goto out; } diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c new file mode 100644 index 00000000000..01b0ff9a0c2 --- /dev/null +++ b/net/ipv6/tcpv6_offload.c @@ -0,0 +1,93 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * TCPv6 GSO/GRO support + */ +#include <linux/skbuff.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip6_checksum.h> +#include "ip6_offload.h" + +static int tcp_v6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return -EINVAL; + + ipv6h = ipv6_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); + return 0; +} + +static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + __wsum wsum; + + /* Don't bother verifying checksum if we're going to flush anyway. */ + if (NAPI_GRO_CB(skb)->flush) + goto skip_csum; + + wsum = NAPI_GRO_CB(skb)->csum; + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), + wsum); + + /* fall through */ + + case CHECKSUM_COMPLETE: + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, + wsum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + +skip_csum: + return tcp_gro_receive(head, skb); +} + +static int tcp6_gro_complete(struct sk_buff *skb, int thoff) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, + &iph->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + + return tcp_gro_complete(skb); +} + +static const struct net_offload tcpv6_offload = { + .callbacks = { + .gso_send_check = tcp_v6_gso_send_check, + .gso_segment = tcp_gso_segment, + .gro_receive = tcp6_gro_receive, + .gro_complete = tcp6_gro_complete, + }, +}; + +int __init tcpv6_offload_init(void) +{ + return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP); +} diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 51e2832d13a..2c4e4c5c761 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -12,45 +12,50 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ +#define pr_fmt(fmt) "IPv6: " fmt + #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm6_tunnel *tunnel6_handlers; -static struct xfrm6_tunnel *tunnel46_handlers; +static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; @@ -64,14 +69,17 @@ EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) { - struct xfrm6_tunnel **pprev; + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel6_mutex); for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; @@ -87,6 +95,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) EXPORT_SYMBOL(xfrm6_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -94,11 +107,11 @@ static int tunnel6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -112,11 +125,11 @@ static int tunnel46_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel46_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, skb->dev); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -128,7 +141,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { struct xfrm6_tunnel *handler; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) break; } @@ -148,11 +161,11 @@ static const struct inet6_protocol tunnel46_protocol = { static int __init tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { - printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) { - printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } @@ -162,9 +175,9 @@ static int __init tunnel6_init(void) static void __exit tunnel6_fini(void) { if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) - printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); + pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) - printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); + pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 69ebdbe78c4..7092ff78fd8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -34,6 +34,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <net/ndisc.h> @@ -44,27 +45,50 @@ #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> +#include <net/inet6_hashtables.h> +#include <net/busy_poll.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <trace/events/skb.h> #include "udp_impl.h" +static unsigned int udp6_ehashfn(struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) +{ + static u32 udp6_ehash_secret __read_mostly; + static u32 udp_ipv6_hash_secret __read_mostly; + + u32 lhash, fhash; + + net_get_random_once(&udp6_ehash_secret, + sizeof(udp6_ehash_secret)); + net_get_random_once(&udp_ipv6_hash_secret, + sizeof(udp_ipv6_hash_secret)); + + lhash = (__force u32)laddr->s6_addr32[3]; + fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret); + + return __inet6_ehashfn(lhash, lport, fhash, fport, + udp_ipv6_hash_secret + net_hash_mix(net)); +} + int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { - const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - __be32 sk1_rcv_saddr = inet_sk(sk)->inet_rcv_saddr; - __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(sk_rcv_saddr6); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) return (!sk2_ipv6only && - (!sk1_rcv_saddr || !sk2_rcv_saddr || - sk1_rcv_saddr == sk2_rcv_saddr)); + (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr || + sk->sk_rcv_saddr == sk2->sk_rcv_saddr)); if (addr_type2 == IPV6_ADDR_ANY && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) @@ -75,7 +99,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) return 1; if (sk2_rcv_saddr6 && - ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) return 1; return 0; @@ -90,9 +114,9 @@ static unsigned int udp6_portaddr_hash(struct net *net, if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) - hash = jhash_1word(addr6->s6_addr32[3], mix); + hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else - hash = jhash2(addr6->s6_addr32, 4, mix); + hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } @@ -102,25 +126,33 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); - unsigned int hash2_partial = - udp6_portaddr_hash(sock_net(sk), &inet6_sk(sk)->rcv_saddr, 0); + unsigned int hash2_partial = + udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); } +static void udp_v6_rehash(struct sock *sk) +{ + u16 new_hash = udp6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); + + udp_lib_rehash(sk, new_hash); +} + static inline int compute_score(struct sock *sk, struct net *net, unsigned short hnum, - struct in6_addr *saddr, __be16 sport, - struct in6_addr *daddr, __be16 dport, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) { int score = -1; if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && sk->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); score = 0; @@ -129,13 +161,13 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; score++; } - if (!ipv6_addr_any(&np->daddr)) { - if (!ipv6_addr_equal(&np->daddr, saddr)) + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) return -1; score++; } @@ -158,10 +190,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && sk->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; score = 0; if (inet->inet_dport) { @@ -169,8 +200,8 @@ static inline int compute_score2(struct sock *sk, struct net *net, return -1; score++; } - if (!ipv6_addr_any(&np->daddr)) { - if (!ipv6_addr_equal(&np->daddr, saddr)) + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) return -1; score++; } @@ -192,7 +223,8 @@ static struct sock *udp6_lib_lookup2(struct net *net, { struct sock *sk, *result; struct hlist_nulls_node *node; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; begin: result = NULL; @@ -203,8 +235,18 @@ begin: if (score > badness) { result = sk; badness = score; - if (score == SCORE2_MAX) + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = udp6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } else if (score == SCORE2_MAX) goto exact_match; + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -217,7 +259,7 @@ begin: if (result) { exact_match: - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, daddr, hnum, dif) < badness)) { @@ -228,9 +270,9 @@ exact_match: return result; } -static struct sock *__udp6_lib_lookup(struct net *net, - struct in6_addr *saddr, __be16 sport, - struct in6_addr *daddr, __be16 dport, +struct sock *__udp6_lib_lookup(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif, struct udp_table *udptable) { struct sock *sk, *result; @@ -238,7 +280,8 @@ static struct sock *__udp6_lib_lookup(struct net *net, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; - int score, badness; + int score, badness, matches = 0, reuseport = 0; + u32 hash = 0; rcu_read_lock(); if (hslot->count > 10) { @@ -258,8 +301,8 @@ static struct sock *__udp6_lib_lookup(struct net *net, if (hslot->count < hslot2->count) goto begin; - result = udp6_lib_lookup2(net, &in6addr_any, sport, - daddr, hnum, dif, + result = udp6_lib_lookup2(net, saddr, sport, + &in6addr_any, hnum, dif, hslot2, slot2); } rcu_read_unlock(); @@ -273,6 +316,17 @@ begin: if (score > badness) { result = sk; badness = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + hash = udp6_ehashfn(net, daddr, hnum, + saddr, sport); + matches = 1; + } + } else if (score == badness && reuseport) { + matches++; + if (((u64)hash * matches) >> 32 == 0) + result = sk; + hash = next_pseudo_random32(hash); } } /* @@ -284,7 +338,7 @@ begin: goto begin; if (result) { - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score(result, net, hnum, saddr, sport, daddr, dport, dif) < badness)) { @@ -295,13 +349,14 @@ begin: rcu_read_unlock(); return result; } +EXPORT_SYMBOL_GPL(__udp6_lib_lookup); static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { struct sock *sk; - struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph = ipv6_hdr(skb); if (unlikely(sk = skb_steal_sock(skb))) return sk; @@ -310,6 +365,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, udptable); } +struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) +{ + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); +} +EXPORT_SYMBOL_GPL(udp6_lib_lookup); + + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -323,20 +386,21 @@ int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsigned int ulen, copied; - int peeked; + int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); int is_udp4; - - if (addr_len) - *addr_len=sizeof(struct sockaddr_in6); + bool slow; if (flags & MSG_ERRQUEUE) - return ipv6_recv_error(sk, msg, len); + return ipv6_recv_error(sk, msg, len, addr_len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), - &peeked, &err); + &peeked, &off, &err); if (!skb) goto out; @@ -362,15 +426,27 @@ try_again: if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied ); + msg->msg_iov, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) goto csum_copy_err; } - if (err) + if (unlikely(err)) { + trace_kfree_skb(skb, udpv6_recvmsg); + if (!peeked) { + atomic_inc(&sk->sk_drops); + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + } goto out_free; - + } if (!peeked) { if (is_udp4) UDP_INC_STATS_USER(sock_net(sk), @@ -384,31 +460,33 @@ try_again: /* Copy the address. */ if (msg->msg_name) { - struct sockaddr_in6 *sin6; - - sin6 = (struct sockaddr_in6 *) msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); sin6->sin6_family = AF_INET6; sin6->sin6_port = udp_hdr(skb)->source; sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (is_udp4) + if (is_udp4) { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin6->sin6_addr); - else { - ipv6_addr_copy(&sin6->sin6_addr, - &ipv6_hdr(skb)->saddr); - if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = IP6CB(skb)->iif; + sin6->sin6_scope_id = 0; + } else { + sin6->sin6_addr = ipv6_hdr(skb)->saddr; + sin6->sin6_scope_id = + ipv6_iface_scope_id(&sin6->sin6_addr, + IP6CB(skb)->iif); } - + *addr_len = sizeof(*sin6); } + + if (np->rxopt.all) + ip6_datagram_recv_common_ctl(sk, msg, skb); + if (is_udp4) { if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); } else { if (np->rxopt.all) - datagram_recv_ctl(sk, msg, skb); + ip6_datagram_recv_specific_ctl(sk, msg, skb); } err = copied; @@ -421,19 +499,27 @@ out: return err; csum_copy_err: - lock_sock(sk); + slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) { - if (is_udp4) + if (is_udp4) { + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - else + } else { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); UDP6_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + } } - release_sock(sk); + unlock_sock_fast(sk, slow); - if (flags & MSG_DONTWAIT) + if (noblock) return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; goto try_again; } @@ -442,9 +528,9 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct udp_table *udptable) { struct ipv6_pinfo *np; - struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; - struct in6_addr *saddr = &hdr->saddr; - struct in6_addr *daddr = &hdr->daddr; + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; + const struct in6_addr *saddr = &hdr->saddr; + const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr*)(skb->data+offset); struct sock *sk; int err; @@ -454,6 +540,16 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk == NULL) return; + if (type == ICMPV6_PKT_TOOBIG) { + if (!ip6_sk_accept_pmtu(sk)) + goto out; + ip6_sk_update_pmtu(skb, sk, info); + } + if (type == NDISC_REDIRECT) { + ip6_sk_redirect(skb, sk); + goto out; + } + np = inet6_sk(sk); if (!icmpv6_err_convert(type, code, &err) && !np->recverr) @@ -471,6 +567,30 @@ out: sock_put(sk); } +static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int rc; + + if (!ipv6_addr_any(&sk->sk_v6_daddr)) { + sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); + } + + rc = sock_queue_rcv_skb(sk, skb); + if (rc < 0) { + int is_udplite = IS_UDPLITE(sk); + + /* Note that an ENOMEM error is charged twice */ + if (rc == -ENOMEM) + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; + } + return 0; +} + static __inline__ void udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info ) @@ -478,7 +598,15 @@ static __inline__ void udpv6_err(struct sk_buff *skb, __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } -int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +static struct static_key udpv6_encap_needed __read_mostly; +void udpv6_encap_enable(void) +{ + if (!static_key_enabled(&udpv6_encap_needed)) + static_key_slow_inc(&udpv6_encap_needed); +} +EXPORT_SYMBOL(udpv6_encap_enable); + +int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; @@ -487,6 +615,41 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; + if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + + /* + * This is an encapsulation socket so pass the skb to + * the socket's udp_encap_rcv() hook. Otherwise, just + * fall through and pass this up the UDP socket. + * up->encap_rcv() returns the following value: + * =0 if skb was successfully passed to the encap + * handler or was discarded by it. + * >0 if skb should be passed on to UDP. + * <0 if skb should be resubmitted as proto -N + */ + + /* if we're overly short, let UDP handle it */ + encap_rcv = ACCESS_ONCE(up->encap_rcv); + if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { + int ret; + + /* Verify checksum before giving to encap */ + if (udp_lib_checksum_complete(skb)) + goto csum_error; + + ret = encap_rcv(sk, skb); + if (ret <= 0) { + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); + return -ret; + } + } + + /* FALLTHROUGH -- it's a UDP Packet */ + } + /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). */ @@ -506,64 +669,74 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_access_pointer(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) - goto drop; + goto csum_error; } - if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { - /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) - UDP6_INC_STATS_BH(sock_net(sk), - UDP_MIB_RCVBUFERRORS, is_udplite); - goto drop_no_sk_drops_inc; + if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) { + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + goto drop; } - return 0; + skb_dst_drop(skb); + + bh_lock_sock(sk); + rc = 0; + if (!sock_owned_by_user(sk)) + rc = __udpv6_queue_rcv_skb(sk, skb); + else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + bh_unlock_sock(sk); + goto drop; + } + bh_unlock_sock(sk); + + return rc; + +csum_error: + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: - atomic_inc(&sk->sk_drops); -drop_no_sk_drops_inc: UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; } static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, struct in6_addr *loc_addr, - __be16 rmt_port, struct in6_addr *rmt_addr, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, int dif) { struct hlist_nulls_node *node; - struct sock *s = sk; unsigned short num = ntohs(loc_port); - sk_nulls_for_each_from(s, node) { - struct inet_sock *inet = inet_sk(s); + sk_nulls_for_each_from(sk, node) { + struct inet_sock *inet = inet_sk(sk); - if (!net_eq(sock_net(s), net)) + if (!net_eq(sock_net(sk), net)) continue; - if (udp_sk(s)->udp_port_hash == num && - s->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(s); + if (udp_sk(sk)->udp_port_hash == num && + sk->sk_family == PF_INET6) { if (inet->inet_dport) { if (inet->inet_dport != rmt_port) continue; } - if (!ipv6_addr_any(&np->daddr) && - !ipv6_addr_equal(&np->daddr, rmt_addr)) + if (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) continue; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { + if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) continue; } - if (!inet6_mc_check(s, loc_addr, rmt_addr)) + if (!inet6_mc_check(sk, loc_addr, rmt_addr)) continue; - return s; + return sk; } } return NULL; @@ -572,36 +745,45 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, static void flush_stack(struct sock **stack, unsigned int count, struct sk_buff *skb, unsigned int final) { - unsigned int i; + struct sk_buff *skb1 = NULL; struct sock *sk; - struct sk_buff *skb1; + unsigned int i; for (i = 0; i < count; i++) { - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - sk = stack[i]; - if (skb1) { - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) - udpv6_queue_rcv_skb(sk, skb1); - else - sk_add_backlog(sk, skb1); - bh_unlock_sock(sk); - } else { + if (likely(skb1 == NULL)) + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + if (!skb1) { atomic_inc(&sk->sk_drops); - UDP6_INC_STATS_BH(sock_net(sk), - UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); - UDP6_INC_STATS_BH(sock_net(sk), - UDP_MIB_INERRORS, IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); } + + if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) + skb1 = NULL; } + if (unlikely(skb1)) + kfree_skb(skb1); +} + +static void udp6_csum_zero_error(struct sk_buff *skb) +{ + /* RFC 2460 section 8.1 says that we SHOULD log + * this error. Well, it is reasonable. + */ + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n", + &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source), + &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest)); } + /* * Note: called only from the BH handler context, * so we don't need to lock the hashes. */ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, - struct in6_addr *saddr, struct in6_addr *daddr, + const struct in6_addr *saddr, const struct in6_addr *daddr, struct udp_table *udptable) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; @@ -615,7 +797,12 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, dif = inet6_iif(skb); sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); while (sk) { - stack[count++] = sk; + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + if (uh->check || udp_sk(sk)->no_check6_rx) + stack[count++] = sk; + sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, uh->source, saddr, dif); if (unlikely(count == ARRAY_SIZE(stack))) { @@ -644,52 +831,17 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, return 0; } -static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, - int proto) -{ - int err; - - UDP_SKB_CB(skb)->partial_cov = 0; - UDP_SKB_CB(skb)->cscov = skb->len; - - if (proto == IPPROTO_UDPLITE) { - err = udplite_checksum_init(skb, uh); - if (err) - return err; - } - - if (uh->check == 0) { - /* RFC 2460 section 8.1 says that we SHOULD log - this error. Well, it is reasonable. - */ - LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); - return 1; - } - if (skb->ip_summed == CHECKSUM_COMPLETE && - !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, - skb->len, proto, skb->csum)) - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (!skb_csum_unnecessary(skb)) - skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr, - skb->len, proto, 0)); - - return 0; -} - int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { + struct net *net = dev_net(skb->dev); struct sock *sk; struct udphdr *uh; - struct net_device *dev = skb->dev; - struct in6_addr *saddr, *daddr; + const struct in6_addr *saddr, *daddr; u32 ulen = 0; - struct net *net = dev_net(skb->dev); if (!pskb_may_pull(skb, sizeof(struct udphdr))) - goto short_packet; + goto discard; saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -719,7 +871,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, } if (udp6_csum_init(skb, uh, proto)) - goto discard; + goto csum_error; /* * Multicast receive code @@ -735,38 +887,56 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, * for sock caches... i'll skip this for now. */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + if (sk != NULL) { + int ret; - if (sk == NULL) { - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard; + if (!uh->check && !udp_sk(sk)->no_check6_rx) { + sock_put(sk); + udp6_csum_zero_error(skb); + goto csum_error; + } - if (udp_lib_checksum_complete(skb)) - goto discard; - UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, - proto == IPPROTO_UDPLITE); + ret = udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); + /* a return value > 0 means to resubmit the input, but + * it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; - kfree_skb(skb); return 0; } - /* deliver */ + if (!uh->check) { + udp6_csum_zero_error(skb); + goto csum_error; + } - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) - udpv6_queue_rcv_skb(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + if (udp_lib_checksum_complete(skb)) + goto csum_error; + + UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: %d/%u\n", + LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", proto == IPPROTO_UDPLITE ? "-Lite" : "", - ulen, skb->len); - + saddr, + ntohs(uh->source), + ulen, + skb->len, + daddr, + ntohs(uh->dest)); + goto discard; +csum_error: + UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); @@ -845,11 +1015,16 @@ static int udp_v6_push_pending_frames(struct sock *sk) struct udphdr *uh; struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); - struct flowi *fl = &inet->cork.fl; + struct flowi6 *fl6; int err = 0; int is_udplite = IS_UDPLITE(sk); __wsum csum = 0; + if (up->pending == AF_INET) + return udp_push_pending_frames(sk); + + fl6 = &inet->cork.fl.u.ip6; + /* Grab the skbuff where UDP header space exists. */ if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) goto out; @@ -858,23 +1033,26 @@ static int udp_v6_push_pending_frames(struct sock *sk) * Create a UDP header */ uh = udp_hdr(skb); - uh->source = fl->fl_ip_sport; - uh->dest = fl->fl_ip_dport; + uh->source = fl6->fl6_sport; + uh->dest = fl6->fl6_dport; uh->len = htons(up->len); uh->check = 0; if (is_udplite) csum = udplite_csum_outgoing(sk, skb); - else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp6_hwcsum_outgoing(sk, skb, &fl->fl6_src, &fl->fl6_dst, + else if (up->no_check6_tx) { /* UDP csum disabled */ + skb->ip_summed = CHECKSUM_NONE; + goto send; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ + udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, up->len); goto send; } else csum = udp_csum_outgoing(sk, skb); /* add protocol-dependent pseudo-header */ - uh->check = csum_ipv6_magic(&fl->fl6_src, &fl->fl6_dst, - up->len, fl->proto, csum ); + uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + up->len, fl6->flowi6_proto, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; @@ -902,16 +1080,17 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct in6_addr *daddr, *final_p = NULL, final; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); + struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; - struct flowi fl; + struct flowi6 fl6; struct dst_entry *dst; int addr_len = msg->msg_namelen; int ulen = len; int hlimit = -1; int tclass = -1; + int dontfrag = -1; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; int connected = 0; @@ -942,7 +1121,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, } else if (!up->pending) { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - daddr = &np->daddr; + daddr = &sk->sk_v6_daddr; } else daddr = NULL; @@ -988,22 +1167,21 @@ do_udp_sendmsg: } ulen += sizeof(struct udphdr); - memset(&fl, 0, sizeof(fl)); + memset(&fl6, 0, sizeof(fl6)); if (sin6) { if (sin6->sin6_port == 0) return -EINVAL; - fl.fl_ip_dport = sin6->sin6_port; + fl6.fl6_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; - daddr = &flowlabel->dst; } } @@ -1012,43 +1190,44 @@ do_udp_sendmsg: * sk->sk_dst_cache. */ if (sk->sk_state == TCP_ESTABLISHED && - ipv6_addr_equal(daddr, &np->daddr)) - daddr = &np->daddr; + ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) + daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && - ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl.oif = sin6->sin6_scope_id; + __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) + fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - fl.fl_ip_dport = inet->inet_dport; - daddr = &np->daddr; - fl.fl6_flowlabel = np->flow_label; + fl6.fl6_dport = inet->inet_dport; + daddr = &sk->sk_v6_daddr; + fl6.flowlabel = np->flow_label; connected = 1; } - if (!fl.oif) - fl.oif = sk->sk_bound_dev_if; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = sk->sk_bound_dev_if; - if (!fl.oif) - fl.oif = np->sticky_pktinfo.ipi6_ifindex; + if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl.mark = sk->sk_mark; + fl6.flowi6_mark = sk->sk_mark; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); - err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, + &hlimit, &tclass, &dontfrag); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -1062,53 +1241,36 @@ do_udp_sendmsg: opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); - fl.proto = sk->sk_protocol; + fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) - ipv6_addr_copy(&fl.fl6_dst, daddr); + fl6.daddr = *daddr; else - fl.fl6_dst.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ - if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl_ip_sport = inet->inet_sport; - - /* merge ip6_build_xmit from ip6_output */ - if (opt && opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + fl6.saddr = np->saddr; + fl6.fl6_sport = inet->inet_sport; + + final_p = fl6_update_dst(&fl6, opt, &final); + if (final_p) connected = 0; - } - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) { - fl.oif = np->mcast_oif; + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) { + fl6.flowi6_oif = np->mcast_oif; connected = 0; - } + } else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, &fl); + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - err = ip6_sk_dst_lookup(sk, &dst, &fl); - if (err) + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto out; - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); - if (err < 0) { - if (err == -EREMOTE) - err = ip6_dst_blackhole(sk, &dst, &fl); - if (err < 0) - goto out; } - if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl.fl6_dst)) - hlimit = np->mcast_hops; - else - hlimit = np->hop_limit; - if (hlimit < 0) - hlimit = ip6_dst_hoplimit(dst); - } + if (hlimit < 0) + hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (tclass < 0) tclass = np->tclass; @@ -1131,12 +1293,14 @@ back_from_confirm: up->pending = AF_INET6; do_append_data: + if (dontfrag < 0) + dontfrag = np->dontfrag; up->len += ulen; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, &fl, + sizeof(struct udphdr), hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) @@ -1147,10 +1311,10 @@ do_append_data: if (dst) { if (connected) { ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? - &np->daddr : NULL, + ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ? + &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_equal(&fl.fl6_src, &np->saddr) ? + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? &np->saddr : #endif NULL); @@ -1191,10 +1355,18 @@ do_confirm: void udpv6_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); lock_sock(sk); udp_v6_flush_pending_frames(sk); release_sock(sk); + if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } + inet6_destroy_sock(sk); } @@ -1239,164 +1411,47 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif -static int udp6_ufo_send_check(struct sk_buff *skb) -{ - struct ipv6hdr *ipv6h; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(*uh))) - return -EINVAL; - - ipv6h = ipv6_hdr(skb); - uh = udp_hdr(skb); - - uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - return 0; -} - -static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, int features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int mss; - unsigned int unfrag_ip6hlen, unfrag_len; - struct frag_hdr *fptr; - u8 *mac_start, *prevhdr; - u8 nexthdr; - u8 frag_hdr_sz = sizeof(struct frag_hdr); - int offset; - __wsum csum; - - mss = skb_shinfo(skb)->gso_size; - if (unlikely(skb->len <= mss)) - goto out; - - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { - /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || - !(type & (SKB_GSO_UDP)))) - goto out; - - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); - - segs = NULL; - goto out; - } - - /* Do software UFO. Complete and fill in the UDP checksum as HW cannot - * do checksum of UDP packets sent as multiple IP fragments. - */ - offset = skb->csum_start - skb_headroom(skb); - csum = skb_checksum(skb, offset, skb->len- offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - - /* Check if there is enough headroom to insert fragment header. */ - if ((skb_headroom(skb) < frag_hdr_sz) && - pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC)) - goto out; - - /* Find the unfragmentable header and shift it left by frag_hdr_sz - * bytes to insert fragment header. - */ - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - nexthdr = *prevhdr; - *prevhdr = NEXTHDR_FRAGMENT; - unfrag_len = skb_network_header(skb) - skb_mac_header(skb) + - unfrag_ip6hlen; - mac_start = skb_mac_header(skb); - memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len); - - skb->mac_header -= frag_hdr_sz; - skb->network_header -= frag_hdr_sz; - - fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); - fptr->nexthdr = nexthdr; - fptr->reserved = 0; - ipv6_select_ident(fptr); - - /* Fragment the skb. ipv6 header and the remaining fields of the - * fragment header are updated in ipv6_gso_segment() - */ - segs = skb_segment(skb, features); - -out: - return segs; -} - static const struct inet6_protocol udpv6_protocol = { .handler = udpv6_rcv, .err_handler = udpv6_err, - .gso_send_check = udp6_ufo_send_check, - .gso_segment = udp6_ufo_fragment, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS - -static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket) -{ - struct inet_sock *inet = inet_sk(sp); - struct ipv6_pinfo *np = inet6_sk(sp); - struct in6_addr *dest, *src; - __u16 destp, srcp; - - dest = &np->daddr; - src = &np->rcv_saddr; - destp = ntohs(inet->inet_dport); - srcp = ntohs(inet->inet_sport); - seq_printf(seq, - "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " - "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", - bucket, - src->s6_addr32[0], src->s6_addr32[1], - src->s6_addr32[2], src->s6_addr32[3], srcp, - dest->s6_addr32[0], dest->s6_addr32[1], - dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), - 0, 0L, 0, - sock_i_uid(sp), 0, - sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); -} - int udp6_seq_show(struct seq_file *seq, void *v) { - if (v == SEQ_START_TOKEN) - seq_printf(seq, - " sl " - "local_address " - "remote_address " - "st tx_queue rx_queue tr tm->when retrnsmt" - " uid timeout inode ref pointer drops\n"); - else - udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket); + if (v == SEQ_START_TOKEN) { + seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); + } else { + int bucket = ((struct udp_iter_state *)seq->private)->bucket; + struct inet_sock *inet = inet_sk(v); + __u16 srcp = ntohs(inet->inet_sport); + __u16 destp = ntohs(inet->inet_dport); + ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + } return 0; } +static const struct file_operations udp6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct udp_seq_afinfo udp6_seq_afinfo = { .name = "udp6", .family = AF_INET6, .udp_table = &udp_table, - .seq_fops = { - .owner = THIS_MODULE, - }, + .seq_fops = &udp6_afinfo_seq_fops, .seq_ops = { .show = udp6_seq_show, }, }; -int udp6_proc_init(struct net *net) +int __net_init udp6_proc_init(struct net *net) { return udp_proc_register(net, &udp6_seq_afinfo); } @@ -1406,6 +1461,17 @@ void udp6_proc_exit(struct net *net) { } #endif /* CONFIG_PROC_FS */ +void udp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); + + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} + /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { @@ -1420,9 +1486,10 @@ struct proto udpv6_prot = { .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, - .backlog_rcv = udpv6_queue_rcv_skb, + .backlog_rcv = __udpv6_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, @@ -1435,6 +1502,7 @@ struct proto udpv6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udpv6_protosw = { @@ -1442,7 +1510,6 @@ static struct inet_protosw udpv6_protosw = { .protocol = IPPROTO_UDP, .prot = &udpv6_prot, .ops = &inet6_dgram_ops, - .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }; diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index d7571046bfc..c779c3c90b9 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -7,31 +7,32 @@ #include <net/inet_common.h> #include <net/transp_v6.h> -extern int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int ); -extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, - u8 , u8 , int , __be32 , struct udp_table *); +int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); +void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, + __be32, struct udp_table *); -extern int udp_v6_get_port(struct sock *sk, unsigned short snum); +int udp_v6_get_port(struct sock *sk, unsigned short snum); -extern int udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -extern int udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); +int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); #ifdef CONFIG_COMPAT -extern int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, unsigned int optlen); -extern int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); +int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); #endif -extern int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len); -extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t len, - int noblock, int flags, int *addr_len); -extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); -extern void udpv6_destroy_sock(struct sock *sk); +int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len); +int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len); +int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +void udpv6_destroy_sock(struct sock *sk); + +void udp_v6_clear_sk(struct sock *sk, int size); #ifdef CONFIG_PROC_FS -extern int udp6_seq_show(struct seq_file *seq, void *v); +int udp6_seq_show(struct seq_file *seq, void *v); #endif #endif /* _UDP6_IMPL_H */ diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c new file mode 100644 index 00000000000..0ae3d98f83e --- /dev/null +++ b/net/ipv6/udp_offload.c @@ -0,0 +1,140 @@ +/* + * IPV6 GSO/GRO offload support + * Linux INET6 implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * UDPv6 GSO support + */ +#include <linux/skbuff.h> +#include <net/protocol.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> +#include "ip6_offload.h" + +static int udp6_ufo_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + if (likely(!skb->encapsulation)) { + ipv6h = ipv6_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } + + return 0; +} + +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *packet_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + int offset; + __wsum csum; + int tnl_hlen; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPIP | + SKB_GSO_SIT | + SKB_GSO_MPLS) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + if (skb->encapsulation && skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) + segs = skb_udp_tunnel_segment(skb, features); + else { + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Check if there is enough headroom to insert fragment header. */ + tnl_hlen = skb_tnl_header_len(skb); + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + fptr->identification = skb_shinfo(skb)->ip6_frag_id; + + /* Fragment the skb. ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + } + +out: + return segs; +} +static const struct net_offload udpv6_offload = { + .callbacks = { + .gso_send_check = udp6_ufo_send_check, + .gso_segment = udp6_ufo_fragment, + }, +}; + +int __init udp_offload_init(void) +{ + return inet6_add_offload(&udpv6_offload, IPPROTO_UDP); +} diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 6ea6938919e..9cf097e206e 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -11,6 +11,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include <linux/export.h> #include "udp_impl.h" static int udplitev6_rcv(struct sk_buff *skb) @@ -55,6 +56,7 @@ struct proto udplitev6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udplite6_protosw = { @@ -62,7 +64,6 @@ static struct inet_protosw udplite6_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplitev6_prot, .ops = &inet6_dgram_ops, - .no_check = 0, .flags = INET_PROTOSW_PERMANENT, }; @@ -92,24 +93,31 @@ void udplitev6_exit(void) } #ifdef CONFIG_PROC_FS + +static const struct file_operations udplite6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + static struct udp_seq_afinfo udplite6_seq_afinfo = { .name = "udplite6", .family = AF_INET6, .udp_table = &udplite_table, - .seq_fops = { - .owner = THIS_MODULE, - }, + .seq_fops = &udplite6_afinfo_seq_fops, .seq_ops = { .show = udp6_seq_show, }, }; -static int udplite6_proc_init_net(struct net *net) +static int __net_init udplite6_proc_init_net(struct net *net) { return udp_proc_register(net, &udplite6_seq_afinfo); } -static void udplite6_proc_exit_net(struct net *net) +static void __net_exit udplite6_proc_exit_net(struct net *net) { udp_proc_unregister(net, &udplite6_seq_afinfo); } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 9084582d236..f8c3cf842f5 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -42,7 +42,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) ipv6_hdr(skb)->payload_len = htons(skb->len); __skb_push(skb, skb->data - skb_network_header(skb)); - NF_HOOK(PF_INET6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, ip6_rcv_finish); return -1; } @@ -101,7 +101,7 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, break; } - x = xfrm_state_lookup_byaddr(net, dst, src, proto, AF_INET6); + x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6); if (!x) continue; diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index bbd48b101ba..9949a356d62 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -41,10 +41,8 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *top_iph; struct ip_beet_phdr *ph; - struct iphdr *iphv4; int optlen, hdr_len; - iphv4 = ip_hdr(skb); hdr_len = 0; optlen = XFRM_MODE_SKB_CB(skb)->optlen; if (unlikely(optlen)) @@ -74,15 +72,14 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->nexthdr = IPPROTO_BEETPH; } - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; - const unsigned char *old_mac; int size = sizeof(struct ipv6hdr); int err; @@ -92,17 +89,14 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) __skb_push(skb, size); skb_reset_network_header(skb); - - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); - ipv6_addr_copy(&ip6h->daddr, (struct in6_addr *) &x->sel.daddr.a6); - ipv6_addr_copy(&ip6h->saddr, (struct in6_addr *) &x->sel.saddr.a6); + ip6h->daddr = *(struct in6_addr *)&x->sel.daddr.a6; + ip6h->saddr = *(struct in6_addr *)&x->sel.saddr.a6; err = 0; out: return err; diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index 63d5d493098..0e015906f9c 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -15,8 +15,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. */ /* * Authors: diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 3927832227b..901ef6f8add 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -5,6 +5,7 @@ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -13,12 +14,13 @@ #include <net/dsfield.h> #include <net/dst.h> #include <net/inet_ecn.h> +#include <net/ip6_route.h> #include <net/ipv6.h> #include <net/xfrm.h> static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) { - struct ipv6hdr *outer_iph = ipv6_hdr(skb); + const struct ipv6hdr *outer_iph = ipv6_hdr(skb); struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) @@ -47,29 +49,37 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) sizeof(top_iph->flow_lbl)); top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); - dsfield = XFRM_MODE_SKB_CB(skb)->tos; - dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) + dsfield = 0; + else + dsfield = XFRM_MODE_SKB_CB(skb)->tos; + dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos); if (x->props.flags & XFRM_STATE_NOECN) dsfield &= ~INET_ECN_MASK; ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; return 0; } +#define for_each_input_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) + + static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; - const unsigned char *old_mac; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) goto out; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; - if (skb_cloned(skb) && - (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + err = skb_unclone(skb, GFP_ATOMIC); + if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) @@ -78,10 +88,9 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index c4f4eef032a..433672d07d0 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -17,6 +17,7 @@ #include <linux/netfilter_ipv6.h> #include <net/dst.h> #include <net/ipv6.h> +#include <net/ip6_route.h> #include <net/xfrm.h> int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, @@ -27,6 +28,47 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, EXPORT_SYMBOL(xfrm6_find_1stfragopt); +static int xfrm6_local_dontfrag(struct sk_buff *skb) +{ + int proto; + struct sock *sk = skb->sk; + + if (sk) { + if (sk->sk_family != AF_INET6) + return 0; + + proto = sk->sk_protocol; + if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) + return inet6_sk(sk)->dontfrag; + } + + return 0; +} + +static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) +{ + struct flowi6 fl6; + struct sock *sk = skb->sk; + + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.daddr = ipv6_hdr(skb)->daddr; + + ipv6_local_rxpmtu(sk, &fl6, mtu); +} + +void xfrm6_local_error(struct sk_buff *skb, u32 mtu) +{ + struct flowi6 fl6; + const struct ipv6hdr *hdr; + struct sock *sk = skb->sk; + + hdr = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb); + fl6.fl6_dport = inet_sk(sk)->inet_dport; + fl6.daddr = hdr->daddr; + + ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); +} + static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -36,9 +78,15 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->local_df && skb->len > mtu) { + if (!skb->ignore_df && skb->len > mtu) { skb->dev = dst->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + + if (xfrm6_local_dontfrag(skb)) + xfrm6_local_rxpmtu(skb, mtu); + else if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; } @@ -66,30 +114,61 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) if (err) return err; - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); -#ifdef CONFIG_NETFILTER - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -#endif - - skb->protocol = htons(ETH_P_IPV6); - skb->local_df = 1; + skb->ignore_df = 1; return x->outer_mode->output2(x, skb); } EXPORT_SYMBOL(xfrm6_prepare_output); -static int xfrm6_output_finish(struct sk_buff *skb) +int xfrm6_output_finish(struct sk_buff *skb) { + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + skb->protocol = htons(ETH_P_IPV6); + #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; #endif - skb->protocol = htons(ETH_P_IPV6); return xfrm_output(skb); } -int xfrm6_output(struct sk_buff *skb) +static int __xfrm6_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + int mtu; + +#ifdef CONFIG_NETFILTER + if (!x) { + IP6CB(skb)->flags |= IP6SKB_REROUTED; + return dst_output(skb); + } +#endif + + if (skb->protocol == htons(ETH_P_IPV6)) + mtu = ip6_skb_dst_mtu(skb); + else + mtu = dst_mtu(skb_dst(skb)); + + if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { + xfrm6_local_rxpmtu(skb, mtu); + return -EMSGSIZE; + } else if (!skb->ignore_df && skb->len > mtu && skb->sk) { + xfrm_local_error(skb, mtu); + return -EMSGSIZE; + } + + if (x->props.mode == XFRM_MODE_TUNNEL && + ((skb->len > mtu && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb)))) { + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); + } + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm6_output(struct sock *sk, struct sk_buff *skb) { - return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb_dst(skb)->dev, - xfrm6_output_finish); + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, + NULL, skb_dst(skb)->dev, __xfrm6_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 7254e3f899a..2a0bbda2c76 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -20,26 +20,26 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif -static struct dst_ops xfrm6_dst_ops; static struct xfrm_policy_afinfo xfrm6_policy_afinfo; static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, - xfrm_address_t *saddr, - xfrm_address_t *daddr) + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) { - struct flowi fl = {}; + struct flowi6 fl6; struct dst_entry *dst; int err; - memcpy(&fl.fl6_dst, daddr, sizeof(fl.fl6_dst)); + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) - memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src)); + memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); - dst = ip6_route_output(net, NULL, &fl); + dst = ip6_route_output(net, NULL, &fl6); err = dst->error; if (dst->error) { @@ -68,39 +68,16 @@ static int xfrm6_get_saddr(struct net *net, return 0; } -static struct dst_entry * -__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +static int xfrm6_get_tos(const struct flowi *fl) { - struct dst_entry *dst; - - /* Still not clear if we should set fl->fl6_{src,dst}... */ - read_lock_bh(&policy->lock); - for (dst = policy->bundles; dst; dst = dst->next) { - struct xfrm_dst *xdst = (struct xfrm_dst*)dst; - struct in6_addr fl_dst_prefix, fl_src_prefix; - - ipv6_addr_prefix(&fl_dst_prefix, - &fl->fl6_dst, - xdst->u.rt6.rt6i_dst.plen); - ipv6_addr_prefix(&fl_src_prefix, - &fl->fl6_src, - xdst->u.rt6.rt6i_src.plen); - if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && - ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && - xfrm_bundle_ok(policy, xdst, fl, AF_INET6, - (xdst->u.rt6.rt6i_dst.plen != 128 || - xdst->u.rt6.rt6i_src.plen != 128))) { - dst_clone(dst); - break; - } - } - read_unlock_bh(&policy->lock); - return dst; + return 0; } -static int xfrm6_get_tos(struct flowi *fl) +static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) { - return 0; + struct rt6_info *rt = (struct rt6_info *)xdst; + + rt6_init_peer(rt, net->ipv6.peers); } static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, @@ -117,16 +94,21 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, return 0; } -static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) +static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + const struct flowi *fl) { struct rt6_info *rt = (struct rt6_info*)xdst->route; xdst->u.dst.dev = dev; dev_hold(dev); - xdst->u.rt6.rt6i_idev = in6_dev_get(rt->u.dst.dev); - if (!xdst->u.rt6.rt6i_idev) + xdst->u.rt6.rt6i_idev = in6_dev_get(dev); + if (!xdst->u.rt6.rt6i_idev) { + dev_put(dev); return -ENODEV; + } + + rt6_transfer_peer(&xdst->u.rt6, rt); /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ @@ -146,16 +128,24 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) static inline void _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) { + struct flowi6 *fl6 = &fl->u.ip6; int onlyproto = 0; u16 offset = skb_network_header_len(skb); - struct ipv6hdr *hdr = ipv6_hdr(skb); + const struct ipv6hdr *hdr = ipv6_hdr(skb); struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u8 nexthdr = nh[IP6CB(skb)->nhoff]; + int oif = 0; + + if (skb_dst(skb)) + oif = skb_dst(skb)->dev->ifindex; - memset(fl, 0, sizeof(struct flowi)); - ipv6_addr_copy(&fl->fl6_dst, reverse ? &hdr->saddr : &hdr->daddr); - ipv6_addr_copy(&fl->fl6_src, reverse ? &hdr->daddr : &hdr->saddr); + memset(fl6, 0, sizeof(struct flowi6)); + fl6->flowi6_mark = skb->mark; + fl6->flowi6_oif = reverse ? skb->skb_iif : oif; + + fl6->daddr = reverse ? hdr->saddr : hdr->daddr; + fl6->saddr = reverse ? hdr->daddr : hdr->saddr; while (nh + offset + 1 < skb->data || pskb_may_pull(skb, nh + offset + 1 - skb->data)) { @@ -182,31 +172,31 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, nh + offset + 4 - skb->data))) { __be16 *ports = (__be16 *)exthdr; - fl->fl_ip_sport = ports[!!reverse]; - fl->fl_ip_dport = ports[!reverse]; + fl6->fl6_sport = ports[!!reverse]; + fl6->fl6_dport = ports[!reverse]; } - fl->proto = nexthdr; + fl6->flowi6_proto = nexthdr; return; case IPPROTO_ICMPV6: if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) { u8 *icmp = (u8 *)exthdr; - fl->fl_icmp_type = icmp[0]; - fl->fl_icmp_code = icmp[1]; + fl6->fl6_icmp_type = icmp[0]; + fl6->fl6_icmp_code = icmp[1]; } - fl->proto = nexthdr; + fl6->flowi6_proto = nexthdr; return; -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { struct ip6_mh *mh; mh = (struct ip6_mh *)exthdr; - fl->fl_mh_type = mh->ip6mh_type; + fl6->fl6_mh_type = mh->ip6mh_type; } - fl->proto = nexthdr; + fl6->flowi6_proto = nexthdr; return; #endif @@ -215,8 +205,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) case IPPROTO_ESP: case IPPROTO_COMP: default: - fl->fl_ipsec_spi = 0; - fl->proto = nexthdr; + fl6->fl6_ipsec_spi = 0; + fl6->flowi6_proto = nexthdr; return; } } @@ -224,16 +214,28 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) static inline int xfrm6_garbage_collect(struct dst_ops *ops) { - xfrm6_policy_afinfo.garbage_collect(&init_net); - return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); + struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); + + xfrm6_policy_afinfo.garbage_collect(net); + return dst_entries_get_fast(ops) > ops->gc_thresh * 2; } -static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) +static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; - path->ops->update_pmtu(path, mtu); + path->ops->update_pmtu(path, sk, skb, mtu); +} + +static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) @@ -242,6 +244,11 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); + dst_destroy_metrics_generic(dst); + if (rt6_has_peer(&xdst->u.rt6)) { + struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6); + inet_putpeer(peer); + } xfrm_dst_destroy(xdst); } @@ -277,11 +284,12 @@ static struct dst_ops xfrm6_dst_ops = { .protocol = cpu_to_be16(ETH_P_IPV6), .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, + .redirect = xfrm6_redirect, + .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, - .gc_thresh = 1024, - .entries = ATOMIC_INIT(0), + .gc_thresh = 32768, }; static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { @@ -289,11 +297,12 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, - .find_bundle = __xfrm6_find_bundle, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, + .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, + .blackhole_route = ip6_blackhole_route, }; static int __init xfrm6_policy_init(void) @@ -310,7 +319,7 @@ static void xfrm6_policy_fini(void) static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", - .data = &xfrm6_dst_ops.gc_thresh, + .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -318,40 +327,79 @@ static struct ctl_table xfrm6_policy_table[] = { { } }; -static struct ctl_table_header *sysctl_hdr; +static int __net_init xfrm6_net_init(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = xfrm6_policy_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; + } + + hdr = register_net_sysctl(net, "net/ipv6", table); + if (!hdr) + goto err_reg; + + net->ipv6.sysctl.xfrm6_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit xfrm6_net_exit(struct net *net) +{ + struct ctl_table *table; + + if (net->ipv6.sysctl.xfrm6_hdr == NULL) + return; + + table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations xfrm6_net_ops = { + .init = xfrm6_net_init, + .exit = xfrm6_net_exit, +}; #endif int __init xfrm6_init(void) { int ret; - unsigned int gc_thresh; + + dst_entries_init(&xfrm6_dst_ops); ret = xfrm6_policy_init(); - if (ret) + if (ret) { + dst_entries_destroy(&xfrm6_dst_ops); goto out; - + } ret = xfrm6_state_init(); if (ret) goto out_policy; - /* - * We need a good default value for the xfrm6 gc threshold. - * In ipv4 we set it to the route hash table size * 8, which - * is half the size of the maximaum route cache for ipv4. It - * would be good to do the same thing for v6, except the table is - * constructed differently here. Here each table for a net namespace - * can have FIB_TABLE_HASHSZ entries, so lets go with the same - * computation that we used for ipv4 here. Also, lets keep the initial - * gc_thresh to a minimum of 1024, since, the ipv6 route cache defaults - * to that as a minimum as well - */ - gc_thresh = FIB6_TABLE_HASHSZ * 8; - xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh; + + ret = xfrm6_protocol_init(); + if (ret) + goto out_state; + #ifdef CONFIG_SYSCTL - sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv6_ctl_path, - xfrm6_policy_table); + register_pernet_subsys(&xfrm6_net_ops); #endif out: return ret; +out_state: + xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; @@ -360,10 +408,10 @@ out_policy: void xfrm6_fini(void) { #ifdef CONFIG_SYSCTL - if (sysctl_hdr) - unregister_net_sysctl_table(sysctl_hdr); + unregister_pernet_subsys(&xfrm6_net_ops); #endif - //xfrm6_input_fini(); + xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); + dst_entries_destroy(&xfrm6_dst_ops); } diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c new file mode 100644 index 00000000000..54d13f8dbba --- /dev/null +++ b/net/ipv6/xfrm6_protocol.c @@ -0,0 +1,279 @@ +/* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6. + * + * Copyright (C) 2013 secunet Security Networks AG + * + * Author: + * Steffen Klassert <steffen.klassert@secunet.com> + * + * Based on: + * net/ipv4/xfrm4_protocol.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/skbuff.h> +#include <linux/icmpv6.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly; +static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly; +static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly; +static DEFINE_MUTEX(xfrm6_protocol_mutex); + +static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp6_handlers; + case IPPROTO_AH: + return &ah6_handlers; + case IPPROTO_COMP: + return &ipcomp6_handlers; + } + + return NULL; +} + +#define for_each_protocol_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +{ + int ret; + struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(protocol); + + if (!head) + return 0; + + for_each_protocol_rcu(*proto_handlers(protocol), handler) + if ((ret = handler->cb_handler(skb, err)) <= 0) + return ret; + + return 0; +} +EXPORT_SYMBOL(xfrm6_rcv_cb); + +static int xfrm6_esp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(esp6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(esp6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static int xfrm6_ah_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(ah6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(ah6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static int xfrm6_ipcomp_rcv(struct sk_buff *skb) +{ + int ret; + struct xfrm6_protocol *handler; + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + + for_each_protocol_rcu(ipcomp6_handlers, handler) + if ((ret = handler->handler(skb)) != -EINVAL) + return ret; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; +} + +static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_protocol *handler; + + for_each_protocol_rcu(ipcomp6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static const struct inet6_protocol esp6_protocol = { + .handler = xfrm6_esp_rcv, + .err_handler = xfrm6_esp_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static const struct inet6_protocol ah6_protocol = { + .handler = xfrm6_ah_rcv, + .err_handler = xfrm6_ah_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static const struct inet6_protocol ipcomp6_protocol = { + .handler = xfrm6_ipcomp_rcv, + .err_handler = xfrm6_ipcomp_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static struct xfrm_input_afinfo xfrm6_input_afinfo = { + .family = AF_INET6, + .owner = THIS_MODULE, + .callback = xfrm6_rcv_cb, +}; + +static inline const struct inet6_protocol *netproto(unsigned char protocol) +{ + switch (protocol) { + case IPPROTO_ESP: + return &esp6_protocol; + case IPPROTO_AH: + return &ah6_protocol; + case IPPROTO_COMP: + return &ipcomp6_protocol; + } + + return NULL; +} + +int xfrm6_protocol_register(struct xfrm6_protocol *handler, + unsigned char protocol) +{ + struct xfrm6_protocol __rcu **pprev; + struct xfrm6_protocol *t; + bool add_netproto = false; + int ret = -EEXIST; + int priority = handler->priority; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm6_protocol_mutex); + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm6_protocol_mutex))) + add_netproto = true; + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t->priority < priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&xfrm6_protocol_mutex); + + if (add_netproto) { + if (inet6_add_protocol(netproto(protocol), protocol)) { + pr_err("%s: can't add protocol\n", __func__); + ret = -EAGAIN; + } + } + + return ret; +} +EXPORT_SYMBOL(xfrm6_protocol_register); + +int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, + unsigned char protocol) +{ + struct xfrm6_protocol __rcu **pprev; + struct xfrm6_protocol *t; + int ret = -ENOENT; + + if (!proto_handlers(protocol) || !netproto(protocol)) + return -EINVAL; + + mutex_lock(&xfrm6_protocol_mutex); + + for (pprev = proto_handlers(protocol); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&xfrm6_protocol_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + if (!rcu_dereference_protected(*proto_handlers(protocol), + lockdep_is_held(&xfrm6_protocol_mutex))) { + if (inet6_del_protocol(netproto(protocol), protocol) < 0) { + pr_err("%s: can't remove protocol\n", __func__); + ret = -EAGAIN; + } + } + + mutex_unlock(&xfrm6_protocol_mutex); + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(xfrm6_protocol_deregister); + +int __init xfrm6_protocol_init(void) +{ + return xfrm_input_register_afinfo(&xfrm6_input_afinfo); +} + +void xfrm6_protocol_fini(void) +{ + xfrm_input_unregister_afinfo(&xfrm6_input_afinfo); +} diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index f417b77fa0e..3fc970135fc 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -15,28 +15,35 @@ #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/netfilter_ipv6.h> +#include <linux/export.h> #include <net/dsfield.h> #include <net/ipv6.h> #include <net/addrconf.h> static void -__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { + const struct flowi6 *fl6 = &fl->u.ip6; + /* Initialize temporary selector matching only * to current session. */ - ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst); - ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src); - x->sel.dport = xfrm_flowi_dport(fl); - x->sel.dport_mask = htons(0xffff); - x->sel.sport = xfrm_flowi_sport(fl); - x->sel.sport_mask = htons(0xffff); - x->sel.family = AF_INET6; - x->sel.prefixlen_d = 128; - x->sel.prefixlen_s = 128; - x->sel.proto = fl->proto; - x->sel.ifindex = fl->oif; + *(struct in6_addr *)&sel->daddr = fl6->daddr; + *(struct in6_addr *)&sel->saddr = fl6->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl6->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl6->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET6; + sel->prefixlen_d = 128; + sel->prefixlen_s = 128; + sel->proto = fl6->flowi6_proto; + sel->ifindex = fl6->flowi6_oif; +} + +static void +xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) +{ x->id = tmpl->id; if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); @@ -94,7 +101,7 @@ static int __xfrm6_state_sort_cmp(void *p) return 1; else return 3; -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; @@ -127,7 +134,7 @@ static int __xfrm6_tmpl_sort_cmp(void *p) switch (v->mode) { case XFRM_MODE_TRANSPORT: return 1; -#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; @@ -168,12 +175,15 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .eth_proto = htons(ETH_P_IPV6), .owner = THIS_MODULE, .init_tempsel = __xfrm6_init_tempsel, + .init_temprop = xfrm6_init_temprop, .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, + .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, + .local_error = xfrm6_local_error, }; int __init xfrm6_state_init(void) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 438831d3359..1c66465a42d 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -12,8 +12,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see <http://www.gnu.org/licenses/>. * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> @@ -23,6 +22,7 @@ */ #include <linux/module.h> #include <linux/xfrm.h> +#include <linux/slab.h> #include <linux/rculist.h> #include <net/ip.h> #include <net/xfrm.h> @@ -30,6 +30,25 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/mutex.h> +#include <net/netns/generic.h> + +#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 +#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 + +#define XFRM6_TUNNEL_SPI_MIN 1 +#define XFRM6_TUNNEL_SPI_MAX 0xffffffff + +struct xfrm6_tunnel_net { + struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; + struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; + u32 spi; +}; + +static int xfrm6_tunnel_net_id __read_mostly; +static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) +{ + return net_generic(net, xfrm6_tunnel_net_id); +} /* * xfrm_tunnel_spi things are for allocating unique id ("spi") @@ -46,24 +65,13 @@ struct xfrm6_tunnel_spi { static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); -static u32 xfrm6_tunnel_spi; - -#define XFRM6_TUNNEL_SPI_MIN 1 -#define XFRM6_TUNNEL_SPI_MAX 0xffffffff - static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; -#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 -#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 - -static struct hlist_head xfrm6_tunnel_spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; -static struct hlist_head xfrm6_tunnel_spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; - -static inline unsigned xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) +static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) { - unsigned h; + unsigned int h; - h = (__force u32)(addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]); + h = ipv6_addr_hash((const struct in6_addr *)addr); h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; @@ -71,70 +79,33 @@ static inline unsigned xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) return h; } -static inline unsigned xfrm6_tunnel_spi_hash_byspi(u32 spi) +static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi) { return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; } - -static int xfrm6_tunnel_spi_init(void) -{ - int i; - - xfrm6_tunnel_spi = 0; - xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", - sizeof(struct xfrm6_tunnel_spi), - 0, SLAB_HWCACHE_ALIGN, - NULL); - if (!xfrm6_tunnel_spi_kmem) - return -ENOMEM; - - for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) - INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byaddr[i]); - for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) - INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byspi[i]); - return 0; -} - -static void xfrm6_tunnel_spi_fini(void) -{ - int i; - - for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) { - if (!hlist_empty(&xfrm6_tunnel_spi_byaddr[i])) - return; - } - for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) { - if (!hlist_empty(&xfrm6_tunnel_spi_byspi[i])) - return; - } - rcu_barrier(); - kmem_cache_destroy(xfrm6_tunnel_spi_kmem); - xfrm6_tunnel_spi_kmem = NULL; -} - -static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; - struct hlist_node *pos; - hlist_for_each_entry_rcu(x6spi, pos, - &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + hlist_for_each_entry_rcu(x6spi, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) + if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } return NULL; } -__be32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; rcu_read_lock_bh(); - x6spi = __xfrm6_tunnel_spi_lookup(saddr); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); spi = x6spi ? x6spi->spi : 0; rcu_read_unlock_bh(); return htonl(spi); @@ -142,14 +113,14 @@ __be32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); -static int __xfrm6_tunnel_spi_check(u32 spi) +static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; int index = xfrm6_tunnel_spi_hash_byspi(spi); - struct hlist_node *pos; - hlist_for_each_entry(x6spi, pos, - &xfrm6_tunnel_spi_byspi[index], + hlist_for_each_entry(x6spi, + &xfrm6_tn->spi_byspi[index], list_byspi) { if (x6spi->spi == spi) return -1; @@ -157,61 +128,61 @@ static int __xfrm6_tunnel_spi_check(u32 spi) return index; } -static u32 __xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); u32 spi; struct xfrm6_tunnel_spi *x6spi; int index; - if (xfrm6_tunnel_spi < XFRM6_TUNNEL_SPI_MIN || - xfrm6_tunnel_spi >= XFRM6_TUNNEL_SPI_MAX) - xfrm6_tunnel_spi = XFRM6_TUNNEL_SPI_MIN; + if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || + xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) + xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; else - xfrm6_tunnel_spi++; + xfrm6_tn->spi++; - for (spi = xfrm6_tunnel_spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { - index = __xfrm6_tunnel_spi_check(spi); + for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; } - for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tunnel_spi; spi++) { - index = __xfrm6_tunnel_spi_check(spi); + for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; } spi = 0; goto out; alloc_spi: - xfrm6_tunnel_spi = spi; + xfrm6_tn->spi = spi; x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); if (!x6spi) goto out; - INIT_RCU_HEAD(&x6spi->rcu_head); memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); x6spi->spi = spi; atomic_set(&x6spi->refcnt, 1); - hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tunnel_spi_byspi[index]); + hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); index = xfrm6_tunnel_spi_hash_byaddr(saddr); - hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tunnel_spi_byaddr[index]); + hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); out: return spi; } -__be32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; spin_lock_bh(&xfrm6_tunnel_spi_lock); - x6spi = __xfrm6_tunnel_spi_lookup(saddr); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); if (x6spi) { atomic_inc(&x6spi->refcnt); spi = x6spi->spi; } else - spi = __xfrm6_tunnel_alloc_spi(saddr); + spi = __xfrm6_tunnel_alloc_spi(net, saddr); spin_unlock_bh(&xfrm6_tunnel_spi_lock); return htonl(spi); @@ -225,18 +196,19 @@ static void x6spi_destroy_rcu(struct rcu_head *head) container_of(head, struct xfrm6_tunnel_spi, rcu_head)); } -void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) +static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) { + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; - struct hlist_node *pos, *n; + struct hlist_node *n; spin_lock_bh(&xfrm6_tunnel_spi_lock); - hlist_for_each_entry_safe(x6spi, pos, n, - &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + hlist_for_each_entry_safe(x6spi, n, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + if (xfrm6_addr_equal(&x6spi->addr, saddr)) { if (atomic_dec_and_test(&x6spi->refcnt)) { hlist_del_rcu(&x6spi->list_byaddr); hlist_del_rcu(&x6spi->list_byspi); @@ -248,8 +220,6 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) spin_unlock_bh(&xfrm6_tunnel_spi_lock); } -EXPORT_SYMBOL(xfrm6_tunnel_free_spi); - static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); @@ -263,11 +233,12 @@ static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_tunnel_rcv(struct sk_buff *skb) { - struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = ipv6_hdr(skb); __be32 spi; - spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi) > 0 ? : 0; + spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -326,7 +297,9 @@ static int xfrm6_tunnel_init_state(struct xfrm_state *x) static void xfrm6_tunnel_destroy(struct xfrm_state *x) { - xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); + struct net *net = xs_net(x); + + xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); } static const struct xfrm_type xfrm6_tunnel_type = { @@ -339,46 +312,85 @@ static const struct xfrm_type xfrm6_tunnel_type = { .output = xfrm6_tunnel_output, }; -static struct xfrm6_tunnel xfrm6_tunnel_handler = { +static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, }; -static struct xfrm6_tunnel xfrm46_tunnel_handler = { +static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, }; +static int __net_init xfrm6_tunnel_net_init(struct net *net) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); + xfrm6_tn->spi = 0; + + return 0; +} + +static void __net_exit xfrm6_tunnel_net_exit(struct net *net) +{ +} + +static struct pernet_operations xfrm6_tunnel_net_ops = { + .init = xfrm6_tunnel_net_init, + .exit = xfrm6_tunnel_net_exit, + .id = &xfrm6_tunnel_net_id, + .size = sizeof(struct xfrm6_tunnel_net), +}; + static int __init xfrm6_tunnel_init(void) { - if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) - goto err; - if (xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6)) - goto unreg; - if (xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET)) - goto dereg6; - if (xfrm6_tunnel_spi_init() < 0) - goto dereg46; + int rv; + + xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", + sizeof(struct xfrm6_tunnel_spi), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!xfrm6_tunnel_spi_kmem) + return -ENOMEM; + rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); + if (rv < 0) + goto out_pernet; + rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); + if (rv < 0) + goto out_type; + rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); + if (rv < 0) + goto out_xfrm6; + rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); + if (rv < 0) + goto out_xfrm46; return 0; -dereg46: - xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); -dereg6: +out_xfrm46: xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); -unreg: +out_xfrm6: xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); -err: - return -EAGAIN; +out_type: + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); +out_pernet: + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); + return rv; } static void __exit xfrm6_tunnel_fini(void) { - xfrm6_tunnel_spi_fini(); xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } module_init(xfrm6_tunnel_init); |
