Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--	net/ipv4/route.c	3812
1 files changed, 1688 insertions, 2124 deletions
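This diff removes the old hashed IPv4 route cache (rt_hash_table, rt_intern_hash(), the periodic garbage collector and flush timers) in favour of nexthop exceptions and per-CPU cached routes, and it also replaces the inet_peer-based IP-ID generator with a hashed array of ip_ident_bucket counters. As a reading aid, here is a minimal userspace sketch of the ip_idents_reserve() scheme that appears in the new code below; jiffies and prandom_u32_max() are replaced by time() and rand() stand-ins, so this is an illustration of the technique, not the kernel implementation.

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define IP_IDENTS_SZ 2048u		/* bucket count, as in the diff */

struct ip_ident_bucket {
	atomic_uint id;			/* running IP-ID counter for this bucket */
	atomic_uint stamp32;		/* last-use time (coarse seconds here) */
};

static struct ip_ident_bucket ip_idents[IP_IDENTS_SZ];

static unsigned int now32(void)
{
	return (unsigned int)time(NULL);	/* stand-in for (u32)jiffies */
}

static unsigned int ip_idents_reserve(unsigned int hash, int segs)
{
	struct ip_ident_bucket *bucket = &ip_idents[hash % IP_IDENTS_SZ];
	unsigned int old = atomic_load(&bucket->stamp32);
	unsigned int now = now32();
	unsigned int delta = 0;

	/* Bucket was idle: add a random jump bounded by the idle time,
	 * so an observer cannot count packets sent in between
	 * (stand-in for prandom_u32_max(now - old) in the new code). */
	if (old != now &&
	    atomic_compare_exchange_strong(&bucket->stamp32, &old, now))
		delta = (unsigned int)rand() % (now - old);

	/* Reserve 'segs' consecutive IDs and return the first of them,
	 * mirroring atomic_add_return(segs + delta, &bucket->id) - segs. */
	return atomic_fetch_add(&bucket->id, (unsigned int)segs + delta) + delta;
}

int main(void)
{
	srand((unsigned int)time(NULL));
	unsigned int id = ip_idents_reserve(0x12345678u, 1);
	printf("reserved id: %u\n", id & 0xffffu);	/* low 16 bits go into iph->id */
	return 0;
}
```

The perturbation is bounded by how long the bucket sat idle, which matches the privacy rationale stated in the comment the diff adds: an attacker should not be able to infer how many packets were sent between two points in time.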
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 381dd6a6aeb..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,8 +5,6 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> @@ -20,7 +18,7 @@ * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics - * Alan Cox : Super /proc >4K + * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. @@ -38,7 +36,7 @@ * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. - * + * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) @@ -55,6 +53,8 @@ * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. + * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect + * Ilia Sotnikov : Removed TOS from hash calculations * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -62,16 +62,14 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> +#define pr_fmt(fmt) "IPv4: " fmt + #include <linux/module.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/mm.h> -#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -82,16 +80,18 @@ #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> -#include <linux/rtnetlink.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> +#include <linux/slab.h> +#include <linux/jhash.h> +#include <net/dst.h> +#include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> #include <net/route.h> @@ -102,74 +102,75 @@ #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> -#include <net/ip_mp_alg.h> +#include <net/netevent.h> +#include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> +#include <linux/kmemleak.h> #endif +#include <net/secure_seq.h> -#define RT_FL_TOS(oldflp) \ - ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) - -#define IP_MAX_MTU 0xFFF0 +#define RT_FL_TOS(oldflp4) \ + ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) #define RT_GC_TIMEOUT (300*HZ) -static int ip_rt_min_delay = 2 * HZ; -static int ip_rt_max_delay = 10 * HZ; static int ip_rt_max_size; -static int ip_rt_gc_timeout = RT_GC_TIMEOUT; -static int ip_rt_gc_interval = 60 * HZ; -static int ip_rt_gc_min_interval = HZ / 2; -static int ip_rt_redirect_number = 9; -static int ip_rt_redirect_load = HZ / 50; -static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1)); -static int ip_rt_error_cost = HZ; -static int ip_rt_error_burst = 5 * HZ; -static int ip_rt_gc_elasticity = 8; -static int ip_rt_mtu_expires = 10 * 60 * HZ; -static int ip_rt_min_pmtu = 512 + 20 + 20; -static int ip_rt_min_advmss = 256; -static int 
ip_rt_secret_interval = 10 * 60 * HZ; -static unsigned long rt_deadline; - -#define RTprint(a...) printk(KERN_DEBUG a) - -static struct timer_list rt_flush_timer; -static struct timer_list rt_periodic_timer; -static struct timer_list rt_secret_timer; +static int ip_rt_redirect_number __read_mostly = 9; +static int ip_rt_redirect_load __read_mostly = HZ / 50; +static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); +static int ip_rt_error_cost __read_mostly = HZ; +static int ip_rt_error_burst __read_mostly = 5 * HZ; +static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; +static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; +static int ip_rt_min_advmss __read_mostly = 256; /* * Interface to generic destination cache. */ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); -static void ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, - struct net_device *dev, int how); +static unsigned int ipv4_default_advmss(const struct dst_entry *dst); +static unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(void); +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +static void ipv4_dst_destroy(struct dst_entry *dst); +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + WARN_ON(1); + return NULL; +} + +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), - .gc = rt_garbage_collect, + .protocol = cpu_to_be16(ETH_P_IP), .check = ipv4_dst_check, + .default_advmss = ipv4_default_advmss, + .mtu = ipv4_mtu, + .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, - .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, - .entry_size = sizeof(struct rtable), + .redirect = ip_do_redirect, + .local_out = __ip_local_out, + .neigh_lookup = ipv4_neigh_lookup, }; #define ECN_OR_COST(class) TC_PRIO_##class -__u8 ip_tos2prio[16] = { +const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, - ECN_OR_COST(FILLER), + ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, @@ -185,140 +186,27 @@ __u8 ip_tos2prio[16] = { TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; +EXPORT_SYMBOL(ip_tos2prio); - -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. - * 2) Only writers remove entries, and they hold the lock - * as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - * they do so with atomic increments and with the - * lock held. - */ - -struct rt_hash_bucket { - struct rtable *chain; -}; -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. 
- */ -#if NR_CPUS >= 32 -#define RT_HASH_LOCK_SZ 4096 -#elif NR_CPUS >= 16 -#define RT_HASH_LOCK_SZ 2048 -#elif NR_CPUS >= 8 -#define RT_HASH_LOCK_SZ 1024 -#elif NR_CPUS >= 4 -#define RT_HASH_LOCK_SZ 512 -#else -#define RT_HASH_LOCK_SZ 256 -#endif - -static spinlock_t *rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] -# define rt_hash_lock_init() { \ - int i; \ - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ - if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ - for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ - spin_lock_init(&rt_hash_locks[i]); \ - } -#else -# define rt_hash_lock_addr(slot) NULL -# define rt_hash_lock_init() -#endif - -static struct rt_hash_bucket *rt_hash_table; -static unsigned rt_hash_mask; -static int rt_hash_log; -static unsigned int rt_hash_rnd; - -static struct rt_cache_stat *rt_cache_stat; -#define RT_CACHE_STAT_INC(field) \ - (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) - -static int rt_intern_hash(unsigned hash, struct rtable *rth, - struct rtable **res); - -static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos) -{ - return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd) - & rt_hash_mask); -} +static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { - int bucket; -}; - -static struct rtable *rt_cache_get_first(struct seq_file *seq) -{ - struct rtable *r = NULL; - struct rt_cache_iter_state *st = seq->private; - - for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; - if (r) - break; - rcu_read_unlock_bh(); - } - return r; -} - -static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r) -{ - struct rt_cache_iter_state *st = rcu_dereference(seq->private); - - r = r->u.rt_next; - while (!r) { - rcu_read_unlock_bh(); - if (--st->bucket < 0) - break; - rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; - } - return r; -} - -static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) -{ - struct rtable *r = rt_cache_get_first(seq); - - if (r) - while (pos && (r = rt_cache_get_next(seq, r))) - --pos; - return pos ? NULL : r; -} - static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { - return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + if (*pos) + return NULL; + return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct rtable *r = NULL; - - if (v == SEQ_START_TOKEN) - r = rt_cache_get_first(seq); - else - r = rt_cache_get_next(seq, v); ++*pos; - return r; + return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { - if (v && v != SEQ_START_TOKEN) - rcu_read_unlock_bh(); } static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -328,32 +216,10 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); - else { - struct rtable *r = v; - char temp[256]; - - sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" - "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", - r->u.dst.dev ? r->u.dst.dev->name : "*", - (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (unsigned long)r->rt_src, - (dst_metric(&r->u.dst, RTAX_ADVMSS) ? 
- (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), - dst_metric(&r->u.dst, RTAX_WINDOW), - (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + - dst_metric(&r->u.dst, RTAX_RTTVAR)), - r->fl.fl4_tos, - r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, - r->u.dst.hh ? (r->u.dst.hh->hh_output == - dev_queue_xmit) : 0, - r->rt_spec_dst); - seq_printf(seq, "%-127s\n", temp); - } - return 0; -} - -static struct seq_operations rt_cache_seq_ops = { + return 0; +} + +static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, @@ -362,31 +228,15 @@ static struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - rc = seq_open(file, &rt_cache_seq_ops); - if (rc) - goto out_kfree; - seq = file->private_data; - seq->private = s; - memset(s, 0, sizeof(*s)); -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open(file, &rt_cache_seq_ops); } -static struct file_operations rt_cache_seq_fops = { +static const struct file_operations rt_cache_seq_fops = { .owner = THIS_MODULE, .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release, }; @@ -397,11 +247,11 @@ static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return per_cpu_ptr(rt_cache_stat, cpu); + return &per_cpu(rt_cache_stat, cpu); } return NULL; } @@ -410,14 +260,14 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; - for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; - return per_cpu_ptr(rt_cache_stat, cpu); + return &per_cpu(rt_cache_stat, cpu); } return NULL; - + } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) @@ -433,11 +283,11 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } - + seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", - atomic_read(&ipv4_dst_ops.entries), - st->in_hit, + dst_entries_get_slow(&ipv4_dst_ops), + 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, @@ -445,21 +295,21 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) st->in_martian_dst, st->in_martian_src, - st->out_hit, + 0, /* st->out_hit */ st->out_slow_tot, - st->out_slow_mc, - - st->gc_total, - st->gc_ignored, - st->gc_goal_miss, - st->gc_dst_overflow, - st->in_hlist_search, - st->out_hlist_search + st->out_slow_mc, + + 0, /* st->gc_total */ + 0, /* st->gc_ignored */ + 0, /* st->gc_goal_miss */ + 0, /* st->gc_dst_overflow */ + 0, /* st->in_hlist_search */ + 0 /* st->out_hlist_search */ ); return 0; } -static struct seq_operations rt_cpu_seq_ops = { +static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, @@ -472,7 +322,7 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file) 
return seq_open(file, &rt_cpu_seq_ops); } -static struct file_operations rt_cpu_seq_fops = { +static const struct file_operations rt_cpu_seq_fops = { .owner = THIS_MODULE, .open = rt_cpu_seq_open, .read = seq_read, @@ -480,652 +330,409 @@ static struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#endif /* CONFIG_PROC_FS */ - -static __inline__ void rt_free(struct rtable *rt) +#ifdef CONFIG_IP_ROUTE_CLASSID +static int rt_acct_proc_show(struct seq_file *m, void *v) { - multipath_remove(rt); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); -} + struct ip_rt_acct *dst, *src; + unsigned int i, j; -static __inline__ void rt_drop(struct rtable *rt) -{ - multipath_remove(rt); - ip_rt_put(rt); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); -} + dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + if (!dst) + return -ENOMEM; -static __inline__ int rt_fast_clean(struct rtable *rth) -{ - /* Kill broadcast/multicast entries very aggresively, if they - collide in hash table with more useful entries */ - return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->fl.iif && rth->u.rt_next; + for_each_possible_cpu(i) { + src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); + for (j = 0; j < 256; j++) { + dst[j].o_bytes += src[j].o_bytes; + dst[j].o_packets += src[j].o_packets; + dst[j].i_bytes += src[j].i_bytes; + dst[j].i_packets += src[j].i_packets; + } + } + + seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); + kfree(dst); + return 0; } -static __inline__ int rt_valuable(struct rtable *rth) +static int rt_acct_proc_open(struct inode *inode, struct file *file) { - return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->u.dst.expires; + return single_open(file, rt_acct_proc_show, NULL); } -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +static const struct file_operations rt_acct_proc_fops = { + .owner = THIS_MODULE, + .open = rt_acct_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static int __net_init ip_rt_do_proc_init(struct net *net) { - unsigned long age; - int ret = 0; + struct proc_dir_entry *pde; - if (atomic_read(&rth->u.dst.__refcnt)) - goto out; + pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + &rt_cache_seq_fops); + if (!pde) + goto err1; - ret = 1; - if (rth->u.dst.expires && - time_after_eq(jiffies, rth->u.dst.expires)) - goto out; + pde = proc_create("rt_cache", S_IRUGO, + net->proc_net_stat, &rt_cpu_seq_fops); + if (!pde) + goto err2; - age = jiffies - rth->u.dst.lastuse; - ret = 0; - if ((age <= tmo1 && !rt_fast_clean(rth)) || - (age <= tmo2 && rt_valuable(rth))) - goto out; - ret = 1; -out: return ret; +#ifdef CONFIG_IP_ROUTE_CLASSID + pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); + if (!pde) + goto err3; +#endif + return 0; + +#ifdef CONFIG_IP_ROUTE_CLASSID +err3: + remove_proc_entry("rt_cache", net->proc_net_stat); +#endif +err2: + remove_proc_entry("rt_cache", net->proc_net); +err1: + return -ENOMEM; } -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter - */ -static inline u32 rt_score(struct rtable *rt) +static void __net_exit ip_rt_do_proc_exit(struct net *net) { - u32 score = jiffies - rt->u.dst.lastuse; - - score = ~score & ~(3<<30); - - if (rt_valuable(rt)) - score |= (1<<31); + remove_proc_entry("rt_cache", net->proc_net_stat); + remove_proc_entry("rt_cache", net->proc_net); +#ifdef CONFIG_IP_ROUTE_CLASSID + remove_proc_entry("rt_acct", net->proc_net); +#endif +} 
- if (!rt->fl.iif || - !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) - score |= (1<<30); +static struct pernet_operations ip_rt_proc_ops __net_initdata = { + .init = ip_rt_do_proc_init, + .exit = ip_rt_do_proc_exit, +}; - return score; +static int __init ip_rt_proc_init(void) +{ + return register_pernet_subsys(&ip_rt_proc_ops); } -static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +#else +static inline int ip_rt_proc_init(void) { - return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && - fl1->oif == fl2->oif && - fl1->iif == fl2->iif; + return 0; } +#endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED -static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, - struct rtable *expentry, - int *removed_count) +static inline bool rt_is_expired(const struct rtable *rth) { - int passedexpired = 0; - struct rtable **nextstep = NULL; - struct rtable **rthp = chain_head; - struct rtable *rth; - - if (removed_count) - *removed_count = 0; - - while ((rth = *rthp) != NULL) { - if (rth == expentry) - passedexpired = 1; - - if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 && - compare_keys(&(*rthp)->fl, &expentry->fl)) { - if (*rthp == expentry) { - *rthp = rth->u.rt_next; - continue; - } else { - *rthp = rth->u.rt_next; - rt_free(rth); - if (removed_count) - ++(*removed_count); - } - } else { - if (!((*rthp)->u.dst.flags & DST_BALANCED) && - passedexpired && !nextstep) - nextstep = &rth->u.rt_next; - - rthp = &rth->u.rt_next; - } - } - - rt_free(expentry); - if (removed_count) - ++(*removed_count); - - return nextstep; + return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); } -#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ +void rt_cache_flush(struct net *net) +{ + rt_genid_bump_ipv4(net); +} -/* This runs via a timer and thus is always in BH context. */ -static void rt_check_expire(unsigned long dummy) +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) { - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth, **rthp; - unsigned long now = jiffies; - u64 mult; + struct net_device *dev = dst->dev; + const __be32 *pkey = daddr; + const struct rtable *rt; + struct neighbour *n; - mult = ((u64)ip_rt_gc_interval) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; + rt = (const struct rtable *) dst; + if (rt->rt_gateway) + pkey = (const __be32 *) &rt->rt_gateway; + else if (skb) + pkey = &ip_hdr(skb)->daddr; - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; + n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); + if (n) + return n; + return neigh_create(&arp_tbl, pkey, dev); +} - if (*rthp == 0) - continue; - spin_lock(rt_hash_lock_addr(i)); - while ((rth = *rthp) != NULL) { - if (rth->u.dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(now, rth->u.dst.expires)) { - tmo >>= 1; - rthp = &rth->u.rt_next; - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.rt_next; - continue; - } +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { + atomic_t id; + u32 stamp32; +}; - /* Cleanup aged off entries. 
*/ -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - /* remove all related balanced entries if necessary */ - if (rth->u.dst.flags & DST_BALANCED) { - rthp = rt_remove_balanced_route( - &rt_hash_table[i].chain, - rth, NULL); - if (!rthp) - break; - } else { - *rthp = rth->u.rt_next; - rt_free(rth); - } -#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - *rthp = rth->u.rt_next; - rt_free(rth); -#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - } - spin_unlock(rt_hash_lock_addr(i)); +static struct ip_ident_bucket *ip_idents __read_mostly; - /* Fallback loop breaker. */ - if (time_after(jiffies, now)) - break; - } - rover = i; - mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); -} - -/* This can run from both BH and non-BH contexts, the latter - * in the case of a forced flush event. +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. */ -static void rt_run_flush(unsigned long dummy) +u32 ip_idents_reserve(u32 hash, int segs) { - int i; - struct rtable *rth, *next; - - rt_deadline = 0; - - get_random_bytes(&rt_hash_rnd, 4); + struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(bucket->stamp32); + u32 now = (u32)jiffies; + u32 delta = 0; - for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(rt_hash_lock_addr(i)); - rth = rt_hash_table[i].chain; - if (rth) - rt_hash_table[i].chain = NULL; - spin_unlock_bh(rt_hash_lock_addr(i)); + if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + delta = prandom_u32_max(now - old); - for (; rth; rth = next) { - next = rth->u.rt_next; - rt_free(rth); - } - } + return atomic_add_return(segs + delta, &bucket->id) - segs; } +EXPORT_SYMBOL(ip_idents_reserve); -static DEFINE_SPINLOCK(rt_flush_lock); - -void rt_cache_flush(int delay) +void __ip_select_ident(struct iphdr *iph, int segs) { - unsigned long now = jiffies; - int user_mode = !in_softirq(); + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; - if (delay < 0) - delay = ip_rt_min_delay; + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); - /* flush existing multipath state*/ - multipath_flush(); - - spin_lock_bh(&rt_flush_lock); - - if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { - long tmo = (long)(rt_deadline - now); - - /* If flush timer is already running - and flush request is not immediate (delay > 0): - - if deadline is not achieved, prolongate timer to "delay", - otherwise fire it at deadline time. - */ + hash = jhash_3words((__force u32)iph->daddr, + (__force u32)iph->saddr, + iph->protocol, + ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); +} +EXPORT_SYMBOL(__ip_select_ident); - if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) - tmo = 0; - - if (delay > tmo) - delay = tmo; - } +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct iphdr *iph, + int oif, u8 tos, + u8 prot, u32 mark, int flow_flags) +{ + if (sk) { + const struct inet_sock *inet = inet_sk(sk); - if (delay <= 0) { - spin_unlock_bh(&rt_flush_lock); - rt_run_flush(0); - return; + oif = sk->sk_bound_dev_if; + mark = sk->sk_mark; + tos = RT_CONN_FLAGS(sk); + prot = inet->hdrincl ? 
IPPROTO_RAW : sk->sk_protocol; } - - if (rt_deadline == 0) - rt_deadline = now + ip_rt_max_delay; - - mod_timer(&rt_flush_timer, now+delay); - spin_unlock_bh(&rt_flush_lock); + flowi4_init_output(fl4, oif, mark, tos, + RT_SCOPE_UNIVERSE, prot, + flow_flags, + iph->daddr, iph->saddr, 0, 0); } -static void rt_secret_rebuild(unsigned long dummy) +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, + const struct sock *sk) { - unsigned long now = jiffies; + const struct iphdr *iph = ip_hdr(skb); + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; - rt_cache_flush(0); - mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); + __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); } -/* - Short description of GC goals. +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; - We want to build algorithm, which will keep routing cache - at some equilibrium point, when number of aged off entries - is kept approximately equal to newly generated ones. + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk), + daddr, inet->inet_saddr, 0, 0); + rcu_read_unlock(); +} - Current expiration strength is variable "expire". - We try to adjust it dynamically, so that if networking - is idle expires is large enough to keep enough of warm entries, - and when load increases it reduces to limit cache size. - */ +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct sk_buff *skb) +{ + if (skb) + build_skb_flow_key(fl4, skb, sk); + else + build_sk_flow_key(fl4, sk); +} -static int rt_garbage_collect(void) +static inline void rt_free(struct rtable *rt) { - static unsigned long expire = RT_GC_TIMEOUT; - static unsigned long last_gc; - static int rover; - static int equilibrium; - struct rtable *rth, **rthp; - unsigned long now = jiffies; - int goal; + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} - /* - * Garbage collection is pretty expensive, - * do not make it too frequently. - */ +static DEFINE_SPINLOCK(fnhe_lock); - RT_CACHE_STAT_INC(gc_total); +static void fnhe_flush_routes(struct fib_nh_exception *fnhe) +{ + struct rtable *rt; - if (now - last_gc < ip_rt_gc_min_interval && - atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { - RT_CACHE_STAT_INC(gc_ignored); - goto out; + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); + rt_free(rt); } - - /* Calculate number of entries, which we want to expire now. */ - goal = atomic_read(&ipv4_dst_ops.entries) - - (ip_rt_gc_elasticity << rt_hash_log); - if (goal <= 0) { - if (equilibrium < ipv4_dst_ops.gc_thresh) - equilibrium = ipv4_dst_ops.gc_thresh; - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; - if (goal > 0) { - equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; - } - } else { - /* We are in dangerous area. Try to reduce cache really - * aggressively. 
- */ - goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); - equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); + rt_free(rt); } +} - if (now - last_gc >= ip_rt_gc_min_interval) - last_gc = now; +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +{ + struct fib_nh_exception *fnhe, *oldest; - if (goal <= 0) { - equilibrium += goal; - goto work_done; + oldest = rcu_dereference(hash->chain); + for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + oldest = fnhe; } + fnhe_flush_routes(oldest); + return oldest; +} - do { - int i, k; - - for (i = rt_hash_mask, k = rover; i >= 0; i--) { - unsigned long tmo = expire; - - k = (k + 1) & rt_hash_mask; - rthp = &rt_hash_table[k].chain; - spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = *rthp) != NULL) { - if (!rt_may_expire(rth, tmo, expire)) { - tmo >>= 1; - rthp = &rth->u.rt_next; - continue; - } -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - /* remove all related balanced entries - * if necessary - */ - if (rth->u.dst.flags & DST_BALANCED) { - int r; - - rthp = rt_remove_balanced_route( - &rt_hash_table[i].chain, - rth, - &r); - goal -= r; - if (!rthp) - break; - } else { - *rthp = rth->u.rt_next; - rt_free(rth); - goal--; - } -#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - *rthp = rth->u.rt_next; - rt_free(rth); - goal--; -#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - } - spin_unlock_bh(rt_hash_lock_addr(k)); - if (goal <= 0) - break; - } - rover = k; - - if (goal <= 0) - goto work_done; - - /* Goal is not achieved. We stop process if: - - - if expire reduced to zero. Otherwise, expire is halfed. - - if table is not full. - - if we are called from interrupt. - - jiffies check is just fallback/debug loop breaker. - We will not spin here for long time in any case. 
- */ - - RT_CACHE_STAT_INC(gc_goal_miss); +static inline u32 fnhe_hashfun(__be32 daddr) +{ + u32 hval; - if (expire == 0) - break; + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); - expire >>= 1; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, i); -#endif + return hval & (FNHE_HASH_SIZE - 1); +} - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) - goto out; - } while (!in_softirq() && time_before_eq(jiffies, now)); +static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) +{ + rt->rt_pmtu = fnhe->fnhe_pmtu; + rt->dst.expires = fnhe->fnhe_expires; - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) - goto out; - if (net_ratelimit()) - printk(KERN_WARNING "dst cache overflow\n"); - RT_CACHE_STAT_INC(gc_dst_overflow); - return 1; - -work_done: - expire += ip_rt_gc_min_interval; - if (expire > ip_rt_gc_timeout || - atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, rover); -#endif -out: return 0; + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; + rt->rt_uses_gateway = 1; + } } -static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + u32 pmtu, unsigned long expires) { - struct rtable *rth, **rthp; - unsigned long now; - struct rtable *cand, **candp; - u32 min_score; - int chain_length; - int attempts = !in_softirq(); + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe; + struct rtable *rt; + unsigned int i; + int depth; + u32 hval = fnhe_hashfun(daddr); -restart: - chain_length = 0; - min_score = ~(u32)0; - cand = NULL; - candp = NULL; - now = jiffies; + spin_lock_bh(&fnhe_lock); - rthp = &rt_hash_table[hash].chain; + hash = nh->nh_exceptions; + if (!hash) { + hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); + if (!hash) + goto out_unlock; + nh->nh_exceptions = hash; + } - spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = *rthp) != NULL) { -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - if (!(rth->u.dst.flags & DST_BALANCED) && - compare_keys(&rth->fl, &rt->fl)) { -#else - if (compare_keys(&rth->fl, &rt->fl)) { -#endif - /* Put it first */ - *rthp = rth->u.rt_next; - /* - * Since lookup is lockfree, the deletion - * must be visible to another weakly ordered CPU before - * the insertion at the start of the hash chain. - */ - rcu_assign_pointer(rth->u.rt_next, - rt_hash_table[hash].chain); - /* - * Since lookup is lockfree, the update writes - * must be ordered for consistency on SMP. 
- */ - rcu_assign_pointer(rt_hash_table[hash].chain, rth); + hash += hval; - rth->u.dst.__use++; - dst_hold(&rth->u.dst); - rth->u.dst.lastuse = now; - spin_unlock_bh(rt_hash_lock_addr(hash)); + depth = 0; + for (fnhe = rcu_dereference(hash->chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + break; + depth++; + } - rt_drop(rt); - *rp = rth; - return 0; + if (fnhe) { + if (gw) + fnhe->fnhe_gw = gw; + if (pmtu) { + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = max(1UL, expires); } - - if (!atomic_read(&rth->u.dst.__refcnt)) { - u32 score = rt_score(rth); - - if (score <= min_score) { - cand = rth; - candp = rthp; - min_score = score; - } + /* Update all cached dsts too */ + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) + fill_route_from_fnhe(rt, fnhe); + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) + fill_route_from_fnhe(rt, fnhe); + } else { + if (depth > FNHE_RECLAIM_DEPTH) + fnhe = fnhe_oldest(hash); + else { + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); } - - chain_length++; - - rthp = &rth->u.rt_next; - } - - if (cand) { - /* ip_rt_gc_elasticity used to be average length of chain - * length, when exceeded gc becomes really aggressive. - * - * The second limit is less certain. At the moment it allows - * only 2 entries per bucket. We will see. + fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); + fnhe->fnhe_daddr = daddr; + fnhe->fnhe_gw = gw; + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = expires; + + /* Exception created; mark the cached routes for the nexthop + * stale, so anyone caching it rechecks if this exception + * applies to them. */ - if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->u.rt_next; - rt_free(cand); + rt = rcu_dereference(nh->nh_rth_input); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; + + for_each_possible_cpu(i) { + struct rtable __rcu **prt; + prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + rt = rcu_dereference(*prt); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; } } - /* Try to bind route to arp only if it is output - route or unicast forwarding path. - */ - if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); - if (err) { - spin_unlock_bh(rt_hash_lock_addr(hash)); - - if (err != -ENOBUFS) { - rt_drop(rt); - return err; - } + fnhe->fnhe_stamp = jiffies; - /* Neighbour tables are full and nothing - can be released. Try to shrink route cache, - it is most likely it holds some neighbour records. - */ - if (attempts-- > 0) { - int saved_elasticity = ip_rt_gc_elasticity; - int saved_int = ip_rt_gc_min_interval; - ip_rt_gc_elasticity = 1; - ip_rt_gc_min_interval = 0; - rt_garbage_collect(); - ip_rt_gc_min_interval = saved_int; - ip_rt_gc_elasticity = saved_elasticity; - goto restart; - } - - if (net_ratelimit()) - printk(KERN_WARNING "Neighbour table overflow.\n"); - rt_drop(rt); - return -ENOBUFS; - } - } - - rt->u.rt_next = rt_hash_table[hash].chain; -#if RT_CACHE_DEBUG >= 2 - if (rt->u.rt_next) { - struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, - NIPQUAD(rt->rt_dst)); - for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next) - printk(" . 
%u.%u.%u.%u", NIPQUAD(trt->rt_dst)); - printk("\n"); - } -#endif - rt_hash_table[hash].chain = rt; - spin_unlock_bh(rt_hash_lock_addr(hash)); - *rp = rt; - return 0; +out_unlock: + spin_unlock_bh(&fnhe_lock); } -void rt_bind_peer(struct rtable *rt, int create) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, + bool kill_route) { - static DEFINE_SPINLOCK(rt_peer_lock); - struct inet_peer *peer; + __be32 new_gw = icmp_hdr(skb)->un.gateway; + __be32 old_gw = ip_hdr(skb)->saddr; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct fib_result res; + struct neighbour *n; + struct net *net; - peer = inet_getpeer(rt->rt_dst, create); + switch (icmp_hdr(skb)->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + break; - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; + default: + return; } - spin_unlock_bh(&rt_peer_lock); - if (peer) - inet_putpeer(peer); -} - -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; - - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id(ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); -} - -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) -{ - struct rtable *rt = (struct rtable *) dst; - - if (rt) { - if (rt->peer == NULL) - rt_bind_peer(rt, 1); - - /* If peer is attached to destination, it is never detached, - so that we need not to grab a lock to dereference it. 
- */ - if (rt->peer) { - iph->id = htons(inet_getid(rt->peer, more)); - return; - } - } else - printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", - __builtin_return_address(0)); - - ip_select_fb_ident(iph); -} - -static void rt_del(unsigned hash, struct rtable *rt) -{ - struct rtable **rthp; - - spin_lock_bh(rt_hash_lock_addr(hash)); - ip_rt_put(rt); - for (rthp = &rt_hash_table[hash].chain; *rthp; - rthp = &(*rthp)->u.rt_next) - if (*rthp == rt) { - *rthp = rt->u.rt_next; - rt_free(rt); - break; - } - spin_unlock_bh(rt_hash_lock_addr(hash)); -} -void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, - u32 saddr, u8 tos, struct net_device *dev) -{ - int i, k; - struct in_device *in_dev = in_dev_get(dev); - struct rtable *rth, **rthp; - u32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - - tos &= IPTOS_RT_MASK; + if (rt->rt_gateway != old_gw) + return; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; - if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) + net = dev_net(dev); + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || + ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || + ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { @@ -1134,133 +741,73 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { - if (inet_addr_type(new_gw) != RTN_UNICAST) + if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } - for (i = 0; i < 2; i++) { - for (k = 0; k < 2; k++) { - unsigned hash = rt_hash_code(daddr, - skeys[i] ^ (ikeys[k] << 5), - tos); - - rthp=&rt_hash_table[hash].chain; - - rcu_read_lock(); - while ((rth = rcu_dereference(*rthp)) != NULL) { - struct rtable *rt; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->fl.fl4_tos != tos || - rth->fl.oif != ikeys[k] || - rth->fl.iif != 0) { - rthp = &rth->u.rt_next; - continue; - } - - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->u.dst.error || - rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) - break; - - dst_hold(&rth->u.dst); - rcu_read_unlock(); - - rt = dst_alloc(&ipv4_dst_ops); - if (rt == NULL) { - ip_rt_put(rth); - in_dev_put(in_dev); - return; - } - - /* Copy all the information. */ - *rt = *rth; - INIT_RCU_HEAD(&rt->u.dst.rcu_head); - rt->u.dst.__use = 1; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.child = NULL; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); - if (rt->idev) - in_dev_hold(rt->idev); - rt->u.dst.obsolete = 0; - rt->u.dst.lastuse = jiffies; - rt->u.dst.path = &rt->u.dst; - rt->u.dst.neighbour = NULL; - rt->u.dst.hh = NULL; - rt->u.dst.xfrm = NULL; - - rt->rt_flags |= RTCF_REDIRECTED; - - /* Gateway is different ... 
*/ - rt->rt_gateway = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->u.dst); - - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - - if (arp_bind_neighbour(&rt->u.dst) || - !(rt->u.dst.neighbour->nud_state & - NUD_VALID)) { - if (rt->u.dst.neighbour) - neigh_event_send(rt->u.dst.neighbour, NULL); - ip_rt_put(rth); - rt_drop(rt); - goto do_next; - } + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + if (n) { + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + } else { + if (fib_lookup(net, fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); - rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt)) - ip_rt_put(rt); - goto do_next; + update_or_create_fnhe(nh, fl4->daddr, new_gw, + 0, 0); } - rcu_read_unlock(); - do_next: - ; + if (kill_route) + rt->dst.obsolete = DST_OBSOLETE_KILL; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } + neigh_release(n); } - in_dev_put(in_dev); return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about " - "%u.%u.%u.%u ignored.\n" - " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, " - "tos %02x\n", - NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), - NIPQUAD(saddr), NIPQUAD(daddr), tos); + if (IN_DEV_LOG_MARTIANS(in_dev)) { + const struct iphdr *iph = (const struct iphdr *) skb->data; + __be32 daddr = iph->daddr; + __be32 saddr = iph->saddr; + + net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" + " Advised path = %pI4 -> %pI4\n", + &old_gw, dev->name, &new_gw, + &saddr, &daddr); + } #endif - in_dev_put(in_dev); + ; +} + +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl4; + const struct iphdr *iph = (const struct iphdr *) skb->data; + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; + + rt = (struct rtable *) dst; + + __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __ip_do_redirect(rt, skb, &fl4, true); } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { - struct rtable *rt = (struct rtable*)dst; + struct rtable *rt = (struct rtable *)dst; struct dst_entry *ret = dst; if (rt) { - if (dst->obsolete) { + if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->u.dst.expires) { - unsigned hash = rt_hash_code(rt->fl.fl4_dst, - rt->fl.fl4_src ^ - (rt->fl.oif << 5), - rt->fl.fl4_tos); -#if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ip_rt_advice: redirect to " - "%u.%u.%u.%u/%02x dropped\n", - NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); -#endif - rt_del(hash, rt); + rt->dst.expires) { + ip_rt_put(rt); ret = NULL; } } @@ -1285,211 +832,305 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { - struct rtable *rt = (struct rtable*)skb->dst; - struct in_device *in_dev = in_dev_get(rt->u.dst.dev); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct inet_peer *peer; + struct net *net; + int log_martians; - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { + rcu_read_unlock(); return; + } + log_martians = IN_DEV_LOG_MARTIANS(in_dev); + rcu_read_unlock(); - if (!IN_DEV_TX_REDIRECTS(in_dev)) - goto out; + net = dev_net(rt->dst.dev); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + if (!peer) { + icmp_send(skb, 
ICMP_REDIRECT, ICMP_REDIR_HOST, + rt_nexthop(rt, ip_hdr(skb)->daddr)); + return; + } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) - rt->u.dst.rate_tokens = 0; + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything - * set u.dst.rate_last to the last seen redirected packet. + * set dst.rate_last to the last seen redirected packet. */ - if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { - rt->u.dst.rate_last = jiffies; - goto out; + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; + goto out_put_peer; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (time_after(jiffies, - (rt->u.dst.rate_last + - (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->u.dst.rate_last = jiffies; - ++rt->u.dst.rate_tokens; + if (peer->rate_tokens == 0 || + time_after(jiffies, + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { + __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); + peer->rate_last = jiffies; + ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && - rt->u.dst.rate_tokens == ip_rt_redirect_number && - net_ratelimit()) - printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores " - "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n", - NIPQUAD(rt->rt_src), rt->rt_iif, - NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); + if (log_martians && + peer->rate_tokens == ip_rt_redirect_number) + net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", + &ip_hdr(skb)->saddr, inet_iif(skb), + &ip_hdr(skb)->daddr, &gw); #endif } -out: - in_dev_put(in_dev); +out_put_peer: + inet_putpeer(peer); } static int ip_error(struct sk_buff *skb) { - struct rtable *rt = (struct rtable*)skb->dst; + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; unsigned long now; + struct net *net; + bool send; int code; - switch (rt->u.dst.error) { - case EINVAL: - default: - goto out; + net = dev_net(rt->dst.dev); + if (!IN_DEV_FORWARD(in_dev)) { + switch (rt->dst.error) { case EHOSTUNREACH: - code = ICMP_HOST_UNREACH; + IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); break; + case ENETUNREACH: - code = ICMP_NET_UNREACH; - break; - case EACCES: - code = ICMP_PKT_FILTERED; + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); break; + } + goto out; } - now = jiffies; - rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; - if (rt->u.dst.rate_tokens > ip_rt_error_burst) - rt->u.dst.rate_tokens = ip_rt_error_burst; - rt->u.dst.rate_last = now; - if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { - rt->u.dst.rate_tokens -= ip_rt_error_cost; - icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + switch (rt->dst.error) { + case EINVAL: + default: + goto out; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; + } + + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if 
(peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; + inet_putpeer(peer); } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb(skb); return 0; -} +} -/* - * The last two values are not from the RFC but - * are needed for AMPRnet AX.25 paths. - */ +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +{ + struct dst_entry *dst = &rt->dst; + struct fib_result res; -static unsigned short mtu_plateau[] = -{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + if (dst_metric_locked(dst, RTAX_MTU)) + return; -static __inline__ unsigned short guess_mtu(unsigned short old_mtu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) - if (old_mtu > mtu_plateau[i]) - return mtu_plateau[i]; - return 68; + if (dst->dev->mtu < mtu) + return; + + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + + if (rt->rt_pmtu == mtu && + time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) + return; + + rcu_read_lock(); + if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); + + update_or_create_fnhe(nh, fl4->daddr, 0, mtu, + jiffies + ip_rt_mtu_expires); + } + rcu_read_unlock(); } -unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { - int i; - unsigned short old_mtu = ntohs(iph->tot_len); - struct rtable *rth; - u32 skeys[2] = { iph->saddr, 0, }; - u32 daddr = iph->daddr; - u8 tos = iph->tos & IPTOS_RT_MASK; - unsigned short est_mtu = 0; + struct rtable *rt = (struct rtable *) dst; + struct flowi4 fl4; - if (ipv4_config.no_pmtu_disc) - return 0; + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_rt_update_pmtu(rt, &fl4, mtu); +} - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash_code(daddr, skeys[i], tos); +void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, + int oif, u32 mark, u8 protocol, int flow_flags) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.rt_next)) { - if (rth->fl.fl4_dst == daddr && - rth->fl.fl4_src == skeys[i] && - rth->rt_dst == daddr && - rth->rt_src == iph->saddr && - rth->fl.fl4_tos == tos && - rth->fl.iif == 0 && - !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { - unsigned short mtu = new_mtu; - - if (new_mtu < 68 || new_mtu >= old_mtu) { - - /* BSD 4.2 compatibility hack :-( */ - if (mtu == 0 && - old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; - - mtu = guess_mtu(old_mtu); - } - if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) { - if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { - dst_confirm(&rth->u.dst); - if (mtu < ip_rt_min_pmtu) { - mtu = ip_rt_min_pmtu; - rth->u.dst.metrics[RTAX_LOCK-1] |= - (1 << RTAX_MTU); - } - rth->u.dst.metrics[RTAX_MTU-1] = mtu; - dst_set_expires(&rth->u.dst, - ip_rt_mtu_expires); - } - est_mtu = mtu; - } - } - } - rcu_read_unlock(); + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } - return est_mtu ? 
: new_mtu; } +EXPORT_SYMBOL_GPL(ipv4_update_pmtu); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 && - !(dst_metric_locked(dst, RTAX_MTU))) { - if (mtu < ip_rt_min_pmtu) { - mtu = ip_rt_min_pmtu; - dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); - } - dst->metrics[RTAX_MTU-1] = mtu; - dst_set_expires(dst, ip_rt_mtu_expires); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } } -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - return NULL; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + struct dst_entry *odst = NULL; + bool new = false; + + bh_lock_sock(sk); + + if (!ip_sk_accept_pmtu(sk)) + goto out; + + odst = sk_dst_get(sk); + + if (sock_owned_by_user(sk) || !odst) { + __ipv4_sk_update_pmtu(skb, sk, mtu); + goto out; + } + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; + + new = true; + } + + __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + + if (!dst_check(&rt->dst, 0)) { + if (new) + dst_release(&rt->dst); + + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; + + new = true; + } + + if (new) + sk_dst_set(sk, &rt->dst); + +out: + bh_unlock_sock(sk); + dst_release(odst); } +EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -static void ipv4_dst_destroy(struct dst_entry *dst) +void ipv4_redirect(struct sk_buff *skb, struct net *net, + int oif, u32 mark, u8 protocol, int flow_flags) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer = rt->peer; - struct in_device *idev = rt->idev; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (peer) { - rt->peer = NULL; - inet_putpeer(peer); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); } +} +EXPORT_SYMBOL_GPL(ipv4_redirect); - if (idev) { - rt->idev = NULL; - in_dev_put(idev); +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); } } +EXPORT_SYMBOL_GPL(ipv4_sk_redirect); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; - struct in_device *idev = rt->idev; - if (dev != &loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(&loopback_dev); - if (loopback_idev) { - rt->idev = loopback_idev; - 
in_dev_put(idev); - } - } + + /* All IPV4 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + * + * When a PMTU/redirect information update invalidates a route, + * this is indicated by setting obsolete to DST_OBSOLETE_KILL or + * DST_OBSOLETE_DEAD by dst_free(). + */ + if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) + return NULL; + return dst; } static void ipv4_link_failure(struct sk_buff *skb) @@ -1498,17 +1139,18 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - rt = (struct rtable *) skb->dst; + rt = skb_rtable(skb); if (rt) - dst_set_expires(&rt->u.dst, 0); + dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { - printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", - NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), - skb->dev ? skb->dev->name : "?"); + pr_debug("%s: %pI4 -> %pI4, %s\n", + __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + skb->dev ? skb->dev->name : "?"); kfree_skb(skb); + WARN_ON(1); return 0; } @@ -1521,161 +1163,345 @@ static int ip_rt_bug(struct sk_buff *skb) in IP options! */ -void ip_rt_get_source(u8 *addr, struct rtable *rt) +void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { - u32 src; - struct fib_result res; + __be32 src; - if (rt->fl.iif == 0) - src = rt->rt_src; - else if (fib_lookup(&rt->fl, &res) == 0) { - src = FIB_RES_PREFSRC(res); - fib_res_put(&res); - } else - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, - RT_SCOPE_UNIVERSE); + if (rt_is_output_route(rt)) + src = ip_hdr(skb)->saddr; + else { + struct fib_result res; + struct flowi4 fl4; + struct iphdr *iph; + + iph = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = rt->dst.dev->ifindex; + fl4.flowi4_iif = skb->dev->ifindex; + fl4.flowi4_mark = skb->mark; + + rcu_read_lock(); + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); + else + src = inet_select_addr(rt->dst.dev, + rt_nexthop(rt, iph->daddr), + RT_SCOPE_UNIVERSE); + rcu_read_unlock(); + } memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { - if (!(rt->u.dst.tclassid & 0xFFFF)) - rt->u.dst.tclassid |= tag & 0xFFFF; - if (!(rt->u.dst.tclassid & 0xFFFF0000)) - rt->u.dst.tclassid |= tag & 0xFFFF0000; + if (!(rt->dst.tclassid & 0xFFFF)) + rt->dst.tclassid |= tag & 0xFFFF; + if (!(rt->dst.tclassid & 0xFFFF0000)) + rt->dst.tclassid |= tag & 0xFFFF0000; } #endif -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct fib_info *fi = res->fi; + unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + + if (advmss == 0) { + advmss = max_t(unsigned int, dst->dev->mtu - 40, + ip_rt_min_advmss); + if (advmss > 65535 - 40) + advmss = 65535 - 40; + } + return advmss; +} + +static unsigned int ipv4_mtu(const struct dst_entry *dst) +{ + const struct rtable *rt = (const struct rtable *) dst; + unsigned int mtu = rt->rt_pmtu; + + if (!mtu || time_after_eq(jiffies, rt->dst.expires)) + mtu = dst_metric_raw(dst, RTAX_MTU); + + if (mtu) + return mtu; + + mtu = dst->dev->mtu; + + if 
(unlikely(dst_metric_locked(dst, RTAX_MTU))) { + if (rt->rt_uses_gateway && mtu > 576) + mtu = 576; + } + + return min_t(unsigned int, mtu, IP_MAX_MTU); +} + +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + u32 hval; + + if (!hash) + return NULL; + + hval = fnhe_hashfun(daddr); + + for (fnhe = rcu_dereference(hash[hval].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + return fnhe; + } + return NULL; +} + +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, + __be32 daddr) +{ + bool ret = false; + + spin_lock_bh(&fnhe_lock); + + if (daddr == fnhe->fnhe_daddr) { + struct rtable __rcu **porig; + struct rtable *orig; + int genid = fnhe_genid(dev_net(rt->dst.dev)); + + if (rt_is_input_route(rt)) + porig = &fnhe->fnhe_rth_input; + else + porig = &fnhe->fnhe_rth_output; + orig = rcu_dereference(*porig); + + if (fnhe->fnhe_genid != genid) { + fnhe->fnhe_genid = genid; + fnhe->fnhe_gw = 0; + fnhe->fnhe_pmtu = 0; + fnhe->fnhe_expires = 0; + fnhe_flush_routes(fnhe); + orig = NULL; + } + fill_route_from_fnhe(rt, fnhe); + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + + if (!(rt->dst.flags & DST_NOCACHE)) { + rcu_assign_pointer(*porig, rt); + if (orig) + rt_free(orig); + ret = true; + } + + fnhe->fnhe_stamp = jiffies; + } + spin_unlock_bh(&fnhe_lock); + + return ret; +} + +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ + struct rtable *orig, *prev, **p; + bool ret = true; + + if (rt_is_input_route(rt)) { + p = (struct rtable **)&nh->nh_rth_input; + } else { + p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + } + orig = *p; + + prev = cmpxchg(p, orig, rt); + if (prev == orig) { + if (orig) + rt_free(orig); + } else + ret = false; + + return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + spin_lock_bh(&rt_uncached_lock); + list_add_tail(&rt->rt_uncached, &rt_uncached_list); + spin_unlock_bh(&rt_uncached_lock); +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + + if (!list_empty(&rt->rt_uncached)) { + spin_lock_bh(&rt_uncached_lock); + list_del(&rt->rt_uncached); + spin_unlock_bh(&rt_uncached_lock); + } +} + +void rt_flush_dev(struct net_device *dev) +{ + if (!list_empty(&rt_uncached_list)) { + struct net *net = dev_net(dev); + struct rtable *rt; + + spin_lock_bh(&rt_uncached_lock); + list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { + if (rt->dst.dev != dev) + continue; + rt->dst.dev = net->loopback_dev; + dev_hold(rt->dst.dev); + dev_put(dev); + } + spin_unlock_bh(&rt_uncached_lock); + } +} + +static bool rt_cache_valid(const struct rtable *rt) +{ + return rt && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + !rt_is_expired(rt); +} + +static void rt_set_nexthop(struct rtable *rt, __be32 daddr, + const struct fib_result *res, + struct fib_nh_exception *fnhe, + struct fib_info *fi, u16 type, u32 itag) +{ + bool cached = false; if (fi) { - if (FIB_RES_GW(*res) && - FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = FIB_RES_GW(*res); - memcpy(rt->u.dst.metrics, fi->fib_metrics, - sizeof(rt->u.dst.metrics)); - if (fi->fib_mtu == 0) { - rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && - rt->rt_gateway != rt->rt_dst && - rt->u.dst.dev->mtu > 
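rt_cache_route() above publishes a clone into a single per-nexthop slot (or a per-CPU slot on the output side) without taking a lock: cmpxchg() installs the new pointer only if the slot still holds the value that was just read, and a caller that loses the race simply leaves its route uncached. The same idea in isolation; the wrapper name is illustrative and rt_free() is the helper used elsewhere in this file:

    /* Sketch of the lock-free single-slot publish used by rt_cache_route(). */
    static bool publish_slot(struct rtable **slot, struct rtable *rt)
    {
            struct rtable *orig = *slot;
            struct rtable *prev = cmpxchg(slot, orig, rt);

            if (prev != orig)
                    return false;        /* lost the race; caller sets DST_NOCACHE */
            if (orig)
                    rt_free(orig);       /* drop the entry we displaced */
            return true;
    }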
576) - rt->u.dst.metrics[RTAX_MTU-1] = 576; + struct fib_nh *nh = &FIB_RES_NH(*res); + + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { + rt->rt_gateway = nh->nh_gw; + rt->rt_uses_gateway = 1; } -#ifdef CONFIG_NET_CLS_ROUTE - rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + dst_init_metrics(&rt->dst, fi->fib_metrics, true); +#ifdef CONFIG_IP_ROUTE_CLASSID + rt->dst.tclassid = nh->nh_tclassid; #endif + if (unlikely(fnhe)) + cached = rt_bind_exception(rt, fnhe, daddr); + else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_cache_route(nh, rt); + if (unlikely(!cached)) { + /* Routes we intend to cache in nexthop exception or + * FIB nexthop have the DST_NOCACHE bit clear. + * However, if we are unsuccessful at storing this + * route into the cache we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + rt_add_uncached_list(rt); + } } else - rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - - if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) - rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; - if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) - rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, - ip_rt_min_advmss); - if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) - rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; - -#ifdef CONFIG_NET_CLS_ROUTE + rt_add_uncached_list(rt); + +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES - set_class_tag(rt, fib_rules_tclass(res)); + set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif - rt->rt_type = res->type; } -static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, +static struct rtable *rt_dst_alloc(struct net_device *dev, + bool nopolicy, bool noxfrm, bool will_cache) +{ + return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned hash; struct rtable *rth; - u32 spec_dst; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; + int err; /* Primary sanity checks. 
*/ if (in_dev == NULL) return -EINVAL; - if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ZERONET(saddr)) { - if (!LOCAL_MCAST(daddr)) + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(saddr)) goto e_inval; - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag) < 0) - goto e_inval; - rth = dst_alloc(&ipv4_dst_ops); + if (ipv4_is_zeronet(saddr)) { + if (!ipv4_is_local_multicast(daddr)) + goto e_inval; + } else { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); + if (err < 0) + goto e_err; + } + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (in_dev->cnf.no_policy) - rth->u.dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; -#ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= skb->nfmark; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; #endif - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; -#endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_type = RTN_MULTICAST; + rth->dst.output = ip_rt_bug; + + rth->rt_genid = rt_genid_ipv4(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; + rth->rt_type = RTN_MULTICAST; + rth->rt_is_input= 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (our) { - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE - if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) - rth->u.dst.input = ip_mr_input; + if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); - in_dev_put(in_dev); - hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos); - return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); + skb_dst_set(skb, &rth->dst); + return 0; e_nobufs: - in_dev_put(in_dev); return -ENOBUFS; - e_inval: - in_dev_put(in_dev); return -EINVAL; +e_err: + return err; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, - u32 daddr, - u32 saddr) + __be32 daddr, + __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE @@ -1684,206 +1510,129 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. 
*/ - printk(KERN_WARNING "martian source %u.%u.%u.%u from " - "%u.%u.%u.%u, on dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); - if (dev->hard_header_len && skb->mac.raw) { - int i; - unsigned char *p = skb->mac.raw; - printk(KERN_WARNING "ll header: "); - for (i = 0; i < dev->hard_header_len; i++, p++) { - printk("%02x", *p); - if (i < (dev->hard_header_len - 1)) - printk(":"); - } - printk("\n"); + pr_warn("martian source %pI4 from %pI4, on dev %s\n", + &daddr, &saddr, dev->name); + if (dev->hard_header_len && skb_mac_header_was_set(skb)) { + print_hex_dump(KERN_WARNING, "ll header: ", + DUMP_PREFIX_OFFSET, 16, 1, + skb_mac_header(skb), + dev->hard_header_len, true); } } #endif } -static inline int __mkroute_input(struct sk_buff *skb, - struct fib_result* res, - struct in_device *in_dev, - u32 daddr, u32 saddr, u32 tos, - struct rtable **result) +/* called in rcu_read_lock() section */ +static int __mkroute_input(struct sk_buff *skb, + const struct fib_result *res, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos) { - + struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; - unsigned flags = 0; - u32 spec_dst, itag; + unsigned int flags = 0; + bool do_cache; + u32 itag = 0; /* get a working reference to the output device */ - out_dev = in_dev_get(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); if (out_dev == NULL) { - if (net_ratelimit()) - printk(KERN_CRIT "Bug in ip_route_input" \ - "_slow(). Please, report\n"); + net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return -EINVAL; } - - err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, in_dev, &itag); if (err < 0) { - ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, + ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); - - err = -EINVAL; + goto cleanup; } - if (err) - flags |= RTCF_DIRECTSRC; - - if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && + do_cache = res->fi && !itag; + if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { flags |= RTCF_DOREDIRECT; + do_cache = false; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. 
*/ - if (out_dev == in_dev && !(flags & RTCF_DNAT)) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } } + fnhe = find_exception(&FIB_RES_NH(*res), daddr); + if (do_cache) { + if (fnhe != NULL) + rth = rcu_dereference(fnhe->fnhe_rth_input); + else + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + goto out; + } + } - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(out_dev->dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { err = -ENOBUFS; goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - if (res->fi->fib_nhs > 1) - rth->u.dst.flags |= DST_BALANCED; -#endif - if (in_dev->cnf.no_policy) - rth->u.dst.flags |= DST_NOPOLICY; - if (in_dev->cnf.no_xfrm) - rth->u.dst.flags |= DST_NOXFRM; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; -#ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= skb->nfmark; -#endif - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; - rth->rt_gateway = daddr; - rth->rt_iif = - rth->fl.iif = in_dev->dev->ifindex; - rth->u.dst.dev = (out_dev)->dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; - rth->rt_spec_dst= spec_dst; - - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - - rt_set_nexthop(rth, res, itag); - + rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); rth->rt_flags = flags; + rth->rt_type = res->type; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); - *result = rth; + rth->dst.input = ip_forward; + rth->dst.output = ip_output; + + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + skb_dst_set(skb, &rth->dst); +out: err = 0; cleanup: - /* release the working reference to the output device */ - in_dev_put(out_dev); return err; -} +} -static inline int ip_mkroute_input_def(struct sk_buff *skb, - struct fib_result* res, - const struct flowi *fl, - struct in_device *in_dev, - u32 daddr, u32 saddr, u32 tos) +static int ip_mkroute_input(struct sk_buff *skb, + struct fib_result *res, + const struct flowi4 *fl4, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos) { - struct rtable* rth = NULL; - int err; - unsigned hash; - #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) - fib_select_multipath(fl, res); + if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(res); #endif /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); - if (err) - return err; - - /* put it into the cache */ - hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); - return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); -} - -static inline int ip_mkroute_input(struct sk_buff *skb, - struct fib_result* res, - const struct flowi *fl, - struct in_device *in_dev, - u32 daddr, u32 saddr, u32 tos) -{ -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - struct rtable* rth = NULL, *rtres; - unsigned char hop, hopcount; - int err = -EINVAL; - unsigned int hash; - - if (res->fi) - hopcount = res->fi->fib_nhs; - else - hopcount = 1; - - /* distinguish between multipath and singlepath */ - if (hopcount < 2) - return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, - saddr, tos); - - /* add all alternatives to the 
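Note the cache-hit path in __mkroute_input() above: a valid per-nexthop clone is attached with skb_dst_set_noref(), which stores the pointer without taking a reference and is therefore safe only because the whole input lookup runs under rcu_read_lock(). A small sketch of the distinction; everything except the two skb_dst_set helpers is illustrative:

    /* Sketch: attaching a dst with and without taking a reference. */
    static void attach_dst(struct sk_buff *skb, struct rtable *cached,
                           struct rtable *newly_allocated)
    {
            if (cached)
                    /* no refcount taken; valid only inside rcu_read_lock() */
                    skb_dst_set_noref(skb, &cached->dst);
            else
                    /* hand the reference returned by rt_dst_alloc() to the skb */
                    skb_dst_set(skb, &newly_allocated->dst);
    }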
routing cache */ - for (hop = 0; hop < hopcount; hop++) { - res->nh_sel = hop; - - /* put reference to previous result */ - if (hop) - ip_rt_put(rtres); - - /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, - &rth); - if (err) - return err; - - /* put it into the cache */ - hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); - err = rt_intern_hash(hash, rth, &rtres); - if (err) - return err; - - /* forward hop information to multipath impl. */ - multipath_set_nhinfo(rth, - FIB_RES_NETWORK(*res), - FIB_RES_NETMASK(*res), - res->prefixlen, - &FIB_RES_NH(*res)); - } - skb->dst = &rtres->u.dst; - return err; -#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos); -#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } - /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet @@ -1892,30 +1641,21 @@ static inline int ip_mkroute_input(struct sk_buff *skb, * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. + * called with rcu_read_lock() */ -static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, +static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; - struct in_device *in_dev = in_dev_get(dev); - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = saddr, - .tos = tos, - .scope = RT_SCOPE_UNIVERSE, -#ifdef CONFIG_IP_ROUTE_FWMARK - .fwmark = skb->nfmark -#endif - } }, - .iif = dev->ifindex }; - unsigned flags = 0; + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct flowi4 fl4; + unsigned int flags = 0; u32 itag = 0; - struct rtable * rth; - unsigned hash; - u32 spec_dst; + struct rtable *rth; int err = -EINVAL; - int free_res = 0; + struct net *net = dev_net(dev); + bool do_cache; /* IP on this device is disabled. */ @@ -1926,129 +1666,140 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, by fib_lookup. */ - if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; - if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) + res.fi = NULL; + if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ - if (ZERONET(saddr)) + if (ipv4_is_zeronet(saddr)) goto martian_source; - if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) + if (ipv4_is_zeronet(daddr)) goto martian_destination; + /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), + * and call it once if daddr or/and saddr are loopback addresses + */ + if (ipv4_is_loopback(daddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_destination; + } else if (ipv4_is_loopback(saddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + } + /* * Now we are ready to route packet. 
*/ - if ((err = fib_lookup(&fl, &res)) != 0) { + fl4.flowi4_oif = 0; + fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.daddr = daddr; + fl4.saddr = saddr; + err = fib_lookup(net, &fl4, &res); + if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + err = -EHOSTUNREACH; goto no_route; } - free_res = 1; - - RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_BROADCAST) goto brd_input; if (res.type == RTN_LOCAL) { - int result; - result = fib_validate_source(saddr, daddr, tos, - loopback_dev.ifindex, - dev, &spec_dst, &itag); - if (result < 0) - goto martian_source; - if (result) - flags |= RTCF_DIRECTSRC; - spec_dst = daddr; + err = fib_validate_source(skb, saddr, daddr, tos, + 0, dev, in_dev, &itag); + if (err < 0) + goto martian_source_keep_err; goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; + goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); - if (err == -ENOBUFS) - goto e_nobufs; - if (err == -EINVAL) - goto e_inval; - -done: - in_dev_put(in_dev); - if (free_res) - fib_res_put(&res); + err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); out: return err; brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ZERONET(saddr)) - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag); + if (!ipv4_is_zeronet(saddr)) { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) - goto martian_source; - if (err) - flags |= RTCF_DIRECTSRC; + goto martian_source_keep_err; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: - rth = dst_alloc(&ipv4_dst_ops); + do_cache = false; + if (res.fi) { + if (!itag) { + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; + } + do_cache = true; + } + } + + rth = rt_dst_alloc(net->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (in_dev->cnf.no_policy) - rth->u.dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; -#ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= skb->nfmark; -#endif - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.input= ip_local_deliver; + rth->dst.output= ip_rt_bug; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->u.dst.input= ip_local_deliver; + + rth->rt_genid = rt_genid_ipv4(net); rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; + rth->dst.input= ip_error; + rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - rth->rt_type = 
res.type; - hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos); - err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); - goto done; + if (do_cache) { + if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { + rth->dst.flags |= DST_NOCACHE; + rt_add_uncached_list(rth); + } + } + skb_dst_set(skb, &rth->dst); + err = 0; + goto out; no_route: RT_CACHE_STAT_INC(in_no_route); - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; + if (err == -ESRCH) + err = -ENETUNREACH; goto local_input; /* @@ -2057,61 +1808,32 @@ no_route: martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_WARNING "martian destination %u.%u.%u.%u from " - "%u.%u.%u.%u, dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + if (IN_DEV_LOG_MARTIANS(in_dev)) + net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", + &daddr, &saddr, dev->name); #endif -e_hostunreach: - err = -EHOSTUNREACH; - goto done; - e_inval: err = -EINVAL; - goto done; + goto out; e_nobufs: err = -ENOBUFS; - goto done; + goto out; martian_source: + err = -EINVAL; +martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto e_inval; + goto out; } -int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, - u8 tos, struct net_device *dev) +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { - struct rtable * rth; - unsigned hash; - int iif = dev->ifindex; - - tos &= IPTOS_RT_MASK; - hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); + int res; rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.rt_next)) { - if (rth->fl.fl4_dst == daddr && - rth->fl.fl4_src == saddr && - rth->fl.iif == iif && - rth->fl.oif == 0 && -#ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark == skb->nfmark && -#endif - rth->fl.fl4_tos == tos) { - rth->u.dst.lastuse = jiffies; - dst_hold(&rth->u.dst); - rth->u.dst.__use++; - RT_CACHE_STAT_INC(in_hit); - rcu_read_unlock(); - skb->dst = (struct dst_entry*)rth; - return 0; - } - RT_CACHE_STAT_INC(in_hlist_search); - } - rcu_read_unlock(); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing @@ -2124,296 +1846,204 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, Note, that multicast routers are not affected, because route cache entry is created eventually. 
*/ - if (MULTICAST(daddr)) { - struct in_device *in_dev; + if (ipv4_is_multicast(daddr)) { + struct in_device *in_dev = __in_dev_get_rcu(dev); - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { - int our = ip_check_mc(in_dev, daddr, saddr, - skb->nh.iph->protocol); + if (in_dev) { + int our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { + ) { + int res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); rcu_read_unlock(); - return ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); + return res; } } rcu_read_unlock(); return -EINVAL; } - return ip_route_input_slow(skb, daddr, saddr, tos, dev); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + rcu_read_unlock(); + return res; } +EXPORT_SYMBOL(ip_route_input_noref); -static inline int __mkroute_output(struct rtable **result, - struct fib_result* res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +/* called with rcu_read_lock() */ +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi4 *fl4, int orig_oif, + struct net_device *dev_out, + unsigned int flags) { - struct rtable *rth; + struct fib_info *fi = res->fi; + struct fib_nh_exception *fnhe; struct in_device *in_dev; - u32 tos = RT_FL_TOS(oldflp); - int err = 0; + u16 type = res->type; + struct rtable *rth; + bool do_cache; - if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) - return -EINVAL; + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) + return ERR_PTR(-EINVAL); - if (fl->fl4_dst == 0xFFFFFFFF) - res->type = RTN_BROADCAST; - else if (MULTICAST(fl->fl4_dst)) - res->type = RTN_MULTICAST; - else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) - return -EINVAL; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); + + if (ipv4_is_lbcast(fl4->daddr)) + type = RTN_BROADCAST; + else if (ipv4_is_multicast(fl4->daddr)) + type = RTN_MULTICAST; + else if (ipv4_is_zeronet(fl4->daddr)) + return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - /* get work reference to inet device */ - in_dev = in_dev_get(dev_out); - if (!in_dev) - return -EINVAL; - - if (res->type == RTN_BROADCAST) { + do_cache = true; + if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - if (res->fi) { - fib_info_put(res->fi); - res->fi = NULL; - } - } else if (res->type == RTN_MULTICAST) { - flags |= RTCF_MULTICAST|RTCF_LOCAL; - if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, - oldflp->proto)) + fi = NULL; + } else if (type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST | RTCF_LOCAL; + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, + fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; + else + do_cache = false; /* If multicast route do not exist use - default one, but do not gateway in this case. - Yes, it is hack. + * default one, but do not gateway in this case. + * Yes, it is hack. 
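ip_route_input_noref() is the per-packet entry point for the receive path and leaves the resolved dst on the skb without an extra reference. A rough sketch of a caller, loosely modelled on the ip_rcv_finish() pattern; the function name and error handling are illustrative:

    #include <linux/ip.h>
    #include <net/dst.h>
    #include <net/route.h>

    static int route_incoming_packet(struct sk_buff *skb)
    {
            const struct iphdr *iph = ip_hdr(skb);
            int err;

            /* On success skb_dst(skb) points at the input route. */
            err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                       iph->tos, skb->dev);
            if (err) {
                    kfree_skb(skb);
                    return err;
            }
            return dst_input(skb);        /* ip_local_deliver, ip_forward, ... */
    }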
*/ - if (res->fi && res->prefixlen < 4) { - fib_info_put(res->fi); - res->fi = NULL; + if (fi && res->prefixlen < 4) + fi = NULL; + } + + fnhe = NULL; + do_cache &= fi != NULL; + if (do_cache) { + struct rtable __rcu **prth; + struct fib_nh *nh = &FIB_RES_NH(*res); + + fnhe = find_exception(nh, fl4->daddr); + if (fnhe) + prth = &fnhe->fnhe_rth_output; + else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + } + rth = rcu_dereference(*prth); + if (rt_cache_valid(rth)) { + dst_hold(&rth->dst); + return rth; } } +add: + rth = rt_dst_alloc(dev_out, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM), + do_cache); + if (!rth) + return ERR_PTR(-ENOBUFS); - rth = dst_alloc(&ipv4_dst_ops); - if (!rth) { - err = -ENOBUFS; - goto cleanup; - } + rth->dst.output = ip_output; - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - if (res->fi) { - rth->rt_multipath_alg = res->fi->fib_mp_alg; - if (res->fi->fib_nhs > 1) - rth->u.dst.flags |= DST_BALANCED; - } -#endif - if (in_dev->cnf.no_xfrm) - rth->u.dst.flags |= DST_NOXFRM; - if (in_dev->cnf.no_policy) - rth->u.dst.flags |= DST_NOPOLICY; - - rth->fl.fl4_dst = oldflp->fl4_dst; - rth->fl.fl4_tos = tos; - rth->fl.fl4_src = oldflp->fl4_src; - rth->fl.oif = oldflp->oif; -#ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark= oldflp->fl4_fwmark; -#endif - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; - rth->rt_iif = oldflp->oif ? : dev_out->ifindex; - /* get references to the devices that are to be hold by the routing - cache entry */ - rth->u.dst.dev = dev_out; - dev_hold(dev_out); - rth->idev = in_dev_get(dev_out); - rth->rt_gateway = fl->fl4_dst; - rth->rt_spec_dst= fl->fl4_src; - - rth->u.dst.output=ip_output; + rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); + rth->rt_flags = flags; + rth->rt_type = type; + rth->rt_is_input = 0; + rth->rt_iif = orig_oif ? 
: 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) { - rth->u.dst.input = ip_local_deliver; - rth->rt_spec_dst = fl->fl4_dst; - } + if (flags & RTCF_LOCAL) + rth->dst.input = ip_local_deliver; if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl->fl4_src; - if (flags & RTCF_LOCAL && + if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { - rth->u.dst.output = ip_mc_output; + rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE - if (res->type == RTN_MULTICAST) { + if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !LOCAL_MCAST(oldflp->fl4_dst)) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_mc_output; + !ipv4_is_local_multicast(fl4->daddr)) { + rth->dst.input = ip_mr_input; + rth->dst.output = ip_mc_output; } } #endif } - rt_set_nexthop(rth, res, 0); - - rth->rt_flags = flags; - - *result = rth; - cleanup: - /* release work reference to inet device */ - in_dev_put(in_dev); - - return err; -} - -static inline int ip_mkroute_output_def(struct rtable **rp, - struct fib_result* res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ - struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); - unsigned hash; - if (err == 0) { - u32 tos = RT_FL_TOS(oldflp); - - hash = rt_hash_code(oldflp->fl4_dst, - oldflp->fl4_src ^ (oldflp->oif << 5), tos); - err = rt_intern_hash(hash, rth, rp); - } - - return err; -} - -static inline int ip_mkroute_output(struct rtable** rp, - struct fib_result* res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - u32 tos = RT_FL_TOS(oldflp); - unsigned char hop; - unsigned hash; - int err = -EINVAL; - struct rtable *rth = NULL; - - if (res->fi && res->fi->fib_nhs > 1) { - unsigned char hopcount = res->fi->fib_nhs; - - for (hop = 0; hop < hopcount; hop++) { - struct net_device *dev2nexthop; - - res->nh_sel = hop; - - /* hold a work reference to the output device */ - dev2nexthop = FIB_RES_DEV(*res); - dev_hold(dev2nexthop); - - /* put reference to previous result */ - if (hop) - ip_rt_put(*rp); - - err = __mkroute_output(&rth, res, fl, oldflp, - dev2nexthop, flags); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (err != 0) - goto cleanup; - - hash = rt_hash_code(oldflp->fl4_dst, - oldflp->fl4_src ^ - (oldflp->oif << 5), tos); - err = rt_intern_hash(hash, rth, rp); - - /* forward hop information to multipath impl. */ - multipath_set_nhinfo(rth, - FIB_RES_NETWORK(*res), - FIB_RES_NETMASK(*res), - res->prefixlen, - &FIB_RES_NH(*res)); - cleanup: - /* release work reference to output device */ - dev_put(dev2nexthop); - - if (err != 0) - return err; - } - return err; - } else { - return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, - flags); - } -#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); -#endif + return rth; } /* * Major route resolver routine. */ -static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) -{ - u32 tos = RT_FL_TOS(oldflp); - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = oldflp->fl4_dst, - .saddr = oldflp->fl4_src, - .tos = tos & IPTOS_RT_MASK, - .scope = ((tos & RTO_ONLINK) ? 
- RT_SCOPE_LINK : - RT_SCOPE_UNIVERSE), -#ifdef CONFIG_IP_ROUTE_FWMARK - .fwmark = oldflp->fl4_fwmark -#endif - } }, - .iif = loopback_dev.ifindex, - .oif = oldflp->oif }; - struct fib_result res; - unsigned flags = 0; +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +{ struct net_device *dev_out = NULL; - int free_res = 0; - int err; - + __u8 tos = RT_FL_TOS(fl4); + unsigned int flags = 0; + struct fib_result res; + struct rtable *rth; + int orig_oif; + res.tclassid = 0; res.fi = NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif + res.table = NULL; - if (oldflp->fl4_src) { - err = -EINVAL; - if (MULTICAST(oldflp->fl4_src) || - BADCLASS(oldflp->fl4_src) || - ZERONET(oldflp->fl4_src)) - goto out; + orig_oif = fl4->flowi4_oif; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldflp->fl4_src); - if (dev_out == NULL) + fl4->flowi4_iif = LOOPBACK_IFINDEX; + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (fl4->saddr) { + rth = ERR_PTR(-EINVAL); + if (ipv4_is_multicast(fl4->saddr) || + ipv4_is_lbcast(fl4->saddr) || + ipv4_is_zeronet(fl4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: - 1. ip_dev_find(saddr) can return wrong iface, if saddr is - assigned to multiple interfaces. + 1. ip_dev_find(net, saddr) can return wrong iface, if saddr + is assigned to multiple interfaces. 2. Moreover, we are allowed to send packets with saddr of another iface. --ANK */ - if (oldflp->oif == 0 - && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) { + if (fl4->flowi4_oif == 0 && + (ipv4_is_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr))) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = __ip_dev_find(net, fl4->saddr, false); + if (dev_out == NULL) + goto out; + /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. @@ -2429,60 +2059,61 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) Luckily, this hack is good workaround. */ - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; goto make_route; } - if (dev_out) - dev_put(dev_out); - dev_out = NULL; + + if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + if (!__ip_dev_find(net, fl4->saddr, false)) + goto out; + } } - if (oldflp->oif) { - dev_out = dev_get_by_index(oldflp->oif); - err = -ENODEV; + if (fl4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); + rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. 
*/ - if (__in_dev_get_rtnl(dev_out) == NULL) { - dev_put(dev_out); - goto out; /* Wrong error code */ + if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { + rth = ERR_PTR(-ENETUNREACH); + goto out; } - - if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) { - if (!fl.fl4_src) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (ipv4_is_local_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr)) { + if (!fl4->saddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } - if (!fl.fl4_src) { - if (MULTICAST(oldflp->fl4_dst)) - fl.fl4_src = inet_select_addr(dev_out, 0, - fl.fl4_scope); - else if (!oldflp->fl4_dst) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (!fl4->saddr) { + if (ipv4_is_multicast(fl4->daddr)) + fl4->saddr = inet_select_addr(dev_out, 0, + fl4->flowi4_scope); + else if (!fl4->daddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } - if (!fl.fl4_dst) { - fl.fl4_dst = fl.fl4_src; - if (!fl.fl4_dst) - fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); - if (dev_out) - dev_put(dev_out); - dev_out = &loopback_dev; - dev_hold(dev_out); - fl.oif = loopback_dev.ifindex; + if (!fl4->daddr) { + fl4->daddr = fl4->saddr; + if (!fl4->daddr) + fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); + dev_out = net->loopback_dev; + fl4->flowi4_oif = LOOPBACK_IFINDEX; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(&fl, &res)) { + if (fib_lookup(net, fl4, &res)) { res.fi = NULL; - if (oldflp->oif) { + res.table = NULL; + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2501,713 +2132,646 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) likely IPv6, but we do not. */ - if (fl.fl4_src == 0) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (fl4->saddr == 0) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } - if (dev_out) - dev_put(dev_out); - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } - free_res = 1; if (res.type == RTN_LOCAL) { - if (!fl.fl4_src) - fl.fl4_src = fl.fl4_dst; - if (dev_out) - dev_put(dev_out); - dev_out = &loopback_dev; - dev_hold(dev_out); - fl.oif = dev_out->ifindex; - if (res.fi) - fib_info_put(res.fi); - res.fi = NULL; + if (!fl4->saddr) { + if (res.fi->fib_prefsrc) + fl4->saddr = res.fi->fib_prefsrc; + else + fl4->saddr = fl4->daddr; + } + dev_out = net->loopback_dev; + fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl.oif == 0) - fib_select_multipath(&fl, &res); + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) + fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(&fl, &res); + if (!res.prefixlen && + res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(&res); - if (!fl.fl4_src) - fl.fl4_src = FIB_RES_PREFSRC(res); + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, res); - if (dev_out) - dev_put(dev_out); dev_out = FIB_RES_DEV(res); - dev_hold(dev_out); - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); +out: + rcu_read_unlock(); + return rth; +} +EXPORT_SYMBOL_GPL(__ip_route_output_key); - if (free_res) - fib_res_put(&res); - if (dev_out) - 
dev_put(dev_out); -out: return err; +static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; } -int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) +static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) { - unsigned hash; - struct rtable *rth; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); - - rcu_read_lock_bh(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.rt_next)) { - if (rth->fl.fl4_dst == flp->fl4_dst && - rth->fl.fl4_src == flp->fl4_src && - rth->fl.iif == 0 && - rth->fl.oif == flp->oif && -#ifdef CONFIG_IP_ROUTE_FWMARK - rth->fl.fl4_fwmark == flp->fl4_fwmark && -#endif - !((rth->fl.fl4_tos ^ flp->fl4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK))) { + return mtu ? : dst->dev->mtu; +} - /* check for multipath routes and choose one if - * necessary - */ - if (multipath_select_route(flp, rth, rp)) { - dst_hold(&(*rp)->u.dst); - RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); - return 0; - } +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ +} - rth->u.dst.lastuse = jiffies; - dst_hold(&rth->u.dst); - rth->u.dst.__use++; - RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); - *rp = rth; - return 0; - } - RT_CACHE_STAT_INC(out_hlist_search); - } - rcu_read_unlock_bh(); +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} - return ip_route_output_slow(rp, flp); +static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +static struct dst_ops ipv4_dst_blackhole_ops = { + .family = AF_INET, + .protocol = cpu_to_be16(ETH_P_IP), + .check = ipv4_blackhole_dst_check, + .mtu = ipv4_blackhole_mtu, + .default_advmss = ipv4_default_advmss, + .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .redirect = ipv4_rt_blackhole_redirect, + .cow_metrics = ipv4_rt_blackhole_cow_metrics, + .neigh_lookup = ipv4_neigh_lookup, +}; -int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - int err; + struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *rt; + + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + if (rt) { + struct dst_entry *new = &rt->dst; + + new->__use = 1; + new->input = dst_discard; + new->output = dst_discard_sk; - if ((err = __ip_route_output_key(rp, flp)) != 0) - return err; + new->dev = ort->dst.dev; + if (new->dev) + dev_hold(new->dev); - if (flp->proto) { - if (!flp->fl4_src) - flp->fl4_src = (*rp)->rt_src; - if (!flp->fl4_dst) - flp->fl4_dst = (*rp)->rt_dst; - return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); + rt->rt_is_input = ort->rt_is_input; + rt->rt_iif = ort->rt_iif; + rt->rt_pmtu = ort->rt_pmtu; + + rt->rt_genid = rt_genid_ipv4(net); + rt->rt_flags = ort->rt_flags; + rt->rt_type = ort->rt_type; + rt->rt_gateway = ort->rt_gateway; + rt->rt_uses_gateway = ort->rt_uses_gateway; + + INIT_LIST_HEAD(&rt->rt_uncached); + + dst_free(new); } - return 0; -} + dst_release(dst_orig); -EXPORT_SYMBOL_GPL(ip_route_output_flow); + return rt ? 
&rt->dst : ERR_PTR(-ENOMEM); +} -int ip_route_output_key(struct rtable **rp, struct flowi *flp) +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, + struct sock *sk) { - return ip_route_output_flow(rp, flp, NULL, 0); + struct rtable *rt = __ip_route_output_key(net, flp4); + + if (IS_ERR(rt)) + return rt; + + if (flp4->flowi4_proto) + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); + + return rt; } +EXPORT_SYMBOL_GPL(ip_route_output_flow); -static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - int nowait, unsigned int flags) +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, + struct flowi4 *fl4, struct sk_buff *skb, u32 portid, + u32 seq, int event, int nowait, unsigned int flags) { - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); struct rtmsg *r; - struct nlmsghdr *nlh; - unsigned char *b = skb->tail; - struct rta_cacheinfo ci; -#ifdef CONFIG_IP_MROUTE - struct rtattr *eptr; -#endif - nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); - r = NLMSG_DATA(nlh); + struct nlmsghdr *nlh; + unsigned long expires = 0; + u32 error; + u32 metrics[RTAX_MAX]; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); + if (nlh == NULL) + return -EMSGSIZE; + + r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->fl.fl4_tos; + r->rtm_tos = fl4->flowi4_tos; r->rtm_table = RT_TABLE_MAIN; + if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; - RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); - if (rt->fl.fl4_src) { + + if (nla_put_be32(skb, RTA_DST, dst)) + goto nla_put_failure; + if (src) { r->rtm_src_len = 32; - RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src); - } - if (rt->u.dst.dev) - RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE - if (rt->u.dst.tclassid) - RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); + if (nla_put_be32(skb, RTA_SRC, src)) + goto nla_put_failure; + } + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID + if (rt->dst.tclassid && + nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) + goto nla_put_failure; #endif -#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - if (rt->rt_multipath_alg != IP_MP_ALG_NONE) { - __u32 alg = rt->rt_multipath_alg; - - RTA_PUT(skb, RTA_MP_ALGO, 4, &alg); + if (!rt_is_input_route(rt) && + fl4->saddr != src) { + if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) + goto nla_put_failure; } -#endif - if (rt->fl.iif) - RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); - else if (rt->rt_src != rt->fl.fl4_src) - RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); - if (rt->rt_dst != rt->rt_gateway) - RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) - goto rtattr_failure; - ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); - ci.rta_used = rt->u.dst.__use; - ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); - if (rt->u.dst.expires) - ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies); - else - ci.rta_expires = 0; - ci.rta_error = rt->u.dst.error; - ci.rta_id = ci.rta_ts = ci.rta_tsage = 0; - if (rt->peer) { - ci.rta_id = rt->peer->ip_id_count; - if (rt->peer->tcp_ts_stamp) { - 
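With this conversion, output lookups fill a struct flowi4 and call ip_route_output_flow(), which returns the rtable directly or an ERR_PTR instead of writing through a pointer argument. A minimal sketch of a UDP-style sender doing the lookup; the wrapper name and field choices are illustrative:

    #include <linux/err.h>
    #include <linux/in.h>
    #include <linux/string.h>
    #include <net/flow.h>
    #include <net/route.h>

    static struct rtable *lookup_output_route(struct net *net, struct sock *sk,
                                              __be32 daddr, __be32 saddr)
    {
            struct flowi4 fl4;
            struct rtable *rt;

            memset(&fl4, 0, sizeof(fl4));
            fl4.daddr = daddr;
            fl4.saddr = saddr;              /* may be 0; the lookup picks a source */
            fl4.flowi4_proto = IPPROTO_UDP;

            rt = ip_route_output_flow(net, &fl4, sk);
            if (IS_ERR(rt))
                    return NULL;            /* e.g. -ENETUNREACH */
            return rt;                      /* caller must ip_rt_put() when done */
    }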
ci.rta_ts = rt->peer->tcp_ts; - ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; - } + if (rt->rt_uses_gateway && + nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) + goto nla_put_failure; + + expires = rt->dst.expires; + if (expires) { + unsigned long now = jiffies; + + if (time_before(now, expires)) + expires -= now; + else + expires = 0; } -#ifdef CONFIG_IP_MROUTE - eptr = (struct rtattr*)skb->tail; -#endif - RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); - if (rt->fl.iif) { -#ifdef CONFIG_IP_MROUTE - u32 dst = rt->rt_dst; - if (MULTICAST(dst) && !LOCAL_MCAST(dst) && - ipv4_devconf.mc_forwarding) { - int err = ipmr_get_route(skb, r, nowait); + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt_pmtu && expires) + metrics[RTAX_MTU - 1] = rt->rt_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) + goto nla_put_failure; + + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) + goto nla_put_failure; + + error = rt->dst.error; + + if (rt_is_input_route(rt)) { +#ifdef CONFIG_IP_MROUTE + if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, + fl4->saddr, fl4->daddr, + r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) return 0; - goto nlmsg_failure; + goto nla_put_failure; } else { if (err == -EMSGSIZE) - goto nlmsg_failure; - ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; + goto nla_put_failure; + error = err; } } } else #endif - RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) + goto nla_put_failure; } - nlh->nlmsg_len = skb->tail - b; - return skb->len; + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) + goto nla_put_failure; -nlmsg_failure: -rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; } -int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { - struct rtattr **rta = arg; - struct rtmsg *rtm = NLMSG_DATA(nlh); + struct net *net = sock_net(in_skb->sk); + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; - u32 dst = 0; - u32 src = 0; - int iif = 0; - int err = -ENOBUFS; + struct flowi4 fl4; + __be32 dst = 0; + __be32 src = 0; + u32 iif; + int err; + int mark; struct sk_buff *skb; + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); + if (err < 0) + goto errout; + + rtm = nlmsg_data(nlh); + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - goto out; + if (skb == NULL) { + err = -ENOBUFS; + goto errout; + } /* Reserve room for dummy headers, this skb can pass through good chunk of routing engine. */ - skb->mac.raw = skb->data; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + + /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ + ip_hdr(skb)->protocol = IPPROTO_ICMP; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); - if (rta[RTA_SRC - 1]) - memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4); - if (rta[RTA_DST - 1]) - memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4); - if (rta[RTA_IIF - 1]) - memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int)); + src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; + dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; + iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; + mark = tb[RTA_MARK] ? 
nla_get_u32(tb[RTA_MARK]) : 0; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst; + fl4.saddr = src; + fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; + fl4.flowi4_mark = mark; if (iif) { - struct net_device *dev = __dev_get_by_index(iif); - err = -ENODEV; - if (!dev) - goto out_free; + struct net_device *dev; + + dev = __dev_get_by_index(net, iif); + if (dev == NULL) { + err = -ENODEV; + goto errout_free; + } + skb->protocol = htons(ETH_P_IP); skb->dev = dev; + skb->mark = mark; local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); - rt = (struct rtable*)skb->dst; - if (!err && rt->u.dst.error) - err = -rt->u.dst.error; + + rt = skb_rtable(skb); + if (err == 0 && rt->dst.error) + err = -rt->dst.error; } else { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst, - .saddr = src, - .tos = rtm->rtm_tos } } }; - int oif = 0; - if (rta[RTA_OIF - 1]) - memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); - fl.oif = oif; - err = ip_route_output_key(&rt, &fl); + rt = ip_route_output_key(net, &fl4); + + err = 0; + if (IS_ERR(rt)) + err = PTR_ERR(rt); } + if (err) - goto out_free; + goto errout_free; - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; - - err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, - RTM_NEWROUTE, 0, 0); - if (!err) - goto out_free; - if (err < 0) { - err = -EMSGSIZE; - goto out_free; - } + err = rt_fill_info(net, dst, src, &fl4, skb, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + RTM_NEWROUTE, 0, 0); + if (err <= 0) + goto errout_free; - err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); - if (err > 0) - err = 0; -out: return err; + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +errout: + return err; -out_free: +errout_free: kfree_skb(skb); - goto out; -} - -int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct rtable *rt; - int h, s_h; - int idx, s_idx; - - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - for (h = 0; h <= rt_hash_mask; h++) { - if (h < s_h) continue; - if (h > s_h) - s_idx = 0; - rcu_read_lock_bh(); - for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference(rt->u.rt_next), idx++) { - if (idx < s_idx) - continue; - skb->dst = dst_clone(&rt->u.dst); - if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) <= 0) { - dst_release(xchg(&skb->dst, NULL)); - rcu_read_unlock_bh(); - goto done; - } - dst_release(xchg(&skb->dst, NULL)); - } - rcu_read_unlock_bh(); - } - -done: - cb->args[0] = h; - cb->args[1] = idx; - return skb->len; + goto errout; } void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL -static int flush_delay; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly = HZ / 2; +static int ip_rt_gc_elasticity __read_mostly = 8; -static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, +static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = (struct net *)__ctl->extra1; + if (write) { - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - rt_cache_flush(flush_delay); + 
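inet_rtm_getroute() above serves RTM_GETROUTE requests, i.e. the kernel side of 'ip route get', with rt_fill_info() building the RTM_NEWROUTE reply. For reference, a hypothetical userspace sketch of such a request; the structure layout comes from the standard rtnetlink headers, but the program itself is illustrative:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <arpa/inet.h>
    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/rtnetlink.h>

    int main(void)
    {
            struct {
                    struct nlmsghdr nlh;
                    struct rtmsg rtm;
                    char attrs[64];
            } req;
            struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
            struct rtattr *rta;
            char buf[4096];
            int fd, len;

            memset(&req, 0, sizeof(req));
            req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
            req.nlh.nlmsg_type = RTM_GETROUTE;
            req.nlh.nlmsg_flags = NLM_F_REQUEST;
            req.rtm.rtm_family = AF_INET;

            /* RTA_DST: destination to resolve (example address). */
            rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
            rta->rta_type = RTA_DST;
            rta->rta_len = RTA_LENGTH(4);
            inet_pton(AF_INET, "192.0.2.1", RTA_DATA(rta));
            req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

            fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
            if (fd < 0 || sendto(fd, &req, req.nlh.nlmsg_len, 0,
                                 (struct sockaddr *)&sa, sizeof(sa)) < 0)
                    return 1;

            len = recv(fd, buf, sizeof(buf), 0);   /* reply built by rt_fill_info() */
            printf("received %d bytes of RTM_NEWROUTE reply\n", len);
            close(fd);
            return 0;
    }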
 		rt_cache_flush(net);
+		fnhe_genid_bump(net);
 		return 0;
-	}
+	}

 	return -EINVAL;
 }

-static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
-						int __user *name,
-						int nlen,
-						void __user *oldval,
-						size_t __user *oldlenp,
-						void __user *newval,
-						size_t newlen,
-						void **context)
-{
-	int delay;
-	if (newlen != sizeof(int))
-		return -EINVAL;
-	if (get_user(delay, (int __user *)newval))
-		return -EFAULT;
-	rt_cache_flush(delay);
-	return 0;
-}
-
-ctl_table ipv4_route_table[] = {
-	{
-		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
-		.procname	= "flush",
-		.data		= &flush_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0200,
-		.proc_handler	= &ipv4_sysctl_rtcache_flush,
-		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
-	},
+static struct ctl_table ipv4_route_table[] = {
 	{
-		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
-		.procname	= "min_delay",
-		.data		= &ip_rt_min_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-	{
-		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
-		.procname	= "max_delay",
-		.data		= &ip_rt_max_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-	{
-		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
 		.procname	= "gc_thresh",
 		.data		= &ipv4_dst_ops.gc_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
 		.procname	= "max_size",
 		.data		= &ip_rt_max_size,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		/* Deprecated. Use gc_min_interval_ms */
-
-		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
+
 		.procname	= "gc_min_interval",
 		.data		= &ip_rt_gc_min_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
 		.procname	= "gc_min_interval_ms",
 		.data		= &ip_rt_gc_min_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_ms_jiffies,
-		.strategy	= &sysctl_ms_jiffies,
+		.proc_handler	= proc_dointvec_ms_jiffies,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
 		.procname	= "gc_timeout",
 		.data		= &ip_rt_gc_timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
 		.procname	= "gc_interval",
 		.data		= &ip_rt_gc_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
 		.procname	= "redirect_load",
 		.data		= &ip_rt_redirect_load,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
 		.procname	= "redirect_number",
 		.data		= &ip_rt_redirect_number,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
 		.procname	= "redirect_silence",
 		.data		= &ip_rt_redirect_silence,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
 		.procname	= "error_cost",
 		.data		= &ip_rt_error_cost,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
 		.procname	= "error_burst",
 		.data		= &ip_rt_error_burst,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
 		.procname	= "gc_elasticity",
 		.data		= &ip_rt_gc_elasticity,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
 		.procname	= "mtu_expires",
 		.data		= &ip_rt_mtu_expires,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
 		.procname	= "min_pmtu",
 		.data		= &ip_rt_min_pmtu,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
 		.procname	= "min_adv_mss",
 		.data		= &ip_rt_min_advmss,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
+	{ }
+};
+
+static struct ctl_table ipv4_route_flush_table[] = {
 	{
-		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
-		.procname	= "secret_interval",
-		.data		= &ip_rt_secret_interval,
+		.procname	= "flush",
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.mode		= 0200,
+		.proc_handler	= ipv4_sysctl_rtcache_flush,
 	},
-	{ .ctl_name = 0 }
+	{ },
 };
-#endif
-
-#ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct;
-
-/* This code sucks. But you should have seen it before! --RR */
-
-/* IP route accounting ptr for this logical cpu number. */
-#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
-#ifdef CONFIG_PROC_FS
-static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
-			   int length, int *eof, void *data)
+static __net_init int sysctl_route_net_init(struct net *net)
 {
-	unsigned int i;
+	struct ctl_table *tbl;

-	if ((offset & 3) || (length & 3))
-		return -EIO;
+	tbl = ipv4_route_flush_table;
+	if (!net_eq(net, &init_net)) {
+		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
+		if (tbl == NULL)
+			goto err_dup;

-	if (offset >= sizeof(struct ip_rt_acct) * 256) {
-		*eof = 1;
-		return 0;
+		/* Don't export sysctls to unprivileged users */
+		if (net->user_ns != &init_user_ns)
+			tbl[0].procname = NULL;
 	}
+	tbl[0].extra1 = net;

-	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
-		length = sizeof(struct ip_rt_acct) * 256 - offset;
-		*eof = 1;
-	}
+	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
+	if (net->ipv4.route_hdr == NULL)
+		goto err_reg;
+	return 0;

-	offset /= sizeof(u32);
+err_reg:
+	if (tbl != ipv4_route_flush_table)
+		kfree(tbl);
+err_dup:
+	return -ENOMEM;
+}

-	if (length > 0) {
-		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
-		u32 *dst = (u32 *) buffer;
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	tbl = net->ipv4.route_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->ipv4.route_hdr);
+	BUG_ON(tbl == ipv4_route_flush_table);
+	kfree(tbl);
+}

-		/* Copy first cpu. */
-		*start = buffer;
-		memcpy(dst, src, length);
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+	.init = sysctl_route_net_init,
+	.exit = sysctl_route_net_exit,
+};
+#endif

-		/* Add the other cpus in, one int at a time */
-		for_each_cpu(i) {
-			unsigned int j;
+static __net_init int rt_genid_init(struct net *net)
+{
+	atomic_set(&net->ipv4.rt_genid, 0);
+	atomic_set(&net->fnhe_genid, 0);
+	get_random_bytes(&net->ipv4.dev_addr_genid,
+			 sizeof(net->ipv4.dev_addr_genid));
+	return 0;
+}

-			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
+static __net_initdata struct pernet_operations rt_genid_ops = {
+	.init = rt_genid_init,
+};

-			for (j = 0; j < length/4; j++)
-				dst[j] += src[j];
-		}
-	}
-	return length;
+static int __net_init ipv4_inetpeer_init(struct net *net)
+{
+	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+	if (!bp)
+		return -ENOMEM;
+	inet_peer_base_init(bp);
+	net->ipv4.peers = bp;
+	return 0;
 }
-#endif /* CONFIG_PROC_FS */
-#endif /* CONFIG_NET_CLS_ROUTE */

-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
 {
-	if (!str)
-		return 0;
-	rhash_entries = simple_strtoul(str, &str, 0);
-	return 1;
+	struct inet_peer_base *bp = net->ipv4.peers;
+
+	net->ipv4.peers = NULL;
+	inetpeer_invalidate_tree(bp);
+	kfree(bp);
 }
-__setup("rhash_entries=", set_rhash_entries);
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+	.init = ipv4_inetpeer_init,
+	.exit = ipv4_inetpeer_exit,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
+#endif /* CONFIG_IP_ROUTE_CLASSID */

 int __init ip_rt_init(void)
 {
 	int rc = 0;

-	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
-			     (jiffies ^ (jiffies >> 7)));
+	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+	if (!ip_idents)
+		panic("IP: failed to allocate ip_idents\n");

-#ifdef CONFIG_NET_CLS_ROUTE
-	{
-	int order;
-	for (order = 0;
-	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
-		/* NOTHING */;
-	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
+	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
 	if (!ip_rt_acct)
 		panic("IP: failed to allocate ip_rt_acct\n");
-	memset(ip_rt_acct, 0, PAGE_SIZE << order);
-	}
 #endif

-	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
-						     sizeof(struct rtable),
-						     0, SLAB_HWCACHE_ALIGN,
-						     NULL, NULL);
-
-	if (!ipv4_dst_ops.kmem_cachep)
-		panic("IP: failed to allocate ip_dst_cache\n");
-
-	rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(num_physpages >= 128 * 1024) ?
-					(27 - PAGE_SHIFT) :
-					(29 - PAGE_SHIFT),
-					HASH_HIGHMEM,
-					&rt_hash_log,
-					&rt_hash_mask,
-					0);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
-
-	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
-	if (!rt_cache_stat)
-		return -ENOMEM;
+	ipv4_dst_ops.kmem_cachep =
+		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

-	devinet_init();
-	ip_fib_init();
+	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

-	init_timer(&rt_flush_timer);
-	rt_flush_timer.function = rt_run_flush;
-	init_timer(&rt_periodic_timer);
-	rt_periodic_timer.function = rt_check_expire;
-	init_timer(&rt_secret_timer);
-	rt_secret_timer.function = rt_secret_rebuild;
+	if (dst_entries_init(&ipv4_dst_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_ops counter\n");

-	/* All the timers, started at system startup tend
-	   to synchronize. Perturb it a bit.
-	 */
-	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
-					ip_rt_gc_interval;
-	add_timer(&rt_periodic_timer);
+	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

-	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
-		ip_rt_secret_interval;
-	add_timer(&rt_secret_timer);
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;

-#ifdef CONFIG_PROC_FS
-	{
-	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
-	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
-	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
-					     proc_net_stat))) {
-		free_percpu(rt_cache_stat);
-		return -ENOMEM;
-	}
-	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
-	}
-#ifdef CONFIG_NET_CLS_ROUTE
-	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
-#endif
-#endif
+	devinet_init();
+	ip_fib_init();
+
+	if (ip_rt_proc_init())
+		pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
 	xfrm_init();
 	xfrm4_init();
 #endif
+	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
+
+#ifdef CONFIG_SYSCTL
+	register_pernet_subsys(&sysctl_route_ops);
+#endif
+	register_pernet_subsys(&rt_genid_ops);
+	register_pernet_subsys(&ipv4_inetpeer_ops);

 	return rc;
 }

-EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_input);
-EXPORT_SYMBOL(ip_route_output_key);
+#ifdef CONFIG_SYSCTL
+/*
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
+}
+#endif
