Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--  net/ipv4/route.c | 3335
1 file changed, 1528 insertions(+), 1807 deletions(-)
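One mechanism this patch introduces is the per-bucket IP identifier generator (ip_idents_reserve() / __ip_select_ident() in the diff below), which replaces the old per-peer ID counter. As a reading aid, here is a stand-alone user-space sketch of the reserve-and-perturb idea; it is not the kernel code: the bucket count, the wall-clock stamp, the rand()-based perturbation and the names ident_bucket/ident_reserve are assumptions made purely for illustration.

    /* Sketch of the reserve-and-perturb scheme behind ip_idents_reserve().
     * Illustrative only; names, clock and perturbation source are assumed. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define IDENT_BUCKETS 2048u

    struct ident_bucket {
            atomic_uint id;     /* free-running identifier counter        */
            atomic_uint stamp;  /* coarse time of the last reservation    */
    };

    static struct ident_bucket idents[IDENT_BUCKETS];

    /* Reserve 'segs' consecutive IDs for the flow hashed to 'hash' and
     * return the first identifier of the reserved range. */
    static unsigned int ident_reserve(unsigned int hash, unsigned int segs)
    {
            struct ident_bucket *b = &idents[hash % IDENT_BUCKETS];
            unsigned int now = (unsigned int)time(NULL);
            unsigned int old = atomic_load(&b->stamp);
            unsigned int delta = 0;

            /* If the bucket has been idle, add a random perturbation so an
             * observer cannot infer how many packets were sent meanwhile. */
            if (old != now &&
                atomic_compare_exchange_strong(&b->stamp, &old, now) &&
                now > old)
                    delta = (unsigned int)rand() % (now - old);

            return atomic_fetch_add(&b->id, segs + delta) + delta;
    }

    int main(void)
    {
            srand((unsigned int)time(NULL));
            /* Two reservations on the same bucket: IDs only ever advance. */
            printf("first  id: %u\n", ident_reserve(42, 1));
            printf("second id: %u\n", ident_reserve(42, 1));
            return 0;
    }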
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7b5e8e1d94b..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,8 +5,6 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> @@ -64,14 +62,14 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv4: " fmt + #include <linux/module.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -81,7 +79,6 @@ #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> -#include <linux/workqueue.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> @@ -89,9 +86,10 @@ #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> +#include <linux/slab.h> +#include <linux/jhash.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -108,70 +106,71 @@ #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> +#include <linux/kmemleak.h> #endif +#include <net/secure_seq.h> -#define RT_FL_TOS(oldflp) \ - ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) - -#define IP_MAX_MTU 0xFFF0 +#define RT_FL_TOS(oldflp4) \ + ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; -static int ip_rt_gc_timeout = RT_GC_TIMEOUT; -static int ip_rt_gc_interval = 60 * HZ; -static int ip_rt_gc_min_interval = HZ / 2; -static int ip_rt_redirect_number = 9; -static int ip_rt_redirect_load = HZ / 50; -static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1)); -static int ip_rt_error_cost = HZ; -static int ip_rt_error_burst = 5 * HZ; -static int ip_rt_gc_elasticity = 8; -static int ip_rt_mtu_expires = 10 * 60 * HZ; -static int ip_rt_min_pmtu = 512 + 20 + 20; -static int ip_rt_min_advmss = 256; -static int ip_rt_secret_interval = 10 * 60 * HZ; - -#define RTprint(a...) printk(KERN_DEBUG a) - -static void rt_worker_func(struct work_struct *work); -static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); -static struct timer_list rt_secret_timer; +static int ip_rt_redirect_number __read_mostly = 9; +static int ip_rt_redirect_load __read_mostly = HZ / 50; +static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); +static int ip_rt_error_cost __read_mostly = HZ; +static int ip_rt_error_burst __read_mostly = 5 * HZ; +static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; +static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; +static int ip_rt_min_advmss __read_mostly = 256; /* * Interface to generic destination cache. 
*/ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); -static void ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, - struct net_device *dev, int how); +static unsigned int ipv4_default_advmss(const struct dst_entry *dst); +static unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(struct dst_ops *ops); +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +static void ipv4_dst_destroy(struct dst_entry *dst); +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + WARN_ON(1); + return NULL; +} + +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), - .gc = rt_garbage_collect, + .protocol = cpu_to_be16(ETH_P_IP), .check = ipv4_dst_check, + .default_advmss = ipv4_default_advmss, + .mtu = ipv4_mtu, + .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, - .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, - .local_out = ip_local_out, - .entry_size = sizeof(struct rtable), - .entries = ATOMIC_INIT(0), + .redirect = ip_do_redirect, + .local_out = __ip_local_out, + .neigh_lookup = ipv4_neigh_lookup, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, - ECN_OR_COST(FILLER), + ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, @@ -187,166 +186,27 @@ const __u8 ip_tos2prio[16] = { TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; - - -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. - * 2) Only writers remove entries, and they hold the lock - * as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - * they do so with atomic increments and with the - * lock held. - */ - -struct rt_hash_bucket { - struct rtable *chain; -}; -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. 
- * (on lockdep we have a quite big spinlock_t, so keep the size down there) - */ -#ifdef CONFIG_LOCKDEP -# define RT_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define RT_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define RT_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define RT_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define RT_HASH_LOCK_SZ 512 -# else -# define RT_HASH_LOCK_SZ 256 -# endif -#endif - -static spinlock_t *rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] - -static __init void rt_hash_lock_init(void) -{ - int i; - - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, - GFP_KERNEL); - if (!rt_hash_locks) - panic("IP: failed to allocate rt_hash_locks\n"); - - for (i = 0; i < RT_HASH_LOCK_SZ; i++) - spin_lock_init(&rt_hash_locks[i]); -} -#else -# define rt_hash_lock_addr(slot) NULL - -static inline void rt_hash_lock_init(void) -{ -} -#endif - -static struct rt_hash_bucket *rt_hash_table; -static unsigned rt_hash_mask; -static unsigned int rt_hash_log; -static atomic_t rt_genid; +EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) \ - (__raw_get_cpu_var(rt_cache_stat).field++) - -static unsigned int rt_hash_code(u32 daddr, u32 saddr) -{ - return jhash_2words(daddr, saddr, atomic_read(&rt_genid)) - & rt_hash_mask; -} - -#define rt_hash(daddr, saddr, idx) \ - rt_hash_code((__force u32)(__be32)(daddr),\ - (__force u32)(__be32)(saddr) ^ ((idx) << 5)) +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { - int bucket; - int genid; -}; - -static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st) -{ - struct rtable *r = NULL; - - for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - rcu_read_lock_bh(); - r = rcu_dereference(rt_hash_table[st->bucket].chain); - while (r) { - if (r->rt_genid == st->genid) - return r; - r = rcu_dereference(r->u.dst.rt_next); - } - rcu_read_unlock_bh(); - } - return r; -} - -static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r) -{ - r = r->u.dst.rt_next; - while (!r) { - rcu_read_unlock_bh(); - if (--st->bucket < 0) - break; - rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; - } - return rcu_dereference(r); -} - -static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos) -{ - struct rtable *r = rt_cache_get_first(st); - - if (r) - while (pos && (r = rt_cache_get_next(st, r))) { - if (r->rt_genid != st->genid) - continue; - --pos; - } - return pos ? 
NULL : r; -} - static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { - struct rt_cache_iter_state *st = seq->private; - if (*pos) - return rt_cache_get_idx(st, *pos - 1); - st->genid = atomic_read(&rt_genid); + return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct rtable *r; - struct rt_cache_iter_state *st = seq->private; - - if (v == SEQ_START_TOKEN) - r = rt_cache_get_first(st); - else - r = rt_cache_get_next(st, v); ++*pos; - return r; + return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { - if (v && v != SEQ_START_TOKEN) - rcu_read_unlock_bh(); } static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -356,28 +216,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); - else { - struct rtable *r = v; - char temp[256]; - - sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" - "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", - r->u.dst.dev ? r->u.dst.dev->name : "*", - (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (unsigned long)r->rt_src, - (dst_metric(&r->u.dst, RTAX_ADVMSS) ? - (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), - dst_metric(&r->u.dst, RTAX_WINDOW), - (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + - dst_metric(&r->u.dst, RTAX_RTTVAR)), - r->fl.fl4_tos, - r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, - r->u.dst.hh ? (r->u.dst.hh->hh_output == - dev_queue_xmit) : 0, - r->rt_spec_dst); - seq_printf(seq, "%-127s\n", temp); - } return 0; } @@ -390,8 +228,7 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &rt_cache_seq_ops, - sizeof(struct rt_cache_iter_state)); + return seq_open(file, &rt_cache_seq_ops); } static const struct file_operations rt_cache_seq_fops = { @@ -399,7 +236,7 @@ static const struct file_operations rt_cache_seq_fops = { .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release, }; @@ -410,7 +247,7 @@ static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; @@ -423,7 +260,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; - for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; @@ -449,8 +286,8 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", - atomic_read(&ipv4_dst_ops.entries), - st->in_hit, + dst_entries_get_slow(&ipv4_dst_ops), + 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, @@ -458,16 +295,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) st->in_martian_dst, st->in_martian_src, - st->out_hit, + 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, - st->gc_total, - st->gc_ignored, - st->gc_goal_miss, - st->gc_dst_overflow, - st->in_hlist_search, - st->out_hlist_search + 0, /* st->gc_total */ + 0, /* st->gc_ignored */ + 0, /* st->gc_goal_miss */ + 0, /* st->gc_dst_overflow */ + 0, 
/* st->in_hlist_search */ + 0 /* st->out_hlist_search */ ); return 0; } @@ -493,52 +330,51 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE -static int ip_rt_acct_read(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) +#ifdef CONFIG_IP_ROUTE_CLASSID +static int rt_acct_proc_show(struct seq_file *m, void *v) { - unsigned int i; + struct ip_rt_acct *dst, *src; + unsigned int i, j; - if ((offset & 3) || (length & 3)) - return -EIO; - - if (offset >= sizeof(struct ip_rt_acct) * 256) { - *eof = 1; - return 0; - } + dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + if (!dst) + return -ENOMEM; - if (offset + length >= sizeof(struct ip_rt_acct) * 256) { - length = sizeof(struct ip_rt_acct) * 256 - offset; - *eof = 1; + for_each_possible_cpu(i) { + src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); + for (j = 0; j < 256; j++) { + dst[j].o_bytes += src[j].o_bytes; + dst[j].o_packets += src[j].o_packets; + dst[j].i_bytes += src[j].i_bytes; + dst[j].i_packets += src[j].i_packets; + } } - offset /= sizeof(u32); - - if (length > 0) { - u32 *dst = (u32 *) buffer; - - *start = buffer; - memset(dst, 0, length); - - for_each_possible_cpu(i) { - unsigned int j; - u32 *src; + seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); + kfree(dst); + return 0; +} - src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; - for (j = 0; j < length/4; j++) - dst[j] += src[j]; - } - } - return length; +static int rt_acct_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt_acct_proc_show, NULL); } + +static const struct file_operations rt_acct_proc_fops = { + .owner = THIS_MODULE, + .open = rt_acct_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif -static __init int ip_rt_proc_init(struct net *net) +static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, - &rt_cache_seq_fops); + pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + &rt_cache_seq_fops); if (!pde) goto err1; @@ -547,15 +383,14 @@ static __init int ip_rt_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE - pde = create_proc_read_entry("rt_acct", 0, net->proc_net, - ip_rt_acct_read, NULL); +#ifdef CONFIG_IP_ROUTE_CLASSID + pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -564,580 +399,340 @@ err2: err1: return -ENOMEM; } -#else -static inline int ip_rt_proc_init(struct net *net) -{ - return 0; -} -#endif /* CONFIG_PROC_FS */ -static __inline__ void rt_free(struct rtable *rt) +static void __net_exit ip_rt_do_proc_exit(struct net *net) { - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + remove_proc_entry("rt_cache", net->proc_net_stat); + remove_proc_entry("rt_cache", net->proc_net); +#ifdef CONFIG_IP_ROUTE_CLASSID + remove_proc_entry("rt_acct", net->proc_net); +#endif } -static __inline__ void rt_drop(struct rtable *rt) +static struct pernet_operations ip_rt_proc_ops __net_initdata = { + .init = ip_rt_do_proc_init, + .exit = ip_rt_do_proc_exit, +}; + +static int __init ip_rt_proc_init(void) { - ip_rt_put(rt); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + return register_pernet_subsys(&ip_rt_proc_ops); } -static __inline__ int rt_fast_clean(struct rtable *rth) +#else +static inline 
int ip_rt_proc_init(void) { - /* Kill broadcast/multicast entries very aggresively, if they - collide in hash table with more useful entries */ - return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->fl.iif && rth->u.dst.rt_next; + return 0; } +#endif /* CONFIG_PROC_FS */ -static __inline__ int rt_valuable(struct rtable *rth) +static inline bool rt_is_expired(const struct rtable *rth) { - return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->u.dst.expires; + return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); } -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +void rt_cache_flush(struct net *net) { - unsigned long age; - int ret = 0; - - if (atomic_read(&rth->u.dst.__refcnt)) - goto out; - - ret = 1; - if (rth->u.dst.expires && - time_after_eq(jiffies, rth->u.dst.expires)) - goto out; - - age = jiffies - rth->u.dst.lastuse; - ret = 0; - if ((age <= tmo1 && !rt_fast_clean(rth)) || - (age <= tmo2 && rt_valuable(rth))) - goto out; - ret = 1; -out: return ret; + rt_genid_bump_ipv4(net); } -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter - */ -static inline u32 rt_score(struct rtable *rt) +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) { - u32 score = jiffies - rt->u.dst.lastuse; + struct net_device *dev = dst->dev; + const __be32 *pkey = daddr; + const struct rtable *rt; + struct neighbour *n; - score = ~score & ~(3<<30); + rt = (const struct rtable *) dst; + if (rt->rt_gateway) + pkey = (const __be32 *) &rt->rt_gateway; + else if (skb) + pkey = &ip_hdr(skb)->daddr; - if (rt_valuable(rt)) - score |= (1<<31); - - if (!rt->fl.iif || - !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) - score |= (1<<30); - - return score; + n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); + if (n) + return n; + return neigh_create(&arp_tbl, pkey, dev); } -static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) -{ - return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | - (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->nl_u.ip4_u.tos ^ - *(u16 *)&fl2->nl_u.ip4_u.tos) | - (fl1->oif ^ fl2->oif) | - (fl1->iif ^ fl2->iif)) == 0; -} +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { + atomic_t id; + u32 stamp32; +}; -static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) -{ - return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net; -} +static struct ip_ident_bucket *ip_idents __read_mostly; -/* - * Perform a full scan of hash table and free all entries. - * Can be called by a softirq or a process. - * In the later case, we want to be reschedule if necessary +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. 
*/ -static void rt_do_flush(int process_context) +u32 ip_idents_reserve(u32 hash, int segs) { - unsigned int i; - struct rtable *rth, *next; - - for (i = 0; i <= rt_hash_mask; i++) { - if (process_context && need_resched()) - cond_resched(); - rth = rt_hash_table[i].chain; - if (!rth) - continue; + struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(bucket->stamp32); + u32 now = (u32)jiffies; + u32 delta = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - rth = rt_hash_table[i].chain; - rt_hash_table[i].chain = NULL; - spin_unlock_bh(rt_hash_lock_addr(i)); + if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + delta = prandom_u32_max(now - old); - for (; rth; rth = next) { - next = rth->u.dst.rt_next; - rt_free(rth); - } - } + return atomic_add_return(segs + delta, &bucket->id) - segs; } +EXPORT_SYMBOL(ip_idents_reserve); -static void rt_check_expire(void) +void __ip_select_ident(struct iphdr *iph, int segs) { - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth, **rthp; - u64 mult; - - mult = ((u64)ip_rt_gc_interval) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); - if (need_resched()) - cond_resched(); + hash = jhash_3words((__force u32)iph->daddr, + (__force u32)iph->saddr, + iph->protocol, + ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); +} +EXPORT_SYMBOL(__ip_select_ident); - if (*rthp == NULL) - continue; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { - *rthp = rth->u.dst.rt_next; - rt_free(rth); - continue; - } - if (rth->u.dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->u.dst.expires)) { - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - continue; - } +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct iphdr *iph, + int oif, u8 tos, + u8 prot, u32 mark, int flow_flags) +{ + if (sk) { + const struct inet_sock *inet = inet_sk(sk); - /* Cleanup aged off entries. */ - *rthp = rth->u.dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); + oif = sk->sk_bound_dev_if; + mark = sk->sk_mark; + tos = RT_CONN_FLAGS(sk); + prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; } - rover = i; + flowi4_init_output(fl4, oif, mark, tos, + RT_SCOPE_UNIVERSE, prot, + flow_flags, + iph->daddr, iph->saddr, 0, 0); } -/* - * rt_worker_func() is run in process context. 
- * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, + const struct sock *sk) { - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); + const struct iphdr *iph = ip_hdr(skb); + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; + + __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); } -/* - * Pertubation of rt_genid by a small quantity [1..256] - * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() - * many times (2^24) without giving recent rt_genid. - * Jenkins hash is strong enough that litle changes of rt_genid are OK. - */ -static void rt_cache_invalidate(void) +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { - unsigned char shuffle; + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; - get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &rt_genid); + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk), + daddr, inet->inet_saddr, 0, 0); + rcu_read_unlock(); } -/* - * delay < 0 : invalidate cache (fast : entries will be deleted later) - * delay >= 0 : invalidate & flush cache (can be long) - */ -void rt_cache_flush(int delay) +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct sk_buff *skb) { - rt_cache_invalidate(); - if (delay >= 0) - rt_do_flush(!in_softirq()); + if (skb) + build_skb_flow_key(fl4, skb, sk); + else + build_sk_flow_key(fl4, sk); } -/* - * We change rt_genid and let gc do the cleanup - */ -static void rt_secret_rebuild(unsigned long dummy) +static inline void rt_free(struct rtable *rt) { - rt_cache_invalidate(); - mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval); + call_rcu(&rt->dst.rcu_head, dst_rcu_free); } -/* - Short description of GC goals. +static DEFINE_SPINLOCK(fnhe_lock); - We want to build algorithm, which will keep routing cache - at some equilibrium point, when number of aged off entries - is kept approximately equal to newly generated ones. - - Current expiration strength is variable "expire". - We try to adjust it dynamically, so that if networking - is idle expires is large enough to keep enough of warm entries, - and when load increases it reduces to limit cache size. - */ - -static int rt_garbage_collect(struct dst_ops *ops) +static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { - static unsigned long expire = RT_GC_TIMEOUT; - static unsigned long last_gc; - static int rover; - static int equilibrium; - struct rtable *rth, **rthp; - unsigned long now = jiffies; - int goal; - - /* - * Garbage collection is pretty expensive, - * do not make it too frequently. - */ - - RT_CACHE_STAT_INC(gc_total); + struct rtable *rt; - if (now - last_gc < ip_rt_gc_min_interval && - atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { - RT_CACHE_STAT_INC(gc_ignored); - goto out; + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); + rt_free(rt); } - - /* Calculate number of entries, which we want to expire now. 
*/ - goal = atomic_read(&ipv4_dst_ops.entries) - - (ip_rt_gc_elasticity << rt_hash_log); - if (goal <= 0) { - if (equilibrium < ipv4_dst_ops.gc_thresh) - equilibrium = ipv4_dst_ops.gc_thresh; - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; - if (goal > 0) { - equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; - } - } else { - /* We are in dangerous area. Try to reduce cache really - * aggressively. - */ - goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); - equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); + rt_free(rt); } +} - if (now - last_gc >= ip_rt_gc_min_interval) - last_gc = now; +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +{ + struct fib_nh_exception *fnhe, *oldest; - if (goal <= 0) { - equilibrium += goal; - goto work_done; + oldest = rcu_dereference(hash->chain); + for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + oldest = fnhe; } + fnhe_flush_routes(oldest); + return oldest; +} - do { - int i, k; - - for (i = rt_hash_mask, k = rover; i >= 0; i--) { - unsigned long tmo = expire; - - k = (k + 1) & rt_hash_mask; - rthp = &rt_hash_table[k].chain; - spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = *rthp) != NULL) { - if (rth->rt_genid == atomic_read(&rt_genid) && - !rt_may_expire(rth, tmo, expire)) { - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - continue; - } - *rthp = rth->u.dst.rt_next; - rt_free(rth); - goal--; - } - spin_unlock_bh(rt_hash_lock_addr(k)); - if (goal <= 0) - break; - } - rover = k; - - if (goal <= 0) - goto work_done; - - /* Goal is not achieved. We stop process if: - - - if expire reduced to zero. Otherwise, expire is halfed. - - if table is not full. - - if we are called from interrupt. - - jiffies check is just fallback/debug loop breaker. - We will not spin here for long time in any case. 
- */ - - RT_CACHE_STAT_INC(gc_goal_miss); +static inline u32 fnhe_hashfun(__be32 daddr) +{ + u32 hval; - if (expire == 0) - break; + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); - expire >>= 1; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, i); -#endif + return hval & (FNHE_HASH_SIZE - 1); +} - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) - goto out; - } while (!in_softirq() && time_before_eq(jiffies, now)); +static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) +{ + rt->rt_pmtu = fnhe->fnhe_pmtu; + rt->dst.expires = fnhe->fnhe_expires; - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) - goto out; - if (net_ratelimit()) - printk(KERN_WARNING "dst cache overflow\n"); - RT_CACHE_STAT_INC(gc_dst_overflow); - return 1; - -work_done: - expire += ip_rt_gc_min_interval; - if (expire > ip_rt_gc_timeout || - atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, rover); -#endif -out: return 0; + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; + rt->rt_uses_gateway = 1; + } } -static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + u32 pmtu, unsigned long expires) { - struct rtable *rth, **rthp; - unsigned long now; - struct rtable *cand, **candp; - u32 min_score; - int chain_length; - int attempts = !in_softirq(); + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe; + struct rtable *rt; + unsigned int i; + int depth; + u32 hval = fnhe_hashfun(daddr); -restart: - chain_length = 0; - min_score = ~(u32)0; - cand = NULL; - candp = NULL; - now = jiffies; + spin_lock_bh(&fnhe_lock); - rthp = &rt_hash_table[hash].chain; + hash = nh->nh_exceptions; + if (!hash) { + hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); + if (!hash) + goto out_unlock; + nh->nh_exceptions = hash; + } - spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { - *rthp = rth->u.dst.rt_next; - rt_free(rth); - continue; - } - if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { - /* Put it first */ - *rthp = rth->u.dst.rt_next; - /* - * Since lookup is lockfree, the deletion - * must be visible to another weakly ordered CPU before - * the insertion at the start of the hash chain. - */ - rcu_assign_pointer(rth->u.dst.rt_next, - rt_hash_table[hash].chain); - /* - * Since lookup is lockfree, the update writes - * must be ordered for consistency on SMP. 
- */ - rcu_assign_pointer(rt_hash_table[hash].chain, rth); + hash += hval; - dst_use(&rth->u.dst, now); - spin_unlock_bh(rt_hash_lock_addr(hash)); + depth = 0; + for (fnhe = rcu_dereference(hash->chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + break; + depth++; + } - rt_drop(rt); - *rp = rth; - return 0; + if (fnhe) { + if (gw) + fnhe->fnhe_gw = gw; + if (pmtu) { + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = max(1UL, expires); } - - if (!atomic_read(&rth->u.dst.__refcnt)) { - u32 score = rt_score(rth); - - if (score <= min_score) { - cand = rth; - candp = rthp; - min_score = score; - } + /* Update all cached dsts too */ + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) + fill_route_from_fnhe(rt, fnhe); + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) + fill_route_from_fnhe(rt, fnhe); + } else { + if (depth > FNHE_RECLAIM_DEPTH) + fnhe = fnhe_oldest(hash); + else { + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); } - - chain_length++; - - rthp = &rth->u.dst.rt_next; - } - - if (cand) { - /* ip_rt_gc_elasticity used to be average length of chain - * length, when exceeded gc becomes really aggressive. - * - * The second limit is less certain. At the moment it allows - * only 2 entries per bucket. We will see. + fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); + fnhe->fnhe_daddr = daddr; + fnhe->fnhe_gw = gw; + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = expires; + + /* Exception created; mark the cached routes for the nexthop + * stale, so anyone caching it rechecks if this exception + * applies to them. */ - if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->u.dst.rt_next; - rt_free(cand); - } - } + rt = rcu_dereference(nh->nh_rth_input); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; - /* Try to bind route to arp only if it is output - route or unicast forwarding path. - */ - if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); - if (err) { - spin_unlock_bh(rt_hash_lock_addr(hash)); - - if (err != -ENOBUFS) { - rt_drop(rt); - return err; - } - - /* Neighbour tables are full and nothing - can be released. Try to shrink route cache, - it is most likely it holds some neighbour records. - */ - if (attempts-- > 0) { - int saved_elasticity = ip_rt_gc_elasticity; - int saved_int = ip_rt_gc_min_interval; - ip_rt_gc_elasticity = 1; - ip_rt_gc_min_interval = 0; - rt_garbage_collect(&ipv4_dst_ops); - ip_rt_gc_min_interval = saved_int; - ip_rt_gc_elasticity = saved_elasticity; - goto restart; - } - - if (net_ratelimit()) - printk(KERN_WARNING "Neighbour table overflow.\n"); - rt_drop(rt); - return -ENOBUFS; + for_each_possible_cpu(i) { + struct rtable __rcu **prt; + prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + rt = rcu_dereference(*prt); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; } } - rt->u.dst.rt_next = rt_hash_table[hash].chain; -#if RT_CACHE_DEBUG >= 2 - if (rt->u.dst.rt_next) { - struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, - NIPQUAD(rt->rt_dst)); - for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) - printk(" . 
%u.%u.%u.%u", NIPQUAD(trt->rt_dst)); - printk("\n"); - } -#endif - rt_hash_table[hash].chain = rt; - spin_unlock_bh(rt_hash_lock_addr(hash)); - *rp = rt; - return 0; -} - -void rt_bind_peer(struct rtable *rt, int create) -{ - static DEFINE_SPINLOCK(rt_peer_lock); - struct inet_peer *peer; - - peer = inet_getpeer(rt->rt_dst, create); + fnhe->fnhe_stamp = jiffies; - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; - } - spin_unlock_bh(&rt_peer_lock); - if (peer) - inet_putpeer(peer); +out_unlock: + spin_unlock_bh(&fnhe_lock); } -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; - - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); -} - -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, + bool kill_route) { - struct rtable *rt = (struct rtable *) dst; - - if (rt) { - if (rt->peer == NULL) - rt_bind_peer(rt, 1); - - /* If peer is attached to destination, it is never detached, - so that we need not to grab a lock to dereference it. - */ - if (rt->peer) { - iph->id = htons(inet_getid(rt->peer, more)); - return; - } - } else - printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", - __builtin_return_address(0)); - - ip_select_fb_ident(iph); -} + __be32 new_gw = icmp_hdr(skb)->un.gateway; + __be32 old_gw = ip_hdr(skb)->saddr; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct fib_result res; + struct neighbour *n; + struct net *net; -static void rt_del(unsigned hash, struct rtable *rt) -{ - struct rtable **rthp, *aux; + switch (icmp_hdr(skb)->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + break; - rthp = &rt_hash_table[hash].chain; - spin_lock_bh(rt_hash_lock_addr(hash)); - ip_rt_put(rt); - while ((aux = *rthp) != NULL) { - if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) { - *rthp = aux->u.dst.rt_next; - rt_free(aux); - continue; - } - rthp = &aux->u.dst.rt_next; + default: + return; } - spin_unlock_bh(rt_hash_lock_addr(hash)); -} -void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, - __be32 saddr, struct net_device *dev) -{ - int i, k; - struct in_device *in_dev = in_dev_get(dev); - struct rtable *rth, **rthp; - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct netevent_redirect netevent; + if (rt->rt_gateway != old_gw) + return; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; - if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) - || ipv4_is_zeronet(new_gw)) + net = dev_net(dev); + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || + ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || + ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { @@ -1146,133 +741,73 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, if (IN_DEV_SEC_REDIRECTS(in_dev) && 
ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { - if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST) + if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } - for (i = 0; i < 2; i++) { - for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); - - rthp=&rt_hash_table[hash].chain; - - rcu_read_lock(); - while ((rth = rcu_dereference(*rthp)) != NULL) { - struct rtable *rt; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->fl.oif != ikeys[k] || - rth->fl.iif != 0 || - rth->rt_genid != atomic_read(&rt_genid)) { - rthp = &rth->u.dst.rt_next; - continue; - } - - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->u.dst.error || - rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) - break; - - dst_hold(&rth->u.dst); - rcu_read_unlock(); + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + if (n) { + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + } else { + if (fib_lookup(net, fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); - rt = dst_alloc(&ipv4_dst_ops); - if (rt == NULL) { - ip_rt_put(rth); - in_dev_put(in_dev); - return; - } - - /* Copy all the information. */ - *rt = *rth; - INIT_RCU_HEAD(&rt->u.dst.rcu_head); - rt->u.dst.__use = 1; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.child = NULL; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); - if (rt->idev) - in_dev_hold(rt->idev); - rt->u.dst.obsolete = 0; - rt->u.dst.lastuse = jiffies; - rt->u.dst.path = &rt->u.dst; - rt->u.dst.neighbour = NULL; - rt->u.dst.hh = NULL; - rt->u.dst.xfrm = NULL; - rt->rt_genid = atomic_read(&rt_genid); - rt->rt_flags |= RTCF_REDIRECTED; - - /* Gateway is different ... */ - rt->rt_gateway = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->u.dst); - - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - - if (arp_bind_neighbour(&rt->u.dst) || - !(rt->u.dst.neighbour->nud_state & - NUD_VALID)) { - if (rt->u.dst.neighbour) - neigh_event_send(rt->u.dst.neighbour, NULL); - ip_rt_put(rth); - rt_drop(rt); - goto do_next; - } - - netevent.old = &rth->u.dst; - netevent.new = &rt->u.dst; - call_netevent_notifiers(NETEVENT_REDIRECT, - &netevent); - - rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt)) - ip_rt_put(rt); - goto do_next; + update_or_create_fnhe(nh, fl4->daddr, new_gw, + 0, 0); } - rcu_read_unlock(); - do_next: - ; + if (kill_route) + rt->dst.obsolete = DST_OBSOLETE_KILL; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } + neigh_release(n); } - in_dev_put(in_dev); return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about " - "%u.%u.%u.%u ignored.\n" - " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n", - NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), - NIPQUAD(saddr), NIPQUAD(daddr)); + if (IN_DEV_LOG_MARTIANS(in_dev)) { + const struct iphdr *iph = (const struct iphdr *) skb->data; + __be32 daddr = iph->daddr; + __be32 saddr = iph->saddr; + + net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" + " Advised path = %pI4 -> %pI4\n", + &old_gw, dev->name, &new_gw, + &saddr, &daddr); + } #endif - in_dev_put(in_dev); + ; +} + +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl4; + const struct iphdr *iph = (const struct iphdr *) skb->data; + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; + + rt = 
(struct rtable *) dst; + + __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __ip_do_redirect(rt, skb, &fl4, true); } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { - struct rtable *rt = (struct rtable*)dst; + struct rtable *rt = (struct rtable *)dst; struct dst_entry *ret = dst; if (rt) { - if (dst->obsolete) { + if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->u.dst.expires) { - unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif); -#if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ipv4_negative_advice: redirect to " - "%u.%u.%u.%u/%02x dropped\n", - NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); -#endif - rt_del(hash, rt); + rt->dst.expires) { + ip_rt_put(rt); ret = NULL; } } @@ -1297,216 +832,305 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { - struct rtable *rt = (struct rtable*)skb->dst; - struct in_device *in_dev = in_dev_get(rt->u.dst.dev); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct inet_peer *peer; + struct net *net; + int log_martians; - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { + rcu_read_unlock(); return; + } + log_martians = IN_DEV_LOG_MARTIANS(in_dev); + rcu_read_unlock(); - if (!IN_DEV_TX_REDIRECTS(in_dev)) - goto out; + net = dev_net(rt->dst.dev); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + if (!peer) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, + rt_nexthop(rt, ip_hdr(skb)->daddr)); + return; + } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) - rt->u.dst.rate_tokens = 0; + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything - * set u.dst.rate_last to the last seen redirected packet. + * set dst.rate_last to the last seen redirected packet. */ - if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { - rt->u.dst.rate_last = jiffies; - goto out; + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; + goto out_put_peer; } /* Check for load limit; set rate_last to the latest sent * redirect. 
*/ - if (rt->u.dst.rate_tokens == 0 || + if (peer->rate_tokens == 0 || time_after(jiffies, - (rt->u.dst.rate_last + - (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->u.dst.rate_last = jiffies; - ++rt->u.dst.rate_tokens; + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { + __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); + peer->rate_last = jiffies; + ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && - rt->u.dst.rate_tokens == ip_rt_redirect_number && - net_ratelimit()) - printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores " - "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n", - NIPQUAD(rt->rt_src), rt->rt_iif, - NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); + if (log_martians && + peer->rate_tokens == ip_rt_redirect_number) + net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", + &ip_hdr(skb)->saddr, inet_iif(skb), + &ip_hdr(skb)->daddr, &gw); #endif } -out: - in_dev_put(in_dev); +out_put_peer: + inet_putpeer(peer); } static int ip_error(struct sk_buff *skb) { - struct rtable *rt = (struct rtable*)skb->dst; + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; unsigned long now; + struct net *net; + bool send; int code; - switch (rt->u.dst.error) { - case EINVAL: - default: - goto out; + net = dev_net(rt->dst.dev); + if (!IN_DEV_FORWARD(in_dev)) { + switch (rt->dst.error) { case EHOSTUNREACH: - code = ICMP_HOST_UNREACH; + IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); break; + case ENETUNREACH: - code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); - break; - case EACCES: - code = ICMP_PKT_FILTERED; + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); break; + } + goto out; } - now = jiffies; - rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; - if (rt->u.dst.rate_tokens > ip_rt_error_burst) - rt->u.dst.rate_tokens = ip_rt_error_burst; - rt->u.dst.rate_last = now; - if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { - rt->u.dst.rate_tokens -= ip_rt_error_cost; - icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + switch (rt->dst.error) { + case EINVAL: + default: + goto out; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; } + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if (peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; + inet_putpeer(peer); + } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + out: kfree_skb(skb); return 0; } -/* - * The last two values are not from the RFC but - * are needed for AMPRnet AX.25 paths. 
- */ +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +{ + struct dst_entry *dst = &rt->dst; + struct fib_result res; -static const unsigned short mtu_plateau[] = -{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + if (dst_metric_locked(dst, RTAX_MTU)) + return; -static __inline__ unsigned short guess_mtu(unsigned short old_mtu) -{ - int i; + if (dst->dev->mtu < mtu) + return; - for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) - if (old_mtu > mtu_plateau[i]) - return mtu_plateau[i]; - return 68; + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + + if (rt->rt_pmtu == mtu && + time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) + return; + + rcu_read_lock(); + if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); + + update_or_create_fnhe(nh, fl4->daddr, 0, mtu, + jiffies + ip_rt_mtu_expires); + } + rcu_read_unlock(); } -unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, - unsigned short new_mtu) +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { - int i; - unsigned short old_mtu = ntohs(iph->tot_len); - struct rtable *rth; - __be32 skeys[2] = { iph->saddr, 0, }; - __be32 daddr = iph->daddr; - unsigned short est_mtu = 0; + struct rtable *rt = (struct rtable *) dst; + struct flowi4 fl4; - if (ipv4_config.no_pmtu_disc) - return 0; + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_rt_update_pmtu(rt, &fl4, mtu); +} - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], 0); +void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, + int oif, u32 mark, u8 protocol, int flow_flags) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { - if (rth->fl.fl4_dst == daddr && - rth->fl.fl4_src == skeys[i] && - rth->rt_dst == daddr && - rth->rt_src == iph->saddr && - rth->fl.iif == 0 && - !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) && - rth->u.dst.dev->nd_net == net && - rth->rt_genid == atomic_read(&rt_genid)) { - unsigned short mtu = new_mtu; - - if (new_mtu < 68 || new_mtu >= old_mtu) { - - /* BSD 4.2 compatibility hack :-( */ - if (mtu == 0 && - old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; - - mtu = guess_mtu(old_mtu); - } - if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) { - if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { - dst_confirm(&rth->u.dst); - if (mtu < ip_rt_min_pmtu) { - mtu = ip_rt_min_pmtu; - rth->u.dst.metrics[RTAX_LOCK-1] |= - (1 << RTAX_MTU); - } - rth->u.dst.metrics[RTAX_MTU-1] = mtu; - dst_set_expires(&rth->u.dst, - ip_rt_mtu_expires); - } - est_mtu = mtu; - } - } - } - rcu_read_unlock(); + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } - return est_mtu ? 
: new_mtu; } +EXPORT_SYMBOL_GPL(ipv4_update_pmtu); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 && - !(dst_metric_locked(dst, RTAX_MTU))) { - if (mtu < ip_rt_min_pmtu) { - mtu = ip_rt_min_pmtu; - dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); - } - dst->metrics[RTAX_MTU-1] = mtu; - dst_set_expires(dst, ip_rt_mtu_expires); - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } } -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - return NULL; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + struct dst_entry *odst = NULL; + bool new = false; + + bh_lock_sock(sk); + + if (!ip_sk_accept_pmtu(sk)) + goto out; + + odst = sk_dst_get(sk); + + if (sock_owned_by_user(sk) || !odst) { + __ipv4_sk_update_pmtu(skb, sk, mtu); + goto out; + } + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; + + new = true; + } + + __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + + if (!dst_check(&rt->dst, 0)) { + if (new) + dst_release(&rt->dst); + + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; + + new = true; + } + + if (new) + sk_dst_set(sk, &rt->dst); + +out: + bh_unlock_sock(sk); + dst_release(odst); } +EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -static void ipv4_dst_destroy(struct dst_entry *dst) +void ipv4_redirect(struct sk_buff *skb, struct net *net, + int oif, u32 mark, u8 protocol, int flow_flags) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer = rt->peer; - struct in_device *idev = rt->idev; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (peer) { - rt->peer = NULL; - inet_putpeer(peer); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); } +} +EXPORT_SYMBOL_GPL(ipv4_redirect); - if (idev) { - rt->idev = NULL; - in_dev_put(idev); +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); } } +EXPORT_SYMBOL_GPL(ipv4_sk_redirect); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; - struct in_device *idev = rt->idev; - if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = - 
in_dev_get(dev->nd_net->loopback_dev); - if (loopback_idev) { - rt->idev = loopback_idev; - in_dev_put(idev); - } - } + + /* All IPV4 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + * + * When a PMTU/redirect information update invalidates a route, + * this is indicated by setting obsolete to DST_OBSOLETE_KILL or + * DST_OBSOLETE_DEAD by dst_free(). + */ + if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) + return NULL; + return dst; } static void ipv4_link_failure(struct sk_buff *skb) @@ -1515,17 +1139,18 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - rt = (struct rtable *) skb->dst; + rt = skb_rtable(skb); if (rt) - dst_set_expires(&rt->u.dst, 0); + dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { - printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", - NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr), - skb->dev ? skb->dev->name : "?"); + pr_debug("%s: %pI4 -> %pI4, %s\n", + __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + skb->dev ? skb->dev->name : "?"); kfree_skb(skb); + WARN_ON(1); return 0; } @@ -1538,82 +1163,275 @@ static int ip_rt_bug(struct sk_buff *skb) in IP options! */ -void ip_rt_get_source(u8 *addr, struct rtable *rt) +void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; - struct fib_result res; - if (rt->fl.iif == 0) - src = rt->rt_src; - else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) { - src = FIB_RES_PREFSRC(res); - fib_res_put(&res); - } else - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, - RT_SCOPE_UNIVERSE); + if (rt_is_output_route(rt)) + src = ip_hdr(skb)->saddr; + else { + struct fib_result res; + struct flowi4 fl4; + struct iphdr *iph; + + iph = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = rt->dst.dev->ifindex; + fl4.flowi4_iif = skb->dev->ifindex; + fl4.flowi4_mark = skb->mark; + + rcu_read_lock(); + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); + else + src = inet_select_addr(rt->dst.dev, + rt_nexthop(rt, iph->daddr), + RT_SCOPE_UNIVERSE); + rcu_read_unlock(); + } memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { - if (!(rt->u.dst.tclassid & 0xFFFF)) - rt->u.dst.tclassid |= tag & 0xFFFF; - if (!(rt->u.dst.tclassid & 0xFFFF0000)) - rt->u.dst.tclassid |= tag & 0xFFFF0000; + if (!(rt->dst.tclassid & 0xFFFF)) + rt->dst.tclassid |= tag & 0xFFFF; + if (!(rt->dst.tclassid & 0xFFFF0000)) + rt->dst.tclassid |= tag & 0xFFFF0000; } #endif -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct fib_info *fi = res->fi; + unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + + if (advmss == 0) { + advmss = max_t(unsigned int, dst->dev->mtu - 40, + ip_rt_min_advmss); + if (advmss > 65535 - 40) + advmss = 65535 - 40; + } + return advmss; +} + +static unsigned int ipv4_mtu(const struct dst_entry *dst) +{ + const struct rtable *rt = (const struct rtable *) dst; + unsigned int mtu = rt->rt_pmtu; + + if (!mtu || time_after_eq(jiffies, rt->dst.expires)) + mtu = 
dst_metric_raw(dst, RTAX_MTU); + + if (mtu) + return mtu; + + mtu = dst->dev->mtu; + + if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { + if (rt->rt_uses_gateway && mtu > 576) + mtu = 576; + } + + return min_t(unsigned int, mtu, IP_MAX_MTU); +} + +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + u32 hval; + + if (!hash) + return NULL; + + hval = fnhe_hashfun(daddr); + + for (fnhe = rcu_dereference(hash[hval].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + return fnhe; + } + return NULL; +} + +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, + __be32 daddr) +{ + bool ret = false; + + spin_lock_bh(&fnhe_lock); + + if (daddr == fnhe->fnhe_daddr) { + struct rtable __rcu **porig; + struct rtable *orig; + int genid = fnhe_genid(dev_net(rt->dst.dev)); + + if (rt_is_input_route(rt)) + porig = &fnhe->fnhe_rth_input; + else + porig = &fnhe->fnhe_rth_output; + orig = rcu_dereference(*porig); + + if (fnhe->fnhe_genid != genid) { + fnhe->fnhe_genid = genid; + fnhe->fnhe_gw = 0; + fnhe->fnhe_pmtu = 0; + fnhe->fnhe_expires = 0; + fnhe_flush_routes(fnhe); + orig = NULL; + } + fill_route_from_fnhe(rt, fnhe); + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + + if (!(rt->dst.flags & DST_NOCACHE)) { + rcu_assign_pointer(*porig, rt); + if (orig) + rt_free(orig); + ret = true; + } + + fnhe->fnhe_stamp = jiffies; + } + spin_unlock_bh(&fnhe_lock); + + return ret; +} + +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ + struct rtable *orig, *prev, **p; + bool ret = true; + + if (rt_is_input_route(rt)) { + p = (struct rtable **)&nh->nh_rth_input; + } else { + p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + } + orig = *p; + + prev = cmpxchg(p, orig, rt); + if (prev == orig) { + if (orig) + rt_free(orig); + } else + ret = false; + + return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + spin_lock_bh(&rt_uncached_lock); + list_add_tail(&rt->rt_uncached, &rt_uncached_list); + spin_unlock_bh(&rt_uncached_lock); +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + + if (!list_empty(&rt->rt_uncached)) { + spin_lock_bh(&rt_uncached_lock); + list_del(&rt->rt_uncached); + spin_unlock_bh(&rt_uncached_lock); + } +} + +void rt_flush_dev(struct net_device *dev) +{ + if (!list_empty(&rt_uncached_list)) { + struct net *net = dev_net(dev); + struct rtable *rt; + + spin_lock_bh(&rt_uncached_lock); + list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { + if (rt->dst.dev != dev) + continue; + rt->dst.dev = net->loopback_dev; + dev_hold(rt->dst.dev); + dev_put(dev); + } + spin_unlock_bh(&rt_uncached_lock); + } +} + +static bool rt_cache_valid(const struct rtable *rt) +{ + return rt && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + !rt_is_expired(rt); +} + +static void rt_set_nexthop(struct rtable *rt, __be32 daddr, + const struct fib_result *res, + struct fib_nh_exception *fnhe, + struct fib_info *fi, u16 type, u32 itag) +{ + bool cached = false; if (fi) { - if (FIB_RES_GW(*res) && - FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = FIB_RES_GW(*res); - memcpy(rt->u.dst.metrics, fi->fib_metrics, - sizeof(rt->u.dst.metrics)); - if (fi->fib_mtu == 0) { - rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - if 
(rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && - rt->rt_gateway != rt->rt_dst && - rt->u.dst.dev->mtu > 576) - rt->u.dst.metrics[RTAX_MTU-1] = 576; + struct fib_nh *nh = &FIB_RES_NH(*res); + + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { + rt->rt_gateway = nh->nh_gw; + rt->rt_uses_gateway = 1; } -#ifdef CONFIG_NET_CLS_ROUTE - rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + dst_init_metrics(&rt->dst, fi->fib_metrics, true); +#ifdef CONFIG_IP_ROUTE_CLASSID + rt->dst.tclassid = nh->nh_tclassid; #endif + if (unlikely(fnhe)) + cached = rt_bind_exception(rt, fnhe, daddr); + else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_cache_route(nh, rt); + if (unlikely(!cached)) { + /* Routes we intend to cache in nexthop exception or + * FIB nexthop have the DST_NOCACHE bit clear. + * However, if we are unsuccessful at storing this + * route into the cache we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + rt_add_uncached_list(rt); + } } else - rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - - if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) - rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; - if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) - rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, - ip_rt_min_advmss); - if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) - rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; - -#ifdef CONFIG_NET_CLS_ROUTE + rt_add_uncached_list(rt); + +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES - set_class_tag(rt, fib_rules_tclass(res)); + set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif - rt->rt_type = res->type; } +static struct rtable *rt_dst_alloc(struct net_device *dev, + bool nopolicy, bool noxfrm, bool will_cache) +{ + return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); +} + +/* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned hash; struct rtable *rth; - __be32 spec_dst; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; + int err; /* Primary sanity checks. 
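For reference while reading the sanity checks that follow: the ipv4_is_*() helpers used there are simple tests on the big-endian address. A stand-alone userspace rendering of the same tests (the helper names mirror the kernel ones, but this sketch is illustrative, not the kernel header):

#include <arpa/inet.h>
#include <stdio.h>
#include <stdint.h>

/* Illustrative userspace copies of the address-class tests used below. */
static int is_multicast(uint32_t be_addr)   /* 224.0.0.0/4 */
{
        return (be_addr & htonl(0xf0000000)) == htonl(0xe0000000);
}
static int is_lbcast(uint32_t be_addr)      /* 255.255.255.255 */
{
        return be_addr == htonl(0xffffffff);
}
static int is_zeronet(uint32_t be_addr)     /* 0.0.0.0/8 */
{
        return (be_addr & htonl(0xff000000)) == 0;
}
static int is_loopback(uint32_t be_addr)    /* 127.0.0.0/8 */
{
        return (be_addr & htonl(0xff000000)) == htonl(0x7f000000);
}

int main(void)
{
        const char *probe[] = { "224.0.0.1", "255.255.255.255", "0.0.0.5", "127.0.0.1", "192.0.2.1" };

        for (int i = 0; i < 5; i++) {
                uint32_t a = inet_addr(probe[i]);

                printf("%-16s mc=%d lbcast=%d zeronet=%d loopback=%d\n",
                       probe[i], is_multicast(a), is_lbcast(a), is_zeronet(a), is_loopback(a));
        }
        return 0;
}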
*/ @@ -1621,69 +1439,61 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) + skb->protocol != htons(ETH_P_IP)) goto e_inval; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(saddr)) + goto e_inval; + if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) goto e_inval; - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag) < 0) - goto e_inval; - - rth = dst_alloc(&ipv4_dst_ops); + } else { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); + if (err < 0) + goto e_err; + } + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_genid = atomic_read(&rt_genid); + rth->dst.output = ip_rt_bug; + + rth->rt_genid = rt_genid_ipv4(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; + rth->rt_is_input= 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (our) { - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) - rth->u.dst.input = ip_mr_input; + rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); - in_dev_put(in_dev); - hash = rt_hash(daddr, saddr, dev->ifindex); - return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); + skb_dst_set(skb, &rth->dst); + return 0; e_nobufs: - in_dev_put(in_dev); return -ENOBUFS; - e_inval: - in_dev_put(in_dev); return -EINVAL; +e_err: + return err; } @@ -1700,143 +1510,127 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. 
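The report printed below dumps the link-layer header, since that is the only hint left when the source address is forged. A userspace sketch of the colon-separated dump the old hand-rolled loop produced (the Ethernet header bytes here are made up for illustration):

#include <stdio.h>
#include <stddef.h>

/* Mirrors the information the martian-source report logs: each byte of the
 * link-layer header, colon separated. */
static void dump_ll_header(const unsigned char *p, size_t len)
{
        printf("ll header: ");
        for (size_t i = 0; i < len; i++)
                printf("%02x%s", p[i], i + 1 < len ? ":" : "\n");
}

int main(void)
{
        const unsigned char eth[14] = {
                0xff, 0xff, 0xff, 0xff, 0xff, 0xff,     /* dst MAC */
                0x52, 0x54, 0x00, 0x12, 0x34, 0x56,     /* src MAC */
                0x08, 0x00                              /* EtherType (IPv4) */
        };

        dump_ll_header(eth, sizeof(eth));
        return 0;
}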
*/ - printk(KERN_WARNING "martian source %u.%u.%u.%u from " - "%u.%u.%u.%u, on dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + pr_warn("martian source %pI4 from %pI4, on dev %s\n", + &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { - int i; - const unsigned char *p = skb_mac_header(skb); - printk(KERN_WARNING "ll header: "); - for (i = 0; i < dev->hard_header_len; i++, p++) { - printk("%02x", *p); - if (i < (dev->hard_header_len - 1)) - printk(":"); - } - printk("\n"); + print_hex_dump(KERN_WARNING, "ll header: ", + DUMP_PREFIX_OFFSET, 16, 1, + skb_mac_header(skb), + dev->hard_header_len, true); } } #endif } -static inline int __mkroute_input(struct sk_buff *skb, - struct fib_result* res, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct rtable **result) +/* called in rcu_read_lock() section */ +static int __mkroute_input(struct sk_buff *skb, + const struct fib_result *res, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos) { - + struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; - unsigned flags = 0; - __be32 spec_dst; - u32 itag; + unsigned int flags = 0; + bool do_cache; + u32 itag = 0; /* get a working reference to the output device */ - out_dev = in_dev_get(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); if (out_dev == NULL) { - if (net_ratelimit()) - printk(KERN_CRIT "Bug in ip_route_input" \ - "_slow(). Please, report\n"); + net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return -EINVAL; } - - err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); - err = -EINVAL; goto cleanup; } - if (err) - flags |= RTCF_DIRECTSRC; - - if (out_dev == in_dev && err && !(flags & RTCF_MASQ) && + do_cache = res->fi && !itag; + if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { flags |= RTCF_DOREDIRECT; + do_cache = false; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. 
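Immediately after this check, the forwarding path tries to reuse a cached route before allocating one: a per-destination exception (find_exception()) is preferred, then the nexthop's cached input route, and either is accepted only while it is still valid (rt_cache_valid()). A toy userspace model of that lookup order, with invented structure names and a plain integer standing in for the generation id:

#include <stdio.h>
#include <stdint.h>

struct mini_rt {
        uint32_t daddr;
        int genid;
};

struct mini_exception {
        uint32_t daddr;
        struct mini_rt *rt;
        struct mini_exception *next;
};

struct mini_nh {
        struct mini_exception *exceptions;      /* models the fnhe hash chain */
        struct mini_rt *cached;                 /* models nh_rth_input */
};

static int current_genid = 1;

static struct mini_rt *lookup_fast(struct mini_nh *nh, uint32_t daddr)
{
        struct mini_exception *e;

        for (e = nh->exceptions; e; e = e->next)        /* find_exception() analogue */
                if (e->daddr == daddr && e->rt && e->rt->genid == current_genid)
                        return e->rt;
        if (nh->cached && nh->cached->genid == current_genid)   /* rt_cache_valid() analogue */
                return nh->cached;
        return NULL;                                    /* caller builds a fresh route */
}

int main(void)
{
        struct mini_rt cached = { 0, 1 }, special = { 0x0a000001, 1 };
        struct mini_exception exc = { 0x0a000001, &special, NULL };
        struct mini_nh nh = { &exc, &cached };

        printf("10.0.0.1 -> %s route\n", lookup_fast(&nh, 0x0a000001) == &special ? "exception" : "cached");
        printf("10.0.0.2 -> %s route\n", lookup_fast(&nh, 0x0a000002) == &cached ? "cached" : "exception");
        current_genid++;                /* models a cache flush bumping the genid */
        printf("after flush -> %s\n", lookup_fast(&nh, 0x0a000002) ? "hit" : "miss, rebuild");
        return 0;
}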
*/ - if (out_dev == in_dev) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } } + fnhe = find_exception(&FIB_RES_NH(*res), daddr); + if (do_cache) { + if (fnhe != NULL) + rth = rcu_dereference(fnhe->fnhe_rth_input); + else + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + goto out; + } + } - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(out_dev->dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { err = -ENOBUFS; goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; - rth->rt_gateway = daddr; - rth->rt_iif = - rth->fl.iif = in_dev->dev->ifindex; - rth->u.dst.dev = (out_dev)->dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; - rth->rt_spec_dst= spec_dst; - - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - rth->rt_genid = atomic_read(&rt_genid); - - rt_set_nexthop(rth, res, itag); - + rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); rth->rt_flags = flags; + rth->rt_type = res->type; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); - *result = rth; + rth->dst.input = ip_forward; + rth->dst.output = ip_output; + + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + skb_dst_set(skb, &rth->dst); +out: err = 0; cleanup: - /* release the working reference to the output device */ - in_dev_put(out_dev); return err; } -static inline int ip_mkroute_input(struct sk_buff *skb, - struct fib_result* res, - const struct flowi *fl, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) +static int ip_mkroute_input(struct sk_buff *skb, + struct fib_result *res, + const struct flowi4 *fl4, + struct in_device *in_dev, + __be32 daddr, __be32 saddr, u32 tos) { - struct rtable* rth = NULL; - int err; - unsigned hash; - #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) - fib_select_multipath(fl, res); + if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(res); #endif /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); - if (err) - return err; - - /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif); - return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } /* @@ -1847,29 +1641,21 @@ static inline int ip_mkroute_input(struct sk_buff *skb, * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. 
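The guarantee claimed above rests on fib_validate_source(): roughly, the source address is looked up as if we were replying, and (with reverse-path filtering enabled) the packet is treated as martian when the reverse route would not leave through the interface it arrived on. A toy model with a hard-coded two-entry FIB; the data and helper names are hypothetical, not the kernel algorithm:

#include <stdio.h>
#include <string.h>

struct toy_fib_entry { const char *prefix; const char *dev; };

static const struct toy_fib_entry fib[] = {
        { "10.1.", "eth0" },
        { "10.2.", "eth1" },
};

static const char *reverse_lookup(const char *saddr)
{
        for (size_t i = 0; i < sizeof(fib) / sizeof(fib[0]); i++)
                if (!strncmp(saddr, fib[i].prefix, strlen(fib[i].prefix)))
                        return fib[i].dev;
        return NULL;
}

static int validate_source(const char *saddr, const char *indev)
{
        const char *dev = reverse_lookup(saddr);

        return dev && !strcmp(dev, indev);      /* 0 means: treat as martian source */
}

int main(void)
{
        printf("10.1.0.5 on eth0: %s\n", validate_source("10.1.0.5", "eth0") ? "ok" : "martian");
        printf("10.1.0.5 on eth1: %s\n", validate_source("10.1.0.5", "eth1") ? "ok" : "martian");
        return 0;
}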
+ * called with rcu_read_lock() */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; - struct in_device *in_dev = in_dev_get(dev); - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = saddr, - .tos = tos, - .scope = RT_SCOPE_UNIVERSE, - } }, - .mark = skb->mark, - .iif = dev->ifindex }; - unsigned flags = 0; + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct flowi4 fl4; + unsigned int flags = 0; u32 itag = 0; - struct rtable * rth; - unsigned hash; - __be32 spec_dst; + struct rtable *rth; int err = -EINVAL; - int free_res = 0; - struct net * net = dev->nd_net; + struct net *net = dev_net(dev); + bool do_cache; /* IP on this device is disabled. */ @@ -1880,11 +1666,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; - if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) + res.fi = NULL; + if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; @@ -1893,111 +1679,124 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_zeronet(saddr)) goto martian_source; - if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || - ipv4_is_loopback(daddr)) + if (ipv4_is_zeronet(daddr)) goto martian_destination; + /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), + * and call it once if daddr or/and saddr are loopback addresses + */ + if (ipv4_is_loopback(daddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_destination; + } else if (ipv4_is_loopback(saddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + } + /* * Now we are ready to route packet. 
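For orientation, the pre-lookup filtering above amounts to a small decision table over (saddr, daddr), with a route_localnet-style flag standing in for the IN_DEV_NET_ROUTE_LOCALNET() test that legalises 127/8 on the wire. An illustrative userspace rendering, not the kernel code:

#include <arpa/inet.h>
#include <stdio.h>
#include <stdint.h>

static int pfx(uint32_t a, uint32_t net, uint32_t mask)
{
        return (a & htonl(mask)) == htonl(net);
}

static const char *classify(uint32_t saddr, uint32_t daddr, int route_localnet)
{
        if (pfx(saddr, 0xe0000000, 0xf0000000) || saddr == htonl(0xffffffff))
                return "martian source (multicast/broadcast src)";
        if (daddr == htonl(0xffffffff) || (saddr == 0 && daddr == 0))
                return "limited broadcast input";
        if (pfx(saddr, 0, 0xff000000))
                return "martian source (zeronet)";
        if (pfx(daddr, 0, 0xff000000))
                return "martian destination (zeronet)";
        if (pfx(daddr, 0x7f000000, 0xff000000) && !route_localnet)
                return "martian destination (loopback, localnet routing off)";
        if (pfx(saddr, 0x7f000000, 0xff000000) && !route_localnet)
                return "martian source (loopback, localnet routing off)";
        return "passes to fib_lookup()";
}

int main(void)
{
        printf("%s\n", classify(inet_addr("10.0.0.1"), inet_addr("127.0.0.1"), 0));
        printf("%s\n", classify(inet_addr("127.0.0.1"), inet_addr("10.0.0.2"), 1));
        printf("%s\n", classify(inet_addr("10.0.0.1"), inet_addr("255.255.255.255"), 0));
        printf("%s\n", classify(inet_addr("10.0.0.1"), inet_addr("10.0.0.2"), 0));
        return 0;
}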
*/ - if ((err = fib_lookup(net, &fl, &res)) != 0) { + fl4.flowi4_oif = 0; + fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.daddr = daddr; + fl4.saddr = saddr; + err = fib_lookup(net, &fl4, &res); + if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + err = -EHOSTUNREACH; goto no_route; } - free_res = 1; - - RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_BROADCAST) goto brd_input; if (res.type == RTN_LOCAL) { - int result; - result = fib_validate_source(saddr, daddr, tos, - net->loopback_dev->ifindex, - dev, &spec_dst, &itag); - if (result < 0) - goto martian_source; - if (result) - flags |= RTCF_DIRECTSRC; - spec_dst = daddr; + err = fib_validate_source(skb, saddr, daddr, tos, + 0, dev, in_dev, &itag); + if (err < 0) + goto martian_source_keep_err; goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; + goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); -done: - in_dev_put(in_dev); - if (free_res) - fib_res_put(&res); + err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); out: return err; brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ipv4_is_zeronet(saddr)) - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag); + if (!ipv4_is_zeronet(saddr)) { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) - goto martian_source; - if (err) - flags |= RTCF_DIRECTSRC; + goto martian_source_keep_err; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: - rth = dst_alloc(&ipv4_dst_ops); + do_cache = false; + if (res.fi) { + if (!itag) { + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; + } + do_cache = true; + } + } + + rth = rt_dst_alloc(net->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - rth->rt_genid = atomic_read(&rt_genid); - - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.input= ip_local_deliver; + rth->dst.output= ip_rt_bug; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->u.dst.dev = net->loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->u.dst.input= ip_local_deliver; + + rth->rt_genid = rt_genid_ipv4(net); rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; + rth->dst.input= ip_error; + rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif); - 
err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); - goto done; + if (do_cache) { + if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { + rth->dst.flags |= DST_NOCACHE; + rt_add_uncached_list(rth); + } + } + skb_dst_set(skb, &rth->dst); + err = 0; + goto out; no_route: RT_CACHE_STAT_INC(in_no_route); - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; if (err == -ESRCH) err = -ENETUNREACH; @@ -2009,61 +1808,32 @@ no_route: martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_WARNING "martian destination %u.%u.%u.%u from " - "%u.%u.%u.%u, dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + if (IN_DEV_LOG_MARTIANS(in_dev)) + net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", + &daddr, &saddr, dev->name); #endif -e_hostunreach: - err = -EHOSTUNREACH; - goto done; - e_inval: err = -EINVAL; - goto done; + goto out; e_nobufs: err = -ENOBUFS; - goto done; + goto out; martian_source: + err = -EINVAL; +martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto e_inval; + goto out; } -int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { - struct rtable * rth; - unsigned hash; - int iif = dev->ifindex; - struct net *net; - - net = dev->nd_net; - tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif); + int res; rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { - if (rth->fl.fl4_dst == daddr && - rth->fl.fl4_src == saddr && - rth->fl.iif == iif && - rth->fl.oif == 0 && - rth->fl.mark == skb->mark && - rth->fl.fl4_tos == tos && - rth->u.dst.dev->nd_net == net && - rth->rt_genid == atomic_read(&rt_genid)) { - dst_use(&rth->u.dst, jiffies); - RT_CACHE_STAT_INC(in_hit); - rcu_read_unlock(); - skb->dst = (struct dst_entry*)rth; - return 0; - } - RT_CACHE_STAT_INC(in_hlist_search); - } - rcu_read_unlock(); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing @@ -2077,208 +1847,185 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, route cache entry is created eventually. 
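The rt_cache_route() call in the local-input path above publishes a freshly built route into the nexthop's cache slot with a single compare-and-swap, and simply leaves the route uncached when the swap is lost. A userspace model of that publish-or-fall-back idiom, using C11 atomics and invented names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct mini_rt { int id; };

static _Atomic(struct mini_rt *) cache_slot;    /* models one nh_rth_input slot */

static bool cache_route(struct mini_rt *rt)
{
        struct mini_rt *old = atomic_load(&cache_slot);

        /* models: prev = cmpxchg(p, orig, rt); success means we now own the
         * slot and may free whatever was cached there before. */
        if (atomic_compare_exchange_strong(&cache_slot, &old, rt)) {
                free(old);
                return true;
        }
        return false;   /* caller keeps the route uncached and tracks it separately */
}

int main(void)
{
        struct mini_rt *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

        a->id = 1;
        b->id = 2;
        printf("cache a: %s\n", cache_route(a) ? "cached" : "left uncached");
        printf("cache b: %s\n", cache_route(b) ? "cached (replaced a)" : "left uncached");
        free(atomic_load(&cache_slot));
        return 0;
}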
*/ if (ipv4_is_multicast(daddr)) { - struct in_device *in_dev; + struct in_device *in_dev = __in_dev_get_rcu(dev); - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { - int our = ip_check_mc(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); + if (in_dev) { + int our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!ipv4_is_local_multicast(daddr) && - IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { + ) { + int res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); rcu_read_unlock(); - return ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); + return res; } } rcu_read_unlock(); return -EINVAL; } - return ip_route_input_slow(skb, daddr, saddr, tos, dev); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + rcu_read_unlock(); + return res; } +EXPORT_SYMBOL(ip_route_input_noref); -static inline int __mkroute_output(struct rtable **result, - struct fib_result* res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +/* called with rcu_read_lock() */ +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi4 *fl4, int orig_oif, + struct net_device *dev_out, + unsigned int flags) { - struct rtable *rth; + struct fib_info *fi = res->fi; + struct fib_nh_exception *fnhe; struct in_device *in_dev; - u32 tos = RT_FL_TOS(oldflp); - int err = 0; + u16 type = res->type; + struct rtable *rth; + bool do_cache; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) - return -EINVAL; + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) + return ERR_PTR(-EINVAL); - if (fl->fl4_dst == htonl(0xFFFFFFFF)) - res->type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl->fl4_dst)) - res->type = RTN_MULTICAST; - else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) - return -EINVAL; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); + + if (ipv4_is_lbcast(fl4->daddr)) + type = RTN_BROADCAST; + else if (ipv4_is_multicast(fl4->daddr)) + type = RTN_MULTICAST; + else if (ipv4_is_zeronet(fl4->daddr)) + return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - /* get work reference to inet device */ - in_dev = in_dev_get(dev_out); - if (!in_dev) - return -EINVAL; - - if (res->type == RTN_BROADCAST) { + do_cache = true; + if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - if (res->fi) { - fib_info_put(res->fi); - res->fi = NULL; - } - } else if (res->type == RTN_MULTICAST) { - flags |= RTCF_MULTICAST|RTCF_LOCAL; - if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, - oldflp->proto)) + fi = NULL; + } else if (type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST | RTCF_LOCAL; + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, + fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; + else + do_cache = false; /* If multicast route do not exist use - default one, but do not gateway in this case. - Yes, it is hack. + * default one, but do not gateway in this case. + * Yes, it is hack. 
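Further down in this hunk the output path picks where a new route may be cached: a destination that has a PMTU or redirect exception uses the exception's own slot, everything else shares a per-CPU slot on the nexthop. A small model of that slot choice; the layout and "CPU id" handling here are illustrative only:

#include <stdio.h>

#define NR_CPUS_MODEL 2

struct slot { const char *owner; };

struct mini_nh {
        struct slot *exception_slot;            /* models fnhe_rth_output    */
        struct slot pcpu_slot[NR_CPUS_MODEL];   /* models nh_pcpu_rth_output */
};

static struct slot *pick_cache_slot(struct mini_nh *nh, int has_exception, int cpu)
{
        if (has_exception && nh->exception_slot)
                return nh->exception_slot;
        return &nh->pcpu_slot[cpu];
}

int main(void)
{
        struct slot exc = { "exception slot for 192.0.2.9" };
        struct mini_nh nh = { &exc, { { "cpu0 slot" }, { "cpu1 slot" } } };

        printf("normal flow on cpu1 -> %s\n", pick_cache_slot(&nh, 0, 1)->owner);
        printf("excepted destination -> %s\n", pick_cache_slot(&nh, 1, 0)->owner);
        return 0;
}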
*/ - if (res->fi && res->prefixlen < 4) { - fib_info_put(res->fi); - res->fi = NULL; + if (fi && res->prefixlen < 4) + fi = NULL; + } + + fnhe = NULL; + do_cache &= fi != NULL; + if (do_cache) { + struct rtable __rcu **prth; + struct fib_nh *nh = &FIB_RES_NH(*res); + + fnhe = find_exception(nh, fl4->daddr); + if (fnhe) + prth = &fnhe->fnhe_rth_output; + else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + } + rth = rcu_dereference(*prth); + if (rt_cache_valid(rth)) { + dst_hold(&rth->dst); + return rth; } } +add: + rth = rt_dst_alloc(dev_out, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM), + do_cache); + if (!rth) + return ERR_PTR(-ENOBUFS); - rth = dst_alloc(&ipv4_dst_ops); - if (!rth) { - err = -ENOBUFS; - goto cleanup; - } + rth->dst.output = ip_output; - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - - rth->fl.fl4_dst = oldflp->fl4_dst; - rth->fl.fl4_tos = tos; - rth->fl.fl4_src = oldflp->fl4_src; - rth->fl.oif = oldflp->oif; - rth->fl.mark = oldflp->mark; - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; - rth->rt_iif = oldflp->oif ? : dev_out->ifindex; - /* get references to the devices that are to be hold by the routing - cache entry */ - rth->u.dst.dev = dev_out; - dev_hold(dev_out); - rth->idev = in_dev_get(dev_out); - rth->rt_gateway = fl->fl4_dst; - rth->rt_spec_dst= fl->fl4_src; - - rth->u.dst.output=ip_output; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); + rth->rt_flags = flags; + rth->rt_type = type; + rth->rt_is_input = 0; + rth->rt_iif = orig_oif ? : 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) { - rth->u.dst.input = ip_local_deliver; - rth->rt_spec_dst = fl->fl4_dst; - } + if (flags & RTCF_LOCAL) + rth->dst.input = ip_local_deliver; if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { - rth->u.dst.output = ip_mc_output; + rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE - if (res->type == RTN_MULTICAST) { + if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !ipv4_is_local_multicast(oldflp->fl4_dst)) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_mc_output; + !ipv4_is_local_multicast(fl4->daddr)) { + rth->dst.input = ip_mr_input; + rth->dst.output = ip_mc_output; } } #endif } - rt_set_nexthop(rth, res, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - rth->rt_flags = flags; - - *result = rth; - cleanup: - /* release work reference to inet device */ - in_dev_put(in_dev); - - return err; -} - -static inline int ip_mkroute_output(struct rtable **rp, - struct fib_result* res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ - struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); - unsigned hash; - if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif); - err = rt_intern_hash(hash, rth, rp); - } - - return err; + return rth; } /* * Major route resolver routine. 
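The resolver below no longer returns an int plus a route through a pointer argument; it returns the route itself, with failures encoded as ERR_PTR(-errno) and tested via IS_ERR(). A simplified userspace rendering of that convention (the real macros live in linux/err.h):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct route { const char *via; };

static struct route *resolve(int reachable)
{
        static struct route r = { "192.0.2.1" };

        return reachable ? &r : ERR_PTR(-ENETUNREACH);
}

int main(void)
{
        struct route *rt = resolve(0);

        if (IS_ERR(rt))
                printf("lookup failed: errno %ld\n", -PTR_ERR(rt));
        rt = resolve(1);
        if (!IS_ERR(rt))
                printf("route via %s\n", rt->via);
        return 0;
}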
*/ -static int ip_route_output_slow(struct net *net, struct rtable **rp, - const struct flowi *oldflp) -{ - u32 tos = RT_FL_TOS(oldflp); - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = oldflp->fl4_dst, - .saddr = oldflp->fl4_src, - .tos = tos & IPTOS_RT_MASK, - .scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : - RT_SCOPE_UNIVERSE), - } }, - .mark = oldflp->mark, - .iif = net->loopback_dev->ifindex, - .oif = oldflp->oif }; - struct fib_result res; - unsigned flags = 0; +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +{ struct net_device *dev_out = NULL; - int free_res = 0; - int err; - + __u8 tos = RT_FL_TOS(fl4); + unsigned int flags = 0; + struct fib_result res; + struct rtable *rth; + int orig_oif; + res.tclassid = 0; res.fi = NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif + res.table = NULL; - if (oldflp->fl4_src) { - err = -EINVAL; - if (ipv4_is_multicast(oldflp->fl4_src) || - ipv4_is_lbcast(oldflp->fl4_src) || - ipv4_is_zeronet(oldflp->fl4_src)) - goto out; + orig_oif = fl4->flowi4_oif; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) + fl4->flowi4_iif = LOOPBACK_IFINDEX; + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (fl4->saddr) { + rth = ERR_PTR(-EINVAL); + if (ipv4_is_multicast(fl4->saddr) || + ipv4_is_lbcast(fl4->saddr) || + ipv4_is_zeronet(fl4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2289,9 +2036,14 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 - && (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + if (fl4->flowi4_oif == 0 && + (ipv4_is_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr))) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = __ip_dev_find(net, fl4->saddr, false); + if (dev_out == NULL) + goto out; + /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. @@ -2307,61 +2059,61 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, Luckily, this hack is good workaround. */ - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; goto make_route; } - if (dev_out) - dev_put(dev_out); - dev_out = NULL; + + if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + if (!__ip_dev_find(net, fl4->saddr, false)) + goto out; + } } - if (oldflp->oif) { - dev_out = dev_get_by_index(net, oldflp->oif); - err = -ENODEV; + if (fl4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); + rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. 
*/ - if (__in_dev_get_rtnl(dev_out) == NULL) { - dev_put(dev_out); - goto out; /* Wrong error code */ + if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { + rth = ERR_PTR(-ENETUNREACH); + goto out; } - - if (ipv4_is_local_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF)) { - if (!fl.fl4_src) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (ipv4_is_local_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr)) { + if (!fl4->saddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } - if (!fl.fl4_src) { - if (ipv4_is_multicast(oldflp->fl4_dst)) - fl.fl4_src = inet_select_addr(dev_out, 0, - fl.fl4_scope); - else if (!oldflp->fl4_dst) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (!fl4->saddr) { + if (ipv4_is_multicast(fl4->daddr)) + fl4->saddr = inet_select_addr(dev_out, 0, + fl4->flowi4_scope); + else if (!fl4->daddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } - if (!fl.fl4_dst) { - fl.fl4_dst = fl.fl4_src; - if (!fl.fl4_dst) - fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); - if (dev_out) - dev_put(dev_out); + if (!fl4->daddr) { + fl4->daddr = fl4->saddr; + if (!fl4->daddr) + fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; - dev_hold(dev_out); - fl.oif = net->loopback_dev->ifindex; + fl4->flowi4_oif = LOOPBACK_IFINDEX; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(net, &fl, &res)) { + if (fib_lookup(net, fl4, &res)) { res.fi = NULL; - if (oldflp->oif) { + res.table = NULL; + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2380,198 +2132,161 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, likely IPv6, but we do not. */ - if (fl.fl4_src == 0) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (fl4->saddr == 0) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } - if (dev_out) - dev_put(dev_out); - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } - free_res = 1; if (res.type == RTN_LOCAL) { - if (!fl.fl4_src) - fl.fl4_src = fl.fl4_dst; - if (dev_out) - dev_put(dev_out); + if (!fl4->saddr) { + if (res.fi->fib_prefsrc) + fl4->saddr = res.fi->fib_prefsrc; + else + fl4->saddr = fl4->daddr; + } dev_out = net->loopback_dev; - dev_hold(dev_out); - fl.oif = dev_out->ifindex; - if (res.fi) - fib_info_put(res.fi); - res.fi = NULL; + fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl.oif == 0) - fib_select_multipath(&fl, &res); + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) + fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(net, &fl, &res); + if (!res.prefixlen && + res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(&res); - if (!fl.fl4_src) - fl.fl4_src = FIB_RES_PREFSRC(res); + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, res); - if (dev_out) - dev_put(dev_out); dev_out = FIB_RES_DEV(res); - dev_hold(dev_out); - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); +out: + rcu_read_unlock(); + return rth; +} +EXPORT_SYMBOL_GPL(__ip_route_output_key); - if (free_res) - fib_res_put(&res); - if (dev_out) - dev_put(dev_out); -out: return err; 
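The functions introduced next implement a "blackhole" route: an operations table that accepts every call but never transmits, so code that must keep holding some dst (for example while an IPsec lookup is still pending) has a safe object to hold. A simplified userspace sketch of the pattern, not the kernel API:

#include <stdio.h>

struct pkt { int len; };

struct dst_ops_like {
        int  (*output)(struct pkt *p);
        void (*update_pmtu)(unsigned int mtu);
        void (*redirect)(const char *new_gw);
};

static int discard_output(struct pkt *p)
{
        (void)p;
        return -1;              /* drop; never reaches a device */
}
static void ignore_pmtu(unsigned int mtu) { (void)mtu; }
static void ignore_redirect(const char *gw) { (void)gw; }

static const struct dst_ops_like blackhole_ops = {
        .output      = discard_output,
        .update_pmtu = ignore_pmtu,
        .redirect    = ignore_redirect,
};

int main(void)
{
        struct pkt p = { 100 };

        blackhole_ops.update_pmtu(1400);        /* silently ignored */
        printf("output() -> %d (dropped)\n", blackhole_ops.output(&p));
        return 0;
}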
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; } -int __ip_route_output_key(struct net *net, struct rtable **rp, - const struct flowi *flp) +static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) { - unsigned hash; - struct rtable *rth; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif); - - rcu_read_lock_bh(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { - if (rth->fl.fl4_dst == flp->fl4_dst && - rth->fl.fl4_src == flp->fl4_src && - rth->fl.iif == 0 && - rth->fl.oif == flp->oif && - rth->fl.mark == flp->mark && - !((rth->fl.fl4_tos ^ flp->fl4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK)) && - rth->u.dst.dev->nd_net == net && - rth->rt_genid == atomic_read(&rt_genid)) { - dst_use(&rth->u.dst, jiffies); - RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); - *rp = rth; - return 0; - } - RT_CACHE_STAT_INC(out_hlist_search); - } - rcu_read_unlock_bh(); + return mtu ? : dst->dev->mtu; +} - return ip_route_output_slow(net, rp, flp); +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) { + return NULL; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), - .destroy = ipv4_dst_destroy, - .check = ipv4_dst_check, + .protocol = cpu_to_be16(ETH_P_IP), + .check = ipv4_blackhole_dst_check, + .mtu = ipv4_blackhole_mtu, + .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, - .entry_size = sizeof(struct rtable), - .entries = ATOMIC_INIT(0), + .redirect = ipv4_rt_blackhole_redirect, + .cow_metrics = ipv4_rt_blackhole_cow_metrics, + .neigh_lookup = ipv4_neigh_lookup, }; - -static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk) +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = *rp; - struct rtable *rt = (struct rtable *) - dst_alloc(&ipv4_dst_blackhole_ops); + struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *rt; + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (rt) { - struct dst_entry *new = &rt->u.dst; + struct dst_entry *new = &rt->dst; - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; - new->output = dst_discard; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + new->output = dst_discard_sk; - new->dev = ort->u.dst.dev; + new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); - rt->fl = ort->fl; + rt->rt_is_input = ort->rt_is_input; + rt->rt_iif = ort->rt_iif; + rt->rt_pmtu = ort->rt_pmtu; - rt->idev = ort->idev; - if (rt->idev) - in_dev_hold(rt->idev); - rt->rt_genid = atomic_read(&rt_genid); + rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_dst = ort->rt_dst; - rt->rt_src = ort->rt_src; - rt->rt_iif = ort->rt_iif; rt->rt_gateway = ort->rt_gateway; - rt->rt_spec_dst = ort->rt_spec_dst; - rt->peer = ort->peer; - if (rt->peer) - atomic_inc(&rt->peer->refcnt); + rt->rt_uses_gateway = ort->rt_uses_gateway; + + 
INIT_LIST_HEAD(&rt->rt_uncached); dst_free(new); } - dst_release(&(*rp)->u.dst); - *rp = rt; - return (rt ? 0 : -ENOMEM); + dst_release(dst_orig); + + return rt ? &rt->dst : ERR_PTR(-ENOMEM); } -int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, - struct sock *sk, int flags) +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, + struct sock *sk) { - int err; - - if ((err = __ip_route_output_key(net, rp, flp)) != 0) - return err; + struct rtable *rt = __ip_route_output_key(net, flp4); - if (flp->proto) { - if (!flp->fl4_src) - flp->fl4_src = (*rp)->rt_src; - if (!flp->fl4_dst) - flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, - flags ? XFRM_LOOKUP_WAIT : 0); - if (err == -EREMOTE) - err = ipv4_dst_blackhole(rp, flp, sk); + if (IS_ERR(rt)) + return rt; - return err; - } + if (flp4->flowi4_proto) + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); - return 0; + return rt; } - EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, + struct flowi4 *fl4, struct sk_buff *skb, u32 portid, + u32 seq, int event, int nowait, unsigned int flags) { - return ip_route_output_flow(net, rp, flp, NULL, 0); -} - -static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - int nowait, unsigned int flags) -{ - struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; - long expires; - u32 id = 0, ts = 0, tsage = 0, error; + unsigned long expires = 0; + u32 error; + u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); if (nlh == NULL) return -EMSGSIZE; @@ -2579,9 +2294,10 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->fl.fl4_tos; + r->rtm_tos = fl4->flowi4_tos; r->rtm_table = RT_TABLE_MAIN; - NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); + if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; @@ -2589,46 +2305,59 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; - NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); - - if (rt->fl.fl4_src) { + if (nla_put_be32(skb, RTA_DST, dst)) + goto nla_put_failure; + if (src) { r->rtm_src_len = 32; - NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); + if (nla_put_be32(skb, RTA_SRC, src)) + goto nla_put_failure; } - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE - if (rt->u.dst.tclassid) - NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID + if (rt->dst.tclassid && + nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) + goto nla_put_failure; #endif - if (rt->fl.iif) - NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); - else if (rt->rt_src != rt->fl.fl4_src) - NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); + if (!rt_is_input_route(rt) && + fl4->saddr != src) { + if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) + goto nla_put_failure; + } + if (rt->rt_uses_gateway && + nla_put_be32(skb, 
RTA_GATEWAY, rt->rt_gateway)) + goto nla_put_failure; - if (rt->rt_dst != rt->rt_gateway) - NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); + expires = rt->dst.expires; + if (expires) { + unsigned long now = jiffies; - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + } + + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt_pmtu && expires) + metrics[RTAX_MTU - 1] = rt->rt_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - error = rt->u.dst.error; - expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; - if (rt->peer) { - id = rt->peer->ip_id_count; - if (rt->peer->tcp_ts_stamp) { - ts = rt->peer->tcp_ts; - tsage = get_seconds() - rt->peer->tcp_ts_stamp; - } - } + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) + goto nla_put_failure; - if (rt->fl.iif) { -#ifdef CONFIG_IP_MROUTE - __be32 dst = rt->rt_dst; + error = rt->dst.error; + if (rt_is_input_route(rt)) { +#ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) { - int err = ipmr_get_route(skb, r, nowait); + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, + fl4->saddr, fl4->daddr, + r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) @@ -2642,11 +2371,11 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, } } else #endif - NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) + goto nla_put_failure; } - if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, - expires, error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -2656,21 +2385,20 @@ nla_put_failure: return -EMSGSIZE; } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { - struct net *net = in_skb->sk->sk_net; + struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; + struct flowi4 fl4; __be32 dst = 0; __be32 src = 0; u32 iif; int err; + int mark; struct sk_buff *skb; - if (net != &init_net) - return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) goto errout; @@ -2696,11 +2424,19 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; + mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst; + fl4.saddr = src; + fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0; + fl4.flowi4_mark = mark; if (iif) { struct net_device *dev; - dev = __dev_get_by_index(&init_net, iif); + dev = __dev_get_by_index(net, iif); if (dev == NULL) { err = -ENODEV; goto errout_free; @@ -2708,40 +2444,36 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb->protocol = htons(ETH_P_IP); skb->dev = dev; + skb->mark = mark; local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); - rt = (struct rtable*) skb->dst; - if (err == 0 && rt->u.dst.error) - err = -rt->u.dst.error; + rt = skb_rtable(skb); + if (err == 0 && rt->dst.error) + err = -rt->dst.error; } else { - struct flowi fl = { - .nl_u = { - .ip4_u = { - .daddr = dst, - .saddr = src, - .tos = rtm->rtm_tos, - }, - }, - .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, - }; - err = ip_route_output_key(&init_net, &rt, &fl); + rt = ip_route_output_key(net, &fl4); + + err = 0; + if (IS_ERR(rt)) + err = PTR_ERR(rt); } if (err) goto errout_free; - skb->dst = &rt->u.dst; + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, - RTM_NEWROUTE, 0, 0); + err = rt_fill_info(net, dst, src, &fl4, skb, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; - err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; @@ -2750,255 +2482,251 @@ errout_free: goto errout; } -int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct rtable *rt; - int h, s_h; - int idx, s_idx; - - s_h = cb->args[0]; - if (s_h < 0) - s_h = 0; - s_idx = idx = cb->args[1]; - for (h = s_h; h <= rt_hash_mask; h++) { - rcu_read_lock_bh(); - for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference(rt->u.dst.rt_next), idx++) { - if (idx < s_idx) - continue; - if (rt->rt_genid != atomic_read(&rt_genid)) - continue; - skb->dst = dst_clone(&rt->u.dst); - if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) <= 0) { - dst_release(xchg(&skb->dst, NULL)); - rcu_read_unlock_bh(); - goto done; - } - dst_release(xchg(&skb->dst, NULL)); - } - rcu_read_unlock_bh(); - s_idx = 0; - } - -done: - cb->args[0] = h; - cb->args[1] = idx; - return skb->len; -} - void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL -static int flush_delay; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly = HZ / 2; +static int ip_rt_gc_elasticity __read_mostly = 8; -static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, +static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = (struct net *)__ctl->extra1; + if (write) { - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - rt_cache_flush(flush_delay); + rt_cache_flush(net); + fnhe_genid_bump(net); return 0; } return -EINVAL; } -static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - int __user *name, - int nlen, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, - size_t newlen) -{ - int delay; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(delay, (int __user 
*)newval)) - return -EFAULT; - rt_cache_flush(delay); - return 0; -} - -ctl_table ipv4_route_table[] = { - { - .ctl_name = NET_IPV4_ROUTE_FLUSH, - .procname = "flush", - .data = &flush_delay, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = &ipv4_sysctl_rtcache_flush, - .strategy = &ipv4_sysctl_rtcache_flush_strategy, - }, +static struct ctl_table ipv4_route_table[] = { { - .ctl_name = NET_IPV4_ROUTE_GC_THRESH, .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { /* Deprecated. Use gc_min_interval_ms */ - .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL, .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_ms_jiffies, - .strategy = &sysctl_ms_jiffies, + .proc_handler = proc_dointvec_ms_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_ERROR_COST, .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, .procname = "mtu_expires", .data = &ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, .procname = "min_pmtu", .data = &ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = 
&proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, .procname = "min_adv_mss", .data = &ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, + { } +}; + +static struct ctl_table ipv4_route_flush_table[] = { { - .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, - .procname = "secret_interval", - .data = &ip_rt_secret_interval, + .procname = "flush", .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .mode = 0200, + .proc_handler = ipv4_sysctl_rtcache_flush, }, - { .ctl_name = 0 } + { }, +}; + +static __net_init int sysctl_route_net_init(struct net *net) +{ + struct ctl_table *tbl; + + tbl = ipv4_route_flush_table; + if (!net_eq(net, &init_net)) { + tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; + } + tbl[0].extra1 = net; + + net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); + if (net->ipv4.route_hdr == NULL) + goto err_reg; + return 0; + +err_reg: + if (tbl != ipv4_route_flush_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_route_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->ipv4.route_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.route_hdr); + BUG_ON(tbl == ipv4_route_flush_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_route_ops = { + .init = sysctl_route_net_init, + .exit = sysctl_route_net_exit, }; #endif -#ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +static __net_init int rt_genid_init(struct net *net) +{ + atomic_set(&net->ipv4.rt_genid, 0); + atomic_set(&net->fnhe_genid, 0); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); + return 0; +} + +static __net_initdata struct pernet_operations rt_genid_ops = { + .init = rt_genid_init, +}; -static __initdata unsigned long rhash_entries; -static int __init set_rhash_entries(char *str) +static int __net_init ipv4_inetpeer_init(struct net *net) { - if (!str) - return 0; - rhash_entries = simple_strtoul(str, &str, 0); - return 1; + struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); + + if (!bp) + return -ENOMEM; + inet_peer_base_init(bp); + net->ipv4.peers = bp; + return 0; +} + +static void __net_exit ipv4_inetpeer_exit(struct net *net) +{ + struct inet_peer_base *bp = net->ipv4.peers; + + net->ipv4.peers = NULL; + inetpeer_invalidate_tree(bp); + kfree(bp); } -__setup("rhash_entries=", set_rhash_entries); + +static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { + .init = ipv4_inetpeer_init, + .exit = ipv4_inetpeer_exit, +}; + +#ifdef CONFIG_IP_ROUTE_CLASSID +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; +#endif /* CONFIG_IP_ROUTE_CLASSID */ int __init ip_rt_init(void) { int rc = 0; - atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); -#ifdef CONFIG_NET_CLS_ROUTE - ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); +#ifdef CONFIG_IP_ROUTE_CLASSID + ip_rt_acct = __alloc_percpu(256 * sizeof(struct 
ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif @@ -3009,48 +2737,41 @@ int __init ip_rt_init(void) ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; - rt_hash_table = (struct rt_hash_bucket *) - alloc_large_system_hash("IP route cache", - sizeof(struct rt_hash_bucket), - rhash_entries, - (num_physpages >= 128 * 1024) ? - 15 : 17, - 0, - &rt_hash_log, - &rt_hash_mask, - 0); - memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); - rt_hash_lock_init(); - - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; + if (dst_entries_init(&ipv4_dst_ops) < 0) + panic("IP: failed to allocate ipv4_dst_ops counter\n"); - devinet_init(); - ip_fib_init(); + if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) + panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); - setup_timer(&rt_secret_timer, rt_secret_rebuild, 0); + ipv4_dst_ops.gc_thresh = ~0; + ip_rt_max_size = INT_MAX; - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - - rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&rt_secret_timer); + devinet_init(); + ip_fib_init(); - if (ip_rt_proc_init(&init_net)) - printk(KERN_ERR "Unable to create route proc files\n"); + if (ip_rt_proc_init()) + pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); +#ifdef CONFIG_SYSCTL + register_pernet_subsys(&sysctl_route_ops); +#endif + register_pernet_subsys(&rt_genid_ops); + register_pernet_subsys(&ipv4_inetpeer_ops); return rc; } -EXPORT_SYMBOL(__ip_select_ident); -EXPORT_SYMBOL(ip_route_input); -EXPORT_SYMBOL(ip_route_output_key); +#ifdef CONFIG_SYSCTL +/* + * We really need to sanitize the damn ipv4 init order, then all + * this nonsense will go away. + */ +void __init ip_static_sysctl_init(void) +{ + register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); +} +#endif |
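A note on the per-namespace sysctl wiring added above: sysctl_route_net_init() clones the one-entry "flush" template for every netns and points extra1 back at the owning namespace, so the write handler can flush the right cache. A userspace model of that clone-and-point pattern, with kmemdup() approximated by malloc() plus memcpy() and all names illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctl_entry {
        const char *procname;
        void *extra1;           /* points back at the owning namespace */
};

struct netns { const char *name; };

static const struct ctl_entry flush_template[] = {
        { .procname = "flush" },
        { 0 }                   /* terminator */
};

static struct ctl_entry *register_for_netns(struct netns *net)
{
        struct ctl_entry *tbl = malloc(sizeof(flush_template));

        if (!tbl)
                return NULL;
        memcpy(tbl, flush_template, sizeof(flush_template));
        tbl[0].extra1 = net;    /* the handler recovers the netns from here */
        return tbl;
}

int main(void)
{
        struct netns a = { "init_net" }, b = { "container" };
        struct ctl_entry *ta = register_for_netns(&a);
        struct ctl_entry *tb = register_for_netns(&b);

        printf("%s table for %s\n", ta[0].procname, ((struct netns *)ta[0].extra1)->name);
        printf("%s table for %s\n", tb[0].procname, ((struct netns *)tb[0].extra1)->name);
        free(ta);
        free(tb);
        return 0;
}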
