diff options
Diffstat (limited to 'net/ipv4/route.c')
| -rw-r--r-- | net/ipv4/route.c | 3327 |
1 files changed, 1344 insertions, 1983 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e446496f564..190199851c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -62,14 +62,14 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "IPv4: " fmt + #include <linux/module.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -79,7 +79,6 @@ #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> -#include <linux/workqueue.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> @@ -87,9 +86,10 @@ #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> +#include <linux/slab.h> +#include <linux/jhash.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -106,68 +106,71 @@ #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> +#include <linux/kmemleak.h> #endif +#include <net/secure_seq.h> -#define RT_FL_TOS(oldflp) \ - ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) - -#define IP_MAX_MTU 0xFFF0 +#define RT_FL_TOS(oldflp4) \ + ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; -static int ip_rt_gc_interval __read_mostly = 60 * HZ; -static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; -static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; -static int rt_chain_length_max __read_mostly = 20; - -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; /* * Interface to generic destination cache. */ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); -static void ipv4_dst_destroy(struct dst_entry *dst); -static void ipv4_dst_ifdown(struct dst_entry *dst, - struct net_device *dev, int how); +static unsigned int ipv4_default_advmss(const struct dst_entry *dst); +static unsigned int ipv4_mtu(const struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(struct dst_ops *ops); -static void rt_emergency_hash_rebuild(struct net *net); +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu); +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb); +static void ipv4_dst_destroy(struct dst_entry *dst); + +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + WARN_ON(1); + return NULL; +} +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), - .gc = rt_garbage_collect, .check = ipv4_dst_check, + .default_advmss = ipv4_default_advmss, + .mtu = ipv4_mtu, + .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, - .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, + .redirect = ip_do_redirect, .local_out = __ip_local_out, - .entries = ATOMIC_INIT(0), + .neigh_lookup = ipv4_neigh_lookup, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, - ECN_OR_COST(FILLER), + ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, @@ -183,188 +186,27 @@ const __u8 ip_tos2prio[16] = { TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; - - -/* - * Route cache. - */ - -/* The locking scheme is rather straight forward: - * - * 1) Read-Copy Update protects the buckets of the central route hash. - * 2) Only writers remove entries, and they hold the lock - * as they look at rtable reference counts. - * 3) Only readers acquire references to rtable entries, - * they do so with atomic increments and with the - * lock held. - */ - -struct rt_hash_bucket { - struct rtable *chain; -}; - -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) -/* - * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks - * The size of this table is a power of two and depends on the number of CPUS. - * (on lockdep we have a quite big spinlock_t, so keep the size down there) - */ -#ifdef CONFIG_LOCKDEP -# define RT_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define RT_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define RT_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define RT_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define RT_HASH_LOCK_SZ 512 -# else -# define RT_HASH_LOCK_SZ 256 -# endif -#endif - -static spinlock_t *rt_hash_locks; -# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] - -static __init void rt_hash_lock_init(void) -{ - int i; - - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, - GFP_KERNEL); - if (!rt_hash_locks) - panic("IP: failed to allocate rt_hash_locks\n"); - - for (i = 0; i < RT_HASH_LOCK_SZ; i++) - spin_lock_init(&rt_hash_locks[i]); -} -#else -# define rt_hash_lock_addr(slot) NULL - -static inline void rt_hash_lock_init(void) -{ -} -#endif - -static struct rt_hash_bucket *rt_hash_table __read_mostly; -static unsigned rt_hash_mask __read_mostly; -static unsigned int rt_hash_log __read_mostly; +EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) \ - (__raw_get_cpu_var(rt_cache_stat).field++) - -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) -{ - return jhash_3words((__force u32)(__be32)(daddr), - (__force u32)(__be32)(saddr), - idx, genid) - & rt_hash_mask; -} - -static inline int rt_genid(struct net *net) -{ - return atomic_read(&net->ipv4.rt_genid); -} +#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS -struct rt_cache_iter_state { - struct seq_net_private p; - int bucket; - int genid; -}; - -static struct rtable *rt_cache_get_first(struct seq_file *seq) -{ - struct rt_cache_iter_state *st = seq->private; - struct rtable *r = NULL; - - for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rt_hash_table[st->bucket].chain) - continue; - rcu_read_lock_bh(); - r = rcu_dereference(rt_hash_table[st->bucket].chain); - while (r) { - if (dev_net(r->u.dst.dev) == seq_file_net(seq) && - r->rt_genid == st->genid) - return r; - r = rcu_dereference(r->u.dst.rt_next); - } - rcu_read_unlock_bh(); - } - return r; -} - -static struct rtable *__rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - - r = r->u.dst.rt_next; - while (!r) { - rcu_read_unlock_bh(); - do { - if (--st->bucket < 0) - return NULL; - } while (!rt_hash_table[st->bucket].chain); - rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; - } - return rcu_dereference(r); -} - -static struct rtable *rt_cache_get_next(struct seq_file *seq, - struct rtable *r) -{ - struct rt_cache_iter_state *st = seq->private; - while ((r = __rt_cache_get_next(seq, r)) != NULL) { - if (dev_net(r->u.dst.dev) != seq_file_net(seq)) - continue; - if (r->rt_genid == st->genid) - break; - } - return r; -} - -static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) -{ - struct rtable *r = rt_cache_get_first(seq); - - if (r) - while (pos && (r = rt_cache_get_next(seq, r))) - --pos; - return pos ? NULL : r; -} - static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { - struct rt_cache_iter_state *st = seq->private; if (*pos) - return rt_cache_get_idx(seq, *pos - 1); - st->genid = rt_genid(seq_file_net(seq)); + return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct rtable *r; - - if (v == SEQ_START_TOKEN) - r = rt_cache_get_first(seq); - else - r = rt_cache_get_next(seq, v); ++*pos; - return r; + return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { - if (v && v != SEQ_START_TOKEN) - rcu_read_unlock_bh(); } static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -374,29 +216,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); - else { - struct rtable *r = v; - int len; - - seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" - "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->u.dst.dev ? r->u.dst.dev->name : "*", - (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (unsigned long)r->rt_src, - (dst_metric(&r->u.dst, RTAX_ADVMSS) ? - (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), - dst_metric(&r->u.dst, RTAX_WINDOW), - (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + - dst_metric(&r->u.dst, RTAX_RTTVAR)), - r->fl.fl4_tos, - r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, - r->u.dst.hh ? (r->u.dst.hh->hh_output == - dev_queue_xmit) : 0, - r->rt_spec_dst, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); - } return 0; } @@ -409,8 +228,7 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &rt_cache_seq_ops, - sizeof(struct rt_cache_iter_state)); + return seq_open(file, &rt_cache_seq_ops); } static const struct file_operations rt_cache_seq_fops = { @@ -418,7 +236,7 @@ static const struct file_operations rt_cache_seq_fops = { .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_net, + .release = seq_release, }; @@ -468,8 +286,8 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", - atomic_read(&ipv4_dst_ops.entries), - st->in_hit, + dst_entries_get_slow(&ipv4_dst_ops), + 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, @@ -477,16 +295,16 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) st->in_martian_dst, st->in_martian_src, - st->out_hit, + 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, - st->gc_total, - st->gc_ignored, - st->gc_goal_miss, - st->gc_dst_overflow, - st->in_hlist_search, - st->out_hlist_search + 0, /* st->gc_total */ + 0, /* st->gc_ignored */ + 0, /* st->gc_goal_miss */ + 0, /* st->gc_dst_overflow */ + 0, /* st->in_hlist_search */ + 0 /* st->out_hlist_search */ ); return 0; } @@ -512,7 +330,7 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; @@ -555,8 +373,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, - &rt_cache_seq_fops); + pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + &rt_cache_seq_fops); if (!pde) goto err1; @@ -565,14 +383,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -586,7 +404,9 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); +#ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); +#endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { @@ -606,746 +426,306 @@ static inline int ip_rt_proc_init(void) } #endif /* CONFIG_PROC_FS */ -static inline void rt_free(struct rtable *rt) -{ - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); -} - -static inline void rt_drop(struct rtable *rt) +static inline bool rt_is_expired(const struct rtable *rth) { - ip_rt_put(rt); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); } -static inline int rt_fast_clean(struct rtable *rth) +void rt_cache_flush(struct net *net) { - /* Kill broadcast/multicast entries very aggresively, if they - collide in hash table with more useful entries */ - return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->fl.iif && rth->u.dst.rt_next; + rt_genid_bump_ipv4(net); } -static inline int rt_valuable(struct rtable *rth) +static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) { - return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->u.dst.expires; -} + struct net_device *dev = dst->dev; + const __be32 *pkey = daddr; + const struct rtable *rt; + struct neighbour *n; -static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) -{ - unsigned long age; - int ret = 0; + rt = (const struct rtable *) dst; + if (rt->rt_gateway) + pkey = (const __be32 *) &rt->rt_gateway; + else if (skb) + pkey = &ip_hdr(skb)->daddr; - if (atomic_read(&rth->u.dst.__refcnt)) - goto out; + n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); + if (n) + return n; + return neigh_create(&arp_tbl, pkey, dev); +} - ret = 1; - if (rth->u.dst.expires && - time_after_eq(jiffies, rth->u.dst.expires)) - goto out; +#define IP_IDENTS_SZ 2048u +struct ip_ident_bucket { + atomic_t id; + u32 stamp32; +}; - age = jiffies - rth->u.dst.lastuse; - ret = 0; - if ((age <= tmo1 && !rt_fast_clean(rth)) || - (age <= tmo2 && rt_valuable(rth))) - goto out; - ret = 1; -out: return ret; -} +static struct ip_ident_bucket *ip_idents __read_mostly; -/* Bits of score are: - * 31: very valuable - * 30: not quite useless - * 29..0: usage counter +/* In order to protect privacy, we add a perturbation to identifiers + * if one generator is seldom used. This makes hard for an attacker + * to infer how many packets were sent between two points in time. */ -static inline u32 rt_score(struct rtable *rt) +u32 ip_idents_reserve(u32 hash, int segs) { - u32 score = jiffies - rt->u.dst.lastuse; - - score = ~score & ~(3<<30); + struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ; + u32 old = ACCESS_ONCE(bucket->stamp32); + u32 now = (u32)jiffies; + u32 delta = 0; - if (rt_valuable(rt)) - score |= (1<<31); + if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) + delta = prandom_u32_max(now - old); - if (!rt->fl.iif || - !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) - score |= (1<<30); - - return score; -} - -static inline bool rt_caching(const struct net *net) -{ - return net->ipv4.current_rt_cache_rebuild_count <= - net->ipv4.sysctl_rt_cache_rebuild_count; + return atomic_add_return(segs + delta, &bucket->id) - segs; } +EXPORT_SYMBOL(ip_idents_reserve); -static inline bool compare_hash_inputs(const struct flowi *fl1, - const struct flowi *fl2) +void __ip_select_ident(struct iphdr *iph, int segs) { - return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | - (fl1->iif ^ fl2->iif)) == 0); -} + static u32 ip_idents_hashrnd __read_mostly; + u32 hash, id; -static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) -{ - return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | - (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->nl_u.ip4_u.tos ^ - *(u16 *)&fl2->nl_u.ip4_u.tos) | - (fl1->oif ^ fl2->oif) | - (fl1->iif ^ fl2->iif)) == 0; -} + net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); -static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) -{ - return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); + hash = jhash_3words((__force u32)iph->daddr, + (__force u32)iph->saddr, + iph->protocol, + ip_idents_hashrnd); + id = ip_idents_reserve(hash, segs); + iph->id = htons(id); } +EXPORT_SYMBOL(__ip_select_ident); -static inline int rt_is_expired(struct rtable *rth) +static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct iphdr *iph, + int oif, u8 tos, + u8 prot, u32 mark, int flow_flags) { - return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); -} + if (sk) { + const struct inet_sock *inet = inet_sk(sk); -/* - * Perform a full scan of hash table and free all entries. - * Can be called by a softirq or a process. - * In the later case, we want to be reschedule if necessary - */ -static void rt_do_flush(int process_context) -{ - unsigned int i; - struct rtable *rth, *next; - struct rtable * tail; - - for (i = 0; i <= rt_hash_mask; i++) { - if (process_context && need_resched()) - cond_resched(); - rth = rt_hash_table[i].chain; - if (!rth) - continue; - - spin_lock_bh(rt_hash_lock_addr(i)); -#ifdef CONFIG_NET_NS - { - struct rtable ** prev, * p; - - rth = rt_hash_table[i].chain; - - /* defer releasing the head of the list after spin_unlock */ - for (tail = rth; tail; tail = tail->u.dst.rt_next) - if (!rt_is_expired(tail)) - break; - if (rth != tail) - rt_hash_table[i].chain = tail; - - /* call rt_free on entries after the tail requiring flush */ - prev = &rt_hash_table[i].chain; - for (p = *prev; p; p = next) { - next = p->u.dst.rt_next; - if (!rt_is_expired(p)) { - prev = &p->u.dst.rt_next; - } else { - *prev = next; - rt_free(p); - } - } - } -#else - rth = rt_hash_table[i].chain; - rt_hash_table[i].chain = NULL; - tail = NULL; -#endif - spin_unlock_bh(rt_hash_lock_addr(i)); - - for (; rth != tail; rth = next) { - next = rth->u.dst.rt_next; - rt_free(rth); - } + oif = sk->sk_bound_dev_if; + mark = sk->sk_mark; + tos = RT_CONN_FLAGS(sk); + prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; } + flowi4_init_output(fl4, oif, mark, tos, + RT_SCOPE_UNIVERSE, prot, + flow_flags, + iph->daddr, iph->saddr, 0, 0); } -/* - * While freeing expired entries, we compute average chain length - * and standard deviation, using fixed-point arithmetic. - * This to have an estimation of rt_chain_length_max - * rt_chain_length_max = max(elasticity, AVG + 4*SD) - * We use 3 bits for frational part, and 29 (or 61) for magnitude. - */ - -#define FRACT_BITS 3 -#define ONE (1UL << FRACT_BITS) - -static void rt_check_expire(void) +static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, + const struct sock *sk) { - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth, *aux, **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (*rthp == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = *rthp) != NULL) { - prefetch(rth->u.dst.rt_next); - if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; - rt_free(rth); - continue; - } - if (rth->u.dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->u.dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - for (aux = rt_hash_table[i].chain;;) { - if (aux == rth) { - length += ONE; - break; - } - if (compare_hash_inputs(&aux->fl, &rth->fl)) - break; - aux = aux->u.dst.rt_next; - } - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; + const struct iphdr *iph = ip_hdr(skb); + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; - /* Cleanup aged off entries. */ - *rthp = rth->u.dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; + __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); } -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) +static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} + const struct inet_sock *inet = inet_sk(sk); + const struct ip_options_rcu *inet_opt; + __be32 daddr = inet->inet_daddr; -/* - * Pertubation of rt_genid by a small quantity [1..256] - * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() - * many times (2^24) without giving recent rt_genid. - * Jenkins hash is strong enough that litle changes of rt_genid are OK. - */ -static void rt_cache_invalidate(struct net *net) -{ - unsigned char shuffle; - - get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &net->ipv4.rt_genid); + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_sk_flowi_flags(sk), + daddr, inet->inet_saddr, 0, 0); + rcu_read_unlock(); } -/* - * delay < 0 : invalidate cache (fast : entries will be deleted later) - * delay >= 0 : invalidate & flush cache (can be long) - */ -void rt_cache_flush(struct net *net, int delay) +static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, + const struct sk_buff *skb) { - rt_cache_invalidate(net); - if (delay >= 0) - rt_do_flush(!in_softirq()); + if (skb) + build_skb_flow_key(fl4, skb, sk); + else + build_sk_flow_key(fl4, sk); } -/* Flush previous cache invalidated entries from the cache */ -void rt_cache_flush_batch(void) +static inline void rt_free(struct rtable *rt) { - rt_do_flush(!in_softirq()); + call_rcu(&rt->dst.rcu_head, dst_rcu_free); } -/* - * We change rt_genid and let gc do the cleanup - */ -static void rt_secret_rebuild(unsigned long __net) -{ - struct net *net = (struct net *)__net; - rt_cache_invalidate(net); - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} +static DEFINE_SPINLOCK(fnhe_lock); -static void rt_secret_rebuild_oneshot(struct net *net) +static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { - del_timer_sync(&net->ipv4.rt_secret_timer); - rt_cache_invalidate(net); - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); + struct rtable *rt; + + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); + rt_free(rt); + } + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) { + RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); + rt_free(rt); } } -static void rt_emergency_hash_rebuild(struct net *net) +static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) { - if (net_ratelimit()) { - printk(KERN_WARNING "Route hash chain too long!\n"); - printk(KERN_WARNING "Adjust your secret_interval!\n"); - } + struct fib_nh_exception *fnhe, *oldest; - rt_secret_rebuild_oneshot(net); + oldest = rcu_dereference(hash->chain); + for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + oldest = fnhe; + } + fnhe_flush_routes(oldest); + return oldest; } -/* - Short description of GC goals. - - We want to build algorithm, which will keep routing cache - at some equilibrium point, when number of aged off entries - is kept approximately equal to newly generated ones. - - Current expiration strength is variable "expire". - We try to adjust it dynamically, so that if networking - is idle expires is large enough to keep enough of warm entries, - and when load increases it reduces to limit cache size. - */ - -static int rt_garbage_collect(struct dst_ops *ops) +static inline u32 fnhe_hashfun(__be32 daddr) { - static unsigned long expire = RT_GC_TIMEOUT; - static unsigned long last_gc; - static int rover; - static int equilibrium; - struct rtable *rth, **rthp; - unsigned long now = jiffies; - int goal; - - /* - * Garbage collection is pretty expensive, - * do not make it too frequently. - */ + u32 hval; - RT_CACHE_STAT_INC(gc_total); + hval = (__force u32) daddr; + hval ^= (hval >> 11) ^ (hval >> 22); - if (now - last_gc < ip_rt_gc_min_interval && - atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { - RT_CACHE_STAT_INC(gc_ignored); - goto out; - } - - /* Calculate number of entries, which we want to expire now. */ - goal = atomic_read(&ipv4_dst_ops.entries) - - (ip_rt_gc_elasticity << rt_hash_log); - if (goal <= 0) { - if (equilibrium < ipv4_dst_ops.gc_thresh) - equilibrium = ipv4_dst_ops.gc_thresh; - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; - if (goal > 0) { - equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; - } - } else { - /* We are in dangerous area. Try to reduce cache really - * aggressively. - */ - goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); - equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; - } + return hval & (FNHE_HASH_SIZE - 1); +} - if (now - last_gc >= ip_rt_gc_min_interval) - last_gc = now; +static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) +{ + rt->rt_pmtu = fnhe->fnhe_pmtu; + rt->dst.expires = fnhe->fnhe_expires; - if (goal <= 0) { - equilibrium += goal; - goto work_done; + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; + rt->rt_uses_gateway = 1; } - - do { - int i, k; - - for (i = rt_hash_mask, k = rover; i >= 0; i--) { - unsigned long tmo = expire; - - k = (k + 1) & rt_hash_mask; - rthp = &rt_hash_table[k].chain; - spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = *rthp) != NULL) { - if (!rt_is_expired(rth) && - !rt_may_expire(rth, tmo, expire)) { - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - continue; - } - *rthp = rth->u.dst.rt_next; - rt_free(rth); - goal--; - } - spin_unlock_bh(rt_hash_lock_addr(k)); - if (goal <= 0) - break; - } - rover = k; - - if (goal <= 0) - goto work_done; - - /* Goal is not achieved. We stop process if: - - - if expire reduced to zero. Otherwise, expire is halfed. - - if table is not full. - - if we are called from interrupt. - - jiffies check is just fallback/debug loop breaker. - We will not spin here for long time in any case. - */ - - RT_CACHE_STAT_INC(gc_goal_miss); - - if (expire == 0) - break; - - expire >>= 1; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, i); -#endif - - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) - goto out; - } while (!in_softirq() && time_before_eq(jiffies, now)); - - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) - goto out; - if (net_ratelimit()) - printk(KERN_WARNING "dst cache overflow\n"); - RT_CACHE_STAT_INC(gc_dst_overflow); - return 1; - -work_done: - expire += ip_rt_gc_min_interval; - if (expire > ip_rt_gc_timeout || - atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, rover); -#endif -out: return 0; } -static int rt_intern_hash(unsigned hash, struct rtable *rt, - struct rtable **rp, struct sk_buff *skb) +static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, + u32 pmtu, unsigned long expires) { - struct rtable *rth, **rthp; - unsigned long now; - struct rtable *cand, **candp; - u32 min_score; - int chain_length; - int attempts = !in_softirq(); - -restart: - chain_length = 0; - min_score = ~(u32)0; - cand = NULL; - candp = NULL; - now = jiffies; - - if (!rt_caching(dev_net(rt->u.dst.dev))) { - /* - * If we're not caching, just tell the caller we - * were successful and don't touch the route. The - * caller hold the sole reference to the cache entry, and - * it will be released when the caller is done with it. - * If we drop it here, the callers have no way to resolve routes - * when we're not caching. Instead, just point *rp at rt, so - * the caller gets a single use out of the route - * Note that we do rt_free on this new route entry, so that - * once its refcount hits zero, we are still able to reap it - * (Thanks Alexey) - * Note also the rt_free uses call_rcu. We don't actually - * need rcu protection here, this is just our path to get - * on the route gc list. - */ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe; + struct rtable *rt; + unsigned int i; + int depth; + u32 hval = fnhe_hashfun(daddr); - if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); - if (err) { - if (net_ratelimit()) - printk(KERN_WARNING - "Neighbour table failure & not caching routes.\n"); - rt_drop(rt); - return err; - } - } + spin_lock_bh(&fnhe_lock); - rt_free(rt); - goto skip_hashing; + hash = nh->nh_exceptions; + if (!hash) { + hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); + if (!hash) + goto out_unlock; + nh->nh_exceptions = hash; } - rthp = &rt_hash_table[hash].chain; + hash += hval; - spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = *rthp) != NULL) { - if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; - rt_free(rth); - continue; - } - if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { - /* Put it first */ - *rthp = rth->u.dst.rt_next; - /* - * Since lookup is lockfree, the deletion - * must be visible to another weakly ordered CPU before - * the insertion at the start of the hash chain. - */ - rcu_assign_pointer(rth->u.dst.rt_next, - rt_hash_table[hash].chain); - /* - * Since lookup is lockfree, the update writes - * must be ordered for consistency on SMP. - */ - rcu_assign_pointer(rt_hash_table[hash].chain, rth); - - dst_use(&rth->u.dst, now); - spin_unlock_bh(rt_hash_lock_addr(hash)); - - rt_drop(rt); - if (rp) - *rp = rth; - else - skb_dst_set(skb, &rth->u.dst); - return 0; - } - - if (!atomic_read(&rth->u.dst.__refcnt)) { - u32 score = rt_score(rth); - - if (score <= min_score) { - cand = rth; - candp = rthp; - min_score = score; - } - } - - chain_length++; - - rthp = &rth->u.dst.rt_next; + depth = 0; + for (fnhe = rcu_dereference(hash->chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + break; + depth++; } - if (cand) { - /* ip_rt_gc_elasticity used to be average length of chain - * length, when exceeded gc becomes really aggressive. - * - * The second limit is less certain. At the moment it allows - * only 2 entries per bucket. We will see. - */ - if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->u.dst.rt_next; - rt_free(cand); + if (fnhe) { + if (gw) + fnhe->fnhe_gw = gw; + if (pmtu) { + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = max(1UL, expires); } + /* Update all cached dsts too */ + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (rt) + fill_route_from_fnhe(rt, fnhe); + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (rt) + fill_route_from_fnhe(rt, fnhe); } else { - if (chain_length > rt_chain_length_max) { - struct net *net = dev_net(rt->u.dst.dev); - int num = ++net->ipv4.current_rt_cache_rebuild_count; - if (!rt_caching(dev_net(rt->u.dst.dev))) { - printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", - rt->u.dst.dev->name, num); - } - rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); + if (depth > FNHE_RECLAIM_DEPTH) + fnhe = fnhe_oldest(hash); + else { + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + rcu_assign_pointer(hash->chain, fnhe); } - } - - /* Try to bind route to arp only if it is output - route or unicast forwarding path. - */ - if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); - if (err) { - spin_unlock_bh(rt_hash_lock_addr(hash)); - - if (err != -ENOBUFS) { - rt_drop(rt); - return err; - } - - /* Neighbour tables are full and nothing - can be released. Try to shrink route cache, - it is most likely it holds some neighbour records. - */ - if (attempts-- > 0) { - int saved_elasticity = ip_rt_gc_elasticity; - int saved_int = ip_rt_gc_min_interval; - ip_rt_gc_elasticity = 1; - ip_rt_gc_min_interval = 0; - rt_garbage_collect(&ipv4_dst_ops); - ip_rt_gc_min_interval = saved_int; - ip_rt_gc_elasticity = saved_elasticity; - goto restart; - } - - if (net_ratelimit()) - printk(KERN_WARNING "Neighbour table overflow.\n"); - rt_drop(rt); - return -ENOBUFS; + fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev)); + fnhe->fnhe_daddr = daddr; + fnhe->fnhe_gw = gw; + fnhe->fnhe_pmtu = pmtu; + fnhe->fnhe_expires = expires; + + /* Exception created; mark the cached routes for the nexthop + * stale, so anyone caching it rechecks if this exception + * applies to them. + */ + rt = rcu_dereference(nh->nh_rth_input); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; + + for_each_possible_cpu(i) { + struct rtable __rcu **prt; + prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + rt = rcu_dereference(*prt); + if (rt) + rt->dst.obsolete = DST_OBSOLETE_KILL; } } - rt->u.dst.rt_next = rt_hash_table[hash].chain; - -#if RT_CACHE_DEBUG >= 2 - if (rt->u.dst.rt_next) { - struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: %pI4", - hash, &rt->rt_dst); - for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) - printk(" . %pI4", &trt->rt_dst); - printk("\n"); - } -#endif - /* - * Since lookup is lockfree, we must make sure - * previous writes to rt are comitted to memory - * before making rt visible to other CPUS. - */ - rcu_assign_pointer(rt_hash_table[hash].chain, rt); - - spin_unlock_bh(rt_hash_lock_addr(hash)); - -skip_hashing: - if (rp) - *rp = rt; - else - skb_dst_set(skb, &rt->u.dst); - return 0; -} - -void rt_bind_peer(struct rtable *rt, int create) -{ - static DEFINE_SPINLOCK(rt_peer_lock); - struct inet_peer *peer; - - peer = inet_getpeer(rt->rt_dst, create); - - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; - } - spin_unlock_bh(&rt_peer_lock); - if (peer) - inet_putpeer(peer); -} - -/* - * Peer allocation may fail only in serious out-of-memory conditions. However - * we still can generate some output. - * Random ID selection looks a bit dangerous because we have no chances to - * select ID being unique in a reasonable period of time. - * But broken packet identifier may be better than no packet at all. - */ -static void ip_select_fb_ident(struct iphdr *iph) -{ - static DEFINE_SPINLOCK(ip_fb_id_lock); - static u32 ip_fallback_id; - u32 salt; + fnhe->fnhe_stamp = jiffies; - spin_lock_bh(&ip_fb_id_lock); - salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); - iph->id = htons(salt & 0xFFFF); - ip_fallback_id = salt; - spin_unlock_bh(&ip_fb_id_lock); +out_unlock: + spin_unlock_bh(&fnhe_lock); } -void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, + bool kill_route) { - struct rtable *rt = (struct rtable *) dst; - - if (rt) { - if (rt->peer == NULL) - rt_bind_peer(rt, 1); - - /* If peer is attached to destination, it is never detached, - so that we need not to grab a lock to dereference it. - */ - if (rt->peer) { - iph->id = htons(inet_getid(rt->peer, more)); - return; - } - } else - printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", - __builtin_return_address(0)); - - ip_select_fb_ident(iph); -} + __be32 new_gw = icmp_hdr(skb)->un.gateway; + __be32 old_gw = ip_hdr(skb)->saddr; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct fib_result res; + struct neighbour *n; + struct net *net; -static void rt_del(unsigned hash, struct rtable *rt) -{ - struct rtable **rthp, *aux; + switch (icmp_hdr(skb)->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + break; - rthp = &rt_hash_table[hash].chain; - spin_lock_bh(rt_hash_lock_addr(hash)); - ip_rt_put(rt); - while ((aux = *rthp) != NULL) { - if (aux == rt || rt_is_expired(aux)) { - *rthp = aux->u.dst.rt_next; - rt_free(aux); - continue; - } - rthp = &aux->u.dst.rt_next; + default: + return; } - spin_unlock_bh(rt_hash_lock_addr(hash)); -} -void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, - __be32 saddr, struct net_device *dev) -{ - int i, k; - struct in_device *in_dev = in_dev_get(dev); - struct rtable *rth, **rthp; - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct netevent_redirect netevent; - struct net *net; + if (rt->rt_gateway != old_gw) + return; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; @@ -1355,9 +735,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, ipv4_is_zeronet(new_gw)) goto reject_redirect; - if (!rt_caching(net)) - goto reject_redirect; - if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; @@ -1368,110 +745,55 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (i = 0; i < 2; i++) { - for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rthp=&rt_hash_table[hash].chain; - - rcu_read_lock(); - while ((rth = rcu_dereference(*rthp)) != NULL) { - struct rtable *rt; + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + if (n) { + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + } else { + if (fib_lookup(net, fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->fl.oif != ikeys[k] || - rth->fl.iif != 0 || - rt_is_expired(rth) || - !net_eq(dev_net(rth->u.dst.dev), net)) { - rthp = &rth->u.dst.rt_next; - continue; - } - - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->u.dst.error || - rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) - break; - - dst_hold(&rth->u.dst); - rcu_read_unlock(); - - rt = dst_alloc(&ipv4_dst_ops); - if (rt == NULL) { - ip_rt_put(rth); - in_dev_put(in_dev); - return; - } - - /* Copy all the information. */ - *rt = *rth; - rt->u.dst.__use = 1; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.child = NULL; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); - if (rt->idev) - in_dev_hold(rt->idev); - rt->u.dst.obsolete = 0; - rt->u.dst.lastuse = jiffies; - rt->u.dst.path = &rt->u.dst; - rt->u.dst.neighbour = NULL; - rt->u.dst.hh = NULL; -#ifdef CONFIG_XFRM - rt->u.dst.xfrm = NULL; -#endif - rt->rt_genid = rt_genid(net); - rt->rt_flags |= RTCF_REDIRECTED; - - /* Gateway is different ... */ - rt->rt_gateway = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->u.dst); - - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - - if (arp_bind_neighbour(&rt->u.dst) || - !(rt->u.dst.neighbour->nud_state & - NUD_VALID)) { - if (rt->u.dst.neighbour) - neigh_event_send(rt->u.dst.neighbour, NULL); - ip_rt_put(rth); - rt_drop(rt); - goto do_next; - } - - netevent.old = &rth->u.dst; - netevent.new = &rt->u.dst; - call_netevent_notifiers(NETEVENT_REDIRECT, - &netevent); - - rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL)) - ip_rt_put(rt); - goto do_next; + update_or_create_fnhe(nh, fl4->daddr, new_gw, + 0, 0); } - rcu_read_unlock(); - do_next: - ; + if (kill_route) + rt->dst.obsolete = DST_OBSOLETE_KILL; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } + neigh_release(n); } - in_dev_put(in_dev); return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" - " Advised path = %pI4 -> %pI4\n", - &old_gw, dev->name, &new_gw, - &saddr, &daddr); + if (IN_DEV_LOG_MARTIANS(in_dev)) { + const struct iphdr *iph = (const struct iphdr *) skb->data; + __be32 daddr = iph->daddr; + __be32 saddr = iph->saddr; + + net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" + " Advised path = %pI4 -> %pI4\n", + &old_gw, dev->name, &new_gw, + &saddr, &daddr); + } #endif - in_dev_put(in_dev); + ; +} + +static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl4; + const struct iphdr *iph = (const struct iphdr *) skb->data; + int oif = skb->dev->ifindex; + u8 tos = RT_TOS(iph->tos); + u8 prot = iph->protocol; + u32 mark = skb->mark; + + rt = (struct rtable *) dst; + + __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0); + __ip_do_redirect(rt, skb, &fl4, true); } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) @@ -1480,19 +802,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) struct dst_entry *ret = dst; if (rt) { - if (dst->obsolete) { + if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->u.dst.expires) { - unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif, - rt_genid(dev_net(dst->dev))); -#if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", - &rt->rt_dst, rt->fl.fl4_tos); -#endif - rt_del(hash, rt); + rt->dst.expires) { + ip_rt_put(rt); ret = NULL; } } @@ -1519,10 +834,12 @@ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; + struct inet_peer *peer; + struct net *net; int log_martians; rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); + in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; @@ -1530,209 +847,290 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); + net = dev_net(rt->dst.dev); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + if (!peer) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, + rt_nexthop(rt, ip_hdr(skb)->daddr)); + return; + } + /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) - rt->u.dst.rate_tokens = 0; + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything - * set u.dst.rate_last to the last seen redirected packet. + * set dst.rate_last to the last seen redirected packet. */ - if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { - rt->u.dst.rate_last = jiffies; - return; + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; + goto out_put_peer; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->u.dst.rate_tokens == 0 || + if (peer->rate_tokens == 0 || time_after(jiffies, - (rt->u.dst.rate_last + - (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->u.dst.rate_last = jiffies; - ++rt->u.dst.rate_tokens; + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { + __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); + peer->rate_last = jiffies; + ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->u.dst.rate_tokens == ip_rt_redirect_number && - net_ratelimit()) - printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", - &rt->rt_src, rt->rt_iif, - &rt->rt_dst, &rt->rt_gateway); + peer->rate_tokens == ip_rt_redirect_number) + net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", + &ip_hdr(skb)->saddr, inet_iif(skb), + &ip_hdr(skb)->daddr, &gw); #endif } +out_put_peer: + inet_putpeer(peer); } static int ip_error(struct sk_buff *skb) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; unsigned long now; + struct net *net; + bool send; int code; - switch (rt->u.dst.error) { - case EINVAL: - default: - goto out; + net = dev_net(rt->dst.dev); + if (!IN_DEV_FORWARD(in_dev)) { + switch (rt->dst.error) { case EHOSTUNREACH: - code = ICMP_HOST_UNREACH; + IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); break; + case ENETUNREACH: - code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(dev_net(rt->u.dst.dev), - IPSTATS_MIB_INNOROUTES); - break; - case EACCES: - code = ICMP_PKT_FILTERED; + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); break; + } + goto out; } - now = jiffies; - rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; - if (rt->u.dst.rate_tokens > ip_rt_error_burst) - rt->u.dst.rate_tokens = ip_rt_error_burst; - rt->u.dst.rate_last = now; - if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { - rt->u.dst.rate_tokens -= ip_rt_error_cost; - icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + switch (rt->dst.error) { + case EINVAL: + default: + goto out; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; } + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if (peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; + inet_putpeer(peer); + } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + out: kfree_skb(skb); return 0; } -/* - * The last two values are not from the RFC but - * are needed for AMPRnet AX.25 paths. - */ +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +{ + struct dst_entry *dst = &rt->dst; + struct fib_result res; + + if (dst_metric_locked(dst, RTAX_MTU)) + return; -static const unsigned short mtu_plateau[] = -{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + if (dst->dev->mtu < mtu) + return; + + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + + if (rt->rt_pmtu == mtu && + time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) + return; + + rcu_read_lock(); + if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { + struct fib_nh *nh = &FIB_RES_NH(res); -static inline unsigned short guess_mtu(unsigned short old_mtu) + update_or_create_fnhe(nh, fl4->daddr, 0, mtu, + jiffies + ip_rt_mtu_expires); + } + rcu_read_unlock(); +} + +static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) { - int i; + struct rtable *rt = (struct rtable *) dst; + struct flowi4 fl4; - for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) - if (old_mtu > mtu_plateau[i]) - return mtu_plateau[i]; - return 68; + ip_rt_build_flow_key(&fl4, sk, skb); + __ip_rt_update_pmtu(rt, &fl4, mtu); } -unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, - unsigned short new_mtu, - struct net_device *dev) +void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, + int oif, u32 mark, u8 protocol, int flow_flags) { - int i, k; - unsigned short old_mtu = ntohs(iph->tot_len); - struct rtable *rth; - int ikeys[2] = { dev->ifindex, 0 }; - __be32 skeys[2] = { iph->saddr, 0, }; - __be32 daddr = iph->daddr; - unsigned short est_mtu = 0; - - for (k = 0; k < 2; k++) { - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { - unsigned short mtu = new_mtu; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->rt_dst != daddr || - rth->rt_src != iph->saddr || - rth->fl.oif != ikeys[k] || - rth->fl.iif != 0 || - dst_metric_locked(&rth->u.dst, RTAX_MTU) || - !net_eq(dev_net(rth->u.dst.dev), net) || - rt_is_expired(rth)) - continue; - - if (new_mtu < 68 || new_mtu >= old_mtu) { - - /* BSD 4.2 compatibility hack :-( */ - if (mtu == 0 && - old_mtu >= dst_mtu(&rth->u.dst) && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; - - mtu = guess_mtu(old_mtu); - } - if (mtu <= dst_mtu(&rth->u.dst)) { - if (mtu < dst_mtu(&rth->u.dst)) { - dst_confirm(&rth->u.dst); - if (mtu < ip_rt_min_pmtu) { - mtu = ip_rt_min_pmtu; - rth->u.dst.metrics[RTAX_LOCK-1] |= - (1 << RTAX_MTU); - } - rth->u.dst.metrics[RTAX_MTU-1] = mtu; - dst_set_expires(&rth->u.dst, - ip_rt_mtu_expires); - } - est_mtu = mtu; - } - } - rcu_read_unlock(); - } + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + + if (!mark) + mark = IP4_REPLY_MARK(net, skb->mark); + + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } - return est_mtu ? : new_mtu; } +EXPORT_SYMBOL_GPL(ipv4_update_pmtu); -static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - if (dst_mtu(dst) > mtu && mtu >= 68 && - !(dst_metric_locked(dst, RTAX_MTU))) { - if (mtu < ip_rt_min_pmtu) { - mtu = ip_rt_min_pmtu; - dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); - } - dst->metrics[RTAX_MTU-1] = mtu; - dst_set_expires(dst, ip_rt_mtu_expires); - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + if (!fl4.flowi4_mark) + fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); + + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_rt_update_pmtu(rt, &fl4, mtu); + ip_rt_put(rt); } } -static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - return NULL; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; + struct dst_entry *odst = NULL; + bool new = false; + + bh_lock_sock(sk); + + if (!ip_sk_accept_pmtu(sk)) + goto out; + + odst = sk_dst_get(sk); + + if (sock_owned_by_user(sk) || !odst) { + __ipv4_sk_update_pmtu(skb, sk, mtu); + goto out; + } + + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + + rt = (struct rtable *)odst; + if (odst->obsolete && odst->ops->check(odst, 0) == NULL) { + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; + + new = true; + } + + __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + + if (!dst_check(&rt->dst, 0)) { + if (new) + dst_release(&rt->dst); + + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) + goto out; + + new = true; + } + + if (new) + sk_dst_set(sk, &rt->dst); + +out: + bh_unlock_sock(sk); + dst_release(odst); } +EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); -static void ipv4_dst_destroy(struct dst_entry *dst) +void ipv4_redirect(struct sk_buff *skb, struct net *net, + int oif, u32 mark, u8 protocol, int flow_flags) { - struct rtable *rt = (struct rtable *) dst; - struct inet_peer *peer = rt->peer; - struct in_device *idev = rt->idev; + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (peer) { - rt->peer = NULL; - inet_putpeer(peer); + __build_flow_key(&fl4, NULL, iph, oif, + RT_TOS(iph->tos), protocol, mark, flow_flags); + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); } +} +EXPORT_SYMBOL_GPL(ipv4_redirect); + +void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) +{ + const struct iphdr *iph = (const struct iphdr *) skb->data; + struct flowi4 fl4; + struct rtable *rt; - if (idev) { - rt->idev = NULL; - in_dev_put(idev); + __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); + rt = __ip_route_output_key(sock_net(sk), &fl4); + if (!IS_ERR(rt)) { + __ip_do_redirect(rt, skb, &fl4, false); + ip_rt_put(rt); } } +EXPORT_SYMBOL_GPL(ipv4_sk_redirect); -static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; - struct in_device *idev = rt->idev; - if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = - in_dev_get(dev_net(dev)->loopback_dev); - if (loopback_idev) { - rt->idev = loopback_idev; - in_dev_put(idev); - } - } + + /* All IPV4 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + * + * When a PMTU/redirect information update invalidates a route, + * this is indicated by setting obsolete to DST_OBSOLETE_KILL or + * DST_OBSOLETE_DEAD by dst_free(). + */ + if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) + return NULL; + return dst; } static void ipv4_link_failure(struct sk_buff *skb) @@ -1743,15 +1141,16 @@ static void ipv4_link_failure(struct sk_buff *skb) rt = skb_rtable(skb); if (rt) - dst_set_expires(&rt->u.dst, 0); + dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { - printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", - &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, - skb->dev ? skb->dev->name : "?"); + pr_debug("%s: %pI4 -> %pI4, %s\n", + __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + skb->dev ? skb->dev->name : "?"); kfree_skb(skb); + WARN_ON(1); return 0; } @@ -1764,82 +1163,275 @@ static int ip_rt_bug(struct sk_buff *skb) in IP options! */ -void ip_rt_get_source(u8 *addr, struct rtable *rt) +void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; - struct fib_result res; - if (rt->fl.iif == 0) - src = rt->rt_src; - else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) { - src = FIB_RES_PREFSRC(res); - fib_res_put(&res); - } else - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, - RT_SCOPE_UNIVERSE); + if (rt_is_output_route(rt)) + src = ip_hdr(skb)->saddr; + else { + struct fib_result res; + struct flowi4 fl4; + struct iphdr *iph; + + iph = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = rt->dst.dev->ifindex; + fl4.flowi4_iif = skb->dev->ifindex; + fl4.flowi4_mark = skb->mark; + + rcu_read_lock(); + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); + else + src = inet_select_addr(rt->dst.dev, + rt_nexthop(rt, iph->daddr), + RT_SCOPE_UNIVERSE); + rcu_read_unlock(); + } memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { - if (!(rt->u.dst.tclassid & 0xFFFF)) - rt->u.dst.tclassid |= tag & 0xFFFF; - if (!(rt->u.dst.tclassid & 0xFFFF0000)) - rt->u.dst.tclassid |= tag & 0xFFFF0000; + if (!(rt->dst.tclassid & 0xFFFF)) + rt->dst.tclassid |= tag & 0xFFFF; + if (!(rt->dst.tclassid & 0xFFFF0000)) + rt->dst.tclassid |= tag & 0xFFFF0000; } #endif -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct fib_info *fi = res->fi; + unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + + if (advmss == 0) { + advmss = max_t(unsigned int, dst->dev->mtu - 40, + ip_rt_min_advmss); + if (advmss > 65535 - 40) + advmss = 65535 - 40; + } + return advmss; +} + +static unsigned int ipv4_mtu(const struct dst_entry *dst) +{ + const struct rtable *rt = (const struct rtable *) dst; + unsigned int mtu = rt->rt_pmtu; + + if (!mtu || time_after_eq(jiffies, rt->dst.expires)) + mtu = dst_metric_raw(dst, RTAX_MTU); + + if (mtu) + return mtu; + + mtu = dst->dev->mtu; + + if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { + if (rt->rt_uses_gateway && mtu > 576) + mtu = 576; + } + + return min_t(unsigned int, mtu, IP_MAX_MTU); +} + +static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + struct fib_nh_exception *fnhe; + u32 hval; + + if (!hash) + return NULL; + + hval = fnhe_hashfun(daddr); + + for (fnhe = rcu_dereference(hash[hval].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + if (fnhe->fnhe_daddr == daddr) + return fnhe; + } + return NULL; +} + +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, + __be32 daddr) +{ + bool ret = false; + + spin_lock_bh(&fnhe_lock); + + if (daddr == fnhe->fnhe_daddr) { + struct rtable __rcu **porig; + struct rtable *orig; + int genid = fnhe_genid(dev_net(rt->dst.dev)); + + if (rt_is_input_route(rt)) + porig = &fnhe->fnhe_rth_input; + else + porig = &fnhe->fnhe_rth_output; + orig = rcu_dereference(*porig); + + if (fnhe->fnhe_genid != genid) { + fnhe->fnhe_genid = genid; + fnhe->fnhe_gw = 0; + fnhe->fnhe_pmtu = 0; + fnhe->fnhe_expires = 0; + fnhe_flush_routes(fnhe); + orig = NULL; + } + fill_route_from_fnhe(rt, fnhe); + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + + if (!(rt->dst.flags & DST_NOCACHE)) { + rcu_assign_pointer(*porig, rt); + if (orig) + rt_free(orig); + ret = true; + } + + fnhe->fnhe_stamp = jiffies; + } + spin_unlock_bh(&fnhe_lock); + + return ret; +} + +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +{ + struct rtable *orig, *prev, **p; + bool ret = true; + + if (rt_is_input_route(rt)) { + p = (struct rtable **)&nh->nh_rth_input; + } else { + p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + } + orig = *p; + + prev = cmpxchg(p, orig, rt); + if (prev == orig) { + if (orig) + rt_free(orig); + } else + ret = false; + + return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + spin_lock_bh(&rt_uncached_lock); + list_add_tail(&rt->rt_uncached, &rt_uncached_list); + spin_unlock_bh(&rt_uncached_lock); +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + + if (!list_empty(&rt->rt_uncached)) { + spin_lock_bh(&rt_uncached_lock); + list_del(&rt->rt_uncached); + spin_unlock_bh(&rt_uncached_lock); + } +} + +void rt_flush_dev(struct net_device *dev) +{ + if (!list_empty(&rt_uncached_list)) { + struct net *net = dev_net(dev); + struct rtable *rt; + + spin_lock_bh(&rt_uncached_lock); + list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { + if (rt->dst.dev != dev) + continue; + rt->dst.dev = net->loopback_dev; + dev_hold(rt->dst.dev); + dev_put(dev); + } + spin_unlock_bh(&rt_uncached_lock); + } +} + +static bool rt_cache_valid(const struct rtable *rt) +{ + return rt && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + !rt_is_expired(rt); +} + +static void rt_set_nexthop(struct rtable *rt, __be32 daddr, + const struct fib_result *res, + struct fib_nh_exception *fnhe, + struct fib_info *fi, u16 type, u32 itag) +{ + bool cached = false; if (fi) { - if (FIB_RES_GW(*res) && - FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - rt->rt_gateway = FIB_RES_GW(*res); - memcpy(rt->u.dst.metrics, fi->fib_metrics, - sizeof(rt->u.dst.metrics)); - if (fi->fib_mtu == 0) { - rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - if (dst_metric_locked(&rt->u.dst, RTAX_MTU) && - rt->rt_gateway != rt->rt_dst && - rt->u.dst.dev->mtu > 576) - rt->u.dst.metrics[RTAX_MTU-1] = 576; + struct fib_nh *nh = &FIB_RES_NH(*res); + + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { + rt->rt_gateway = nh->nh_gw; + rt->rt_uses_gateway = 1; } -#ifdef CONFIG_NET_CLS_ROUTE - rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + dst_init_metrics(&rt->dst, fi->fib_metrics, true); +#ifdef CONFIG_IP_ROUTE_CLASSID + rt->dst.tclassid = nh->nh_tclassid; #endif + if (unlikely(fnhe)) + cached = rt_bind_exception(rt, fnhe, daddr); + else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_cache_route(nh, rt); + if (unlikely(!cached)) { + /* Routes we intend to cache in nexthop exception or + * FIB nexthop have the DST_NOCACHE bit clear. + * However, if we are unsuccessful at storing this + * route into the cache we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + rt_add_uncached_list(rt); + } } else - rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - - if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (dst_mtu(&rt->u.dst) > IP_MAX_MTU) - rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0) - rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, - ip_rt_min_advmss); - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40) - rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; - -#ifdef CONFIG_NET_CLS_ROUTE + rt_add_uncached_list(rt); + +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES - set_class_tag(rt, fib_rules_tclass(res)); + set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif - rt->rt_type = res->type; } +static struct rtable *rt_dst_alloc(struct net_device *dev, + bool nopolicy, bool noxfrm, bool will_cache) +{ + return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); +} + +/* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned hash; struct rtable *rth; - __be32 spec_dst; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; + int err; /* Primary sanity checks. */ @@ -1847,69 +1439,61 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) + skb->protocol != htons(ETH_P_IP)) goto e_inval; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(saddr)) + goto e_inval; + if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) goto e_inval; - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag, 0) < 0) - goto e_inval; - - rth = dst_alloc(&ipv4_dst_ops); + } else { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); + if (err < 0) + goto e_err; + } + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->rt_genid = rt_genid(dev_net(dev)); + rth->dst.output = ip_rt_bug; + + rth->rt_genid = rt_genid_ipv4(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; + rth->rt_is_input= 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (our) { - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) - rth->u.dst.input = ip_mr_input; + rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); - in_dev_put(in_dev); - hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, NULL, skb); + skb_dst_set(skb, &rth->dst); + return 0; e_nobufs: - in_dev_put(in_dev); return -ENOBUFS; - e_inval: - in_dev_put(in_dev); return -EINVAL; +e_err: + return err; } @@ -1926,143 +1510,127 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ - printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", + pr_warn("martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { - int i; - const unsigned char *p = skb_mac_header(skb); - printk(KERN_WARNING "ll header: "); - for (i = 0; i < dev->hard_header_len; i++, p++) { - printk("%02x", *p); - if (i < (dev->hard_header_len - 1)) - printk(":"); - } - printk("\n"); + print_hex_dump(KERN_WARNING, "ll header: ", + DUMP_PREFIX_OFFSET, 16, 1, + skb_mac_header(skb), + dev->hard_header_len, true); } } #endif } +/* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, - struct fib_result *res, + const struct fib_result *res, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct rtable **result) + __be32 daddr, __be32 saddr, u32 tos) { - + struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; - unsigned flags = 0; - __be32 spec_dst; - u32 itag; + unsigned int flags = 0; + bool do_cache; + u32 itag = 0; /* get a working reference to the output device */ - out_dev = in_dev_get(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); if (out_dev == NULL) { - if (net_ratelimit()) - printk(KERN_CRIT "Bug in ip_route_input" \ - "_slow(). Please, report\n"); + net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return -EINVAL; } - - err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag, skb->mark); + err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); - err = -EINVAL; goto cleanup; } - if (err) - flags |= RTCF_DIRECTSRC; - - if (out_dev == in_dev && err && + do_cache = res->fi && !itag; + if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { flags |= RTCF_DOREDIRECT; + do_cache = false; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. */ - if (out_dev == in_dev) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } } + fnhe = find_exception(&FIB_RES_NH(*res), daddr); + if (do_cache) { + if (fnhe != NULL) + rth = rcu_dereference(fnhe->fnhe_rth_input); + else + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); - rth = dst_alloc(&ipv4_dst_ops); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + goto out; + } + } + + rth = rt_dst_alloc(out_dev->dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { err = -ENOBUFS; goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; - rth->rt_gateway = daddr; - rth->rt_iif = - rth->fl.iif = in_dev->dev->ifindex; - rth->u.dst.dev = (out_dev)->dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->fl.oif = 0; - rth->rt_spec_dst= spec_dst; - - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); - - rt_set_nexthop(rth, res, itag); - + rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); rth->rt_flags = flags; + rth->rt_type = res->type; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); - *result = rth; + rth->dst.input = ip_forward; + rth->dst.output = ip_output; + + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + skb_dst_set(skb, &rth->dst); +out: err = 0; cleanup: - /* release the working reference to the output device */ - in_dev_put(out_dev); return err; } static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - const struct flowi *fl, + const struct flowi4 *fl4, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { - struct rtable* rth = NULL; - int err; - unsigned hash; - #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) - fib_select_multipath(fl, res); + if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(res); #endif /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); - if (err) - return err; - - /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif, - rt_genid(dev_net(rth->u.dst.dev))); - return rt_intern_hash(hash, rth, NULL, skb); + return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } /* @@ -2073,29 +1641,21 @@ static int ip_mkroute_input(struct sk_buff *skb, * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. + * called with rcu_read_lock() */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; - struct in_device *in_dev = in_dev_get(dev); - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = saddr, - .tos = tos, - .scope = RT_SCOPE_UNIVERSE, - } }, - .mark = skb->mark, - .iif = dev->ifindex }; - unsigned flags = 0; + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct flowi4 fl4; + unsigned int flags = 0; u32 itag = 0; - struct rtable * rth; - unsigned hash; - __be32 spec_dst; + struct rtable *rth; int err = -EINVAL; - int free_res = 0; - struct net * net = dev_net(dev); + struct net *net = dev_net(dev); + bool do_cache; /* IP on this device is disabled. */ @@ -2106,11 +1666,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - ipv4_is_loopback(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; - if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) + res.fi = NULL; + if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; @@ -2119,111 +1679,124 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_zeronet(saddr)) goto martian_source; - if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || - ipv4_is_loopback(daddr)) + if (ipv4_is_zeronet(daddr)) goto martian_destination; + /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), + * and call it once if daddr or/and saddr are loopback addresses + */ + if (ipv4_is_loopback(daddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_destination; + } else if (ipv4_is_loopback(saddr)) { + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + } + /* * Now we are ready to route packet. */ - if ((err = fib_lookup(net, &fl, &res)) != 0) { + fl4.flowi4_oif = 0; + fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.daddr = daddr; + fl4.saddr = saddr; + err = fib_lookup(net, &fl4, &res); + if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + err = -EHOSTUNREACH; goto no_route; } - free_res = 1; - - RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_BROADCAST) goto brd_input; if (res.type == RTN_LOCAL) { - int result; - result = fib_validate_source(saddr, daddr, tos, - net->loopback_dev->ifindex, - dev, &spec_dst, &itag, skb->mark); - if (result < 0) - goto martian_source; - if (result) - flags |= RTCF_DIRECTSRC; - spec_dst = daddr; + err = fib_validate_source(skb, saddr, daddr, tos, + 0, dev, in_dev, &itag); + if (err < 0) + goto martian_source_keep_err; goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) - goto e_hostunreach; + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; + goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); -done: - in_dev_put(in_dev); - if (free_res) - fib_res_put(&res); + err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); out: return err; brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ipv4_is_zeronet(saddr)) - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag, skb->mark); + if (!ipv4_is_zeronet(saddr)) { + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + in_dev, &itag); if (err < 0) - goto martian_source; - if (err) - flags |= RTCF_DIRECTSRC; + goto martian_source_keep_err; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: - rth = dst_alloc(&ipv4_dst_ops); + do_cache = false; + if (res.fi) { + if (!itag) { + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; + } + do_cache = true; + } + } + + rth = rt_dst_alloc(net->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - rth->rt_genid = rt_genid(net); - - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; - rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; - rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.input= ip_local_deliver; + rth->dst.output= ip_rt_bug; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->u.dst.dev = net->loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); - rth->rt_gateway = daddr; - rth->rt_spec_dst= spec_dst; - rth->u.dst.input= ip_local_deliver; + + rth->rt_genid = rt_genid_ipv4(net); rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + rth->rt_is_input = 1; + rth->rt_iif = 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; + rth->dst.input= ip_error; + rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, NULL, skb); - goto done; + if (do_cache) { + if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) { + rth->dst.flags |= DST_NOCACHE; + rt_add_uncached_list(rth); + } + } + skb_dst_set(skb, &rth->dst); + err = 0; + goto out; no_route: RT_CACHE_STAT_INC(in_no_route); - spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; if (err == -ESRCH) err = -ENETUNREACH; @@ -2235,66 +1808,33 @@ no_route: martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", - &daddr, &saddr, dev->name); + if (IN_DEV_LOG_MARTIANS(in_dev)) + net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", + &daddr, &saddr, dev->name); #endif -e_hostunreach: - err = -EHOSTUNREACH; - goto done; - e_inval: err = -EINVAL; - goto done; + goto out; e_nobufs: err = -ENOBUFS; - goto done; + goto out; martian_source: + err = -EINVAL; +martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto e_inval; + goto out; } -int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { - struct rtable * rth; - unsigned hash; - int iif = dev->ifindex; - struct net *net; - - net = dev_net(dev); - - if (!rt_caching(net)) - goto skip_cache; - - tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif, rt_genid(net)); + int res; rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { - if (((rth->fl.fl4_dst ^ daddr) | - (rth->fl.fl4_src ^ saddr) | - (rth->fl.iif ^ iif) | - rth->fl.oif | - (rth->fl.fl4_tos ^ tos)) == 0 && - rth->fl.mark == skb->mark && - net_eq(dev_net(rth->u.dst.dev), net) && - !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); - RT_CACHE_STAT_INC(in_hit); - rcu_read_unlock(); - skb_dst_set(skb, &rth->u.dst); - return 0; - } - RT_CACHE_STAT_INC(in_hlist_search); - } - rcu_read_unlock(); -skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2307,12 +1847,11 @@ skip_cache: route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { - struct in_device *in_dev; + struct in_device *in_dev = __in_dev_get_rcu(dev); - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { - int our = ip_check_mc(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); + if (in_dev) { + int our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || @@ -2320,192 +1859,173 @@ skip_cache: IN_DEV_MFORWARD(in_dev)) #endif ) { + int res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); rcu_read_unlock(); - return ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); + return res; } } rcu_read_unlock(); return -EINVAL; } - return ip_route_input_slow(skb, daddr, saddr, tos, dev); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + rcu_read_unlock(); + return res; } +EXPORT_SYMBOL(ip_route_input_noref); -static int __mkroute_output(struct rtable **result, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +/* called with rcu_read_lock() */ +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi4 *fl4, int orig_oif, + struct net_device *dev_out, + unsigned int flags) { - struct rtable *rth; + struct fib_info *fi = res->fi; + struct fib_nh_exception *fnhe; struct in_device *in_dev; - u32 tos = RT_FL_TOS(oldflp); - int err = 0; + u16 type = res->type; + struct rtable *rth; + bool do_cache; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) - return -EINVAL; + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) + return ERR_PTR(-EINVAL); - if (fl->fl4_dst == htonl(0xFFFFFFFF)) - res->type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl->fl4_dst)) - res->type = RTN_MULTICAST; - else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) - return -EINVAL; + if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); + + if (ipv4_is_lbcast(fl4->daddr)) + type = RTN_BROADCAST; + else if (ipv4_is_multicast(fl4->daddr)) + type = RTN_MULTICAST; + else if (ipv4_is_zeronet(fl4->daddr)) + return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - /* get work reference to inet device */ - in_dev = in_dev_get(dev_out); - if (!in_dev) - return -EINVAL; - - if (res->type == RTN_BROADCAST) { + do_cache = true; + if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - if (res->fi) { - fib_info_put(res->fi); - res->fi = NULL; - } - } else if (res->type == RTN_MULTICAST) { - flags |= RTCF_MULTICAST|RTCF_LOCAL; - if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, - oldflp->proto)) + fi = NULL; + } else if (type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST | RTCF_LOCAL; + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, + fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; + else + do_cache = false; /* If multicast route do not exist use - default one, but do not gateway in this case. - Yes, it is hack. + * default one, but do not gateway in this case. + * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) { - fib_info_put(res->fi); - res->fi = NULL; + if (fi && res->prefixlen < 4) + fi = NULL; + } + + fnhe = NULL; + do_cache &= fi != NULL; + if (do_cache) { + struct rtable __rcu **prth; + struct fib_nh *nh = &FIB_RES_NH(*res); + + fnhe = find_exception(nh, fl4->daddr); + if (fnhe) + prth = &fnhe->fnhe_rth_output; + else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + } + rth = rcu_dereference(*prth); + if (rt_cache_valid(rth)) { + dst_hold(&rth->dst); + return rth; } } +add: + rth = rt_dst_alloc(dev_out, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM), + do_cache); + if (!rth) + return ERR_PTR(-ENOBUFS); - rth = dst_alloc(&ipv4_dst_ops); - if (!rth) { - err = -ENOBUFS; - goto cleanup; - } + rth->dst.output = ip_output; - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; - - rth->fl.fl4_dst = oldflp->fl4_dst; - rth->fl.fl4_tos = tos; - rth->fl.fl4_src = oldflp->fl4_src; - rth->fl.oif = oldflp->oif; - rth->fl.mark = oldflp->mark; - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; - rth->rt_iif = oldflp->oif ? : dev_out->ifindex; - /* get references to the devices that are to be hold by the routing - cache entry */ - rth->u.dst.dev = dev_out; - dev_hold(dev_out); - rth->idev = in_dev_get(dev_out); - rth->rt_gateway = fl->fl4_dst; - rth->rt_spec_dst= fl->fl4_src; - - rth->u.dst.output=ip_output; - rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_genid = rt_genid_ipv4(dev_net(dev_out)); + rth->rt_flags = flags; + rth->rt_type = type; + rth->rt_is_input = 0; + rth->rt_iif = orig_oif ? : 0; + rth->rt_pmtu = 0; + rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); - if (flags & RTCF_LOCAL) { - rth->u.dst.input = ip_local_deliver; - rth->rt_spec_dst = fl->fl4_dst; - } + if (flags & RTCF_LOCAL) + rth->dst.input = ip_local_deliver; if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { - rth->u.dst.output = ip_mc_output; + rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE - if (res->type == RTN_MULTICAST) { + if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !ipv4_is_local_multicast(oldflp->fl4_dst)) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_mc_output; + !ipv4_is_local_multicast(fl4->daddr)) { + rth->dst.input = ip_mr_input; + rth->dst.output = ip_mc_output; } } #endif } - rt_set_nexthop(rth, res, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - rth->rt_flags = flags; - - *result = rth; - cleanup: - /* release work reference to inet device */ - in_dev_put(in_dev); - - return err; -} - -static int ip_mkroute_output(struct rtable **rp, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ - struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); - unsigned hash; - if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, - rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL); - } - - return err; + return rth; } /* * Major route resolver routine. */ -static int ip_route_output_slow(struct net *net, struct rtable **rp, - const struct flowi *oldflp) -{ - u32 tos = RT_FL_TOS(oldflp); - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = oldflp->fl4_dst, - .saddr = oldflp->fl4_src, - .tos = tos & IPTOS_RT_MASK, - .scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : - RT_SCOPE_UNIVERSE), - } }, - .mark = oldflp->mark, - .iif = net->loopback_dev->ifindex, - .oif = oldflp->oif }; - struct fib_result res; - unsigned flags = 0; +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +{ struct net_device *dev_out = NULL; - int free_res = 0; - int err; - + __u8 tos = RT_FL_TOS(fl4); + unsigned int flags = 0; + struct fib_result res; + struct rtable *rth; + int orig_oif; + res.tclassid = 0; res.fi = NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif + res.table = NULL; + + orig_oif = fl4->flowi4_oif; - if (oldflp->fl4_src) { - err = -EINVAL; - if (ipv4_is_multicast(oldflp->fl4_src) || - ipv4_is_lbcast(oldflp->fl4_src) || - ipv4_is_zeronet(oldflp->fl4_src)) + fl4->flowi4_iif = LOOPBACK_IFINDEX; + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (fl4->saddr) { + rth = ERR_PTR(-EINVAL); + if (ipv4_is_multicast(fl4->saddr) || + ipv4_is_lbcast(fl4->saddr) || + ipv4_is_zeronet(fl4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2516,11 +2036,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 && - (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + if (fl4->flowi4_oif == 0 && + (ipv4_is_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); + dev_out = __ip_dev_find(net, fl4->saddr, false); if (dev_out == NULL) goto out; @@ -2539,67 +2059,61 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, Luckily, this hack is good workaround. */ - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; goto make_route; } - if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { + if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) + if (!__ip_dev_find(net, fl4->saddr, false)) goto out; - dev_put(dev_out); - dev_out = NULL; } } - if (oldflp->oif) { - dev_out = dev_get_by_index(net, oldflp->oif); - err = -ENODEV; + if (fl4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); + rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ - if (__in_dev_get_rtnl(dev_out) == NULL) { - dev_put(dev_out); - goto out; /* Wrong error code */ + if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { + rth = ERR_PTR(-ENETUNREACH); + goto out; } - - if (ipv4_is_local_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF)) { - if (!fl.fl4_src) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (ipv4_is_local_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr)) { + if (!fl4->saddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } - if (!fl.fl4_src) { - if (ipv4_is_multicast(oldflp->fl4_dst)) - fl.fl4_src = inet_select_addr(dev_out, 0, - fl.fl4_scope); - else if (!oldflp->fl4_dst) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (!fl4->saddr) { + if (ipv4_is_multicast(fl4->daddr)) + fl4->saddr = inet_select_addr(dev_out, 0, + fl4->flowi4_scope); + else if (!fl4->daddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } - if (!fl.fl4_dst) { - fl.fl4_dst = fl.fl4_src; - if (!fl.fl4_dst) - fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); - if (dev_out) - dev_put(dev_out); + if (!fl4->daddr) { + fl4->daddr = fl4->saddr; + if (!fl4->daddr) + fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; - dev_hold(dev_out); - fl.oif = net->loopback_dev->ifindex; + fl4->flowi4_oif = LOOPBACK_IFINDEX; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(net, &fl, &res)) { + if (fib_lookup(net, fl4, &res)) { res.fi = NULL; - if (oldflp->oif) { + res.table = NULL; + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2618,202 +2132,161 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, likely IPv6, but we do not. */ - if (fl.fl4_src == 0) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (fl4->saddr == 0) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } - if (dev_out) - dev_put(dev_out); - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } - free_res = 1; if (res.type == RTN_LOCAL) { - if (!fl.fl4_src) - fl.fl4_src = fl.fl4_dst; - if (dev_out) - dev_put(dev_out); + if (!fl4->saddr) { + if (res.fi->fib_prefsrc) + fl4->saddr = res.fi->fib_prefsrc; + else + fl4->saddr = fl4->daddr; + } dev_out = net->loopback_dev; - dev_hold(dev_out); - fl.oif = dev_out->ifindex; - if (res.fi) - fib_info_put(res.fi); - res.fi = NULL; + fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl.oif == 0) - fib_select_multipath(&fl, &res); + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) + fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(net, &fl, &res); + if (!res.prefixlen && + res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(&res); - if (!fl.fl4_src) - fl.fl4_src = FIB_RES_PREFSRC(res); + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, res); - if (dev_out) - dev_put(dev_out); dev_out = FIB_RES_DEV(res); - dev_hold(dev_out); - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); +out: + rcu_read_unlock(); + return rth; +} +EXPORT_SYMBOL_GPL(__ip_route_output_key); - if (free_res) - fib_res_put(&res); - if (dev_out) - dev_put(dev_out); -out: return err; +static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; } -int __ip_route_output_key(struct net *net, struct rtable **rp, - const struct flowi *flp) +static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) { - unsigned hash; - struct rtable *rth; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - if (!rt_caching(net)) - goto slow_output; - - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); - - rcu_read_lock_bh(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { - if (rth->fl.fl4_dst == flp->fl4_dst && - rth->fl.fl4_src == flp->fl4_src && - rth->fl.iif == 0 && - rth->fl.oif == flp->oif && - rth->fl.mark == flp->mark && - !((rth->fl.fl4_tos ^ flp->fl4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK)) && - net_eq(dev_net(rth->u.dst.dev), net) && - !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); - RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); - *rp = rth; - return 0; - } - RT_CACHE_STAT_INC(out_hlist_search); - } - rcu_read_unlock_bh(); + return mtu ? : dst->dev->mtu; +} -slow_output: - return ip_route_output_slow(net, rp, flp); +static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ } -EXPORT_SYMBOL_GPL(__ip_route_output_key); +static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) { + return NULL; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), - .destroy = ipv4_dst_destroy, - .check = ipv4_dst_check, + .check = ipv4_blackhole_dst_check, + .mtu = ipv4_blackhole_mtu, + .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, - .entries = ATOMIC_INIT(0), + .redirect = ipv4_rt_blackhole_redirect, + .cow_metrics = ipv4_rt_blackhole_cow_metrics, + .neigh_lookup = ipv4_neigh_lookup, }; - -static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = *rp; - struct rtable *rt = (struct rtable *) - dst_alloc(&ipv4_dst_blackhole_ops); + struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *rt; + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (rt) { - struct dst_entry *new = &rt->u.dst; + struct dst_entry *new = &rt->dst; - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; - new->output = dst_discard; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + new->output = dst_discard_sk; - new->dev = ort->u.dst.dev; + new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); - rt->fl = ort->fl; + rt->rt_is_input = ort->rt_is_input; + rt->rt_iif = ort->rt_iif; + rt->rt_pmtu = ort->rt_pmtu; - rt->idev = ort->idev; - if (rt->idev) - in_dev_hold(rt->idev); - rt->rt_genid = rt_genid(net); + rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_dst = ort->rt_dst; - rt->rt_src = ort->rt_src; - rt->rt_iif = ort->rt_iif; rt->rt_gateway = ort->rt_gateway; - rt->rt_spec_dst = ort->rt_spec_dst; - rt->peer = ort->peer; - if (rt->peer) - atomic_inc(&rt->peer->refcnt); + rt->rt_uses_gateway = ort->rt_uses_gateway; + + INIT_LIST_HEAD(&rt->rt_uncached); dst_free(new); } - dst_release(&(*rp)->u.dst); - *rp = rt; - return (rt ? 0 : -ENOMEM); + dst_release(dst_orig); + + return rt ? &rt->dst : ERR_PTR(-ENOMEM); } -int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, - struct sock *sk, int flags) +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, + struct sock *sk) { - int err; - - if ((err = __ip_route_output_key(net, rp, flp)) != 0) - return err; + struct rtable *rt = __ip_route_output_key(net, flp4); - if (flp->proto) { - if (!flp->fl4_src) - flp->fl4_src = (*rp)->rt_src; - if (!flp->fl4_dst) - flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, - flags ? XFRM_LOOKUP_WAIT : 0); - if (err == -EREMOTE) - err = ipv4_dst_blackhole(net, rp, flp); + if (IS_ERR(rt)) + return rt; - return err; - } + if (flp4->flowi4_proto) + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); - return 0; + return rt; } - EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) -{ - return ip_route_output_flow(net, rp, flp, NULL, 0); -} - -static int rt_fill_info(struct net *net, - struct sk_buff *skb, u32 pid, u32 seq, int event, - int nowait, unsigned int flags) +static int rt_fill_info(struct net *net, __be32 dst, __be32 src, + struct flowi4 *fl4, struct sk_buff *skb, u32 portid, + u32 seq, int event, int nowait, unsigned int flags) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; - long expires; - u32 id = 0, ts = 0, tsage = 0, error; + unsigned long expires = 0; + u32 error; + u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); if (nlh == NULL) return -EMSGSIZE; @@ -2821,9 +2294,10 @@ static int rt_fill_info(struct net *net, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->fl.fl4_tos; + r->rtm_tos = fl4->flowi4_tos; r->rtm_table = RT_TABLE_MAIN; - NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); + if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) + goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; @@ -2831,46 +2305,59 @@ static int rt_fill_info(struct net *net, if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; - NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); - - if (rt->fl.fl4_src) { + if (nla_put_be32(skb, RTA_DST, dst)) + goto nla_put_failure; + if (src) { r->rtm_src_len = 32; - NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); + if (nla_put_be32(skb, RTA_SRC, src)) + goto nla_put_failure; } - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE - if (rt->u.dst.tclassid) - NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); + if (rt->dst.dev && + nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID + if (rt->dst.tclassid && + nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) + goto nla_put_failure; #endif - if (rt->fl.iif) - NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); - else if (rt->rt_src != rt->fl.fl4_src) - NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); + if (!rt_is_input_route(rt) && + fl4->saddr != src) { + if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) + goto nla_put_failure; + } + if (rt->rt_uses_gateway && + nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) + goto nla_put_failure; + + expires = rt->dst.expires; + if (expires) { + unsigned long now = jiffies; - if (rt->rt_dst != rt->rt_gateway) - NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + } - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt_pmtu && expires) + metrics[RTAX_MTU - 1] = rt->rt_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - error = rt->u.dst.error; - expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; - if (rt->peer) { - id = atomic_read(&rt->peer->ip_id_count) & 0xffff; - if (rt->peer->tcp_ts_stamp) { - ts = rt->peer->tcp_ts; - tsage = get_seconds() - rt->peer->tcp_ts_stamp; - } - } + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) + goto nla_put_failure; - if (rt->fl.iif) { -#ifdef CONFIG_IP_MROUTE - __be32 dst = rt->rt_dst; + error = rt->dst.error; + if (rt_is_input_route(rt)) { +#ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, r, nowait); + int err = ipmr_get_route(net, skb, + fl4->saddr, fl4->daddr, + r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) @@ -2884,11 +2371,11 @@ static int rt_fill_info(struct net *net, } } else #endif - NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) + goto nla_put_failure; } - if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, - expires, error) < 0) + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -2898,16 +2385,18 @@ nla_put_failure: return -EMSGSIZE; } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; + struct flowi4 fl4; __be32 dst = 0; __be32 src = 0; u32 iif; int err; + int mark; struct sk_buff *skb; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); @@ -2935,6 +2424,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; + mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst; + fl4.saddr = src; + fl4.flowi4_tos = rtm->rtm_tos; + fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; + fl4.flowi4_mark = mark; if (iif) { struct net_device *dev; @@ -2947,40 +2444,36 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb->protocol = htons(ETH_P_IP); skb->dev = dev; + skb->mark = mark; local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); rt = skb_rtable(skb); - if (err == 0 && rt->u.dst.error) - err = -rt->u.dst.error; + if (err == 0 && rt->dst.error) + err = -rt->dst.error; } else { - struct flowi fl = { - .nl_u = { - .ip4_u = { - .daddr = dst, - .saddr = src, - .tos = rtm->rtm_tos, - }, - }, - .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, - }; - err = ip_route_output_key(net, &rt, &fl); + rt = ip_route_output_key(net, &fl4); + + err = 0; + if (IS_ERR(rt)) + err = PTR_ERR(rt); } if (err) goto errout_free; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + err = rt_fill_info(net, dst, src, &fl4, skb, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; @@ -2989,120 +2482,33 @@ errout_free: goto errout; } -int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct rtable *rt; - int h, s_h; - int idx, s_idx; - struct net *net; - - net = sock_net(skb->sk); - - s_h = cb->args[0]; - if (s_h < 0) - s_h = 0; - s_idx = idx = cb->args[1]; - for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { - if (!rt_hash_table[h].chain) - continue; - rcu_read_lock_bh(); - for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference(rt->u.dst.rt_next), idx++) { - if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) - continue; - if (rt_is_expired(rt)) - continue; - skb_dst_set(skb, dst_clone(&rt->u.dst)); - if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - 1, NLM_F_MULTI) <= 0) { - skb_dst_drop(skb); - rcu_read_unlock_bh(); - goto done; - } - skb_dst_drop(skb); - } - rcu_read_unlock_bh(); - } - -done: - cb->args[0] = h; - cb->args[1] = idx; - return skb->len; -} - void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(dev_net(in_dev->dev), 0); + rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL -static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; +static int ip_rt_gc_interval __read_mostly = 60 * HZ; +static int ip_rt_gc_min_interval __read_mostly = HZ / 2; +static int ip_rt_gc_elasticity __read_mostly = 8; + +static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write) { - int flush_delay; - ctl_table ctl; - struct net *net; - - memcpy(&ctl, __ctl, sizeof(ctl)); - ctl.data = &flush_delay; - proc_dointvec(&ctl, write, buffer, lenp, ppos); + struct net *net = (struct net *)__ctl->extra1; - net = (struct net *)__ctl->extra1; - rt_cache_flush(net, flush_delay); + if (write) { + rt_cache_flush(net); + fnhe_genid_bump(net); return 0; } return -EINVAL; } -static void rt_secret_reschedule(int old) -{ - struct net *net; - int new = ip_rt_secret_interval; - int diff = new - old; - - if (!diff) - return; - - rtnl_lock(); - for_each_net(net) { - int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); - - if (!new) - continue; - - if (deleted) { - long time = net->ipv4.rt_secret_timer.expires - jiffies; - - if (time <= 0 || (time += diff) <= 0) - time = 0; - - net->ipv4.rt_secret_timer.expires = time; - } else - net->ipv4.rt_secret_timer.expires = new; - - net->ipv4.rt_secret_timer.expires += jiffies; - add_timer(&net->ipv4.rt_secret_timer); - } - rtnl_unlock(); -} - -static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); - - rt_secret_reschedule(old); - - return ret; -} - -static ctl_table ipv4_route_table[] = { +static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, @@ -3210,33 +2616,9 @@ static ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "secret_interval", - .data = &ip_rt_secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ipv4_sysctl_rt_secret_interval, - }, - { } -}; - -static struct ctl_table empty[1]; - -static struct ctl_table ipv4_skeleton[] = -{ - { .procname = "route", - .mode = 0555, .child = ipv4_route_table}, - { .procname = "neigh", - .mode = 0555, .child = empty}, { } }; -static __net_initdata struct ctl_path ipv4_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { }, -}; - static struct ctl_table ipv4_route_flush_table[] = { { .procname = "flush", @@ -3247,13 +2629,6 @@ static struct ctl_table ipv4_route_flush_table[] = { { }, }; -static __net_initdata struct ctl_path ipv4_route_path[] = { - { .procname = "net", }, - { .procname = "ipv4", }, - { .procname = "route", }, - { }, -}; - static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; @@ -3263,11 +2638,14 @@ static __net_init int sysctl_route_net_init(struct net *net) tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; } tbl[0].extra1 = net; - net->ipv4.route_hdr = - register_net_sysctl_table(net, ipv4_route_path, tbl); + net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); if (net->ipv4.route_hdr == NULL) goto err_reg; return 0; @@ -3295,56 +2673,59 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { }; #endif - -static __net_init int rt_secret_timer_init(struct net *net) +static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->ipv4.rt_genid, - (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); + atomic_set(&net->ipv4.rt_genid, 0); + atomic_set(&net->fnhe_genid, 0); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); + return 0; +} + +static __net_initdata struct pernet_operations rt_genid_ops = { + .init = rt_genid_init, +}; - net->ipv4.rt_secret_timer.function = rt_secret_rebuild; - net->ipv4.rt_secret_timer.data = (unsigned long)net; - init_timer_deferrable(&net->ipv4.rt_secret_timer); +static int __net_init ipv4_inetpeer_init(struct net *net) +{ + struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires = - jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); - } + if (!bp) + return -ENOMEM; + inet_peer_base_init(bp); + net->ipv4.peers = bp; return 0; } -static __net_exit void rt_secret_timer_exit(struct net *net) +static void __net_exit ipv4_inetpeer_exit(struct net *net) { - del_timer_sync(&net->ipv4.rt_secret_timer); + struct inet_peer_base *bp = net->ipv4.peers; + + net->ipv4.peers = NULL; + inetpeer_invalidate_tree(bp); + kfree(bp); } -static __net_initdata struct pernet_operations rt_secret_timer_ops = { - .init = rt_secret_timer_init, - .exit = rt_secret_timer_exit, +static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { + .init = ipv4_inetpeer_init, + .exit = ipv4_inetpeer_exit, }; - -#ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ - -static __initdata unsigned long rhash_entries; -static int __init set_rhash_entries(char *str) -{ - if (!str) - return 0; - rhash_entries = simple_strtoul(str, &str, 0); - return 1; -} -__setup("rhash_entries=", set_rhash_entries); +#ifdef CONFIG_IP_ROUTE_CLASSID +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; +#endif /* CONFIG_IP_ROUTE_CLASSID */ int __init ip_rt_init(void) { int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE + ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + if (!ip_idents) + panic("IP: failed to allocate ip_idents\n"); + + prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + +#ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); @@ -3356,47 +2737,31 @@ int __init ip_rt_init(void) ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; - rt_hash_table = (struct rt_hash_bucket *) - alloc_large_system_hash("IP route cache", - sizeof(struct rt_hash_bucket), - rhash_entries, - (totalram_pages >= 128 * 1024) ? - 15 : 17, - 0, - &rt_hash_log, - &rt_hash_mask, - rhash_entries ? 0 : 512 * 1024); - memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); - rt_hash_lock_init(); - - ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); - ip_rt_max_size = (rt_hash_mask + 1) * 16; + if (dst_entries_init(&ipv4_dst_ops) < 0) + panic("IP: failed to allocate ipv4_dst_ops counter\n"); - devinet_init(); - ip_fib_init(); + if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) + panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); + ipv4_dst_ops.gc_thresh = ~0; + ip_rt_max_size = INT_MAX; - if (register_pernet_subsys(&rt_secret_timer_ops)) - printk(KERN_ERR "Unable to setup rt_secret_timer\n"); + devinet_init(); + ip_fib_init(); if (ip_rt_proc_init()) - printk(KERN_ERR "Unable to create route proc files\n"); + pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); - xfrm4_init(ip_rt_max_size); + xfrm4_init(); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif + register_pernet_subsys(&rt_genid_ops); + register_pernet_subsys(&ipv4_inetpeer_ops); return rc; } @@ -3407,10 +2772,6 @@ int __init ip_rt_init(void) */ void __init ip_static_sysctl_init(void) { - register_sysctl_paths(ipv4_path, ipv4_skeleton); + register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif - -EXPORT_SYMBOL(__ip_select_ident); -EXPORT_SYMBOL(ip_route_input); -EXPORT_SYMBOL(ip_route_output_key); |
