diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4/route.c |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- | net/ipv4/route.c | 3177 |
1 files changed, 3177 insertions, 0 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c new file mode 100644 index 00000000000..9f91a116d91 --- /dev/null +++ b/net/ipv4/route.c @@ -0,0 +1,3177 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * ROUTE - implementation of the IP router. + * + * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Linus Torvalds, <Linus.Torvalds@helsinki.fi> + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * Alan Cox : Verify area fixes. + * Alan Cox : cli() protects routing changes + * Rui Oliveira : ICMP routing table updates + * (rco@di.uminho.pt) Routing table insertion and update + * Linus Torvalds : Rewrote bits to be sensible + * Alan Cox : Added BSD route gw semantics + * Alan Cox : Super /proc >4K + * Alan Cox : MTU in route table + * Alan Cox : MSS actually. Also added the window + * clamper. + * Sam Lantinga : Fixed route matching in rt_del() + * Alan Cox : Routing cache support. + * Alan Cox : Removed compatibility cruft. + * Alan Cox : RTF_REJECT support. + * Alan Cox : TCP irtt support. + * Jonathan Naylor : Added Metric support. + * Miquel van Smoorenburg : BSD API fixes. + * Miquel van Smoorenburg : Metrics. + * Alan Cox : Use __u32 properly + * Alan Cox : Aligned routing errors more closely with BSD + * our system is still very different. + * Alan Cox : Faster /proc handling + * Alexey Kuznetsov : Massive rework to support tree based routing, + * routing caches and better behaviour. + * + * Olaf Erb : irtt wasn't being copied right. + * Bjorn Ekwall : Kerneld route support. + * Alan Cox : Multicast fixed (I hope) + * Pavel Krauz : Limited broadcast fixed + * Mike McLagan : Routing by source + * Alexey Kuznetsov : End of old history. Split to fib.c and + * route.c and rewritten from scratch. + * Andi Kleen : Load-limit warning messages. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Vitaly E. Lavrov : Race condition in ip_route_input_slow. + * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. + * Vladimir V. Ivanov : IP rule info (flowid) is really useful. + * Marc Boucher : routing by fwmark + * Robert Olsson : Added rt_cache statistics + * Arnaldo C. Melo : Convert proc stuff to seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/pkt_sched.h> +#include <linux/mroute.h> +#include <linux/netfilter_ipv4.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/rcupdate.h> +#include <linux/times.h> +#include <net/protocol.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/inetpeer.h> +#include <net/sock.h> +#include <net/ip_fib.h> +#include <net/arp.h> +#include <net/tcp.h> +#include <net/icmp.h> +#include <net/xfrm.h> +#include <net/ip_mp_alg.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#define RT_FL_TOS(oldflp) \ + ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) + +#define IP_MAX_MTU 0xFFF0 + +#define RT_GC_TIMEOUT (300*HZ) + +static int ip_rt_min_delay = 2 * HZ; +static int ip_rt_max_delay = 10 * HZ; +static int ip_rt_max_size; +static int ip_rt_gc_timeout = RT_GC_TIMEOUT; +static int ip_rt_gc_interval = 60 * HZ; +static int ip_rt_gc_min_interval = HZ / 2; +static int ip_rt_redirect_number = 9; +static int ip_rt_redirect_load = HZ / 50; +static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1)); +static int ip_rt_error_cost = HZ; +static int ip_rt_error_burst = 5 * HZ; +static int ip_rt_gc_elasticity = 8; +static int ip_rt_mtu_expires = 10 * 60 * HZ; +static int ip_rt_min_pmtu = 512 + 20 + 20; +static int ip_rt_min_advmss = 256; +static int ip_rt_secret_interval = 10 * 60 * HZ; +static unsigned long rt_deadline; + +#define RTprint(a...) printk(KERN_DEBUG a) + +static struct timer_list rt_flush_timer; +static struct timer_list rt_periodic_timer; +static struct timer_list rt_secret_timer; + +/* + * Interface to generic destination cache. + */ + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +static void ipv4_dst_destroy(struct dst_entry *dst); +static void ipv4_dst_ifdown(struct dst_entry *dst, + struct net_device *dev, int how); +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_link_failure(struct sk_buff *skb); +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static int rt_garbage_collect(void); + + +static struct dst_ops ipv4_dst_ops = { + .family = AF_INET, + .protocol = __constant_htons(ETH_P_IP), + .gc = rt_garbage_collect, + .check = ipv4_dst_check, + .destroy = ipv4_dst_destroy, + .ifdown = ipv4_dst_ifdown, + .negative_advice = ipv4_negative_advice, + .link_failure = ipv4_link_failure, + .update_pmtu = ip_rt_update_pmtu, + .entry_size = sizeof(struct rtable), +}; + +#define ECN_OR_COST(class) TC_PRIO_##class + +__u8 ip_tos2prio[16] = { + TC_PRIO_BESTEFFORT, + ECN_OR_COST(FILLER), + TC_PRIO_BESTEFFORT, + ECN_OR_COST(BESTEFFORT), + TC_PRIO_BULK, + ECN_OR_COST(BULK), + TC_PRIO_BULK, + ECN_OR_COST(BULK), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK) +}; + + +/* + * Route cache. + */ + +/* The locking scheme is rather straight forward: + * + * 1) Read-Copy Update protects the buckets of the central route hash. + * 2) Only writers remove entries, and they hold the lock + * as they look at rtable reference counts. + * 3) Only readers acquire references to rtable entries, + * they do so with atomic increments and with the + * lock held. + */ + +struct rt_hash_bucket { + struct rtable *chain; + spinlock_t lock; +} __attribute__((__aligned__(8))); + +static struct rt_hash_bucket *rt_hash_table; +static unsigned rt_hash_mask; +static int rt_hash_log; +static unsigned int rt_hash_rnd; + +struct rt_cache_stat *rt_cache_stat; + +static int rt_intern_hash(unsigned hash, struct rtable *rth, + struct rtable **res); + +static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos) +{ + return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd) + & rt_hash_mask); +} + +#ifdef CONFIG_PROC_FS +struct rt_cache_iter_state { + int bucket; +}; + +static struct rtable *rt_cache_get_first(struct seq_file *seq) +{ + struct rtable *r = NULL; + struct rt_cache_iter_state *st = seq->private; + + for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + if (r) + break; + rcu_read_unlock_bh(); + } + return r; +} + +static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r) +{ + struct rt_cache_iter_state *st = rcu_dereference(seq->private); + + r = r->u.rt_next; + while (!r) { + rcu_read_unlock_bh(); + if (--st->bucket < 0) + break; + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + } + return r; +} + +static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) +{ + struct rtable *r = rt_cache_get_first(seq); + + if (r) + while (pos && (r = rt_cache_get_next(seq, r))) + --pos; + return pos ? NULL : r; +} + +static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct rtable *r = NULL; + + if (v == SEQ_START_TOKEN) + r = rt_cache_get_first(seq); + else + r = rt_cache_get_next(seq, v); + ++*pos; + return r; +} + +static void rt_cache_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) + rcu_read_unlock_bh(); +} + +static int rt_cache_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" + "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" + "HHUptod\tSpecDst"); + else { + struct rtable *r = v; + char temp[256]; + + sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" + "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", + r->u.dst.dev ? r->u.dst.dev->name : "*", + (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + r->rt_flags, atomic_read(&r->u.dst.__refcnt), + r->u.dst.__use, 0, (unsigned long)r->rt_src, + (dst_metric(&r->u.dst, RTAX_ADVMSS) ? + (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), + dst_metric(&r->u.dst, RTAX_WINDOW), + (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + + dst_metric(&r->u.dst, RTAX_RTTVAR)), + r->fl.fl4_tos, + r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, + r->u.dst.hh ? (r->u.dst.hh->hh_output == + dev_queue_xmit) : 0, + r->rt_spec_dst); + seq_printf(seq, "%-127s\n", temp); + } + return 0; +} + +static struct seq_operations rt_cache_seq_ops = { + .start = rt_cache_seq_start, + .next = rt_cache_seq_next, + .stop = rt_cache_seq_stop, + .show = rt_cache_seq_show, +}; + +static int rt_cache_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &rt_cache_seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations rt_cache_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + + +static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(rt_cache_stat, cpu); + } + return NULL; +} + +static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(rt_cache_stat, cpu); + } + return NULL; + +} + +static void rt_cpu_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int rt_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct rt_cache_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); + return 0; + } + + seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " + " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", + atomic_read(&ipv4_dst_ops.entries), + st->in_hit, + st->in_slow_tot, + st->in_slow_mc, + st->in_no_route, + st->in_brd, + st->in_martian_dst, + st->in_martian_src, + + st->out_hit, + st->out_slow_tot, + st->out_slow_mc, + + st->gc_total, + st->gc_ignored, + st->gc_goal_miss, + st->gc_dst_overflow, + st->in_hlist_search, + st->out_hlist_search + ); + return 0; +} + +static struct seq_operations rt_cpu_seq_ops = { + .start = rt_cpu_seq_start, + .next = rt_cpu_seq_next, + .stop = rt_cpu_seq_stop, + .show = rt_cpu_seq_show, +}; + + +static int rt_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rt_cpu_seq_ops); +} + +static struct file_operations rt_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static __inline__ void rt_free(struct rtable *rt) +{ + multipath_remove(rt); + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static __inline__ void rt_drop(struct rtable *rt) +{ + multipath_remove(rt); + ip_rt_put(rt); + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static __inline__ int rt_fast_clean(struct rtable *rth) +{ + /* Kill broadcast/multicast entries very aggresively, if they + collide in hash table with more useful entries */ + return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && + rth->fl.iif && rth->u.rt_next; +} + +static __inline__ int rt_valuable(struct rtable *rth) +{ + return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || + rth->u.dst.expires; +} + +static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +{ + unsigned long age; + int ret = 0; + + if (atomic_read(&rth->u.dst.__refcnt)) + goto out; + + ret = 1; + if (rth->u.dst.expires && + time_after_eq(jiffies, rth->u.dst.expires)) + goto out; + + age = jiffies - rth->u.dst.lastuse; + ret = 0; + if ((age <= tmo1 && !rt_fast_clean(rth)) || + (age <= tmo2 && rt_valuable(rth))) + goto out; + ret = 1; +out: return ret; +} + +/* Bits of score are: + * 31: very valuable + * 30: not quite useless + * 29..0: usage counter + */ +static inline u32 rt_score(struct rtable *rt) +{ + u32 score = jiffies - rt->u.dst.lastuse; + + score = ~score & ~(3<<30); + + if (rt_valuable(rt)) + score |= (1<<31); + + if (!rt->fl.iif || + !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) + score |= (1<<30); + + return score; +} + +static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +{ + return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && + fl1->oif == fl2->oif && + fl1->iif == fl2->iif; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED +static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, + struct rtable *expentry, + int *removed_count) +{ + int passedexpired = 0; + struct rtable **nextstep = NULL; + struct rtable **rthp = chain_head; + struct rtable *rth; + + if (removed_count) + *removed_count = 0; + + while ((rth = *rthp) != NULL) { + if (rth == expentry) + passedexpired = 1; + + if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 && + compare_keys(&(*rthp)->fl, &expentry->fl)) { + if (*rthp == expentry) { + *rthp = rth->u.rt_next; + continue; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + if (removed_count) + ++(*removed_count); + } + } else { + if (!((*rthp)->u.dst.flags & DST_BALANCED) && + passedexpired && !nextstep) + nextstep = &rth->u.rt_next; + + rthp = &rth->u.rt_next; + } + } + + rt_free(expentry); + if (removed_count) + ++(*removed_count); + + return nextstep; +} +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + + +/* This runs via a timer and thus is always in BH context. */ +static void rt_check_expire(unsigned long dummy) +{ + static int rover; + int i = rover, t; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + + for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; + t -= ip_rt_gc_timeout) { + unsigned long tmo = ip_rt_gc_timeout; + + i = (i + 1) & rt_hash_mask; + rthp = &rt_hash_table[i].chain; + + spin_lock(&rt_hash_table[i].lock); + while ((rth = *rthp) != NULL) { + if (rth->u.dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(now, rth->u.dst.expires)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + + /* Cleanup aged off entries. */ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + /* remove all related balanced entries if necessary */ + if (rth->u.dst.flags & DST_BALANCED) { + rthp = rt_remove_balanced_route( + &rt_hash_table[i].chain, + rth, NULL); + if (!rthp) + break; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + *rthp = rth->u.rt_next; + rt_free(rth); +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + } + spin_unlock(&rt_hash_table[i].lock); + + /* Fallback loop breaker. */ + if (time_after(jiffies, now)) + break; + } + rover = i; + mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); +} + +/* This can run from both BH and non-BH contexts, the latter + * in the case of a forced flush event. + */ +static void rt_run_flush(unsigned long dummy) +{ + int i; + struct rtable *rth, *next; + + rt_deadline = 0; + + get_random_bytes(&rt_hash_rnd, 4); + + for (i = rt_hash_mask; i >= 0; i--) { + spin_lock_bh(&rt_hash_table[i].lock); + rth = rt_hash_table[i].chain; + if (rth) + rt_hash_table[i].chain = NULL; + spin_unlock_bh(&rt_hash_table[i].lock); + + for (; rth; rth = next) { + next = rth->u.rt_next; + rt_free(rth); + } + } +} + +static DEFINE_SPINLOCK(rt_flush_lock); + +void rt_cache_flush(int delay) +{ + unsigned long now = jiffies; + int user_mode = !in_softirq(); + + if (delay < 0) + delay = ip_rt_min_delay; + + /* flush existing multipath state*/ + multipath_flush(); + + spin_lock_bh(&rt_flush_lock); + + if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { + long tmo = (long)(rt_deadline - now); + + /* If flush timer is already running + and flush request is not immediate (delay > 0): + + if deadline is not achieved, prolongate timer to "delay", + otherwise fire it at deadline time. + */ + + if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) + tmo = 0; + + if (delay > tmo) + delay = tmo; + } + + if (delay <= 0) { + spin_unlock_bh(&rt_flush_lock); + rt_run_flush(0); + return; + } + + if (rt_deadline == 0) + rt_deadline = now + ip_rt_max_delay; + + mod_timer(&rt_flush_timer, now+delay); + spin_unlock_bh(&rt_flush_lock); +} + +static void rt_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + + rt_cache_flush(0); + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); +} + +/* + Short description of GC goals. + + We want to build algorithm, which will keep routing cache + at some equilibrium point, when number of aged off entries + is kept approximately equal to newly generated ones. + + Current expiration strength is variable "expire". + We try to adjust it dynamically, so that if networking + is idle expires is large enough to keep enough of warm entries, + and when load increases it reduces to limit cache size. + */ + +static int rt_garbage_collect(void) +{ + static unsigned long expire = RT_GC_TIMEOUT; + static unsigned long last_gc; + static int rover; + static int equilibrium; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + int goal; + + /* + * Garbage collection is pretty expensive, + * do not make it too frequently. + */ + + RT_CACHE_STAT_INC(gc_total); + + if (now - last_gc < ip_rt_gc_min_interval && + atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { + RT_CACHE_STAT_INC(gc_ignored); + goto out; + } + + /* Calculate number of entries, which we want to expire now. */ + goal = atomic_read(&ipv4_dst_ops.entries) - + (ip_rt_gc_elasticity << rt_hash_log); + if (goal <= 0) { + if (equilibrium < ipv4_dst_ops.gc_thresh) + equilibrium = ipv4_dst_ops.gc_thresh; + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + if (goal > 0) { + equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + } + } else { + /* We are in dangerous area. Try to reduce cache really + * aggressively. + */ + goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); + equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + } + + if (now - last_gc >= ip_rt_gc_min_interval) + last_gc = now; + + if (goal <= 0) { + equilibrium += goal; + goto work_done; + } + + do { + int i, k; + + for (i = rt_hash_mask, k = rover; i >= 0; i--) { + unsigned long tmo = expire; + + k = (k + 1) & rt_hash_mask; + rthp = &rt_hash_table[k].chain; + spin_lock_bh(&rt_hash_table[k].lock); + while ((rth = *rthp) != NULL) { + if (!rt_may_expire(rth, tmo, expire)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + /* remove all related balanced entries + * if necessary + */ + if (rth->u.dst.flags & DST_BALANCED) { + int r; + + rthp = rt_remove_balanced_route( + &rt_hash_table[i].chain, + rth, + &r); + goal -= r; + if (!rthp) + break; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + goal--; + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + *rthp = rth->u.rt_next; + rt_free(rth); + goal--; +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + } + spin_unlock_bh(&rt_hash_table[k].lock); + if (goal <= 0) + break; + } + rover = k; + + if (goal <= 0) + goto work_done; + + /* Goal is not achieved. We stop process if: + + - if expire reduced to zero. Otherwise, expire is halfed. + - if table is not full. + - if we are called from interrupt. + - jiffies check is just fallback/debug loop breaker. + We will not spin here for long time in any case. + */ + + RT_CACHE_STAT_INC(gc_goal_miss); + + if (expire == 0) + break; + + expire >>= 1; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, + atomic_read(&ipv4_dst_ops.entries), goal, i); +#endif + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + goto out; + } while (!in_softirq() && time_before_eq(jiffies, now)); + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + goto out; + if (net_ratelimit()) + printk(KERN_WARNING "dst cache overflow\n"); + RT_CACHE_STAT_INC(gc_dst_overflow); + return 1; + +work_done: + expire += ip_rt_gc_min_interval; + if (expire > ip_rt_gc_timeout || + atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) + expire = ip_rt_gc_timeout; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, + atomic_read(&ipv4_dst_ops.entries), goal, rover); +#endif +out: return 0; +} + +static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +{ + struct rtable *rth, **rthp; + unsigned long now; + struct rtable *cand, **candp; + u32 min_score; + int chain_length; + int attempts = !in_softirq(); + +restart: + chain_length = 0; + min_score = ~(u32)0; + cand = NULL; + candp = NULL; + now = jiffies; + + rthp = &rt_hash_table[hash].chain; + + spin_lock_bh(&rt_hash_table[hash].lock); + while ((rth = *rthp) != NULL) { +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (!(rth->u.dst.flags & DST_BALANCED) && + compare_keys(&rth->fl, &rt->fl)) { +#else + if (compare_keys(&rth->fl, &rt->fl)) { +#endif + /* Put it first */ + *rthp = rth->u.rt_next; + /* + * Since lookup is lockfree, the deletion + * must be visible to another weakly ordered CPU before + * the insertion at the start of the hash chain. + */ + rcu_assign_pointer(rth->u.rt_next, + rt_hash_table[hash].chain); + /* + * Since lookup is lockfree, the update writes + * must be ordered for consistency on SMP. + */ + rcu_assign_pointer(rt_hash_table[hash].chain, rth); + + rth->u.dst.__use++; + dst_hold(&rth->u.dst); + rth->u.dst.lastuse = now; + spin_unlock_bh(&rt_hash_table[hash].lock); + + rt_drop(rt); + *rp = rth; + return 0; + } + + if (!atomic_read(&rth->u.dst.__refcnt)) { + u32 score = rt_score(rth); + + if (score <= min_score) { + cand = rth; + candp = rthp; + min_score = score; + } + } + + chain_length++; + + rthp = &rth->u.rt_next; + } + + if (cand) { + /* ip_rt_gc_elasticity used to be average length of chain + * length, when exceeded gc becomes really aggressive. + * + * The second limit is less certain. At the moment it allows + * only 2 entries per bucket. We will see. + */ + if (chain_length > ip_rt_gc_elasticity) { + *candp = cand->u.rt_next; + rt_free(cand); + } + } + + /* Try to bind route to arp only if it is output + route or unicast forwarding path. + */ + if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { + int err = arp_bind_neighbour(&rt->u.dst); + if (err) { + spin_unlock_bh(&rt_hash_table[hash].lock); + + if (err != -ENOBUFS) { + rt_drop(rt); + return err; + } + + /* Neighbour tables are full and nothing + can be released. Try to shrink route cache, + it is most likely it holds some neighbour records. + */ + if (attempts-- > 0) { + int saved_elasticity = ip_rt_gc_elasticity; + int saved_int = ip_rt_gc_min_interval; + ip_rt_gc_elasticity = 1; + ip_rt_gc_min_interval = 0; + rt_garbage_collect(); + ip_rt_gc_min_interval = saved_int; + ip_rt_gc_elasticity = saved_elasticity; + goto restart; + } + + if (net_ratelimit()) + printk(KERN_WARNING "Neighbour table overflow.\n"); + rt_drop(rt); + return -ENOBUFS; + } + } + + rt->u.rt_next = rt_hash_table[hash].chain; +#if RT_CACHE_DEBUG >= 2 + if (rt->u.rt_next) { + struct rtable *trt; + printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, + NIPQUAD(rt->rt_dst)); + for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next) + printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); + printk("\n"); + } +#endif + rt_hash_table[hash].chain = rt; + spin_unlock_bh(&rt_hash_table[hash].lock); + *rp = rt; + return 0; +} + +void rt_bind_peer(struct rtable *rt, int create) +{ + static DEFINE_SPINLOCK(rt_peer_lock); + struct inet_peer *peer; + + peer = inet_getpeer(rt->rt_dst, create); + + spin_lock_bh(&rt_peer_lock); + if (rt->peer == NULL) { + rt->peer = peer; + peer = NULL; + } + spin_unlock_bh(&rt_peer_lock); + if (peer) + inet_putpeer(peer); +} + +/* + * Peer allocation may fail only in serious out-of-memory conditions. However + * we still can generate some output. + * Random ID selection looks a bit dangerous because we have no chances to + * select ID being unique in a reasonable period of time. + * But broken packet identifier may be better than no packet at all. + */ +static void ip_select_fb_ident(struct iphdr *iph) +{ + static DEFINE_SPINLOCK(ip_fb_id_lock); + static u32 ip_fallback_id; + u32 salt; + + spin_lock_bh(&ip_fb_id_lock); + salt = secure_ip_id(ip_fallback_id ^ iph->daddr); + iph->id = htons(salt & 0xFFFF); + ip_fallback_id = salt; + spin_unlock_bh(&ip_fb_id_lock); +} + +void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +{ + struct rtable *rt = (struct rtable *) dst; + + if (rt) { + if (rt->peer == NULL) + rt_bind_peer(rt, 1); + + /* If peer is attached to destination, it is never detached, + so that we need not to grab a lock to dereference it. + */ + if (rt->peer) { + iph->id = htons(inet_getid(rt->peer, more)); + return; + } + } else + printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph)); + + ip_select_fb_ident(iph); +} + +static void rt_del(unsigned hash, struct rtable *rt) +{ + struct rtable **rthp; + + spin_lock_bh(&rt_hash_table[hash].lock); + ip_rt_put(rt); + for (rthp = &rt_hash_table[hash].chain; *rthp; + rthp = &(*rthp)->u.rt_next) + if (*rthp == rt) { + *rthp = rt->u.rt_next; + rt_free(rt); + break; + } + spin_unlock_bh(&rt_hash_table[hash].lock); +} + +void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, + u32 saddr, u8 tos, struct net_device *dev) +{ + int i, k; + struct in_device *in_dev = in_dev_get(dev); + struct rtable *rth, **rthp; + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; + + tos &= IPTOS_RT_MASK; + + if (!in_dev) + return; + + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) + || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) + goto reject_redirect; + + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (!inet_addr_onlink(in_dev, new_gw, old_gw)) + goto reject_redirect; + if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) + goto reject_redirect; + } else { + if (inet_addr_type(new_gw) != RTN_UNICAST) + goto reject_redirect; + } + + for (i = 0; i < 2; i++) { + for (k = 0; k < 2; k++) { + unsigned hash = rt_hash_code(daddr, + skeys[i] ^ (ikeys[k] << 5), + tos); + + rthp=&rt_hash_table[hash].chain; + + rcu_read_lock(); + while ((rth = rcu_dereference(*rthp)) != NULL) { + struct rtable *rt; + + if (rth->fl.fl4_dst != daddr || + rth->fl.fl4_src != skeys[i] || + rth->fl.fl4_tos != tos || + rth->fl.oif != ikeys[k] || + rth->fl.iif != 0) { + rthp = &rth->u.rt_next; + continue; + } + + if (rth->rt_dst != daddr || + rth->rt_src != saddr || + rth->u.dst.error || + rth->rt_gateway != old_gw || + rth->u.dst.dev != dev) + break; + + dst_hold(&rth->u.dst); + rcu_read_unlock(); + + rt = dst_alloc(&ipv4_dst_ops); + if (rt == NULL) { + ip_rt_put(rth); + in_dev_put(in_dev); + return; + } + + /* Copy all the information. */ + *rt = *rth; + INIT_RCU_HEAD(&rt->u.dst.rcu_head); + rt->u.dst.__use = 1; + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.child = NULL; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + if (rt->idev) + in_dev_hold(rt->idev); + rt->u.dst.obsolete = 0; + rt->u.dst.lastuse = jiffies; + rt->u.dst.path = &rt->u.dst; + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + rt->u.dst.xfrm = NULL; + + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... */ + rt->rt_gateway = new_gw; + + /* Redirect received -> path was valid */ + dst_confirm(&rth->u.dst); + + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + + if (arp_bind_neighbour(&rt->u.dst) || + !(rt->u.dst.neighbour->nud_state & + NUD_VALID)) { + if (rt->u.dst.neighbour) + neigh_event_send(rt->u.dst.neighbour, NULL); + ip_rt_put(rth); + rt_drop(rt); + goto do_next; + } + + rt_del(hash, rth); + if (!rt_intern_hash(hash, rt, &rt)) + ip_rt_put(rt); + goto do_next; + } + rcu_read_unlock(); + do_next: + ; + } + } + in_dev_put(in_dev); + return; + +reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_rateli |