diff options
Diffstat (limited to 'net/ipv4/inetpeer.c')
| -rw-r--r-- | net/ipv4/inetpeer.c | 430 |
1 files changed, 193 insertions, 237 deletions
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index dd1b20eca1a..bd5f5928167 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -17,27 +17,16 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> +#include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> +#include <net/secure_seq.h> /* * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. - * At this moment this information consists only of ID field for the next - * outgoing IP packet. This field is incremented with each packet as encoded - * in inet_getid() function (include/net/inetpeer.h). - * At the moment of writing this notes identifier of IP packets is generated - * to be unpredictable using this code only for packets subjected - * (actually or potentially) to defragmentation. I.e. DF packets less than - * PMTU in size uses a constant ID and do not use this code (see - * ip_select_ident() in include/net/ip.h). * - * Route cache entries hold references to our nodes. - * New cache entries get references via lookup by destination IP address in - * the avl tree. The reference is grabbed only when it's needed i.e. only - * when we try to output IP packet which needs an unpredictable ID (see - * __ip_select_ident() in net/ipv4/route.c). * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can @@ -54,21 +43,21 @@ * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. - * 3. Nodes appears and disappears from unused node list only under - * "inet_peer_unused_lock". - * 4. Global variable peer_total is modified under the pool lock. - * 5. struct inet_peer fields modification: + * 3. Global variable peer_total is modified under the pool lock. + * 4. struct inet_peer fields modification: * avl_left, avl_right, avl_parent, avl_height: pool lock - * unused: unused node list lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing - * dtime: unused node list lock * daddr: unchangeable - * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; +static LIST_HEAD(gc_list); +static const int gc_delay = 60 * HZ; +static struct delayed_work gc_work; +static DEFINE_SPINLOCK(gc_lock); + #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) @@ -79,23 +68,32 @@ static const struct inet_peer peer_fake_node = { .avl_height = 0 }; -struct inet_peer_base { - struct inet_peer __rcu *root; - seqlock_t lock; - int total; -}; +void inet_peer_base_init(struct inet_peer_base *bp) +{ + bp->root = peer_avl_empty_rcu; + seqlock_init(&bp->lock); + bp->flush_seq = ~0U; + bp->total = 0; +} +EXPORT_SYMBOL_GPL(inet_peer_base_init); -static struct inet_peer_base v4_peers = { - .root = peer_avl_empty_rcu, - .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), - .total = 0, -}; +static atomic_t v4_seq = ATOMIC_INIT(0); +static atomic_t v6_seq = ATOMIC_INIT(0); -static struct inet_peer_base v6_peers = { - .root = peer_avl_empty_rcu, - .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), - .total = 0, -}; +static atomic_t *inetpeer_seq_ptr(int family) +{ + return (family == AF_INET ? &v4_seq : &v6_seq); +} + +static inline void flush_check(struct inet_peer_base *base, int family) +{ + atomic_t *fp = inetpeer_seq_ptr(family); + + if (unlikely(base->flush_seq != atomic_read(fp))) { + inetpeer_invalidate_tree(base); + base->flush_seq = atomic_read(fp); + } +} #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ @@ -104,20 +102,53 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ -int inet_peer_gc_mintime __read_mostly = 10 * HZ; -int inet_peer_gc_maxtime __read_mostly = 120 * HZ; - -static struct { - struct list_head list; - spinlock_t lock; -} unused_peers = { - .list = LIST_HEAD_INIT(unused_peers.list), - .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock), -}; -static void peer_check_expire(unsigned long dummy); -static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); +static void inetpeer_gc_worker(struct work_struct *work) +{ + struct inet_peer *p, *n, *c; + struct list_head list; + + spin_lock_bh(&gc_lock); + list_replace_init(&gc_list, &list); + spin_unlock_bh(&gc_lock); + + if (list_empty(&list)) + return; + + list_for_each_entry_safe(p, n, &list, gc_list) { + + if (need_resched()) + cond_resched(); + + c = rcu_dereference_protected(p->avl_left, 1); + if (c != peer_avl_empty) { + list_add_tail(&c->gc_list, &list); + p->avl_left = peer_avl_empty_rcu; + } + + c = rcu_dereference_protected(p->avl_right, 1); + if (c != peer_avl_empty) { + list_add_tail(&c->gc_list, &list); + p->avl_right = peer_avl_empty_rcu; + } + + n = list_entry(p->gc_list.next, struct inet_peer, gc_list); + + if (!atomic_read(&p->refcnt)) { + list_del(&p->gc_list); + kmem_cache_free(peer_cachep, p); + } + } + if (list_empty(&list)) + return; + + spin_lock_bh(&gc_lock); + list_splice(&list, &gc_list); + spin_unlock_bh(&gc_lock); + + schedule_delayed_work(&gc_work, gc_delay); +} /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) @@ -142,23 +173,7 @@ void __init inet_initpeers(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - peer_periodic_timer.expires = jiffies - + net_random() % inet_peer_gc_maxtime - + inet_peer_gc_maxtime; - add_timer(&peer_periodic_timer); -} - -/* Called with or without local BH being disabled. */ -static void unlink_from_unused(struct inet_peer *p) -{ - if (!list_empty(&p->unused)) { - spin_lock_bh(&unused_peers.lock); - list_del_init(&p->unused); - spin_unlock_bh(&unused_peers.lock); - } + INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } static int addr_compare(const struct inetpeer_addr *a, @@ -169,7 +184,7 @@ static int addr_compare(const struct inetpeer_addr *a, for (i = 0; i < n; i++) { if (a->addr.a6[i] == b->addr.a6[i]) continue; - if (a->addr.a6[i] < b->addr.a6[i]) + if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) return -1; return 1; } @@ -191,7 +206,7 @@ static int addr_compare(const struct inetpeer_addr *a, stackptr = _stack; \ *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty; ) { \ + u != peer_avl_empty;) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ @@ -222,11 +237,9 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, int cmp = addr_compare(daddr, &u->daddr); if (cmp == 0) { /* Before taking a reference, check if this entry was - * deleted, unlink_from_pool() sets refcnt=-1 to make - * distinction between an unused entry (refcnt=0) and - * a freed one. + * deleted (refcnt=-1) */ - if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) + if (!atomic_add_unless(&u->refcnt, 1, -1)) u = NULL; return u; } @@ -248,7 +261,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu; ) { \ + u->avl_right != peer_avl_empty_rcu;) { \ v = &u->avl_right; \ *stackptr++ = v; \ u = rcu_deref_locked(*v, base); \ @@ -353,121 +366,87 @@ static void inetpeer_free_rcu(struct rcu_head *head) kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } -/* May be called with local BH enabled. */ -static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) +static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, + struct inet_peer __rcu **stack[PEER_MAXDEPTH]) { - int do_free; - - do_free = 0; - - write_seqlock_bh(&base->lock); - /* Check the reference counter. It was artificially incremented by 1 - * in cleanup() function to prevent sudden disappearing. If we can - * atomically (because of lockless readers) take this last reference, - * it's safe to remove the node and free it later. - * We use refcnt=-1 to alert lockless readers this entry is deleted. - */ - if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { - struct inet_peer __rcu **stack[PEER_MAXDEPTH]; - struct inet_peer __rcu ***stackptr, ***delp; - if (lookup(&p->daddr, stack, base) != p) - BUG(); - delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty_rcu) { - *delp[0] = p->avl_right; - --stackptr; - } else { - /* look for a node to insert instead of p */ - struct inet_peer *t; - t = lookup_rightempty(p, base); - BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); - **--stackptr = t->avl_left; - /* t is removed, t->daddr > x->daddr for any - * x in p->avl_left subtree. - * Put t in the old place of p. */ - RCU_INIT_POINTER(*delp[0], t); - t->avl_left = p->avl_left; - t->avl_right = p->avl_right; - t->avl_height = p->avl_height; - BUG_ON(delp[1] != &p->avl_left); - delp[1] = &t->avl_left; /* was &p->avl_left */ - } - peer_avl_rebalance(stack, stackptr, base); - base->total--; - do_free = 1; + struct inet_peer __rcu ***stackptr, ***delp; + + if (lookup(&p->daddr, stack, base) != p) + BUG(); + delp = stackptr - 1; /* *delp[0] == p */ + if (p->avl_left == peer_avl_empty_rcu) { + *delp[0] = p->avl_right; + --stackptr; + } else { + /* look for a node to insert instead of p */ + struct inet_peer *t; + t = lookup_rightempty(p, base); + BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); + **--stackptr = t->avl_left; + /* t is removed, t->daddr > x->daddr for any + * x in p->avl_left subtree. + * Put t in the old place of p. */ + RCU_INIT_POINTER(*delp[0], t); + t->avl_left = p->avl_left; + t->avl_right = p->avl_right; + t->avl_height = p->avl_height; + BUG_ON(delp[1] != &p->avl_left); + delp[1] = &t->avl_left; /* was &p->avl_left */ } - write_sequnlock_bh(&base->lock); - - if (do_free) - call_rcu(&p->rcu, inetpeer_free_rcu); - else - /* The node is used again. Decrease the reference counter - * back. The loop "cleanup -> unlink_from_unused - * -> unlink_from_pool -> putpeer -> link_to_unused - * -> cleanup (for the same node)" - * doesn't really exist because the entry will have a - * recent deletion time and will not be cleaned again soon. - */ - inet_putpeer(p); + peer_avl_rebalance(stack, stackptr, base); + base->total--; + call_rcu(&p->rcu, inetpeer_free_rcu); } -static struct inet_peer_base *family_to_base(int family) +/* perform garbage collect on all items stacked during a lookup */ +static int inet_peer_gc(struct inet_peer_base *base, + struct inet_peer __rcu **stack[PEER_MAXDEPTH], + struct inet_peer __rcu ***stackptr) { - return (family == AF_INET ? &v4_peers : &v6_peers); -} - -static struct inet_peer_base *peer_to_base(struct inet_peer *p) -{ - return family_to_base(p->daddr.family); -} - -/* May be called with local BH enabled. */ -static int cleanup_once(unsigned long ttl) -{ - struct inet_peer *p = NULL; - - /* Remove the first entry from the list of unused nodes. */ - spin_lock_bh(&unused_peers.lock); - if (!list_empty(&unused_peers.list)) { - __u32 delta; - - p = list_first_entry(&unused_peers.list, struct inet_peer, unused); - delta = (__u32)jiffies - p->dtime; + struct inet_peer *p, *gchead = NULL; + __u32 delta, ttl; + int cnt = 0; - if (delta < ttl) { - /* Do not prune fresh entries. */ - spin_unlock_bh(&unused_peers.lock); - return -1; + if (base->total >= inet_peer_threshold) + ttl = 0; /* be aggressive */ + else + ttl = inet_peer_maxttl + - (inet_peer_maxttl - inet_peer_minttl) / HZ * + base->total / inet_peer_threshold * HZ; + stackptr--; /* last stack slot is peer_avl_empty */ + while (stackptr > stack) { + stackptr--; + p = rcu_deref_locked(**stackptr, base); + if (atomic_read(&p->refcnt) == 0) { + smp_rmb(); + delta = (__u32)jiffies - p->dtime; + if (delta >= ttl && + atomic_cmpxchg(&p->refcnt, 0, -1) == 0) { + p->gc_next = gchead; + gchead = p; + } } - - list_del_init(&p->unused); - - /* Grab an extra reference to prevent node disappearing - * before unlink_from_pool() call. */ - atomic_inc(&p->refcnt); } - spin_unlock_bh(&unused_peers.lock); - - if (p == NULL) - /* It means that the total number of USED entries has - * grown over inet_peer_threshold. It shouldn't really - * happen because of entry limits in route cache. */ - return -1; - - unlink_from_pool(p, peer_to_base(p)); - return 0; + while ((p = gchead) != NULL) { + gchead = p->gc_next; + cnt++; + unlink_from_pool(p, base, stack); + } + return cnt; } -/* Called with or without local BH being disabled. */ -struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) +struct inet_peer *inet_getpeer(struct inet_peer_base *base, + const struct inetpeer_addr *daddr, + int create) { struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; unsigned int sequence; - int invalidated; + int invalidated, gccnt = 0; + + flush_check(base, daddr->family); - /* Look up for the address quickly, lockless. + /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); @@ -476,13 +455,8 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) invalidated = read_seqretry(&base->lock, sequence); rcu_read_unlock(); - if (p) { - /* The existing node has been found. - * Remove the entry from unused list if it was there. - */ - unlink_from_unused(p); + if (p) return p; - } /* If no writer did a change during our lookup, we can return early. */ if (!create && !invalidated) @@ -492,29 +466,30 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) * At least, nodes should be hot in our cache. */ write_seqlock_bh(&base->lock); +relookup: p = lookup(daddr, stack, base); if (p != peer_avl_empty) { atomic_inc(&p->refcnt); write_sequnlock_bh(&base->lock); - /* Remove the entry from unused list if it was there. */ - unlink_from_unused(p); return p; } + if (!gccnt) { + gccnt = inet_peer_gc(base, stack, stackptr); + if (gccnt && create) + goto relookup; + } p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; if (p) { p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); - p->tcp_ts_stamp = 0; p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; - p->rate_last = 0; - p->pmtu_expires = 0; - p->pmtu_orig = 0; - memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); - INIT_LIST_HEAD(&p->unused); - + /* 60*HZ is arbitrary, but chosen enough high so that the first + * calculation of tokens is at its maximum. + */ + p->rate_last = jiffies - 60*HZ; + INIT_LIST_HEAD(&p->gc_list); /* Link the node. */ link_to_pool(p, base); @@ -522,62 +497,15 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) } write_sequnlock_bh(&base->lock); - if (base->total >= inet_peer_threshold) - /* Remove one less-recently-used entry. */ - cleanup_once(0); - return p; } - -static int compute_total(void) -{ - return v4_peers.total + v6_peers.total; -} EXPORT_SYMBOL_GPL(inet_getpeer); -/* Called with local BH disabled. */ -static void peer_check_expire(unsigned long dummy) -{ - unsigned long now = jiffies; - int ttl, total; - - total = compute_total(); - if (total >= inet_peer_threshold) - ttl = inet_peer_minttl; - else - ttl = inet_peer_maxttl - - (inet_peer_maxttl - inet_peer_minttl) / HZ * - total / inet_peer_threshold * HZ; - while (!cleanup_once(ttl)) { - if (jiffies != now) - break; - } - - /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime - * interval depending on the total number of entries (more entries, - * less interval). */ - total = compute_total(); - if (total >= inet_peer_threshold) - peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; - else - peer_periodic_timer.expires = jiffies - + inet_peer_gc_maxtime - - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * - total / inet_peer_threshold * HZ; - add_timer(&peer_periodic_timer); -} - void inet_putpeer(struct inet_peer *p) { - local_bh_disable(); - - if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) { - list_add_tail(&p->unused, &unused_peers.list); - p->dtime = (__u32)jiffies; - spin_unlock(&unused_peers.lock); - } - - local_bh_enable(); + p->dtime = (__u32)jiffies; + smp_mb__before_atomic(); + atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); @@ -621,3 +549,31 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); + +static void inetpeer_inval_rcu(struct rcu_head *head) +{ + struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); + + spin_lock_bh(&gc_lock); + list_add_tail(&p->gc_list, &gc_list); + spin_unlock_bh(&gc_lock); + + schedule_delayed_work(&gc_work, gc_delay); +} + +void inetpeer_invalidate_tree(struct inet_peer_base *base) +{ + struct inet_peer *root; + + write_seqlock_bh(&base->lock); + + root = rcu_deref_locked(base->root, base); + if (root != peer_avl_empty) { + base->root = peer_avl_empty_rcu; + base->total = 0; + call_rcu(&root->gc_rcu, inetpeer_inval_rcu); + } + + write_sequnlock_bh(&base->lock); +} +EXPORT_SYMBOL(inetpeer_invalidate_tree); |
