aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4/inetpeer.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/inetpeer.c')
-rw-r--r--net/ipv4/inetpeer.c168
1 files changed, 118 insertions, 50 deletions
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index bf4a9c4808e..bd5f5928167 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
#include <net/secure_seq.h>
@@ -25,20 +26,7 @@
* Theory of operations.
* We keep one entry for each peer IP address. The nodes contains long-living
* information about the peer which doesn't depend on routes.
- * At this moment this information consists only of ID field for the next
- * outgoing IP packet. This field is incremented with each packet as encoded
- * in inet_getid() function (include/net/inetpeer.h).
- * At the moment of writing this notes identifier of IP packets is generated
- * to be unpredictable using this code only for packets subjected
- * (actually or potentially) to defragmentation. I.e. DF packets less than
- * PMTU in size uses a constant ID and do not use this code (see
- * ip_select_ident() in include/net/ip.h).
*
- * Route cache entries hold references to our nodes.
- * New cache entries get references via lookup by destination IP address in
- * the avl tree. The reference is grabbed only when it's needed i.e. only
- * when we try to output IP packet which needs an unpredictable ID (see
- * __ip_select_ident() in net/ipv4/route.c).
* Nodes are removed only when reference counter goes to 0.
* When it's happened the node may be removed when a sufficient amount of
* time has been passed since its last use. The less-recently-used entry can
@@ -61,11 +49,15 @@
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
* daddr: unchangeable
- * ip_id_count: atomic value (no lock needed)
*/
static struct kmem_cache *peer_cachep __read_mostly;
+static LIST_HEAD(gc_list);
+static const int gc_delay = 60 * HZ;
+static struct delayed_work gc_work;
+static DEFINE_SPINLOCK(gc_lock);
+
#define node_height(x) x->avl_height
#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
@@ -76,23 +68,32 @@ static const struct inet_peer peer_fake_node = {
.avl_height = 0
};
-struct inet_peer_base {
- struct inet_peer __rcu *root;
- seqlock_t lock;
- int total;
-};
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+ bp->root = peer_avl_empty_rcu;
+ seqlock_init(&bp->lock);
+ bp->flush_seq = ~0U;
+ bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
-static struct inet_peer_base v4_peers = {
- .root = peer_avl_empty_rcu,
- .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
- .total = 0,
-};
+static atomic_t v4_seq = ATOMIC_INIT(0);
+static atomic_t v6_seq = ATOMIC_INIT(0);
-static struct inet_peer_base v6_peers = {
- .root = peer_avl_empty_rcu,
- .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
- .total = 0,
-};
+static atomic_t *inetpeer_seq_ptr(int family)
+{
+ return (family == AF_INET ? &v4_seq : &v6_seq);
+}
+
+static inline void flush_check(struct inet_peer_base *base, int family)
+{
+ atomic_t *fp = inetpeer_seq_ptr(family);
+
+ if (unlikely(base->flush_seq != atomic_read(fp))) {
+ inetpeer_invalidate_tree(base);
+ base->flush_seq = atomic_read(fp);
+ }
+}
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
@@ -102,6 +103,52 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
+static void inetpeer_gc_worker(struct work_struct *work)
+{
+ struct inet_peer *p, *n, *c;
+ struct list_head list;
+
+ spin_lock_bh(&gc_lock);
+ list_replace_init(&gc_list, &list);
+ spin_unlock_bh(&gc_lock);
+
+ if (list_empty(&list))
+ return;
+
+ list_for_each_entry_safe(p, n, &list, gc_list) {
+
+ if (need_resched())
+ cond_resched();
+
+ c = rcu_dereference_protected(p->avl_left, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_left = peer_avl_empty_rcu;
+ }
+
+ c = rcu_dereference_protected(p->avl_right, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_right = peer_avl_empty_rcu;
+ }
+
+ n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
+
+ if (!atomic_read(&p->refcnt)) {
+ list_del(&p->gc_list);
+ kmem_cache_free(peer_cachep, p);
+ }
+ }
+
+ if (list_empty(&list))
+ return;
+
+ spin_lock_bh(&gc_lock);
+ list_splice(&list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
@@ -126,6 +173,7 @@ void __init inet_initpeers(void)
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
+ INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
static int addr_compare(const struct inetpeer_addr *a,
@@ -158,7 +206,7 @@ static int addr_compare(const struct inetpeer_addr *a,
stackptr = _stack; \
*stackptr++ = &_base->root; \
for (u = rcu_deref_locked(_base->root, _base); \
- u != peer_avl_empty; ) { \
+ u != peer_avl_empty;) { \
int cmp = addr_compare(_daddr, &u->daddr); \
if (cmp == 0) \
break; \
@@ -213,7 +261,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
*stackptr++ = &start->avl_left; \
v = &start->avl_left; \
for (u = rcu_deref_locked(*v, base); \
- u->avl_right != peer_avl_empty_rcu; ) { \
+ u->avl_right != peer_avl_empty_rcu;) { \
v = &u->avl_right; \
*stackptr++ = v; \
u = rcu_deref_locked(*v, base); \
@@ -350,11 +398,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
call_rcu(&p->rcu, inetpeer_free_rcu);
}
-static struct inet_peer_base *family_to_base(int family)
-{
- return family == AF_INET ? &v4_peers : &v6_peers;
-}
-
/* perform garbage collect on all items stacked during a lookup */
static int inet_peer_gc(struct inet_peer_base *base,
struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -392,14 +435,17 @@ static int inet_peer_gc(struct inet_peer_base *base,
return cnt;
}
-struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+ const struct inetpeer_addr *daddr,
+ int create)
{
struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- struct inet_peer_base *base = family_to_base(daddr->family);
struct inet_peer *p;
unsigned int sequence;
int invalidated, gccnt = 0;
+ flush_check(base, daddr->family);
+
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
@@ -437,19 +483,13 @@ relookup:
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count,
- (daddr->family == AF_INET) ?
- secure_ip_id(daddr->addr.a4) :
- secure_ipv6_id(daddr->addr.a6));
- p->tcp_ts_stamp = 0;
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
- p->rate_last = 0;
- p->pmtu_expires = 0;
- p->pmtu_orig = 0;
- p->redirect_genid = 0;
- memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
-
+ /* 60*HZ is arbitrary, but chosen enough high so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+ INIT_LIST_HEAD(&p->gc_list);
/* Link the node. */
link_to_pool(p, base);
@@ -464,7 +504,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
p->dtime = (__u32)jiffies;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&p->refcnt);
}
EXPORT_SYMBOL_GPL(inet_putpeer);
@@ -509,3 +549,31 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
return rc;
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
+
+static void inetpeer_inval_rcu(struct rcu_head *head)
+{
+ struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
+
+ spin_lock_bh(&gc_lock);
+ list_add_tail(&p->gc_list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
+
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
+{
+ struct inet_peer *root;
+
+ write_seqlock_bh(&base->lock);
+
+ root = rcu_deref_locked(base->root, base);
+ if (root != peer_avl_empty) {
+ base->root = peer_avl_empty_rcu;
+ base->total = 0;
+ call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
+ }
+
+ write_sequnlock_bh(&base->lock);
+}
+EXPORT_SYMBOL(inetpeer_invalidate_tree);