From 25a91d8d91911f84a03a039339b29537e7f1970d Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Sat, 18 Jan 2014 09:54:23 +0800
Subject: skbuff: Introduce skb_to_sgvec_nomark to map skb without mark new end

As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given sglist
without mark the sg which contain last skb data as the end. So the caller can
mannipulate sg list as will when padding new data after the first call without
calling sg_unmark_end to expend sg list.

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f589c9af8cb..c574bf3bd6f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -691,6 +691,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
 				     unsigned int headroom);
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
 				int newtailroom, gfp_t priority);
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+			int offset, int len);
 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset,
 		 int len);
 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
-- 
cgit v1.2.3-70-g09d2


From ca925cf1534ebcec332c08719a7dee6ee1782ce4 Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Sat, 18 Jan 2014 09:55:27 +0800
Subject: flowcache: Make flow cache name space aware

Inserting a entry into flowcache, or flushing flowcache should be based
on per net scope. The reason to do so is flushing operation from fat
netns crammed with flow entries will also making the slim netns with only
a few flow cache entries go away in original implementation.

Since flowcache is tightly coupled with IPsec, so it would be easier to
put flow cache global parameters into xfrm namespace part. And one last
thing needs to do is bumping flow cache genid, and flush flow cache should
also be made in per net style.

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/flow.h              |   5 +-
 include/net/flowcache.h         |  25 ++++++++
 include/net/netns/xfrm.h        |  11 ++++
 net/core/flow.c                 | 127 ++++++++++++++++++----------------------
 net/xfrm/xfrm_policy.c          |   7 ++-
 security/selinux/include/xfrm.h |   5 +-
 6 files changed, 103 insertions(+), 77 deletions(-)
 create mode 100644 include/net/flowcache.h

(limited to 'include')

diff --git a/include/net/flow.h b/include/net/flow.h
index d23e7fa2042..bee3741e5a6 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -218,9 +218,10 @@ struct flow_cache_object *flow_cache_lookup(struct net *net,
 					    const struct flowi *key, u16 family,
 					    u8 dir, flow_resolve_t resolver,
 					    void *ctx);
+int flow_cache_init(struct net *net);
 
-void flow_cache_flush(void);
-void flow_cache_flush_deferred(void);
+void flow_cache_flush(struct net *net);
+void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
 #endif
diff --git a/include/net/flowcache.h b/include/net/flowcache.h
new file mode 100644
index 00000000000..c8f665ec6e0
--- /dev/null
+++ b/include/net/flowcache.h
@@ -0,0 +1,25 @@
+#ifndef _NET_FLOWCACHE_H
+#define _NET_FLOWCACHE_H
+
+#include <linux/interrupt.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+
+struct flow_cache_percpu {
+	struct hlist_head		*hash_table;
+	int				hash_count;
+	u32				hash_rnd;
+	int				hash_rnd_recalc;
+	struct tasklet_struct		flush_tasklet;
+};
+
+struct flow_cache {
+	u32				hash_shift;
+	struct flow_cache_percpu __percpu *percpu;
+	struct notifier_block		hotcpu_notifier;
+	int				low_watermark;
+	int				high_watermark;
+	struct timer_list		rnd_timer;
+};
+#endif	/* _NET_FLOWCACHE_H */
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 1006a265beb..52d0086d55d 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -6,6 +6,7 @@
 #include <linux/workqueue.h>
 #include <linux/xfrm.h>
 #include <net/dst_ops.h>
+#include <net/flowcache.h>
 
 struct ctl_table_header;
 
@@ -61,6 +62,16 @@ struct netns_xfrm {
 	spinlock_t xfrm_policy_sk_bundle_lock;
 	rwlock_t xfrm_policy_lock;
 	struct mutex xfrm_cfg_mutex;
+
+	/* flow cache part */
+	struct flow_cache	flow_cache_global;
+	struct kmem_cache	*flow_cachep;
+	atomic_t		flow_cache_genid;
+	struct list_head	flow_cache_gc_list;
+	spinlock_t		flow_cache_gc_lock;
+	struct work_struct	flow_cache_gc_work;
+	struct work_struct	flow_cache_flush_work;
+	struct mutex		flow_flush_sem;
 };
 
 #endif
diff --git a/net/core/flow.c b/net/core/flow.c
index dfa602ceb8c..344a184011f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -24,6 +24,7 @@
 #include <net/flow.h>
 #include <linux/atomic.h>
 #include <linux/security.h>
+#include <net/net_namespace.h>
 
 struct flow_cache_entry {
 	union {
@@ -38,37 +39,12 @@ struct flow_cache_entry {
 	struct flow_cache_object	*object;
 };
 
-struct flow_cache_percpu {
-	struct hlist_head		*hash_table;
-	int				hash_count;
-	u32				hash_rnd;
-	int				hash_rnd_recalc;
-	struct tasklet_struct		flush_tasklet;
-};
-
 struct flow_flush_info {
 	struct flow_cache		*cache;
 	atomic_t			cpuleft;
 	struct completion		completion;
 };
 
-struct flow_cache {
-	u32				hash_shift;
-	struct flow_cache_percpu __percpu *percpu;
-	struct notifier_block		hotcpu_notifier;
-	int				low_watermark;
-	int				high_watermark;
-	struct timer_list		rnd_timer;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-EXPORT_SYMBOL(flow_cache_genid);
-static struct flow_cache flow_cache_global;
-static struct kmem_cache *flow_cachep __read_mostly;
-
-static DEFINE_SPINLOCK(flow_cache_gc_lock);
-static LIST_HEAD(flow_cache_gc_list);
-
 #define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
 #define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
@@ -84,46 +60,50 @@ static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
-static int flow_entry_valid(struct flow_cache_entry *fle)
+static int flow_entry_valid(struct flow_cache_entry *fle,
+				struct netns_xfrm *xfrm)
 {
-	if (atomic_read(&flow_cache_genid) != fle->genid)
+	if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
 		return 0;
 	if (fle->object && !fle->object->ops->check(fle->object))
 		return 0;
 	return 1;
 }
 
-static void flow_entry_kill(struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle,
+				struct netns_xfrm *xfrm)
 {
 	if (fle->object)
 		fle->object->ops->delete(fle->object);
-	kmem_cache_free(flow_cachep, fle);
+	kmem_cache_free(xfrm->flow_cachep, fle);
 }
 
 static void flow_cache_gc_task(struct work_struct *work)
 {
 	struct list_head gc_list;
 	struct flow_cache_entry *fce, *n;
+	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+						flow_cache_gc_work);
 
 	INIT_LIST_HEAD(&gc_list);
-	spin_lock_bh(&flow_cache_gc_lock);
-	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
-	spin_unlock_bh(&flow_cache_gc_lock);
+	spin_lock_bh(&xfrm->flow_cache_gc_lock);
+	list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
+	spin_unlock_bh(&xfrm->flow_cache_gc_lock);
 
 	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
-		flow_entry_kill(fce);
+		flow_entry_kill(fce, xfrm);
 }
-static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
 
 static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
-				     int deleted, struct list_head *gc_list)
+				     int deleted, struct list_head *gc_list,
+				     struct netns_xfrm *xfrm)
 {
 	if (deleted) {
 		fcp->hash_count -= deleted;
-		spin_lock_bh(&flow_cache_gc_lock);
-		list_splice_tail(gc_list, &flow_cache_gc_list);
-		spin_unlock_bh(&flow_cache_gc_lock);
-		schedule_work(&flow_cache_gc_work);
+		spin_lock_bh(&xfrm->flow_cache_gc_lock);
+		list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
+		spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+		schedule_work(&xfrm->flow_cache_gc_work);
 	}
 }
 
@@ -135,6 +115,8 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 	struct hlist_node *tmp;
 	LIST_HEAD(gc_list);
 	int i, deleted = 0;
+	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+						flow_cache_global);
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int saved = 0;
@@ -142,7 +124,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 		hlist_for_each_entry_safe(fle, tmp,
 					  &fcp->hash_table[i], u.hlist) {
 			if (saved < shrink_to &&
-			    flow_entry_valid(fle)) {
+			    flow_entry_valid(fle, xfrm)) {
 				saved++;
 			} else {
 				deleted++;
@@ -152,7 +134,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
 		}
 	}
 
-	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
 }
 
 static void flow_cache_shrink(struct flow_cache *fc,
@@ -208,7 +190,7 @@ struct flow_cache_object *
 flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 		  flow_resolve_t resolver, void *ctx)
 {
-	struct flow_cache *fc = &flow_cache_global;
+	struct flow_cache *fc = &net->xfrm.flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, *tfle;
 	struct flow_cache_object *flo;
@@ -248,7 +230,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 		if (fcp->hash_count > fc->high_watermark)
 			flow_cache_shrink(fc, fcp);
 
-		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
+		fle = kmem_cache_alloc(net->xfrm.flow_cachep, GFP_ATOMIC);
 		if (fle) {
 			fle->net = net;
 			fle->family = family;
@@ -258,7 +240,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
 			fcp->hash_count++;
 		}
-	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+	} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
 		flo = fle->object;
 		if (!flo)
 			goto ret_object;
@@ -279,7 +261,7 @@ nocache:
 	}
 	flo = resolver(net, key, family, dir, flo, ctx);
 	if (fle) {
-		fle->genid = atomic_read(&flow_cache_genid);
+		fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
 		if (!IS_ERR(flo))
 			fle->object = flo;
 		else
@@ -303,12 +285,14 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	struct hlist_node *tmp;
 	LIST_HEAD(gc_list);
 	int i, deleted = 0;
+	struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+						flow_cache_global);
 
 	fcp = this_cpu_ptr(fc->percpu);
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		hlist_for_each_entry_safe(fle, tmp,
 					  &fcp->hash_table[i], u.hlist) {
-			if (flow_entry_valid(fle))
+			if (flow_entry_valid(fle, xfrm))
 				continue;
 
 			deleted++;
@@ -317,7 +301,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
 		}
 	}
 
-	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+	flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
 
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
@@ -351,10 +335,9 @@ static void flow_cache_flush_per_cpu(void *data)
 	tasklet_schedule(tasklet);
 }
 
-void flow_cache_flush(void)
+void flow_cache_flush(struct net *net)
 {
 	struct flow_flush_info info;
-	static DEFINE_MUTEX(flow_flush_sem);
 	cpumask_var_t mask;
 	int i, self;
 
@@ -365,8 +348,8 @@ void flow_cache_flush(void)
 
 	/* Don't want cpus going down or up during this. */
 	get_online_cpus();
-	mutex_lock(&flow_flush_sem);
-	info.cache = &flow_cache_global;
+	mutex_lock(&net->xfrm.flow_flush_sem);
+	info.cache = &net->xfrm.flow_cache_global;
 	for_each_online_cpu(i)
 		if (!flow_cache_percpu_empty(info.cache, i))
 			cpumask_set_cpu(i, mask);
@@ -386,21 +369,23 @@ void flow_cache_flush(void)
 	wait_for_completion(&info.completion);
 
 done:
-	mutex_unlock(&flow_flush_sem);
+	mutex_unlock(&net->xfrm.flow_flush_sem);
 	put_online_cpus();
 	free_cpumask_var(mask);
 }
 
 static void flow_cache_flush_task(struct work_struct *work)
 {
-	flow_cache_flush();
-}
+	struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+						flow_cache_gc_work);
+	struct net *net = container_of(xfrm, struct net, xfrm);
 
-static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+	flow_cache_flush(net);
+}
 
-void flow_cache_flush_deferred(void)
+void flow_cache_flush_deferred(struct net *net)
 {
-	schedule_work(&flow_cache_flush_work);
+	schedule_work(&net->xfrm.flow_cache_flush_work);
 }
 
 static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
@@ -425,7 +410,8 @@ static int flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
-	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+	struct flow_cache *fc = container_of(nfb, struct flow_cache,
+						hotcpu_notifier);
 	int res, cpu = (unsigned long) hcpu;
 	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
 
@@ -444,9 +430,20 @@ static int flow_cache_cpu(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static int __init flow_cache_init(struct flow_cache *fc)
+int flow_cache_init(struct net *net)
 {
 	int i;
+	struct flow_cache *fc = &net->xfrm.flow_cache_global;
+
+	/* Initialize per-net flow cache global variables here */
+	net->xfrm.flow_cachep = kmem_cache_create("flow_cache",
+					sizeof(struct flow_cache_entry),
+					0, SLAB_PANIC, NULL);
+	spin_lock_init(&net->xfrm.flow_cache_gc_lock);
+	INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
+	INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
+	INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
+	mutex_init(&net->xfrm.flow_flush_sem);
 
 	fc->hash_shift = 10;
 	fc->low_watermark = 2 * flow_cache_hash_size(fc);
@@ -484,14 +481,4 @@ err:
 
 	return -ENOMEM;
 }
-
-static int __init flow_cache_init_global(void)
-{
-	flow_cachep = kmem_cache_create("flow_cache",
-					sizeof(struct flow_cache_entry),
-					0, SLAB_PANIC, NULL);
-
-	return flow_cache_init(&flow_cache_global);
-}
-
-module_init(flow_cache_init_global);
+EXPORT_SYMBOL(flow_cache_init);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4b98b25793c..2232c6f26af 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -661,7 +661,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		hlist_add_head(&policy->bydst, chain);
 	xfrm_pol_hold(policy);
 	net->xfrm.policy_count[dir]++;
-	atomic_inc(&flow_cache_genid);
+	atomic_inc(&net->xfrm.flow_cache_genid);
 
 	/* After previous checking, family can either be AF_INET or AF_INET6 */
 	if (policy->family == AF_INET)
@@ -2567,14 +2567,14 @@ static void __xfrm_garbage_collect(struct net *net)
 
 void xfrm_garbage_collect(struct net *net)
 {
-	flow_cache_flush();
+	flow_cache_flush(net);
 	__xfrm_garbage_collect(net);
 }
 EXPORT_SYMBOL(xfrm_garbage_collect);
 
 static void xfrm_garbage_collect_deferred(struct net *net)
 {
-	flow_cache_flush_deferred();
+	flow_cache_flush_deferred(net);
 	__xfrm_garbage_collect(net);
 }
 
@@ -2947,6 +2947,7 @@ static int __net_init xfrm_net_init(struct net *net)
 	spin_lock_init(&net->xfrm.xfrm_policy_sk_bundle_lock);
 	mutex_init(&net->xfrm.xfrm_cfg_mutex);
 
+	flow_cache_init(net);
 	return 0;
 
 out_sysctl:
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 48c3cc94c16..dfe3fda7e5f 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -45,10 +45,11 @@ static inline void selinux_xfrm_notify_policyload(void)
 {
 	struct net *net;
 
-	atomic_inc(&flow_cache_genid);
 	rtnl_lock();
-	for_each_net(net)
+	for_each_net(net) {
+		atomic_inc(&net->xfrm.flow_cache_genid);
 		rt_genid_bump_all(net);
+	}
 	rtnl_unlock();
 }
 #else
-- 
cgit v1.2.3-70-g09d2


From 0f24558e91563888d51e9be5b70981da920c37ac Mon Sep 17 00:00:00 2001
From: Horia Geanta <horia.geanta@freescale.com>
Date: Wed, 12 Feb 2014 16:20:06 +0200
Subject: xfrm: avoid creating temporary SA when there are no listeners

In the case when KMs have no listeners, km_query() will fail and
temporary SAs are garbage collected immediately after their allocation.
This causes strain on memory allocation, leading even to OOM since
temporary SA alloc/free cycle is performed for every packet
and garbage collection does not keep up the pace.

The sane thing to do is to make sure we have audience before
temporary SA allocation.

Signed-off-by: Horia Geanta <horia.geanta@freescale.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    | 15 +++++++++++++++
 net/key/af_key.c      | 19 +++++++++++++++++++
 net/xfrm/xfrm_state.c | 31 +++++++++++++++++++++++++++++++
 net/xfrm/xfrm_user.c  |  6 ++++++
 4 files changed, 71 insertions(+)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index afa5730fb3b..5313ccfdeed 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -594,6 +594,7 @@ struct xfrm_mgr {
 					   const struct xfrm_migrate *m,
 					   int num_bundles,
 					   const struct xfrm_kmaddress *k);
+	bool			(*is_alive)(const struct km_event *c);
 };
 
 int xfrm_register_km(struct xfrm_mgr *km);
@@ -1646,6 +1647,20 @@ static inline int xfrm_aevent_is_on(struct net *net)
 	rcu_read_unlock();
 	return ret;
 }
+
+static inline int xfrm_acquire_is_on(struct net *net)
+{
+	struct sock *nlsk;
+	int ret = 0;
+
+	rcu_read_lock();
+	nlsk = rcu_dereference(net->xfrm.nlsk);
+	if (nlsk)
+		ret = netlink_has_listeners(nlsk, XFRMNLGRP_ACQUIRE);
+	rcu_read_unlock();
+
+	return ret;
+}
 #endif
 
 static inline int xfrm_alg_len(const struct xfrm_algo *alg)
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 1a04c132936..e1c69d02419 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3059,6 +3059,24 @@ static u32 get_acqseq(void)
 	return res;
 }
 
+static bool pfkey_is_alive(const struct km_event *c)
+{
+	struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id);
+	struct sock *sk;
+	bool is_alive = false;
+
+	rcu_read_lock();
+	sk_for_each_rcu(sk, &net_pfkey->table) {
+		if (pfkey_sk(sk)->registered) {
+			is_alive = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return is_alive;
+}
+
 static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp)
 {
 	struct sk_buff *skb;
@@ -3784,6 +3802,7 @@ static struct xfrm_mgr pfkeyv2_mgr =
 	.new_mapping	= pfkey_send_new_mapping,
 	.notify_policy	= pfkey_send_policy_notify,
 	.migrate	= pfkey_send_migrate,
+	.is_alive	= pfkey_is_alive,
 };
 
 static int __net_init pfkey_net_init(struct net *net)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index a26b7aa7947..0bf12f665b9 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -161,6 +161,7 @@ static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 int __xfrm_state_delete(struct xfrm_state *x);
 
 int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol);
+bool km_is_alive(const struct km_event *c);
 void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
 
 static DEFINE_SPINLOCK(xfrm_type_lock);
@@ -788,6 +789,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 	struct xfrm_state *best = NULL;
 	u32 mark = pol->mark.v & pol->mark.m;
 	unsigned short encap_family = tmpl->encap_family;
+	struct km_event c;
 
 	to_put = NULL;
 
@@ -832,6 +834,17 @@ found:
 			error = -EEXIST;
 			goto out;
 		}
+
+		c.net = net;
+		/* If the KMs have no listeners (yet...), avoid allocating an SA
+		 * for each and every packet - garbage collection might not
+		 * handle the flood.
+		 */
+		if (!km_is_alive(&c)) {
+			error = -ESRCH;
+			goto out;
+		}
+
 		x = xfrm_state_alloc(net);
 		if (x == NULL) {
 			error = -ENOMEM;
@@ -1793,6 +1806,24 @@ int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address
 }
 EXPORT_SYMBOL(km_report);
 
+bool km_is_alive(const struct km_event *c)
+{
+	struct xfrm_mgr *km;
+	bool is_alive = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(km, &xfrm_km_list, list) {
+		if (km->is_alive && km->is_alive(c)) {
+			is_alive = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return is_alive;
+}
+EXPORT_SYMBOL(km_is_alive);
+
 int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen)
 {
 	int err;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ade9988f6e3..d7694f25829 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2982,6 +2982,11 @@ static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
 	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_MAPPING, GFP_ATOMIC);
 }
 
+static bool xfrm_is_alive(const struct km_event *c)
+{
+	return (bool)xfrm_acquire_is_on(c->net);
+}
+
 static struct xfrm_mgr netlink_mgr = {
 	.id		= "netlink",
 	.notify		= xfrm_send_state_notify,
@@ -2991,6 +2996,7 @@ static struct xfrm_mgr netlink_mgr = {
 	.report		= xfrm_send_report,
 	.migrate	= xfrm_send_migrate,
 	.new_mapping	= xfrm_send_mapping,
+	.is_alive	= xfrm_is_alive,
 };
 
 static int __net_init xfrm_user_net_init(struct net *net)
-- 
cgit v1.2.3-70-g09d2


From d3623099d3509fa68fa28235366049dd3156c63a Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 14 Feb 2014 15:30:36 +0100
Subject: ipsec: add support of limited SA dump

The goal of this patch is to allow userland to dump only a part of SA by
specifying a filter during the dump.
The kernel is in charge to filter SA, this avoids to generate useless netlink
traffic (it save also some cpu cycles). This is particularly useful when there
is a big number of SA set on the system.

Note that I removed the union in struct xfrm_state_walk to fix a problem on arm.
struct netlink_callback->args is defined as a array of 6 long and the first long
is used in xfrm code to flag the cb as initialized. Hence, we must have:
sizeof(struct xfrm_state_walk) <= sizeof(long) * 5.
With the union, it was false on arm (sizeof(struct xfrm_state_walk) was
sizeof(long) * 7), due to the padding.
In fact, whatever the arch is, this union seems useless, there will be always
padding after it. Removing it will not increase the size of this struct (and
reduce it on arm).

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h           | 10 +++++-----
 include/uapi/linux/pfkeyv2.h | 15 ++++++++++++++-
 include/uapi/linux/xfrm.h    | 10 ++++++++++
 net/key/af_key.c             | 19 ++++++++++++++++++-
 net/xfrm/xfrm_state.c        | 25 ++++++++++++++++++++++++-
 net/xfrm/xfrm_user.c         | 28 +++++++++++++++++++++++++++-
 6 files changed, 98 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 5313ccfdeed..45332acac02 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -118,11 +118,10 @@
 struct xfrm_state_walk {
 	struct list_head	all;
 	u8			state;
-	union {
-		u8		dying;
-		u8		proto;
-	};
+	u8			dying;
+	u8			proto;
 	u32			seq;
+	struct xfrm_filter	*filter;
 };
 
 /* Full description of state of transformer. */
@@ -1406,7 +1405,8 @@ static inline void xfrm_sysctl_fini(struct net *net)
 }
 #endif
 
-void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto);
+void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
+			  struct xfrm_filter *filter);
 int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
 		    int (*func)(struct xfrm_state *, int, void*), void *);
 void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net);
diff --git a/include/uapi/linux/pfkeyv2.h b/include/uapi/linux/pfkeyv2.h
index 0b80c806631..ada7f0171cc 100644
--- a/include/uapi/linux/pfkeyv2.h
+++ b/include/uapi/linux/pfkeyv2.h
@@ -235,6 +235,18 @@ struct sadb_x_kmaddress {
 } __attribute__((packed));
 /* sizeof(struct sadb_x_kmaddress) == 8 */
 
+/* To specify the SA dump filter */
+struct sadb_x_filter {
+	__u16	sadb_x_filter_len;
+	__u16	sadb_x_filter_exttype;
+	__u32	sadb_x_filter_saddr[4];
+	__u32	sadb_x_filter_daddr[4];
+	__u16	sadb_x_filter_family;
+	__u8	sadb_x_filter_splen;
+	__u8	sadb_x_filter_dplen;
+} __attribute__((packed));
+/* sizeof(struct sadb_x_filter) == 40 */
+
 /* Message types */
 #define SADB_RESERVED		0
 #define SADB_GETSPI		1
@@ -358,7 +370,8 @@ struct sadb_x_kmaddress {
 #define SADB_X_EXT_SEC_CTX		24
 /* Used with MIGRATE to pass @ to IKE for negotiation */
 #define SADB_X_EXT_KMADDRESS		25
-#define SADB_EXT_MAX			25
+#define SADB_X_EXT_FILTER		26
+#define SADB_EXT_MAX			26
 
 /* Identity Extension values */
 #define SADB_IDENTTYPE_RESERVED	0
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index a8cd6a4a297..6550c679584 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -298,6 +298,8 @@ enum xfrm_attr_type_t {
 	XFRMA_TFCPAD,		/* __u32 */
 	XFRMA_REPLAY_ESN_VAL,	/* struct xfrm_replay_esn */
 	XFRMA_SA_EXTRA_FLAGS,	/* __u32 */
+	XFRMA_PROTO,		/* __u8 */
+	XFRMA_FILTER,		/* struct xfrm_filter */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
@@ -474,6 +476,14 @@ struct xfrm_user_mapping {
 	__be16				new_sport;
 };
 
+struct xfrm_filter {
+	xfrm_address_t			saddr;
+	xfrm_address_t			daddr;
+	__u16				family;
+	__u8				splen;
+	__u8				dplen;
+};
+
 #ifndef __KERNEL__
 /* backwards compatibility for userspace */
 #define XFRMGRP_ACQUIRE		1
diff --git a/net/key/af_key.c b/net/key/af_key.c
index e1c69d02419..f0879c19f45 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1798,6 +1798,7 @@ static void pfkey_dump_sa_done(struct pfkey_sock *pfk)
 static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
 {
 	u8 proto;
+	struct xfrm_filter *filter = NULL;
 	struct pfkey_sock *pfk = pfkey_sk(sk);
 
 	if (pfk->dump.dump != NULL)
@@ -1807,11 +1808,27 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
 	if (proto == 0)
 		return -EINVAL;
 
+	if (ext_hdrs[SADB_X_EXT_FILTER - 1]) {
+		struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
+
+		filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+		if (filter == NULL)
+			return -ENOMEM;
+
+		memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr,
+		       sizeof(xfrm_address_t));
+		memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr,
+		       sizeof(xfrm_address_t));
+		filter->family = xfilter->sadb_x_filter_family;
+		filter->splen = xfilter->sadb_x_filter_splen;
+		filter->dplen = xfilter->sadb_x_filter_dplen;
+	}
+
 	pfk->dump.msg_version = hdr->sadb_msg_version;
 	pfk->dump.msg_portid = hdr->sadb_msg_pid;
 	pfk->dump.dump = pfkey_dump_sa;
 	pfk->dump.done = pfkey_dump_sa_done;
-	xfrm_state_walk_init(&pfk->dump.u.state, proto);
+	xfrm_state_walk_init(&pfk->dump.u.state, proto, filter);
 
 	return pfkey_do_dump(pfk);
 }
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 0bf12f665b9..a750901ac3d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1603,6 +1603,23 @@ unlock:
 }
 EXPORT_SYMBOL(xfrm_alloc_spi);
 
+static bool __xfrm_state_filter_match(struct xfrm_state *x,
+				      struct xfrm_filter *filter)
+{
+	if (filter) {
+		if ((filter->family == AF_INET ||
+		     filter->family == AF_INET6) &&
+		    x->props.family != filter->family)
+			return false;
+
+		return addr_match(&x->props.saddr, &filter->saddr,
+				  filter->splen) &&
+		       addr_match(&x->id.daddr, &filter->daddr,
+				  filter->dplen);
+	}
+	return true;
+}
+
 int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
 		    int (*func)(struct xfrm_state *, int, void*),
 		    void *data)
@@ -1625,6 +1642,8 @@ int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
 		state = container_of(x, struct xfrm_state, km);
 		if (!xfrm_id_proto_match(state->id.proto, walk->proto))
 			continue;
+		if (!__xfrm_state_filter_match(state, walk->filter))
+			continue;
 		err = func(state, walk->seq, data);
 		if (err) {
 			list_move_tail(&walk->all, &x->all);
@@ -1643,17 +1662,21 @@ out:
 }
 EXPORT_SYMBOL(xfrm_state_walk);
 
-void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
+void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
+			  struct xfrm_filter *filter)
 {
 	INIT_LIST_HEAD(&walk->all);
 	walk->proto = proto;
 	walk->state = XFRM_STATE_DEAD;
 	walk->seq = 0;
+	walk->filter = filter;
 }
 EXPORT_SYMBOL(xfrm_state_walk_init);
 
 void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net)
 {
+	kfree(walk->filter);
+
 	if (list_empty(&walk->all))
 		return;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d7694f25829..023e5e7ea4c 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -887,6 +887,7 @@ static int xfrm_dump_sa_done(struct netlink_callback *cb)
 	return 0;
 }
 
+static const struct nla_policy xfrma_policy[XFRMA_MAX+1];
 static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
@@ -902,8 +903,31 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
 	info.nlmsg_flags = NLM_F_MULTI;
 
 	if (!cb->args[0]) {
+		struct nlattr *attrs[XFRMA_MAX+1];
+		struct xfrm_filter *filter = NULL;
+		u8 proto = 0;
+		int err;
+
 		cb->args[0] = 1;
-		xfrm_state_walk_init(walk, 0);
+
+		err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX,
+				  xfrma_policy);
+		if (err < 0)
+			return err;
+
+		if (attrs[XFRMA_FILTER]) {
+			filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+			if (filter == NULL)
+				return -ENOMEM;
+
+			memcpy(filter, nla_data(attrs[XFRMA_FILTER]),
+			       sizeof(*filter));
+		}
+
+		if (attrs[XFRMA_PROTO])
+			proto = nla_get_u8(attrs[XFRMA_PROTO]);
+
+		xfrm_state_walk_init(walk, proto, filter);
 	}
 
 	(void) xfrm_state_walk(net, walk, dump_one_state, &info);
@@ -2309,6 +2333,8 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_TFCPAD]		= { .type = NLA_U32 },
 	[XFRMA_REPLAY_ESN_VAL]	= { .len = sizeof(struct xfrm_replay_state_esn) },
 	[XFRMA_SA_EXTRA_FLAGS]	= { .type = NLA_U32 },
+	[XFRMA_PROTO]		= { .type = NLA_U8 },
+	[XFRMA_FILTER]		= { .len = sizeof(struct xfrm_filter) },
 };
 
 static const struct xfrm_link {
-- 
cgit v1.2.3-70-g09d2


From 1a1ccc96abb2ed9b8fbb71018e64b97324caef53 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Wed, 19 Feb 2014 10:07:34 +0100
Subject: xfrm: Remove caching of xfrm_policy_sk_bundles

We currently cache socket policy bundles at xfrm_policy_sk_bundles.
These cached bundles are never used. Instead we create and cache
a new one whenever xfrm_lookup() is called on a socket policy.

Most protocols cache the used routes to the socket, so let's
remove the unused caching of socket policy bundles in xfrm.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/netns/xfrm.h |  1 -
 net/xfrm/xfrm_policy.c   | 28 ----------------------------
 2 files changed, 29 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 52d0086d55d..51f0dce7b64 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -59,7 +59,6 @@ struct netns_xfrm {
 	struct dst_ops		xfrm6_dst_ops;
 #endif
 	spinlock_t xfrm_state_lock;
-	spinlock_t xfrm_policy_sk_bundle_lock;
 	rwlock_t xfrm_policy_lock;
 	struct mutex xfrm_cfg_mutex;
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2232c6f26af..bb3669d973a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -39,8 +39,6 @@
 #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
 #define XFRM_MAX_QUEUE_LEN	100
 
-static struct dst_entry *xfrm_policy_sk_bundles;
-
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
 						__read_mostly;
@@ -2109,13 +2107,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 				goto no_transform;
 			}
 
-			dst_hold(&xdst->u.dst);
-
-			spin_lock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-			xdst->u.dst.next = xfrm_policy_sk_bundles;
-			xfrm_policy_sk_bundles = &xdst->u.dst;
-			spin_unlock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-
 			route = xdst->route;
 		}
 	}
@@ -2549,33 +2540,15 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
-static void __xfrm_garbage_collect(struct net *net)
-{
-	struct dst_entry *head, *next;
-
-	spin_lock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-	head = xfrm_policy_sk_bundles;
-	xfrm_policy_sk_bundles = NULL;
-	spin_unlock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-
-	while (head) {
-		next = head->next;
-		dst_free(head);
-		head = next;
-	}
-}
-
 void xfrm_garbage_collect(struct net *net)
 {
 	flow_cache_flush(net);
-	__xfrm_garbage_collect(net);
 }
 EXPORT_SYMBOL(xfrm_garbage_collect);
 
 static void xfrm_garbage_collect_deferred(struct net *net)
 {
 	flow_cache_flush_deferred(net);
-	__xfrm_garbage_collect(net);
 }
 
 static void xfrm_init_pmtu(struct dst_entry *dst)
@@ -2944,7 +2917,6 @@ static int __net_init xfrm_net_init(struct net *net)
 	/* Initialize the per-net locks here */
 	spin_lock_init(&net->xfrm.xfrm_state_lock);
 	rwlock_init(&net->xfrm.xfrm_policy_lock);
-	spin_lock_init(&net->xfrm.xfrm_policy_sk_bundle_lock);
 	mutex_init(&net->xfrm.xfrm_cfg_mutex);
 
 	flow_cache_init(net);
-- 
cgit v1.2.3-70-g09d2