aboutsummaryrefslogtreecommitdiff
path: root/net/netfilter/ipvs
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter/ipvs')
-rw-r--r--net/netfilter/ipvs/Kconfig26
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c110
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c460
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c782
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c1262
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c99
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c19
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c89
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c196
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c245
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c13
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c14
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c55
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c26
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c72
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c9
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c1053
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c101
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c80
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c67
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c65
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c16
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c234
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c750
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c14
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c186
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c1192
28 files changed, 3647 insertions, 3594 deletions
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 70bd1d0774c..0c3b1670b0d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -28,12 +28,11 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
depends on IPV6 = y || IP_VS = IPV6
+ select IP6_NF_IPTABLES
---help---
- Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+ Add IPv6 support to IPVS.
- See http://www.mindbasket.com/ipvs for more information.
-
- Say N if unsure.
+ Say Y if unsure.
config IP_VS_DEBUG
bool "IP virtual server debugging"
@@ -122,7 +121,6 @@ config IP_VS_RR
config IP_VS_WRR
tristate "weighted round-robin scheduling"
- select GCD
---help---
The weighted robin-robin scheduling algorithm directs network
connections to different real servers based on server weights
@@ -232,11 +230,27 @@ config IP_VS_NQ
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+comment 'IPVS SH scheduler'
+
+config IP_VS_SH_TAB_BITS
+ int "IPVS source hashing table size (the Nth power of 2)"
+ range 4 20
+ default 8
+ ---help---
+ The source hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is tiled by each destination
+ until all slots in the table are filled. When using weights to
+ allow destinations to receive more connections, the table is
+ tiled an amount proportional to the weights specified. The table
+ needs to be large enough to effectively fit all the destinations
+ multiplied by their respective weights.
+
comment 'IPVS application helper'
config IP_VS_FTP
tristate "FTP protocol helper"
- depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
+ NF_CONNTRACK_FTP
select IP_VS_NFCT
---help---
FTP is a protocol that transfers IP address and/or port number in
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 059af3120be..dfd7b65b3d2 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -31,7 +31,6 @@
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/tcp.h>
-#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -59,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
module_put(app->module);
}
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
/*
* Allocate/initialize app incarnation and register it in proto apps.
@@ -107,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
return 0;
out:
- kfree(inc->timeout_table);
- kfree(inc);
+ ip_vs_app_inc_destroy(inc);
return ret;
}
@@ -132,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
list_del(&inc->a_list);
- kfree(inc->timeout_table);
- kfree(inc);
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
}
@@ -145,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
{
int result;
- atomic_inc(&inc->usecnt);
- if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
- atomic_dec(&inc->usecnt);
+ result = ip_vs_app_get(inc->app);
+ if (result)
+ atomic_inc(&inc->usecnt);
return result;
}
@@ -157,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
*/
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
- ip_vs_app_put(inc->app);
atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
}
@@ -181,45 +190,71 @@ register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
}
-/*
- * ip_vs_app registration routine
- */
-int register_ip_vs_app(struct net *net, struct ip_vs_app *app)
+/* Register application for netns */
+struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- /* increase the module use count */
- ip_vs_use_count_inc();
+ struct ip_vs_app *a;
+ int err = 0;
+
+ if (!ipvs)
+ return ERR_PTR(-ENOENT);
mutex_lock(&__ip_vs_app_mutex);
- list_add(&app->a_list, &ipvs->app_list);
+ list_for_each_entry(a, &ipvs->app_list, a_list) {
+ if (!strcmp(app->name, a->name)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ a = kmemdup(app, sizeof(*app), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ INIT_LIST_HEAD(&a->incs_list);
+ list_add(&a->a_list, &ipvs->app_list);
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+out_unlock:
mutex_unlock(&__ip_vs_app_mutex);
- return 0;
+ return err ? ERR_PTR(err) : a;
}
/*
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
*/
void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
- struct ip_vs_app *inc, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a, *anxt, *inc, *nxt;
+
+ if (!ipvs)
+ return;
mutex_lock(&__ip_vs_app_mutex);
- list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
- ip_vs_app_inc_release(net, inc);
- }
+ list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) {
+ if (app && strcmp(app->name, a->name))
+ continue;
+ list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) {
+ ip_vs_app_inc_release(net, inc);
+ }
- list_del(&app->a_list);
+ list_del(&a->a_list);
+ kfree(a);
- mutex_unlock(&__ip_vs_app_mutex);
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+ }
- /* decrease the module use count */
- ip_vs_use_count_dec();
+ mutex_unlock(&__ip_vs_app_mutex);
}
@@ -314,17 +349,17 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
* Assumes already checked proto==IPPROTO_TCP and diff!=0.
*/
static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
- unsigned flag, __u32 seq, int diff)
+ unsigned int flag, __u32 seq, int diff)
{
/* spinlock is to keep updating cp->flags atomic */
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
vseq->previous_delta = vseq->delta;
vseq->delta += diff;
vseq->init_seq = seq;
cp->flags |= flag;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
@@ -576,26 +611,17 @@ static const struct file_operations ip_vs_app_fops = {
};
#endif
-int __net_init __ip_vs_app_init(struct net *net)
+int __net_init ip_vs_app_net_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
INIT_LIST_HEAD(&ipvs->app_list);
- proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
- return 0;
-}
-
-void __net_exit __ip_vs_app_cleanup(struct net *net)
-{
- proc_net_remove(net, "ip_vs_app");
-}
-
-int __init ip_vs_app_init(void)
-{
+ proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
return 0;
}
-
-void ip_vs_app_cleanup(void)
+void __net_exit ip_vs_app_net_cleanup(struct net *net)
{
+ unregister_ip_vs_app(net, NULL /* all */);
+ remove_proc_entry("ip_vs_app", net->proc_net);
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index bf28ac2fc99..610e19c0e13 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -79,58 +79,28 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
struct ip_vs_aligned_lock
{
- rwlock_t l;
+ spinlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-static inline void ct_read_lock(unsigned key)
+static inline void ct_write_lock_bh(unsigned int key)
{
- read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
-static inline void ct_read_unlock(unsigned key)
+static inline void ct_write_unlock_bh(unsigned int key)
{
- read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock(unsigned key)
-{
- write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock(unsigned key)
-{
- write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned key)
-{
- read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned key)
-{
- read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock_bh(unsigned key)
-{
- write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock_bh(unsigned key)
-{
- write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
/*
* Returns hash value for IPVS connection entry
*/
-static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
const union nf_inet_addr *addr,
__be16 port)
{
@@ -188,7 +158,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
*/
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
- unsigned hash;
+ unsigned int hash;
int ret;
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
@@ -197,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/* Hash by protocol, client address and port */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
ret = 1;
} else {
pr_err("%s(): request for already hashed, called from %pF\n",
@@ -212,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
}
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -220,21 +190,21 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/*
* UNhashes ip_vs_conn from ip_vs_conn_tab.
- * returns bool success.
+ * returns bool success. Caller should hold conn reference.
*/
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
- unsigned hash;
+ unsigned int hash;
int ret;
/* unhash it and decrease its reference counter */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
- hlist_del(&cp->c_list);
+ hlist_del_rcu(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
@@ -242,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
ret = 0;
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -257,30 +257,30 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_conn *cp;
- struct hlist_node *n;
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
return cp;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
return NULL;
}
@@ -308,13 +308,12 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
static int
ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
- unsigned int proto_off, int inverse,
- struct ip_vs_conn_param *p)
+ int inverse, struct ip_vs_conn_param *p)
{
__be16 _ports[2], *pptr;
struct net *net = skb_net(skb);
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return 1;
@@ -329,12 +328,11 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off, int inverse)
+ const struct ip_vs_iphdr *iph, int inverse)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
return NULL;
return ip_vs_conn_in_get(&p);
@@ -344,20 +342,21 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
/* Get reference to connection template */
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_conn *cp;
- struct hlist_node *n;
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
- if (!ip_vs_conn_net_eq(cp, p->net))
- continue;
- if (p->pe_data && p->pe->ct_match) {
- if (p->pe == cp->pe && p->pe->ct_match(p, cp))
- goto out;
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (unlikely(p->pe_data && p->pe->ct_match)) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
continue;
}
@@ -367,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
* p->vaddr is a fwmark */
ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ p->vport == cp->vport && p->cport == cp->cport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- p->protocol == cp->protocol)
- goto out;
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
}
cp = NULL;
out:
- if (cp)
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -394,32 +394,32 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
* p->vaddr, p->vport: pkt dest address (foreign host) */
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_conn *cp, *ret=NULL;
- struct hlist_node *n;
/*
* Check for "full" addressed entries
*/
hash = ip_vs_conn_hashkey_param(p, true);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->vport == cp->cport && p->cport == cp->dport &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
ret = cp;
break;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -432,12 +432,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off, int inverse)
+ const struct ip_vs_iphdr *iph, int inverse)
{
struct ip_vs_conn_param p;
- if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
return NULL;
return ip_vs_conn_out_get(&p);
@@ -463,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
if (ip_vs_conn_unhash(cp)) {
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
atomic_dec(&ip_vs_conn_no_cport_cnt);
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
cp->cport = cport;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
/* hash on new dport */
ip_vs_conn_hash(cp);
@@ -548,28 +547,31 @@ static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
unsigned int conn_flags;
+ __u32 flags;
/* if dest is NULL, then return directly */
if (!dest)
return;
/* Increase the refcnt counter of the dest */
- atomic_inc(&dest->refcnt);
+ ip_vs_dest_hold(dest);
conn_flags = atomic_read(&dest->conn_flags);
if (cp->protocol != IPPROTO_UDP)
conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
+ flags = cp->flags;
/* Bind with the destination and its corresponding transmitter */
- if (cp->flags & IP_VS_CONN_F_SYNC) {
+ if (flags & IP_VS_CONN_F_SYNC) {
/* if the connection is not template and is created
* by sync, preserve the activity flag.
*/
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
conn_flags &= ~IP_VS_CONN_F_INACTIVE;
/* connections inherit forwarding method from dest */
- cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+ flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
}
- cp->flags |= conn_flags;
+ flags |= conn_flags;
+ cp->flags = flags;
cp->dest = dest;
IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -584,12 +586,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
atomic_read(&dest->refcnt));
/* Update the connection counters */
- if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
- /* It is a normal connection, so increase the inactive
- connection counter because it is in TCP SYNRECV
- state (inactive) or other protocol inacive state */
- if ((cp->flags & IP_VS_CONN_F_SYNC) &&
- (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so modify the counters
+ * according to the flags, later the protocol can
+ * update them on state change
+ */
+ if (!(flags & IP_VS_CONN_F_INACTIVE))
atomic_inc(&dest->activeconns);
else
atomic_inc(&dest->inactconns);
@@ -609,18 +611,46 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
* Check if there is a destination for the connection, if so
* bind the connection to the destination.
*/
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
struct ip_vs_dest *dest;
- if ((cp) && (!cp->dest)) {
- dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
- cp->dport, &cp->vaddr, cp->vport,
- cp->protocol, cp->fwmark);
+ rcu_read_lock();
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark, cp->flags);
+ if (dest) {
+ struct ip_vs_proto_data *pd;
+
+ spin_lock_bh(&cp->lock);
+ if (cp->dest) {
+ spin_unlock_bh(&cp->lock);
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Applications work depending on the forwarding method
+ * but better to reassign them always when binding dest */
+ if (cp->app)
+ ip_vs_unbind_app(cp);
+
ip_vs_bind_dest(cp, dest);
- return dest;
- } else
- return NULL;
+ spin_unlock_bh(&cp->lock);
+
+ /* Update its packet transmitter */
+ cp->packet_xmit = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ if (pd && atomic_read(&pd->appcnt))
+ ip_vs_bind_app(cp, pd->pp);
+ }
+ rcu_read_unlock();
}
@@ -672,12 +702,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
}
- /*
- * Simply decrease the refcnt of the dest, because the
- * dest will be either in service's destination list
- * or in the trash.
- */
- atomic_dec(&dest->refcnt);
+ ip_vs_dest_put(dest);
}
static int expire_quiescent_template(struct netns_ipvs *ipvs,
@@ -734,23 +759,27 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
* Simply decrease the refcnt of the template,
* don't restart its timer.
*/
- atomic_dec(&ct->refcnt);
+ __ip_vs_conn_put(ct);
return 0;
}
return 1;
}
-static void ip_vs_conn_expire(unsigned long data)
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
{
- struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
- struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+ rcu_head);
- cp->timeout = 60*HZ;
+ ip_vs_pe_put(cp->pe);
+ kfree(cp->pe_data);
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+}
- /*
- * hey, I'm using it
- */
- atomic_inc(&cp->refcnt);
+static void ip_vs_conn_expire(unsigned long data)
+{
+ struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct net *net = ip_vs_conn_net(cp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
/*
* do I control anybody?
@@ -758,56 +787,60 @@ static void ip_vs_conn_expire(unsigned long data)
if (atomic_read(&cp->n_control))
goto expire_later;
- /*
- * unhash it if it is hashed in the conn table
- */
- if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
- goto expire_later;
-
- /*
- * refcnt==1 implies I'm the only one referrer
- */
- if (likely(atomic_read(&cp->refcnt) == 1)) {
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
/* delete the timer if it is activated by other users */
- if (timer_pending(&cp->timer))
- del_timer(&cp->timer);
+ del_timer(&cp->timer);
/* does anybody control me? */
if (cp->control)
ip_vs_control_del(cp);
- if (cp->flags & IP_VS_CONN_F_NFCT)
- ip_vs_conn_drop_conntrack(cp);
+ if (cp->flags & IP_VS_CONN_F_NFCT) {
+ /* Do not access conntracks during subsys cleanup
+ * because nf_conntrack_find_get can not be used after
+ * conntrack cleanup for the net.
+ */
+ smp_rmb();
+ if (ipvs->enable)
+ ip_vs_conn_drop_conntrack(cp);
+ }
- ip_vs_pe_put(cp->pe);
- kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
-
- kmem_cache_free(ip_vs_conn_cachep, cp);
return;
}
- /* hash it back to the table */
- ip_vs_conn_hash(cp);
-
expire_later:
- IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
- atomic_read(&cp->refcnt)-1,
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
atomic_read(&cp->n_control));
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+
ip_vs_conn_put(cp);
}
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
- if (del_timer(&cp->timer))
- mod_timer(&cp->timer, jiffies);
+ /* Using mod_timer_pending will ensure the timer is not
+ * modified after the final del_timer in ip_vs_conn_expire.
+ */
+ if (timer_pending(&cp->timer) &&
+ time_after(cp->timer.expires, jiffies))
+ mod_timer_pending(&cp->timer, jiffies);
}
@@ -816,7 +849,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
*/
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p,
- const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
+ const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
struct ip_vs_dest *dest, __u32 fwmark)
{
struct ip_vs_conn *cp;
@@ -824,7 +857,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
p->protocol);
- cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NULL;
@@ -835,13 +868,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->protocol = p->protocol;
- ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+ ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
- ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+ /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
+ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ &cp->vaddr, p->vaddr);
cp->vport = p->vport;
- /* proto should only be IPPROTO_IP if d_addr is a fwmark */
- ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
- &cp->daddr, daddr);
+ ip_vs_addr_set(p->af, &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
cp->fwmark = fwmark;
@@ -850,6 +883,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
@@ -860,19 +897,30 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
*/
atomic_set(&cp->refcnt, 1);
+ cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
+ cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
+ cp->old_state = 0;
cp->timeout = 3*HZ;
+ cp->sync_endtime = jiffies & ~3UL;
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
@@ -915,27 +963,30 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
int idx;
struct ip_vs_conn *cp;
struct ip_vs_iter_state *iter = seq->private;
- struct hlist_node *n;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ /* __ip_vs_conn_get() is not needed by
+ * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+ */
if (pos-- == 0) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
}
- ct_read_unlock_bh(idx);
+ cond_resched_rcu();
}
return NULL;
}
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
struct ip_vs_iter_state *iter = seq->private;
iter->l = NULL;
+ rcu_read_lock();
return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
}
@@ -952,31 +1003,26 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return ip_vs_conn_array(seq, 0);
/* more on same hash chain? */
- if ((e = cp->c_list.next))
+ e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+ if (e)
return hlist_entry(e, struct ip_vs_conn, c_list);
idx = l - ip_vs_conn_tab;
- ct_read_unlock_bh(idx);
-
while (++idx < ip_vs_conn_tab_size) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
- ct_read_unlock_bh(idx);
+ cond_resched_rcu();
}
iter->l = NULL;
return NULL;
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
- struct ip_vs_iter_state *iter = seq->private;
- struct hlist_head *l = iter->l;
-
- if (l)
- ct_read_unlock_bh(l - ip_vs_conn_tab);
+ rcu_read_unlock();
}
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1049,7 +1095,7 @@ static const struct file_operations ip_vs_conn_fops = {
.release = seq_release_net,
};
-static const char *ip_vs_origin_name(unsigned flags)
+static const char *ip_vs_origin_name(unsigned int flags)
{
if (flags & IP_VS_CONN_F_SYNC)
return "SYNC";
@@ -1155,21 +1201,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
void ip_vs_random_dropentry(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
+ rcu_read_lock();
/*
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
- unsigned hash = net_random() & ip_vs_conn_tab_mask;
- struct hlist_node *n;
+ unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
- /*
- * Lock is actually needed in this loop.
- */
- ct_write_lock_bh(hash);
-
- hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
@@ -1189,6 +1230,18 @@ void ip_vs_random_dropentry(struct net *net)
default:
continue;
}
+ } else if (cp->protocol == IPPROTO_SCTP) {
+ switch (cp->state) {
+ case IP_VS_SCTP_S_INIT1:
+ case IP_VS_SCTP_S_INIT:
+ break;
+ case IP_VS_SCTP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+ default:
+ continue;
+ }
} else {
if (!todrop_entry(cp))
continue;
@@ -1196,13 +1249,17 @@ void ip_vs_random_dropentry(struct net *net)
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(hash);
+ cond_resched_rcu();
}
+ rcu_read_unlock();
}
@@ -1212,30 +1269,29 @@ void ip_vs_random_dropentry(struct net *net)
static void ip_vs_conn_flush(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
struct netns_ipvs *ipvs = net_ipvs(net);
flush_again:
+ rcu_read_lock();
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- struct hlist_node *n;
-
- /*
- * Lock is actually needed in this loop.
- */
- ct_write_lock_bh(idx);
- hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (!ip_vs_conn_net_eq(cp, net))
continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(idx);
+ cond_resched_rcu();
}
+ rcu_read_unlock();
/* the counter may be not NULL, because maybe some conn entries
are run by slow timer handler or unhashed but still referred */
@@ -1247,23 +1303,23 @@ flush_again:
/*
* per netns init and exit
*/
-int __net_init __ip_vs_conn_init(struct net *net)
+int __net_init ip_vs_conn_net_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
atomic_set(&ipvs->conn_count, 0);
- proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
- proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+ proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
+ proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
return 0;
}
-void __net_exit __ip_vs_conn_cleanup(struct net *net)
+void __net_exit ip_vs_conn_net_cleanup(struct net *net)
{
/* flush all the connection entries first */
ip_vs_conn_flush(net);
- proc_net_remove(net, "ip_vs_conn");
- proc_net_remove(net, "ip_vs_conn_sync");
+ remove_proc_entry("ip_vs_conn", net->proc_net);
+ remove_proc_entry("ip_vs_conn_sync", net->proc_net);
}
int __init ip_vs_conn_init(void)
@@ -1301,7 +1357,7 @@ int __init ip_vs_conn_init(void)
INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
- rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
/* calculate the random value for connection hash */
@@ -1312,6 +1368,8 @@ int __init ip_vs_conn_init(void)
void ip_vs_conn_cleanup(void)
{
+ /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
vfree(ip_vs_conn_tab);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index bfa808f4da1..e6836755c45 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
-int ip_vs_net_id __read_mostly;
-#ifdef IP_VS_GENERIC_NETNS
-EXPORT_SYMBOL(ip_vs_net_id);
-#endif
+static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
@@ -80,7 +77,7 @@ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
#define icmp_id(icmph) (((icmph)->un).echo.id)
#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
-const char *ip_vs_proto_name(unsigned proto)
+const char *ip_vs_proto_name(unsigned int proto)
{
static char buf[20];
@@ -100,7 +97,7 @@ const char *ip_vs_proto_name(unsigned proto)
return "ICMPv6";
#endif
default:
- sprintf(buf, "IP_%d", proto);
+ sprintf(buf, "IP_%u", proto);
return buf;
}
}
@@ -119,6 +116,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
s = this_cpu_ptr(dest->stats.cpustats);
s->ustats.inpkts++;
@@ -126,11 +124,14 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s->ustats.inbytes += skb->len;
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(dest->svc->stats.cpustats);
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
s->ustats.inpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.inbytes += skb->len;
u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
s = this_cpu_ptr(ipvs->tot_stats.cpustats);
s->ustats.inpkts++;
@@ -149,6 +150,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
s = this_cpu_ptr(dest->stats.cpustats);
s->ustats.outpkts++;
@@ -156,11 +158,14 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s->ustats.outbytes += skb->len;
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(dest->svc->stats.cpustats);
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
s->ustats.outpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.outbytes += skb->len;
u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
s = this_cpu_ptr(ipvs->tot_stats.cpustats);
s->ustats.outpkts++;
@@ -188,14 +193,13 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
}
-static inline int
+static inline void
ip_vs_set_state(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_proto_data *pd)
{
- if (unlikely(!pd->pp->state_transition))
- return 0;
- return pd->pp->state_transition(cp, direction, skb, pd);
+ if (likely(pd->pp->state_transition))
+ pd->pp->state_transition(cp, direction, skb, pd);
}
static inline int
@@ -207,7 +211,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
{
ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
vport, p);
- p->pe = svc->pe;
+ p->pe = rcu_dereference(svc->pe);
if (p->pe && p->pe->fill_param)
return p->pe->fill_param(p, skb);
@@ -223,33 +227,32 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
*/
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
- struct sk_buff *skb,
- __be16 src_port, __be16 dst_port, int *ignored)
+ struct sk_buff *skb, __be16 src_port, __be16 dst_port,
+ int *ignored, struct ip_vs_iphdr *iph)
{
struct ip_vs_conn *cp = NULL;
- struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
struct ip_vs_conn *ct;
__be16 dport = 0; /* destination port to forward */
unsigned int flags;
struct ip_vs_conn_param param;
+ const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
union nf_inet_addr snet; /* source network of the client,
after masking */
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
/* Mask saddr with the netmask to adjust template granularity */
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
- ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
+ ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ (__force __u32) svc->netmask);
else
#endif
- snet.ip = iph.saddr.ip & svc->netmask;
+ snet.ip = iph->saddr.ip & svc->netmask;
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
- IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -266,9 +269,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* is created for other persistent services.
*/
{
- int protocol = iph.protocol;
- const union nf_inet_addr *vaddr = &iph.daddr;
- const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+ int protocol = iph->protocol;
+ const union nf_inet_addr *vaddr = &iph->daddr;
__be16 vport = 0;
if (dst_port == svc->port) {
@@ -303,12 +305,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
if (!ct || !ip_vs_check_template(ct)) {
+ struct ip_vs_scheduler *sched;
+
/*
* No template found or the dest of the connection
* template is not available.
* return *ignored=0 i.e. ICMP and NF_DROP
*/
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
@@ -343,14 +348,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
dport = dest->port;
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
- && iph.protocol == IPPROTO_UDP)?
+ && iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
- src_port, &iph.daddr, dst_port, &param);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
+ src_port, &iph->daddr, dst_port, &param);
cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
if (cp == NULL) {
@@ -393,18 +398,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_proto_data *pd, int *ignored)
+ struct ip_vs_proto_data *pd, int *ignored,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
- struct ip_vs_iphdr iph;
+ struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
unsigned int flags;
*ignored = 1;
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
- pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+ /*
+ * IPv6 frags, only the first hit here.
+ */
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return NULL;
@@ -424,7 +432,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
* Do not schedule replies from local real server.
*/
if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
+ (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
"Not scheduling reply for existing connection");
__ip_vs_conn_put(cp);
@@ -435,7 +443,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
* Persistent service
*/
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
- return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
+ iph);
*ignored = 0;
@@ -450,14 +459,15 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return NULL;
}
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
}
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
- && iph.protocol == IPPROTO_UDP)?
+ && iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
/*
@@ -466,9 +476,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
- &iph.saddr, pptr[0], &iph.daddr, pptr[1],
- &p);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0], &iph->daddr,
+ pptr[1], &p);
cp = ip_vs_conn_new(&p, &dest->addr,
dest->port ? dest->port : pptr[1],
flags, dest, skb->mark);
@@ -497,21 +507,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
* no destination is available for a new connection.
*/
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
- struct ip_vs_proto_data *pd)
+ struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
{
__be16 _ports[2], *pptr;
- struct ip_vs_iphdr iph;
#ifdef CONFIG_SYSCTL
struct net *net;
struct netns_ipvs *ipvs;
int unicast;
#endif
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
- pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL) {
- ip_vs_service_put(svc);
return NF_DROP;
}
@@ -520,32 +526,30 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
- unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
+ unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
else
#endif
- unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
+ unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is a non-local unicast, then create
a cache_bypass connection entry */
ipvs = net_ipvs(net);
if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
- int ret, cs;
+ int ret;
struct ip_vs_conn *cp;
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
- iph.protocol == IPPROTO_UDP)?
+ iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
- ip_vs_service_put(svc);
-
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
- &iph.saddr, pptr[0],
- &iph.daddr, pptr[1], &p);
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0],
+ &iph->daddr, pptr[1], &p);
cp = ip_vs_conn_new(&p, &daddr, 0,
IP_VS_CONN_F_BYPASS | flags,
NULL, skb->mark);
@@ -557,10 +561,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ip_vs_in_stats(cp, skb);
/* set state */
- cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
/* transmit the first SYN packet */
- ret = cp->packet_xmit(skb, cp, pd->pp);
+ ret = cp->packet_xmit(skb, cp, pd->pp, iph);
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
@@ -575,12 +579,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets.
*/
- if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
- ip_vs_service_put(svc);
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
return NF_ACCEPT;
- }
-
- ip_vs_service_put(svc);
/*
* Notify the client that the destination is unreachable, and
@@ -592,9 +592,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6) {
if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net_ = dev_net(skb_dst(skb)->dev);
- skb->dev = net->loopback_dev;
+ skb->dev = net_->loopback_dev;
}
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
} else
@@ -647,22 +647,17 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
- int err = ip_defrag(skb, user);
+ int err;
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
if (!err)
ip_send_check(ip_hdr(skb));
return err;
}
-#ifdef CONFIG_IP_VS_IPV6
-static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
-{
- /* TODO IPv6: Find out what to do here for IPv6 */
- return 0;
-}
-#endif
-
static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
{
#ifdef CONFIG_IP_VS_IPV6
@@ -733,10 +728,19 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int inout)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
- unsigned int icmp_offset = sizeof(struct ipv6hdr);
- struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
- icmp_offset);
- struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
+ unsigned int icmp_offset = 0;
+ unsigned int offs = 0; /* header offset*/
+ int protocol;
+ struct icmp6hdr *icmph;
+ struct ipv6hdr *ciph;
+ unsigned short fragoffs;
+
+ ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
+ icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
+ offs = icmp_offset + sizeof(struct icmp6hdr);
+ ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
+
+ protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);
if (inout) {
iph->saddr = cp->vaddr.in6;
@@ -747,10 +751,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
}
/* the TCP/UDP/SCTP port */
- if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
- IPPROTO_SCTP == ciph->nexthdr) {
- __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
+ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)) {
+ __be16 *ports = (void *)(skb_network_header(skb) + offs);
+ IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
+ ntohs(inout ? ports[1] : ports[0]),
+ ntohs(inout ? cp->vport : cp->dport));
if (inout)
ports[1] = cp->vport;
else
@@ -852,7 +859,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
*related = 1;
/* reassemble IP fragments */
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -899,51 +906,35 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
"Checking outgoing ICMP for");
- offset += cih->ihl * 4;
-
- ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
if (!cp)
return NF_ACCEPT;
snet.ip = iph->saddr;
return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
- pp, offset, ihl);
+ pp, ciph.len, ihl);
}
#ifdef CONFIG_IP_VS_IPV6
static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
- unsigned int hooknum)
+ unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
{
- struct ipv6hdr *iph;
struct icmp6hdr _icmph, *ic;
- struct ipv6hdr _ciph, *cih; /* The ip header contained
- within the ICMP */
- struct ip_vs_iphdr ciph;
+ struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
- unsigned int offset;
union nf_inet_addr snet;
+ unsigned int writable;
*related = 1;
-
- /* reassemble IP fragments */
- if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
- return NF_STOLEN;
- }
-
- iph = ipv6_hdr(skb);
- offset = sizeof(struct ipv6hdr);
- ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
if (ic == NULL)
return NF_DROP;
- IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
- ic->icmp6_type, ntohs(icmpv6_id(ic)),
- &iph->saddr, &iph->daddr);
-
/*
* Work through seeing if this is for us.
* These checks are supposed to be in an order that means easy
@@ -951,42 +942,45 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
- if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
- (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
- (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
*related = 0;
return NF_ACCEPT;
}
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (ipvsh->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &ipvsh->saddr, &ipvsh->daddr);
/* Now find the contained IP header */
- offset += sizeof(_icmph);
- cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
- if (cih == NULL)
+ ciph.len = ipvsh->len + sizeof(_icmph);
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
-
- pp = ip_vs_proto_get(cih->nexthdr);
+ ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ pp = ip_vs_proto_get(ciph.protocol);
if (!pp)
return NF_ACCEPT;
- /* Is the embedded protocol header present? */
- /* TODO: we don't support fragmentation at the moment anyways */
- if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
- return NF_ACCEPT;
-
- IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
- "Checking outgoing ICMPv6 for");
-
- offset += sizeof(struct ipv6hdr);
-
- ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
if (!cp)
return NF_ACCEPT;
- ipv6_addr_copy(&snet.in6, &iph->saddr);
- return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
- pp, offset, sizeof(struct ipv6hdr));
+ snet.in6 = ciph.saddr.in6;
+ writable = ciph.len;
+ return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
+ pp, writable, sizeof(struct ipv6hdr));
}
#endif
@@ -1015,21 +1009,47 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
return th->rst;
}
+static inline bool is_new_conn(const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return false;
+ return th->syn;
+ }
+ case IPPROTO_SCTP: {
+ sctp_chunkhdr_t *sch, schunk;
+
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(schunk), &schunk);
+ if (sch == NULL)
+ return false;
+ return sch->type == SCTP_CID_INIT;
+ }
+ default:
+ return false;
+ }
+}
+
/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
- struct ip_vs_conn *cp, int ihl)
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct ip_vs_protocol *pp = pd->pp;
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
- if (!skb_make_writable(skb, ihl))
+ if (!skb_make_writable(skb, iph->len))
goto drop;
/* mangle the packet */
- if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
goto drop;
#ifdef CONFIG_IP_VS_IPV6
@@ -1116,17 +1136,16 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (!net_ipvs(net)->enable)
return NF_ACCEPT;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(af, skb, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_out_icmp_v6(skb, &related,
- hooknum);
+ hooknum, &iph);
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
@@ -1136,7 +1155,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
pd = ip_vs_proto_data_get(net, iph.protocol);
@@ -1146,45 +1164,35 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6) {
- if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb,
- ip_vs_defrag_user(hooknum)))
- return NF_STOLEN;
- }
-
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
- } else
+ if (af == AF_INET)
#endif
- if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
- !pp->dont_defrag)) {
+ if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
if (ip_vs_gather_frags(skb,
ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
}
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
+ cp = pp->conn_out_get(af, skb, &iph, 0);
if (likely(cp))
- return handle_response(af, skb, pd, cp, iph.len);
+ return handle_response(af, skb, pd, cp, &iph);
if (sysctl_nat_icmp_send(net) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
pp->protocol == IPPROTO_SCTP)) {
__be16 _ports[2], *pptr;
- pptr = skb_header_pointer(skb, iph.len,
- sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph.len,
+ sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(net, af, iph.protocol,
- &iph.saddr,
- pptr[0])) {
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
/*
* Notify the real server: there is no
* existing entry if it is not RST
@@ -1199,9 +1207,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- struct net *net =
- dev_net(skb_dst(skb)->dev);
-
if (!skb->dev)
skb->dev = net->loopback_dev;
icmpv6_send(skb,
@@ -1228,11 +1233,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_out(hooknum, skb, AF_INET);
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
}
/*
@@ -1240,17 +1245,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1261,11 +1260,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_out(hooknum, skb, AF_INET6);
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
}
/*
@@ -1273,17 +1272,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
* Check if packet is reply for established ip_vs_conn.
*/
static unsigned int
-ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
}
#endif
@@ -1305,12 +1298,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
- unsigned int offset, ihl, verdict;
+ unsigned int offset, offset2, ihl, verdict;
+ bool ipip;
*related = 1;
/* reassemble IP fragments */
- if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_is_fragment(ip_hdr(skb))) {
if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1347,6 +1341,21 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
net = skb_net(skb);
+ /* Special case for errors for IPIP packets */
+ ipip = false;
+ if (cih->protocol == IPPROTO_IPIP) {
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ /* Error for our IPIP must arrive at LOCAL_IN */
+ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
+ return NF_ACCEPT;
+ offset += cih->ihl * 4;
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ipip = true;
+ }
+
pd = ip_vs_proto_data_get(net, cih->protocol);
if (!pd)
return NF_ACCEPT;
@@ -1360,11 +1369,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
"Checking incoming ICMP for");
- offset += cih->ihl * 4;
-
- ip_vs_fill_iphdr(AF_INET, cih, &ciph);
- /* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
+ offset2 = offset;
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
+ offset = ciph.len;
+ /* The embedded headers contain source and dest in reverse order.
+ * For IPIP this is error for request, not for reply.
+ */
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
if (!cp)
return NF_ACCEPT;
@@ -1378,51 +1390,95 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
goto out;
}
+ if (ipip) {
+ __be32 info = ic->un.gateway;
+ __u8 type = ic->type;
+ __u8 code = ic->code;
+
+ /* Update the MTU */
+ if (ic->type == ICMP_DEST_UNREACH &&
+ ic->code == ICMP_FRAG_NEEDED) {
+ struct ip_vs_dest *dest = cp->dest;
+ u32 mtu = ntohs(ic->un.frag.mtu);
+ __be16 frag_off = cih->frag_off;
+
+ /* Strip outer IP and ICMP, go to IPIP header */
+ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
+ goto ignore_ipip;
+ offset2 -= ihl + sizeof(_icmph);
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ ipv4_update_pmtu(skb, dev_net(skb->dev),
+ mtu, 0, 0, 0, 0);
+ /* Client uses PMTUD? */
+ if (!(frag_off & htons(IP_DF)))
+ goto ignore_ipip;
+ /* Prefer the resulting PMTU */
+ if (dest) {
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
+ }
+ if (mtu > 68 + sizeof(struct iphdr))
+ mtu -= sizeof(struct iphdr);
+ info = htonl(mtu);
+ }
+ /* Strip outer IP, ICMP and IPIP, go to IP header of
+ * original request.
+ */
+ if (pskb_pull(skb, offset2) == NULL)
+ goto ignore_ipip;
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ type, code, ntohl(info));
+ icmp_send(skb, type, code, info);
+ /* ICMP can be shorter but anyways, account it */
+ ip_vs_out_stats(cp, skb);
+
+ignore_ipip:
+ consume_skb(skb);
+ verdict = NF_STOLEN;
+ goto out;
+ }
+
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
- if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+ if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
+ IPPROTO_SCTP == cih->protocol)
offset += 2 * sizeof(__u16);
- verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum);
+ verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
- out:
+out:
__ip_vs_conn_put(cp);
return verdict;
}
#ifdef CONFIG_IP_VS_IPV6
-static int
-ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
+static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *iph)
{
struct net *net = NULL;
- struct ipv6hdr *iph;
+ struct ipv6hdr _ip6h, *ip6h;
struct icmp6hdr _icmph, *ic;
- struct ipv6hdr _ciph, *cih; /* The ip header contained
- within the ICMP */
- struct ip_vs_iphdr ciph;
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
- unsigned int offset, verdict;
+ unsigned int offs_ciph, writable, verdict;
*related = 1;
- /* reassemble IP fragments */
- if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
- return NF_STOLEN;
- }
-
- iph = ipv6_hdr(skb);
- offset = sizeof(struct ipv6hdr);
- ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
if (ic == NULL)
return NF_DROP;
- IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
- ic->icmp6_type, ntohs(icmpv6_id(ic)),
- &iph->saddr, &iph->daddr);
-
/*
* Work through seeing if this is for us.
* These checks are supposed to be in an order that means easy
@@ -1430,47 +1486,71 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
- if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
- (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
- (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
*related = 0;
return NF_ACCEPT;
}
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (iph->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &iph->saddr, &iph->daddr);
/* Now find the contained IP header */
- offset += sizeof(_icmph);
- cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
- if (cih == NULL)
+ ciph.len = iph->len + sizeof(_icmph);
+ offs_ciph = ciph.len; /* Save ip header offset */
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
net = skb_net(skb);
- pd = ip_vs_proto_data_get(net, cih->nexthdr);
+ pd = ip_vs_proto_data_get(net, ciph.protocol);
if (!pd)
return NF_ACCEPT;
pp = pd->pp;
- /* Is the embedded protocol header present? */
- /* TODO: we don't support fragmentation at the moment anyways */
- if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+ /* Cannot handle fragmented embedded protocol */
+ if (ciph.fragoffs)
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
"Checking incoming ICMPv6 for");
- offset += sizeof(struct ipv6hdr);
+ /* The embedded headers contain source and dest in reverse order
+ * if not from localhost
+ */
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph,
+ (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
- ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
- /* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
+ /* VS/TUN, VS/DR and LOCALNODE just let it go */
+ if ((hooknum == NF_INET_LOCAL_OUT) &&
+ (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
+ __ip_vs_conn_put(cp);
+ return NF_ACCEPT;
+ }
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
- if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
- IPPROTO_SCTP == cih->nexthdr)
- offset += 2 * sizeof(__u16);
- verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum);
+
+ /* Need to mangle contained IPv6 header in ICMPv6 packet */
+ writable = ciph.len;
+ if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
+ IPPROTO_SCTP == ciph.protocol)
+ writable += 2 * sizeof(__u16); /* Also mangle ports */
+
+ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
__ip_vs_conn_put(cp);
@@ -1491,7 +1571,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
- int ret, restart, pkts;
+ int ret, pkts;
struct netns_ipvs *ipvs;
/* Already marked as IPVS request or reply? */
@@ -1506,7 +1586,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (unlikely((skb->pkt_type != PACKET_HOST &&
hooknum != NF_INET_LOCAL_OUT) ||
!skb_dst(skb))) {
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(af, skb, &iph);
IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
" ignored in hook %u\n",
skb->pkt_type, iph.protocol,
@@ -1515,10 +1595,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
/* ipvs enabled in this netns ? */
net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(af, skb, &iph);
/* Bad... Do not break raw sockets */
if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
@@ -1534,11 +1615,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
- int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+ int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
+ &iph);
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
#endif
@@ -1548,7 +1629,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
if (related)
return verdict;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
/* Protocol supported? */
@@ -1559,12 +1639,24 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
+ cp = pp->conn_in_get(af, skb, &iph, 0);
- if (unlikely(!cp)) {
+ if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
+ unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
+ is_new_conn(skb, &iph)) {
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ }
+
+ if (unlikely(!cp) && !iph.fragoffs) {
+ /* No (second) fragments need to enter here, as nf_defrag_ipv6
+ * replayed fragment zero will already have created the cp
+ */
int v;
- if (!pp->conn_schedule(af, skb, pd, &v, &cp))
+ /* Schedule and create new connection entry into &cp */
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
return v;
}
@@ -1572,11 +1664,17 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, af, pp, skb, 0,
"ip_vs_in: packet continues traversal as normal");
+ if (iph.fragoffs) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+ }
return NF_ACCEPT;
}
IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
- ipvs = net_ipvs(net);
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
@@ -1592,9 +1690,9 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
}
ip_vs_in_stats(cp, skb);
- restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
- ret = cp->packet_xmit(skb, cp, pp);
+ ret = cp->packet_xmit(skb, cp, pp, &iph);
/* do not touch skb anymore */
else {
IP_VS_DBG_RL("warning: packet_xmit is null");
@@ -1615,34 +1713,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
else
pkts = atomic_add_return(1, &cp->in_pkts);
- if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
- cp->protocol == IPPROTO_SCTP) {
- if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (pkts % sysctl_sync_period(ipvs)
- == sysctl_sync_threshold(ipvs))) ||
- (cp->old_state != cp->state &&
- ((cp->state == IP_VS_SCTP_S_CLOSED) ||
- (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
- (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
- ip_vs_sync_conn(net, cp);
- goto out;
- }
- }
-
- /* Keep this block last: TCP and others with pp->num_states <= 1 */
- else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
- (((cp->protocol != IPPROTO_TCP ||
- cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (pkts % sysctl_sync_period(ipvs)
- == sysctl_sync_threshold(ipvs))) ||
- ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
- ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
- (cp->state == IP_VS_TCP_S_CLOSE) ||
- (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
- (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
- ip_vs_sync_conn(net, cp);
-out:
- cp->old_state = cp->state;
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, pkts);
ip_vs_conn_put(cp);
return ret;
@@ -1653,12 +1725,12 @@ out:
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_in(hooknum, skb, AF_INET);
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
}
/*
@@ -1666,17 +1738,11 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1686,12 +1752,12 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
* Schedule and forward packets from remote clients
*/
static unsigned int
-ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- return ip_vs_in(hooknum, skb, AF_INET6);
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
}
/*
@@ -1699,17 +1765,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
* Schedule and forward packets from local clients
*/
static unsigned int
-ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
}
#endif
@@ -1725,42 +1785,48 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
* and send them to ip_vs_in_icmp.
*/
static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
int r;
struct net *net;
+ struct netns_ipvs *ipvs;
if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
/* ipvs enabled in this netns ? */
net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp(skb, &r, hooknum);
+ return ip_vs_in_icmp(skb, &r, ops->hooknum);
}
#ifdef CONFIG_IP_VS_IPV6
static unsigned int
-ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
int r;
struct net *net;
+ struct netns_ipvs *ipvs;
+ struct ip_vs_iphdr iphdr;
- if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+ ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+ if (iphdr.protocol != IPPROTO_ICMPV6)
return NF_ACCEPT;
/* ipvs enabled in this netns ? */
net = skb_net(skb);
- if (!net_ipvs(net)->enable)
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
return NF_ACCEPT;
- return ip_vs_in_icmp_v6(skb, &r, hooknum);
+ return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
}
#endif
@@ -1770,9 +1836,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 99,
+ .priority = NF_IP_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
@@ -1780,32 +1846,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_remote_request4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 101,
+ .priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -99,
+ .priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -98,
+ .priority = NF_IP_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
@@ -1813,7 +1879,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply4,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
@@ -1822,9 +1888,9 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 99,
+ .priority = NF_IP6_PRI_NAT_SRC - 2,
},
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
@@ -1832,32 +1898,32 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_remote_request6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
- .priority = 101,
+ .priority = NF_IP6_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
.hook = ip_vs_local_reply6,
.owner = THIS_MODULE,
- .pf = PF_INET,
+ .pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -99,
+ .priority = NF_IP6_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
.hook = ip_vs_local_request6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
- .priority = -98,
+ .priority = NF_IP6_PRI_NAT_DST + 2,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
.hook = ip_vs_forward_icmp_v6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
@@ -1865,7 +1931,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
{
.hook = ip_vs_reply6,
.owner = THIS_MODULE,
- .pf = PF_INET6,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
@@ -1879,10 +1945,9 @@ static int __net_init __ip_vs_init(struct net *net)
struct netns_ipvs *ipvs;
ipvs = net_generic(net, ip_vs_net_id);
- if (ipvs == NULL) {
- pr_err("%s(): no memory.\n", __func__);
+ if (ipvs == NULL)
return -ENOMEM;
- }
+
/* Hold the beast until a service is registerd */
ipvs->enable = 0;
ipvs->net = net;
@@ -1891,22 +1956,22 @@ static int __net_init __ip_vs_init(struct net *net)
atomic_inc(&ipvs_netns_cnt);
net->ipvs = ipvs;
- if (__ip_vs_estimator_init(net) < 0)
+ if (ip_vs_estimator_net_init(net) < 0)
goto estimator_fail;
- if (__ip_vs_control_init(net) < 0)
+ if (ip_vs_control_net_init(net) < 0)
goto control_fail;
- if (__ip_vs_protocol_init(net) < 0)
+ if (ip_vs_protocol_net_init(net) < 0)
goto protocol_fail;
- if (__ip_vs_app_init(net) < 0)
+ if (ip_vs_app_net_init(net) < 0)
goto app_fail;
- if (__ip_vs_conn_init(net) < 0)
+ if (ip_vs_conn_net_init(net) < 0)
goto conn_fail;
- if (__ip_vs_sync_init(net) < 0)
+ if (ip_vs_sync_net_init(net) < 0)
goto sync_fail;
printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
@@ -1917,35 +1982,38 @@ static int __net_init __ip_vs_init(struct net *net)
*/
sync_fail:
- __ip_vs_conn_cleanup(net);
+ ip_vs_conn_net_cleanup(net);
conn_fail:
- __ip_vs_app_cleanup(net);
+ ip_vs_app_net_cleanup(net);
app_fail:
- __ip_vs_protocol_cleanup(net);
+ ip_vs_protocol_net_cleanup(net);
protocol_fail:
- __ip_vs_control_cleanup(net);
+ ip_vs_control_net_cleanup(net);
control_fail:
- __ip_vs_estimator_cleanup(net);
+ ip_vs_estimator_net_cleanup(net);
estimator_fail:
+ net->ipvs = NULL;
return -ENOMEM;
}
static void __net_exit __ip_vs_cleanup(struct net *net)
{
- __ip_vs_service_cleanup(net); /* ip_vs_flush() with locks */
- __ip_vs_conn_cleanup(net);
- __ip_vs_app_cleanup(net);
- __ip_vs_protocol_cleanup(net);
- __ip_vs_control_cleanup(net);
- __ip_vs_estimator_cleanup(net);
+ ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
+ ip_vs_conn_net_cleanup(net);
+ ip_vs_app_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(net);
+ ip_vs_control_net_cleanup(net);
+ ip_vs_estimator_net_cleanup(net);
IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+ net->ipvs = NULL;
}
static void __net_exit __ip_vs_dev_cleanup(struct net *net)
{
EnterFunction(2);
net_ipvs(net)->enable = 0; /* Disable packet reception */
- __ip_vs_sync_cleanup(net);
+ smp_wmb();
+ ip_vs_sync_net_cleanup(net);
LeaveFunction(2);
}
@@ -1967,36 +2035,23 @@ static int __init ip_vs_init(void)
{
int ret;
- ip_vs_estimator_init();
ret = ip_vs_control_init();
if (ret < 0) {
pr_err("can't setup control.\n");
- goto cleanup_estimator;
+ goto exit;
}
ip_vs_protocol_init();
- ret = ip_vs_app_init();
- if (ret < 0) {
- pr_err("can't setup application helper.\n");
- goto cleanup_protocol;
- }
-
ret = ip_vs_conn_init();
if (ret < 0) {
pr_err("can't setup connection table.\n");
- goto cleanup_app;
- }
-
- ret = ip_vs_sync_init();
- if (ret < 0) {
- pr_err("can't setup sync data.\n");
- goto cleanup_conn;
+ goto cleanup_protocol;
}
ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
if (ret < 0)
- goto cleanup_sync;
+ goto cleanup_conn;
ret = register_pernet_device(&ipvs_core_dev_ops);
if (ret < 0)
@@ -2008,39 +2063,40 @@ static int __init ip_vs_init(void)
goto cleanup_dev;
}
+ ret = ip_vs_register_nl_ioctl();
+ if (ret < 0) {
+ pr_err("can't register netlink/ioctl.\n");
+ goto cleanup_hooks;
+ }
+
pr_info("ipvs loaded.\n");
return ret;
+cleanup_hooks:
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
cleanup_dev:
unregister_pernet_device(&ipvs_core_dev_ops);
cleanup_sub:
unregister_pernet_subsys(&ipvs_core_ops);
-cleanup_sync:
- ip_vs_sync_cleanup();
- cleanup_conn:
+cleanup_conn:
ip_vs_conn_cleanup();
- cleanup_app:
- ip_vs_app_cleanup();
- cleanup_protocol:
+cleanup_protocol:
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
- cleanup_estimator:
- ip_vs_estimator_cleanup();
+exit:
return ret;
}
static void __exit ip_vs_cleanup(void)
{
+ ip_vs_unregister_nl_ioctl();
nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
unregister_pernet_device(&ipvs_core_dev_ops);
unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
- ip_vs_sync_cleanup();
ip_vs_conn_cleanup();
- ip_vs_app_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
- ip_vs_estimator_cleanup();
pr_info("ipvs unloaded.\n");
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 699c79a5565..581a6584ed0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -55,9 +55,6 @@
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
/* sysctl variables */
#ifdef CONFIG_IP_VS_DEBUG
@@ -71,24 +68,24 @@ int ip_vs_get_debug_level(void)
/* Protos */
-static void __ip_vs_del_service(struct ip_vs_service *svc);
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
-static int __ip_vs_addr_is_local_v6(struct net *net,
- const struct in6_addr *addr)
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
{
- struct rt6_info *rt;
struct flowi6 fl6 = {
.daddr = *addr,
};
+ struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
+ bool is_local;
- rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
- if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
- return 1;
+ is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
- return 0;
+ dst_release(dst);
+ return is_local;
}
#endif
@@ -257,36 +254,38 @@ ip_vs_use_count_dec(void)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
* Returns hash value for virtual service
*/
-static inline unsigned
-ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
+static inline unsigned int
+ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
const union nf_inet_addr *addr, __be16 port)
{
- register unsigned porth = ntohs(port);
+ register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
+ __u32 ahash;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- addr_fold ^= ((size_t)net>>8);
+ ahash = ntohl(addr_fold);
+ ahash ^= ((size_t) net >> 8);
- return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
- & IP_VS_SVC_TAB_MASK;
+ return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
+ IP_VS_SVC_TAB_MASK;
}
/*
* Returns hash value of fwmark for virtual service lookup
*/
-static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
+static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
{
return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
@@ -298,7 +297,7 @@ static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
*/
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
- unsigned hash;
+ unsigned int hash;
if (svc->flags & IP_VS_SVC_F_HASHED) {
pr_err("%s(): request for already hashed, called from %pF\n",
@@ -312,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
*/
hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
&svc->addr, svc->port);
- list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+ hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
* Hash it by fwmark in svc_fwm_table
*/
hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
- list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
svc->flags |= IP_VS_SVC_F_HASHED;
@@ -342,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
if (svc->fwmark == 0) {
/* Remove it from the svc_table table */
- list_del(&svc->s_list);
+ hlist_del_rcu(&svc->s_list);
} else {
/* Remove it from the svc_fwm_table table */
- list_del(&svc->f_list);
+ hlist_del_rcu(&svc->f_list);
}
svc->flags &= ~IP_VS_SVC_F_HASHED;
@@ -361,13 +360,13 @@ static inline struct ip_vs_service *
__ip_vs_service_find(struct net *net, int af, __u16 protocol,
const union nf_inet_addr *vaddr, __be16 vport)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_service *svc;
/* Check for "full" addressed entries */
hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
- list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
@@ -388,13 +387,13 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol,
static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
{
- unsigned hash;
+ unsigned int hash;
struct ip_vs_service *svc;
/* Check for fwmark addressed entries */
hash = ip_vs_svc_fwm_hashkey(net, fwmark);
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af
&& net_eq(svc->net, net)) {
/* HIT */
@@ -405,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
return NULL;
}
+/* Find service, called under RCU lock */
struct ip_vs_service *
-ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
- const union nf_inet_addr *vaddr, __be16 vport)
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
struct netns_ipvs *ipvs = net_ipvs(net);
- read_lock(&__ip_vs_svc_lock);
-
/*
* Check the table hashed by fwmark first
*/
@@ -449,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
}
out:
- if (svc)
- atomic_inc(&svc->usecnt);
- read_unlock(&__ip_vs_svc_lock);
-
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
@@ -466,22 +460,35 @@ static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
atomic_inc(&svc->refcnt);
- dest->svc = svc;
+ rcu_assign_pointer(dest->svc, svc);
}
-static void
-__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+static void ip_vs_service_free(struct ip_vs_service *svc)
{
- struct ip_vs_service *svc = dest->svc;
+ if (svc->stats.cpustats)
+ free_percpu(svc->stats.cpustats);
+ kfree(svc);
+}
- dest->svc = NULL;
+static void ip_vs_service_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_service *svc;
+
+ svc = container_of(head, struct ip_vs_service, rcu_head);
+ ip_vs_service_free(svc);
+}
+
+static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+{
if (atomic_dec_and_test(&svc->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ntohs(svc->port));
+ if (do_delay)
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
+ else
+ ip_vs_service_free(svc);
}
}
@@ -489,11 +496,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
/*
* Returns hash value for real service
*/
-static inline unsigned ip_vs_rs_hashkey(int af,
+static inline unsigned int ip_vs_rs_hashkey(int af,
const union nf_inet_addr *addr,
__be16 port)
{
- register unsigned porth = ntohs(port);
+ register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
#ifdef CONFIG_IP_VS_IPV6
@@ -506,17 +513,13 @@ static inline unsigned ip_vs_rs_hashkey(int af,
& IP_VS_RTAB_MASK;
}
-/*
- * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
- * should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
- unsigned hash;
+ unsigned int hash;
- if (!list_empty(&dest->d_list)) {
- return 0;
- }
+ if (dest->in_rs_table)
+ return;
/*
* Hash by proto,addr,port,
@@ -524,65 +527,51 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ipvs->rs_table[hash]);
-
- return 1;
+ hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+ dest->in_rs_table = 1;
}
-/*
- * UNhashes ip_vs_dest from rs_table.
- * should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
* Remove it from the rs_table table.
*/
- if (!list_empty(&dest->d_list)) {
- list_del(&dest->d_list);
- INIT_LIST_HEAD(&dest->d_list);
+ if (dest->in_rs_table) {
+ hlist_del_rcu(&dest->d_list);
+ dest->in_rs_table = 0;
}
-
- return 1;
}
-/*
- * Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
- const union nf_inet_addr *daddr,
- __be16 dport)
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- unsigned hash;
+ unsigned int hash;
struct ip_vs_dest *dest;
- /*
- * Check for "full" addressed entries
- * Return the first found entry
- */
+ /* Check for "full" addressed entries */
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&ipvs->rs_lock);
- list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
- if ((dest->af == af)
- && ip_vs_addr_equal(af, &dest->addr, daddr)
- && (dest->port == dport)
- && ((dest->protocol == protocol) ||
- dest->vfwmark)) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
/* HIT */
- read_unlock(&ipvs->rs_lock);
- return dest;
+ rcu_read_unlock();
+ return true;
}
}
- read_unlock(&ipvs->rs_lock);
+ rcu_read_unlock();
- return NULL;
+ return false;
}
-/*
- * Lookup destination by {addr,port} in the given service
+/* Lookup destination by {addr,port} in the given service
+ * Called under RCU lock.
*/
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
@@ -593,7 +582,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find the destination for the given service
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->af == svc->af)
&& ip_vs_addr_equal(svc->af, &dest->addr, daddr)
&& (dest->port == dport)) {
@@ -607,33 +596,56 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find destination by {daddr,dport,vaddr,protocol}
- * Cretaed to be used in ip_vs_process_message() in
+ * Created to be used in ip_vs_process_message() in
* the backup synchronization daemon. It finds the
* destination to be bound to the received connection
* on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
+ * Called under RCU lock, no refcnt is returned.
*/
struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
const union nf_inet_addr *daddr,
__be16 dport,
const union nf_inet_addr *vaddr,
- __be16 vport, __u16 protocol, __u32 fwmark)
+ __be16 vport, __u16 protocol, __u32 fwmark,
+ __u32 flags)
{
struct ip_vs_dest *dest;
struct ip_vs_service *svc;
+ __be16 port = dport;
- svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
+ svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
- dest = ip_vs_lookup_dest(svc, daddr, dport);
- if (dest)
- atomic_inc(&dest->refcnt);
- ip_vs_service_put(svc);
+ if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+ port = 0;
+ dest = ip_vs_lookup_dest(svc, daddr, port);
+ if (!dest)
+ dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
return dest;
}
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_dst *dest_dst = container_of(head,
+ struct ip_vs_dest_dst,
+ rcu_head);
+
+ dst_release(dest_dst->dst_cache);
+ kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst, 1);
+ if (old) {
+ RCU_INIT_POINTER(dest->dest_dst, NULL);
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+ }
+}
+
/*
* Lookup dest by {svc,addr,port} in the destination trash.
* The destination trash is used to hold the destinations that are removed
@@ -648,13 +660,14 @@ static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
- struct ip_vs_dest *dest, *nxt;
+ struct ip_vs_dest *dest;
struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
- list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
@@ -670,29 +683,29 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
(ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
dest->vport == svc->port))) {
/* HIT */
- return dest;
- }
-
- /*
- * Try to purge the destination from trash if not referenced
- */
- if (atomic_read(&dest->refcnt) == 1) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
- "from trash\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(svc->af, &dest->addr),
- ntohs(dest->port));
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
+ list_del(&dest->t_list);
+ ip_vs_dest_hold(dest);
+ goto out;
}
}
- return NULL;
+ dest = NULL;
+
+out:
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ return dest;
}
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{
+ struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
+
+ __ip_vs_dst_cache_reset(dest);
+ __ip_vs_svc_put(svc, false);
+ free_percpu(dest->stats.cpustats);
+ ip_vs_dest_put_and_free(dest);
+}
/*
* Clean up all the destinations in the trash
@@ -701,19 +714,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* When the ip_vs_control_clearup is activated by ipvs module exit,
* the service tables must have been flushed and all the connections
* are expired, and the refcnt of each destination in the trash must
- * be 1, so we simply release them here.
+ * be 0, so we simply release them here.
*/
static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
struct netns_ipvs *ipvs = net_ipvs(net);
- list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
+ del_timer_sync(&ipvs->dest_trash_timer);
+ /* No need to use dest_trash_lock */
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
}
}
@@ -763,6 +775,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_service *old_svc;
+ struct ip_vs_scheduler *sched;
int conn_flags;
/* set the weight and the flags */
@@ -778,20 +792,19 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_hash(ipvs, dest);
- write_unlock_bh(&ipvs->rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
/* bind the service */
- if (!dest->svc) {
+ old_svc = rcu_dereference_protected(dest->svc, 1);
+ if (!old_svc) {
__ip_vs_bind_svc(dest, svc);
} else {
- if (dest->svc != svc) {
- __ip_vs_unbind_svc(dest);
+ if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
+ __ip_vs_svc_put(old_svc, true);
}
}
@@ -804,27 +817,20 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
dest->l_threshold = udest->l_threshold;
spin_lock_bh(&dest->dst_lock);
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
- if (add)
- ip_vs_start_estimator(svc->net, &dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
+ sched = rcu_dereference_protected(svc->scheduler, 1);
if (add) {
- list_add(&dest->n_list, &svc->destinations);
+ ip_vs_start_estimator(svc->net, &dest->stats);
+ list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
+ if (sched->add_dest)
+ sched->add_dest(svc, dest);
+ } else {
+ if (sched->upd_dest)
+ sched->upd_dest(svc, dest);
}
-
- /* call the update_service, because server weight may be changed */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
}
@@ -836,7 +842,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
struct ip_vs_dest **dest_p)
{
struct ip_vs_dest *dest;
- unsigned atype;
+ unsigned int atype, i;
EnterFunction(2);
@@ -856,14 +862,17 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
}
dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
- if (dest == NULL) {
- pr_err("%s(): no memory.\n", __func__);
+ if (dest == NULL)
return -ENOMEM;
- }
+
dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!dest->stats.cpustats) {
- pr_err("%s() alloc_percpu failed\n", __func__);
+ if (!dest->stats.cpustats)
goto err_alloc;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_dest_stats;
+ ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
+ u64_stats_init(&ip_vs_dest_stats->syncp);
}
dest->af = svc->af;
@@ -879,7 +888,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 1);
- INIT_LIST_HEAD(&dest->d_list);
+ INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
@@ -921,10 +930,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Check if the dest already exists in the list
- */
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
@@ -946,11 +955,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
- /*
- * Get the destination from the trash
- */
- list_del(&dest->n_list);
-
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
@@ -990,10 +994,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Lookup the destination list
- */
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
@@ -1006,11 +1010,11 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return 0;
}
-
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+ bool cleanup)
{
struct netns_ipvs *ipvs = net_ipvs(net);
@@ -1019,38 +1023,20 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&ipvs->rs_lock);
- /*
- * Decrease the refcnt of the dest, and free the dest
- * if nobody refers to it (refcnt=0). Otherwise, throw
- * the destination into the trash.
- */
- if (atomic_dec_and_test(&dest->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port));
- ip_vs_dst_reset(dest);
- /* simply decrease svc->refcnt here, let the caller check
- and release the service if nobody refers to it.
- Only user context can release destination and service,
- and only one user context can update virtual service at a
- time, so the operation here is OK */
- atomic_dec(&dest->svc->refcnt);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
- } else {
- IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
- "dest->refcnt=%d\n",
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port),
- atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ipvs->dest_trash);
- atomic_inc(&dest->refcnt);
- }
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ /* dest lives in trash without reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = 0;
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_dest_put(dest);
}
@@ -1066,14 +1052,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
/*
* Remove it from the d-linked destination list.
*/
- list_del(&dest->n_list);
+ list_del_rcu(&dest->n_list);
svc->num_dests--;
- /*
- * Call the update_service function of its scheduler
- */
- if (svcupd && svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
+ if (svcupd) {
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched->del_dest)
+ sched->del_dest(svc, dest);
+ }
}
@@ -1088,37 +1076,62 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
EnterFunction(2);
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
return -ENOENT;
}
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
/*
* Unlink dest from the service
*/
__ip_vs_unlink_dest(svc, dest, 1);
- write_unlock_bh(&__ip_vs_svc_lock);
-
/*
* Delete the destination
*/
- __ip_vs_del_dest(svc->net, dest);
+ __ip_vs_del_dest(svc->net, dest, false);
LeaveFunction(2);
return 0;
}
+static void ip_vs_dest_trash_expire(unsigned long data)
+{
+ struct net *net = (struct net *) data;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest *dest, *next;
+ unsigned long now = jiffies;
+
+ spin_lock(&ipvs->dest_trash_lock);
+ list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
+ if (atomic_read(&dest->refcnt) > 0)
+ continue;
+ if (dest->idle_start) {
+ if (time_before(now, dest->idle_start +
+ IP_VS_DEST_TRASH_PERIOD))
+ continue;
+ } else {
+ dest->idle_start = max(1UL, now);
+ continue;
+ }
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
+ }
+ if (!list_empty(&ipvs->dest_trash))
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ spin_unlock(&ipvs->dest_trash_lock);
+}
/*
* Add a service into the service hash table
@@ -1127,7 +1140,7 @@ static int
ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
- int ret = 0;
+ int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
@@ -1155,9 +1168,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
}
#ifdef CONFIG_IP_VS_IPV6
- if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
- ret = -EINVAL;
- goto out_err;
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out_err;
+ }
}
#endif
@@ -1169,12 +1186,18 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
}
svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!svc->stats.cpustats) {
- pr_err("%s() alloc_percpu failed\n", __func__);
+ ret = -ENOMEM;
goto out_err;
}
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_stats;
+ ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
+ u64_stats_init(&ip_vs_stats->syncp);
+ }
+
+
/* I'm the first user of the service */
- atomic_set(&svc->usecnt, 0);
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
@@ -1188,7 +1211,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
- rwlock_init(&svc->sched_lock);
+ spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
@@ -1198,7 +1221,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
sched = NULL;
/* Bind the ct retriever */
- ip_vs_bind_pe(svc, pe);
+ RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
@@ -1214,9 +1237,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
ipvs->num_services++;
/* Hash the service into the service table */
- write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_hash(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
*svc_p = svc;
/* Now there is a service - full throttle */
@@ -1226,15 +1247,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
out_err:
if (svc != NULL) {
- ip_vs_unbind_scheduler(svc);
- if (svc->inc) {
- local_bh_disable();
- ip_vs_app_inc_put(svc->inc);
- local_bh_enable();
- }
- if (svc->stats.cpustats)
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
@@ -1278,18 +1292,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
#ifdef CONFIG_IP_VS_IPV6
- if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
- ret = -EINVAL;
- goto out;
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out;
+ }
}
#endif
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched != old_sched) {
+ /* Bind the new scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ old_sched = sched;
+ goto out;
+ }
+ /* Unbind the old scheduler on success */
+ ip_vs_unbind_scheduler(svc, old_sched);
+ }
/*
* Set the flags and timeout value
@@ -1298,57 +1321,22 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
- old_sched = svc->scheduler;
- if (sched != old_sched) {
- /*
- * Unbind the old scheduler
- */
- if ((ret = ip_vs_unbind_scheduler(svc))) {
- old_sched = sched;
- goto out_unlock;
- }
-
- /*
- * Bind the new scheduler
- */
- if ((ret = ip_vs_bind_scheduler(svc, sched))) {
- /*
- * If ip_vs_bind_scheduler fails, restore the old
- * scheduler.
- * The main reason of failure is out of memory.
- *
- * The question is if the old scheduler can be
- * restored all the time. TODO: if it cannot be
- * restored some time, we must delete the service,
- * otherwise the system may crash.
- */
- ip_vs_bind_scheduler(svc, old_sched);
- old_sched = sched;
- goto out_unlock;
- }
- }
-
- old_pe = svc->pe;
- if (pe != old_pe) {
- ip_vs_unbind_pe(svc);
- ip_vs_bind_pe(svc, pe);
- }
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (pe != old_pe)
+ rcu_assign_pointer(svc->pe, pe);
- out_unlock:
- write_unlock_bh(&__ip_vs_svc_lock);
- out:
+out:
ip_vs_scheduler_put(old_sched);
ip_vs_pe_put(old_pe);
return ret;
}
-
/*
* Delete a service from the service list
* - The service must be unlinked, unlocked and not referenced!
* - We are called under _bh lock
*/
-static void __ip_vs_del_service(struct ip_vs_service *svc)
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
@@ -1364,27 +1352,20 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
ip_vs_stop_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
- old_sched = svc->scheduler;
- ip_vs_unbind_scheduler(svc);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ ip_vs_unbind_scheduler(svc, old_sched);
ip_vs_scheduler_put(old_sched);
- /* Unbind persistence engine */
- old_pe = svc->pe;
- ip_vs_unbind_pe(svc);
+ /* Unbind persistence engine, keep svc->pe */
+ old_pe = rcu_dereference_protected(svc->pe, 1);
ip_vs_pe_put(old_pe);
- /* Unbind app inc */
- if (svc->inc) {
- ip_vs_app_inc_put(svc->inc);
- svc->inc = NULL;
- }
-
/*
* Unlink the whole destination list
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(svc->net, dest);
+ __ip_vs_del_dest(svc->net, dest, cleanup);
}
/*
@@ -1398,14 +1379,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Free the service if nobody refers to it
*/
- if (atomic_read(&svc->refcnt) == 0) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
- svc->fwmark,
- IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- free_percpu(svc->stats.cpustats);
- kfree(svc);
- }
+ __ip_vs_svc_put(svc, true);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1414,23 +1388,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Unlink a service from list and try to delete it if its refcnt reached 0
*/
-static void ip_vs_unlink_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
+ /* Hold svc to avoid double release from dest_trash */
+ atomic_inc(&svc->refcnt);
/*
* Unhash it from the service table
*/
- write_lock_bh(&__ip_vs_svc_lock);
-
ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
- __ip_vs_del_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
+ __ip_vs_del_service(svc, cleanup);
}
/*
@@ -1440,7 +1407,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
{
if (svc == NULL)
return -EEXIST;
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, false);
return 0;
}
@@ -1449,19 +1416,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(struct net *net)
+static int ip_vs_flush(struct net *net, bool cleanup)
{
int idx;
- struct ip_vs_service *svc, *nxt;
+ struct ip_vs_service *svc;
+ struct hlist_node *n;
/*
* Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
- s_list) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
+ s_list) {
if (net_eq(svc->net, net))
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1469,10 +1437,10 @@ static int ip_vs_flush(struct net *net)
* Flush the service table hashed by fwmark
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt,
- &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
+ f_list) {
if (net_eq(svc->net, net))
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1483,76 +1451,79 @@ static int ip_vs_flush(struct net *net)
* Delete service by {netns} in the service table.
* Called by __ip_vs_cleanup()
*/
-void __ip_vs_service_cleanup(struct net *net)
+void ip_vs_service_net_cleanup(struct net *net)
{
EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
- ip_vs_flush(net);
+ ip_vs_flush(net, true);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
-/*
- * Release dst hold by dst_cache
- */
+
+/* Put all references for device (dst_cache) */
static inline void
-__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
+ struct ip_vs_dest_dst *dest_dst;
+
spin_lock_bh(&dest->dst_lock);
- if (dest->dst_cache && dest->dst_cache->dev == dev) {
+ dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
+ if (dest_dst && dest_dst->dst_cache->dev == dev) {
IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
dev->name,
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
}
spin_unlock_bh(&dest->dst_lock);
}
-/*
- * Netdev event receiver
- * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
- * a device that is "unregister" it must be released.
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
*/
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
- void *ptr)
+ void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
unsigned int idx;
- if (event != NETDEV_UNREGISTER)
+ if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
}
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
}
}
- list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) {
- __ip_vs_dev_reset(dest, dev);
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ ip_vs_forget_dev(dest, dev);
}
+ spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
return NOTIFY_DONE;
@@ -1565,12 +1536,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
- write_lock_bh(&__ip_vs_svc_lock);
list_for_each_entry(dest, &svc->destinations, n_list) {
ip_vs_zero_stats(&dest->stats);
}
ip_vs_zero_stats(&svc->stats);
- write_unlock_bh(&__ip_vs_svc_lock);
return 0;
}
@@ -1580,14 +1549,14 @@ static int ip_vs_zero_all(struct net *net)
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
@@ -1598,8 +1567,12 @@ static int ip_vs_zero_all(struct net *net)
}
#ifdef CONFIG_SYSCTL
+
+static int zero;
+static int three = 3;
+
static int
-proc_do_defense_mode(ctl_table *table, int write,
+proc_do_defense_mode(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = current->nsproxy->net_ns;
@@ -1620,7 +1593,7 @@ proc_do_defense_mode(ctl_table *table, int write,
}
static int
-proc_do_sync_threshold(ctl_table *table, int write,
+proc_do_sync_threshold(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
@@ -1631,7 +1604,8 @@ proc_do_sync_threshold(ctl_table *table, int write,
memcpy(val, valp, sizeof(val));
rc = proc_dointvec(table, write, buffer, lenp, ppos);
- if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+ if (write && (valp[0] < 0 || valp[1] < 0 ||
+ (valp[0] >= valp[1] && valp[1]))) {
/* Restore the correct value */
memcpy(valp, val, sizeof(val));
}
@@ -1639,7 +1613,7 @@ proc_do_sync_threshold(ctl_table *table, int write,
}
static int
-proc_do_sync_mode(ctl_table *table, int write,
+proc_do_sync_mode(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
@@ -1651,9 +1625,24 @@ proc_do_sync_mode(ctl_table *table, int write,
if ((*valp < 0) || (*valp > 1)) {
/* Restore the correct value */
*valp = val;
- } else {
- struct net *net = current->nsproxy->net_ns;
- ip_vs_sync_switch_mode(net, val);
+ }
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_ports(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if (*valp < 1 || !is_power_of_2(*valp)) {
+ /* Restore the correct value */
+ *valp = val;
}
}
return rc;
@@ -1662,7 +1651,7 @@ proc_do_sync_mode(ctl_table *table, int write,
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
- * align with netns init in __ip_vs_control_init()
+ * align with netns init in ip_vs_control_net_init()
*/
static struct ctl_table vs_vars[] = {
@@ -1717,6 +1706,30 @@ static struct ctl_table vs_vars[] = {
.proc_handler = &proc_do_sync_mode,
},
{
+ .procname = "sync_ports",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_ports,
+ },
+ {
+ .procname = "sync_persist_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_qlen_max",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "sync_sock_size",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "cache_bypass",
.maxlen = sizeof(int),
.mode = 0644,
@@ -1729,6 +1742,18 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "sloppy_tcp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sloppy_sctp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "expire_quiescent_template",
.maxlen = sizeof(int),
.mode = 0644,
@@ -1742,11 +1767,37 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_do_sync_threshold,
},
{
+ .procname = "sync_refresh_period",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "sync_retries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &three,
+ },
+ {
.procname = "nat_icmp_send",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "pmtu_disc",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "backup_only",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -1845,20 +1896,13 @@ static struct ctl_table vs_vars[] = {
{ }
};
-const struct ctl_path net_vs_ctl_path[] = {
- { .procname = "net", },
- { .procname = "ipv4", },
- { .procname = "vs", },
- { }
-};
-EXPORT_SYMBOL_GPL(net_vs_ctl_path);
#endif
#ifdef CONFIG_PROC_FS
struct ip_vs_iter {
struct seq_net_private p; /* Do not move this, netns depends upon it*/
- struct list_head *table;
+ struct hlist_head *table;
int bucket;
};
@@ -1866,7 +1910,7 @@ struct ip_vs_iter {
* Write the contents of the VS rule table to a PROCfs file.
* (It is kept just for backward compatibility)
*/
-static inline const char *ip_vs_fwd_name(unsigned flags)
+static inline const char *ip_vs_fwd_name(unsigned int flags)
{
switch (flags & IP_VS_CONN_F_FWD_MASK) {
case IP_VS_CONN_F_LOCALNODE:
@@ -1891,7 +1935,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
@@ -1902,7 +1946,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
+ f_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
@@ -1915,17 +1960,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
}
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-__acquires(__ip_vs_svc_lock)
+ __acquires(RCU)
{
-
- read_lock_bh(&__ip_vs_svc_lock);
+ rcu_read_lock();
return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *e;
+ struct hlist_node *e;
struct ip_vs_iter *iter;
struct ip_vs_service *svc;
@@ -1938,13 +1982,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (iter->table == ip_vs_svc_table) {
/* next service in table hashed by protocol */
- if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, s_list);
-
+ e = rcu_dereference(hlist_next_rcu(&svc->s_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, s_list);
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
- s_list) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_table[iter->bucket],
+ s_list) {
return svc;
}
}
@@ -1955,13 +2000,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
/* next service in hashed by fwmark */
- if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, f_list);
+ e = rcu_dereference(hlist_next_rcu(&svc->f_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, f_list);
scan_fwmark:
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
- f_list)
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_fwm_table[iter->bucket],
+ f_list)
return svc;
}
@@ -1969,9 +2016,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-__releases(__ip_vs_svc_lock)
+ __releases(RCU)
{
- read_unlock_bh(&__ip_vs_svc_lock);
+ rcu_read_unlock();
}
@@ -1989,6 +2036,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_service *svc = v;
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
@@ -1997,18 +2045,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
- svc->scheduler->name);
+ sched->name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
- svc->scheduler->name,
+ sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
seq_printf(seq, "FWM %08X %s %s",
- svc->fwmark, svc->scheduler->name,
+ svc->fwmark, sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
@@ -2019,7 +2067,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
else
seq_putc(seq, '\n');
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
if (dest->af == AF_INET6)
seq_printf(seq,
@@ -2113,7 +2161,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
- struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
+ struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_stats_user rates;
int i;
@@ -2129,10 +2177,10 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
__u64 inbytes, outbytes;
do {
- start = u64_stats_fetch_begin_bh(&u->syncp);
+ start = u64_stats_fetch_begin_irq(&u->syncp);
inbytes = u->ustats.inbytes;
outbytes = u->ustats.outbytes;
- } while (u64_stats_fetch_retry_bh(&u->syncp, start));
+ } while (u64_stats_fetch_retry_irq(&u->syncp, start));
seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
i, u->ustats.conns, u->ustats.inpkts,
@@ -2283,8 +2331,9 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
struct ip_vs_service *svc;
struct ip_vs_dest_user *udest_compat;
struct ip_vs_dest_user_kern udest;
+ struct netns_ipvs *ipvs = net_ipvs(net);
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
@@ -2303,6 +2352,24 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* increase the module use count */
ip_vs_use_count_inc();
+ /* Handle daemons since they have another lock */
+ if (cmd == IP_VS_SO_SET_STARTDAEMON ||
+ cmd == IP_VS_SO_SET_STOPDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+
+ if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
+ ret = -ERESTARTSYS;
+ goto out_dec;
+ }
+ if (cmd == IP_VS_SO_SET_STARTDAEMON)
+ ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+ dm->syncid);
+ else
+ ret = stop_sync_thread(net, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ goto out_dec;
+ }
+
if (mutex_lock_interruptible(&__ip_vs_mutex)) {
ret = -ERESTARTSYS;
goto out_dec;
@@ -2310,21 +2377,12 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush(net);
+ ret = ip_vs_flush(net, false);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
- } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
- struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
- dm->syncid);
- goto out_unlock;
- } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
- struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- ret = stop_sync_thread(net, dm->state);
- goto out_unlock;
}
usvc_compat = (struct ip_vs_service_user *)arg;
@@ -2354,11 +2412,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
}
/* Lookup the exact service by <protocol, addr, port> or fwmark */
+ rcu_read_lock();
if (usvc.fwmark == 0)
svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ rcu_read_unlock();
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2410,11 +2470,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(src->scheduler, 1);
dst->protocol = src->protocol;
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+ strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2433,7 +2496,7 @@ __ip_vs_get_service_entries(struct net *net,
int ret = 0;
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
@@ -2452,7 +2515,7 @@ __ip_vs_get_service_entries(struct net *net,
}
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
@@ -2469,7 +2532,7 @@ __ip_vs_get_service_entries(struct net *net,
count++;
}
}
- out:
+out:
return ret;
}
@@ -2481,17 +2544,20 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
union nf_inet_addr addr = { .ip = get->addr };
int ret = 0;
+ rcu_read_lock();
if (get->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
+ rcu_read_unlock();
if (svc) {
int count = 0;
struct ip_vs_dest *dest;
struct ip_vs_dest_entry entry;
+ memset(&entry, 0, sizeof(entry));
list_for_each_entry(dest, &svc->destinations, n_list) {
if (count >= get->num_dests)
break;
@@ -2525,6 +2591,8 @@ __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
struct ip_vs_proto_data *pd;
#endif
+ memset(u, 0, sizeof (*u));
+
#ifdef CONFIG_IP_VS_PROTO_TCP
pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
@@ -2566,7 +2634,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct netns_ipvs *ipvs = net_ipvs(net);
BUG_ON(!net);
- if (!capable(CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
@@ -2584,6 +2652,33 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
if (copy_from_user(arg, user, copylen) != 0)
return -EFAULT;
+ /*
+ * Handle daemons first since it has its own locking
+ */
+ if (cmd == IP_VS_SO_GET_DAEMON) {
+ struct ip_vs_daemon_user d[2];
+
+ memset(&d, 0, sizeof(d));
+ if (mutex_lock_interruptible(&ipvs->sync_mutex))
+ return -ERESTARTSYS;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
+ d[0].state = IP_VS_STATE_MASTER;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
+ }
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
+ d[1].state = IP_VS_STATE_BACKUP;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
+ }
+ if (copy_to_user(user, &d, sizeof(d)) != 0)
+ ret = -EFAULT;
+ mutex_unlock(&ipvs->sync_mutex);
+ return ret;
+ }
if (mutex_lock_interruptible(&__ip_vs_mutex))
return -ERESTARTSYS;
@@ -2639,12 +2734,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
+ rcu_read_lock();
if (entry->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET,
entry->protocol, &addr,
entry->port);
+ rcu_read_unlock();
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2681,33 +2778,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
}
break;
- case IP_VS_SO_GET_DAEMON:
- {
- struct ip_vs_daemon_user d[2];
-
- memset(&d, 0, sizeof(d));
- if (ipvs->sync_state & IP_VS_STATE_MASTER) {
- d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
- sizeof(d[0].mcast_ifn));
- d[0].syncid = ipvs->master_syncid;
- }
- if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
- d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
- sizeof(d[1].mcast_ifn));
- d[1].syncid = ipvs->backup_syncid;
- }
- if (copy_to_user(user, &d, sizeof(d)) != 0)
- ret = -EFAULT;
- }
- break;
-
default:
ret = -EINVAL;
}
- out:
+out:
mutex_unlock(&__ip_vs_mutex);
return ret;
}
@@ -2800,17 +2875,17 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
ip_vs_copy_stats(&ustats, stats);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts);
- NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes);
- NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps);
- NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps);
-
+ if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
+ goto nla_put_failure;
nla_nest_end(skb, nl_stats);
return 0;
@@ -2823,6 +2898,8 @@ nla_put_failure:
static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_service *svc)
{
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_pe *pe;
struct nlattr *nl_service;
struct ip_vs_flags flags = { .flags = svc->flags,
.mask = ~0 };
@@ -2831,23 +2908,26 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
if (!nl_service)
return -EMSGSIZE;
- NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
-
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
+ goto nla_put_failure;
if (svc->fwmark) {
- NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
+ if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
+ goto nla_put_failure;
} else {
- NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
- NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
- NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
+ nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
+ nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
+ goto nla_put_failure;
}
- NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
- if (svc->pe)
- NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
- NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
- NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
- NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
-
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ pe = rcu_dereference_protected(svc->pe, 1);
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+ (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
+ nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
+ nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
+ nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
+ goto nla_put_failure;
if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
goto nla_put_failure;
@@ -2866,7 +2946,7 @@ static int ip_vs_genl_dump_service(struct sk_buff *skb,
{
void *hdr;
- hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_SERVICE);
if (!hdr)
@@ -2892,7 +2972,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -2903,7 +2983,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
}
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -2959,15 +3039,17 @@ static int ip_vs_genl_parse_service(struct net *net,
} else {
usvc->protocol = nla_get_u16(nla_protocol);
nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
- usvc->port = nla_get_u16(nla_port);
+ usvc->port = nla_get_be16(nla_port);
usvc->fwmark = 0;
}
+ rcu_read_lock();
if (usvc->fwmark)
svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
+ rcu_read_unlock();
*ret_svc = svc;
/* If a full entry was requested, check for the additional fields */
@@ -2997,7 +3079,7 @@ static int ip_vs_genl_parse_service(struct net *net,
usvc->sched_name = nla_data(nla_sched);
usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
usvc->timeout = nla_get_u32(nla_timeout);
- usvc->netmask = nla_get_u32(nla_netmask);
+ usvc->netmask = nla_get_be32(nla_netmask);
}
return 0;
@@ -3022,21 +3104,22 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
if (!nl_dest)
return -EMSGSIZE;
- NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
- NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
-
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
- atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
- atomic_read(&dest->activeconns));
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
- atomic_read(&dest->inactconns));
- NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
- atomic_read(&dest->persistconns));
-
+ if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+ (atomic_read(&dest->conn_flags) &
+ IP_VS_CONN_F_FWD_MASK)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
+ atomic_read(&dest->weight)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+ atomic_read(&dest->activeconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+ atomic_read(&dest->inactconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+ atomic_read(&dest->persistconns)))
+ goto nla_put_failure;
if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
goto nla_put_failure;
@@ -3054,7 +3137,7 @@ static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
{
void *hdr;
- hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_DEST);
if (!hdr)
@@ -3131,7 +3214,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
memset(udest, 0, sizeof(*udest));
nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
- udest->port = nla_get_u16(nla_port);
+ udest->port = nla_get_be16(nla_port);
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
@@ -3156,8 +3239,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
return 0;
}
-static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
- const char *mcast_ifn, __be32 syncid)
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid)
{
struct nlattr *nl_daemon;
@@ -3165,10 +3248,10 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
if (!nl_daemon)
return -EMSGSIZE;
- NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
- NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
- NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
-
+ if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
+ nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
+ nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
+ goto nla_put_failure;
nla_nest_end(skb, nl_daemon);
return 0;
@@ -3178,12 +3261,12 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
- const char *mcast_ifn, __be32 syncid,
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid,
struct netlink_callback *cb)
{
void *hdr;
- hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&ip_vs_genl_family, NLM_F_MULTI,
IPVS_CMD_NEW_DAEMON);
if (!hdr)
@@ -3205,7 +3288,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
struct net *net = skb_sknet(skb);
struct netns_ipvs *ipvs = net_ipvs(net);
- mutex_lock(&__ip_vs_mutex);
+ mutex_lock(&ipvs->sync_mutex);
if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
ipvs->master_mcast_ifn,
@@ -3225,7 +3308,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
}
nla_put_failure:
- mutex_unlock(&__ip_vs_mutex);
+ mutex_unlock(&ipvs->sync_mutex);
return skb->len;
}
@@ -3271,13 +3354,9 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
return ip_vs_set_timeout(net, &t);
}
-static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
- struct ip_vs_service *svc = NULL;
- struct ip_vs_service_user_kern usvc;
- struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
- int need_full_svc = 0, need_full_dest = 0;
struct net *net;
struct netns_ipvs *ipvs;
@@ -3285,19 +3364,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
- mutex_lock(&__ip_vs_mutex);
-
- if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush(net);
- goto out;
- } else if (cmd == IPVS_CMD_SET_CONFIG) {
- ret = ip_vs_genl_set_config(net, info->attrs);
- goto out;
- } else if (cmd == IPVS_CMD_NEW_DAEMON ||
- cmd == IPVS_CMD_DEL_DAEMON) {
-
+ if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+ mutex_lock(&ipvs->sync_mutex);
if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
info->attrs[IPVS_CMD_ATTR_DAEMON],
@@ -3310,6 +3380,31 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
ret = ip_vs_genl_new_daemon(net, daemon_attrs);
else
ret = ip_vs_genl_del_daemon(net, daemon_attrs);
+out:
+ mutex_unlock(&ipvs->sync_mutex);
+ }
+ return ret;
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ip_vs_service *svc = NULL;
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_dest_user_kern udest;
+ int ret = 0, cmd;
+ int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ if (cmd == IPVS_CMD_FLUSH) {
+ ret = ip_vs_flush(net, false);
+ goto out;
+ } else if (cmd == IPVS_CMD_SET_CONFIG) {
+ ret = ip_vs_genl_set_config(net, info->attrs);
goto out;
} else if (cmd == IPVS_CMD_ZERO &&
!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
@@ -3392,10 +3487,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
void *reply;
int ret, cmd, reply_cmd;
struct net *net;
- struct netns_ipvs *ipvs;
net = skb_sknet(skb);
- ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3447,21 +3540,26 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
__ip_vs_get_timeouts(net, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
- NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
- NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
- t.tcp_fin_timeout);
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
+ t.tcp_timeout) ||
+ nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+ t.tcp_fin_timeout))
+ goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
- NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
+ goto nla_put_failure;
#endif
break;
}
case IPVS_CMD_GET_INFO:
- NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
- NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
- ip_vs_conn_tab_size);
+ if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
+ IP_VS_VERSION_CODE) ||
+ nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+ ip_vs_conn_tab_size))
+ goto nla_put_failure;
break;
}
@@ -3482,7 +3580,7 @@ out:
}
-static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+static const struct genl_ops ip_vs_genl_ops[] = {
{
.cmd = IPVS_CMD_NEW_SERVICE,
.flags = GENL_ADMIN_PERM,
@@ -3536,13 +3634,13 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
.cmd = IPVS_CMD_NEW_DAEMON,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
- .doit = ip_vs_genl_set_cmd,
+ .doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_DEL_DAEMON,
.flags = GENL_ADMIN_PERM,
.policy = ip_vs_cmd_policy,
- .doit = ip_vs_genl_set_cmd,
+ .doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_GET_DAEMON,
@@ -3581,7 +3679,7 @@ static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
static int __init ip_vs_genl_register(void)
{
return genl_register_family_with_ops(&ip_vs_genl_family,
- ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
+ ip_vs_genl_ops);
}
static void ip_vs_genl_unregister(void)
@@ -3595,7 +3693,7 @@ static void ip_vs_genl_unregister(void)
* per netns intit/exit func.
*/
#ifdef CONFIG_SYSCTL
-int __net_init __ip_vs_control_init_sysctl(struct net *net)
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
{
int idx;
struct netns_ipvs *ipvs = net_ipvs(net);
@@ -3610,6 +3708,10 @@ int __net_init __ip_vs_control_init_sysctl(struct net *net)
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
@@ -3628,18 +3730,33 @@ int __net_init __ip_vs_control_init_sysctl(struct net *net)
tbl[idx++].data = &ipvs->sysctl_snat_reroute;
ipvs->sysctl_sync_ver = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ ipvs->sysctl_sync_ports = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ports;
+ tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+ ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+ ipvs->sysctl_sync_sock_size = 0;
+ tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
tbl[idx].data = &ipvs->sysctl_sync_threshold;
tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
+ tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
+ ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
+ tbl[idx++].data = &ipvs->sysctl_sync_retries;
tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+ ipvs->sysctl_pmtu_disc = 1;
+ tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+ tbl[idx++].data = &ipvs->sysctl_backup_only;
- ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
- tbl);
+ ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
if (ipvs->sysctl_hdr == NULL) {
if (!net_eq(net, &init_net))
kfree(tbl);
@@ -3654,19 +3771,20 @@ int __net_init __ip_vs_control_init_sysctl(struct net *net)
return 0;
}
-void __net_init __ip_vs_control_cleanup_sysctl(struct net *net)
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ ip_vs_stop_estimator(net, &ipvs->tot_stats);
}
#else
-int __net_init __ip_vs_control_init_sysctl(struct net *net) { return 0; }
-void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { }
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
#endif
@@ -3674,35 +3792,41 @@ static struct notifier_block ip_vs_dst_notifier = {
.notifier_call = ip_vs_dst_event,
};
-int __net_init __ip_vs_control_init(struct net *net)
+int __net_init ip_vs_control_net_init(struct net *net)
{
- int idx;
+ int i, idx;
struct netns_ipvs *ipvs = net_ipvs(net);
- ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
-
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
- INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+ INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
INIT_LIST_HEAD(&ipvs->dest_trash);
+ spin_lock_init(&ipvs->dest_trash_lock);
+ setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+ (unsigned long) net);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
/* procfs stats */
ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!ipvs->tot_stats.cpustats) {
- pr_err("%s(): alloc_percpu.\n", __func__);
+ if (!ipvs->tot_stats.cpustats)
return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ipvs_tot_stats;
+ ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
+ u64_stats_init(&ipvs_tot_stats->syncp);
}
+
spin_lock_init(&ipvs->tot_stats.lock);
- proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
- proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
- proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
- &ip_vs_stats_percpu_fops);
+ proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
+ proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
+ proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+ &ip_vs_stats_percpu_fops);
- if (__ip_vs_control_init_sysctl(net))
+ if (ip_vs_control_net_init_sysctl(net))
goto err;
return 0;
@@ -3712,34 +3836,22 @@ err:
return -ENOMEM;
}
-void __net_exit __ip_vs_control_cleanup(struct net *net)
+void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_trash_cleanup(net);
- ip_vs_stop_estimator(net, &ipvs->tot_stats);
- __ip_vs_control_cleanup_sysctl(net);
- proc_net_remove(net, "ip_vs_stats_percpu");
- proc_net_remove(net, "ip_vs_stats");
- proc_net_remove(net, "ip_vs");
+ ip_vs_control_net_cleanup_sysctl(net);
+ remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
+ remove_proc_entry("ip_vs_stats", net->proc_net);
+ remove_proc_entry("ip_vs", net->proc_net);
free_percpu(ipvs->tot_stats.cpustats);
}
-int __init ip_vs_control_init(void)
+int __init ip_vs_register_nl_ioctl(void)
{
- int idx;
int ret;
- EnterFunction(2);
-
- /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
- INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
- }
-
- smp_wmb(); /* Do we really need it now ? */
-
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
@@ -3751,27 +3863,47 @@ int __init ip_vs_control_init(void)
pr_err("cannot register Generic Netlink interface.\n");
goto err_genl;
}
-
- ret = register_netdevice_notifier(&ip_vs_dst_notifier);
- if (ret < 0)
- goto err_notf;
-
- LeaveFunction(2);
return 0;
-err_notf:
- ip_vs_genl_unregister();
err_genl:
nf_unregister_sockopt(&ip_vs_sockopts);
err_sock:
return ret;
}
+void ip_vs_unregister_nl_ioctl(void)
+{
+ ip_vs_genl_unregister();
+ nf_unregister_sockopt(&ip_vs_sockopts);
+}
+
+int __init ip_vs_control_init(void)
+{
+ int idx;
+ int ret;
+
+ EnterFunction(2);
+
+ /* Initialize svc_table, ip_vs_svc_fwm_table */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+
+ smp_wmb(); /* Do we really need it now ? */
+
+ ret = register_netdevice_notifier(&ip_vs_dst_notifier);
+ if (ret < 0)
+ return ret;
+
+ LeaveFunction(2);
+ return 0;
+}
+
void ip_vs_control_cleanup(void)
{
EnterFunction(2);
- ip_vs_genl_unregister();
- nf_unregister_sockopt(&ip_vs_sockopts);
+ unregister_netdevice_notifier(&ip_vs_dst_notifier);
LeaveFunction(2);
}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 95fd0d14200..c3b84546ea9 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -51,7 +51,7 @@
* IPVS DH bucket
*/
struct ip_vs_dh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -64,11 +64,15 @@ struct ip_vs_dh_bucket {
#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+struct ip_vs_dh_state {
+ struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
/*
* Returns hash value for IPVS DH entry
*/
-static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
{
__be32 addr_fold = addr->ip;
@@ -85,10 +89,9 @@ static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
{
- return (tbl[ip_vs_dh_hashkey(af, addr)]).dest;
+ return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
}
@@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_dh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
p = p->next;
}
@@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
{
int i;
struct ip_vs_dh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -145,52 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl;
+ struct ip_vs_dh_state *s;
/* allocate the DH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
- GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- }
- svc->sched_data = tbl;
+
+ svc->sched_data = s;
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_dh_reassign(s, svc);
return 0;
}
-static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
+ struct ip_vs_dh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ ip_vs_dh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ struct ip_vs_dh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ ip_vs_dh_reassign(s, svc);
return 0;
}
@@ -210,27 +214,26 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Destination hashing scheduling
*/
static struct ip_vs_dest *
-ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
- struct ip_vs_dh_bucket *tbl;
- struct ip_vs_iphdr iph;
-
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+ struct ip_vs_dh_state *s;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
- dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr);
+ s = (struct ip_vs_dh_state *) svc->sched_data;
+ dest = ip_vs_dh_get(svc->af, s, &iph->daddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port));
@@ -249,7 +252,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
.init_service = ip_vs_dh_init_svc,
.done_service = ip_vs_dh_done_svc,
- .update_service = ip_vs_dh_update_svc,
+ .add_dest = ip_vs_dh_dest_changed,
+ .del_dest = ip_vs_dh_dest_changed,
.schedule = ip_vs_dh_schedule,
};
@@ -263,6 +267,7 @@ static int __init ip_vs_dh_init(void)
static void __exit ip_vs_dh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 508cce98777..1425e9a924c 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -56,15 +56,16 @@
* Make a summary from each cpu
*/
static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
- struct ip_vs_cpu_stats *stats)
+ struct ip_vs_cpu_stats __percpu *stats)
{
int i;
+ bool add = false;
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
unsigned int start;
__u64 inbytes, outbytes;
- if (i) {
+ if (add) {
sum->conns += s->ustats.conns;
sum->inpkts += s->ustats.inpkts;
sum->outpkts += s->ustats.outpkts;
@@ -76,6 +77,7 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
sum->inbytes += inbytes;
sum->outbytes += outbytes;
} else {
+ add = true;
sum->conns = s->ustats.conns;
sum->inpkts = s->ustats.inpkts;
sum->outpkts = s->ustats.outpkts;
@@ -192,7 +194,7 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
dst->outbps = (e->outbps + 0xF) >> 5;
}
-int __net_init __ip_vs_estimator_init(struct net *net)
+int __net_init ip_vs_estimator_net_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
@@ -203,16 +205,7 @@ int __net_init __ip_vs_estimator_init(struct net *net)
return 0;
}
-void __net_exit __ip_vs_estimator_cleanup(struct net *net)
+void __net_exit ip_vs_estimator_net_cleanup(struct net *net)
{
del_timer_sync(&net_ipvs(net)->est_timer);
}
-
-int __init ip_vs_estimator_init(void)
-{
- return 0;
-}
-
-void ip_vs_estimator_cleanup(void)
-{
-}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index af63553fa33..77c173282f3 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -44,16 +44,17 @@
#include <net/ip_vs.h>
-#define SERVER_STRING "227 Entering Passive Mode ("
-#define CLIENT_STRING "PORT "
+#define SERVER_STRING "227 "
+#define CLIENT_STRING "PORT"
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
* First port is set to the default port.
*/
+static unsigned int ports_count = 1;
static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
-module_param_array(ports, ushort, NULL, 0);
+module_param_array(ports, ushort, &ports_count, 0444);
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
@@ -79,14 +80,17 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
/*
* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern" and terminated with the "term" character.
+ * with the "pattern", ignoring before "skip" and terminated with
+ * the "term" character.
* <addr,port> is in network order.
*/
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
- const char *pattern, size_t plen, char term,
+ const char *pattern, size_t plen,
+ char skip, char term,
__be32 *addr, __be16 *port,
char **start, char **end)
{
+ char *s, c;
unsigned char p[6];
int i = 0;
@@ -101,19 +105,38 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (strnicmp(data, pattern, plen) != 0) {
return 0;
}
- *start = data + plen;
+ s = data + plen;
+ if (skip) {
+ int found = 0;
+
+ for (;; s++) {
+ if (s == data_limit)
+ return -1;
+ if (!found) {
+ if (*s == skip)
+ found = 1;
+ } else if (*s != skip) {
+ break;
+ }
+ }
+ }
- for (data = *start; *data != term; data++) {
+ for (data = s; ; data++) {
if (data == data_limit)
return -1;
+ if (*data == term)
+ break;
}
*end = data;
memset(p, 0, sizeof(p));
- for (data = *start; data != *end; data++) {
- if (*data >= '0' && *data <= '9') {
- p[i] = p[i]*10 + *data - '0';
- } else if (*data == ',' && i < 5) {
+ for (data = s; ; data++) {
+ c = *data;
+ if (c == term)
+ break;
+ if (c >= '0' && c <= '9') {
+ p[i] = p[i]*10 + c - '0';
+ } else if (c == ',' && i < 5) {
i++;
} else {
/* unexpected character */
@@ -124,8 +147,9 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (i != 5)
return -1;
- *addr = get_unaligned((__be32 *)p);
- *port = get_unaligned((__be16 *)(p + 4));
+ *start = s;
+ *addr = get_unaligned((__be32 *) p);
+ *port = get_unaligned((__be16 *) (p + 4));
return 1;
}
@@ -153,7 +177,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
__be16 port;
struct ip_vs_conn *n_cp;
char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
- unsigned buf_len;
+ unsigned int buf_len;
int ret = 0;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
@@ -185,7 +209,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (ip_vs_ftp_get_addrport(data, data_limit,
SERVER_STRING,
- sizeof(SERVER_STRING)-1, ')',
+ sizeof(SERVER_STRING)-1,
+ '(', ')',
&from.ip, &port,
&start, &end) != 1)
return 1;
@@ -242,9 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* hopefully it will succeed on the retransmitted
* packet.
*/
+ rcu_read_lock();
ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ iph->ihl * 4,
start-data, end-start,
buf, buf_len);
+ rcu_read_unlock();
if (ret) {
ip_vs_nfct_expect_related(skb, ct, n_cp,
IPPROTO_TCP, 0, 0);
@@ -345,7 +373,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
*/
if (ip_vs_ftp_get_addrport(data_start, data_limit,
CLIENT_STRING, sizeof(CLIENT_STRING)-1,
- '\r', &to.ip, &port,
+ ' ', '\r', &to.ip, &port,
&start, &end) != 1)
return 1;
@@ -414,18 +442,14 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
struct ip_vs_app *app;
struct netns_ipvs *ipvs = net_ipvs(net);
- app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL);
- if (!app)
- return -ENOMEM;
- INIT_LIST_HEAD(&app->a_list);
- INIT_LIST_HEAD(&app->incs_list);
- ipvs->ftp_app = app;
+ if (!ipvs)
+ return -ENOENT;
- ret = register_ip_vs_app(net, app);
- if (ret)
- goto err_exit;
+ app = register_ip_vs_app(net, &ip_vs_ftp);
+ if (IS_ERR(app))
+ return PTR_ERR(app);
- for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
+ for (i = 0; i < ports_count; i++) {
if (!ports[i])
continue;
ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
@@ -437,9 +461,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
return 0;
err_unreg:
- unregister_ip_vs_app(net, app);
-err_exit:
- kfree(ipvs->ftp_app);
+ unregister_ip_vs_app(net, &ip_vs_ftp);
return ret;
}
/*
@@ -447,10 +469,7 @@ err_exit:
*/
static void __ip_vs_ftp_exit(struct net *net)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- unregister_ip_vs_app(net, ipvs->ftp_app);
- kfree(ipvs->ftp_app);
+ unregister_ip_vs_app(net, &ip_vs_ftp);
}
static struct pernet_operations ip_vs_ftp_ops = {
@@ -458,11 +477,12 @@ static struct pernet_operations ip_vs_ftp_ops = {
.exit = __ip_vs_ftp_exit,
};
-int __init ip_vs_ftp_init(void)
+static int __init ip_vs_ftp_init(void)
{
int rv;
rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns on error */
return rv;
}
@@ -472,6 +492,7 @@ int __init ip_vs_ftp_init(void)
static void __exit ip_vs_ftp_exit(void)
{
unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 87e40ea77a9..547ff33c1ef 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -90,11 +90,12 @@
* IP address and its destination server
*/
struct ip_vs_lblc_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest *dest; /* real server (cache) */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -102,12 +103,14 @@ struct ip_vs_lblc_entry {
* IPVS lblc hash table
*/
struct ip_vs_lblc_table {
- struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct timer_list periodic_timer; /* collect stale entries */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
- struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
@@ -115,7 +118,7 @@ struct ip_vs_lblc_table {
* IPVS LBLC sysctl table
*/
#ifdef CONFIG_SYSCTL
-static ctl_table vs_vars_table[] = {
+static struct ctl_table vs_vars_table[] = {
{
.procname = "lblc_expiration",
.data = NULL,
@@ -127,22 +130,26 @@ static ctl_table vs_vars_table[] = {
};
#endif
-static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
- list_del(&en->list);
- /*
- * We don't kfree dest because it is referred either by its service
- * or the trash dest list.
- */
- atomic_dec(&en->dest->refcnt);
+ struct ip_vs_lblc_entry *en = container_of(head,
+ struct ip_vs_lblc_entry,
+ rcu_head);
+
+ ip_vs_dest_put_and_free(en->dest);
kfree(en);
}
+static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
+}
/*
* Returns hash value for IPVS LBLC entry
*/
-static inline unsigned
+static inline unsigned int
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
__be32 addr_fold = addr->ip;
@@ -163,25 +170,22 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
- unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr);
+ unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- * lock
- */
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
const union nf_inet_addr *addr)
{
- unsigned hash = ip_vs_lblc_hashkey(af, addr);
+ unsigned int hash = ip_vs_lblc_hashkey(af, addr);
struct ip_vs_lblc_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -191,7 +195,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
/*
* Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
- * address to a server. Called under write lock.
+ * address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
@@ -200,26 +204,23 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
struct ip_vs_lblc_entry *en;
en = ip_vs_lblc_get(dest->af, tbl, daddr);
- if (!en) {
- en = kmalloc(sizeof(*en), GFP_ATOMIC);
- if (!en) {
- pr_err("%s(): no memory\n", __func__);
- return NULL;
- }
+ if (en) {
+ if (en->dest == dest)
+ return en;
+ ip_vs_lblc_del(en);
+ }
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
- en->af = dest->af;
- ip_vs_addr_copy(dest->af, &en->addr, daddr);
- en->lastuse = jiffies;
+ en->af = dest->af;
+ ip_vs_addr_copy(dest->af, &en->addr, daddr);
+ en->lastuse = jiffies;
- atomic_inc(&dest->refcnt);
- en->dest = dest;
+ ip_vs_dest_hold(dest);
+ en->dest = dest;
- ip_vs_lblc_hash(tbl, en);
- } else if (en->dest != dest) {
- atomic_dec(&en->dest->refcnt);
- atomic_inc(&dest->refcnt);
- en->dest = dest;
- }
+ ip_vs_lblc_hash(tbl, en);
return en;
}
@@ -228,17 +229,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
int i;
- for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
- ip_vs_lblc_free(en);
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
@@ -254,24 +260,25 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc)
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
unsigned long now = jiffies;
int i, j;
- for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now,
en->lastuse +
sysctl_lblc_expiration(svc)))
continue;
- ip_vs_lblc_free(en);
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -295,7 +302,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -313,26 +321,26 @@ static void ip_vs_lblc_check_expire(unsigned long data)
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
- for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
continue;
- ip_vs_lblc_free(en);
+ ip_vs_lblc_del(en);
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
tbl->rover = j;
out:
- mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}
@@ -344,11 +352,10 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblc_table for this service
*/
- tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
return -ENOMEM;
- }
+
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
"current service\n", sizeof(*tbl));
@@ -356,12 +363,13 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
/*
* Initialize the hash buckets
*/
- for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -374,7 +382,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
@@ -382,14 +390,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblc_flush(tbl);
+ ip_vs_lblc_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -411,7 +417,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&dest->weight) > 0) {
@@ -426,13 +432,13 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -460,7 +466,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -475,20 +481,17 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
- struct ip_vs_iphdr iph;
struct ip_vs_dest *dest = NULL;
struct ip_vs_lblc_entry *en;
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
- en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
+ en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
if (en) {
/* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
@@ -502,14 +505,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* free up entries from the trash at any time.
*/
- if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
- dest = en->dest;
+ dest = en->dest;
+ if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+ atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+ goto out;
}
- read_unlock(&svc->sched_lock);
-
- /* If the destination has a weight and is not overloaded, use it */
- if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
- goto out;
/* No cache entry or it is invalid, time to schedule */
dest = __ip_vs_lblc_schedule(svc);
@@ -519,13 +519,14 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblc_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblc_new(tbl, &iph->daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
return dest;
@@ -535,8 +536,7 @@ out:
/*
* IPVS LBLC Scheduler structure
*/
-static struct ip_vs_scheduler ip_vs_lblc_scheduler =
-{
+static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
.name = "lblc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
@@ -554,20 +554,27 @@ static int __net_init __ip_vs_lblc_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ if (!ipvs)
+ return -ENOENT;
+
if (!net_eq(net, &init_net)) {
ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
sizeof(vs_vars_table),
GFP_KERNEL);
if (ipvs->lblc_ctl_table == NULL)
return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblc_ctl_table[0].procname = NULL;
+
} else
ipvs->lblc_ctl_table = vs_vars_table;
ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
ipvs->lblc_ctl_header =
- register_net_sysctl_table(net, net_vs_ctl_path,
- ipvs->lblc_ctl_table);
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
if (!ipvs->lblc_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblc_ctl_table);
@@ -617,6 +624,7 @@ static void __exit ip_vs_lblc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
unregister_pernet_subsys(&ip_vs_lblc_ops);
+ rcu_barrier();
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 90f618ab6dd..3f21a2f47de 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -89,42 +89,49 @@
*/
struct ip_vs_dest_set_elem {
struct list_head list; /* list link */
- struct ip_vs_dest *dest; /* destination server */
+ struct ip_vs_dest *dest; /* destination server */
+ struct rcu_head rcu_head;
};
struct ip_vs_dest_set {
atomic_t size; /* set size */
unsigned long lastmod; /* last modified time */
struct list_head list; /* destination list */
- rwlock_t lock; /* lock for this list */
};
-static struct ip_vs_dest_set_elem *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+ struct ip_vs_dest *dest, bool check)
{
struct ip_vs_dest_set_elem *e;
- list_for_each_entry(e, &set->list, list) {
- if (e->dest == dest)
- /* already existed */
- return NULL;
+ if (check) {
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest)
+ return;
+ }
}
e = kmalloc(sizeof(*e), GFP_ATOMIC);
- if (e == NULL) {
- pr_err("%s(): no memory\n", __func__);
- return NULL;
- }
+ if (e == NULL)
+ return;
- atomic_inc(&dest->refcnt);
+ ip_vs_dest_hold(dest);
e->dest = dest;
- list_add(&e->list, &set->list);
+ list_add_rcu(&e->list, &set->list);
atomic_inc(&set->size);
set->lastmod = jiffies;
- return e;
+}
+
+static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
+ ip_vs_dest_put_and_free(e->dest);
+ kfree(e);
}
static void
@@ -137,9 +144,8 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
/* HIT */
atomic_dec(&set->size);
set->lastmod = jiffies;
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
break;
}
}
@@ -149,17 +155,10 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
struct ip_vs_dest_set_elem *e, *ep;
- write_lock(&set->lock);
list_for_each_entry_safe(e, ep, &set->list, list) {
- /*
- * We don't kfree dest because it is referred either
- * by its service or by the trash dest list.
- */
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
}
- write_unlock(&set->lock);
}
/* get weighted least-connection node in the destination set */
@@ -169,11 +168,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
struct ip_vs_dest *dest, *least;
int loh, doh;
- if (set == NULL)
- return NULL;
-
/* select the first destination server, whose weight > 0 */
- list_for_each_entry(e, &set->list, list) {
+ list_for_each_entry_rcu(e, &set->list, list) {
least = e->dest;
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -188,14 +184,14 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* find the destination with the weighted least load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
+ list_for_each_entry_continue_rcu(e, &set->list, list) {
dest = e->dest;
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if ((loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))
+ if (((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
least = dest;
loh = doh;
@@ -236,12 +232,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* find the destination with the weighted most load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
+ list_for_each_entry_continue(e, &set->list, list) {
dest = e->dest;
doh = ip_vs_dest_conn_overhead(dest);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
- if ((moh * atomic_read(&dest->weight) <
- doh * atomic_read(&most->weight))
+ if (((__s64)moh * atomic_read(&dest->weight) <
+ (__s64)doh * atomic_read(&most->weight))
&& (atomic_read(&dest->weight) > 0)) {
most = dest;
moh = doh;
@@ -264,11 +260,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
* IP address and its destination server set
*/
struct ip_vs_lblcr_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
struct ip_vs_dest_set set; /* destination server set */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -276,12 +273,14 @@ struct ip_vs_lblcr_entry {
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
- struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
@@ -290,7 +289,7 @@ struct ip_vs_lblcr_table {
* IPVS LBLCR sysctl table
*/
-static ctl_table vs_vars_table[] = {
+static struct ctl_table vs_vars_table[] = {
{
.procname = "lblcr_expiration",
.data = NULL,
@@ -304,16 +303,16 @@ static ctl_table vs_vars_table[] = {
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
- list_del(&en->list);
+ hlist_del_rcu(&en->list);
ip_vs_dest_set_eraseall(&en->set);
- kfree(en);
+ kfree_rcu(en, rcu_head);
}
/*
* Returns hash value for IPVS LBLCR entry
*/
-static inline unsigned
+static inline unsigned int
ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
{
__be32 addr_fold = addr->ip;
@@ -334,25 +333,22 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
- unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
+ unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
- * read lock.
- */
+/* Get ip_vs_lblcr_entry associated with supplied parameters. */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
const union nf_inet_addr *addr)
{
- unsigned hash = ip_vs_lblcr_hashkey(af, addr);
+ unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
struct ip_vs_lblcr_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -362,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
/*
* Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server. Called under write lock.
+ * IP address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
@@ -373,10 +369,8 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
en = ip_vs_lblcr_get(dest->af, tbl, daddr);
if (!en) {
en = kmalloc(sizeof(*en), GFP_ATOMIC);
- if (!en) {
- pr_err("%s(): no memory\n", __func__);
+ if (!en)
return NULL;
- }
en->af = dest->af;
ip_vs_addr_copy(dest->af, &en->addr, daddr);
@@ -385,14 +379,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/* initialize its dest set */
atomic_set(&(en->set.size), 0);
INIT_LIST_HEAD(&en->set.list);
- rwlock_init(&en->set.lock);
+
+ ip_vs_dest_set_insert(&en->set, dest, false);
ip_vs_lblcr_hash(tbl, en);
+ return en;
}
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest, true);
return en;
}
@@ -401,17 +395,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
int i;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
- /* No locking required, only called during cleanup. */
- for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
ip_vs_lblcr_free(en);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
@@ -429,13 +427,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
- for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_after(en->lastuse +
sysctl_lblcr_expiration(svc), now))
continue;
@@ -443,7 +442,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -467,7 +466,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -485,11 +485,11 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
- for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
continue;
@@ -497,7 +497,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -515,11 +515,10 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the ip_vs_lblcr_table for this service
*/
- tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
return -ENOMEM;
- }
+
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
"current service\n", sizeof(*tbl));
@@ -527,12 +526,13 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
/*
* Initialize the hash buckets
*/
- for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -545,7 +545,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
@@ -553,14 +553,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblcr_flush(tbl);
+ ip_vs_lblcr_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -582,7 +580,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -598,13 +596,13 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -632,7 +630,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -647,65 +645,56 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
- struct ip_vs_iphdr iph;
- struct ip_vs_dest *dest = NULL;
+ struct ip_vs_dest *dest;
struct ip_vs_lblcr_entry *en;
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
-
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
- en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
+ en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
if (en) {
- /* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
/* Get the least loaded destination */
- read_lock(&en->set.lock);
dest = ip_vs_dest_set_min(&en->set);
- read_unlock(&en->set.lock);
/* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
- time_after(jiffies, en->set.lastmod +
+ time_after(jiffies, en->set.lastmod +
sysctl_lblcr_expiration(svc))) {
- struct ip_vs_dest *m;
+ spin_lock_bh(&svc->sched_lock);
+ if (atomic_read(&en->set.size) > 1) {
+ struct ip_vs_dest *m;
- write_lock(&en->set.lock);
- m = ip_vs_dest_set_max(&en->set);
- if (m)
- ip_vs_dest_set_erase(&en->set, m);
- write_unlock(&en->set.lock);
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ spin_unlock_bh(&svc->sched_lock);
}
/* If the destination is not overloaded, use it */
- if (dest && !is_overloaded(dest, svc)) {
- read_unlock(&svc->sched_lock);
+ if (dest && !is_overloaded(dest, svc))
goto out;
- }
/* The cache entry is invalid, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
- read_unlock(&svc->sched_lock);
return NULL;
}
/* Update our cache entry */
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
- }
- read_unlock(&svc->sched_lock);
-
- if (dest)
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_dest_set_insert(&en->set, dest, true);
+ spin_unlock_bh(&svc->sched_lock);
goto out;
+ }
/* No cache entry, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
@@ -715,13 +704,14 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblcr_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblcr_new(tbl, &iph->daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
return dest;
@@ -750,20 +740,26 @@ static int __net_init __ip_vs_lblcr_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ if (!ipvs)
+ return -ENOENT;
+
if (!net_eq(net, &init_net)) {
ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
sizeof(vs_vars_table),
GFP_KERNEL);
if (ipvs->lblcr_ctl_table == NULL)
return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblcr_ctl_table[0].procname = NULL;
} else
ipvs->lblcr_ctl_table = vs_vars_table;
ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
ipvs->lblcr_ctl_header =
- register_net_sysctl_table(net, net_vs_ctl_path,
- ipvs->lblcr_ctl_table);
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
if (!ipvs->lblcr_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblcr_ctl_table);
@@ -813,6 +809,7 @@ static void __exit ip_vs_lblcr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ rcu_barrier();
}
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index f391819c0cc..2bdcb1cf212 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -26,7 +26,8 @@
* Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
@@ -42,7 +43,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* served, but no new connection is assigned to the server.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
@@ -84,6 +85,7 @@ static int __init ip_vs_lc_init(void)
static void __exit ip_vs_lc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_lc_init);
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index f454c80df0a..5882bbfd198 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -19,8 +19,7 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*
* Authors:
@@ -63,6 +62,7 @@
#include <net/ip_vs.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_zones.h>
@@ -82,7 +82,7 @@ void
ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
{
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
struct nf_conntrack_tuple new_tuple;
if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
@@ -97,6 +97,11 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return;
+ /* Applications may adjust TCP seqs */
+ if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP &&
+ !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct))
+ return;
+
/*
* The connection is not yet in the hashtable, so we update it.
* CIP->VIP will remain the same, so leave the tuple in
@@ -127,7 +132,7 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
nf_conntrack_alter_reply(ct, &new_tuple);
}
-int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+int ip_vs_confirm_conntrack(struct sk_buff *skb)
{
return nf_conntrack_confirm(skb);
}
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index 984d9c137d8..961a6de9bb2 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -40,7 +40,7 @@
#include <net/ip_vs.h>
-static inline unsigned int
+static inline int
ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
{
/*
@@ -55,10 +55,11 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least = NULL;
- unsigned int loh = 0, doh;
+ int loh = 0, doh;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
@@ -75,7 +76,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
!atomic_read(&dest->weight))
@@ -91,8 +92,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
if (!least ||
- (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight))) {
+ ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))) {
least = dest;
loh = doh;
}
@@ -133,6 +134,7 @@ static int __init ip_vs_nq_init(void)
static void __exit ip_vs_nq_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_nq_init);
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 5cf859ccb31..1a82b29ce8e 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -13,20 +13,8 @@
/* IPVS pe list */
static LIST_HEAD(ip_vs_pe);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_pe_lock);
-
-/* Bind a service with a pe */
-void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
-{
- svc->pe = pe;
-}
-
-/* Unbind a service from its pe */
-void ip_vs_unbind_pe(struct ip_vs_service *svc)
-{
- svc->pe = NULL;
-}
+/* semaphore for IPVS PEs. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
/* Get pe in the pe list by name */
struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
@@ -36,9 +24,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
- spin_lock_bh(&ip_vs_pe_lock);
-
- list_for_each_entry(pe, &ip_vs_pe, n_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
/* Test and get the modules atomically */
if (pe->module &&
!try_module_get(pe->module)) {
@@ -47,14 +34,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
}
if (strcmp(pe_name, pe->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_pe_lock);
+ rcu_read_unlock();
return pe;
}
if (pe->module)
module_put(pe->module);
}
+ rcu_read_unlock();
- spin_unlock_bh(&ip_vs_pe_lock);
return NULL;
}
@@ -83,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_pe_lock);
-
- if (!list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- ip_vs_use_count_dec();
- pr_err("%s(): [%s] pe already linked\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Make sure that the pe with this name doesn't exist
* in the pe list.
*/
list_for_each_entry(tmp, &ip_vs_pe, n_list) {
if (strcmp(tmp->name, pe->name) == 0) {
- spin_unlock_bh(&ip_vs_pe_lock);
+ mutex_unlock(&ip_vs_pe_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] pe already existed "
"in the system\n", __func__, pe->name);
@@ -106,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
}
}
/* Add it into the d-linked pe list */
- list_add(&pe->n_list, &ip_vs_pe);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
pr_info("[%s] pe registered.\n", pe->name);
@@ -118,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe);
/* Unregister a pe from the pe list */
int unregister_ip_vs_pe(struct ip_vs_pe *pe)
{
- spin_lock_bh(&ip_vs_pe_lock);
- if (list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- pr_err("%s(): [%s] pe is not in the list. failed\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Remove it from the d-linked pe list */
- list_del(&pe->n_list);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 13d607ae9c5..bed5f704252 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
const char *callid, size_t callid_len,
int *idx)
{
- size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+ size_t max_len = 64;
+ size_t len = min3(max_len, callid_len, buf_len - *idx - 1);
memcpy(buf + *idx, callid, len);
buf[*idx+len] = '\0';
*idx += len + 1;
@@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff,
if (ret > 0)
break;
if (!ret)
- return 0;
+ return -EINVAL;
dataoff += *matchoff;
}
- /* Empty callid is useless */
- if (!*matchlen)
- return -EINVAL;
-
/* Too large is useless */
if (*matchlen > IP_VS_PEDATA_MAXLEN)
return -EINVAL;
@@ -73,18 +70,20 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
const char *dptr;
int retc;
- ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+ ip_vs_fill_iph_skb(p->af, skb, &iph);
/* Only useful with UDP */
if (iph.protocol != IPPROTO_UDP)
return -EINVAL;
-
- /* No Data ? */
+ /* todo: IPv6 fragments:
+ * I think this only should be done for the first fragment. /HS
+ */
dataoff = iph.len + sizeof(struct udphdr);
+
if (dataoff >= skb->len)
return -EINVAL;
-
- if ((retc=skb_linearize(skb)) < 0)
+ retc = skb_linearize(skb);
+ if (retc < 0)
return retc;
dptr = skb->data + dataoff;
datalen = skb->len - dataoff;
@@ -108,7 +107,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
struct ip_vs_conn *ct)
{
- bool ret = 0;
+ bool ret = false;
if (ct->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
@@ -121,7 +120,7 @@ static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
ct->protocol == p->protocol &&
ct->pe_data && ct->pe_data_len == p->pe_data_len &&
!memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
- ret = 1;
+ ret = true;
IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -164,6 +163,7 @@ static int __init ip_vs_sip_init(void)
static void __exit ip_vs_sip_cleanup(void)
{
unregister_ip_vs_pe(&ip_vs_sip_pe);
+ synchronize_rcu();
}
module_init(ip_vs_sip_init);
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index eb86028536f..939f7fbe9b4 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -25,7 +25,6 @@
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
-#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
@@ -49,7 +48,7 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
*/
static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
- unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
pp->next = ip_vs_proto_table[hash];
ip_vs_proto_table[hash] = pp;
@@ -60,9 +59,6 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
return 0;
}
-#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \
- defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \
- defined(CONFIG_IP_VS_PROTO_ESP)
/*
* register an ipvs protocols netns related data
*/
@@ -70,25 +66,30 @@ static int
register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
struct ip_vs_proto_data *pd =
- kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
+ kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
- if (!pd) {
- pr_err("%s(): no memory.\n", __func__);
+ if (!pd)
return -ENOMEM;
- }
+
pd->pp = pp; /* For speed issues */
pd->next = ipvs->proto_data_table[hash];
ipvs->proto_data_table[hash] = pd;
atomic_set(&pd->appcnt, 0); /* Init app counter */
- if (pp->init_netns != NULL)
- pp->init_netns(net, pd);
+ if (pp->init_netns != NULL) {
+ int ret = pp->init_netns(net, pd);
+ if (ret) {
+ /* unlink an free proto data */
+ ipvs->proto_data_table[hash] = pd->next;
+ kfree(pd);
+ return ret;
+ }
+ }
return 0;
}
-#endif
/*
* unregister an ipvs protocol
@@ -96,7 +97,7 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
{
struct ip_vs_protocol **pp_p;
- unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
pp_p = &ip_vs_proto_table[hash];
for (; *pp_p; pp_p = &(*pp_p)->next) {
@@ -119,7 +120,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data **pd_p;
- unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+ unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol);
pd_p = &ipvs->proto_data_table[hash];
for (; *pd_p; pd_p = &(*pd_p)->next) {
@@ -141,7 +142,7 @@ unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
{
struct ip_vs_protocol *pp;
- unsigned hash = IP_VS_PROTO_HASH(proto);
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
if (pp->protocol == proto)
@@ -155,11 +156,11 @@ EXPORT_SYMBOL(ip_vs_proto_get);
/*
* get ip_vs_protocol object data by netns and proto
*/
-struct ip_vs_proto_data *
+static struct ip_vs_proto_data *
__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
{
struct ip_vs_proto_data *pd;
- unsigned hash = IP_VS_PROTO_HASH(proto);
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
if (pd->pp->protocol == proto)
@@ -198,7 +199,7 @@ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
int *
ip_vs_create_timeout_table(int *table, int size)
{
- return kmemdup(table, size, GFP_ATOMIC);
+ return kmemdup(table, size, GFP_KERNEL);
}
@@ -279,17 +280,17 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
if (ih == NULL)
sprintf(buf, "TRUNCATED");
else if (ih->nexthdr == IPPROTO_FRAGMENT)
- sprintf(buf, "%pI6->%pI6 frag", &ih->saddr, &ih->daddr);
+ sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr);
else {
__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
sizeof(_ports), _ports);
if (pptr == NULL)
- sprintf(buf, "TRUNCATED %pI6->%pI6",
+ sprintf(buf, "TRUNCATED %pI6c->%pI6c",
&ih->saddr, &ih->daddr);
else
- sprintf(buf, "%pI6:%u->%pI6:%u",
+ sprintf(buf, "%pI6c:%u->%pI6c:%u",
&ih->saddr, ntohs(pptr[0]),
&ih->daddr, ntohs(pptr[1]));
}
@@ -316,27 +317,40 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
/*
* per network name-space init
*/
-int __net_init __ip_vs_protocol_init(struct net *net)
+int __net_init ip_vs_protocol_net_init(struct net *net)
{
+ int i, ret;
+ static struct ip_vs_protocol *protos[] = {
#ifdef CONFIG_IP_VS_PROTO_TCP
- register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+ &ip_vs_protocol_tcp,
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
- register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
+ &ip_vs_protocol_udp,
#endif
#ifdef CONFIG_IP_VS_PROTO_SCTP
- register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+ &ip_vs_protocol_sctp,
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
- register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
+ &ip_vs_protocol_ah,
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
- register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
+ &ip_vs_protocol_esp,
#endif
+ };
+
+ for (i = 0; i < ARRAY_SIZE(protos); i++) {
+ ret = register_ip_vs_proto_netns(net, protos[i]);
+ if (ret < 0)
+ goto cleanup;
+ }
return 0;
+
+cleanup:
+ ip_vs_protocol_net_cleanup(net);
+ return ret;
}
-void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+void __net_exit ip_vs_protocol_net_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd;
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 5b8eb8b12c3..5de3dd312c0 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -57,7 +57,7 @@ ah_esp_conn_fill_param_proto(struct net *net, int af,
static struct ip_vs_conn *
ah_esp_conn_in_get(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph, unsigned int proto_off,
+ const struct ip_vs_iphdr *iph,
int inverse)
{
struct ip_vs_conn *cp;
@@ -85,9 +85,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb,
static struct ip_vs_conn *
ah_esp_conn_out_get(int af, const struct sk_buff *skb,
- const struct ip_vs_iphdr *iph,
- unsigned int proto_off,
- int inverse)
+ const struct ip_vs_iphdr *iph, int inverse)
{
struct ip_vs_conn *cp;
struct ip_vs_conn_param p;
@@ -110,7 +108,8 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
static int
ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
- int *verdict, struct ip_vs_conn **cpp)
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
/*
* AH/ESP is only related traffic. Pass the packet to IP stack.
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index d12ed53ec95..2f7ea756404 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -10,36 +10,42 @@
static int
sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
- int *verdict, struct ip_vs_conn **cpp)
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
struct net *net;
struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs;
sctp_chunkhdr_t _schunkh, *sch;
sctp_sctphdr_t *sh, _sctph;
- struct ip_vs_iphdr iph;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
- sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph);
- if (sh == NULL)
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (sh == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
- sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t),
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
sizeof(_schunkh), &_schunkh);
- if (sch == NULL)
+ if (sch == NULL) {
+ *verdict = NF_DROP;
return 0;
+ }
+
net = skb_net(skb);
- if ((sch->type == SCTP_CID_INIT) &&
- (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
- &iph.daddr, sh->dest))) {
+ ipvs = net_ipvs(net);
+ rcu_read_lock();
+ if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, sh->dest))) {
int ignored;
- if (ip_vs_todrop(net_ipvs(net))) {
+ if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -47,106 +53,119 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
if (!*cpp && ignored <= 0) {
if (!ignored)
- *verdict = ip_vs_leave(svc, skb, pd);
- else {
- ip_vs_service_put(svc);
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
+static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
+ unsigned int sctphoff)
+{
+ sctph->checksum = sctp_compute_cksum(skb, sctphoff);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
static int
-sctp_snat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
sctp_sctphdr_t *sctph;
- unsigned int sctphoff;
- struct sk_buff *iter;
- __be32 crc32;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- sctphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- sctphoff = ip_hdrlen(skb);
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, skb))
+ ret = ip_vs_app_pkt_out(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->source = cp->vport;
- /* Calculate the checksum */
- crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- skb_walk_frags(skb, iter)
- crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
- crc32);
- crc32 = sctp_end_cksum(crc32);
- sctph->checksum = crc32;
+ /* Only update csum if we really have to */
+ if (sctph->source != cp->vport || payload_csum ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ sctph->source = cp->vport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
static int
-sctp_dnat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
sctp_sctphdr_t *sctph;
- unsigned int sctphoff;
- struct sk_buff *iter;
- __be32 crc32;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- sctphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- sctphoff = ip_hdrlen(skb);
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_in(cp, skb))
+ ret = ip_vs_app_pkt_in(cp, skb);
+ if (ret == 0)
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
}
sctph = (void *) skb_network_header(skb) + sctphoff;
- sctph->dest = cp->dport;
- /* Calculate the checksum */
- crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- skb_walk_frags(skb, iter)
- crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
- crc32);
- crc32 = sctp_end_cksum(crc32);
- sctph->checksum = crc32;
+ /* Only update csum if we really have to */
+ if (sctph->dest != cp->dport || payload_csum ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ sctph->dest = cp->dport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
return 1;
}
@@ -156,10 +175,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
unsigned int sctphoff;
struct sctphdr *sh, _sctph;
- struct sk_buff *iter;
- __le32 cmp;
- __le32 val;
- __u32 tmp;
+ __le32 cmp, val;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
@@ -173,13 +189,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 0;
cmp = sh->checksum;
-
- tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
- skb_walk_frags(skb, iter)
- tmp = sctp_update_cksum((__u8 *) iter->data,
- skb_headlen(iter), tmp);
-
- val = sctp_end_cksum(tmp);
+ val = sctp_compute_cksum(skb, sctphoff);
if (val != cmp) {
/* CRC failure, dump it. */
@@ -190,710 +200,159 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
return 1;
}
-struct ipvs_sctp_nextstate {
- int next_state;
-};
enum ipvs_sctp_event_t {
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_SER,
- IP_VS_SCTP_EVE_INIT_CLI,
- IP_VS_SCTP_EVE_INIT_SER,
- IP_VS_SCTP_EVE_INIT_ACK_CLI,
- IP_VS_SCTP_EVE_INIT_ACK_SER,
- IP_VS_SCTP_EVE_COOKIE_ECHO_CLI,
- IP_VS_SCTP_EVE_COOKIE_ECHO_SER,
- IP_VS_SCTP_EVE_COOKIE_ACK_CLI,
- IP_VS_SCTP_EVE_COOKIE_ACK_SER,
- IP_VS_SCTP_EVE_ABORT_CLI,
- IP_VS_SCTP_EVE__ABORT_SER,
- IP_VS_SCTP_EVE_SHUT_CLI,
- IP_VS_SCTP_EVE_SHUT_SER,
- IP_VS_SCTP_EVE_SHUT_ACK_CLI,
- IP_VS_SCTP_EVE_SHUT_ACK_SER,
- IP_VS_SCTP_EVE_SHUT_COM_CLI,
- IP_VS_SCTP_EVE_SHUT_COM_SER,
- IP_VS_SCTP_EVE_LAST
+ IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */
+ IP_VS_SCTP_INIT,
+ IP_VS_SCTP_INIT_ACK,
+ IP_VS_SCTP_COOKIE_ECHO,
+ IP_VS_SCTP_COOKIE_ACK,
+ IP_VS_SCTP_SHUTDOWN,
+ IP_VS_SCTP_SHUTDOWN_ACK,
+ IP_VS_SCTP_SHUTDOWN_COMPLETE,
+ IP_VS_SCTP_ERROR,
+ IP_VS_SCTP_ABORT,
+ IP_VS_SCTP_EVENT_LAST
};
-static enum ipvs_sctp_event_t sctp_events[255] = {
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_INIT_CLI,
- IP_VS_SCTP_EVE_INIT_ACK_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_ABORT_CLI,
- IP_VS_SCTP_EVE_SHUT_CLI,
- IP_VS_SCTP_EVE_SHUT_ACK_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_COOKIE_ECHO_CLI,
- IP_VS_SCTP_EVE_COOKIE_ACK_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_DATA_CLI,
- IP_VS_SCTP_EVE_SHUT_COM_CLI,
+/* RFC 2960, 3.2 Chunk Field Descriptions */
+static __u8 sctp_events[] = {
+ [SCTP_CID_DATA] = IP_VS_SCTP_DATA,
+ [SCTP_CID_INIT] = IP_VS_SCTP_INIT,
+ [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK,
+ [SCTP_CID_SACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT,
+ [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN,
+ [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK,
+ [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR,
+ [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO,
+ [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK,
+ [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA,
+ [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE,
};
-static struct ipvs_sctp_nextstate
- sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = {
- /*
- * STATE : IP_VS_SCTP_S_NONE
- */
- /*next state *//*event */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ },
- },
- /*
- * STATE : IP_VS_SCTP_S_INIT_CLI
- * Cient sent INIT and is waiting for reply from server(In ECHO_WAIT)
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ECHO_CLI */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_ECHO_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_INIT_SER
- * Server sent INIT and waiting for INIT ACK from the client
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_INIT_ACK_CLI
- * Client sent INIT ACK and waiting for ECHO from the server
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK has been resent by the client, let us stay is in
- * the same state
- */
- {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK sent by the server, close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * ECHO by client, it should not happen, close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO by server, this is what we are expecting, move to ECHO_SER
- */
- {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, it should not happen, close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * Unexpected COOKIE ACK from server, staty in the same state
- */
- {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_INIT_ACK_SER
- * Server sent INIT ACK and waiting for ECHO from the client
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * Unexpected INIT_ACK by the client, let us close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK resent by the server, let us move to same state
- */
- {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client send the ECHO, this is what we are expecting,
- * move to ECHO_CLI
- */
- {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO received from the server, Not sure what to do,
- * let us close it
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, let us stay in the same state
- */
- {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * COOKIE ACK from server, hmm... this should not happen, lets close
- * the connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_ECHO_CLI
- * Cient sent ECHO and waiting COOKEI ACK from the Server
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK has been by the client, let us close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client resent the ECHO, let us stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO received from the server, Not sure what to do,
- * let us close it
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, this shoud not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * COOKIE ACK from server, this is what we are awaiting,lets move to
- * ESTABLISHED.
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_ECHO_SER
- * Server sent ECHO and waiting COOKEI ACK from the client
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- /*
- * INIT_ACK has been by the server, let us close the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent the ECHO, not sure what to do, let's close the
- * connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- /*
- * ECHO resent by the server, stay in the same state
- */
- {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, this is what we are expecting, let's move
- * to ESTABLISHED.
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- /*
- * COOKIE ACK from server, this should not happen, lets close the
- * connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_ESTABLISHED
- * Association established
- */
- {{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this should not happen, let's close
- * the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_SHUT_CLI
- * SHUTDOWN sent from the client, waitinf for SHUT ACK from the server
- */
- /*
- * We received the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
-
- {{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN resent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this should not happen, let's close
- * the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server sent SHUTDOWN ACK, this is what we are expecting, let's move
- * to SHUDOWN_ACK_SER
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_SHUT_SER
- * SHUTDOWN sent from the server, waitinf for SHUTDOWN ACK from client
- */
- /*
- * We received the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
-
- {{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN resent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN resent from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this is what we are expecting, let's
- * move to SHUT_ACK_CLI
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server sent SHUTDOWN ACK, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
-
- /*
- * State : IP_VS_SCTP_S_SHUT_ACK_CLI
- * SHUTDOWN ACK from the client, awaiting for SHUTDOWN COM from server
- */
- /*
- * We received the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
-
- {{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN sent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN sent from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client resent SHUDTDOWN_ACK, let's stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server sent SHUTDOWN ACK, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this should not happen, let's close the
- * connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- /*
- * SHUTDOWN COMPLETE from server this is what we are expecting.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
-
- /*
- * State : IP_VS_SCTP_S_SHUT_ACK_SER
- * SHUTDOWN ACK from the server, awaiting for SHUTDOWN COM from client
- */
- /*
- * We received the data chuck, keep the state unchanged. I assume
- * that still data chuncks can be received by both the peers in
- * SHUDOWN state
- */
+/* SCTP States:
+ * See RFC 2960, 4. SCTP Association State Diagram
+ *
+ * New states (not in diagram):
+ * - INIT1 state: use shorter timeout for dropped INIT packets
+ * - REJECTED state: use shorter timeout if INIT is rejected with ABORT
+ * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging
+ *
+ * The states are as seen in real server. In the diagram, INIT1, INIT,
+ * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state.
+ *
+ * States as per packets from client (C) and server (S):
+ *
+ * Setup of client connection:
+ * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK
+ *
+ * Setup of server connection:
+ * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK
+ */
- {{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ },
- /*
- * We have got an INIT from client. From the spec.“Upon receipt of
- * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
- * an INIT ACK using the same parameters it sent in its original
- * INIT chunk (including its Initiate Tag, unchanged”).
- */
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- /*
- * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
- * “If an INIT ACK is received by an endpoint in any state other
- * than the COOKIE-WAIT state, the endpoint should discard the
- * INIT ACK chunk”. Stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- /*
- * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
- * peer and peer shall move to the ESTABISHED. if it doesn't handle
- * it will send ERROR chunk. So, stay in the same state
- */
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- /*
- * COOKIE ACK from client, not sure what to do stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- /*
- * SHUTDOWN sent from the client, move to SHUDDOWN_CLI
- */
- {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- /*
- * SHUTDOWN sent from the server, move to SHUTDOWN_SER
- */
- {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
- /*
- * client sent SHUDTDOWN_ACK, this should not happen let's close
- * the connection.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- /*
- * Server resent SHUTDOWN ACK, stay in the same state
- */
- {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- /*
- * SHUTDOWN COM from client, this what we are expecting, let's close
- * the connection
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- /*
- * SHUTDOWN COMPLETE from server this should not happen.
- */
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- },
- /*
- * State : IP_VS_SCTP_S_CLOSED
- */
- {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
- {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
- {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
- {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
- }
+#define sNO IP_VS_SCTP_S_NONE
+#define sI1 IP_VS_SCTP_S_INIT1
+#define sIN IP_VS_SCTP_S_INIT
+#define sCS IP_VS_SCTP_S_COOKIE_SENT
+#define sCR IP_VS_SCTP_S_COOKIE_REPLIED
+#define sCW IP_VS_SCTP_S_COOKIE_WAIT
+#define sCO IP_VS_SCTP_S_COOKIE
+#define sCE IP_VS_SCTP_S_COOKIE_ECHOED
+#define sES IP_VS_SCTP_S_ESTABLISHED
+#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT
+#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED
+#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT
+#define sRJ IP_VS_SCTP_S_REJECTED
+#define sCL IP_VS_SCTP_S_CLOSED
+
+static const __u8 sctp_states
+ [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = {
+ { /* INPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* OUTPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW},
+/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL},
+/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* INPUT-ONLY */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
};
-/*
- * Timeout table[state]
- */
+#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ)
+
+/* Timeout table[state] */
static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
- [IP_VS_SCTP_S_NONE] = 2 * HZ,
- [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_INIT_ACK_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_INIT_ACK_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_ECHO_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_ECHO_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_ACK_CLI] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_SHUT_ACK_SER] = 1 * 60 * HZ,
- [IP_VS_SCTP_S_CLOSED] = 10 * HZ,
- [IP_VS_SCTP_S_LAST] = 2 * HZ,
+ [IP_VS_SCTP_S_NONE] = 2 * HZ,
+ [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ,
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_LAST] = 2 * HZ,
};
static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = {
- [IP_VS_SCTP_S_NONE] = "NONE",
- [IP_VS_SCTP_S_INIT_CLI] = "INIT_CLI",
- [IP_VS_SCTP_S_INIT_SER] = "INIT_SER",
- [IP_VS_SCTP_S_INIT_ACK_CLI] = "INIT_ACK_CLI",
- [IP_VS_SCTP_S_INIT_ACK_SER] = "INIT_ACK_SER",
- [IP_VS_SCTP_S_ECHO_CLI] = "COOKIE_ECHO_CLI",
- [IP_VS_SCTP_S_ECHO_SER] = "COOKIE_ECHO_SER",
- [IP_VS_SCTP_S_ESTABLISHED] = "ESTABISHED",
- [IP_VS_SCTP_S_SHUT_CLI] = "SHUTDOWN_CLI",
- [IP_VS_SCTP_S_SHUT_SER] = "SHUTDOWN_SER",
- [IP_VS_SCTP_S_SHUT_ACK_CLI] = "SHUTDOWN_ACK_CLI",
- [IP_VS_SCTP_S_SHUT_ACK_SER] = "SHUTDOWN_ACK_SER",
- [IP_VS_SCTP_S_CLOSED] = "CLOSED",
- [IP_VS_SCTP_S_LAST] = "BUG!"
+ [IP_VS_SCTP_S_NONE] = "NONE",
+ [IP_VS_SCTP_S_INIT1] = "INIT1",
+ [IP_VS_SCTP_S_INIT] = "INIT",
+ [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT",
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED",
+ [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT",
+ [IP_VS_SCTP_S_COOKIE] = "COOKIE",
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED",
+ [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT",
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED",
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT",
+ [IP_VS_SCTP_S_REJECTED] = "REJECTED",
+ [IP_VS_SCTP_S_CLOSED] = "CLOSED",
+ [IP_VS_SCTP_S_LAST] = "BUG!",
};
@@ -906,14 +365,14 @@ static const char *sctp_state_name(int state)
return "?";
}
-static inline int
+static inline void
set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
int direction, const struct sk_buff *skb)
{
sctp_chunkhdr_t _sctpch, *sch;
unsigned char chunk_type;
int event, next_state;
- int ihl;
+ int ihl, cofs;
#ifdef CONFIG_IP_VS_IPV6
ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
@@ -921,10 +380,10 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
ihl = ip_hdrlen(skb);
#endif
- sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t),
- sizeof(_sctpch), &_sctpch);
+ cofs = ihl + sizeof(sctp_sctphdr_t);
+ sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
if (sch == NULL)
- return 0;
+ return;
chunk_type = sch->type;
/*
@@ -940,25 +399,30 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
*/
if ((sch->type == SCTP_CID_COOKIE_ECHO) ||
(sch->type == SCTP_CID_COOKIE_ACK)) {
- sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) +
- sch->length), sizeof(_sctpch), &_sctpch);
- if (sch) {
- if (sch->type == SCTP_CID_ABORT)
+ int clen = ntohs(sch->length);
+
+ if (clen >= sizeof(sctp_chunkhdr_t)) {
+ sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4),
+ sizeof(_sctpch), &_sctpch);
+ if (sch && sch->type == SCTP_CID_ABORT)
chunk_type = sch->type;
}
}
- event = sctp_events[chunk_type];
+ event = (chunk_type < sizeof(sctp_events)) ?
+ sctp_events[chunk_type] : IP_VS_SCTP_DATA;
- /*
- * If the direction is IP_VS_DIR_OUTPUT, this event is from server
- */
- if (direction == IP_VS_DIR_OUTPUT)
- event++;
- /*
- * get next state
+ /* Update direction to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
*/
- next_state = sctp_states_table[cp->state][event].next_state;
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (direction == IP_VS_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ direction = IP_VS_DIR_INPUT_ONLY;
+ }
+
+ next_state = sctp_states[direction][event][cp->state];
if (next_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
@@ -993,21 +457,15 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
cp->timeout = pd->timeout_table[cp->state = next_state];
else /* What to do ? */
cp->timeout = sctp_timeouts[cp->state = next_state];
-
- return 1;
}
-static int
+static void
sctp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb, struct ip_vs_proto_data *pd)
{
- int ret = 0;
-
- spin_lock(&cp->lock);
- ret = set_sctp_state(pd, cp, direction, skb);
- spin_unlock(&cp->lock);
-
- return ret;
+ spin_lock_bh(&cp->lock);
+ set_sctp_state(pd, cp, direction, skb);
+ spin_unlock_bh(&cp->lock);
}
static inline __u16 sctp_app_hashkey(__be16 port)
@@ -1027,30 +485,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
hash = sctp_app_hashkey(port);
- spin_lock_bh(&ipvs->sctp_app_lock);
list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->sctp_app_lock);
return ret;
}
static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
- spin_lock_bh(&ipvs->sctp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->sctp_app_lock);
+ list_del_rcu(&inc->p_list);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
@@ -1066,12 +519,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&ipvs->sctp_app_lock);
- list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1087,7 +540,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
out:
return result;
}
@@ -1096,14 +549,16 @@ out:
* timeouts is netns related now.
* ---------------------------------------------
*/
-static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
{
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->sctp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
sizeof(sctp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
}
static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index c0cc341b840..e3a697234a9 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -33,33 +33,34 @@
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
- int *verdict, struct ip_vs_conn **cpp)
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
struct net *net;
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
- struct ip_vs_iphdr iph;
+ struct netns_ipvs *ipvs;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
- th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
if (th == NULL) {
*verdict = NF_DROP;
return 0;
}
net = skb_net(skb);
+ ipvs = net_ipvs(net);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
- if (th->syn &&
- (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
- &iph.daddr, th->dest))) {
+ rcu_read_lock();
+ if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, th->dest))) {
int ignored;
- if (ip_vs_todrop(net_ipvs(net))) {
+ if (ip_vs_todrop(ipvs)) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -68,18 +69,17 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
if (!*cpp && ignored <= 0) {
if (!ignored)
- *verdict = ip_vs_leave(svc, skb, pd);
- else {
- ip_vs_service_put(svc);
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -128,20 +128,18 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
static int
-tcp_snat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct tcphdr *tcph;
- unsigned int tcphoff;
+ unsigned int tcphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- tcphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- tcphoff = ip_hdrlen(skb);
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
@@ -208,20 +206,18 @@ tcp_snat_handler(struct sk_buff *skb,
static int
-tcp_dnat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct tcphdr *tcph;
- unsigned int tcphoff;
+ unsigned int tcphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- tcphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- tcphoff = ip_hdrlen(skb);
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
@@ -407,7 +403,7 @@ static struct tcp_states_t tcp_states [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
/* OUTPUT */
@@ -421,7 +417,7 @@ static struct tcp_states_t tcp_states [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
@@ -430,7 +426,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
-/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
/* OUTPUT */
@@ -444,7 +440,7 @@ static struct tcp_states_t tcp_states_dos [] = {
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
-/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
@@ -546,7 +542,7 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
/*
* Handle state transitions
*/
-static int
+static void
tcp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_proto_data *pd)
@@ -561,13 +557,11 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
if (th == NULL)
- return 0;
+ return;
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
set_tcp_state(pd, cp, direction, th);
- spin_unlock(&cp->lock);
-
- return 1;
+ spin_unlock_bh(&cp->lock);
}
static inline __u16 tcp_app_hashkey(__be16 port)
@@ -588,18 +582,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
hash = tcp_app_hashkey(port);
- spin_lock_bh(&ipvs->tcp_app_lock);
list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->tcp_app_lock);
return ret;
}
@@ -607,13 +599,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
static void
tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
- spin_lock_bh(&ipvs->tcp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->tcp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -632,12 +621,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&ipvs->tcp_app_lock);
- list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -654,7 +643,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -668,26 +657,28 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
{
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
: tcp_timeouts[IP_VS_TCP_S_LISTEN]);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
/* ---------------------------------------------
* timeouts is netns related now.
* ---------------------------------------------
*/
-static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
{
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->tcp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
sizeof(tcp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
pd->tcp_state_table = tcp_states;
+ return 0;
}
static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index f1282cbe6fe..b62a3c0ff9b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -30,23 +30,23 @@
static int
udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
- int *verdict, struct ip_vs_conn **cpp)
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
{
struct net *net;
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
- struct ip_vs_iphdr iph;
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
- uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
+ /* IPv6 fragments, only first fragment will hit this */
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
if (uh == NULL) {
*verdict = NF_DROP;
return 0;
}
net = skb_net(skb);
- svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
- &iph.daddr, uh->dest);
+ rcu_read_lock();
+ svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, uh->dest);
if (svc) {
int ignored;
@@ -55,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -64,18 +64,17 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
if (!*cpp && ignored <= 0) {
if (!ignored)
- *verdict = ip_vs_leave(svc, skb, pd);
- else {
- ip_vs_service_put(svc);
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -125,20 +124,18 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
static int
-udp_snat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct udphdr *udph;
- unsigned int udphoff;
+ unsigned int udphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- udphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- udphoff = ip_hdrlen(skb);
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
@@ -210,20 +207,18 @@ udp_snat_handler(struct sk_buff *skb,
static int
-udp_dnat_handler(struct sk_buff *skb,
- struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct udphdr *udph;
- unsigned int udphoff;
+ unsigned int udphoff = iph->len;
int oldlen;
int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
- if (cp->af == AF_INET6)
- udphoff = sizeof(struct ipv6hdr);
- else
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
#endif
- udphoff = ip_hdrlen(skb);
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
@@ -364,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc)
hash = udp_app_hashkey(port);
-
- spin_lock_bh(&ipvs->udp_app_lock);
list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->udp_app_lock);
return ret;
}
@@ -385,12 +377,9 @@ static void
udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
- struct netns_ipvs *ipvs = net_ipvs(net);
- spin_lock_bh(&ipvs->udp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->udp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -408,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&ipvs->udp_app_lock);
- list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -430,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -454,28 +443,29 @@ static const char * udp_state_name(int state)
return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
}
-static int
+static void
udp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_proto_data *pd)
{
if (unlikely(!pd)) {
pr_err("UDP no ns data\n");
- return 0;
+ return;
}
cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
- return 1;
}
-static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
{
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->udp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
sizeof(udp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
}
static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index c49b388d108..176b87c35e3 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
{
- svc->sched_data = &svc->destinations;
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ /* dest is already unlinked, so p->prev is not valid but
+ * p->next is valid, use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -46,38 +55,44 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
* Round-Robin Scheduling
*/
static struct ip_vs_dest *
-ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
- struct list_head *p, *q;
- struct ip_vs_dest *dest;
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last;
+ int pass = 0;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- write_lock(&svc->sched_lock);
- p = (struct list_head *)svc->sched_data;
- p = p->next;
- q = p;
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
do {
- /* skip list head */
- if (q == &svc->destinations) {
- q = q->next;
- continue;
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ if (dest == last)
+ goto stop;
}
-
- dest = list_entry(q, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0)
- /* HIT */
- goto out;
- q = q->next;
- } while (q != p);
- write_unlock(&svc->sched_lock);
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ spin_unlock_bh(&svc->sched_lock);
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
out:
- svc->sched_data = q;
- write_unlock(&svc->sched_lock);
+ svc->sched_data = &dest->n_list;
+ spin_unlock_bh(&svc->sched_lock);
IP_VS_DBG_BUF(6, "RR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
@@ -94,7 +109,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
.init_service = ip_vs_rr_init_svc,
- .update_service = ip_vs_rr_update_svc,
+ .add_dest = NULL,
+ .del_dest = ip_vs_rr_del_dest,
.schedule = ip_vs_rr_schedule,
};
@@ -106,6 +122,7 @@ static int __init ip_vs_rr_init(void)
static void __exit ip_vs_rr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_rr_init);
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 08dbdd5bc18..4dbcda6258b 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err);
*/
static LIST_HEAD(ip_vs_schedulers);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_sched_lock);
+/* semaphore for schedulers */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
/*
@@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
{
int ret;
- svc->scheduler = scheduler;
-
if (scheduler->init_service) {
ret = scheduler->init_service(svc);
if (ret) {
@@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
return ret;
}
}
-
+ rcu_assign_pointer(svc->scheduler, scheduler);
return 0;
}
@@ -64,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
/*
* Unbind a service with its scheduler
*/
-int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *sched)
{
- struct ip_vs_scheduler *sched = svc->scheduler;
+ struct ip_vs_scheduler *cur_sched;
- if (!sched)
- return 0;
+ cur_sched = rcu_dereference_protected(svc->scheduler, 1);
+ /* This check proves that old 'sched' was installed */
+ if (!cur_sched)
+ return;
- if (sched->done_service) {
- if (sched->done_service(svc) != 0) {
- pr_err("%s(): done error\n", __func__);
- return -EINVAL;
- }
- }
-
- svc->scheduler = NULL;
- return 0;
+ if (sched->done_service)
+ sched->done_service(svc);
+ /* svc->scheduler can not be set to NULL */
}
@@ -92,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
/*
@@ -106,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return sched;
}
if (sched->module)
module_put(sched->module);
}
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return NULL;
}
@@ -153,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference(svc->scheduler);
if (svc->fwmark) {
IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
- svc->scheduler->name, svc->fwmark,
- svc->fwmark, msg);
+ sched->name, svc->fwmark, svc->fwmark, msg);
#ifdef CONFIG_IP_VS_IPV6
} else if (svc->af == AF_INET6) {
- IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n",
- svc->scheduler->name,
- ip_vs_proto_name(svc->protocol),
+ IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
&svc->addr.in6, ntohs(svc->port), msg);
#endif
} else {
IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
- svc->scheduler->name,
- ip_vs_proto_name(svc->protocol),
+ sched->name, ip_vs_proto_name(svc->protocol),
&svc->addr.ip, ntohs(svc->port), msg);
}
}
@@ -192,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (!list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
@@ -208,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
@@ -219,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_info("[%s] scheduler registered.\n", scheduler->name);
@@ -237,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
return -EINVAL;
}
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_err("%s(): [%s] scheduler is not in the list. failed\n",
__func__, scheduler->name);
return -EINVAL;
@@ -249,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 89ead246ed3..e446b9fa742 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -44,7 +44,7 @@
#include <net/ip_vs.h>
-static inline unsigned int
+static inline int
ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
{
/*
@@ -59,10 +59,11 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
- unsigned int loh, doh;
+ int loh, doh;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
@@ -79,7 +80,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
@@ -94,12 +95,12 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_sed_dest_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -134,6 +135,7 @@ static int __init ip_vs_sed_init(void)
static void __exit ip_vs_sed_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_sed_init);
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index b5e2556c581..cc65b2f42cd 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -30,6 +30,11 @@
* server is dead or overloaded, the load balancer can bypass the cache
* server and send requests to the original server directly.
*
+ * The weight destination attribute can be used to control the
+ * distribution of connections to the destinations in servernode. The
+ * greater the weight, the more connections the destination
+ * will receive.
+ *
*/
#define KMSG_COMPONENT "IPVS"
@@ -43,12 +48,16 @@
#include <net/ip_vs.h>
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
/*
* IPVS SH bucket
*/
struct ip_vs_sh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -61,11 +70,24 @@ struct ip_vs_sh_bucket {
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+struct ip_vs_sh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
+};
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->weight) <= 0 ||
+ dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
/*
* Returns hash value for IPVS SH entry
*/
-static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+ __be16 port, unsigned int offset)
{
__be32 addr_fold = addr->ip;
@@ -74,7 +96,8 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK;
+ return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ IP_VS_SH_TAB_MASK;
}
@@ -82,38 +105,102 @@ static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
{
- return (tbl[ip_vs_sh_hashkey(af, addr)]).dest;
+ unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+ return (!dest || is_unavailable(dest)) ? NULL : dest;
}
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
+ struct ip_vs_dest *dest;
+
+ /* first try the dest it's supposed to go to */
+ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ dest = rcu_dereference(s->buckets[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+ /* if the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
+ for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+ roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
+ hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
+ dest = rcu_dereference(s->buckets[hash].dest);
+ if (!dest)
+ break;
+ if (!is_unavailable(dest))
+ return dest;
+ IP_VS_DBG_BUF(6, "SH: selected unavailable "
+ "server %s:%d (offset %d), reselecting",
+ IP_VS_DBG_ADDR(svc->af, &dest->addr),
+ ntohs(dest->port), roffset);
+ }
+
+ return NULL;
+}
+
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
+ int d_count;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
+ d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
+ i, IP_VS_DBG_ADDR(svc->af, &dest->addr),
+ atomic_read(&dest->weight));
+
+ /* Don't move to next dest until filling weight */
+ if (++d_count >= atomic_read(&dest->weight)) {
+ p = p->next;
+ d_count = 0;
+ }
- p = p->next;
}
b++;
}
@@ -124,16 +211,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
{
int i;
struct ip_vs_sh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -142,64 +231,84 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl;
+ struct ip_vs_sh_state *s;
/* allocate the SH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
- GFP_ATOMIC);
- if (tbl == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- }
- svc->sched_data = tbl;
+
+ svc->sched_data = s;
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_sh_reassign(s, svc);
return 0;
}
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
+ struct ip_vs_sh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ ip_vs_sh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ struct ip_vs_sh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ ip_vs_sh_reassign(s, svc);
return 0;
}
-/*
- * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
- * consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
+/* Helper function to get port number */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
- return dest->flags & IP_VS_DEST_F_OVERLOAD;
+ __be16 port;
+ struct tcphdr _tcph, *th;
+ struct udphdr _udph, *uh;
+ sctp_sctphdr_t _sctph, *sh;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (unlikely(th == NULL))
+ return 0;
+ port = th->source;
+ break;
+ case IPPROTO_UDP:
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (unlikely(uh == NULL))
+ return 0;
+ port = uh->source;
+ break;
+ case IPPROTO_SCTP:
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (unlikely(sh == NULL))
+ return 0;
+ port = sh->source;
+ break;
+ default:
+ port = 0;
+ }
+
+ return port;
}
@@ -207,28 +316,32 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Source Hashing scheduling
*/
static struct ip_vs_dest *
-ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
- struct ip_vs_sh_bucket *tbl;
- struct ip_vs_iphdr iph;
-
- ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+ struct ip_vs_sh_state *s;
+ __be16 port = 0;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
- tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
- dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
- if (!dest
- || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
- || atomic_read(&dest->weight) <= 0
- || is_overloaded(dest)) {
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+ port = ip_vs_sh_get_port(skb, iph);
+
+ s = (struct ip_vs_sh_state *) svc->sched_data;
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+ dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+ else
+ dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+ if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr),
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr),
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port));
@@ -247,7 +360,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
- .update_service = ip_vs_sh_update_svc,
+ .add_dest = ip_vs_sh_dest_changed,
+ .del_dest = ip_vs_sh_dest_changed,
+ .upd_dest = ip_vs_sh_dest_changed,
.schedule = ip_vs_sh_schedule,
};
@@ -261,6 +376,7 @@ static int __init ip_vs_sh_init(void)
static void __exit ip_vs_sh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index e292e5bddc7..db801263ee9 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -61,6 +61,7 @@
#define SYNC_PROTO_VER 1 /* Protocol version in header */
+static struct lock_class_key __ipvs_sync_key;
/*
* IPVS sync connection entry
* Version 0, i.e. original version.
@@ -195,6 +196,7 @@ struct ip_vs_sync_thread_data {
struct net *net;
struct socket *sock;
char *buf;
+ int id;
};
/* Version 0 definition of packet sizes */
@@ -244,7 +246,7 @@ struct ip_vs_sync_thread_data {
struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
__u8 syncid;
- __u16 size;
+ __be16 size;
/* ip_vs_sync_conn entries start here */
};
@@ -253,7 +255,7 @@ struct ip_vs_sync_mesg_v0 {
struct ip_vs_sync_mesg {
__u8 reserved; /* must be zero */
__u8 syncid;
- __u16 size;
+ __be16 size;
__u8 nr_conns;
__s8 version; /* SYNC_PROTO_VER */
__u16 spare;
@@ -270,13 +272,6 @@ struct ip_vs_sync_buff {
unsigned char *end;
};
-/* multicast addr */
-static struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
-};
-
/*
* Copy of struct ip_vs_seq
* From unaligned network order to aligned host order
@@ -299,18 +294,22 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
put_unaligned_be32(ho->previous_delta, &no->previous_delta);
}
-static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ipvs->sync_lock);
- if (list_empty(&ipvs->sync_queue)) {
+ if (list_empty(&ms->sync_queue)) {
sb = NULL;
+ __set_current_state(TASK_INTERRUPTIBLE);
} else {
- sb = list_entry(ipvs->sync_queue.next,
- struct ip_vs_sync_buff,
+ sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
list);
list_del(&sb->list);
+ ms->sync_queue_len--;
+ if (!ms->sync_queue_len)
+ ms->sync_queue_delay = 0;
}
spin_unlock_bh(&ipvs->sync_lock);
@@ -333,10 +332,10 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
kfree(sb);
return NULL;
}
- sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
sb->mesg->version = SYNC_PROTO_VER;
sb->mesg->syncid = ipvs->master_syncid;
- sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
+ sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
sb->mesg->nr_conns = 0;
sb->mesg->spare = 0;
sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
@@ -352,14 +351,22 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
kfree(sb);
}
-static inline void sb_queue_tail(struct netns_ipvs *ipvs)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs,
+ struct ipvs_master_sync_state *ms)
{
- struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+ struct ip_vs_sync_buff *sb = ms->sync_buff;
spin_lock(&ipvs->sync_lock);
- if (ipvs->sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&sb->list, &ipvs->sync_queue);
- else
+ if (ipvs->sync_state & IP_VS_STATE_MASTER &&
+ ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
+ if (!ms->sync_queue_len)
+ schedule_delayed_work(&ms->master_wakeup_work,
+ max(IPVS_SYNC_SEND_DELAY, 1));
+ ms->sync_queue_len++;
+ list_add_tail(&sb->list, &ms->sync_queue);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
+ wake_up_process(ms->master_thread);
+ } else
ip_vs_sync_buff_release(sb);
spin_unlock(&ipvs->sync_lock);
}
@@ -369,49 +376,26 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs)
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
+ unsigned long time)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ipvs->sync_buff_lock);
- if (ipvs->sync_buff &&
- time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
- sb = ipvs->sync_buff;
- ipvs->sync_buff = NULL;
+ sb = ms->sync_buff;
+ if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
+ ms->sync_buff = NULL;
+ __set_current_state(TASK_RUNNING);
} else
sb = NULL;
spin_unlock_bh(&ipvs->sync_buff_lock);
return sb;
}
-/*
- * Switch mode from sending version 0 or 1
- * - must handle sync_buf
- */
-void ip_vs_sync_switch_mode(struct net *net, int mode)
+static inline int
+select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
- return;
- if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff)
- return;
-
- spin_lock_bh(&ipvs->sync_buff_lock);
- /* Buffer empty ? then let buf_create do the job */
- if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
- kfree(ipvs->sync_buff);
- ipvs->sync_buff = NULL;
- } else {
- spin_lock_bh(&ipvs->sync_lock);
- if (ipvs->sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&ipvs->sync_buff->list,
- &ipvs->sync_queue);
- else
- ip_vs_sync_buff_release(ipvs->sync_buff);
- spin_unlock_bh(&ipvs->sync_lock);
- }
- spin_unlock_bh(&ipvs->sync_buff_lock);
+ return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
}
/*
@@ -434,22 +418,121 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
mesg->nr_conns = 0;
mesg->syncid = ipvs->master_syncid;
- mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+ mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
sb->firstuse = jiffies;
return sb;
}
+/* Check if connection is controlled by persistence */
+static inline bool in_persistence(struct ip_vs_conn *cp)
+{
+ for (cp = cp->control; cp; cp = cp->control) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ return true;
+ }
+ return false;
+}
+
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ * sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ * for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ * with (2)
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+ struct ip_vs_conn *cp, int pkts)
+{
+ unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+ unsigned long now = jiffies;
+ unsigned long n = (now + cp->timeout) & ~3UL;
+ unsigned int sync_refresh_period;
+ int sync_period;
+ int force;
+
+ /* Check if we sync in current state */
+ if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ force = 0;
+ else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
+ return 0;
+ else if (likely(cp->protocol == IPPROTO_TCP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_TCP_S_ESTABLISHED) |
+ (1 << IP_VS_TCP_S_FIN_WAIT) |
+ (1 << IP_VS_TCP_S_CLOSE) |
+ (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+ (1 << IP_VS_TCP_S_TIME_WAIT))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+ goto set;
+ } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
+ (1 << IP_VS_SCTP_S_CLOSED))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+ goto set;
+ } else {
+ /* UDP or another protocol with single state */
+ force = 0;
+ }
+
+ sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+ if (sync_refresh_period > 0) {
+ long diff = n - orig;
+ long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+ /* Avoid sync if difference is below sync_refresh_period
+ * and below the half timeout.
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
+
/*
* Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
*/
-void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+ int pkts)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg_v0 *m;
struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
int len;
if (unlikely(cp->af != AF_INET))
@@ -458,21 +541,41 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
return;
- spin_lock(&ipvs->sync_buff_lock);
- if (!ipvs->sync_buff) {
- ipvs->sync_buff =
- ip_vs_sync_buff_create_v0(ipvs);
- if (!ipvs->sync_buff) {
- spin_unlock(&ipvs->sync_buff_lock);
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+ buff = ms->sync_buff;
+ if (buff) {
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ /* Send buffer if it is for v1 */
+ if (!m->nr_conns) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ }
+ }
+ if (!buff) {
+ buff = ip_vs_sync_buff_create_v0(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
+ ms->sync_buff = buff;
}
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
- m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
- s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *) buff->head;
/* copy members */
s->reserved = 0;
@@ -492,19 +595,25 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
}
m->nr_conns++;
- m->size += len;
- ipvs->sync_buff->head += len;
+ m->size = htons(ntohs(m->size) + len);
+ buff->head += len;
/* check if there is a space for next one */
- if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
- sb_queue_tail(ipvs);
- ipvs->sync_buff = NULL;
+ if (buff->head + FULL_CONN_SIZE > buff->end) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
}
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
- if (cp->control)
- ip_vs_sync_conn(net, cp->control);
+ cp = cp->control;
+ if (cp) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ ip_vs_sync_conn(net, cp->control, pkts);
+ }
}
/*
@@ -512,23 +621,29 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
* Called by ip_vs_in.
* Sending Version 1 messages
*/
-void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg *m;
union ip_vs_sync_conn *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
__u8 *p;
unsigned int len, pe_name_len, pad;
/* Handle old version of the protocol */
if (sysctl_sync_ver(ipvs) == 0) {
- ip_vs_sync_conn_v0(net, cp);
+ ip_vs_sync_conn_v0(net, cp, pkts);
return;
}
/* Do not sync ONE PACKET */
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
goto control;
sloop:
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ goto control;
+
/* Sanity checks */
pe_name_len = 0;
if (cp->pe_data_len) {
@@ -539,7 +654,14 @@ sloop:
pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
}
- spin_lock(&ipvs->sync_buff_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -558,28 +680,33 @@ sloop:
/* check if there is a space for this one */
pad = 0;
- if (ipvs->sync_buff) {
- pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
- if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
- sb_queue_tail(ipvs);
- ipvs->sync_buff = NULL;
+ buff = ms->sync_buff;
+ if (buff) {
+ m = buff->mesg;
+ pad = (4 - (size_t) buff->head) & 3;
+ /* Send buffer if it is for v0 */
+ if (buff->head + len + pad > buff->end || m->reserved) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
pad = 0;
}
}
- if (!ipvs->sync_buff) {
- ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
- if (!ipvs->sync_buff) {
- spin_unlock(&ipvs->sync_buff_lock);
+ if (!buff) {
+ buff = ip_vs_sync_buff_create(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
+ ms->sync_buff = buff;
+ m = buff->mesg;
}
- m = ipvs->sync_buff->mesg;
- p = ipvs->sync_buff->head;
- ipvs->sync_buff->head += pad + len;
- m->size += pad + len;
+ p = buff->head;
+ buff->head += pad + len;
+ m->size = htons(ntohs(m->size) + pad + len);
/* Add ev. padding from prev. sync_conn */
while (pad--)
*(p++) = 0;
@@ -602,9 +729,9 @@ sloop:
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6) {
p += sizeof(struct ip_vs_sync_v6);
- ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
- ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
- ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
+ s->v6.caddr = cp->caddr.in6;
+ s->v6.vaddr = cp->vaddr.in6;
+ s->v6.daddr = cp->daddr.in6;
} else
#endif
{
@@ -636,23 +763,17 @@ sloop:
}
}
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
control:
/* synchronize its controller if it has */
cp = cp->control;
if (!cp)
return;
- /*
- * Reduce sync rate for templates
- * i.e only increment in_pkts for Templates.
- */
- if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
- int pkts = atomic_add_return(1, &cp->in_pkts);
-
- if (pkts % sysctl_sync_period(ipvs) != 1)
- return;
- }
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
goto sloop;
}
@@ -730,66 +851,46 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
else
cp = ip_vs_ct_in_get(param);
- if (cp && param->pe_data) /* Free pe_data */
+ if (cp) {
+ /* Free pe_data */
kfree(param->pe_data);
- if (!cp) {
+
+ dest = cp->dest;
+ spin_lock_bh(&cp->lock);
+ if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+ !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+ if (flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ } else {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ }
+ }
+ flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+ flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+ cp->flags = flags;
+ spin_unlock_bh(&cp->lock);
+ if (!dest)
+ ip_vs_try_bind_dest(cp);
+ } else {
/*
* Find the appropriate destination for the connection.
* If it is not found the connection will remain unbound
* but still handled.
*/
+ rcu_read_lock();
dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
- param->vport, protocol, fwmark);
-
- /* Set the approprite ativity flag */
- if (protocol == IPPROTO_TCP) {
- if (state != IP_VS_TCP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- } else if (protocol == IPPROTO_SCTP) {
- if (state != IP_VS_SCTP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- }
+ param->vport, protocol, fwmark, flags);
+
cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
- if (dest)
- atomic_dec(&dest->refcnt);
+ rcu_read_unlock();
if (!cp) {
if (param->pe_data)
kfree(param->pe_data);
IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
return;
}
- } else if (!cp->dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
- (cp->state != state)) {
- /* update active/inactive flag for the connection */
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
- }
- } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
- (cp->state != state)) {
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_SCTP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
- }
}
if (opt)
@@ -838,7 +939,7 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer,
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
for (i=0; i<m->nr_conns; i++) {
- unsigned flags, state;
+ unsigned int flags, state;
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
@@ -1087,10 +1188,8 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
IP_VS_DBG(2, "BACKUP, message header too short\n");
return;
}
- /* Convert size back to host byte order */
- m2->size = ntohs(m2->size);
- if (buflen != m2->size) {
+ if (buflen != ntohs(m2->size)) {
IP_VS_DBG(2, "BACKUP, bogus message size\n");
return;
}
@@ -1108,7 +1207,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
for (i=0; i<nr_conns; i++) {
union ip_vs_sync_conn *s;
- unsigned size;
+ unsigned int size;
int retc;
p = msg_end;
@@ -1148,6 +1247,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
/*
+ * Setup sndbuf (mode=1) or rcvbuf (mode=0)
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+ /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+ /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+ lock_sock(sk);
+ if (mode) {
+ val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+ sysctl_wmem_max);
+ sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ } else {
+ val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+ sysctl_rmem_max);
+ sk->sk_rcvbuf = val * 2;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
+/*
* Setup loopback of outgoing multicasts on a sending socket
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
@@ -1297,9 +1418,15 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
/*
* Set up sending multicast socket over UDP
*/
-static struct socket *make_send_sock(struct net *net)
+static struct socket *make_send_sock(struct net *net, int id)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
@@ -1323,6 +1450,9 @@ static struct socket *make_send_sock(struct net *net)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 1, result);
result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
if (result < 0) {
@@ -1348,9 +1478,15 @@ error:
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct net *net)
+static struct socket *make_receive_sock(struct net *net, int id)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
@@ -1367,7 +1503,10 @@ static struct socket *make_receive_sock(struct net *net)
*/
sk_change_net(sock->sk, net);
/* it is equivalent to the REUSEADDR option in user-space */
- sock->sk->sk_reuse = 1;
+ sock->sk->sk_reuse = SK_CAN_REUSE;
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 0, result);
result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
sizeof(struct sockaddr));
@@ -1410,18 +1549,19 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
return len;
}
-static void
+static int
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
int msize;
+ int ret;
- msize = msg->size;
-
- /* Put size in network byte order */
- msg->size = htons(msg->size);
+ msize = ntohs(msg->size);
- if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
- pr_err("ip_vs_send_async error\n");
+ ret = ip_vs_send_async(sock, (char *)msg, msize);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ pr_err("ip_vs_send_async error %d\n", ret);
+ return 0;
}
static int
@@ -1437,48 +1577,90 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
iov.iov_base = buffer;
iov.iov_len = (size_t)buflen;
- len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+ len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
if (len < 0)
- return -1;
+ return len;
LeaveFunction(7);
return len;
}
+/* Wakeup the master thread for sending */
+static void master_wakeup_work_handler(struct work_struct *work)
+{
+ struct ipvs_master_sync_state *ms =
+ container_of(work, struct ipvs_master_sync_state,
+ master_wakeup_work.work);
+ struct netns_ipvs *ipvs = ms->ipvs;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ms->sync_queue_len &&
+ ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
+ wake_up_process(ms->master_thread);
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+}
+
+/* Get next buffer to send */
+static inline struct ip_vs_sync_buff *
+next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ sb = sb_dequeue(ipvs, ms);
+ if (sb)
+ return sb;
+ /* Do not delay entries in buffer for more than 2 seconds */
+ return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
+}
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
+ struct sock *sk = tinfo->sock->sk;
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
- "syncid = %d\n",
- ipvs->master_mcast_ifn, ipvs->master_syncid);
+ "syncid = %d, id = %d\n",
+ ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
- while (!kthread_should_stop()) {
- while ((sb = sb_dequeue(ipvs))) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ for (;;) {
+ sb = next_sync_buff(ipvs, ms);
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (!sb) {
+ schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
+ continue;
}
-
- /* check if entries stay in ipvs->sync_buff for 2 seconds */
- sb = get_curr_sync_buff(ipvs, 2 * HZ);
- if (sb) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
+ /* (Ab)use interruptible sleep to avoid increasing
+ * the load avg.
+ */
+ __wait_event_interruptible(*sk_sleep(sk),
+ sock_writeable(sk) ||
+ kthread_should_stop());
+ if (unlikely(kthread_should_stop()))
+ goto done;
}
-
- schedule_timeout_interruptible(HZ);
+ ip_vs_sync_buff_release(sb);
}
+done:
+ __set_current_state(TASK_RUNNING);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
/* clean up the sync_buff queue */
- while ((sb = sb_dequeue(ipvs)))
+ while ((sb = sb_dequeue(ipvs, ms)))
ip_vs_sync_buff_release(sb);
+ __set_current_state(TASK_RUNNING);
/* clean up the current sync_buff */
- sb = get_curr_sync_buff(ipvs, 0);
+ sb = get_curr_sync_buff(ipvs, ms, 0);
if (sb)
ip_vs_sync_buff_release(sb);
@@ -1497,8 +1679,8 @@ static int sync_thread_backup(void *data)
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
- "syncid = %d\n",
- ipvs->backup_mcast_ifn, ipvs->backup_syncid);
+ "syncid = %d, id = %d\n",
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1510,15 +1692,12 @@ static int sync_thread_backup(void *data)
len = ip_vs_receive(tinfo->sock, tinfo->buf,
ipvs->recv_mesg_maxlen);
if (len <= 0) {
- pr_err("receiving message error\n");
+ if (len != -EAGAIN)
+ pr_err("receiving message error\n");
break;
}
- /* disable bottom half, because it accesses the data
- shared by softirq while getting/creating conns */
- local_bh_disable();
ip_vs_process_message(tinfo->net, tinfo->buf, len);
- local_bh_enable();
}
}
@@ -1534,85 +1713,142 @@ static int sync_thread_backup(void *data)
int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
- struct task_struct **realtask, *task;
+ struct task_struct **array = NULL, *task;
struct socket *sock;
struct netns_ipvs *ipvs = net_ipvs(net);
- char *name, *buf = NULL;
+ char *name;
int (*threadfn)(void *data);
+ int id, count;
int result = -ENOMEM;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
+ if (!ipvs->sync_state) {
+ count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
+ ipvs->threads_mask = count - 1;
+ } else
+ count = ipvs->threads_mask + 1;
+
if (state == IP_VS_STATE_MASTER) {
- if (ipvs->master_thread)
+ if (ipvs->ms)
return -EEXIST;
strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
sizeof(ipvs->master_mcast_ifn));
ipvs->master_syncid = syncid;
- realtask = &ipvs->master_thread;
- name = "ipvs_master:%d";
+ name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
- sock = make_send_sock(net);
} else if (state == IP_VS_STATE_BACKUP) {
- if (ipvs->backup_thread)
+ if (ipvs->backup_threads)
return -EEXIST;
strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
sizeof(ipvs->backup_mcast_ifn));
ipvs->backup_syncid = syncid;
- realtask = &ipvs->backup_thread;
- name = "ipvs_backup:%d";
+ name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
- sock = make_receive_sock(net);
} else {
return -EINVAL;
}
- if (IS_ERR(sock)) {
- result = PTR_ERR(sock);
- goto out;
- }
+ if (state == IP_VS_STATE_MASTER) {
+ struct ipvs_master_sync_state *ms;
- set_sync_mesg_maxlen(net, state);
- if (state == IP_VS_STATE_BACKUP) {
- buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
- if (!buf)
- goto outsocket;
+ ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
+ if (!ipvs->ms)
+ goto out;
+ ms = ipvs->ms;
+ for (id = 0; id < count; id++, ms++) {
+ INIT_LIST_HEAD(&ms->sync_queue);
+ ms->sync_queue_len = 0;
+ ms->sync_queue_delay = 0;
+ INIT_DELAYED_WORK(&ms->master_wakeup_work,
+ master_wakeup_work_handler);
+ ms->ipvs = ipvs;
+ }
+ } else {
+ array = kzalloc(count * sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (!array)
+ goto out;
}
+ set_sync_mesg_maxlen(net, state);
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
- if (!tinfo)
- goto outbuf;
-
- tinfo->net = net;
- tinfo->sock = sock;
- tinfo->buf = buf;
+ tinfo = NULL;
+ for (id = 0; id < count; id++) {
+ if (state == IP_VS_STATE_MASTER)
+ sock = make_send_sock(net, id);
+ else
+ sock = make_receive_sock(net, id);
+ if (IS_ERR(sock)) {
+ result = PTR_ERR(sock);
+ goto outtinfo;
+ }
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ goto outsocket;
+ tinfo->net = net;
+ tinfo->sock = sock;
+ if (state == IP_VS_STATE_BACKUP) {
+ tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ GFP_KERNEL);
+ if (!tinfo->buf)
+ goto outtinfo;
+ } else {
+ tinfo->buf = NULL;
+ }
+ tinfo->id = id;
- task = kthread_run(threadfn, tinfo, name, ipvs->gen);
- if (IS_ERR(task)) {
- result = PTR_ERR(task);
- goto outtinfo;
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
+ if (IS_ERR(task)) {
+ result = PTR_ERR(task);
+ goto outtinfo;
+ }
+ tinfo = NULL;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->ms[id].master_thread = task;
+ else
+ array[id] = task;
}
/* mark as active */
- *realtask = task;
+
+ if (state == IP_VS_STATE_BACKUP)
+ ipvs->backup_threads = array;
+ spin_lock_bh(&ipvs->sync_buff_lock);
ipvs->sync_state |= state;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* increase the module use count */
ip_vs_use_count_inc();
return 0;
-outtinfo:
- kfree(tinfo);
-outbuf:
- kfree(buf);
outsocket:
sk_release_kernel(sock->sk);
+
+outtinfo:
+ if (tinfo) {
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ count = id;
+ while (count-- > 0) {
+ if (state == IP_VS_STATE_MASTER)
+ kthread_stop(ipvs->ms[count].master_thread);
+ else
+ kthread_stop(array[count]);
+ }
+ kfree(array);
+
out:
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ }
return result;
}
@@ -1620,38 +1856,60 @@ out:
int stop_sync_thread(struct net *net, int state)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ struct task_struct **array;
+ int id;
int retc = -EINVAL;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
- if (!ipvs->master_thread)
+ if (!ipvs->ms)
return -ESRCH;
- pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(ipvs->master_thread));
-
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
* add sync buffers to the queue, when we are already in
* progress of stopping the master sync daemon.
*/
- spin_lock_bh(&ipvs->sync_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ spin_lock(&ipvs->sync_lock);
ipvs->sync_state &= ~IP_VS_STATE_MASTER;
- spin_unlock_bh(&ipvs->sync_lock);
- retc = kthread_stop(ipvs->master_thread);
- ipvs->master_thread = NULL;
+ spin_unlock(&ipvs->sync_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ struct ipvs_master_sync_state *ms = &ipvs->ms[id];
+ int ret;
+
+ pr_info("stopping master sync thread %d ...\n",
+ task_pid_nr(ms->master_thread));
+ cancel_delayed_work_sync(&ms->master_wakeup_work);
+ ret = kthread_stop(ms->master_thread);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!ipvs->backup_thread)
+ if (!ipvs->backup_threads)
return -ESRCH;
- pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(ipvs->backup_thread));
-
ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
- retc = kthread_stop(ipvs->backup_thread);
- ipvs->backup_thread = NULL;
+ array = ipvs->backup_threads;
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ int ret;
+
+ pr_info("stopping backup sync thread %d ...\n",
+ task_pid_nr(array[id]));
+ ret = kthread_stop(array[id]);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(array);
+ ipvs->backup_threads = NULL;
}
/* decrease the module use count */
@@ -1663,24 +1921,22 @@ int stop_sync_thread(struct net *net, int state)
/*
* Initialize data struct for each netns
*/
-int __net_init __ip_vs_sync_init(struct net *net)
+int __net_init ip_vs_sync_net_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- INIT_LIST_HEAD(&ipvs->sync_queue);
+ __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
spin_lock_init(&ipvs->sync_lock);
spin_lock_init(&ipvs->sync_buff_lock);
-
- ipvs->sync_mcast_addr.sin_family = AF_INET;
- ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
- ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
return 0;
}
-void __ip_vs_sync_cleanup(struct net *net)
+void ip_vs_sync_net_cleanup(struct net *net)
{
int retc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ mutex_lock(&ipvs->sync_mutex);
retc = stop_sync_thread(net, IP_VS_STATE_MASTER);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Master Daemon\n");
@@ -1688,13 +1944,5 @@ void __ip_vs_sync_cleanup(struct net *net)
retc = stop_sync_thread(net, IP_VS_STATE_BACKUP);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Backup Daemon\n");
-}
-
-int __init ip_vs_sync_init(void)
-{
- return 0;
-}
-
-void ip_vs_sync_cleanup(void)
-{
+ mutex_unlock(&ipvs->sync_mutex);
}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bc1bfc48a17..b5b4650d50a 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -31,10 +31,11 @@
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
-ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest, *least;
- unsigned int loh, doh;
+ int loh, doh;
IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
@@ -51,7 +52,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
@@ -66,12 +67,12 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
- if (loh * atomic_read(&dest->weight) >
- doh * atomic_read(&least->weight)) {
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
@@ -106,6 +107,7 @@ static int __init ip_vs_wlc_init(void)
static void __exit ip_vs_wlc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wlc_init);
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 1ef41f50723..0546cd572d6 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -29,14 +29,45 @@
#include <net/ip_vs.h>
+/* The WRR algorithm depends on some caclulations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di but we run in parallel with
+ * weight changes, it is possible some dest weight to be reduced
+ * below di, bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated, now it is reduced with (di - 1),
+ * so that last cw is 1 to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
/*
* current destination pointer for weighted round-robin scheduling
*/
struct ip_vs_wrr_mark {
- struct list_head *cl; /* current list head */
+ struct ip_vs_dest *cl; /* current dest or head */
int cw; /* current weight */
int mw; /* maximum weight */
int di; /* decreasing interval */
+ struct rcu_head rcu_head;
};
@@ -84,41 +115,45 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
/*
* Allocate the mark variable for WRR scheduling
*/
- mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
- if (mark == NULL) {
- pr_err("%s(): no memory\n", __func__);
+ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
+ if (mark == NULL)
return -ENOMEM;
- }
- mark->cl = &svc->destinations;
- mark->cw = 0;
- mark->mw = ip_vs_wrr_max_weight(svc);
+
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ mark->cw = mark->mw;
svc->sched_data = mark;
return 0;
}
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
/*
* Release the mark variable
*/
- kfree(svc->sched_data);
-
- return 0;
+ kfree_rcu(mark, rcu_head);
}
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
struct ip_vs_wrr_mark *mark = svc->sched_data;
- mark->cl = &svc->destinations;
- mark->mw = ip_vs_wrr_max_weight(svc);
+ spin_lock_bh(&svc->sched_lock);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
- if (mark->cw > mark->mw)
- mark->cw = 0;
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ if (mark->cw > mark->mw || !mark->cw)
+ mark->cw = mark->mw;
+ else if (mark->di > 1)
+ mark->cw = (mark->cw / mark->di) * mark->di + 1;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -127,82 +162,82 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
* Weighted Round-Robin Scheduling
*/
static struct ip_vs_dest *
-ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
{
- struct ip_vs_dest *dest;
+ struct ip_vs_dest *dest, *last, *stop = NULL;
struct ip_vs_wrr_mark *mark = svc->sched_data;
- struct list_head *p;
+ bool last_pass = false, restarted = false;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- /*
- * This loop will always terminate, because mark->cw in (0, max_weight]
- * and at least one server has its weight equal to max_weight.
- */
- write_lock(&svc->sched_lock);
- p = mark->cl;
+ spin_lock_bh(&svc->sched_lock);
+ dest = mark->cl;
+ /* No available dests? */
+ if (mark->mw == 0)
+ goto err_noavail;
+ last = dest;
+ /* Stop only after all dests were checked for weight >= 1 (last pass) */
while (1) {
- if (mark->cl == &svc->destinations) {
- /* it is at the head of the destination list */
-
- if (mark->cl == mark->cl->next) {
- /* no dest entry */
- ip_vs_scheduler_err(svc,
- "no destination available: "
- "no destinations present");
- dest = NULL;
- goto out;
- }
-
- mark->cl = svc->destinations.next;
- mark->cw -= mark->di;
- if (mark->cw <= 0) {
- mark->cw = mark->mw;
- /*
- * Still zero, which means no available servers.
- */
- if (mark->cw == 0) {
- mark->cl = &svc->destinations;
- ip_vs_scheduler_err(svc,
- "no destination available");
- dest = NULL;
- goto out;
- }
- }
- } else
- mark->cl = mark->cl->next;
-
- if (mark->cl != &svc->destinations) {
- /* not at the head of the list */
- dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) >= mark->cw) {
- /* got it */
- break;
- }
+ atomic_read(&dest->weight) >= mark->cw)
+ goto found;
+ if (dest == stop)
+ goto err_over;
}
-
- if (mark->cl == p && mark->cw == mark->di) {
- /* back to the start, and no dest is found.
- It is only possible when all dests are OVERLOADED */
- dest = NULL;
- ip_vs_scheduler_err(svc,
- "no destination available: "
- "all destinations are overloaded");
- goto out;
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /* Stop if we tried last pass from first dest:
+ * 1. last_pass: we started checks when cw > di but
+ * then all dests were checked for w >= 1
+ * 2. last was head: the first and only traversal
+ * was for weight >= 1, for all dests.
+ */
+ if (last_pass ||
+ &last->n_list == &svc->destinations)
+ goto err_over;
+ restarted = true;
+ }
+ last_pass = mark->cw <= mark->di;
+ if (last_pass && restarted &&
+ &last->n_list != &svc->destinations) {
+ /* First traversal was for w >= 1 but only
+ * for dests after 'last', now do the same
+ * for all dests up to 'last'.
+ */
+ stop = last;
}
}
+found:
IP_VS_DBG_BUF(6, "WRR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt),
atomic_read(&dest->weight));
+ mark->cl = dest;
out:
- write_unlock(&svc->sched_lock);
+ spin_unlock_bh(&svc->sched_lock);
return dest;
+
+err_noavail:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available");
+ goto out;
+
+err_over:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available: "
+ "all destinations are overloaded");
+ goto out;
}
@@ -213,7 +248,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
.n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
.init_service = ip_vs_wrr_init_svc,
.done_service = ip_vs_wrr_done_svc,
- .update_service = ip_vs_wrr_update_svc,
+ .add_dest = ip_vs_wrr_dest_changed,
+ .del_dest = ip_vs_wrr_dest_changed,
+ .upd_dest = ip_vs_wrr_dest_changed,
.schedule = ip_vs_wrr_schedule,
};
@@ -225,6 +262,7 @@ static int __init ip_vs_wrr_init(void)
static void __exit ip_vs_wrr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wrr_init);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index ee319a4338b..73ba1cc7a88 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -17,6 +17,8 @@
* - not all connections have destination server, for example,
* connections in backup server when fwmark is used
* - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
* LOCAL_OUT rules:
* - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
* - skb->pkt_type is not set yet
@@ -49,165 +51,253 @@ enum {
IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
* local
*/
+ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
+ IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
+ IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */
};
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
/*
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
- u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
{
- struct dst_entry *old_dst;
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
- old_dst = dest->dst_cache;
- dest->dst_cache = dst;
- dest->dst_rtos = rtos;
- dest->dst_cookie = dst_cookie;
- dst_release(old_dst);
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
-static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
{
- struct dst_entry *dst = dest->dst_cache;
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
- if (!dst)
+ if (!dest_dst)
return NULL;
- if ((dst->obsolete || rtos != dest->dst_rtos) &&
- dst->ops->check(dst, dest->dst_cookie) == NULL) {
- dest->dst_cache = NULL;
- dst_release(dst);
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
+ return dest_dst;
+}
+
+static inline bool
+__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
+{
+ if (IP6CB(skb)->frag_max_size) {
+ /* frag_max_size tell us that, this packet have been
+ * defragmented by netfilter IPv6 conntrack module.
+ */
+ if (IP6CB(skb)->frag_max_size > mtu)
+ return true; /* largest fragment violate MTU */
}
- dst_hold(dst);
- return dst;
+ else if (skb->len > mtu && !skb_is_gso(skb)) {
+ return true; /* Packet size violate MTU size */
+ }
+ return false;
+}
+
+/* Get route to daddr, update *saddr, optionally bind route to saddr */
+static struct rtable *do_output_route4(struct net *net, __be32 daddr,
+ int rt_mode, __be32 *saddr)
+{
+ struct flowi4 fl4;
+ struct rtable *rt;
+ int loop = 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
+ fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
+ FLOWI_FLAG_KNOWN_NH : 0;
+
+retry:
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ /* Invalid saddr ? */
+ if (PTR_ERR(rt) == -EINVAL && *saddr &&
+ rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
+ *saddr = 0;
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
+ goto retry;
+ }
+ IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
+ return NULL;
+ } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ ip_rt_put(rt);
+ *saddr = fl4.saddr;
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
+ loop++;
+ goto retry;
+ }
+ *saddr = fl4.saddr;
+ return rt;
}
/* Get route to destination or remote server */
-static struct rtable *
+static int
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
- __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr)
+ __be32 daddr, int rt_mode, __be32 *ret_saddr)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest_dst *dest_dst;
struct rtable *rt; /* Route to the other host */
struct rtable *ort; /* Original route */
- int local;
+ struct iphdr *iph;
+ __be16 df;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos))) {
- struct flowi4 fl4;
-
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = dest->addr.ip;
- fl4.flowi4_tos = rtos;
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt)) {
- spin_unlock(&dest->dst_lock);
- IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &dest->addr.ip);
- return NULL;
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest_dst->dst_saddr.ip);
+ if (!rt) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
}
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
- dest->dst_saddr.ip = fl4.saddr;
- IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
- "rtos=%X\n",
- &dest->addr.ip, &dest->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt), rtos);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
+ atomic_read(&rt->dst.__refcnt));
}
daddr = dest->addr.ip;
if (ret_saddr)
- *ret_saddr = dest->dst_saddr.ip;
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.ip;
} else {
- struct flowi4 fl4;
-
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = daddr;
- fl4.flowi4_tos = rtos;
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt)) {
- IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &daddr);
- return NULL;
- }
+ __be32 saddr = htonl(INADDR_ANY);
+
+ noref = 0;
+
+ /* For such unconfigured boxes avoid many route lookups
+ * for performance reasons because we do not remember saddr
+ */
+ rt_mode &= ~IP_VS_RT_MODE_CONNECT;
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ if (!rt)
+ goto err_unreach;
if (ret_saddr)
- *ret_saddr = fl4.saddr;
+ *ret_saddr = saddr;
}
- local = rt->rt_flags & RTCF_LOCAL;
+ local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
(rt->rt_flags & RTCF_LOCAL) ?
"local":"non-local", &daddr);
- ip_rt_put(rt);
- return NULL;
- }
- if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
- !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
- IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
- "requires NAT method, dest: %pI4\n",
- &ip_hdr(skb)->daddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ goto err_put;
}
- if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
- "to non-local address, dest: %pI4\n",
- &ip_hdr(skb)->saddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ iph = ip_hdr(skb);
+ if (likely(!local)) {
+ if (unlikely(ipv4_is_loopback(iph->saddr))) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI4 to non-local address, dest: %pI4\n",
+ &iph->saddr, &daddr);
+ goto err_put;
+ }
+ } else {
+ ort = skb_rtable(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !(ort->rt_flags & RTCF_LOCAL)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI4 to "
+ "local requires NAT method, dest: %pI4\n",
+ &iph->daddr, &daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ ip_rt_put(rt);
+ return local;
}
- return rt;
-}
-
-/* Reroute packet to local IPv4 stack after DNAT */
-static int
-__ip_vs_reroute_locally(struct sk_buff *skb)
-{
- struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = rt->dst.dev;
- struct net *net = dev_net(dev);
- struct iphdr *iph = ip_hdr(skb);
-
- if (rt_is_input_route(rt)) {
- unsigned long orefdst = skb->_skb_refdst;
-
- if (ip_route_input(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev))
- return 0;
- refdst_drop(orefdst);
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+ mtu = dst_mtu(&rt->dst);
+ df = iph->frag_off & htons(IP_DF);
} else {
- struct flowi4 fl4 = {
- .daddr = iph->daddr,
- .saddr = iph->saddr,
- .flowi4_tos = RT_TOS(iph->tos),
- .flowi4_mark = skb->mark,
- };
-
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt))
- return 0;
- if (!(rt->rt_flags & RTCF_LOCAL)) {
- ip_rt_put(rt);
- return 0;
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+ goto err_put;
}
- /* Drop old route. */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ort = skb_rtable(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ /* MTU check allowed? */
+ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0;
+ }
+
+ /* MTU checking */
+ if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr);
+ goto err_put;
}
- return 1;
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ ip_rt_put(rt);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
- return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
+ return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
static struct dst_entry *
@@ -235,7 +325,7 @@ __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
goto out_err;
}
}
- ipv6_addr_copy(ret_saddr, &fl6.saddr);
+ *ret_saddr = fl6.saddr;
return dst;
out_err:
@@ -247,131 +337,196 @@ out_err:
/*
* Get route to destination or remote server
*/
-static struct rt6_info *
+static int
__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
struct in6_addr *daddr, struct in6_addr *ret_saddr,
- int do_xfrm, int rt_mode)
+ struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
struct rt6_info *rt; /* Route to the other host */
struct rt6_info *ort; /* Original route */
struct dst_entry *dst;
- int local;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
- if (!rt) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
u32 cookie;
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
- &dest->dst_saddr.in6,
+ &dest_dst->dst_saddr.in6,
do_xfrm);
if (!dst) {
- spin_unlock(&dest->dst_lock);
- return NULL;
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
}
rt = (struct rt6_info *) dst;
cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
- &dest->addr.in6, &dest->dst_saddr.in6,
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
atomic_read(&rt->dst.__refcnt));
}
if (ret_saddr)
- ipv6_addr_copy(ret_saddr, &dest->dst_saddr.in6);
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.in6;
} else {
+ noref = 0;
dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
if (!dst)
- return NULL;
+ goto err_unreach;
rt = (struct rt6_info *) dst;
}
local = __ip_vs_is_local_route6(rt);
if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
rt_mode)) {
- IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
local ? "local":"non-local", daddr);
- dst_release(&rt->dst);
- return NULL;
+ goto err_put;
}
- if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
- !((ort = (struct rt6_info *) skb_dst(skb)) &&
- __ip_vs_is_local_route6(ort))) {
- IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
- "requires NAT method, dest: %pI6\n",
- &ipv6_hdr(skb)->daddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+ if (likely(!local)) {
+ if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+ IPV6_ADDR_LOOPBACK)) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI6c to non-local address, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ goto err_put;
+ }
+ } else {
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !__ip_vs_is_local_route6(ort)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI6c "
+ "to local requires NAT method, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->daddr, daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ dst_release(&rt->dst);
+ return local;
}
- if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
- IPV6_ADDR_LOOPBACK)) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
- "to non-local address, dest: %pI6\n",
- &ipv6_hdr(skb)->saddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+
+ /* MTU checking */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+ mtu = dst_mtu(&rt->dst);
+ else {
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto err_put;
+ }
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
- return rt;
+ if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh->fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr);
+ goto err_put;
+ }
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ dst_release(&rt->dst);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#endif
-/*
- * Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
{
- struct dst_entry *old_dst;
+ int ret = NF_ACCEPT;
+
+ skb->ipvs_property = 1;
+ if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+ ret = ip_vs_confirm_conntrack(skb);
+ if (ret == NF_ACCEPT) {
+ nf_reset(skb);
+ skb_forward_csum(skb);
+ }
+ return ret;
+}
- old_dst = dest->dst_cache;
- dest->dst_cache = NULL;
- dst_release(old_dst);
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 1);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
}
-#define IP_VS_XMIT_TUNNEL(skb, cp) \
-({ \
- int __ret = NF_ACCEPT; \
- \
- (skb)->ipvs_property = 1; \
- if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
- __ret = ip_vs_confirm_conntrack(skb, cp); \
- if (__ret == NF_ACCEPT) { \
- nf_reset(skb); \
- skb_forward_csum(skb); \
- } \
- __ret; \
-})
-
-#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- else \
- ip_vs_update_conntrack(skb, cp, 1); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
-
-#define IP_VS_XMIT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
/*
@@ -379,10 +534,10 @@ do { \
*/
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
/* we do not touch skb and do not need pskb ptr */
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
@@ -393,54 +548,31 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
struct iphdr *iph = ip_hdr(skb);
- int mtu;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
- IP_VS_RT_MODE_NON_LOCAL, NULL)))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+ NULL) < 0)
goto tx_error;
- }
-
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
- ip_send_check(ip_hdr(skb));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ip_send_check(iph);
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -448,58 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- struct ipv6hdr *iph = ipv6_hdr(skb);
- int mtu;
-
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0,
- IP_VS_RT_MODE_NON_LOCAL)))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu && !skb_is_gso(skb)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL,
+ ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
goto tx_error;
- }
-
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -511,96 +612,71 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
- struct iphdr *iph = ip_hdr(skb);
- int local;
+ int local, rc, was_input;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR, NULL)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ was_input = rt_is_input_route(skb_rtable(skb));
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
"ip_vs_nat_xmit(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(cp->daddr.ip) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
- "ip_vs_nat_xmit(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct iphdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
- if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
- goto tx_error_put;
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- if (!local) {
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
@@ -608,66 +684,64 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
- int local;
+ int local, rc;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
- if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
- sizeof(_pt), &_pt);
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR))))
- goto tx_error_icmp;
- local = __ip_vs_is_local_route6(rt);
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -678,43 +752,20 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu && !skb_is_gso(skb)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
- "ip_vs_nat_xmit_v6(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
- if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
- ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
-
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
+ ipv6_hdr(skb)->daddr = cp->daddr.in6;
IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
@@ -723,22 +774,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
LeaveFunction(10);
kfree_skb(skb);
+ rcu_read_unlock();
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -764,66 +812,52 @@ tx_error_put:
*/
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
struct rtable *rt; /* Route to the other host */
__be32 saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = ip_hdr(skb);
u8 tos = old_iph->tos;
- __be16 df = old_iph->frag_off;
+ __be16 df;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL,
- &saddr)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT |
+ IP_VS_RT_MODE_TUNNEL, &saddr);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
+ rt = skb_rtable(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
- if (mtu < 68) {
- IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
- goto tx_error_put;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
-
- df |= (old_iph->frag_off & htons(IP_DF));
-
- if ((old_iph->frag_off & htons(IP_DF) &&
- mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
+ /* Copy DF, reset fragment offset and MF */
+ df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
- kfree_skb(skb);
+
+ if (!new_skb)
+ goto tx_error;
+ consume_skb(skb);
skb = new_skb;
old_iph = ip_hdr(skb);
}
@@ -837,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -853,36 +883,33 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
iph->daddr = cp->daddr.ip;
iph->saddr = saddr;
iph->ttl = old_iph->ttl;
- ip_select_ident(iph, &rt->dst, NULL);
+ ip_select_ident(skb, NULL);
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
struct in6_addr saddr; /* Source for tunnel */
@@ -890,59 +917,38 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *old_iph = ipv6_hdr(skb);
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
- &saddr, 1, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, ipvsh, 1,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_TUNNEL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
+ rt = (struct rt6_info *) skb_dst(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
- if (mtu < IPV6_MIN_MTU) {
- IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
- IPV6_MIN_MTU);
- goto tx_error_put;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
-
- if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
- !skb_is_gso(skb)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
-
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- dst_release(&rt->dst);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
- kfree_skb(skb);
+
+ if (!new_skb)
+ goto tx_error;
+ consume_skb(skb);
skb = new_skb;
old_iph = ipv6_hdr(skb);
}
@@ -953,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -967,32 +969,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
iph->priority = old_iph->priority;
memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
- ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
- ipv6_addr_copy(&iph->saddr, &saddr);
+ iph->daddr = cp->daddr.in6;
+ iph->saddr = saddr;
iph->hop_limit = old_iph->hop_limit;
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip6_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -1003,60 +1002,38 @@ tx_error_put:
*/
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
- struct iphdr *iph = ip_hdr(skb);
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL, NULL)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH, NULL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
ip_send_check(ip_hdr(skb));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1064,62 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1132,13 +1083,13 @@ tx_error:
*/
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, int offset, unsigned int hooknum)
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *iph)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
- int rt_mode;
+ int rt_mode, was_input;
EnterFunction(10);
@@ -1147,7 +1098,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
translate address/port back */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
- rc = cp->packet_xmit(skb, cp, pp);
+ rc = cp->packet_xmit(skb, cp, pp, iph);
else
rc = NF_ACCEPT;
/* do not touch skb anymore */
@@ -1158,105 +1109,76 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/*
* mangle and send the packet here (only for VS/NAT)
*/
+ was_input = rt_is_input_route(skb_rtable(skb));
/* LOCALNODE from FORWARD hook is not supported */
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(ip_hdr(skb)->tos),
- rt_mode, NULL)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(cp->daddr.ip) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp(skb, pp, cp, 0);
- if (!local) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
goto out;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, int offset, unsigned int hooknum)
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
int rt_mode;
@@ -1268,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
translate address/port back */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
- rc = cp->packet_xmit(skb, cp, pp);
+ rc = cp->packet_xmit(skb, cp, pp, ipvsh);
else
rc = NF_ACCEPT;
/* do not touch skb anymore */
@@ -1284,25 +1206,26 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, rt_mode)))
- goto tx_error_icmp;
-
- local = __ip_vs_is_local_route6(rt);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0, rt_mode);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
*/
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (cp->flags & IP_VS_CONN_F_SYNC && local) {
enum ip_conntrack_info ctinfo;
- struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
if (ct && !nf_ct_is_untracked(ct)) {
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -1313,58 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (skb->len > mtu && !skb_is_gso(skb)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp_v6(skb, pp, cp, 0);
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
- skb->local_df = 1;
+ skb->ignore_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
goto out;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif